  The following commands are supported:

  - hist

    This command aggregates event hits into a hash table keyed on one or
    more trace event format fields (or stacktrace) and a set of running
    totals derived from one or more trace event format fields and/or
    event counts (hitcount).

    The format of a hist trigger is as follows:

        hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>]
          [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue]
          [:clear][:name=histname1] [if <filter>]

    When a matching event is hit, an entry is added to a hash table
    using the key(s) and value(s) named. Keys and values correspond to
    fields in the event's format description. Values must correspond to
    numeric fields - on an event hit, the value(s) will be added to a
    sum kept for that field. The special string 'hitcount' can be used
    in place of an explicit value field - this is simply a count of
    event hits. If 'values' isn't specified, an implicit 'hitcount'
    value will be automatically created and used as the only value.
    Keys can be any field, or the special string 'stacktrace', which
    will use the event's kernel stacktrace as the key. The keywords
    'keys' or 'key' can be used to specify keys, and the keywords
    'values', 'vals', or 'val' can be used to specify values. Compound
    keys consisting of up to two fields can be specified by the 'keys'
    keyword. Hashing a compound key produces a unique entry in the
    table for each unique combination of component keys, and can be
    useful for providing more fine-grained summaries of event data.
    Additionally, sort keys consisting of up to two fields can be
    specified by the 'sort' keyword. If more than one field is
    specified, the result will be a 'sort within a sort': the first key
    is taken to be the primary sort key and the second the secondary
    key. If a hist trigger is given a name using the 'name' parameter,
    its histogram data will be shared with other triggers of the same
    name, and trigger hits will update this common data. Only triggers
    with 'compatible' fields can be combined in this way; triggers are
    'compatible' if the fields named in the trigger share the same
    number and type of fields and those fields also have the same names.
    Note that any two events always share the compatible 'hitcount' and
    'stacktrace' fields and can therefore be combined using those
    fields, however pointless that may be.
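
    For instance, the following hypothetical trigger (the fields are
    taken from the kmalloc event used in the examples below) combines a
    compound key, a two-field sort, and a shared name in one command:

        # echo 'hist:keys=common_pid,call_site.sym:vals=bytes_req:sort=hitcount,bytes_req.descending:name=foo' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger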

    'hist' triggers add a 'hist' file to each event's subdirectory.
    Reading the 'hist' file for the event will dump the hash table in
    its entirety to stdout. If there are multiple hist triggers
    attached to an event, there will be a table for each trigger in the
    output. The table displayed for a named trigger will be the same as
    any other instance having the same name. Each printed hash table
    entry is a simple list of the keys and values comprising the entry;
    keys are printed first and are delineated by curly braces, and are
    followed by the set of value fields for the entry. By default,
    numeric fields are displayed as base-10 integers. This can be
    modified by appending any of the following modifiers to the field
    name:

        .hex         display a number as a hex value
        .sym         display an address as a symbol
        .sym-offset  display an address as a symbol and offset
        .syscall     display a syscall id as a system call name
        .execname    display a common_pid as a program name

    Note that in general the semantics of a given field aren't
    interpreted when applying a modifier to it, but there are some
    restrictions to be aware of in this regard:

      - only the 'hex' modifier can be used for values (because values
        are essentially sums, and the other modifiers don't make sense
        in that context).
      - the 'execname' modifier can only be used on a 'common_pid'. The
        reason for this is that the execname is simply the 'comm' value
        saved for the 'current' process when an event was triggered,
        which is the same as the common_pid value saved by the event
        tracing code. Trying to apply that comm value to other pid
        values wouldn't be correct, and typically events that care save
        pid-specific comm fields in the event itself.
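
    For example, a key using the '.execname' modifier on common_pid (as
    in the examples further below) looks like this:

        # echo 'hist:keys=common_pid.execname' > \
          /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger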

    A typical usage scenario would be the following, which enables a
    hist trigger, reads its current contents, and then turns it off:

        # echo 'hist:keys=skbaddr.hex:vals=len' > \
          /sys/kernel/debug/tracing/events/net/netif_rx/trigger

        # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist

        # echo '!hist:keys=skbaddr.hex:vals=len' > \
          /sys/kernel/debug/tracing/events/net/netif_rx/trigger

    The trigger file itself can be read to show the details of the
    currently attached hist trigger. This information is also displayed
    at the top of the 'hist' file when read.
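
    For example, for the netif_rx trigger set up above (the output
    shown is illustrative):

        # cat /sys/kernel/debug/tracing/events/net/netif_rx/trigger
        hist:keys=skbaddr.hex:vals=len:sort=hitcount:size=2048 [active]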

    By default, the size of the hash table is 2048 entries. The 'size'
    parameter can be used to specify more or fewer than that. The units
    are in terms of hashtable entries - if a run uses more entries than
    specified, the results will show the number of 'drops', the number
    of hits that were ignored. The size should be a power of 2 between
    128 and 131072 (any non-power-of-2 number specified will be rounded
    up).
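
    For example, to use a larger table for a run expected to produce
    many unique keys (a sketch; the key field is from the kmalloc
    examples below):

        # echo 'hist:keys=call_site:size=8192' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger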

    The 'sort' parameter can be used to specify a value field to sort
    on. The default if unspecified is 'hitcount' and the default sort
    order is 'ascending'. To sort in the opposite direction, append
    '.descending' to the sort key.
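
    For example, to sort by the running total of bytes requested,
    largest first (again using the kmalloc fields from the examples
    below):

        # echo 'hist:keys=call_site:vals=bytes_req:sort=bytes_req.descending' > \
          /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger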

    The 'pause' parameter can be used to pause an existing hist trigger
    or to start a hist trigger but not log any events until told to do
    so. 'continue' or 'cont' can be used to start or restart a paused
    hist trigger.

    The 'clear' parameter will clear the contents of a running hist
    trigger and leave its current paused/active state unchanged.

    Note that the 'pause', 'cont', and 'clear' parameters should be
    applied using the 'append' shell operator ('>>') if applied to an
    existing trigger, rather than via the '>' operator, which will cause
    the trigger to be removed through truncation.
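
    For example, to reset the accumulated totals of the netif_rx
    trigger set up earlier without removing or pausing it:

        # echo 'hist:keys=skbaddr.hex:vals=len:clear' >> \
          /sys/kernel/debug/tracing/events/net/netif_rx/trigger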

  - enable_hist/disable_hist

    The enable_hist and disable_hist triggers can be used to have one
    event conditionally start and stop another event's already-attached
    hist trigger. Any number of enable_hist and disable_hist triggers
    can be attached to a given event, allowing that event to kick off
    and stop aggregations on a host of other events.

    The format is very similar to the enable/disable_event triggers:

        enable_hist:<system>:<event>[:count]
        disable_hist:<system>:<event>[:count]

    Instead of enabling or disabling the tracing of the target event
    into the trace buffer as the enable/disable_event triggers do, the
    enable/disable_hist triggers enable or disable the aggregation of
    the target event into a hash table.

    A typical usage scenario for the enable_hist/disable_hist triggers
    would be to first set up a paused hist trigger on some event,
    followed by an enable_hist/disable_hist pair that turns the hist
    aggregation on and off when conditions of interest are hit:

        # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \
          /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger

        # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
          /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger

        # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \
          /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger

    The above sets up an initially paused hist trigger which is unpaused
    and starts aggregating events when a given program is executed, and
    which stops aggregating when the process exits and the hist trigger
    is paused again.

    The examples below provide a more concrete illustration of the
    concepts and typical usage patterns discussed above.

6.2 'hist' trigger examples
---------------------------

  The first set of examples creates aggregations using the kmalloc
  event. The fields that can be used for the hist trigger are listed
  in the kmalloc event's format file:

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format
    name: kmalloc
    ID: 374
    format:
        field:unsigned short common_type; offset:0; size:2; signed:0;
        field:unsigned char common_flags; offset:2; size:1; signed:0;
        field:unsigned char common_preempt_count; offset:3; size:1; signed:0;
        field:int common_pid; offset:4; size:4; signed:1;

        field:unsigned long call_site; offset:8; size:8; signed:0;
        field:const void * ptr; offset:16; size:8; signed:0;
        field:size_t bytes_req; offset:24; size:8; signed:0;
        field:size_t bytes_alloc; offset:32; size:8; signed:0;
        field:gfp_t gfp_flags; offset:40; size:4; signed:0;

  We'll start by creating a hist trigger that generates a simple table
  that lists the total number of bytes requested for each function in
  the kernel that made one or more calls to kmalloc:

    # echo 'hist:key=call_site:val=bytes_req' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

  This tells the tracing system to create a 'hist' trigger using the
  call_site field of the kmalloc event as the key for the table, which
  just means that each unique call_site address will have an entry
  created for it in the table. The 'val=bytes_req' parameter tells
  the hist trigger that for each unique entry (call_site) in the
  table, it should keep a running total of the number of bytes
  requested by that call_site.

  We'll let it run for a while and then dump the contents of the 'hist'
  file in the kmalloc event's subdirectory (for readability, a number
  of entries have been omitted):

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]

    { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176
    { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024
    { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384
    { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24
    { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8
    { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152
    { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144
    { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144
    { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560
    { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736
    .
    .
    .
    { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576
    { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336
    { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504
    { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584
    { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448
    { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720
    { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088
    { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920
    { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716
    { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712
    { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160
    { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520

    Totals:
        Hits: 4610
        Entries: 45
        Dropped: 0

  The output displays a line for each entry, beginning with the key
  specified in the trigger, followed by the value(s) also specified in
  the trigger. At the beginning of the output is a line that displays
  the trigger info, which can also be displayed by reading the
  'trigger' file:

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
    hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active]

  At the end of the output are a few lines that display the overall
  totals for the run. The 'Hits' field shows the total number of
  times the event trigger was hit, the 'Entries' field shows the total
  number of used entries in the hash table, and the 'Dropped' field
  shows the number of hits that were dropped because the number of
  used entries for the run exceeded the maximum number of entries
  allowed for the table (normally 0; if it isn't, that's a hint that
  you may want to increase the size of the table using the 'size'
  parameter).

  Notice in the above output that there's an extra field, 'hitcount',
  which wasn't specified in the trigger. Also notice that in the
  trigger info output, there's a parameter, 'sort=hitcount', which
  wasn't specified in the trigger either. The reason for that is that
  every trigger implicitly keeps a count of the total number of hits
  attributed to a given entry, called the 'hitcount'. That hitcount
  information is explicitly displayed in the output, and in the
  absence of a user-specified sort parameter, is used as the default
  sort field.

  The value 'hitcount' can be used in place of an explicit value in
  the 'values' parameter if you don't really need to have any
  particular field summed and are mainly interested in hit
  frequencies.
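
  For example, to simply count kmalloc hits per call site without
  summing any event field:

    # echo 'hist:key=call_site:val=hitcount' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger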

  To turn the hist trigger off, simply call up the trigger in the
  command history and re-execute it with a '!' prepended:

    # echo '!hist:key=call_site:val=bytes_req' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

  Finally, notice that the call_site as displayed in the output above
  isn't really very useful. It's an address, but normally addresses
  are displayed in hex. To have a numeric field displayed as a hex
  value, simply append '.hex' to the field name in the trigger:

    # echo 'hist:key=call_site.hex:val=bytes_req' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active]

    { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433
    { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176
    { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384
    { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8
    { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511
    { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12
    { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152
    { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24
    { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144
    { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648
    { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144
    { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544
    .
    .
    .
    { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024
    { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680
    { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112
    { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232
    { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360
    { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640
    { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600
    { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584
    { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656
    { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456
    { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600

    Totals:
        Hits: 4775
        Entries: 46
        Dropped: 0

  Even that's only marginally more useful - while hex values do look
  more like addresses, what users are typically more interested in
  when looking at text addresses are the corresponding symbols
  instead. To have an address displayed as a symbolic value instead,
  simply append '.sym' or '.sym-offset' to the field name in the
  trigger:

    # echo 'hist:key=call_site.sym:val=bytes_req' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active]

    { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024
    { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7
    { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192
    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
    { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
    { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
    { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528
    { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624
    { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96
    { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464
    { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304
    { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
    { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424
    .
    .
    .
    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240
    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280
    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672
    { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208
    { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840
    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312
    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152
    { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576
    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248
    { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384
    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584
    { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176
    { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265

    Totals:
        Hits: 109928
        Entries: 71
        Dropped: 0

  Because the default sort key above is 'hitcount', the above shows
  the list of call_sites by increasing hitcount, so that at the bottom
  we see the functions that made the most kmalloc calls during the
  run. If instead we wanted to see the top kmalloc callers in
  terms of the number of bytes requested rather than the number of
  calls, and we wanted the top caller to appear at the top, we can use
  the 'sort' parameter, along with the 'descending' modifier:

    # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]

    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464
    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176
    { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135
    { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128
    { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784
    { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992
    { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072
    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824
    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704
    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088
    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536
    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664
    { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632
    .
    .
    .
    { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128
    { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128
    { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48
    { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48
    { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48
    { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40
    { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16
    { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8
    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7
    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7

    Totals:
        Hits: 32133
        Entries: 81
        Dropped: 0

  To display the offset and size information in addition to the symbol
  name, just use 'sym-offset' instead:

    # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active]

    { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720
    { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936
    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936
    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832
    { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384
    { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040
    { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072
    { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880
    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488
    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696
    { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640
    { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456
    .
    .
    .
    { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128
    { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96
    { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96
    { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84
    { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8
    { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7
    { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7

    Totals:
        Hits: 26098
        Entries: 64
        Dropped: 0

  We can also add multiple fields to the 'values' parameter. For
  example, we might want to see the total number of bytes allocated
  alongside bytes requested, and display the result sorted by bytes
  allocated in descending order:

    # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active]

    { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016
    { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224
    { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568
    { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760
    { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744
    { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400
    { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496
    { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304
    { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640
    { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760
    { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312
    { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432
    .
    .
    .
    { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192
    { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
    { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
    { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
    { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128
    { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96
    { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64
    { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8
    { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8
    { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8

    Totals:
        Hits: 66598
        Entries: 65
        Dropped: 0

  Finally, to finish off our kmalloc example, instead of simply having
  the hist trigger display symbolic call_sites, we can have the hist
  trigger additionally display the complete set of kernel stack traces
  that led to each call_site. To do that, we simply use the special
  value 'stacktrace' for the key parameter:

    # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \
      /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

  The above trigger will use the kernel stack trace in effect when an
  event is triggered as the key for the hash table. This allows the
  enumeration of every kernel callpath that led up to a particular
  event, along with a running total of any of the event fields for
  that event. Here we tally bytes requested and bytes allocated for
  every callpath in the system that led up to a kmalloc (in this case
  every callpath to a kmalloc for a kernel compile):

    # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
    # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active]

    { stacktrace:
         __kmalloc_track_caller+0x10b/0x1a0
         kmemdup+0x20/0x50
         hidraw_report_event+0x8a/0x120 [hid]
         hid_report_raw_event+0x3ea/0x440 [hid]
         hid_input_report+0x112/0x190 [hid]
         hid_irq_in+0xc2/0x260 [usbhid]
         __usb_hcd_giveback_urb+0x72/0x120
         usb_giveback_urb_bh+0x9e/0xe0
         tasklet_hi_action+0xf8/0x100
         __do_softirq+0x114/0x2c0
         irq_exit+0xa5/0xb0
         do_IRQ+0x5a/0xf0
         ret_from_intr+0x0/0x30
         cpuidle_enter+0x17/0x20
         cpu_startup_entry+0x315/0x3e0
         rest_init+0x7c/0x80
    } hitcount: 3 bytes_req: 21 bytes_alloc: 24
    { stacktrace:
         __kmalloc_track_caller+0x10b/0x1a0
         kmemdup+0x20/0x50
         hidraw_report_event+0x8a/0x120 [hid]
         hid_report_raw_event+0x3ea/0x440 [hid]
         hid_input_report+0x112/0x190 [hid]
         hid_irq_in+0xc2/0x260 [usbhid]
         __usb_hcd_giveback_urb+0x72/0x120
         usb_giveback_urb_bh+0x9e/0xe0
         tasklet_hi_action+0xf8/0x100
         __do_softirq+0x114/0x2c0
         irq_exit+0xa5/0xb0
         do_IRQ+0x5a/0xf0
         ret_from_intr+0x0/0x30
    } hitcount: 3 bytes_req: 21 bytes_alloc: 24
    { stacktrace:
         kmem_cache_alloc_trace+0xeb/0x150
         aa_alloc_task_context+0x27/0x40
         apparmor_cred_prepare+0x1f/0x50
         security_prepare_creds+0x16/0x20
         prepare_creds+0xdf/0x1a0
         SyS_capset+0xb5/0x200
         system_call_fastpath+0x12/0x6a
    } hitcount: 1 bytes_req: 32 bytes_alloc: 32
    .
    .
    .
    { stacktrace:
         __kmalloc+0x11b/0x1b0
         i915_gem_execbuffer2+0x6c/0x2c0 [i915]
         drm_ioctl+0x349/0x670 [drm]
         do_vfs_ioctl+0x2f0/0x4f0
         SyS_ioctl+0x81/0xa0
         system_call_fastpath+0x12/0x6a
    } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808
    { stacktrace:
         __kmalloc+0x11b/0x1b0
         load_elf_phdrs+0x76/0xa0
         load_elf_binary+0x102/0x1650
         search_binary_handler+0x97/0x1d0
         do_execveat_common.isra.34+0x551/0x6e0
         SyS_execve+0x3a/0x50
         return_from_execve+0x0/0x23
    } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048
    { stacktrace:
         kmem_cache_alloc_trace+0xeb/0x150
         apparmor_file_alloc_security+0x27/0x40
         security_file_alloc+0x16/0x20
         get_empty_filp+0x93/0x1c0
         path_openat+0x31/0x5f0
         do_filp_open+0x3a/0x90
         do_sys_open+0x128/0x220
         SyS_open+0x1e/0x20
         system_call_fastpath+0x12/0x6a
    } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376
    { stacktrace:
         __kmalloc+0x11b/0x1b0
         seq_buf_alloc+0x1b/0x50
         seq_read+0x2cc/0x370
         proc_reg_read+0x3d/0x80
         __vfs_read+0x28/0xe0
         vfs_read+0x86/0x140
         SyS_read+0x46/0xb0
         system_call_fastpath+0x12/0x6a
    } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768

    Totals:
        Hits: 6085872
        Entries: 253
        Dropped: 0

  If you key a hist trigger on common_pid, for example in order to
  gather and display sorted totals for each process, you can use the
  special .execname modifier to display the executable names for the
  processes in the table rather than raw pids. The example below
  keeps a per-process sum of total bytes read:

    # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \
      /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger

    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist
    # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active]

    { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512
    { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640
    { common_pid: compiz [ 2889] } hitcount: 59 count: 254400
    { common_pid: bash [ 8710] } hitcount: 3 count: 66369
    { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739
    { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648
    { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216
    { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396
    { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264
    { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424
    { common_pid: gmain [ 1315] } hitcount: 18 count: 6336
    .
    .
    .
    { common_pid: postgres [ 1892] } hitcount: 2 count: 32
    { common_pid: postgres [ 1891] } hitcount: 2 count: 32
    { common_pid: gmain [ 8704] } hitcount: 2 count: 32
    { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21
    { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16
    { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16
    { common_pid: gdbus [ 2998] } hitcount: 1 count: 16
    { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8
    { common_pid: init [ 1] } hitcount: 2 count: 2

    Totals:
        Hits: 2116
        Entries: 51
        Dropped: 0

  Similarly, if you key a hist trigger on syscall id, for example to
  gather and display a list of systemwide syscall hits, you can use
  the special .syscall modifier to display the syscall names rather
  than raw ids. The example below keeps a running total of syscall
  counts for the system during the run:

    # echo 'hist:key=id.syscall:val=hitcount' > \
      /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger

    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
    # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active]

    { id: sys_fsync [ 74] } hitcount: 1
    { id: sys_newuname [ 63] } hitcount: 1
    { id: sys_prctl [157] } hitcount: 1
    { id: sys_statfs [137] } hitcount: 1
    { id: sys_symlink [ 88] } hitcount: 1
    { id: sys_sendmmsg [307] } hitcount: 1
    { id: sys_semctl [ 66] } hitcount: 1
    { id: sys_readlink [ 89] } hitcount: 3
    { id: sys_bind [ 49] } hitcount: 3
    { id: sys_getsockname [ 51] } hitcount: 3
    { id: sys_unlink [ 87] } hitcount: 3
    { id: sys_rename [ 82] } hitcount: 4
    { id: unknown_syscall [ 58] } hitcount: 4
    { id: sys_connect [ 42] } hitcount: 4
    { id: sys_getpid [ 39] } hitcount: 4
    .
    .
    .
    { id: sys_rt_sigprocmask [ 14] } hitcount: 952
    { id: sys_futex [202] } hitcount: 1534
    { id: sys_write [ 1] } hitcount: 2689
    { id: sys_setitimer [ 38] } hitcount: 2797
    { id: sys_read [ 0] } hitcount: 3202
    { id: sys_select [ 23] } hitcount: 3773
    { id: sys_writev [ 20] } hitcount: 4531
    { id: sys_poll [ 7] } hitcount: 8314
    { id: sys_recvmsg [ 47] } hitcount: 13738
    { id: sys_ioctl [ 16] } hitcount: 21843

    Totals:
        Hits: 67612
        Entries: 72
        Dropped: 0

  The syscall counts above provide a rough overall picture of system
  call activity on the system; we can see for example that the most
  popular system call on this system was the 'sys_ioctl' system call.

  We can use 'compound' keys to refine that number and provide some
  further insight as to which processes exactly contribute to the
  overall ioctl count.

  The command below keeps a hitcount for every unique combination of
  system call id and pid - the end result is essentially a table
  that keeps a per-pid sum of system call hits. The results are
  sorted using the system call id as the primary key, and the
  hitcount sum as the secondary key:

    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \
      /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger

    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active]

    { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1
    { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1
    { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1
    { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1
    { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2
    { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2
    { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2
    { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2
    { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2
    { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2
    .
    .
    .
    { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12
    { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16
    { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808
    { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580
    .
    .
    .
    { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3
    { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4
    { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6
    { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2
    { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4
    { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6

    Totals:
        Hits: 31536
        Entries: 323
        Dropped: 0

  The above list does give us a breakdown of the ioctl syscall by
  pid, but it also gives us quite a bit more than that, which we
  don't really care about at the moment. Since we know the syscall
  id for sys_ioctl (16, displayed next to the sys_ioctl name), we
  can use that to filter out all the other syscalls:

    # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \
      /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger

    # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
    # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active]

    { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1
    { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1
    .
    .
    .
    { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45
    { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48
    { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48
    { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66
    { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674
    { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443

    Totals:
        Hits: 101162
        Entries: 103
        Dropped: 0

  The above output shows that 'compiz' and 'Xorg' are far and away
  the heaviest ioctl callers (which might lead to questions about
  whether they really need to be making all those calls and to
  possible avenues for further investigation).

  The compound key examples used a key and a sum value (hitcount) to
  sort the output, but we can just as easily use two keys instead.
  Here's an example where we use a compound key composed of the
  common_pid and size event fields. Sorting with pid as the primary
  key and 'size' as the secondary key allows us to display an
  ordered summary of the recvfrom sizes, with counts, received by
  each process:

    # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \
      /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger

    # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist
    # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active]

    { common_pid: smbd [ 784], size: 4 } hitcount: 1
    { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672
    { common_pid: postgres [ 1796], size: 1000 } hitcount: 6
    { common_pid: postgres [ 1867], size: 1000 } hitcount: 10
    { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2
    { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1
    { common_pid: compiz [ 2994], size: 8 } hitcount: 1
    { common_pid: compiz [ 2994], size: 20 } hitcount: 11
    { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2
    { common_pid: firefox [ 8817], size: 4 } hitcount: 1
    { common_pid: firefox [ 8817], size: 8 } hitcount: 5
    { common_pid: firefox [ 8817], size: 588 } hitcount: 2
    { common_pid: firefox [ 8817], size: 628 } hitcount: 1
    { common_pid: firefox [ 8817], size: 6944 } hitcount: 1
    { common_pid: firefox [ 8817], size: 408880 } hitcount: 2
    { common_pid: firefox [ 8822], size: 8 } hitcount: 2
    { common_pid: firefox [ 8822], size: 160 } hitcount: 2
    { common_pid: firefox [ 8822], size: 320 } hitcount: 2
    { common_pid: firefox [ 8822], size: 352 } hitcount: 1
    .
    .
    .
    { common_pid: pool [ 8923], size: 1960 } hitcount: 10
    { common_pid: pool [ 8923], size: 2048 } hitcount: 10
    { common_pid: pool [ 8924], size: 1960 } hitcount: 10
    { common_pid: pool [ 8924], size: 2048 } hitcount: 10
    { common_pid: pool [ 8928], size: 1964 } hitcount: 4
    { common_pid: pool [ 8928], size: 1965 } hitcount: 2
    { common_pid: pool [ 8928], size: 2048 } hitcount: 6
    { common_pid: pool [ 8929], size: 1982 } hitcount: 1
    { common_pid: pool [ 8929], size: 2048 } hitcount: 1

    Totals:
        Hits: 2016
        Entries: 224
        Dropped: 0

  The above example also illustrates the fact that although a compound
  key is treated as a single entity for hashing purposes, the sub-keys
  it's composed of can be accessed independently.

  The next example uses a string field as the hash key and
  demonstrates how you can manually pause and continue a hist trigger.
  In this example, we'll aggregate fork counts and don't expect a
  large number of entries in the hash table, so we'll drop it to a
  much smaller number, say 256:

    # echo 'hist:key=child_comm:val=hitcount:size=256' > \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]

    { child_comm: dconf worker } hitcount: 1
    { child_comm: ibus-daemon } hitcount: 1
    { child_comm: whoopsie } hitcount: 1
    { child_comm: smbd } hitcount: 1
    { child_comm: gdbus } hitcount: 1
    { child_comm: kthreadd } hitcount: 1
    { child_comm: dconf worker } hitcount: 1
    { child_comm: evolution-alarm } hitcount: 2
    { child_comm: Socket Thread } hitcount: 2
    { child_comm: postgres } hitcount: 2
    { child_comm: bash } hitcount: 3
    { child_comm: compiz } hitcount: 3
    { child_comm: evolution-sourc } hitcount: 4
    { child_comm: dhclient } hitcount: 4
    { child_comm: pool } hitcount: 5
    { child_comm: nm-dispatcher.a } hitcount: 8
    { child_comm: firefox } hitcount: 8
    { child_comm: dbus-daemon } hitcount: 8
    { child_comm: glib-pacrunner } hitcount: 10
    { child_comm: evolution } hitcount: 23

    Totals:
        Hits: 89
        Entries: 20
        Dropped: 0

  If we want to pause the hist trigger, we can simply append :pause to
  the command that started the trigger. Notice that the trigger info
  displays as [paused]:

    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused]

    { child_comm: dconf worker } hitcount: 1
    { child_comm: kthreadd } hitcount: 1
    { child_comm: dconf worker } hitcount: 1
    { child_comm: gdbus } hitcount: 1
    { child_comm: ibus-daemon } hitcount: 1
    { child_comm: Socket Thread } hitcount: 2
    { child_comm: evolution-alarm } hitcount: 2
    { child_comm: smbd } hitcount: 2
    { child_comm: bash } hitcount: 3
    { child_comm: whoopsie } hitcount: 3
    { child_comm: compiz } hitcount: 3
    { child_comm: evolution-sourc } hitcount: 4
    { child_comm: pool } hitcount: 5
    { child_comm: postgres } hitcount: 6
    { child_comm: firefox } hitcount: 8
    { child_comm: dhclient } hitcount: 10
    { child_comm: emacs } hitcount: 12
    { child_comm: dbus-daemon } hitcount: 20
    { child_comm: nm-dispatcher.a } hitcount: 20
    { child_comm: evolution } hitcount: 35
    { child_comm: glib-pacrunner } hitcount: 59

    Totals:
        Hits: 199
        Entries: 21
        Dropped: 0

  To manually continue having the trigger aggregate events, append
  :cont instead. Notice that the trigger info displays as [active]
  again, and the data has changed:

    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist
    # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active]

    { child_comm: dconf worker } hitcount: 1
    { child_comm: dconf worker } hitcount: 1
    { child_comm: kthreadd } hitcount: 1
    { child_comm: gdbus } hitcount: 1
    { child_comm: ibus-daemon } hitcount: 1
    { child_comm: Socket Thread } hitcount: 2
    { child_comm: evolution-alarm } hitcount: 2
    { child_comm: smbd } hitcount: 2
    { child_comm: whoopsie } hitcount: 3
    { child_comm: compiz } hitcount: 3
    { child_comm: evolution-sourc } hitcount: 4
    { child_comm: bash } hitcount: 5
    { child_comm: pool } hitcount: 5
    { child_comm: postgres } hitcount: 6
    { child_comm: firefox } hitcount: 8
    { child_comm: dhclient } hitcount: 11
    { child_comm: emacs } hitcount: 12
    { child_comm: dbus-daemon } hitcount: 22
    { child_comm: nm-dispatcher.a } hitcount: 22
    { child_comm: evolution } hitcount: 35
    { child_comm: glib-pacrunner } hitcount: 59

    Totals:
        Hits: 206
        Entries: 21
        Dropped: 0

  The previous example showed how to start and stop a hist trigger by
  appending 'pause' and 'continue' to the hist trigger command. A
  hist trigger can also be started in a paused state by initially
  starting the trigger with ':pause' appended. This allows you to
  start the trigger only when you're ready to start collecting data
  and not before. For example, you could start the trigger in a
  paused state, then unpause it and do something you want to measure,
  then pause the trigger again when done.
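
  A minimal sketch of that manual workflow, reusing the fork-count
  trigger from the example above (start paused, unpause to measure,
  then pause again):

    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger

    # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \
      /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger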

  Of course, doing this manually can be difficult and error-prone, but
  it is possible to automatically start and stop a hist trigger based
  on some condition, via the enable_hist and disable_hist triggers.

  For example, suppose we wanted to take a look at the relative
  weights in terms of skb length for each callpath that leads to a
  netif_receive_skb event when downloading a decent-sized file using
  wget.

  First we set up an initially paused stacktrace trigger on the
  netif_receive_skb event:

    # echo 'hist:key=stacktrace:vals=len:pause' > \
      /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger

  Next, we set up an 'enable_hist' trigger on the sched_process_exec
  event, with an 'if filename==/usr/bin/wget' filter. The effect of
  this new trigger is that it will 'unpause' the hist trigger we just
  set up on netif_receive_skb if and only if it sees a
  sched_process_exec event with a filename of '/usr/bin/wget'. When
  that happens, all netif_receive_skb events are aggregated into a
  hash table keyed on stacktrace:

    # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \
      /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger
988 | - | |
989 | - The aggregation continues until the netif_receive_skb hist trigger | |
990 | - is paused again, which is what the following disable_hist trigger | |
991 | - does by creating a similar setup on the sched_process_exit event, | |
992 | - using the filter 'comm==wget': | |
993 | - | |
994 | - # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ | |
995 | - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
996 | - | |
997 | - Whenever a process exits and its comm field matches the | |
998 | - disable_hist trigger's 'comm==wget' filter, the netif_receive_skb | |
999 | - hist trigger is disabled. | |
1000 | - | |
1001 | - The overall effect is that netif_receive_skb events are aggregated | |
1002 | - into the hash table for only the duration of the wget. Executing a | |
1003 | - wget command and then listing the 'hist' file will display the | |
1004 | - output generated by the wget command: | |
1005 | - | |
1006 | - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz | |
1007 | - | |
1008 | - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
1009 | - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] | |
1010 | - | |
1011 | - { stacktrace: | |
1012 | - __netif_receive_skb_core+0x46d/0x990 | |
1013 | - __netif_receive_skb+0x18/0x60 | |
1014 | - netif_receive_skb_internal+0x23/0x90 | |
1015 | - napi_gro_receive+0xc8/0x100 | |
1016 | - ieee80211_deliver_skb+0xd6/0x270 [mac80211] | |
1017 | - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] | |
1018 | - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] | |
1019 | - ieee80211_rx+0x31d/0x900 [mac80211] | |
1020 | - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] | |
1021 | - iwl_rx_dispatch+0x8e/0xf0 [iwldvm] | |
1022 | - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] | |
1023 | - irq_thread_fn+0x20/0x50 | |
1024 | - irq_thread+0x11f/0x150 | |
1025 | - kthread+0xd2/0xf0 | |
1026 | - ret_from_fork+0x42/0x70 | |
1027 | - } hitcount: 85 len: 28884 | |
1028 | - { stacktrace: | |
1029 | - __netif_receive_skb_core+0x46d/0x990 | |
1030 | - __netif_receive_skb+0x18/0x60 | |
1031 | - netif_receive_skb_internal+0x23/0x90 | |
1032 | - napi_gro_complete+0xa4/0xe0 | |
1033 | - dev_gro_receive+0x23a/0x360 | |
1034 | - napi_gro_receive+0x30/0x100 | |
1035 | - ieee80211_deliver_skb+0xd6/0x270 [mac80211] | |
1036 | - ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] | |
1037 | - ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] | |
1038 | - ieee80211_rx+0x31d/0x900 [mac80211] | |
1039 | - iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] | |
1040 | - iwl_rx_dispatch+0x8e/0xf0 [iwldvm] | |
1041 | - iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] | |
1042 | - irq_thread_fn+0x20/0x50 | |
1043 | - irq_thread+0x11f/0x150 | |
1044 | - kthread+0xd2/0xf0 | |
1045 | - } hitcount: 98 len: 664329 | |
1046 | - { stacktrace: | |
1047 | - __netif_receive_skb_core+0x46d/0x990 | |
1048 | - __netif_receive_skb+0x18/0x60 | |
1049 | - process_backlog+0xa8/0x150 | |
1050 | - net_rx_action+0x15d/0x340 | |
1051 | - __do_softirq+0x114/0x2c0 | |
1052 | - do_softirq_own_stack+0x1c/0x30 | |
1053 | - do_softirq+0x65/0x70 | |
1054 | - __local_bh_enable_ip+0xb5/0xc0 | |
1055 | - ip_finish_output+0x1f4/0x840 | |
1056 | - ip_output+0x6b/0xc0 | |
1057 | - ip_local_out_sk+0x31/0x40 | |
1058 | - ip_send_skb+0x1a/0x50 | |
1059 | - udp_send_skb+0x173/0x2a0 | |
1060 | - udp_sendmsg+0x2bf/0x9f0 | |
1061 | - inet_sendmsg+0x64/0xa0 | |
1062 | - sock_sendmsg+0x3d/0x50 | |
1063 | - } hitcount: 115 len: 13030 | |
1064 | - { stacktrace: | |
1065 | - __netif_receive_skb_core+0x46d/0x990 | |
1066 | - __netif_receive_skb+0x18/0x60 | |
1067 | - netif_receive_skb_internal+0x23/0x90 | |
1068 | - napi_gro_complete+0xa4/0xe0 | |
1069 | - napi_gro_flush+0x6d/0x90 | |
1070 | - iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi] | |
1071 | - irq_thread_fn+0x20/0x50 | |
1072 | - irq_thread+0x11f/0x150 | |
1073 | - kthread+0xd2/0xf0 | |
1074 | - ret_from_fork+0x42/0x70 | |
1075 | - } hitcount: 934 len: 5512212 | |
1076 | - | |
1077 | - Totals: | |
1078 | - Hits: 1232 | |
1079 | - Entries: 4 | |
1080 | - Dropped: 0 | |
1081 | - | |
1082 | - The above shows all the netif_receive_skb callpaths and their total | |
1083 | - lengths for the duration of the wget command. | |
1084 | - | |
1085 | - The 'clear' hist trigger param can be used to clear the hash table. | |
1086 | - Suppose we wanted to try another run of the previous example but | |
1087 | - this time also wanted to see the complete list of events that went | |
1088 | - into the histogram. In order to avoid having to set everything up | |
1089 | - again, we can just clear the histogram first: | |
1090 | - | |
1091 | - # echo 'hist:key=stacktrace:vals=len:clear' >> \ | |
1092 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1093 | - | |
1094 | - Just to verify that it is in fact cleared, here's what we now see in | |
1095 | - the hist file: | |
1096 | - | |
1097 | - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
1098 | - # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] | |
1099 | - | |
1100 | - Totals: | |
1101 | - Hits: 0 | |
1102 | - Entries: 0 | |
1103 | - Dropped: 0 | |
1104 | - | |
1105 | - Since we want to see the detailed list of every netif_receive_skb | |
1106 | - event occurring during the new run, which are in fact the same | |
1107 | - events being aggregated into the hash table, we add some additional | |
1108 | - 'enable_event' and 'disable_event' triggers to the same | |
1109 | - sched_process_exec and sched_process_exit events, as follows: | |
1110 | - | |
1111 | - # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \ | |
1112 | - /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
1113 | - | |
1114 | - # echo 'disable_event:net:netif_receive_skb if comm==wget' > \ | |
1115 | - /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
1116 | - | |
1117 | - If you read the trigger files for the sched_process_exec and | |
1118 | - sched_process_exit triggers, you should see two triggers for each: | |
1119 | - one enabling/disabling the hist aggregation and the other | |
1120 | - enabling/disabling the logging of events: | |
1121 | - | |
1122 | - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
1123 | - enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget | |
1124 | - enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget | |
1125 | - | |
1126 | - # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
1127 | - disable_event:net:netif_receive_skb:unlimited if comm==wget | |
1128 | - disable_hist:net:netif_receive_skb:unlimited if comm==wget | |
1129 | - | |
1130 | - In other words, whenever either of the sched_process_exec or | |
1131 | - sched_process_exit events is hit and matches 'wget', it enables or | |
1132 | - disables both the histogram and the event log, and what you end up | |
1133 | - with is a hash table and set of events just covering the specified | |
1134 | - duration. Run the wget command again: | |
1135 | - | |
1136 | - $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz | |
1137 | - | |
1138 | - Displaying the 'hist' file should show something similar to what you | |
1139 | - saw in the last run, but this time you should also see the | |
1140 | - individual events in the trace file: | |
1141 | - | |
1142 | - # cat /sys/kernel/debug/tracing/trace | |
1143 | - | |
1144 | - # tracer: nop | |
1145 | - # | |
1146 | - # entries-in-buffer/entries-written: 183/1426 #P:4 | |
1147 | - # | |
1148 | - # _-----=> irqs-off | |
1149 | - # / _----=> need-resched | |
1150 | - # | / _---=> hardirq/softirq | |
1151 | - # || / _--=> preempt-depth | |
1152 | - # ||| / delay | |
1153 | - # TASK-PID CPU# |||| TIMESTAMP FUNCTION | |
1154 | - # | | | |||| | | | |
1155 | - wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60 | |
1156 | - wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60 | |
1157 | - dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130 | |
1158 | - dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138 | |
1159 | - ##### CPU 2 buffer started #### | |
1160 | - irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948 | |
1161 | - irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500 | |
1162 | - irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948 | |
1163 | - irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948 | |
1164 | - irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500 | |
1165 | - . | |
1166 | - . | |
1167 | - . | |
1168 | - | |
1169 | - The following example demonstrates how multiple hist triggers can be | |
1170 | - attached to a given event. This capability can be useful for | |
1171 | - creating a set of different summaries derived from the same set of | |
1172 | - events, or for comparing the effects of different filters, among | |
1173 | - other things. | |
1174 | - | |
1175 | - # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \ | |
1176 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1177 | - # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \ | |
1178 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1179 | - # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \ | |
1180 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1181 | - # echo 'hist:keys=skbaddr.hex:vals=len' >> \ | |
1182 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1183 | - # echo 'hist:keys=len:vals=common_preempt_count' >> \ | |
1184 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1185 | - | |
1186 | - The above set of commands creates four triggers differing only in | |
1187 | - their filters, along with a completely different though fairly | |
1188 | - nonsensical trigger. Note that in order to append multiple hist | |
1189 | - triggers to the same file, you should use the '>>' operator to | |
1190 | - append them ('>' will also add the new hist trigger, but will remove | |
1191 | - any existing hist triggers beforehand). | |
1192 | - | |
1193 | - Displaying the contents of the 'hist' file for the event shows the | |
1194 | - contents of all five histograms: | |
1195 | - | |
1196 | - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
1197 | - | |
1198 | - # event histogram | |
1199 | - # | |
1200 | - # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active] | |
1201 | - # | |
1202 | - | |
1203 | - { len: 176 } hitcount: 1 common_preempt_count: 0 | |
1204 | - { len: 223 } hitcount: 1 common_preempt_count: 0 | |
1205 | - { len: 4854 } hitcount: 1 common_preempt_count: 0 | |
1206 | - { len: 395 } hitcount: 1 common_preempt_count: 0 | |
1207 | - { len: 177 } hitcount: 1 common_preempt_count: 0 | |
1208 | - { len: 446 } hitcount: 1 common_preempt_count: 0 | |
1209 | - { len: 1601 } hitcount: 1 common_preempt_count: 0 | |
1210 | - . | |
1211 | - . | |
1212 | - . | |
1213 | - { len: 1280 } hitcount: 66 common_preempt_count: 0 | |
1214 | - { len: 116 } hitcount: 81 common_preempt_count: 40 | |
1215 | - { len: 708 } hitcount: 112 common_preempt_count: 0 | |
1216 | - { len: 46 } hitcount: 221 common_preempt_count: 0 | |
1217 | - { len: 1264 } hitcount: 458 common_preempt_count: 0 | |
1218 | - | |
1219 | - Totals: | |
1220 | - Hits: 1428 | |
1221 | - Entries: 147 | |
1222 | - Dropped: 0 | |
1223 | - | |
1224 | - | |
1225 | - # event histogram | |
1226 | - # | |
1227 | - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
1228 | - # | |
1229 | - | |
1230 | - { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130 | |
1231 | - { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280 | |
1232 | - { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280 | |
1233 | - { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115 | |
1234 | - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115 | |
1235 | - { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46 | |
1236 | - { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118 | |
1237 | - { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60 | |
1238 | - { skbaddr: ffff880100065900 } hitcount: 1 len: 46 | |
1239 | - { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116 | |
1240 | - { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280 | |
1241 | - { skbaddr: ffff880100064700 } hitcount: 1 len: 365 | |
1242 | - { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60 | |
1243 | - . | |
1244 | - . | |
1245 | - . | |
1246 | - { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677 | |
1247 | - { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052 | |
1248 | - { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589 | |
1249 | - { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326 | |
1250 | - { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678 | |
1251 | - { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678 | |
1252 | - { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589 | |
1253 | - { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307 | |
1254 | - { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032 | |
1255 | - | |
1256 | - Totals: | |
1257 | - Hits: 1451 | |
1258 | - Entries: 318 | |
1259 | - Dropped: 0 | |
1260 | - | |
1261 | - | |
1262 | - # event histogram | |
1263 | - # | |
1264 | - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active] | |
1265 | - # | |
1266 | - | |
1267 | - | |
1268 | - Totals: | |
1269 | - Hits: 0 | |
1270 | - Entries: 0 | |
1271 | - Dropped: 0 | |
1272 | - | |
1273 | - | |
1274 | - # event histogram | |
1275 | - # | |
1276 | - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active] | |
1277 | - # | |
1278 | - | |
1279 | - { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212 | |
1280 | - { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212 | |
1281 | - { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212 | |
1282 | - { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492 | |
1283 | - { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212 | |
1284 | - { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212 | |
1285 | - { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854 | |
1286 | - { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636 | |
1287 | - { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924 | |
1288 | - { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356 | |
1289 | - { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420 | |
1290 | - { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996 | |
1291 | - | |
1292 | - Totals: | |
1293 | - Hits: 14 | |
1294 | - Entries: 12 | |
1295 | - Dropped: 0 | |
1296 | - | |
1297 | - | |
1298 | - # event histogram | |
1299 | - # | |
1300 | - # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active] | |
1301 | - # | |
1302 | - | |
1303 | - | |
1304 | - Totals: | |
1305 | - Hits: 0 | |
1306 | - Entries: 0 | |
1307 | - Dropped: 0 | |
1308 | - | |
1309 | - Named triggers can be used to have triggers share a common set of | |
1310 | - histogram data. This capability is mostly useful for combining the | |
1311 | - output of events generated by tracepoints contained inside inline | |
1312 | - functions, but names can be used in a hist trigger on any event. | |
1313 | - For example, these two triggers when hit will update the same 'len' | |
1314 | - field in the shared 'foo' histogram data: | |
1315 | - | |
1316 | - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ | |
1317 | - /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1318 | - # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ | |
1319 | - /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
1320 | - | |
1321 | - You can see that they're updating common histogram data by reading | |
1322 | - each event's hist files at the same time: | |
1323 | - | |
1324 | - # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist; | |
1325 | - cat /sys/kernel/debug/tracing/events/net/netif_rx/hist | |
1326 | - | |
1327 | - # event histogram | |
1328 | - # | |
1329 | - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
1330 | - # | |
1331 | - | |
1332 | - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 | |
1333 | - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 | |
1334 | - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 | |
1335 | - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 | |
1336 | - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 | |
1337 | - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 | |
1338 | - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 | |
1339 | - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 | |
1340 | - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 | |
1341 | - { skbaddr: ffff880064505000 } hitcount: 1 len: 46 | |
1342 | - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 | |
1343 | - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 | |
1344 | - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 | |
1345 | - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 | |
1346 | - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 | |
1347 | - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 | |
1348 | - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 | |
1349 | - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 | |
1350 | - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 | |
1351 | - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 | |
1352 | - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 | |
1353 | - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 | |
1354 | - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 | |
1355 | - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 | |
1356 | - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 | |
1357 | - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 | |
1358 | - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 | |
1359 | - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 | |
1360 | - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 | |
1361 | - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 | |
1362 | - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 | |
1363 | - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 | |
1364 | - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 | |
1365 | - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 | |
1366 | - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 | |
1367 | - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 | |
1368 | - { skbaddr: ffff880064504400 } hitcount: 4 len: 184 | |
1369 | - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 | |
1370 | - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 | |
1371 | - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 | |
1372 | - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 | |
1373 | - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 | |
1374 | - | |
1375 | - Totals: | |
1376 | - Hits: 81 | |
1377 | - Entries: 42 | |
1378 | - Dropped: 0 | |
1379 | - # event histogram | |
1380 | - # | |
1381 | - # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
1382 | - # | |
1383 | - | |
1384 | - { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 | |
1385 | - { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 | |
1386 | - { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 | |
1387 | - { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 | |
1388 | - { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 | |
1389 | - { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 | |
1390 | - { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 | |
1391 | - { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 | |
1392 | - { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 | |
1393 | - { skbaddr: ffff880064505000 } hitcount: 1 len: 46 | |
1394 | - { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 | |
1395 | - { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 | |
1396 | - { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 | |
1397 | - { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 | |
1398 | - { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 | |
1399 | - { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 | |
1400 | - { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 | |
1401 | - { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 | |
1402 | - { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 | |
1403 | - { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 | |
1404 | - { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 | |
1405 | - { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 | |
1406 | - { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 | |
1407 | - { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 | |
1408 | - { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 | |
1409 | - { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 | |
1410 | - { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 | |
1411 | - { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 | |
1412 | - { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 | |
1413 | - { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 | |
1414 | - { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 | |
1415 | - { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 | |
1416 | - { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 | |
1417 | - { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 | |
1418 | - { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 | |
1419 | - { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 | |
1420 | - { skbaddr: ffff880064504400 } hitcount: 4 len: 184 | |
1421 | - { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 | |
1422 | - { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 | |
1423 | - { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 | |
1424 | - { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 | |
1425 | - { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 | |
1426 | - | |
1427 | - Totals: | |
1428 | - Hits: 81 | |
1429 | - Entries: 42 | |
1430 | - Dropped: 0 | |
1431 | - | |
1432 | - And here's an example that shows how to combine histogram data from | |
1433 | - any two events even if they don't share any 'compatible' fields | |
1434 | - other than 'hitcount' and 'stacktrace'. These commands create a | |
1435 | - couple of triggers named 'bar' using those fields: | |
1436 | - | |
1437 | - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ | |
1438 | - /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
1439 | - # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ | |
1440 | - /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
1441 | - | |
1442 | - And displaying either one shows some interesting if | |
1443 | - somewhat confusing output: | |
1444 | - | |
1445 | - # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist | |
1446 | - # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist | |
1447 | - | |
1448 | - # event histogram | |
1449 | - # | |
1450 | - # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] | |
1451 | - # | |
1452 | - | |
1453 | - { stacktrace: | |
1454 | - _do_fork+0x18e/0x330 | |
1455 | - kernel_thread+0x29/0x30 | |
1456 | - kthreadd+0x154/0x1b0 | |
1457 | - ret_from_fork+0x3f/0x70 | |
1458 | - } hitcount: 1 | |
1459 | - { stacktrace: | |
1460 | - netif_rx_internal+0xb2/0xd0 | |
1461 | - netif_rx_ni+0x20/0x70 | |
1462 | - dev_loopback_xmit+0xaa/0xd0 | |
1463 | - ip_mc_output+0x126/0x240 | |
1464 | - ip_local_out_sk+0x31/0x40 | |
1465 | - igmp_send_report+0x1e9/0x230 | |
1466 | - igmp_timer_expire+0xe9/0x120 | |
1467 | - call_timer_fn+0x39/0xf0 | |
1468 | - run_timer_softirq+0x1e1/0x290 | |
1469 | - __do_softirq+0xfd/0x290 | |
1470 | - irq_exit+0x98/0xb0 | |
1471 | - smp_apic_timer_interrupt+0x4a/0x60 | |
1472 | - apic_timer_interrupt+0x6d/0x80 | |
1473 | - cpuidle_enter+0x17/0x20 | |
1474 | - call_cpuidle+0x3b/0x60 | |
1475 | - cpu_startup_entry+0x22d/0x310 | |
1476 | - } hitcount: 1 | |
1477 | - { stacktrace: | |
1478 | - netif_rx_internal+0xb2/0xd0 | |
1479 | - netif_rx_ni+0x20/0x70 | |
1480 | - dev_loopback_xmit+0xaa/0xd0 | |
1481 | - ip_mc_output+0x17f/0x240 | |
1482 | - ip_local_out_sk+0x31/0x40 | |
1483 | - ip_send_skb+0x1a/0x50 | |
1484 | - udp_send_skb+0x13e/0x270 | |
1485 | - udp_sendmsg+0x2bf/0x980 | |
1486 | - inet_sendmsg+0x67/0xa0 | |
1487 | - sock_sendmsg+0x38/0x50 | |
1488 | - SYSC_sendto+0xef/0x170 | |
1489 | - SyS_sendto+0xe/0x10 | |
1490 | - entry_SYSCALL_64_fastpath+0x12/0x6a | |
1491 | - } hitcount: 2 | |
1492 | - { stacktrace: | |
1493 | - netif_rx_internal+0xb2/0xd0 | |
1494 | - netif_rx+0x1c/0x60 | |
1495 | - loopback_xmit+0x6c/0xb0 | |
1496 | - dev_hard_start_xmit+0x219/0x3a0 | |
1497 | - __dev_queue_xmit+0x415/0x4f0 | |
1498 | - dev_queue_xmit_sk+0x13/0x20 | |
1499 | - ip_finish_output2+0x237/0x340 | |
1500 | - ip_finish_output+0x113/0x1d0 | |
1501 | - ip_output+0x66/0xc0 | |
1502 | - ip_local_out_sk+0x31/0x40 | |
1503 | - ip_send_skb+0x1a/0x50 | |
1504 | - udp_send_skb+0x16d/0x270 | |
1505 | - udp_sendmsg+0x2bf/0x980 | |
1506 | - inet_sendmsg+0x67/0xa0 | |
1507 | - sock_sendmsg+0x38/0x50 | |
1508 | - ___sys_sendmsg+0x14e/0x270 | |
1509 | - } hitcount: 76 | |
1510 | - { stacktrace: | |
1511 | - netif_rx_internal+0xb2/0xd0 | |
1512 | - netif_rx+0x1c/0x60 | |
1513 | - loopback_xmit+0x6c/0xb0 | |
1514 | - dev_hard_start_xmit+0x219/0x3a0 | |
1515 | - __dev_queue_xmit+0x415/0x4f0 | |
1516 | - dev_queue_xmit_sk+0x13/0x20 | |
1517 | - ip_finish_output2+0x237/0x340 | |
1518 | - ip_finish_output+0x113/0x1d0 | |
1519 | - ip_output+0x66/0xc0 | |
1520 | - ip_local_out_sk+0x31/0x40 | |
1521 | - ip_send_skb+0x1a/0x50 | |
1522 | - udp_send_skb+0x16d/0x270 | |
1523 | - udp_sendmsg+0x2bf/0x980 | |
1524 | - inet_sendmsg+0x67/0xa0 | |
1525 | - sock_sendmsg+0x38/0x50 | |
1526 | - ___sys_sendmsg+0x269/0x270 | |
1527 | - } hitcount: 77 | |
1528 | - { stacktrace: | |
1529 | - netif_rx_internal+0xb2/0xd0 | |
1530 | - netif_rx+0x1c/0x60 | |
1531 | - loopback_xmit+0x6c/0xb0 | |
1532 | - dev_hard_start_xmit+0x219/0x3a0 | |
1533 | - __dev_queue_xmit+0x415/0x4f0 | |
1534 | - dev_queue_xmit_sk+0x13/0x20 | |
1535 | - ip_finish_output2+0x237/0x340 | |
1536 | - ip_finish_output+0x113/0x1d0 | |
1537 | - ip_output+0x66/0xc0 | |
1538 | - ip_local_out_sk+0x31/0x40 | |
1539 | - ip_send_skb+0x1a/0x50 | |
1540 | - udp_send_skb+0x16d/0x270 | |
1541 | - udp_sendmsg+0x2bf/0x980 | |
1542 | - inet_sendmsg+0x67/0xa0 | |
1543 | - sock_sendmsg+0x38/0x50 | |
1544 | - SYSC_sendto+0xef/0x170 | |
1545 | - } hitcount: 88 | |
1546 | - { stacktrace: | |
1547 | - _do_fork+0x18e/0x330 | |
1548 | - SyS_clone+0x19/0x20 | |
1549 | - entry_SYSCALL_64_fastpath+0x12/0x6a | |
1550 | - } hitcount: 244 | |
1551 | - | |
1552 | - Totals: | |
1553 | - Hits: 489 | |
1554 | - Entries: 7 | |
1555 | - Dropped: 0 | |
1556 | + See Documentation/trace/histogram.txt for details and examples. | |
b3bbd485 JK |
1557 | diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt |
1558 | index d4601df6e72e..54213e5c23f6 100644 | |
1559 | --- a/Documentation/trace/ftrace.txt | |
1560 | +++ b/Documentation/trace/ftrace.txt | |
1561 | @@ -539,6 +539,30 @@ of ftrace. Here is a list of some of the key files: | |
e4b2b4a8 JK |
1562 | |
1563 | See events.txt for more information. | |
1564 | ||
1565 | + timestamp_mode: | |
1566 | + | |
1567 | + Certain tracers may change the timestamp mode used when | |
1568 | + logging trace events into the event buffer. Events with | |
1569 | + different modes can coexist within a buffer, but the mode in | |
1570 | + effect when an event is logged determines which timestamp mode | |
1571 | + is used for that event. The default timestamp mode is | |
1572 | + 'delta'. | |
1573 | + | |
1574 | + Usual timestamp modes for tracing: | |
1575 | + | |
1576 | + # cat timestamp_mode | |
1577 | + [delta] absolute | |
1578 | + | |
1579 | + The timestamp mode with the square brackets around it is the | |
1580 | + one in effect. | |
1581 | + | |
1582 | + delta: Default timestamp mode - timestamp is a delta against | |
1583 | + a per-buffer timestamp. | |
1584 | + | |
1585 | + absolute: The timestamp is a full timestamp, not a delta | |
1586 | + against some other value. As such it takes up more | |
1587 | + space and is less efficient. | |
1588 | + | |
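| + As a hedged illustration, attaching a hist trigger that | |
| + references the special 'common_timestamp' field (see | |
| + Documentation/trace/histogram.txt) requires absolute | |
| + timestamps and so switches the mode: | |
| + | |
| + # echo 'hist:keys=common_timestamp.usecs' >> \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger | |
| + | |
| + While such a trigger exists, reading this file should show | |
| + 'absolute' as the mode in effect. | |
| + | |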
1589 | hwlat_detector: | |
1590 | ||
1591 | Directory for the Hardware Latency Detector. | |
b3bbd485 JK |
1592 | diff --git a/Documentation/trace/histogram.txt b/Documentation/trace/histogram.txt |
1593 | new file mode 100644 | |
1594 | index 000000000000..6e05510afc28 | |
1595 | --- /dev/null | |
1596 | +++ b/Documentation/trace/histogram.txt | |
e4b2b4a8 JK |
1597 | @@ -0,0 +1,1995 @@ |
1598 | + Event Histograms | |
1599 | + | |
1600 | + Documentation written by Tom Zanussi | |
1601 | + | |
1602 | +1. Introduction | |
1603 | +=============== | |
1604 | + | |
1605 | + Histogram triggers are special event triggers that can be used to | |
1606 | + aggregate trace event data into histograms. For information on | |
1607 | + trace events and event triggers, see Documentation/trace/events.txt. | |
1608 | + | |
1609 | + | |
1610 | +2. Histogram Trigger Command | |
1611 | +============================ | |
1612 | + | |
1613 | + A histogram trigger command is an event trigger command that | |
1614 | + aggregates event hits into a hash table keyed on one or more trace | |
1615 | + event format fields (or stacktrace) and a set of running totals | |
1616 | + derived from one or more trace event format fields and/or event | |
1617 | + counts (hitcount). | |
1618 | + | |
1619 | + The format of a hist trigger is as follows: | |
1620 | + | |
1621 | + hist:keys=<field1[,field2,...]>[:values=<field1[,field2,...]>] | |
1622 | + [:sort=<field1[,field2,...]>][:size=#entries][:pause][:continue] | |
1623 | + [:clear][:name=histname1] [if <filter>] | |
1624 | + | |
1625 | + When a matching event is hit, an entry is added to a hash table | |
1626 | + using the key(s) and value(s) named. Keys and values correspond to | |
1627 | + fields in the event's format description. Values must correspond to | |
1628 | + numeric fields - on an event hit, the value(s) will be added to a | |
1629 | + sum kept for that field. The special string 'hitcount' can be used | |
1630 | + in place of an explicit value field - this is simply a count of | |
1631 | + event hits. If 'values' isn't specified, an implicit 'hitcount' | |
1632 | + value will be automatically created and used as the only value. | |
1633 | + Keys can be any field, or the special string 'stacktrace', which | |
1634 | + will use the event's kernel stacktrace as the key. The keywords | |
1635 | + 'keys' or 'key' can be used to specify keys, and the keywords | |
1636 | + 'values', 'vals', or 'val' can be used to specify values. Compound | |
1637 | + keys consisting of up to two fields can be specified by the 'keys' | |
1638 | + keyword. Hashing a compound key produces a unique entry in the | |
1639 | + table for each unique combination of component keys, and can be | |
1640 | + useful for providing more fine-grained summaries of event data. | |
1641 | + Additionally, sort keys consisting of up to two fields can be | |
1642 | + specified by the 'sort' keyword. If more than one field is | |
1643 | + specified, the result will be a 'sort within a sort': the first key | |
1644 | + is taken to be the primary sort key and the second the secondary | |
1645 | + key. If a hist trigger is given a name using the 'name' parameter, | |
1646 | + its histogram data will be shared with other triggers of the same | |
1647 | + name, and trigger hits will update this common data. Only triggers | |
1648 | + with 'compatible' fields can be combined in this way; triggers are | |
1649 | + 'compatible' if the fields named in the triggers share the same | |
1650 | + number and type, and those fields also have the same names. | |
1651 | + Note that any two events always share the compatible 'hitcount' and | |
1652 | + 'stacktrace' fields and can therefore be combined using those | |
1653 | + fields, however pointless that may be. | |
1654 | + | |
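| + For example, the following sketch (borrowing the 'pid' and | |
| + 'prio' fields of the sched_wakeup event) combines a compound | |
| + key with primary and secondary sort keys: | |
| + | |
| + # echo 'hist:keys=pid,prio:vals=hitcount:sort=prio,pid' > \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger | |
| + | |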
1655 | + 'hist' triggers add a 'hist' file to each event's subdirectory. | |
1656 | + Reading the 'hist' file for the event will dump the hash table in | |
1657 | + its entirety to stdout. If there are multiple hist triggers | |
1658 | + attached to an event, there will be a table for each trigger in the | |
1659 | + output. The table displayed for a named trigger will be the same as | |
1660 | + any other instance having the same name. Each printed hash table | |
1661 | + entry is a simple list of the keys and values comprising the entry; | |
1662 | + keys are printed first and are delineated by curly braces, and are | |
1663 | + followed by the set of value fields for the entry. By default, | |
1664 | + numeric fields are displayed as base-10 integers. This can be | |
1665 | + modified by appending any of the following modifiers to the field | |
1666 | + name: | |
1667 | + | |
1668 | + .hex display a number as a hex value | |
1669 | + .sym display an address as a symbol | |
1670 | + .sym-offset display an address as a symbol and offset | |
1671 | + .syscall display a syscall id as a system call name | |
1672 | + .execname display a common_pid as a program name | |
1673 | + .log2 display log2 value rather than raw number | |
1674 | + .usecs display a common_timestamp in microseconds | |
1675 | + | |
1676 | + Note that in general the semantics of a given field aren't | |
1677 | + interpreted when applying a modifier to it, but there are some | |
1678 | + restrictions to be aware of in this regard: | |
1679 | + | |
1680 | + - only the 'hex' modifier can be used for values (because values | |
1681 | + are essentially sums, and the other modifiers don't make sense | |
1682 | + in that context). | |
1683 | + - the 'execname' modifier can only be used on a 'common_pid'. The | |
1684 | + reason for this is that the execname is simply the 'comm' value | |
1685 | + saved for the 'current' process when an event was triggered, | |
1686 | + which is the same as the common_pid value saved by the event | |
1687 | + tracing code. Trying to apply that comm value to other pid | |
1688 | + values wouldn't be correct; typically, events that care about | |
1689 | + this save their own pid-specific comm fields in the event itself. | |
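| + | |
| + For example, a trigger keyed on 'common_pid.execname' (attached | |
| + here to the sched_wakeup event, though any event would do) | |
| + displays a program name for each pid rather than a raw number: | |
| + | |
| + # echo 'hist:key=common_pid.execname' > \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger | |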
1690 | + | |
1691 | + A typical usage scenario would be the following to enable a hist | |
1692 | + trigger, read its current contents, and then turn it off: | |
1693 | + | |
1694 | + # echo 'hist:keys=skbaddr.hex:vals=len' > \ | |
1695 | + /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
1696 | + | |
1697 | + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist | |
1698 | + | |
1699 | + # echo '!hist:keys=skbaddr.hex:vals=len' > \ | |
1700 | + /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
1701 | + | |
1702 | + The trigger file itself can be read to show the details of the | |
1703 | + currently attached hist trigger. This information is also displayed | |
1704 | + at the top of the 'hist' file when read. | |
1705 | + | |
1706 | + By default, the size of the hash table is 2048 entries. The 'size' | |
1707 | + parameter can be used to specify more or fewer than that. The units | |
1708 | + are in terms of hashtable entries - if a run uses more entries than | |
1709 | + specified, the results will show the number of 'drops', the number | |
1710 | + of hits that were ignored. The size should be a power of 2 between | |
1711 | + 128 and 131072 (any non-power-of-2 number specified will be rounded | |
1712 | + up). | |
1713 | + | |
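| + For example, a minimal sketch that doubles the default table | |
| + size for a kmalloc trigger: | |
| + | |
| + # echo 'hist:keys=call_site:vals=bytes_req:size=4096' > \ | |
| + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
| + | |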
1714 | + The 'sort' parameter can be used to specify a value field to sort | |
1715 | + on. The default if unspecified is 'hitcount' and the default sort | |
1716 | + order is 'ascending'. To sort in the opposite direction, append | |
1717 | + '.descending' to the sort key. | |
1718 | + | |
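| + For example, to see the largest totals first (a sketch; the | |
| + examples below show actual output for a similar trigger): | |
| + | |
| + # echo 'hist:keys=call_site:vals=bytes_req:sort=bytes_req.descending' > \ | |
| + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
| + | |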
1719 | + The 'pause' parameter can be used to pause an existing hist trigger | |
1720 | + or to start a hist trigger but not log any events until told to do | |
1721 | + so. 'continue' or 'cont' can be used to start or restart a paused | |
1722 | + hist trigger. | |
1723 | + | |
1724 | + The 'clear' parameter will clear the contents of a running hist | |
1725 | + trigger and leave its current paused/active state unchanged. | |
1726 | + | |
1727 | + Note that the 'pause', 'cont', and 'clear' parameters should be | |
1728 | + applied to an existing trigger using the 'append' shell operator | |
1729 | + ('>>') rather than via the '>' operator, which would cause the | |
1730 | + trigger to be removed through truncation. | |
1731 | + | |
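| + For example, assuming a 'hist:keys=call_site:vals=bytes_req' | |
| + trigger is already attached to the kmalloc event, it can be | |
| + paused and later resumed without being removed: | |
| + | |
| + # echo 'hist:keys=call_site:vals=bytes_req:pause' >> \ | |
| + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
| + | |
| + # echo 'hist:keys=call_site:vals=bytes_req:cont' >> \ | |
| + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
| + | |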
1732 | +- enable_hist/disable_hist | |
1733 | + | |
1734 | + The enable_hist and disable_hist triggers can be used to have one | |
1735 | + event conditionally start and stop another event's already-attached | |
1736 | + hist trigger. Any number of enable_hist and disable_hist triggers | |
1737 | + can be attached to a given event, allowing that event to kick off | |
1738 | + and stop aggregations on a host of other events. | |
1739 | + | |
1740 | + The format is very similar to the enable/disable_event triggers: | |
1741 | + | |
1742 | + enable_hist:<system>:<event>[:count] | |
1743 | + disable_hist:<system>:<event>[:count] | |
1744 | + | |
1745 | + Instead of enabling or disabling the tracing of the target event | |
1746 | + into the trace buffer as the enable/disable_event triggers do, the | |
1747 | + enable/disable_hist triggers enable or disable the aggregation of | |
1748 | + the target event into a hash table. | |
1749 | + | |
1750 | + A typical usage scenario for the enable_hist/disable_hist triggers | |
1751 | + would be to first set up a paused hist trigger on some event, | |
1752 | + followed by an enable_hist/disable_hist pair that turns the hist | |
1753 | + aggregation on and off when conditions of interest are hit: | |
1754 | + | |
1755 | + # echo 'hist:keys=skbaddr.hex:vals=len:pause' > \ | |
1756 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
1757 | + | |
1758 | + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ | |
1759 | + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
1760 | + | |
1761 | + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ | |
1762 | + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
1763 | + | |
1764 | + The above sets up an initially paused hist trigger which is unpaused | |
1765 | + and starts aggregating events when a given program is executed, and | |
1766 | + which stops aggregating when the process exits and the hist trigger | |
1767 | + is paused again. | |
1768 | + | |
1769 | + The examples below provide a more concrete illustration of the | |
1770 | + concepts and typical usage patterns discussed above. | |
1771 | + | |
1772 | + 'special' event fields | |
1773 | + ------------------------ | |
1774 | + | |
1775 | + There are a number of 'special event fields' available for use as | |
1776 | + keys or values in a hist trigger. These look like and behave as if | |
1777 | + they were actual event fields, but aren't really part of the event's | |
1778 | + field definition or format file. They are however available for any | |
1779 | + event, and can be used anywhere an actual event field could be. | |
1780 | + They are: | |
1781 | + | |
1782 | + common_timestamp u64 - timestamp (from ring buffer) associated | |
1783 | + with the event, in nanoseconds. May be | |
1784 | + modified by .usecs to have timestamps | |
1785 | + interpreted as microseconds. | |
1786 | + cpu int - the cpu on which the event occurred. | |
1787 | + | |
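| + Either special field can be used wherever an actual event field | |
| + could be; for example, this sketch counts event hits per cpu: | |
| + | |
| + # echo 'hist:keys=cpu:vals=hitcount' > \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger | |
| + | |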
1788 | + Extended error information | |
1789 | + -------------------------- | |
1790 | + | |
1791 | + For some error conditions encountered when invoking a hist trigger | |
1792 | + command, extended error information is available via the | |
1793 | + corresponding event's 'hist' file. Reading the hist file after an | |
1794 | + error will display more detailed information about what went wrong, | |
1795 | + if information is available. This extended error information will | |
1796 | + be available until the next hist trigger command for that event. | |
1797 | + | |
1798 | + If available for a given error condition, the extended error | |
1799 | + information and usage takes the following form: | |
1800 | + | |
1801 | + # echo xxx > /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger | |
1802 | + echo: write error: Invalid argument | |
1803 | + | |
1804 | + # cat /sys/kernel/debug/tracing/events/sched/sched_wakeup/hist | |
1805 | + ERROR: Couldn't yyy: zzz | |
1806 | + Last command: xxx | |
1807 | + | |
1808 | +3. 'hist' trigger examples | |
1809 | +========================== | |
1810 | + | |
1811 | + The first set of examples creates aggregations using the kmalloc | |
1812 | + event. The fields that can be used for the hist trigger are listed | |
1813 | + in the kmalloc event's format file: | |
1814 | + | |
1815 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/format | |
1816 | + name: kmalloc | |
1817 | + ID: 374 | |
1818 | + format: | |
1819 | + field:unsigned short common_type; offset:0; size:2; signed:0; | |
1820 | + field:unsigned char common_flags; offset:2; size:1; signed:0; | |
1821 | + field:unsigned char common_preempt_count; offset:3; size:1; signed:0; | |
1822 | + field:int common_pid; offset:4; size:4; signed:1; | |
1823 | + | |
1824 | + field:unsigned long call_site; offset:8; size:8; signed:0; | |
1825 | + field:const void * ptr; offset:16; size:8; signed:0; | |
1826 | + field:size_t bytes_req; offset:24; size:8; signed:0; | |
1827 | + field:size_t bytes_alloc; offset:32; size:8; signed:0; | |
1828 | + field:gfp_t gfp_flags; offset:40; size:4; signed:0; | |
1829 | + | |
1830 | + We'll start by creating a hist trigger that generates a simple table | |
1831 | + that lists the total number of bytes requested for each function in | |
1832 | + the kernel that made one or more calls to kmalloc: | |
1833 | + | |
1834 | + # echo 'hist:key=call_site:val=bytes_req' > \ | |
1835 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
1836 | + | |
1837 | + This tells the tracing system to create a 'hist' trigger using the | |
1838 | + call_site field of the kmalloc event as the key for the table, which | |
1839 | + just means that each unique call_site address will have an entry | |
1840 | + created for it in the table. The 'val=bytes_req' parameter tells | |
1841 | + the hist trigger that for each unique entry (call_site) in the | |
1842 | + table, it should keep a running total of the number of bytes | |
1843 | + requested by that call_site. | |
1844 | + | |
1845 | + We'll let it run for a while and then dump the contents of the 'hist' | |
1846 | + file in the kmalloc event's subdirectory (for readability, a number | |
1847 | + of entries have been omitted): | |
1848 | + | |
1849 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
1850 | + # trigger info: hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] | |
1851 | + | |
1852 | + { call_site: 18446744072106379007 } hitcount: 1 bytes_req: 176 | |
1853 | + { call_site: 18446744071579557049 } hitcount: 1 bytes_req: 1024 | |
1854 | + { call_site: 18446744071580608289 } hitcount: 1 bytes_req: 16384 | |
1855 | + { call_site: 18446744071581827654 } hitcount: 1 bytes_req: 24 | |
1856 | + { call_site: 18446744071580700980 } hitcount: 1 bytes_req: 8 | |
1857 | + { call_site: 18446744071579359876 } hitcount: 1 bytes_req: 152 | |
1858 | + { call_site: 18446744071580795365 } hitcount: 3 bytes_req: 144 | |
1859 | + { call_site: 18446744071581303129 } hitcount: 3 bytes_req: 144 | |
1860 | + { call_site: 18446744071580713234 } hitcount: 4 bytes_req: 2560 | |
1861 | + { call_site: 18446744071580933750 } hitcount: 4 bytes_req: 736 | |
1862 | + . | |
1863 | + . | |
1864 | + . | |
1865 | + { call_site: 18446744072106047046 } hitcount: 69 bytes_req: 5576 | |
1866 | + { call_site: 18446744071582116407 } hitcount: 73 bytes_req: 2336 | |
1867 | + { call_site: 18446744072106054684 } hitcount: 136 bytes_req: 140504 | |
1868 | + { call_site: 18446744072106224230 } hitcount: 136 bytes_req: 19584 | |
1869 | + { call_site: 18446744072106078074 } hitcount: 153 bytes_req: 2448 | |
1870 | + { call_site: 18446744072106062406 } hitcount: 153 bytes_req: 36720 | |
1871 | + { call_site: 18446744071582507929 } hitcount: 153 bytes_req: 37088 | |
1872 | + { call_site: 18446744072102520590 } hitcount: 273 bytes_req: 10920 | |
1873 | + { call_site: 18446744071582143559 } hitcount: 358 bytes_req: 716 | |
1874 | + { call_site: 18446744072106465852 } hitcount: 417 bytes_req: 56712 | |
1875 | + { call_site: 18446744072102523378 } hitcount: 485 bytes_req: 27160 | |
1876 | + { call_site: 18446744072099568646 } hitcount: 1676 bytes_req: 33520 | |
1877 | + | |
1878 | + Totals: | |
1879 | + Hits: 4610 | |
1880 | + Entries: 45 | |
1881 | + Dropped: 0 | |
1882 | + | |
1883 | + The output displays a line for each entry, beginning with the key | |
1884 | + specified in the trigger, followed by the value(s) also specified in | |
1885 | + the trigger. At the beginning of the output is a line that displays | |
1886 | + the trigger info, which can also be displayed by reading the | |
1887 | + 'trigger' file: | |
1888 | + | |
1889 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
1890 | + hist:keys=call_site:vals=bytes_req:sort=hitcount:size=2048 [active] | |
1891 | + | |
1892 | + At the end of the output are a few lines that display the overall | |
1893 | + totals for the run. The 'Hits' field shows the total number of | |
1894 | + times the event trigger was hit, the 'Entries' field shows the total | |
1895 | + number of used entries in the hash table, and the 'Dropped' field | |
1896 | + shows the number of hits that were dropped because the number of | |
1897 | + used entries for the run exceeded the maximum number of entries | |
1898 | + allowed for the table (normally 0; if not, it's a hint that you may | |
1899 | + want to increase the size of the table using the 'size' parameter). | |
1900 | + | |
1901 | + Notice in the above output that there's an extra field, 'hitcount', | |
1902 | + which wasn't specified in the trigger. Also notice that in the | |
1903 | + trigger info output, there's a parameter, 'sort=hitcount', which | |
1904 | + wasn't specified in the trigger either. The reason for that is that | |
1905 | + every trigger implicitly keeps a count of the total number of hits | |
1906 | + attributed to a given entry, called the 'hitcount'. That hitcount | |
1907 | + information is explicitly displayed in the output, and in the | |
1908 | + absence of a user-specified sort parameter, is used as the default | |
1909 | + sort field. | |
1910 | + | |
1911 | + The value 'hitcount' can be used in place of an explicit value in | |
1912 | + the 'values' parameter if you don't really need to have any | |
1913 | + particular field summed and are mainly interested in hit | |
1914 | + frequencies. | |
1915 | + | |
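| + For example, to see only how often each call_site fired, without | |
| + summing any event field: | |
| + | |
| + # echo 'hist:key=call_site.hex:val=hitcount' > \ | |
| + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
| + | |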
1916 | + To turn the hist trigger off, simply call up the trigger in the | |
1917 | + command history and re-execute it with a '!' prepended: | |
1918 | + | |
1919 | + # echo '!hist:key=call_site:val=bytes_req' > \ | |
1920 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
1921 | + | |
1922 | + Finally, notice that the call_site as displayed in the output above | |
1923 | + isn't really very useful. It's an address, but normally addresses | |
1924 | + are displayed in hex. To have a numeric field displayed as a hex | |
1925 | + value, simply append '.hex' to the field name in the trigger: | |
1926 | + | |
1927 | + # echo 'hist:key=call_site.hex:val=bytes_req' > \ | |
1928 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
1929 | + | |
1930 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
1931 | + # trigger info: hist:keys=call_site.hex:vals=bytes_req:sort=hitcount:size=2048 [active] | |
1932 | + | |
1933 | + { call_site: ffffffffa026b291 } hitcount: 1 bytes_req: 433 | |
1934 | + { call_site: ffffffffa07186ff } hitcount: 1 bytes_req: 176 | |
1935 | + { call_site: ffffffff811ae721 } hitcount: 1 bytes_req: 16384 | |
1936 | + { call_site: ffffffff811c5134 } hitcount: 1 bytes_req: 8 | |
1937 | + { call_site: ffffffffa04a9ebb } hitcount: 1 bytes_req: 511 | |
1938 | + { call_site: ffffffff8122e0a6 } hitcount: 1 bytes_req: 12 | |
1939 | + { call_site: ffffffff8107da84 } hitcount: 1 bytes_req: 152 | |
1940 | + { call_site: ffffffff812d8246 } hitcount: 1 bytes_req: 24 | |
1941 | + { call_site: ffffffff811dc1e5 } hitcount: 3 bytes_req: 144 | |
1942 | + { call_site: ffffffffa02515e8 } hitcount: 3 bytes_req: 648 | |
1943 | + { call_site: ffffffff81258159 } hitcount: 3 bytes_req: 144 | |
1944 | + { call_site: ffffffff811c80f4 } hitcount: 4 bytes_req: 544 | |
1945 | + . | |
1946 | + . | |
1947 | + . | |
1948 | + { call_site: ffffffffa06c7646 } hitcount: 106 bytes_req: 8024 | |
1949 | + { call_site: ffffffffa06cb246 } hitcount: 132 bytes_req: 31680 | |
1950 | + { call_site: ffffffffa06cef7a } hitcount: 132 bytes_req: 2112 | |
1951 | + { call_site: ffffffff8137e399 } hitcount: 132 bytes_req: 23232 | |
1952 | + { call_site: ffffffffa06c941c } hitcount: 185 bytes_req: 171360 | |
1953 | + { call_site: ffffffffa06f2a66 } hitcount: 185 bytes_req: 26640 | |
1954 | + { call_site: ffffffffa036a70e } hitcount: 265 bytes_req: 10600 | |
1955 | + { call_site: ffffffff81325447 } hitcount: 292 bytes_req: 584 | |
1956 | + { call_site: ffffffffa072da3c } hitcount: 446 bytes_req: 60656 | |
1957 | + { call_site: ffffffffa036b1f2 } hitcount: 526 bytes_req: 29456 | |
1958 | + { call_site: ffffffffa0099c06 } hitcount: 1780 bytes_req: 35600 | |
1959 | + | |
1960 | + Totals: | |
1961 | + Hits: 4775 | |
1962 | + Entries: 46 | |
1963 | + Dropped: 0 | |
1964 | + | |
1965 | + Even that's only marginally more useful - while hex values do look | |
1966 | + more like addresses, what users are typically more interested in | |
1967 | + when looking at text addresses are the corresponding symbols | |
1968 | + instead. To have an address displayed as a symbolic value, | |
1969 | + simply append '.sym' or '.sym-offset' to the field name in the | |
1970 | + trigger: | |
1971 | + | |
1972 | + # echo 'hist:key=call_site.sym:val=bytes_req' > \ | |
1973 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
1974 | + | |
1975 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
1976 | + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=hitcount:size=2048 [active] | |
1977 | + | |
1978 | + { call_site: [ffffffff810adcb9] syslog_print_all } hitcount: 1 bytes_req: 1024 | |
1979 | + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 | |
1980 | + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 | |
1981 | + { call_site: [ffffffff8154acbe] usb_alloc_urb } hitcount: 1 bytes_req: 192 | |
1982 | + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 | |
1983 | + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 | |
1984 | + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 | |
1985 | + { call_site: [ffffffff811febd5] fsnotify_alloc_group } hitcount: 2 bytes_req: 528 | |
1986 | + { call_site: [ffffffff81440f58] __tty_buffer_request_room } hitcount: 2 bytes_req: 2624 | |
1987 | + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 2 bytes_req: 96 | |
1988 | + { call_site: [ffffffffa05e19af] ieee80211_start_tx_ba_session [mac80211] } hitcount: 2 bytes_req: 464 | |
1989 | + { call_site: [ffffffff81672406] tcp_get_metrics } hitcount: 2 bytes_req: 304 | |
1990 | + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 | |
1991 | + { call_site: [ffffffff81089b05] sched_create_group } hitcount: 2 bytes_req: 1424 | |
1992 | + . | |
1993 | + . | |
1994 | + . | |
1995 | + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1185 bytes_req: 123240 | |
1996 | + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 1185 bytes_req: 104280 | |
1997 | + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 1402 bytes_req: 190672 | |
1998 | + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 1518 bytes_req: 146208 | |
1999 | + { call_site: [ffffffffa029070e] drm_vma_node_allow [drm] } hitcount: 1746 bytes_req: 69840 | |
2000 | + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 2021 bytes_req: 792312 | |
2001 | + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 2592 bytes_req: 145152 | |
2002 | + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2629 bytes_req: 378576 | |
2003 | + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2629 bytes_req: 3783248 | |
2004 | + { call_site: [ffffffff81325607] apparmor_file_alloc_security } hitcount: 5192 bytes_req: 10384 | |
2005 | + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 5529 bytes_req: 110584 | |
2006 | + { call_site: [ffffffff8131ebf7] aa_alloc_task_context } hitcount: 21943 bytes_req: 702176 | |
2007 | + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 55759 bytes_req: 5074265 | |
2008 | + | |
2009 | + Totals: | |
2010 | + Hits: 109928 | |
2011 | + Entries: 71 | |
2012 | + Dropped: 0 | |
2013 | + | |
2014 | + Because the default sort key above is 'hitcount', the above shows | |
2015 | + the list of call_sites by increasing hitcount, so that at the bottom | |
2016 | + we see the functions that made the most kmalloc calls during the | |
2017 | + run. If instead we wanted to see the top kmalloc callers in | |
2018 | + terms of the number of bytes requested rather than the number of | |
2019 | + calls, and we wanted the top caller to appear at the top, we can use | |
2020 | + the 'sort' parameter, along with the 'descending' modifier: | |
2021 | + | |
2022 | + # echo 'hist:key=call_site.sym:val=bytes_req:sort=bytes_req.descending' > \ | |
2023 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
2024 | + | |
2025 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
2026 | + # trigger info: hist:keys=call_site.sym:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] | |
2027 | + | |
2028 | + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 2186 bytes_req: 3397464 | |
2029 | + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1790 bytes_req: 712176 | |
2030 | + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 8132 bytes_req: 513135 | |
2031 | + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 106 bytes_req: 440128 | |
2032 | + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 2186 bytes_req: 314784 | |
2033 | + { call_site: [ffffffff812891ca] ext4_find_extent } hitcount: 2174 bytes_req: 208992 | |
2034 | + { call_site: [ffffffff811ae8e1] __kmalloc } hitcount: 8 bytes_req: 131072 | |
2035 | + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 859 bytes_req: 116824 | |
2036 | + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 1834 bytes_req: 102704 | |
2037 | + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 972 bytes_req: 101088 | |
2038 | + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl [drm] } hitcount: 972 bytes_req: 85536 | |
2039 | + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 3333 bytes_req: 66664 | |
2040 | + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 209 bytes_req: 61632 | |
2041 | + . | |
2042 | + . | |
2043 | + . | |
2044 | + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 | |
2045 | + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 | |
2046 | + { call_site: [ffffffff812d8406] copy_semundo } hitcount: 2 bytes_req: 48 | |
2047 | + { call_site: [ffffffff81200ba6] inotify_new_group } hitcount: 1 bytes_req: 48 | |
2048 | + { call_site: [ffffffffa027121a] drm_getmagic [drm] } hitcount: 1 bytes_req: 48 | |
2049 | + { call_site: [ffffffff811e3a25] __seq_open_private } hitcount: 1 bytes_req: 40 | |
2050 | + { call_site: [ffffffff811c52f4] bprm_change_interp } hitcount: 2 bytes_req: 16 | |
2051 | + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 | |
2052 | + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 | |
2053 | + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 | |
2054 | + | |
2055 | + Totals: | |
2056 | + Hits: 32133 | |
2057 | + Entries: 81 | |
2058 | + Dropped: 0 | |
2059 | + | |
2060 | + To display the offset and size information in addition to the symbol | |
2061 | + name, just use 'sym-offset' instead: | |
2062 | + | |
2063 | + # echo 'hist:key=call_site.sym-offset:val=bytes_req:sort=bytes_req.descending' > \ | |
2064 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
2065 | + | |
2066 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
2067 | + # trigger info: hist:keys=call_site.sym-offset:vals=bytes_req:sort=bytes_req.descending:size=2048 [active] | |
2068 | + | |
2069 | + { call_site: [ffffffffa046041c] i915_gem_execbuffer2+0x6c/0x2c0 [i915] } hitcount: 4569 bytes_req: 3163720 | |
2070 | + { call_site: [ffffffffa0489a66] intel_ring_begin+0xc6/0x1f0 [i915] } hitcount: 4569 bytes_req: 657936 | |
2071 | + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23+0x694/0x1020 [i915] } hitcount: 1519 bytes_req: 472936 | |
2072 | + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23+0x516/0x1020 [i915] } hitcount: 3050 bytes_req: 211832 | |
2073 | + { call_site: [ffffffff811e2a1b] seq_buf_alloc+0x1b/0x50 } hitcount: 34 bytes_req: 148384 | |
2074 | + { call_site: [ffffffffa04a580c] intel_crtc_page_flip+0xbc/0x870 [i915] } hitcount: 1385 bytes_req: 144040 | |
2075 | + { call_site: [ffffffff811ae8e1] __kmalloc+0x191/0x1b0 } hitcount: 8 bytes_req: 131072 | |
2076 | + { call_site: [ffffffffa0287592] drm_mode_page_flip_ioctl+0x282/0x360 [drm] } hitcount: 1385 bytes_req: 121880 | |
2077 | + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc+0x32/0x100 [drm] } hitcount: 1848 bytes_req: 103488 | |
2078 | + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state+0x2c/0xa0 [i915] } hitcount: 461 bytes_req: 62696 | |
2079 | + { call_site: [ffffffffa029070e] drm_vma_node_allow+0x2e/0xd0 [drm] } hitcount: 1541 bytes_req: 61640 | |
2080 | + { call_site: [ffffffff815f8d7b] sk_prot_alloc+0xcb/0x1b0 } hitcount: 57 bytes_req: 57456 | |
2081 | + . | |
2082 | + . | |
2083 | + . | |
2084 | + { call_site: [ffffffff8109524a] alloc_fair_sched_group+0x5a/0x1a0 } hitcount: 2 bytes_req: 128 | |
2085 | + { call_site: [ffffffffa027b921] drm_vm_open_locked+0x31/0xa0 [drm] } hitcount: 3 bytes_req: 96 | |
2086 | + { call_site: [ffffffff8122e266] proc_self_follow_link+0x76/0xb0 } hitcount: 8 bytes_req: 96 | |
2087 | + { call_site: [ffffffff81213e80] load_elf_binary+0x240/0x1650 } hitcount: 3 bytes_req: 84 | |
2088 | + { call_site: [ffffffff8154bc62] usb_control_msg+0x42/0x110 } hitcount: 1 bytes_req: 8 | |
2089 | + { call_site: [ffffffffa00bf6fe] hidraw_send_report+0x7e/0x1a0 [hid] } hitcount: 1 bytes_req: 7 | |
2090 | + { call_site: [ffffffffa00bf1ca] hidraw_report_event+0x8a/0x120 [hid] } hitcount: 1 bytes_req: 7 | |
2091 | + | |
2092 | + Totals: | |
2093 | + Hits: 26098 | |
2094 | + Entries: 64 | |
2095 | + Dropped: 0 | |
2096 | + | |
2097 | + We can also add multiple fields to the 'values' parameter. For | |
2098 | + example, we might want to see the total number of bytes allocated | |
2099 | + alongside bytes requested, and display the result sorted by bytes | |
2100 | + allocated in descending order: | |
2101 | + | |
2102 | + # echo 'hist:keys=call_site.sym:values=bytes_req,bytes_alloc:sort=bytes_alloc.descending' > \ | |
2103 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
2104 | + | |
2105 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
2106 | + # trigger info: hist:keys=call_site.sym:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending:size=2048 [active] | |
2107 | + | |
2108 | + { call_site: [ffffffffa046041c] i915_gem_execbuffer2 [i915] } hitcount: 7403 bytes_req: 4084360 bytes_alloc: 5958016 | |
2109 | + { call_site: [ffffffff811e2a1b] seq_buf_alloc } hitcount: 541 bytes_req: 2213968 bytes_alloc: 2228224 | |
2110 | + { call_site: [ffffffffa0489a66] intel_ring_begin [i915] } hitcount: 7404 bytes_req: 1066176 bytes_alloc: 1421568 | |
2111 | + { call_site: [ffffffffa045e7c4] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 1565 bytes_req: 557368 bytes_alloc: 1037760 | |
2112 | + { call_site: [ffffffff8125847d] ext4_htree_store_dirent } hitcount: 9557 bytes_req: 595778 bytes_alloc: 695744 | |
2113 | + { call_site: [ffffffffa045e646] i915_gem_do_execbuffer.isra.23 [i915] } hitcount: 5839 bytes_req: 430680 bytes_alloc: 470400 | |
2114 | + { call_site: [ffffffffa04c4a3c] intel_plane_duplicate_state [i915] } hitcount: 2388 bytes_req: 324768 bytes_alloc: 458496 | |
2115 | + { call_site: [ffffffffa02911f2] drm_modeset_lock_crtc [drm] } hitcount: 3911 bytes_req: 219016 bytes_alloc: 250304 | |
2116 | + { call_site: [ffffffff815f8d7b] sk_prot_alloc } hitcount: 235 bytes_req: 236880 bytes_alloc: 240640 | |
2117 | + { call_site: [ffffffff8137e559] sg_kmalloc } hitcount: 557 bytes_req: 169024 bytes_alloc: 221760 | |
2118 | + { call_site: [ffffffffa00b7c06] hid_report_raw_event [hid] } hitcount: 9378 bytes_req: 187548 bytes_alloc: 206312 | |
2119 | + { call_site: [ffffffffa04a580c] intel_crtc_page_flip [i915] } hitcount: 1519 bytes_req: 157976 bytes_alloc: 194432 | |
2120 | + . | |
2121 | + . | |
2122 | + . | |
2123 | + { call_site: [ffffffff8109bd3b] sched_autogroup_create_attach } hitcount: 2 bytes_req: 144 bytes_alloc: 192 | |
2124 | + { call_site: [ffffffff81097ee8] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 | |
2125 | + { call_site: [ffffffff8109524a] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 | |
2126 | + { call_site: [ffffffff81095225] alloc_fair_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 | |
2127 | + { call_site: [ffffffff81097ec2] alloc_rt_sched_group } hitcount: 2 bytes_req: 128 bytes_alloc: 128 | |
2128 | + { call_site: [ffffffff81213e80] load_elf_binary } hitcount: 3 bytes_req: 84 bytes_alloc: 96 | |
2129 | + { call_site: [ffffffff81079a2e] kthread_create_on_node } hitcount: 1 bytes_req: 56 bytes_alloc: 64 | |
2130 | + { call_site: [ffffffffa00bf6fe] hidraw_send_report [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 | |
2131 | + { call_site: [ffffffff8154bc62] usb_control_msg } hitcount: 1 bytes_req: 8 bytes_alloc: 8 | |
2132 | + { call_site: [ffffffffa00bf1ca] hidraw_report_event [hid] } hitcount: 1 bytes_req: 7 bytes_alloc: 8 | |
2133 | + | |
2134 | + Totals: | |
2135 | + Hits: 66598 | |
2136 | + Entries: 65 | |
2137 | + Dropped: 0 | |
2138 | + | |
2139 | + Finally, to finish off our kmalloc example, instead of simply having | |
2140 | + the hist trigger display symbolic call_sites, we can have the hist | |
2141 | + trigger additionally display the complete set of kernel stack traces | |
2142 | + that led to each call_site. To do that, we simply use the special | |
2143 | + value 'stacktrace' for the key parameter: | |
2144 | + | |
2145 | + # echo 'hist:keys=stacktrace:values=bytes_req,bytes_alloc:sort=bytes_alloc' > \ | |
2146 | + /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger | |
2147 | + | |
2148 | + The above trigger will use the kernel stack trace in effect when an | |
2149 | + event is triggered as the key for the hash table. This allows the | |
2150 | + enumeration of every kernel callpath that led up to a particular | |
2151 | + event, along with a running total of any of the event fields for | |
2152 | + that event. Here we tally bytes requested and bytes allocated for | |
2153 | + every callpath in the system that led up to a kmalloc (in this case | |
2154 | + every callpath to a kmalloc for a kernel compile): | |
2155 | + | |
2156 | + # cat /sys/kernel/debug/tracing/events/kmem/kmalloc/hist | |
2157 | + # trigger info: hist:keys=stacktrace:vals=bytes_req,bytes_alloc:sort=bytes_alloc:size=2048 [active] | |
2158 | + | |
2159 | + { stacktrace: | |
2160 | + __kmalloc_track_caller+0x10b/0x1a0 | |
2161 | + kmemdup+0x20/0x50 | |
2162 | + hidraw_report_event+0x8a/0x120 [hid] | |
2163 | + hid_report_raw_event+0x3ea/0x440 [hid] | |
2164 | + hid_input_report+0x112/0x190 [hid] | |
2165 | + hid_irq_in+0xc2/0x260 [usbhid] | |
2166 | + __usb_hcd_giveback_urb+0x72/0x120 | |
2167 | + usb_giveback_urb_bh+0x9e/0xe0 | |
2168 | + tasklet_hi_action+0xf8/0x100 | |
2169 | + __do_softirq+0x114/0x2c0 | |
2170 | + irq_exit+0xa5/0xb0 | |
2171 | + do_IRQ+0x5a/0xf0 | |
2172 | + ret_from_intr+0x0/0x30 | |
2173 | + cpuidle_enter+0x17/0x20 | |
2174 | + cpu_startup_entry+0x315/0x3e0 | |
2175 | + rest_init+0x7c/0x80 | |
2176 | + } hitcount: 3 bytes_req: 21 bytes_alloc: 24 | |
2177 | + { stacktrace: | |
2178 | + __kmalloc_track_caller+0x10b/0x1a0 | |
2179 | + kmemdup+0x20/0x50 | |
2180 | + hidraw_report_event+0x8a/0x120 [hid] | |
2181 | + hid_report_raw_event+0x3ea/0x440 [hid] | |
2182 | + hid_input_report+0x112/0x190 [hid] | |
2183 | + hid_irq_in+0xc2/0x260 [usbhid] | |
2184 | + __usb_hcd_giveback_urb+0x72/0x120 | |
2185 | + usb_giveback_urb_bh+0x9e/0xe0 | |
2186 | + tasklet_hi_action+0xf8/0x100 | |
2187 | + __do_softirq+0x114/0x2c0 | |
2188 | + irq_exit+0xa5/0xb0 | |
2189 | + do_IRQ+0x5a/0xf0 | |
2190 | + ret_from_intr+0x0/0x30 | |
2191 | + } hitcount: 3 bytes_req: 21 bytes_alloc: 24 | |
2192 | + { stacktrace: | |
2193 | + kmem_cache_alloc_trace+0xeb/0x150 | |
2194 | + aa_alloc_task_context+0x27/0x40 | |
2195 | + apparmor_cred_prepare+0x1f/0x50 | |
2196 | + security_prepare_creds+0x16/0x20 | |
2197 | + prepare_creds+0xdf/0x1a0 | |
2198 | + SyS_capset+0xb5/0x200 | |
2199 | + system_call_fastpath+0x12/0x6a | |
2200 | + } hitcount: 1 bytes_req: 32 bytes_alloc: 32 | |
2201 | + . | |
2202 | + . | |
2203 | + . | |
2204 | + { stacktrace: | |
2205 | + __kmalloc+0x11b/0x1b0 | |
2206 | + i915_gem_execbuffer2+0x6c/0x2c0 [i915] | |
2207 | + drm_ioctl+0x349/0x670 [drm] | |
2208 | + do_vfs_ioctl+0x2f0/0x4f0 | |
2209 | + SyS_ioctl+0x81/0xa0 | |
2210 | + system_call_fastpath+0x12/0x6a | |
2211 | + } hitcount: 17726 bytes_req: 13944120 bytes_alloc: 19593808 | |
2212 | + { stacktrace: | |
2213 | + __kmalloc+0x11b/0x1b0 | |
2214 | + load_elf_phdrs+0x76/0xa0 | |
2215 | + load_elf_binary+0x102/0x1650 | |
2216 | + search_binary_handler+0x97/0x1d0 | |
2217 | + do_execveat_common.isra.34+0x551/0x6e0 | |
2218 | + SyS_execve+0x3a/0x50 | |
2219 | + return_from_execve+0x0/0x23 | |
2220 | + } hitcount: 33348 bytes_req: 17152128 bytes_alloc: 20226048 | |
2221 | + { stacktrace: | |
2222 | + kmem_cache_alloc_trace+0xeb/0x150 | |
2223 | + apparmor_file_alloc_security+0x27/0x40 | |
2224 | + security_file_alloc+0x16/0x20 | |
2225 | + get_empty_filp+0x93/0x1c0 | |
2226 | + path_openat+0x31/0x5f0 | |
2227 | + do_filp_open+0x3a/0x90 | |
2228 | + do_sys_open+0x128/0x220 | |
2229 | + SyS_open+0x1e/0x20 | |
2230 | + system_call_fastpath+0x12/0x6a | |
2231 | + } hitcount: 4766422 bytes_req: 9532844 bytes_alloc: 38131376 | |
2232 | + { stacktrace: | |
2233 | + __kmalloc+0x11b/0x1b0 | |
2234 | + seq_buf_alloc+0x1b/0x50 | |
2235 | + seq_read+0x2cc/0x370 | |
2236 | + proc_reg_read+0x3d/0x80 | |
2237 | + __vfs_read+0x28/0xe0 | |
2238 | + vfs_read+0x86/0x140 | |
2239 | + SyS_read+0x46/0xb0 | |
2240 | + system_call_fastpath+0x12/0x6a | |
2241 | + } hitcount: 19133 bytes_req: 78368768 bytes_alloc: 78368768 | |
2242 | + | |
2243 | + Totals: | |
2244 | + Hits: 6085872 | |
2245 | + Entries: 253 | |
2246 | + Dropped: 0 | |
2247 | + | |
2248 | + If you key a hist trigger on common_pid, for example in order to | |
2249 | + gather and display sorted totals for each process, you can use the | |
2250 | + special .execname modifier to display the executable names for the | |
2251 | + processes in the table rather than raw pids. The example below | |
2252 | + keeps a per-process sum of total bytes read: | |
2253 | + | |
2254 | + # echo 'hist:key=common_pid.execname:val=count:sort=count.descending' > \ | |
2255 | + /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger | |
2256 | + | |
2257 | + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/hist | |
2258 | + # trigger info: hist:keys=common_pid.execname:vals=count:sort=count.descending:size=2048 [active] | |
2259 | + | |
2260 | + { common_pid: gnome-terminal [ 3196] } hitcount: 280 count: 1093512 | |
2261 | + { common_pid: Xorg [ 1309] } hitcount: 525 count: 256640 | |
2262 | + { common_pid: compiz [ 2889] } hitcount: 59 count: 254400 | |
2263 | + { common_pid: bash [ 8710] } hitcount: 3 count: 66369 | |
2264 | + { common_pid: dbus-daemon-lau [ 8703] } hitcount: 49 count: 47739 | |
2265 | + { common_pid: irqbalance [ 1252] } hitcount: 27 count: 27648 | |
2266 | + { common_pid: 01ifupdown [ 8705] } hitcount: 3 count: 17216 | |
2267 | + { common_pid: dbus-daemon [ 772] } hitcount: 10 count: 12396 | |
2268 | + { common_pid: Socket Thread [ 8342] } hitcount: 11 count: 11264 | |
2269 | + { common_pid: nm-dhcp-client. [ 8701] } hitcount: 6 count: 7424 | |
2270 | + { common_pid: gmain [ 1315] } hitcount: 18 count: 6336 | |
2271 | + . | |
2272 | + . | |
2273 | + . | |
2274 | + { common_pid: postgres [ 1892] } hitcount: 2 count: 32 | |
2275 | + { common_pid: postgres [ 1891] } hitcount: 2 count: 32 | |
2276 | + { common_pid: gmain [ 8704] } hitcount: 2 count: 32 | |
2277 | + { common_pid: upstart-dbus-br [ 2740] } hitcount: 21 count: 21 | |
2278 | + { common_pid: nm-dispatcher.a [ 8696] } hitcount: 1 count: 16 | |
2279 | + { common_pid: indicator-datet [ 2904] } hitcount: 1 count: 16 | |
2280 | + { common_pid: gdbus [ 2998] } hitcount: 1 count: 16 | |
2281 | + { common_pid: rtkit-daemon [ 2052] } hitcount: 1 count: 8 | |
2282 | + { common_pid: init [ 1] } hitcount: 2 count: 2 | |
2283 | + | |
2284 | + Totals: | |
2285 | + Hits: 2116 | |
2286 | + Entries: 51 | |
2287 | + Dropped: 0 | |
2288 | + | |
2289 | + Similarly, if you key a hist trigger on syscall id, for example to | |
2290 | + gather and display a list of systemwide syscall hits, you can use | |
2291 | + the special .syscall modifier to display the syscall names rather | |
2292 | + than raw ids. The example below keeps a running total of syscall | |
2293 | + counts for the system during the run: | |
2294 | + | |
2295 | + # echo 'hist:key=id.syscall:val=hitcount' > \ | |
2296 | + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger | |
2297 | + | |
2298 | + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist | |
2299 | + # trigger info: hist:keys=id.syscall:vals=hitcount:sort=hitcount:size=2048 [active] | |
2300 | + | |
2301 | + { id: sys_fsync [ 74] } hitcount: 1 | |
2302 | + { id: sys_newuname [ 63] } hitcount: 1 | |
2303 | + { id: sys_prctl [157] } hitcount: 1 | |
2304 | + { id: sys_statfs [137] } hitcount: 1 | |
2305 | + { id: sys_symlink [ 88] } hitcount: 1 | |
2306 | + { id: sys_sendmmsg [307] } hitcount: 1 | |
2307 | + { id: sys_semctl [ 66] } hitcount: 1 | |
2308 | + { id: sys_readlink [ 89] } hitcount: 3 | |
2309 | + { id: sys_bind [ 49] } hitcount: 3 | |
2310 | + { id: sys_getsockname [ 51] } hitcount: 3 | |
2311 | + { id: sys_unlink [ 87] } hitcount: 3 | |
2312 | + { id: sys_rename [ 82] } hitcount: 4 | |
2313 | + { id: unknown_syscall [ 58] } hitcount: 4 | |
2314 | + { id: sys_connect [ 42] } hitcount: 4 | |
2315 | + { id: sys_getpid [ 39] } hitcount: 4 | |
2316 | + . | |
2317 | + . | |
2318 | + . | |
2319 | + { id: sys_rt_sigprocmask [ 14] } hitcount: 952 | |
2320 | + { id: sys_futex [202] } hitcount: 1534 | |
2321 | + { id: sys_write [ 1] } hitcount: 2689 | |
2322 | + { id: sys_setitimer [ 38] } hitcount: 2797 | |
2323 | + { id: sys_read [ 0] } hitcount: 3202 | |
2324 | + { id: sys_select [ 23] } hitcount: 3773 | |
2325 | + { id: sys_writev [ 20] } hitcount: 4531 | |
2326 | + { id: sys_poll [ 7] } hitcount: 8314 | |
2327 | + { id: sys_recvmsg [ 47] } hitcount: 13738 | |
2328 | + { id: sys_ioctl [ 16] } hitcount: 21843 | |
2329 | + | |
2330 | + Totals: | |
2331 | + Hits: 67612 | |
2332 | + Entries: 72 | |
2333 | + Dropped: 0 | |
2334 | + | |
2335 | + The syscall counts above provide a rough overall picture of system | |
2336 | + call activity on the system; we can see for example that the most | |
2337 | + popular system call on this system was the 'sys_ioctl' system call. | |
2338 | + | |
2339 | + We can use 'compound' keys to refine that number and provide some | |
2340 | + further insight into exactly which processes contribute to the | |
2341 | + overall ioctl count. | |
2342 | + | |
2343 | + The command below keeps a hitcount for every unique combination of | |
2344 | + system call id and pid - the end result is essentially a table | |
2345 | + that keeps a per-pid sum of system call hits. The results are | |
2346 | + sorted using the system call id as the primary key, and the | |
2347 | + hitcount sum as the secondary key: | |
2348 | + | |
2349 | + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount' > \ | |
2350 | + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger | |
2351 | + | |
2352 | + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist | |
2353 | + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 [active] | |
2354 | + | |
2355 | + { id: sys_read [ 0], common_pid: rtkit-daemon [ 1877] } hitcount: 1 | |
2356 | + { id: sys_read [ 0], common_pid: gdbus [ 2976] } hitcount: 1 | |
2357 | + { id: sys_read [ 0], common_pid: console-kit-dae [ 3400] } hitcount: 1 | |
2358 | + { id: sys_read [ 0], common_pid: postgres [ 1865] } hitcount: 1 | |
2359 | + { id: sys_read [ 0], common_pid: deja-dup-monito [ 3543] } hitcount: 2 | |
2360 | + { id: sys_read [ 0], common_pid: NetworkManager [ 890] } hitcount: 2 | |
2361 | + { id: sys_read [ 0], common_pid: evolution-calen [ 3048] } hitcount: 2 | |
2362 | + { id: sys_read [ 0], common_pid: postgres [ 1864] } hitcount: 2 | |
2363 | + { id: sys_read [ 0], common_pid: nm-applet [ 3022] } hitcount: 2 | |
2364 | + { id: sys_read [ 0], common_pid: whoopsie [ 1212] } hitcount: 2 | |
2365 | + . | |
2366 | + . | |
2367 | + . | |
2368 | + { id: sys_ioctl [ 16], common_pid: bash [ 8479] } hitcount: 1 | |
2369 | + { id: sys_ioctl [ 16], common_pid: bash [ 3472] } hitcount: 12 | |
2370 | + { id: sys_ioctl [ 16], common_pid: gnome-terminal [ 3199] } hitcount: 16 | |
2371 | + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 1808 | |
2372 | + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 5580 | |
2373 | + . | |
2374 | + . | |
2375 | + . | |
2376 | + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2690] } hitcount: 3 | |
2377 | + { id: sys_waitid [247], common_pid: upstart-dbus-br [ 2688] } hitcount: 16 | |
2378 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 975] } hitcount: 2 | |
2379 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3204] } hitcount: 4 | |
2380 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2888] } hitcount: 4 | |
2381 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3003] } hitcount: 4 | |
2382 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 2873] } hitcount: 4 | |
2383 | + { id: sys_inotify_add_watch [254], common_pid: gmain [ 3196] } hitcount: 6 | |
2384 | + { id: sys_openat [257], common_pid: java [ 2623] } hitcount: 2 | |
2385 | + { id: sys_eventfd2 [290], common_pid: ibus-ui-gtk3 [ 2760] } hitcount: 4 | |
2386 | + { id: sys_eventfd2 [290], common_pid: compiz [ 2994] } hitcount: 6 | |
2387 | + | |
2388 | + Totals: | |
2389 | + Hits: 31536 | |
2390 | + Entries: 323 | |
2391 | + Dropped: 0 | |
2392 | + | |
2393 | + The above list does give us a breakdown of the ioctl syscall by | |
2394 | + pid, but it also gives us quite a bit more than that, which we | |
2395 | + don't really care about at the moment. Since we know the syscall | |
2396 | + id for sys_ioctl (16, displayed next to the sys_ioctl name), we | |
2397 | + can use that to filter out all the other syscalls: | |
2398 | + | |
2399 | + # echo 'hist:key=id.syscall,common_pid.execname:val=hitcount:sort=id,hitcount if id == 16' > \ | |
2400 | + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger | |
2401 | + | |
2402 | + # cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist | |
2403 | + # trigger info: hist:keys=id.syscall,common_pid.execname:vals=hitcount:sort=id.syscall,hitcount:size=2048 if id == 16 [active] | |
2404 | + | |
2405 | + { id: sys_ioctl [ 16], common_pid: gmain [ 2769] } hitcount: 1 | |
2406 | + { id: sys_ioctl [ 16], common_pid: evolution-addre [ 8571] } hitcount: 1 | |
2407 | + { id: sys_ioctl [ 16], common_pid: gmain [ 3003] } hitcount: 1 | |
2408 | + { id: sys_ioctl [ 16], common_pid: gmain [ 2781] } hitcount: 1 | |
2409 | + { id: sys_ioctl [ 16], common_pid: gmain [ 2829] } hitcount: 1 | |
2410 | + { id: sys_ioctl [ 16], common_pid: bash [ 8726] } hitcount: 1 | |
2411 | + { id: sys_ioctl [ 16], common_pid: bash [ 8508] } hitcount: 1 | |
2412 | + { id: sys_ioctl [ 16], common_pid: gmain [ 2970] } hitcount: 1 | |
2413 | + { id: sys_ioctl [ 16], common_pid: gmain [ 2768] } hitcount: 1 | |
2414 | + . | |
2415 | + . | |
2416 | + . | |
2417 | + { id: sys_ioctl [ 16], common_pid: pool [ 8559] } hitcount: 45 | |
2418 | + { id: sys_ioctl [ 16], common_pid: pool [ 8555] } hitcount: 48 | |
2419 | + { id: sys_ioctl [ 16], common_pid: pool [ 8551] } hitcount: 48 | |
2420 | + { id: sys_ioctl [ 16], common_pid: avahi-daemon [ 896] } hitcount: 66 | |
2421 | + { id: sys_ioctl [ 16], common_pid: Xorg [ 1267] } hitcount: 26674 | |
2422 | + { id: sys_ioctl [ 16], common_pid: compiz [ 2994] } hitcount: 73443 | |
2423 | + | |
2424 | + Totals: | |
2425 | + Hits: 101162 | |
2426 | + Entries: 103 | |
2427 | + Dropped: 0 | |
2428 | + | |
2429 | + The above output shows that 'compiz' and 'Xorg' are far and away | |
2430 | + the heaviest ioctl callers (which might lead to questions about | |
2431 | + whether they really need to be making all those calls and to | |
2432 | + possible avenues for further investigation). | |
2433 | + | |
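| + As a sketch of one such avenue, we could re-key the same filtered | |
| + event on 'stacktrace' to see which kernel callpaths those ioctl | |
| + calls actually come from (note that '>' replaces the hist trigger | |
| + we set up above): | |
| + | |
| + # echo 'hist:key=stacktrace:val=hitcount if id == 16' > \ | |
| + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger | |
| + | |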
2434 | + The compound key examples used a key and a sum value (hitcount) to | |
2435 | + sort the output, but we can just as easily use two keys instead. | |
2436 | + Here's an example where we use a compound key composed of the | |
2437 | + common_pid and size event fields. Sorting with pid as the primary | |
2438 | + key and 'size' as the secondary key allows us to display an | |
2439 | + ordered summary of the recvfrom sizes, with counts, received by | |
2440 | + each process: | |
2441 | + | |
2442 | + # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=common_pid,size' > \ | |
2443 | + /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger | |
2444 | + | |
2445 | + # cat /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/hist | |
2446 | + # trigger info: hist:keys=common_pid.execname,size:vals=hitcount:sort=common_pid.execname,size:size=2048 [active] | |
2447 | + | |
2448 | + { common_pid: smbd [ 784], size: 4 } hitcount: 1 | |
2449 | + { common_pid: dnsmasq [ 1412], size: 4096 } hitcount: 672 | |
2450 | + { common_pid: postgres [ 1796], size: 1000 } hitcount: 6 | |
2451 | + { common_pid: postgres [ 1867], size: 1000 } hitcount: 10 | |
2452 | + { common_pid: bamfdaemon [ 2787], size: 28 } hitcount: 2 | |
2453 | + { common_pid: bamfdaemon [ 2787], size: 14360 } hitcount: 1 | |
2454 | + { common_pid: compiz [ 2994], size: 8 } hitcount: 1 | |
2455 | + { common_pid: compiz [ 2994], size: 20 } hitcount: 11 | |
2456 | + { common_pid: gnome-terminal [ 3199], size: 4 } hitcount: 2 | |
2457 | + { common_pid: firefox [ 8817], size: 4 } hitcount: 1 | |
2458 | + { common_pid: firefox [ 8817], size: 8 } hitcount: 5 | |
2459 | + { common_pid: firefox [ 8817], size: 588 } hitcount: 2 | |
2460 | + { common_pid: firefox [ 8817], size: 628 } hitcount: 1 | |
2461 | + { common_pid: firefox [ 8817], size: 6944 } hitcount: 1 | |
2462 | + { common_pid: firefox [ 8817], size: 408880 } hitcount: 2 | |
2463 | + { common_pid: firefox [ 8822], size: 8 } hitcount: 2 | |
2464 | + { common_pid: firefox [ 8822], size: 160 } hitcount: 2 | |
2465 | + { common_pid: firefox [ 8822], size: 320 } hitcount: 2 | |
2466 | + { common_pid: firefox [ 8822], size: 352 } hitcount: 1 | |
2467 | + . | |
2468 | + . | |
2469 | + . | |
2470 | + { common_pid: pool [ 8923], size: 1960 } hitcount: 10 | |
2471 | + { common_pid: pool [ 8923], size: 2048 } hitcount: 10 | |
2472 | + { common_pid: pool [ 8924], size: 1960 } hitcount: 10 | |
2473 | + { common_pid: pool [ 8924], size: 2048 } hitcount: 10 | |
2474 | + { common_pid: pool [ 8928], size: 1964 } hitcount: 4 | |
2475 | + { common_pid: pool [ 8928], size: 1965 } hitcount: 2 | |
2476 | + { common_pid: pool [ 8928], size: 2048 } hitcount: 6 | |
2477 | + { common_pid: pool [ 8929], size: 1982 } hitcount: 1 | |
2478 | + { common_pid: pool [ 8929], size: 2048 } hitcount: 1 | |
2479 | + | |
2480 | + Totals: | |
2481 | + Hits: 2016 | |
2482 | + Entries: 224 | |
2483 | + Dropped: 0 | |
2484 | + | |
2485 | + The above example also illustrates the fact that although a compound | |
2486 | + key is treated as a single entity for hashing purposes, the sub-keys | |
2487 | + it's composed of can be accessed independently. | |
2488 | + | |
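| + As a quick sketch of that, we can reuse the same compound key but | |
| + sort on the 'size' sub-key alone, largest sizes first: | |
| + | |
| + # echo 'hist:key=common_pid.execname,size:val=hitcount:sort=size.descending' > \ | |
| + /sys/kernel/debug/tracing/events/syscalls/sys_enter_recvfrom/trigger | |
| + | |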
2489 | + The next example uses a string field as the hash key and | |
2490 | + demonstrates how you can manually pause and continue a hist trigger. | |
2491 | + In this example, we'll aggregate fork counts and don't expect a | |
2492 | + large number of entries in the hash table, so we'll drop the table | |
2493 | + size to a much smaller number, say 256: | |
2494 | + | |
2495 | + # echo 'hist:key=child_comm:val=hitcount:size=256' > \ | |
2496 | + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
2497 | + | |
2498 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist | |
2499 | + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] | |
2500 | + | |
2501 | + { child_comm: dconf worker } hitcount: 1 | |
2502 | + { child_comm: ibus-daemon } hitcount: 1 | |
2503 | + { child_comm: whoopsie } hitcount: 1 | |
2504 | + { child_comm: smbd } hitcount: 1 | |
2505 | + { child_comm: gdbus } hitcount: 1 | |
2506 | + { child_comm: kthreadd } hitcount: 1 | |
2507 | + { child_comm: dconf worker } hitcount: 1 | |
2508 | + { child_comm: evolution-alarm } hitcount: 2 | |
2509 | + { child_comm: Socket Thread } hitcount: 2 | |
2510 | + { child_comm: postgres } hitcount: 2 | |
2511 | + { child_comm: bash } hitcount: 3 | |
2512 | + { child_comm: compiz } hitcount: 3 | |
2513 | + { child_comm: evolution-sourc } hitcount: 4 | |
2514 | + { child_comm: dhclient } hitcount: 4 | |
2515 | + { child_comm: pool } hitcount: 5 | |
2516 | + { child_comm: nm-dispatcher.a } hitcount: 8 | |
2517 | + { child_comm: firefox } hitcount: 8 | |
2518 | + { child_comm: dbus-daemon } hitcount: 8 | |
2519 | + { child_comm: glib-pacrunner } hitcount: 10 | |
2520 | + { child_comm: evolution } hitcount: 23 | |
2521 | + | |
2522 | + Totals: | |
2523 | + Hits: 89 | |
2524 | + Entries: 20 | |
2525 | + Dropped: 0 | |
2526 | + | |
2527 | + If we want to pause the hist trigger, we can simply append :pause to | |
2528 | + the command that started the trigger. Notice that the trigger info | |
2529 | + displays as [paused]: | |
2530 | + | |
2531 | + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \ | |
2532 | + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
2533 | + | |
2534 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist | |
2535 | + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [paused] | |
2536 | + | |
2537 | + { child_comm: dconf worker } hitcount: 1 | |
2538 | + { child_comm: kthreadd } hitcount: 1 | |
2539 | + { child_comm: dconf worker } hitcount: 1 | |
2540 | + { child_comm: gdbus } hitcount: 1 | |
2541 | + { child_comm: ibus-daemon } hitcount: 1 | |
2542 | + { child_comm: Socket Thread } hitcount: 2 | |
2543 | + { child_comm: evolution-alarm } hitcount: 2 | |
2544 | + { child_comm: smbd } hitcount: 2 | |
2545 | + { child_comm: bash } hitcount: 3 | |
2546 | + { child_comm: whoopsie } hitcount: 3 | |
2547 | + { child_comm: compiz } hitcount: 3 | |
2548 | + { child_comm: evolution-sourc } hitcount: 4 | |
2549 | + { child_comm: pool } hitcount: 5 | |
2550 | + { child_comm: postgres } hitcount: 6 | |
2551 | + { child_comm: firefox } hitcount: 8 | |
2552 | + { child_comm: dhclient } hitcount: 10 | |
2553 | + { child_comm: emacs } hitcount: 12 | |
2554 | + { child_comm: dbus-daemon } hitcount: 20 | |
2555 | + { child_comm: nm-dispatcher.a } hitcount: 20 | |
2556 | + { child_comm: evolution } hitcount: 35 | |
2557 | + { child_comm: glib-pacrunner } hitcount: 59 | |
2558 | + | |
2559 | + Totals: | |
2560 | + Hits: 199 | |
2561 | + Entries: 21 | |
2562 | + Dropped: 0 | |
2563 | + | |
2564 | + To manually continue having the trigger aggregate events, append | |
2565 | + :cont instead. Notice that the trigger info displays as [active] | |
2566 | + again, and the data has changed: | |
2567 | + | |
2568 | + # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \ | |
2569 | + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
2570 | + | |
2571 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist | |
2572 | + # trigger info: hist:keys=child_comm:vals=hitcount:sort=hitcount:size=256 [active] | |
2573 | + | |
2574 | + { child_comm: dconf worker } hitcount: 1 | |
2575 | + { child_comm: dconf worker } hitcount: 1 | |
2576 | + { child_comm: kthreadd } hitcount: 1 | |
2577 | + { child_comm: gdbus } hitcount: 1 | |
2578 | + { child_comm: ibus-daemon } hitcount: 1 | |
2579 | + { child_comm: Socket Thread } hitcount: 2 | |
2580 | + { child_comm: evolution-alarm } hitcount: 2 | |
2581 | + { child_comm: smbd } hitcount: 2 | |
2582 | + { child_comm: whoopsie } hitcount: 3 | |
2583 | + { child_comm: compiz } hitcount: 3 | |
2584 | + { child_comm: evolution-sourc } hitcount: 4 | |
2585 | + { child_comm: bash } hitcount: 5 | |
2586 | + { child_comm: pool } hitcount: 5 | |
2587 | + { child_comm: postgres } hitcount: 6 | |
2588 | + { child_comm: firefox } hitcount: 8 | |
2589 | + { child_comm: dhclient } hitcount: 11 | |
2590 | + { child_comm: emacs } hitcount: 12 | |
2591 | + { child_comm: dbus-daemon } hitcount: 22 | |
2592 | + { child_comm: nm-dispatcher.a } hitcount: 22 | |
2593 | + { child_comm: evolution } hitcount: 35 | |
2594 | + { child_comm: glib-pacrunner } hitcount: 59 | |
2595 | + | |
2596 | + Totals: | |
2597 | + Hits: 206 | |
2598 | + Entries: 21 | |
2599 | + Dropped: 0 | |
2600 | + | |
2601 | + The previous example showed how to start and stop a hist trigger by | |
2602 | + appending 'pause' and 'continue' to the hist trigger command. A | |
2603 | + hist trigger can also be started in a paused state by initially | |
2604 | + starting the trigger with ':pause' appended. This allows you to | |
2605 | + start the trigger only when you're ready to start collecting data | |
2606 | + and not before. For example, you could start the trigger in a | |
2607 | + paused state, then unpause it and do something you want to measure, | |
2608 | + then pause the trigger again when done. | |
2609 | + | |
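| + Using the fork histogram from earlier, a minimal sketch of that | |
| + workflow (the trigger spec is the same one shown above) might look | |
| + like this: | |
| + | |
| + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' > \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
| + | |
| + # echo 'hist:key=child_comm:val=hitcount:size=256:cont' >> \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
| + | |
| + (do whatever it is you want to measure) | |
| + | |
| + # echo 'hist:key=child_comm:val=hitcount:size=256:pause' >> \ | |
| + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
| + | |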
2610 | + Of course, doing this manually can be difficult and error-prone, but | |
2611 | + it is possible to automatically start and stop a hist trigger based | |
2612 | + on some condition, via the enable_hist and disable_hist triggers. | |
2613 | + | |
2614 | + For example, suppose we wanted to take a look at the relative | |
2615 | + weights in terms of skb length for each callpath that leads to a | |
2616 | + netif_receive_skb event when downloading a decent-sized file using | |
2617 | + wget. | |
2618 | + | |
2619 | + First we set up an initially paused stacktrace trigger on the | |
2620 | + netif_receive_skb event: | |
2621 | + | |
2622 | + # echo 'hist:key=stacktrace:vals=len:pause' > \ | |
2623 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2624 | + | |
2625 | + Next, we set up an 'enable_hist' trigger on the sched_process_exec | |
2626 | + event, with an 'if filename==/usr/bin/wget' filter. The effect of | |
2627 | + this new trigger is that it will 'unpause' the hist trigger we just | |
2628 | + set up on netif_receive_skb if and only if it sees a | |
2629 | + sched_process_exec event with a filename of '/usr/bin/wget'. When | |
2630 | + that happens, all netif_receive_skb events are aggregated into a | |
2631 | + hash table keyed on stacktrace: | |
2632 | + | |
2633 | + # echo 'enable_hist:net:netif_receive_skb if filename==/usr/bin/wget' > \ | |
2634 | + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
2635 | + | |
2636 | + The aggregation continues until the netif_receive_skb hist trigger | |
2637 | + is paused again, which is what the following disable_hist trigger | |
2638 | + does by creating a similar setup on the sched_process_exit event, | |
2639 | + using the filter 'comm==wget': | |
2640 | + | |
2641 | + # echo 'disable_hist:net:netif_receive_skb if comm==wget' > \ | |
2642 | + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
2643 | + | |
2644 | + Whenever a process exits and the comm field of the disable_hist | |
2645 | + trigger filter matches 'comm==wget', the netif_receive_skb hist | |
2646 | + trigger is disabled. | |
2647 | + | |
2648 | + The overall effect is that netif_receive_skb events are aggregated | |
2649 | + into the hash table for only the duration of the wget. Executing a | |
2650 | + wget command and then listing the 'hist' file will display the | |
2651 | + output generated by the wget command: | |
2652 | + | |
2653 | + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz | |
2654 | + | |
2655 | + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
2656 | + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] | |
2657 | + | |
2658 | + { stacktrace: | |
2659 | + __netif_receive_skb_core+0x46d/0x990 | |
2660 | + __netif_receive_skb+0x18/0x60 | |
2661 | + netif_receive_skb_internal+0x23/0x90 | |
2662 | + napi_gro_receive+0xc8/0x100 | |
2663 | + ieee80211_deliver_skb+0xd6/0x270 [mac80211] | |
2664 | + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] | |
2665 | + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] | |
2666 | + ieee80211_rx+0x31d/0x900 [mac80211] | |
2667 | + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] | |
2668 | + iwl_rx_dispatch+0x8e/0xf0 [iwldvm] | |
2669 | + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] | |
2670 | + irq_thread_fn+0x20/0x50 | |
2671 | + irq_thread+0x11f/0x150 | |
2672 | + kthread+0xd2/0xf0 | |
2673 | + ret_from_fork+0x42/0x70 | |
2674 | + } hitcount: 85 len: 28884 | |
2675 | + { stacktrace: | |
2676 | + __netif_receive_skb_core+0x46d/0x990 | |
2677 | + __netif_receive_skb+0x18/0x60 | |
2678 | + netif_receive_skb_internal+0x23/0x90 | |
2679 | + napi_gro_complete+0xa4/0xe0 | |
2680 | + dev_gro_receive+0x23a/0x360 | |
2681 | + napi_gro_receive+0x30/0x100 | |
2682 | + ieee80211_deliver_skb+0xd6/0x270 [mac80211] | |
2683 | + ieee80211_rx_handlers+0xccf/0x22f0 [mac80211] | |
2684 | + ieee80211_prepare_and_rx_handle+0x4e7/0xc40 [mac80211] | |
2685 | + ieee80211_rx+0x31d/0x900 [mac80211] | |
2686 | + iwlagn_rx_reply_rx+0x3db/0x6f0 [iwldvm] | |
2687 | + iwl_rx_dispatch+0x8e/0xf0 [iwldvm] | |
2688 | + iwl_pcie_irq_handler+0xe3c/0x12f0 [iwlwifi] | |
2689 | + irq_thread_fn+0x20/0x50 | |
2690 | + irq_thread+0x11f/0x150 | |
2691 | + kthread+0xd2/0xf0 | |
2692 | + } hitcount: 98 len: 664329 | |
2693 | + { stacktrace: | |
2694 | + __netif_receive_skb_core+0x46d/0x990 | |
2695 | + __netif_receive_skb+0x18/0x60 | |
2696 | + process_backlog+0xa8/0x150 | |
2697 | + net_rx_action+0x15d/0x340 | |
2698 | + __do_softirq+0x114/0x2c0 | |
2699 | + do_softirq_own_stack+0x1c/0x30 | |
2700 | + do_softirq+0x65/0x70 | |
2701 | + __local_bh_enable_ip+0xb5/0xc0 | |
2702 | + ip_finish_output+0x1f4/0x840 | |
2703 | + ip_output+0x6b/0xc0 | |
2704 | + ip_local_out_sk+0x31/0x40 | |
2705 | + ip_send_skb+0x1a/0x50 | |
2706 | + udp_send_skb+0x173/0x2a0 | |
2707 | + udp_sendmsg+0x2bf/0x9f0 | |
2708 | + inet_sendmsg+0x64/0xa0 | |
2709 | + sock_sendmsg+0x3d/0x50 | |
2710 | + } hitcount: 115 len: 13030 | |
2711 | + { stacktrace: | |
2712 | + __netif_receive_skb_core+0x46d/0x990 | |
2713 | + __netif_receive_skb+0x18/0x60 | |
2714 | + netif_receive_skb_internal+0x23/0x90 | |
2715 | + napi_gro_complete+0xa4/0xe0 | |
2716 | + napi_gro_flush+0x6d/0x90 | |
2717 | + iwl_pcie_irq_handler+0x92a/0x12f0 [iwlwifi] | |
2718 | + irq_thread_fn+0x20/0x50 | |
2719 | + irq_thread+0x11f/0x150 | |
2720 | + kthread+0xd2/0xf0 | |
2721 | + ret_from_fork+0x42/0x70 | |
2722 | + } hitcount: 934 len: 5512212 | |
2723 | + | |
2724 | + Totals: | |
2725 | + Hits: 1232 | |
2726 | + Entries: 4 | |
2727 | + Dropped: 0 | |
2728 | + | |
2729 | + The above shows all the netif_receive_skb callpaths and their total | |
2730 | + lengths for the duration of the wget command. | |
2731 | + | |
2732 | + The 'clear' hist trigger param can be used to clear the hash table. | |
2733 | + Suppose we wanted to try another run of the previous example but | |
2734 | + this time also wanted to see the complete list of events that went | |
2735 | + into the histogram. In order to avoid having to set everything up | |
2736 | + again, we can just clear the histogram first: | |
2737 | + | |
2738 | + # echo 'hist:key=stacktrace:vals=len:clear' >> \ | |
2739 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2740 | + | |
2741 | + Just to verify that it is in fact cleared, here's what we now see in | |
2742 | + the hist file: | |
2743 | + | |
2744 | + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
2745 | + # trigger info: hist:keys=stacktrace:vals=len:sort=hitcount:size=2048 [paused] | |
2746 | + | |
2747 | + Totals: | |
2748 | + Hits: 0 | |
2749 | + Entries: 0 | |
2750 | + Dropped: 0 | |
2751 | + | |
2752 | + Since we want to see the detailed list of the netif_receive_skb | |
2753 | + events occurring during the new run, which are in fact the same | |
2754 | + events being aggregated into the hash table, we add 'enable_event' | |
2755 | + and 'disable_event' triggers to the same sched_process_exec and | |
2756 | + sched_process_exit events, as follows: | |
2757 | + | |
2758 | + # echo 'enable_event:net:netif_receive_skb if filename==/usr/bin/wget' > \ | |
2759 | + /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
2760 | + | |
2761 | + # echo 'disable_event:net:netif_receive_skb if comm==wget' > \ | |
2762 | + /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
2763 | + | |
2764 | + If you read the trigger files for the sched_process_exec and | |
2765 | + sched_process_exit triggers, you should see two triggers for each: | |
2766 | + one enabling/disabling the hist aggregation and the other | |
2767 | + enabling/disabling the logging of events: | |
2768 | + | |
2769 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exec/trigger | |
2770 | + enable_event:net:netif_receive_skb:unlimited if filename==/usr/bin/wget | |
2771 | + enable_hist:net:netif_receive_skb:unlimited if filename==/usr/bin/wget | |
2772 | + | |
2773 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_exit/trigger | |
2774 | + disable_event:net:netif_receive_skb:unlimited if comm==wget | |
2775 | + disable_hist:net:netif_receive_skb:unlimited if comm==wget | |
2776 | + | |
2777 | + In other words, whenever either of the sched_process_exec or | |
2778 | + sched_process_exit events is hit and matches 'wget', it enables or | |
2779 | + disables both the histogram and the event log, and what you end up | |
2780 | + with is a hash table and set of events just covering the specified | |
2781 | + duration. Run the wget command again: | |
2782 | + | |
2783 | + $ wget https://www.kernel.org/pub/linux/kernel/v3.x/patch-3.19.xz | |
2784 | + | |
2785 | + Displaying the 'hist' file should show something similar to what you | |
2786 | + saw in the last run, but this time you should also see the | |
2787 | + individual events in the trace file: | |
2788 | + | |
2789 | + # cat /sys/kernel/debug/tracing/trace | |
2790 | + | |
2791 | + # tracer: nop | |
2792 | + # | |
2793 | + # entries-in-buffer/entries-written: 183/1426 #P:4 | |
2794 | + # | |
2795 | + # _-----=> irqs-off | |
2796 | + # / _----=> need-resched | |
2797 | + # | / _---=> hardirq/softirq | |
2798 | + # || / _--=> preempt-depth | |
2799 | + # ||| / delay | |
2800 | + # TASK-PID CPU# |||| TIMESTAMP FUNCTION | |
2801 | + # | | | |||| | | | |
2802 | + wget-15108 [000] ..s1 31769.606929: netif_receive_skb: dev=lo skbaddr=ffff88009c353100 len=60 | |
2803 | + wget-15108 [000] ..s1 31769.606999: netif_receive_skb: dev=lo skbaddr=ffff88009c353200 len=60 | |
2804 | + dnsmasq-1382 [000] ..s1 31769.677652: netif_receive_skb: dev=lo skbaddr=ffff88009c352b00 len=130 | |
2805 | + dnsmasq-1382 [000] ..s1 31769.685917: netif_receive_skb: dev=lo skbaddr=ffff88009c352200 len=138 | |
2806 | + ##### CPU 2 buffer started #### | |
2807 | + irq/29-iwlwifi-559 [002] ..s. 31772.031529: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433d00 len=2948 | |
2808 | + irq/29-iwlwifi-559 [002] ..s. 31772.031572: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432200 len=1500 | |
2809 | + irq/29-iwlwifi-559 [002] ..s. 31772.032196: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433100 len=2948 | |
2810 | + irq/29-iwlwifi-559 [002] ..s. 31772.032761: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d433000 len=2948 | |
2811 | + irq/29-iwlwifi-559 [002] ..s. 31772.033220: netif_receive_skb: dev=wlan0 skbaddr=ffff88009d432e00 len=1500 | |
2812 | + . | |
2813 | + . | |
2814 | + . | |
2815 | + | |
2816 | + The following example demonstrates how multiple hist triggers can be | |
2817 | + attached to a given event. This capability can be useful for | |
2818 | + creating a set of different summaries derived from the same set of | |
2819 | + events, or for comparing the effects of different filters, among | |
2820 | + other things. | |
2821 | + | |
2822 | + # echo 'hist:keys=skbaddr.hex:vals=len if len < 0' >> \ | |
2823 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2824 | + # echo 'hist:keys=skbaddr.hex:vals=len if len > 4096' >> \ | |
2825 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2826 | + # echo 'hist:keys=skbaddr.hex:vals=len if len == 256' >> \ | |
2827 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2828 | + # echo 'hist:keys=skbaddr.hex:vals=len' >> \ | |
2829 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2830 | + # echo 'hist:keys=len:vals=common_preempt_count' >> \ | |
2831 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2832 | + | |
2833 | + The above set of commands creates four triggers differing only in | |
2834 | + their filters, along with a completely different though fairly | |
2835 | + nonsensical trigger. Note that in order to append multiple hist | |
2836 | + triggers to the same file, you should use the '>>' operator to | |
2837 | + append them ('>' will also add the new hist trigger, but will remove | |
2838 | + any existing hist triggers beforehand). | |
2839 | + | |
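| + If one of the histograms is no longer needed, it can typically be | |
| + removed by echoing the same trigger specification with a '!' | |
| + prepended (a sketch - here we drop the 'len == 256' histogram): | |
| + | |
| + # echo '!hist:keys=skbaddr.hex:vals=len if len == 256' >> \ | |
| + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
| + | |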
2840 | + Displaying the contents of the 'hist' file for the event shows the | |
2841 | + contents of all five histograms: | |
2842 | + | |
2843 | + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist | |
2844 | + | |
2845 | + # event histogram | |
2846 | + # | |
2847 | + # trigger info: hist:keys=len:vals=hitcount,common_preempt_count:sort=hitcount:size=2048 [active] | |
2848 | + # | |
2849 | + | |
2850 | + { len: 176 } hitcount: 1 common_preempt_count: 0 | |
2851 | + { len: 223 } hitcount: 1 common_preempt_count: 0 | |
2852 | + { len: 4854 } hitcount: 1 common_preempt_count: 0 | |
2853 | + { len: 395 } hitcount: 1 common_preempt_count: 0 | |
2854 | + { len: 177 } hitcount: 1 common_preempt_count: 0 | |
2855 | + { len: 446 } hitcount: 1 common_preempt_count: 0 | |
2856 | + { len: 1601 } hitcount: 1 common_preempt_count: 0 | |
2857 | + . | |
2858 | + . | |
2859 | + . | |
2860 | + { len: 1280 } hitcount: 66 common_preempt_count: 0 | |
2861 | + { len: 116 } hitcount: 81 common_preempt_count: 40 | |
2862 | + { len: 708 } hitcount: 112 common_preempt_count: 0 | |
2863 | + { len: 46 } hitcount: 221 common_preempt_count: 0 | |
2864 | + { len: 1264 } hitcount: 458 common_preempt_count: 0 | |
2865 | + | |
2866 | + Totals: | |
2867 | + Hits: 1428 | |
2868 | + Entries: 147 | |
2869 | + Dropped: 0 | |
2870 | + | |
2871 | + | |
2872 | + # event histogram | |
2873 | + # | |
2874 | + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
2875 | + # | |
2876 | + | |
2877 | + { skbaddr: ffff8800baee5e00 } hitcount: 1 len: 130 | |
2878 | + { skbaddr: ffff88005f3d5600 } hitcount: 1 len: 1280 | |
2879 | + { skbaddr: ffff88005f3d4900 } hitcount: 1 len: 1280 | |
2880 | + { skbaddr: ffff88009fed6300 } hitcount: 1 len: 115 | |
2881 | + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 115 | |
2882 | + { skbaddr: ffff88008cdb1900 } hitcount: 1 len: 46 | |
2883 | + { skbaddr: ffff880064b5ef00 } hitcount: 1 len: 118 | |
2884 | + { skbaddr: ffff880044e3c700 } hitcount: 1 len: 60 | |
2885 | + { skbaddr: ffff880100065900 } hitcount: 1 len: 46 | |
2886 | + { skbaddr: ffff8800d46bd500 } hitcount: 1 len: 116 | |
2887 | + { skbaddr: ffff88005f3d5f00 } hitcount: 1 len: 1280 | |
2888 | + { skbaddr: ffff880100064700 } hitcount: 1 len: 365 | |
2889 | + { skbaddr: ffff8800badb6f00 } hitcount: 1 len: 60 | |
2890 | + . | |
2891 | + . | |
2892 | + . | |
2893 | + { skbaddr: ffff88009fe0be00 } hitcount: 27 len: 24677 | |
2894 | + { skbaddr: ffff88009fe0a400 } hitcount: 27 len: 23052 | |
2895 | + { skbaddr: ffff88009fe0b700 } hitcount: 31 len: 25589 | |
2896 | + { skbaddr: ffff88009fe0b600 } hitcount: 32 len: 27326 | |
2897 | + { skbaddr: ffff88006a462800 } hitcount: 68 len: 71678 | |
2898 | + { skbaddr: ffff88006a463700 } hitcount: 70 len: 72678 | |
2899 | + { skbaddr: ffff88006a462b00 } hitcount: 71 len: 77589 | |
2900 | + { skbaddr: ffff88006a463600 } hitcount: 73 len: 71307 | |
2901 | + { skbaddr: ffff88006a462200 } hitcount: 81 len: 81032 | |
2902 | + | |
2903 | + Totals: | |
2904 | + Hits: 1451 | |
2905 | + Entries: 318 | |
2906 | + Dropped: 0 | |
2907 | + | |
2908 | + | |
2909 | + # event histogram | |
2910 | + # | |
2911 | + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len == 256 [active] | |
2912 | + # | |
2913 | + | |
2914 | + | |
2915 | + Totals: | |
2916 | + Hits: 0 | |
2917 | + Entries: 0 | |
2918 | + Dropped: 0 | |
2919 | + | |
2920 | + | |
2921 | + # event histogram | |
2922 | + # | |
2923 | + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len > 4096 [active] | |
2924 | + # | |
2925 | + | |
2926 | + { skbaddr: ffff88009fd2c300 } hitcount: 1 len: 7212 | |
2927 | + { skbaddr: ffff8800d2bcce00 } hitcount: 1 len: 7212 | |
2928 | + { skbaddr: ffff8800d2bcd700 } hitcount: 1 len: 7212 | |
2929 | + { skbaddr: ffff8800d2bcda00 } hitcount: 1 len: 21492 | |
2930 | + { skbaddr: ffff8800ae2e2d00 } hitcount: 1 len: 7212 | |
2931 | + { skbaddr: ffff8800d2bcdb00 } hitcount: 1 len: 7212 | |
2932 | + { skbaddr: ffff88006a4df500 } hitcount: 1 len: 4854 | |
2933 | + { skbaddr: ffff88008ce47b00 } hitcount: 1 len: 18636 | |
2934 | + { skbaddr: ffff8800ae2e2200 } hitcount: 1 len: 12924 | |
2935 | + { skbaddr: ffff88005f3e1000 } hitcount: 1 len: 4356 | |
2936 | + { skbaddr: ffff8800d2bcdc00 } hitcount: 2 len: 24420 | |
2937 | + { skbaddr: ffff8800d2bcc200 } hitcount: 2 len: 12996 | |
2938 | + | |
2939 | + Totals: | |
2940 | + Hits: 14 | |
2941 | + Entries: 12 | |
2942 | + Dropped: 0 | |
2943 | + | |
2944 | + | |
2945 | + # event histogram | |
2946 | + # | |
2947 | + # trigger info: hist:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 if len < 0 [active] | |
2948 | + # | |
2949 | + | |
2950 | + | |
2951 | + Totals: | |
2952 | + Hits: 0 | |
2953 | + Entries: 0 | |
2954 | + Dropped: 0 | |
2955 | + | |
2956 | + Named triggers can be used to have triggers share a common set of | |
2957 | + histogram data. This capability is mostly useful for combining the | |
2958 | + output of events generated by tracepoints contained inside inline | |
2959 | + functions, but names can be used in a hist trigger on any event. | |
2960 | + For example, these two triggers when hit will update the same 'len' | |
2961 | + field in the shared 'foo' histogram data: | |
2962 | + | |
2963 | + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ | |
2964 | + /sys/kernel/debug/tracing/events/net/netif_receive_skb/trigger | |
2965 | + # echo 'hist:name=foo:keys=skbaddr.hex:vals=len' > \ | |
2966 | + /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
2967 | + | |
2968 | + You can see that they're updating common histogram data by reading | |
2969 | + each event's hist files at the same time: | |
2970 | + | |
2971 | + # cat /sys/kernel/debug/tracing/events/net/netif_receive_skb/hist; | |
2972 | + cat /sys/kernel/debug/tracing/events/net/netif_rx/hist | |
2973 | + | |
2974 | + # event histogram | |
2975 | + # | |
2976 | + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
2977 | + # | |
2978 | + | |
2979 | + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 | |
2980 | + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 | |
2981 | + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 | |
2982 | + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 | |
2983 | + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 | |
2984 | + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 | |
2985 | + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 | |
2986 | + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 | |
2987 | + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 | |
2988 | + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 | |
2989 | + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 | |
2990 | + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 | |
2991 | + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 | |
2992 | + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 | |
2993 | + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 | |
2994 | + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 | |
2995 | + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 | |
2996 | + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 | |
2997 | + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 | |
2998 | + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 | |
2999 | + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 | |
3000 | + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 | |
3001 | + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 | |
3002 | + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 | |
3003 | + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 | |
3004 | + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 | |
3005 | + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 | |
3006 | + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 | |
3007 | + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 | |
3008 | + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 | |
3009 | + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 | |
3010 | + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 | |
3011 | + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 | |
3012 | + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 | |
3013 | + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 | |
3014 | + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 | |
3015 | + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 | |
3016 | + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 | |
3017 | + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 | |
3018 | + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 | |
3019 | + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 | |
3020 | + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 | |
3021 | + | |
3022 | + Totals: | |
3023 | + Hits: 81 | |
3024 | + Entries: 42 | |
3025 | + Dropped: 0 | |
3026 | + # event histogram | |
3027 | + # | |
3028 | + # trigger info: hist:name=foo:keys=skbaddr.hex:vals=hitcount,len:sort=hitcount:size=2048 [active] | |
3029 | + # | |
3030 | + | |
3031 | + { skbaddr: ffff88000ad53500 } hitcount: 1 len: 46 | |
3032 | + { skbaddr: ffff8800af5a1500 } hitcount: 1 len: 76 | |
3033 | + { skbaddr: ffff8800d62a1900 } hitcount: 1 len: 46 | |
3034 | + { skbaddr: ffff8800d2bccb00 } hitcount: 1 len: 468 | |
3035 | + { skbaddr: ffff8800d3c69900 } hitcount: 1 len: 46 | |
3036 | + { skbaddr: ffff88009ff09100 } hitcount: 1 len: 52 | |
3037 | + { skbaddr: ffff88010f13ab00 } hitcount: 1 len: 168 | |
3038 | + { skbaddr: ffff88006a54f400 } hitcount: 1 len: 46 | |
3039 | + { skbaddr: ffff8800d2bcc500 } hitcount: 1 len: 260 | |
3040 | + { skbaddr: ffff880064505000 } hitcount: 1 len: 46 | |
3041 | + { skbaddr: ffff8800baf24e00 } hitcount: 1 len: 32 | |
3042 | + { skbaddr: ffff88009fe0ad00 } hitcount: 1 len: 46 | |
3043 | + { skbaddr: ffff8800d3edff00 } hitcount: 1 len: 44 | |
3044 | + { skbaddr: ffff88009fe0b400 } hitcount: 1 len: 168 | |
3045 | + { skbaddr: ffff8800a1c55a00 } hitcount: 1 len: 40 | |
3046 | + { skbaddr: ffff8800d2bcd100 } hitcount: 1 len: 40 | |
3047 | + { skbaddr: ffff880064505f00 } hitcount: 1 len: 174 | |
3048 | + { skbaddr: ffff8800a8bff200 } hitcount: 1 len: 160 | |
3049 | + { skbaddr: ffff880044e3cc00 } hitcount: 1 len: 76 | |
3050 | + { skbaddr: ffff8800a8bfe700 } hitcount: 1 len: 46 | |
3051 | + { skbaddr: ffff8800d2bcdc00 } hitcount: 1 len: 32 | |
3052 | + { skbaddr: ffff8800a1f64800 } hitcount: 1 len: 46 | |
3053 | + { skbaddr: ffff8800d2bcde00 } hitcount: 1 len: 988 | |
3054 | + { skbaddr: ffff88006a5dea00 } hitcount: 1 len: 46 | |
3055 | + { skbaddr: ffff88002e37a200 } hitcount: 1 len: 44 | |
3056 | + { skbaddr: ffff8800a1f32c00 } hitcount: 2 len: 676 | |
3057 | + { skbaddr: ffff88000ad52600 } hitcount: 2 len: 107 | |
3058 | + { skbaddr: ffff8800a1f91e00 } hitcount: 2 len: 92 | |
3059 | + { skbaddr: ffff8800af5a0200 } hitcount: 2 len: 142 | |
3060 | + { skbaddr: ffff8800d2bcc600 } hitcount: 2 len: 220 | |
3061 | + { skbaddr: ffff8800ba36f500 } hitcount: 2 len: 92 | |
3062 | + { skbaddr: ffff8800d021f800 } hitcount: 2 len: 92 | |
3063 | + { skbaddr: ffff8800a1f33600 } hitcount: 2 len: 675 | |
3064 | + { skbaddr: ffff8800a8bfff00 } hitcount: 3 len: 138 | |
3065 | + { skbaddr: ffff8800d62a1300 } hitcount: 3 len: 138 | |
3066 | + { skbaddr: ffff88002e37a100 } hitcount: 4 len: 184 | |
3067 | + { skbaddr: ffff880064504400 } hitcount: 4 len: 184 | |
3068 | + { skbaddr: ffff8800a8bfec00 } hitcount: 4 len: 184 | |
3069 | + { skbaddr: ffff88000ad53700 } hitcount: 5 len: 230 | |
3070 | + { skbaddr: ffff8800d2bcdb00 } hitcount: 5 len: 196 | |
3071 | + { skbaddr: ffff8800a1f90000 } hitcount: 6 len: 276 | |
3072 | + { skbaddr: ffff88006a54f900 } hitcount: 6 len: 276 | |
3073 | + | |
3074 | + Totals: | |
3075 | + Hits: 81 | |
3076 | + Entries: 42 | |
3077 | + Dropped: 0 | |
3078 | + | |
3079 | + And here's an example that shows how to combine histogram data from | |
3080 | + any two events even if they don't share any 'compatible' fields | |
3081 | + other than 'hitcount' and 'stacktrace'. These commands create a | |
3082 | + couple of triggers named 'bar' using those fields: | |
3083 | + | |
3084 | + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ | |
3085 | + /sys/kernel/debug/tracing/events/sched/sched_process_fork/trigger | |
3086 | + # echo 'hist:name=bar:key=stacktrace:val=hitcount' > \ | |
3087 | + /sys/kernel/debug/tracing/events/net/netif_rx/trigger | |
3088 | + | |
3089 | + And displaying the output of either shows some interesting if | |
3090 | + somewhat confusing output: | |
3091 | + | |
3092 | + # cat /sys/kernel/debug/tracing/events/sched/sched_process_fork/hist | |
3093 | + # cat /sys/kernel/debug/tracing/events/net/netif_rx/hist | |
3094 | + | |
3095 | + # event histogram | |
3096 | + # | |
3097 | + # trigger info: hist:name=bar:keys=stacktrace:vals=hitcount:sort=hitcount:size=2048 [active] | |
3098 | + # | |
3099 | + | |
3100 | + { stacktrace: | |
3101 | + _do_fork+0x18e/0x330 | |
3102 | + kernel_thread+0x29/0x30 | |
3103 | + kthreadd+0x154/0x1b0 | |
3104 | + ret_from_fork+0x3f/0x70 | |
3105 | + } hitcount: 1 | |
3106 | + { stacktrace: | |
3107 | + netif_rx_internal+0xb2/0xd0 | |
3108 | + netif_rx_ni+0x20/0x70 | |
3109 | + dev_loopback_xmit+0xaa/0xd0 | |
3110 | + ip_mc_output+0x126/0x240 | |
3111 | + ip_local_out_sk+0x31/0x40 | |
3112 | + igmp_send_report+0x1e9/0x230 | |
3113 | + igmp_timer_expire+0xe9/0x120 | |
3114 | + call_timer_fn+0x39/0xf0 | |
3115 | + run_timer_softirq+0x1e1/0x290 | |
3116 | + __do_softirq+0xfd/0x290 | |
3117 | + irq_exit+0x98/0xb0 | |
3118 | + smp_apic_timer_interrupt+0x4a/0x60 | |
3119 | + apic_timer_interrupt+0x6d/0x80 | |
3120 | + cpuidle_enter+0x17/0x20 | |
3121 | + call_cpuidle+0x3b/0x60 | |
3122 | + cpu_startup_entry+0x22d/0x310 | |
3123 | + } hitcount: 1 | |
3124 | + { stacktrace: | |
3125 | + netif_rx_internal+0xb2/0xd0 | |
3126 | + netif_rx_ni+0x20/0x70 | |
3127 | + dev_loopback_xmit+0xaa/0xd0 | |
3128 | + ip_mc_output+0x17f/0x240 | |
3129 | + ip_local_out_sk+0x31/0x40 | |
3130 | + ip_send_skb+0x1a/0x50 | |
3131 | + udp_send_skb+0x13e/0x270 | |
3132 | + udp_sendmsg+0x2bf/0x980 | |
3133 | + inet_sendmsg+0x67/0xa0 | |
3134 | + sock_sendmsg+0x38/0x50 | |
3135 | + SYSC_sendto+0xef/0x170 | |
3136 | + SyS_sendto+0xe/0x10 | |
3137 | + entry_SYSCALL_64_fastpath+0x12/0x6a | |
3138 | + } hitcount: 2 | |
3139 | + { stacktrace: | |
3140 | + netif_rx_internal+0xb2/0xd0 | |
3141 | + netif_rx+0x1c/0x60 | |
3142 | + loopback_xmit+0x6c/0xb0 | |
3143 | + dev_hard_start_xmit+0x219/0x3a0 | |
3144 | + __dev_queue_xmit+0x415/0x4f0 | |
3145 | + dev_queue_xmit_sk+0x13/0x20 | |
3146 | + ip_finish_output2+0x237/0x340 | |
3147 | + ip_finish_output+0x113/0x1d0 | |
3148 | + ip_output+0x66/0xc0 | |
3149 | + ip_local_out_sk+0x31/0x40 | |
3150 | + ip_send_skb+0x1a/0x50 | |
3151 | + udp_send_skb+0x16d/0x270 | |
3152 | + udp_sendmsg+0x2bf/0x980 | |
3153 | + inet_sendmsg+0x67/0xa0 | |
3154 | + sock_sendmsg+0x38/0x50 | |
3155 | + ___sys_sendmsg+0x14e/0x270 | |
3156 | + } hitcount: 76 | |
3157 | + { stacktrace: | |
3158 | + netif_rx_internal+0xb2/0xd0 | |
3159 | + netif_rx+0x1c/0x60 | |
3160 | + loopback_xmit+0x6c/0xb0 | |
3161 | + dev_hard_start_xmit+0x219/0x3a0 | |
3162 | + __dev_queue_xmit+0x415/0x4f0 | |
3163 | + dev_queue_xmit_sk+0x13/0x20 | |
3164 | + ip_finish_output2+0x237/0x340 | |
3165 | + ip_finish_output+0x113/0x1d0 | |
3166 | + ip_output+0x66/0xc0 | |
3167 | + ip_local_out_sk+0x31/0x40 | |
3168 | + ip_send_skb+0x1a/0x50 | |
3169 | + udp_send_skb+0x16d/0x270 | |
3170 | + udp_sendmsg+0x2bf/0x980 | |
3171 | + inet_sendmsg+0x67/0xa0 | |
3172 | + sock_sendmsg+0x38/0x50 | |
3173 | + ___sys_sendmsg+0x269/0x270 | |
3174 | + } hitcount: 77 | |
3175 | + { stacktrace: | |
3176 | + netif_rx_internal+0xb2/0xd0 | |
3177 | + netif_rx+0x1c/0x60 | |
3178 | + loopback_xmit+0x6c/0xb0 | |
3179 | + dev_hard_start_xmit+0x219/0x3a0 | |
3180 | + __dev_queue_xmit+0x415/0x4f0 | |
3181 | + dev_queue_xmit_sk+0x13/0x20 | |
3182 | + ip_finish_output2+0x237/0x340 | |
3183 | + ip_finish_output+0x113/0x1d0 | |
3184 | + ip_output+0x66/0xc0 | |
3185 | + ip_local_out_sk+0x31/0x40 | |
3186 | + ip_send_skb+0x1a/0x50 | |
3187 | + udp_send_skb+0x16d/0x270 | |
3188 | + udp_sendmsg+0x2bf/0x980 | |
3189 | + inet_sendmsg+0x67/0xa0 | |
3190 | + sock_sendmsg+0x38/0x50 | |
3191 | + SYSC_sendto+0xef/0x170 | |
3192 | + } hitcount: 88 | |
3193 | + { stacktrace: | |
3194 | + _do_fork+0x18e/0x330 | |
3195 | + SyS_clone+0x19/0x20 | |
3196 | + entry_SYSCALL_64_fastpath+0x12/0x6a | |
3197 | + } hitcount: 244 | |
3198 | + | |
b3bbd485 JK |
3199 | + Totals: |
3200 | + Hits: 489 | |
3201 | + Entries: 7 | |
3202 | + Dropped: 0 | |
3203 | + | |
3204 | + | |
3205 | +2.2 Inter-event hist triggers | |
3206 | +----------------------------- | |
3207 | + | |
3208 | +Inter-event hist triggers are hist triggers that combine values from | |
3209 | +one or more other events and create a histogram using that data. Data | |
3210 | +from an inter-event histogram can in turn become the source for | |
3211 | +further combined histograms, thus providing a chain of related | |
3212 | +histograms, which is important for some applications. | |
3213 | + | |
3214 | +The most important example of an inter-event quantity that can be used | |
3215 | +in this manner is latency, which is simply a difference in timestamps | |
3216 | +between two events. Although latency is the most important | |
3217 | +inter-event quantity, note that because the support is completely | |
3218 | +general across the trace event subsystem, any event field can be used | |
3219 | +in an inter-event quantity. | |
3220 | + | |
3221 | +An example of a histogram that combines data from other histograms | |
3222 | +into a useful chain would be a 'wakeupswitch latency' histogram that | |
3223 | +combines a 'wakeup latency' histogram and a 'switch latency' | |
3224 | +histogram. | |
3225 | + | |
3226 | +Normally, a hist trigger specification consists of a (possibly | |
3227 | +compound) key along with one or more numeric values, which are | |
3228 | +continually updated sums associated with that key. A histogram | |
3229 | +specification in this case consists of individual key and value | |
3230 | +specifications that refer to trace event fields associated with a | |
3231 | +single event type. | |
3232 | + | |
3233 | +The inter-event hist trigger extension allows fields from multiple | |
3234 | +events to be referenced and combined into a multi-event histogram | |
3235 | +specification. In support of this overall goal, a few enabling | |
3236 | +features have been added to the hist trigger support: | |
3237 | + | |
3238 | + - In order to compute an inter-event quantity, a value from one | |
3239 | + event needs to be saved and then referenced from another event. This | |
3240 | + requires the introduction of support for histogram 'variables'. | |
3241 | + | |
3242 | + - The computation of inter-event quantities and their combination | |
3243 | + require some minimal amount of support for applying simple | |
3244 | + expressions to variables (+ and -). | |
3245 | + | |
3246 | + - A histogram consisting of inter-event quantities isn't logically a | |
3247 | + histogram on either event (so having the 'hist' file for either | |
3248 | + event host the histogram output doesn't really make sense). To | |
3249 | + address the idea that the histogram is associated with a | |
3250 | + combination of events, support is added allowing the creation of | |
3251 | + 'synthetic' events that are events derived from other events. | |
3252 | + These synthetic events are full-fledged events just like any other | |
3253 | + and can be used as such, as for instance to create the | |
3254 | + 'combination' histograms mentioned previously. | |
3255 | + | |
3256 | + - A set of 'actions' can be associated with histogram entries - | |
3257 | + these can be used to generate the previously mentioned synthetic | |
3258 | + events, but can also be used for other purposes, such as for | |
3259 | + example saving context when a 'max' latency has been hit. | |
3260 | + | |
3261 | + - Trace events don't have a 'timestamp' associated with them, but | |
3262 | + there is an implicit timestamp saved along with an event in the | |
3263 | + underlying ftrace ring buffer. This timestamp is now exposed as | |
3264 | + a synthetic field named 'common_timestamp' which can be used in | |
3265 | + histograms as if it were any other event field; it isn't an actual | |
3266 | + field in the trace format but rather is a synthesized value that | |
3267 | + nonetheless can be used as if it were an actual field. By default | |
3268 | + it is in units of nanoseconds; appending '.usecs' to a | |
3269 | + common_timestamp field changes the units to microseconds. | |
3270 | + | |
3271 | +A note on inter-event timestamps: If common_timestamp is used in a | |
3272 | +histogram, the trace buffer is automatically switched over to using | |
3273 | +absolute timestamps and the "global" trace clock, in order to avoid | |
3274 | +bogus timestamp differences with other clocks that aren't coherent | |
3275 | +across CPUs. This can be overridden by specifying one of the other | |
3276 | +trace clocks instead, using the "clock=XXX" hist trigger attribute, | |
3277 | +where XXX is any of the clocks listed in the tracing/trace_clock | |
3278 | +pseudo-file. | |
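 | + | |
 | +As an illustrative sketch (not part of the patch itself, and assuming | |
 | +an 'x86-tsc' entry appears in the tracing/trace_clock pseudo-file on | |
 | +the system), the automatic clock switch could be overridden like this: | |
 | + | |
 | + # echo 'hist:keys=pid:ts0=common_timestamp.usecs:clock=x86-tsc ...' >> \ | |
 | + event/trigger | |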
3279 | + | |
3280 | +These features are described in more detail in the following sections. | |
3281 | + | |
3282 | +2.2.1 Histogram Variables | |
3283 | +------------------------- | |
3284 | + | |
3285 | +Variables are simply named locations used for saving and retrieving | |
3286 | +values between matching events. A 'matching' event is defined as an | |
3287 | +event that has a matching key - if a variable is saved for a histogram | |
3288 | +entry corresponding to that key, any subsequent event with a matching | |
3289 | +key can access that variable. | |
3290 | + | |
3291 | +A variable's value is normally available to any subsequent event until | |
3292 | +it is set to something else by a subsequent event. The one exception | |
3293 | +to that rule is that any variable used in an expression is essentially | |
3294 | +'read-once' - once it's used by an expression in a subsequent event, | |
3295 | +it's reset to its 'unset' state, which means it can't be used again | |
3296 | +unless it's set again. This ensures not only that an event doesn't | |
3297 | +use an uninitialized variable in a calculation, but that that variable | |
3298 | +is used only once and not for any unrelated subsequent match. | |
3299 | + | |
3300 | +The basic syntax for saving a variable is to prefix any event field | |
3301 | +with a unique variable name (one not corresponding to any keyword) | |
3302 | +followed by an '=' sign. | |
3303 | + | |
3304 | +Either keys or values can be saved and retrieved in this way. This | |
3305 | +creates a variable named 'ts0' for a histogram entry with the key | |
3306 | +'next_pid': | |
3307 | + | |
3308 | + # echo 'hist:keys=next_pid:vals=$ts0:ts0=common_timestamp ...' >> \ | |
3309 | + event/trigger | |
3310 | + | |
3311 | +The ts0 variable can be accessed by any subsequent event having the | |
3312 | +same pid as 'next_pid'. | |
3313 | + | |
3314 | +Variable references are formed by prepending the variable name with | |
3315 | +the '$' sign. Thus for example, the ts0 variable above would be | |
3316 | +referenced as '$ts0' in expressions. | |
3317 | + | |
3318 | +Because 'vals=' is used, the common_timestamp variable value above | |
3319 | +will also be summed as a normal histogram value would (though for a | |
3320 | +timestamp it makes little sense). | |
3321 | + | |
3322 | +The below shows that a key value can also be saved in the same way: | |
3323 | + | |
3324 | + # echo 'hist:timer_pid=common_pid:key=timer_pid ...' >> event/trigger | |
3325 | + | |
3326 | +If a variable isn't a key variable or prefixed with 'vals=', the | |
3327 | +associated event field will be saved in a variable but won't be summed | |
3328 | +as a value: | |
3329 | + | |
3330 | + # echo 'hist:keys=next_pid:ts1=common_timestamp ...' >> event/trigger | |
3331 | + | |
3332 | +Multiple variables can be assigned at the same time. The below would | |
3333 | +result in both ts0 and b being created as variables, with both | |
3334 | +common_timestamp and field1 additionally being summed as values: | |
3335 | + | |
3336 | + # echo 'hist:keys=pid:vals=$ts0,$b:ts0=common_timestamp,b=field1 ...' >> \ | |
3337 | + event/trigger | |
3338 | + | |
3339 | +Note that variable assignments can appear either preceding or | |
3340 | +following their use. The command below behaves identically to the | |
3341 | +command above: | |
3342 | + | |
3343 | + # echo 'hist:keys=pid:ts0=common_timestamp,b=field1:vals=$ts0,$b ...' >> \ | |
3344 | + event/trigger | |
3345 | + | |
3346 | +Any number of variables not bound to a 'vals=' prefix can also be | |
3347 | +assigned by simply separating them with colons. Below is the same | |
3348 | +thing but without the values being summed in the histogram: | |
3349 | + | |
3350 | + # echo 'hist:keys=pid:ts0=common_timestamp:b=field1 ...' >> event/trigger | |
3351 | + | |
3352 | +Variables set as above can be referenced and used in expressions on | |
3353 | +another event. | |
3354 | + | |
3355 | +For example, here's how a latency can be calculated: | |
3356 | + | |
3357 | + # echo 'hist:keys=pid,prio:ts0=common_timestamp ...' >> event1/trigger | |
3358 | + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp-$ts0 ...' >> event2/trigger | |
3359 | + | |
3360 | +In the first line above, the event's timestamp is saved into the | |
3361 | +variable ts0. In the next line, ts0 is subtracted from the second | |
3362 | +event's timestamp to produce the latency, which is then assigned into | |
3363 | +yet another variable, 'wakeup_lat'. The hist trigger below in turn | |
3364 | +makes use of the wakeup_lat variable to compute a combined latency | |
3365 | +using the same key and variable from yet another event: | |
3366 | + | |
3367 | + # echo 'hist:key=pid:wakeupswitch_lat=$wakeup_lat+$switchtime_lat ...' >> event3/trigger | |
3368 | + | |
3369 | +2.2.2 Synthetic Events | |
3370 | +---------------------- | |
3371 | + | |
3372 | +Synthetic events are user-defined events generated from hist trigger | |
3373 | +variables or fields associated with one or more other events. Their | |
3374 | +purpose is to provide a mechanism for displaying data spanning | |
3375 | +multiple events consistent with the existing and already familiar | |
3376 | +usage for normal events. | |
3377 | + | |
3378 | +To define a synthetic event, the user writes a simple specification | |
3379 | +consisting of the name of the new event along with one or more | |
3380 | +variables and their types, which can be any valid field type, | |
3381 | +separated by semicolons, to the tracing/synthetic_events file. | |
3382 | + | |
3383 | +For instance, the following creates a new event named 'wakeup_latency' | |
3384 | +with 3 fields: lat, pid, and prio. Each of those fields is simply a | |
3385 | +variable reference to a variable on another event: | |
3386 | + | |
3387 | + # echo 'wakeup_latency \ | |
3388 | + u64 lat; \ | |
3389 | + pid_t pid; \ | |
3390 | + int prio' >> \ | |
3391 | + /sys/kernel/debug/tracing/synthetic_events | |
3392 | + | |
3393 | +Reading the tracing/synthetic_events file lists all the currently | |
3394 | +defined synthetic events, in this case the event defined above: | |
3395 | + | |
3396 | + # cat /sys/kernel/debug/tracing/synthetic_events | |
3397 | + wakeup_latency u64 lat; pid_t pid; int prio | |
3398 | + | |
3399 | +An existing synthetic event definition can be removed by prepending | |
3400 | +the command that defined it with a '!': | |
3401 | + | |
3402 | + # echo '!wakeup_latency u64 lat pid_t pid int prio' >> \ | |
3403 | + /sys/kernel/debug/tracing/synthetic_events | |
3404 | + | |
3405 | +At this point, there isn't yet an actual 'wakeup_latency' event | |
3406 | +instantiated in the event subsystem - for this to happen, a 'hist | |
3407 | +trigger action' needs to be instantiated and bound to actual fields | |
3408 | +and variables defined on other events (see Section 2.2.3 below). | |
3409 | + | |
3410 | +Once that is done, an event instance is created, and a histogram can | |
3411 | +be defined using it: | |
3412 | + | |
3413 | + # echo 'hist:keys=pid,prio,lat.log2:sort=pid,lat' >> \ | |
3414 | + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger | |
3415 | + | |
3416 | +The new event is created under the tracing/events/synthetic/ directory | |
3417 | +and looks and behaves just like any other event: | |
3418 | + | |
3419 | + # ls /sys/kernel/debug/tracing/events/synthetic/wakeup_latency | |
3420 | + enable filter format hist id trigger | |
3421 | + | |
3422 | +Like any other event, once a histogram is enabled for the event, the | |
3423 | +output can be displayed by reading the event's 'hist' file. | |
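 | + | |
 | +For example, for the wakeup_latency event defined above (path assuming | |
 | +the usual debugfs mount point used throughout this document): | |
 | + | |
 | + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist | |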
3424 | + | |
3425 | +2.2.3 Hist trigger 'actions' | |
3426 | +---------------------------- | |
3427 | + | |
3428 | +A hist trigger 'action' is a function that's executed whenever a | |
3429 | +histogram entry is added or updated. | |
3430 | + | |
3431 | +The default 'action' if no special function is explicitly specified is | |
3432 | +as it always has been, to simply update the set of values associated | |
3433 | +with an entry. Some applications, however, may want to perform | |
3434 | +additional actions at that point, such as generate another event, or | |
3435 | +compare and save a maximum. | |
3436 | + | |
3437 | +The following additional actions are available. To specify an action | |
3438 | +for a given event, simply specify the action between colons in the | |
3439 | +hist trigger specification. | |
3440 | + | |
3441 | + - onmatch(matching.event).<synthetic_event_name>(param list) | |
3442 | + | |
3443 | + The 'onmatch(matching.event).<synthetic_event_name>(params)' hist | |
3444 | + trigger action is invoked whenever an event matches and the | |
3445 | + histogram entry would be added or updated. It causes the named | |
3446 | + synthetic event to be generated with the values given in the | |
3447 | + 'param list'. The result is the generation of a synthetic event | |
3448 | + that consists of the values contained in those variables at the | |
3449 | + time the invoking event was hit. | |
3450 | + | |
3451 | + The 'param list' consists of one or more parameters which may be | |
3452 | + either variables or fields defined on either the 'matching.event' | |
3453 | + or the target event. The variables or fields specified in the | |
3454 | + param list may be either fully-qualified or unqualified. If a | |
3455 | + variable is specified as unqualified, it must be unique between | |
3456 | + the two events. A field name used as a param can be unqualified | |
3457 | + if it refers to the target event, but must be fully qualified if | |
3458 | + it refers to the matching event. A fully-qualified name is of the | |
3459 | + form 'system.event_name.$var_name' or 'system.event_name.field'. | |
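 | + | |
 | + For instance, in a hypothetical param list, '$wakeup_lat' and | |
 | + 'next_prio' would be unqualified variable and field references, | |
 | + while 'sched.sched_waking.$saved_pid' would be the fully-qualified | |
 | + form of a variable defined on the matching sched_waking event. | |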
3460 | + | |
3461 | + The 'matching.event' specification is simply the fully qualified | |
3462 | + event name of the event that matches the target event for the | |
3463 | + onmatch() functionality, in the form 'system.event_name'. | |
3464 | + | |
3465 | + Finally, the number and type of variables/fields in the 'param | |
3466 | + list' must match the number and types of the fields in the | |
3467 | + synthetic event being generated. | |
3468 | + | |
3469 | + As an example the below defines a simple synthetic event and uses | |
3470 | + a variable defined on the sched_wakeup_new event as a parameter | |
3471 | + when invoking the synthetic event. Here we define the synthetic | |
3472 | + event: | |
3473 | + | |
3474 | + # echo 'wakeup_new_test pid_t pid' >> \ | |
3475 | + /sys/kernel/debug/tracing/synthetic_events | |
3476 | + | |
3477 | + # cat /sys/kernel/debug/tracing/synthetic_events | |
3478 | + wakeup_new_test pid_t pid | |
3479 | + | |
3480 | + The following hist trigger both defines the missing testpid | |
3481 | + variable and specifies an onmatch() action that generates a | |
3482 | + wakeup_new_test synthetic event whenever a sched_wakeup_new event | |
3483 | + occurs, which because of the 'if comm == "cyclictest"' filter only | |
3484 | + happens when the executable is cyclictest: | |
3485 | + | |
3486 | + # echo 'hist:keys=$testpid:testpid=pid:onmatch(sched.sched_wakeup_new).\ | |
3487 | + wakeup_new_test($testpid) if comm=="cyclictest"' >> \ | |
3488 | + /sys/kernel/debug/tracing/events/sched/sched_wakeup_new/trigger | |
3489 | + | |
3490 | + Creating and displaying a histogram based on those events is now | |
3491 | + just a matter of using the fields and new synthetic event in the | |
3492 | + tracing/events/synthetic directory, as usual: | |
3493 | + | |
3494 | + # echo 'hist:keys=pid:sort=pid' >> \ | |
3495 | + /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/trigger | |
3496 | + | |
3497 | + Running 'cyclictest' should cause wakeup_new events to generate | |
3498 | + wakeup_new_test synthetic events which should result in histogram | |
3499 | + output in the wakeup_new_test event's hist file: | |
3500 | + | |
3501 | + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_new_test/hist | |
3502 | + | |
3503 | + A more typical usage would be to use two events to calculate a | |
3504 | + latency. The following example uses a set of hist triggers to | |
3505 | + produce a 'wakeup_latency' histogram: | |
3506 | + | |
3507 | + First, we define a 'wakeup_latency' synthetic event: | |
3508 | + | |
3509 | + # echo 'wakeup_latency u64 lat; pid_t pid; int prio' >> \ | |
3510 | + /sys/kernel/debug/tracing/synthetic_events | |
e4b2b4a8 | 3511 | + |
b3bbd485 JK |
3512 | + Next, we specify that whenever we see a sched_waking event for a |
3513 | + cyclictest thread, save the timestamp in a 'ts0' variable: | |
e4b2b4a8 | 3514 | + |
b3bbd485 JK |
3515 | + # echo 'hist:keys=$saved_pid:saved_pid=pid:ts0=common_timestamp.usecs \ |
3516 | + if comm=="cyclictest"' >> \ | |
3517 | + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger | |
e4b2b4a8 | 3518 | + |
b3bbd485 JK |
3519 | + Then, when the corresponding thread is actually scheduled onto the |
3520 | + CPU by a sched_switch event, calculate the latency and use that | |
3521 | + along with another variable and an event field to generate a | |
3522 | + wakeup_latency synthetic event: | |
e4b2b4a8 | 3523 | + |
b3bbd485 JK |
3524 | + # echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:\ |
3525 | + onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,\ | |
3526 | + $saved_pid,next_prio) if next_comm=="cyclictest"' >> \ | |
3527 | + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger | |
e4b2b4a8 | 3528 | + |
b3bbd485 JK |
3529 | + We also need to create a histogram on the wakeup_latency synthetic |
3530 | + event in order to aggregate the generated synthetic event data: | |
e4b2b4a8 | 3531 | + |
b3bbd485 JK |
3532 | + # echo 'hist:keys=pid,prio,lat:sort=pid,lat' >> \ |
3533 | + /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/trigger | |
e4b2b4a8 | 3534 | + |
b3bbd485 JK |
3535 | + Finally, once we've run cyclictest to actually generate some |
3536 | + events, we can see the output by looking at the wakeup_latency | |
3537 | + synthetic event's hist file: | |
e4b2b4a8 | 3538 | + |
b3bbd485 | 3539 | + # cat /sys/kernel/debug/tracing/events/synthetic/wakeup_latency/hist |
e4b2b4a8 | 3540 | + |
b3bbd485 | 3541 | + - onmax(var).save(field,...) |
e4b2b4a8 | 3542 | + |
b3bbd485 JK |
3543 | + The 'onmax(var).save(field,...)' hist trigger action is invoked |
3544 | + whenever the value of 'var' associated with a histogram entry | |
3545 | + exceeds the current maximum contained in that variable. | |
e4b2b4a8 | 3546 | + |
b3bbd485 JK |
3547 | + The end result is that the trace event fields specified as the |
3548 | + onmax.save() params will be saved if 'var' exceeds the current | |
3549 | + maximum for that hist trigger entry. This allows context from the | |
3550 | + event that exhibited the new maximum to be saved for later | |
3551 | + reference. When the histogram is displayed, additional fields | |
3552 | + displaying the saved values will be printed. | |
e4b2b4a8 | 3553 | + |
b3bbd485 JK |
3554 | + As an example the below defines a couple of hist triggers, one for |
3555 | + sched_waking and another for sched_switch, keyed on pid. Whenever | |
3556 | + a sched_waking occurs, the timestamp is saved in the entry | |
3557 | + corresponding to the current pid, and when the scheduler switches | |
3558 | + back to that pid, the timestamp difference is calculated. If the | |
3559 | + resulting latency, stored in wakeup_lat, exceeds the current | |
3560 | + maximum latency, the values specified in the save() fields are | |
3561 | + recorded: | |
e4b2b4a8 | 3562 | + |
b3bbd485 JK |
3563 | + # echo 'hist:keys=pid:ts0=common_timestamp.usecs \ |
3564 | + if comm=="cyclictest"' >> \ | |
3565 | + /sys/kernel/debug/tracing/events/sched/sched_waking/trigger | |
e4b2b4a8 | 3566 | + |
b3bbd485 JK |
3567 | + # echo 'hist:keys=next_pid:\ |
3568 | + wakeup_lat=common_timestamp.usecs-$ts0:\ | |
3569 | + onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) \ | |
3570 | + if next_comm=="cyclictest"' >> \ | |
3571 | + /sys/kernel/debug/tracing/events/sched/sched_switch/trigger | |
e4b2b4a8 | 3572 | + |
b3bbd485 JK |
3573 | + When the histogram is displayed, the max value and the saved |
3574 | + values corresponding to the max are displayed following the rest | |
3575 | + of the fields: | |
e4b2b4a8 | 3576 | + |
b3bbd485 JK |
3577 | + # cat /sys/kernel/debug/tracing/events/sched/sched_switch/hist |
3578 | + { next_pid: 2255 } hitcount: 239 | |
3579 | + common_timestamp-ts0: 0 | |
3580 | + max: 27 | |
3581 | + next_comm: cyclictest | |
3582 | + prev_pid: 0 prev_prio: 120 prev_comm: swapper/1 | |
e4b2b4a8 | 3583 | + |
b3bbd485 JK |
3584 | + { next_pid: 2256 } hitcount: 2355 |
3585 | + common_timestamp-ts0: 0 | |
3586 | + max: 49 next_comm: cyclictest | |
3587 | + prev_pid: 0 prev_prio: 120 prev_comm: swapper/0 | |
e4b2b4a8 | 3588 | + |
b3bbd485 JK |
3589 | + Totals: |
3590 | + Hits: 12970 | |
3591 | + Entries: 2 | |
3592 | + Dropped: 0 | |
3593 | diff --git a/arch/Kconfig b/arch/Kconfig | |
3594 | index 40dc31fea90c..7c6108479209 100644 | |
3595 | --- a/arch/Kconfig | |
3596 | +++ b/arch/Kconfig | |
3597 | @@ -20,6 +20,7 @@ config OPROFILE | |
3598 | tristate "OProfile system profiling" | |
3599 | depends on PROFILING | |
3600 | depends on HAVE_OPROFILE | |
3601 | + depends on !PREEMPT_RT_FULL | |
3602 | select RING_BUFFER | |
3603 | select RING_BUFFER_ALLOW_SWAP | |
3604 | help | |
3605 | diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h | |
3606 | index 1d5716bc060b..6883bc952d22 100644 | |
3607 | --- a/arch/alpha/include/asm/spinlock_types.h | |
3608 | +++ b/arch/alpha/include/asm/spinlock_types.h | |
3609 | @@ -2,10 +2,6 @@ | |
3610 | #ifndef _ALPHA_SPINLOCK_TYPES_H | |
3611 | #define _ALPHA_SPINLOCK_TYPES_H | |
3612 | ||
3613 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
3614 | -# error "please don't include this file directly" | |
3615 | -#endif | |
3616 | - | |
3617 | typedef struct { | |
3618 | volatile unsigned int lock; | |
3619 | } arch_spinlock_t; | |
3620 | diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig | |
3621 | index d1346a160760..558b0995e94a 100644 | |
3622 | --- a/arch/arm/Kconfig | |
3623 | +++ b/arch/arm/Kconfig | |
3624 | @@ -45,7 +45,7 @@ config ARM | |
3625 | select HARDIRQS_SW_RESEND | |
3626 | select HAVE_ARCH_AUDITSYSCALL if (AEABI && !OABI_COMPAT) | |
3627 | select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 | |
3628 | - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU | |
3629 | + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT_BASE | |
3630 | select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU | |
3631 | select HAVE_ARCH_MMAP_RND_BITS if MMU | |
3632 | select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) | |
3633 | @@ -85,6 +85,7 @@ config ARM | |
3634 | select HAVE_PERF_EVENTS | |
3635 | select HAVE_PERF_REGS | |
3636 | select HAVE_PERF_USER_STACK_DUMP | |
3637 | + select HAVE_PREEMPT_LAZY | |
3638 | select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE) | |
3639 | select HAVE_REGS_AND_STACK_ACCESS_API | |
3640 | select HAVE_SYSCALL_TRACEPOINTS | |
3641 | @@ -2164,7 +2165,7 @@ config NEON | |
3642 | ||
3643 | config KERNEL_MODE_NEON | |
3644 | bool "Support for NEON in kernel mode" | |
3645 | - depends on NEON && AEABI | |
3646 | + depends on NEON && AEABI && !PREEMPT_RT_BASE | |
3647 | help | |
3648 | Say Y to include support for NEON in kernel mode. | |
3649 | ||
3650 | diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h | |
3651 | index b6f319606e30..ad377ef73739 100644 | |
3652 | --- a/arch/arm/include/asm/irq.h | |
3653 | +++ b/arch/arm/include/asm/irq.h | |
3654 | @@ -23,6 +23,8 @@ | |
3655 | #endif | |
3656 | ||
3657 | #ifndef __ASSEMBLY__ | |
3658 | +#include <linux/cpumask.h> | |
e4b2b4a8 | 3659 | + |
b3bbd485 JK |
3660 | struct irqaction; |
3661 | struct pt_regs; | |
3662 | extern void migrate_irqs(void); | |
3663 | diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h | |
3664 | index 5976958647fe..a37c0803954b 100644 | |
3665 | --- a/arch/arm/include/asm/spinlock_types.h | |
3666 | +++ b/arch/arm/include/asm/spinlock_types.h | |
3667 | @@ -2,10 +2,6 @@ | |
3668 | #ifndef __ASM_SPINLOCK_TYPES_H | |
3669 | #define __ASM_SPINLOCK_TYPES_H | |
3670 | ||
3671 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
3672 | -# error "please don't include this file directly" | |
3673 | -#endif | |
3674 | - | |
3675 | #define TICKET_SHIFT 16 | |
3676 | ||
3677 | typedef struct { | |
3678 | diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h | |
3679 | index d3e937dcee4d..6ab96a2ce1f8 100644 | |
3680 | --- a/arch/arm/include/asm/switch_to.h | |
3681 | +++ b/arch/arm/include/asm/switch_to.h | |
3682 | @@ -4,6 +4,13 @@ | |
3683 | ||
3684 | #include <linux/thread_info.h> | |
3685 | ||
3686 | +#if defined CONFIG_PREEMPT_RT_FULL && defined CONFIG_HIGHMEM | |
3687 | +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p); | |
3688 | +#else | |
3689 | +static inline void | |
3690 | +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } | |
3691 | +#endif | |
e4b2b4a8 | 3692 | + |
b3bbd485 JK |
3693 | /* |
3694 | * For v7 SMP cores running a preemptible kernel we may be pre-empted | |
3695 | * during a TLB maintenance operation, so execute an inner-shareable dsb | |
3696 | @@ -26,6 +33,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info | |
3697 | #define switch_to(prev,next,last) \ | |
3698 | do { \ | |
3699 | __complete_pending_tlbi(); \ | |
3700 | + switch_kmaps(prev, next); \ | |
3701 | last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \ | |
3702 | } while (0) | |
3703 | ||
3704 | diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h | |
5dd41b01 | 3705 | index 57d2ad9c75ca..cdfb6855943b 100644 |
b3bbd485 JK |
3706 | --- a/arch/arm/include/asm/thread_info.h |
3707 | +++ b/arch/arm/include/asm/thread_info.h | |
3708 | @@ -49,6 +49,7 @@ struct cpu_context_save { | |
3709 | struct thread_info { | |
3710 | unsigned long flags; /* low level flags */ | |
3711 | int preempt_count; /* 0 => preemptable, <0 => bug */ | |
3712 | + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ | |
3713 | mm_segment_t addr_limit; /* address limit */ | |
3714 | struct task_struct *task; /* main task structure */ | |
3715 | __u32 cpu; /* cpu */ | |
5dd41b01 | 3716 | @@ -142,7 +143,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, |
b3bbd485 JK |
3717 | #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ |
3718 | #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ | |
3719 | #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ | |
3720 | -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ | |
3721 | +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ | |
3722 | +#define TIF_NEED_RESCHED_LAZY 7 | |
3723 | ||
3724 | #define TIF_NOHZ 12 /* in adaptive nohz mode */ | |
3725 | #define TIF_USING_IWMMXT 17 | |
5dd41b01 | 3726 | @@ -152,6 +154,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, |
b3bbd485 JK |
3727 | #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) |
3728 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | |
3729 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | |
3730 | +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) | |
3731 | #define _TIF_UPROBE (1 << TIF_UPROBE) | |
3732 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | |
3733 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | |
5dd41b01 | 3734 | @@ -167,7 +170,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, |
b3bbd485 JK |
3735 | * Change these and you break ASM code in entry-common.S |
3736 | */ | |
3737 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | |
3738 | - _TIF_NOTIFY_RESUME | _TIF_UPROBE) | |
3739 | + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ | |
3740 | + _TIF_NEED_RESCHED_LAZY) | |
3741 | ||
3742 | #endif /* __KERNEL__ */ | |
3743 | #endif /* __ASM_ARM_THREAD_INFO_H */ | |
3744 | diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c | |
3745 | index 608008229c7d..3866da3f7bb7 100644 | |
3746 | --- a/arch/arm/kernel/asm-offsets.c | |
3747 | +++ b/arch/arm/kernel/asm-offsets.c | |
3748 | @@ -65,6 +65,7 @@ int main(void) | |
3749 | BLANK(); | |
3750 | DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); | |
3751 | DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); | |
3752 | + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); | |
3753 | DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); | |
3754 | DEFINE(TI_TASK, offsetof(struct thread_info, task)); | |
3755 | DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); | |
3756 | diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S | |
3757 | index fbc707626b3e..b434c59d2b64 100644 | |
3758 | --- a/arch/arm/kernel/entry-armv.S | |
3759 | +++ b/arch/arm/kernel/entry-armv.S | |
3760 | @@ -220,11 +220,18 @@ __irq_svc: | |
3761 | ||
3762 | #ifdef CONFIG_PREEMPT | |
3763 | ldr r8, [tsk, #TI_PREEMPT] @ get preempt count | |
3764 | - ldr r0, [tsk, #TI_FLAGS] @ get flags | |
3765 | teq r8, #0 @ if preempt count != 0 | |
3766 | + bne 1f @ return from exception | |
3767 | + ldr r0, [tsk, #TI_FLAGS] @ get flags | |
3768 | + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set | |
3769 | + blne svc_preempt @ preempt! | |
e4b2b4a8 | 3770 | + |
b3bbd485 JK |
3771 | + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count |
3772 | + teq r8, #0 @ if preempt lazy count != 0 | |
3773 | movne r0, #0 @ force flags to 0 | |
3774 | - tst r0, #_TIF_NEED_RESCHED | |
3775 | + tst r0, #_TIF_NEED_RESCHED_LAZY | |
3776 | blne svc_preempt | |
3777 | +1: | |
3778 | #endif | |
3779 | ||
3780 | svc_exit r5, irq = 1 @ return from exception | |
3781 | @@ -239,8 +246,14 @@ svc_preempt: | |
3782 | 1: bl preempt_schedule_irq @ irq en/disable is done inside | |
3783 | ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS | |
3784 | tst r0, #_TIF_NEED_RESCHED | |
3785 | + bne 1b | |
3786 | + tst r0, #_TIF_NEED_RESCHED_LAZY | |
3787 | reteq r8 @ go again | |
3788 | - b 1b | |
3789 | + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count | |
3790 | + teq r0, #0 @ if preempt lazy count != 0 | |
3791 | + beq 1b | |
3792 | + ret r8 @ go again | |
e4b2b4a8 | 3793 | + |
b3bbd485 JK |
3794 | #endif |
3795 | ||
3796 | __und_fault: | |
3797 | diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S | |
5dd41b01 | 3798 | index 54c10503d71f..3fdeade24e3f 100644 |
b3bbd485 JK |
3799 | --- a/arch/arm/kernel/entry-common.S |
3800 | +++ b/arch/arm/kernel/entry-common.S | |
3801 | @@ -53,7 +53,9 @@ ret_fast_syscall: | |
3802 | cmp r2, #TASK_SIZE | |
3803 | blne addr_limit_check_failed | |
3804 | ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing | |
3805 | - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK | |
3806 | + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) | |
3807 | + bne fast_work_pending | |
3808 | + tst r1, #_TIF_SECCOMP | |
3809 | bne fast_work_pending | |
3810 | ||
3811 | ||
3812 | @@ -83,8 +85,11 @@ ret_fast_syscall: | |
3813 | cmp r2, #TASK_SIZE | |
3814 | blne addr_limit_check_failed | |
3815 | ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing | |
3816 | - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK | |
3817 | + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) | |
3818 | + bne do_slower_path | |
3819 | + tst r1, #_TIF_SECCOMP | |
3820 | beq no_work_pending | |
3821 | +do_slower_path: | |
3822 | UNWIND(.fnend ) | |
3823 | ENDPROC(ret_fast_syscall) | |
3824 | ||
3825 | diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c | |
3826 | index a50dc00d79a2..d0a05a3bdb96 100644 | |
3827 | --- a/arch/arm/kernel/patch.c | |
3828 | +++ b/arch/arm/kernel/patch.c | |
3829 | @@ -16,7 +16,7 @@ struct patch { | |
3830 | unsigned int insn; | |
3831 | }; | |
3832 | ||
3833 | -static DEFINE_SPINLOCK(patch_lock); | |
3834 | +static DEFINE_RAW_SPINLOCK(patch_lock); | |
3835 | ||
3836 | static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) | |
3837 | __acquires(&patch_lock) | |
3838 | @@ -33,7 +33,7 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags) | |
3839 | return addr; | |
3840 | ||
3841 | if (flags) | |
3842 | - spin_lock_irqsave(&patch_lock, *flags); | |
3843 | + raw_spin_lock_irqsave(&patch_lock, *flags); | |
3844 | else | |
3845 | __acquire(&patch_lock); | |
3846 | ||
3847 | @@ -48,7 +48,7 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) | |
3848 | clear_fixmap(fixmap); | |
3849 | ||
3850 | if (flags) | |
3851 | - spin_unlock_irqrestore(&patch_lock, *flags); | |
3852 | + raw_spin_unlock_irqrestore(&patch_lock, *flags); | |
3853 | else | |
3854 | __release(&patch_lock); | |
3855 | } | |
3856 | diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c | |
3857 | index d96714e1858c..cf4e1452d4b4 100644 | |
3858 | --- a/arch/arm/kernel/process.c | |
3859 | +++ b/arch/arm/kernel/process.c | |
3860 | @@ -325,6 +325,30 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) | |
3861 | } | |
3862 | ||
3863 | #ifdef CONFIG_MMU | |
3864 | +/* | |
3865 | + * CONFIG_SPLIT_PTLOCK_CPUS results in a page->ptl lock. If the lock is not | |
3866 | + * initialized by pgtable_page_ctor() then a coredump of the vector page will | |
3867 | + * fail. | |
3868 | + */ | |
3869 | +static int __init vectors_user_mapping_init_page(void) | |
3870 | +{ | |
3871 | + struct page *page; | |
3872 | + unsigned long addr = 0xffff0000; | |
3873 | + pgd_t *pgd; | |
3874 | + pud_t *pud; | |
3875 | + pmd_t *pmd; | |
e4b2b4a8 | 3876 | + |
b3bbd485 JK |
3877 | + pgd = pgd_offset_k(addr); |
3878 | + pud = pud_offset(pgd, addr); | |
3879 | + pmd = pmd_offset(pud, addr); | |
3880 | + page = pmd_page(*(pmd)); | |
e4b2b4a8 | 3881 | + |
b3bbd485 JK |
3882 | + pgtable_page_ctor(page); |
3883 | + | |
3884 | + return 0; | |
3885 | +} | |
3886 | +late_initcall(vectors_user_mapping_init_page); | |
3887 | + | |
3888 | #ifdef CONFIG_KUSER_HELPERS | |
3889 | /* | |
3890 | * The vectors page is always readable from user space for the | |
3891 | diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c | |
5dd41b01 | 3892 | index cdfe52b15a0a..198cf8bf0b37 100644 |
b3bbd485 JK |
3893 | --- a/arch/arm/kernel/signal.c |
3894 | +++ b/arch/arm/kernel/signal.c | |
3895 | @@ -615,7 +615,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) | |
3896 | */ | |
3897 | trace_hardirqs_off(); | |
3898 | do { | |
3899 | - if (likely(thread_flags & _TIF_NEED_RESCHED)) { | |
3900 | + if (likely(thread_flags & (_TIF_NEED_RESCHED | | |
3901 | + _TIF_NEED_RESCHED_LAZY))) { | |
3902 | schedule(); | |
3903 | } else { | |
3904 | if (unlikely(!user_mode(regs))) | |
3905 | diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c | |
5dd41b01 | 3906 | index e61af0600133..d8f2e77d5651 100644 |
b3bbd485 JK |
3907 | --- a/arch/arm/kernel/smp.c |
3908 | +++ b/arch/arm/kernel/smp.c | |
5dd41b01 | 3909 | @@ -237,8 +237,6 @@ int __cpu_disable(void) |
b3bbd485 JK |
3910 | flush_cache_louis(); |
3911 | local_flush_tlb_all(); | |
3912 | ||
3913 | - clear_tasks_mm_cpumask(cpu); | |
3914 | - | |
3915 | return 0; | |
3916 | } | |
3917 | ||
5dd41b01 | 3918 | @@ -256,6 +254,7 @@ void __cpu_die(unsigned int cpu) |
b3bbd485 JK |
3919 | } |
3920 | pr_debug("CPU%u: shutdown\n", cpu); | |
3921 | ||
3922 | + clear_tasks_mm_cpumask(cpu); | |
3923 | /* | |
3924 | * platform_cpu_kill() is generally expected to do the powering off | |
3925 | * and/or cutting of clocks to the dying CPU. Optionally, this may | |
3926 | diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c | |
3927 | index 0bee233fef9a..314cfb232a63 100644 | |
3928 | --- a/arch/arm/kernel/unwind.c | |
3929 | +++ b/arch/arm/kernel/unwind.c | |
3930 | @@ -93,7 +93,7 @@ extern const struct unwind_idx __start_unwind_idx[]; | |
3931 | static const struct unwind_idx *__origin_unwind_idx; | |
3932 | extern const struct unwind_idx __stop_unwind_idx[]; | |
3933 | ||
3934 | -static DEFINE_SPINLOCK(unwind_lock); | |
3935 | +static DEFINE_RAW_SPINLOCK(unwind_lock); | |
3936 | static LIST_HEAD(unwind_tables); | |
3937 | ||
3938 | /* Convert a prel31 symbol to an absolute address */ | |
3939 | @@ -201,7 +201,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) | |
3940 | /* module unwind tables */ | |
3941 | struct unwind_table *table; | |
3942 | ||
3943 | - spin_lock_irqsave(&unwind_lock, flags); | |
3944 | + raw_spin_lock_irqsave(&unwind_lock, flags); | |
3945 | list_for_each_entry(table, &unwind_tables, list) { | |
3946 | if (addr >= table->begin_addr && | |
3947 | addr < table->end_addr) { | |
3948 | @@ -213,7 +213,7 @@ static const struct unwind_idx *unwind_find_idx(unsigned long addr) | |
3949 | break; | |
3950 | } | |
3951 | } | |
3952 | - spin_unlock_irqrestore(&unwind_lock, flags); | |
3953 | + raw_spin_unlock_irqrestore(&unwind_lock, flags); | |
3954 | } | |
3955 | ||
3956 | pr_debug("%s: idx = %p\n", __func__, idx); | |
3957 | @@ -529,9 +529,9 @@ struct unwind_table *unwind_table_add(unsigned long start, unsigned long size, | |
3958 | tab->begin_addr = text_addr; | |
3959 | tab->end_addr = text_addr + text_size; | |
3960 | ||
3961 | - spin_lock_irqsave(&unwind_lock, flags); | |
3962 | + raw_spin_lock_irqsave(&unwind_lock, flags); | |
3963 | list_add_tail(&tab->list, &unwind_tables); | |
3964 | - spin_unlock_irqrestore(&unwind_lock, flags); | |
3965 | + raw_spin_unlock_irqrestore(&unwind_lock, flags); | |
3966 | ||
3967 | return tab; | |
3968 | } | |
3969 | @@ -543,9 +543,9 @@ void unwind_table_del(struct unwind_table *tab) | |
3970 | if (!tab) | |
3971 | return; | |
3972 | ||
3973 | - spin_lock_irqsave(&unwind_lock, flags); | |
3974 | + raw_spin_lock_irqsave(&unwind_lock, flags); | |
3975 | list_del(&tab->list); | |
3976 | - spin_unlock_irqrestore(&unwind_lock, flags); | |
3977 | + raw_spin_unlock_irqrestore(&unwind_lock, flags); | |
3978 | ||
3979 | kfree(tab); | |
3980 | } | |
3981 | diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c | |
3982 | index 5a03bffe7226..3080ea833d19 100644 | |
3983 | --- a/arch/arm/mach-exynos/platsmp.c | |
3984 | +++ b/arch/arm/mach-exynos/platsmp.c | |
3985 | @@ -229,7 +229,7 @@ static void __iomem *scu_base_addr(void) | |
3986 | return (void __iomem *)(S5P_VA_SCU); | |
3987 | } | |
3988 | ||
3989 | -static DEFINE_SPINLOCK(boot_lock); | |
3990 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
3991 | ||
3992 | static void exynos_secondary_init(unsigned int cpu) | |
3993 | { | |
3994 | @@ -242,8 +242,8 @@ static void exynos_secondary_init(unsigned int cpu) | |
3995 | /* | |
3996 | * Synchronise with the boot thread. | |
3997 | */ | |
3998 | - spin_lock(&boot_lock); | |
3999 | - spin_unlock(&boot_lock); | |
4000 | + raw_spin_lock(&boot_lock); | |
4001 | + raw_spin_unlock(&boot_lock); | |
4002 | } | |
4003 | ||
4004 | int exynos_set_boot_addr(u32 core_id, unsigned long boot_addr) | |
4005 | @@ -307,7 +307,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4006 | * Set synchronisation state between this boot processor | |
4007 | * and the secondary one | |
4008 | */ | |
4009 | - spin_lock(&boot_lock); | |
4010 | + raw_spin_lock(&boot_lock); | |
4011 | ||
4012 | /* | |
4013 | * The secondary processor is waiting to be released from | |
4014 | @@ -334,7 +334,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4015 | ||
4016 | if (timeout == 0) { | |
4017 | printk(KERN_ERR "cpu1 power enable failed"); | |
4018 | - spin_unlock(&boot_lock); | |
4019 | + raw_spin_unlock(&boot_lock); | |
4020 | return -ETIMEDOUT; | |
4021 | } | |
4022 | } | |
4023 | @@ -380,7 +380,7 @@ static int exynos_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4024 | * calibrations, then wait for it to finish | |
4025 | */ | |
4026 | fail: | |
4027 | - spin_unlock(&boot_lock); | |
4028 | + raw_spin_unlock(&boot_lock); | |
4029 | ||
4030 | return pen_release != -1 ? ret : 0; | |
4031 | } | |
4032 | diff --git a/arch/arm/mach-hisi/platmcpm.c b/arch/arm/mach-hisi/platmcpm.c | |
4033 | index f66815c3dd07..00524abd963f 100644 | |
4034 | --- a/arch/arm/mach-hisi/platmcpm.c | |
4035 | +++ b/arch/arm/mach-hisi/platmcpm.c | |
4036 | @@ -61,7 +61,7 @@ | |
4037 | ||
4038 | static void __iomem *sysctrl, *fabric; | |
4039 | static int hip04_cpu_table[HIP04_MAX_CLUSTERS][HIP04_MAX_CPUS_PER_CLUSTER]; | |
4040 | -static DEFINE_SPINLOCK(boot_lock); | |
4041 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4042 | static u32 fabric_phys_addr; | |
4043 | /* | |
4044 | * [0]: bootwrapper physical address | |
4045 | @@ -113,7 +113,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle) | |
4046 | if (cluster >= HIP04_MAX_CLUSTERS || cpu >= HIP04_MAX_CPUS_PER_CLUSTER) | |
4047 | return -EINVAL; | |
4048 | ||
4049 | - spin_lock_irq(&boot_lock); | |
4050 | + raw_spin_lock_irq(&boot_lock); | |
4051 | ||
4052 | if (hip04_cpu_table[cluster][cpu]) | |
4053 | goto out; | |
4054 | @@ -147,7 +147,7 @@ static int hip04_boot_secondary(unsigned int l_cpu, struct task_struct *idle) | |
4055 | ||
4056 | out: | |
4057 | hip04_cpu_table[cluster][cpu]++; | |
4058 | - spin_unlock_irq(&boot_lock); | |
4059 | + raw_spin_unlock_irq(&boot_lock); | |
4060 | ||
4061 | return 0; | |
4062 | } | |
4063 | @@ -162,11 +162,11 @@ static void hip04_cpu_die(unsigned int l_cpu) | |
4064 | cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0); | |
4065 | cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); | |
4066 | ||
4067 | - spin_lock(&boot_lock); | |
4068 | + raw_spin_lock(&boot_lock); | |
4069 | hip04_cpu_table[cluster][cpu]--; | |
4070 | if (hip04_cpu_table[cluster][cpu] == 1) { | |
4071 | /* A power_up request went ahead of us. */ | |
4072 | - spin_unlock(&boot_lock); | |
4073 | + raw_spin_unlock(&boot_lock); | |
4074 | return; | |
4075 | } else if (hip04_cpu_table[cluster][cpu] > 1) { | |
4076 | pr_err("Cluster %d CPU%d boots multiple times\n", cluster, cpu); | |
4077 | @@ -174,7 +174,7 @@ static void hip04_cpu_die(unsigned int l_cpu) | |
4078 | } | |
4079 | ||
4080 | last_man = hip04_cluster_is_down(cluster); | |
4081 | - spin_unlock(&boot_lock); | |
4082 | + raw_spin_unlock(&boot_lock); | |
4083 | if (last_man) { | |
4084 | /* Since it's Cortex A15, disable L2 prefetching. */ | |
4085 | asm volatile( | |
4086 | @@ -203,7 +203,7 @@ static int hip04_cpu_kill(unsigned int l_cpu) | |
4087 | cpu >= HIP04_MAX_CPUS_PER_CLUSTER); | |
4088 | ||
4089 | count = TIMEOUT_MSEC / POLL_MSEC; | |
4090 | - spin_lock_irq(&boot_lock); | |
4091 | + raw_spin_lock_irq(&boot_lock); | |
4092 | for (tries = 0; tries < count; tries++) { | |
4093 | if (hip04_cpu_table[cluster][cpu]) | |
4094 | goto err; | |
4095 | @@ -211,10 +211,10 @@ static int hip04_cpu_kill(unsigned int l_cpu) | |
4096 | data = readl_relaxed(sysctrl + SC_CPU_RESET_STATUS(cluster)); | |
4097 | if (data & CORE_WFI_STATUS(cpu)) | |
4098 | break; | |
4099 | - spin_unlock_irq(&boot_lock); | |
4100 | + raw_spin_unlock_irq(&boot_lock); | |
4101 | /* Wait for clean L2 when the whole cluster is down. */ | |
4102 | msleep(POLL_MSEC); | |
4103 | - spin_lock_irq(&boot_lock); | |
4104 | + raw_spin_lock_irq(&boot_lock); | |
4105 | } | |
4106 | if (tries >= count) | |
4107 | goto err; | |
4108 | @@ -231,10 +231,10 @@ static int hip04_cpu_kill(unsigned int l_cpu) | |
4109 | goto err; | |
4110 | if (hip04_cluster_is_down(cluster)) | |
4111 | hip04_set_snoop_filter(cluster, 0); | |
4112 | - spin_unlock_irq(&boot_lock); | |
4113 | + raw_spin_unlock_irq(&boot_lock); | |
4114 | return 1; | |
4115 | err: | |
4116 | - spin_unlock_irq(&boot_lock); | |
4117 | + raw_spin_unlock_irq(&boot_lock); | |
4118 | return 0; | |
4119 | } | |
4120 | #endif | |
4121 | diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c | |
4122 | index 1c73694c871a..ac4d2f030b87 100644 | |
4123 | --- a/arch/arm/mach-omap2/omap-smp.c | |
4124 | +++ b/arch/arm/mach-omap2/omap-smp.c | |
4125 | @@ -69,7 +69,7 @@ static const struct omap_smp_config omap5_cfg __initconst = { | |
4126 | .startup_addr = omap5_secondary_startup, | |
4127 | }; | |
4128 | ||
4129 | -static DEFINE_SPINLOCK(boot_lock); | |
4130 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4131 | ||
4132 | void __iomem *omap4_get_scu_base(void) | |
4133 | { | |
4134 | @@ -177,8 +177,8 @@ static void omap4_secondary_init(unsigned int cpu) | |
4135 | /* | |
4136 | * Synchronise with the boot thread. | |
4137 | */ | |
4138 | - spin_lock(&boot_lock); | |
4139 | - spin_unlock(&boot_lock); | |
4140 | + raw_spin_lock(&boot_lock); | |
4141 | + raw_spin_unlock(&boot_lock); | |
4142 | } | |
4143 | ||
4144 | static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4145 | @@ -191,7 +191,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4146 | * Set synchronisation state between this boot processor | |
4147 | * and the secondary one | |
4148 | */ | |
4149 | - spin_lock(&boot_lock); | |
4150 | + raw_spin_lock(&boot_lock); | |
4151 | ||
4152 | /* | |
4153 | * Update the AuxCoreBoot0 with boot state for secondary core. | |
4154 | @@ -270,7 +270,7 @@ static int omap4_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4155 | * Now the secondary core is starting up let it run its | |
4156 | * calibrations, then wait for it to finish | |
4157 | */ | |
4158 | - spin_unlock(&boot_lock); | |
4159 | + raw_spin_unlock(&boot_lock); | |
4160 | ||
4161 | return 0; | |
4162 | } | |
4163 | diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c | |
4164 | index 75ef5d4be554..c17c86e5d860 100644 | |
4165 | --- a/arch/arm/mach-prima2/platsmp.c | |
4166 | +++ b/arch/arm/mach-prima2/platsmp.c | |
4167 | @@ -22,7 +22,7 @@ | |
4168 | ||
4169 | static void __iomem *clk_base; | |
4170 | ||
4171 | -static DEFINE_SPINLOCK(boot_lock); | |
4172 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4173 | ||
4174 | static void sirfsoc_secondary_init(unsigned int cpu) | |
4175 | { | |
4176 | @@ -36,8 +36,8 @@ static void sirfsoc_secondary_init(unsigned int cpu) | |
4177 | /* | |
4178 | * Synchronise with the boot thread. | |
4179 | */ | |
4180 | - spin_lock(&boot_lock); | |
4181 | - spin_unlock(&boot_lock); | |
4182 | + raw_spin_lock(&boot_lock); | |
4183 | + raw_spin_unlock(&boot_lock); | |
4184 | } | |
4185 | ||
4186 | static const struct of_device_id clk_ids[] = { | |
4187 | @@ -75,7 +75,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4188 | /* make sure write buffer is drained */ | |
4189 | mb(); | |
4190 | ||
4191 | - spin_lock(&boot_lock); | |
4192 | + raw_spin_lock(&boot_lock); | |
4193 | ||
4194 | /* | |
4195 | * The secondary processor is waiting to be released from | |
4196 | @@ -107,7 +107,7 @@ static int sirfsoc_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4197 | * now the secondary core is starting up let it run its | |
4198 | * calibrations, then wait for it to finish | |
4199 | */ | |
4200 | - spin_unlock(&boot_lock); | |
4201 | + raw_spin_unlock(&boot_lock); | |
4202 | ||
4203 | return pen_release != -1 ? -ENOSYS : 0; | |
4204 | } | |
4205 | diff --git a/arch/arm/mach-qcom/platsmp.c b/arch/arm/mach-qcom/platsmp.c | |
4206 | index 5494c9e0c909..e8ce157d3548 100644 | |
4207 | --- a/arch/arm/mach-qcom/platsmp.c | |
4208 | +++ b/arch/arm/mach-qcom/platsmp.c | |
4209 | @@ -46,7 +46,7 @@ | |
4210 | ||
4211 | extern void secondary_startup_arm(void); | |
4212 | ||
4213 | -static DEFINE_SPINLOCK(boot_lock); | |
4214 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4215 | ||
4216 | #ifdef CONFIG_HOTPLUG_CPU | |
4217 | static void qcom_cpu_die(unsigned int cpu) | |
4218 | @@ -60,8 +60,8 @@ static void qcom_secondary_init(unsigned int cpu) | |
4219 | /* | |
4220 | * Synchronise with the boot thread. | |
4221 | */ | |
4222 | - spin_lock(&boot_lock); | |
4223 | - spin_unlock(&boot_lock); | |
4224 | + raw_spin_lock(&boot_lock); | |
4225 | + raw_spin_unlock(&boot_lock); | |
4226 | } | |
4227 | ||
4228 | static int scss_release_secondary(unsigned int cpu) | |
4229 | @@ -284,7 +284,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) | |
4230 | * set synchronisation state between this boot processor | |
4231 | * and the secondary one | |
4232 | */ | |
4233 | - spin_lock(&boot_lock); | |
4234 | + raw_spin_lock(&boot_lock); | |
4235 | ||
4236 | /* | |
4237 | * Send the secondary CPU a soft interrupt, thereby causing | |
4238 | @@ -297,7 +297,7 @@ static int qcom_boot_secondary(unsigned int cpu, int (*func)(unsigned int)) | |
4239 | * now the secondary core is starting up let it run its | |
4240 | * calibrations, then wait for it to finish | |
4241 | */ | |
4242 | - spin_unlock(&boot_lock); | |
4243 | + raw_spin_unlock(&boot_lock); | |
4244 | ||
4245 | return ret; | |
4246 | } | |
4247 | diff --git a/arch/arm/mach-spear/platsmp.c b/arch/arm/mach-spear/platsmp.c | |
4248 | index 39038a03836a..6da5c93872bf 100644 | |
4249 | --- a/arch/arm/mach-spear/platsmp.c | |
4250 | +++ b/arch/arm/mach-spear/platsmp.c | |
4251 | @@ -32,7 +32,7 @@ static void write_pen_release(int val) | |
4252 | sync_cache_w(&pen_release); | |
4253 | } | |
4254 | ||
4255 | -static DEFINE_SPINLOCK(boot_lock); | |
4256 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4257 | ||
4258 | static void __iomem *scu_base = IOMEM(VA_SCU_BASE); | |
4259 | ||
4260 | @@ -47,8 +47,8 @@ static void spear13xx_secondary_init(unsigned int cpu) | |
4261 | /* | |
4262 | * Synchronise with the boot thread. | |
4263 | */ | |
4264 | - spin_lock(&boot_lock); | |
4265 | - spin_unlock(&boot_lock); | |
4266 | + raw_spin_lock(&boot_lock); | |
4267 | + raw_spin_unlock(&boot_lock); | |
4268 | } | |
4269 | ||
4270 | static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4271 | @@ -59,7 +59,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4272 | * set synchronisation state between this boot processor | |
4273 | * and the secondary one | |
4274 | */ | |
4275 | - spin_lock(&boot_lock); | |
4276 | + raw_spin_lock(&boot_lock); | |
4277 | ||
4278 | /* | |
4279 | * The secondary processor is waiting to be released from | |
4280 | @@ -84,7 +84,7 @@ static int spear13xx_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4281 | * now the secondary core is starting up let it run its | |
4282 | * calibrations, then wait for it to finish | |
4283 | */ | |
4284 | - spin_unlock(&boot_lock); | |
4285 | + raw_spin_unlock(&boot_lock); | |
4286 | ||
4287 | return pen_release != -1 ? -ENOSYS : 0; | |
4288 | } | |
4289 | diff --git a/arch/arm/mach-sti/platsmp.c b/arch/arm/mach-sti/platsmp.c | |
4290 | index 231f19e17436..a3419b7003e6 100644 | |
4291 | --- a/arch/arm/mach-sti/platsmp.c | |
4292 | +++ b/arch/arm/mach-sti/platsmp.c | |
4293 | @@ -35,7 +35,7 @@ static void write_pen_release(int val) | |
4294 | sync_cache_w(&pen_release); | |
4295 | } | |
4296 | ||
4297 | -static DEFINE_SPINLOCK(boot_lock); | |
4298 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4299 | ||
4300 | static void sti_secondary_init(unsigned int cpu) | |
4301 | { | |
4302 | @@ -48,8 +48,8 @@ static void sti_secondary_init(unsigned int cpu) | |
4303 | /* | |
4304 | * Synchronise with the boot thread. | |
4305 | */ | |
4306 | - spin_lock(&boot_lock); | |
4307 | - spin_unlock(&boot_lock); | |
4308 | + raw_spin_lock(&boot_lock); | |
4309 | + raw_spin_unlock(&boot_lock); | |
4310 | } | |
4311 | ||
4312 | static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4313 | @@ -60,7 +60,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4314 | * set synchronisation state between this boot processor | |
4315 | * and the secondary one | |
4316 | */ | |
4317 | - spin_lock(&boot_lock); | |
4318 | + raw_spin_lock(&boot_lock); | |
4319 | ||
4320 | /* | |
4321 | * The secondary processor is waiting to be released from | |
4322 | @@ -91,7 +91,7 @@ static int sti_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4323 | * now the secondary core is starting up let it run its | |
4324 | * calibrations, then wait for it to finish | |
4325 | */ | |
4326 | - spin_unlock(&boot_lock); | |
4327 | + raw_spin_unlock(&boot_lock); | |
4328 | ||
4329 | return pen_release != -1 ? -ENOSYS : 0; | |
4330 | } | |
4331 | diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c | |
5dd41b01 | 4332 | index 49b1b8048635..b261967ea028 100644 |
4333 | --- a/arch/arm/mm/fault.c |
4334 | +++ b/arch/arm/mm/fault.c | |
5dd41b01 | 4335 | @@ -437,6 +437,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, |
b3bbd485 JK |
4336 | if (addr < TASK_SIZE) |
4337 | return do_page_fault(addr, fsr, regs); | |
4338 | ||
4339 | + if (interrupts_enabled(regs)) | |
4340 | + local_irq_enable(); | |
e4b2b4a8 | 4341 | + |
4342 | if (user_mode(regs)) |
4343 | goto bad_area; | |
4344 | ||
5dd41b01 | 4345 | @@ -504,6 +507,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, |
4346 | static int |
4347 | do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) | |
4348 | { | |
4349 | + if (interrupts_enabled(regs)) | |
4350 | + local_irq_enable(); | |
e4b2b4a8 | 4351 | + |
4352 | do_bad_area(addr, fsr, regs); |
4353 | return 0; | |
4354 | } | |
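The fault handlers may take sleeping locks on RT, so interrupts are re-enabled first, but only when the faulting context itself ran with interrupts on. For reference, the arm helper used above is essentially the following (from <asm/ptrace.h>; quoted here as an assumption, not part of this hunk):

    /* IRQs were on in the interrupted context iff the saved CPSR I bit is clear */
    #define interrupts_enabled(regs)  (!((regs)->ARM_cpsr & PSR_I_BIT))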
4355 | diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c | |
4356 | index d02f8187b1cc..542692dbd40a 100644 | |
4357 | --- a/arch/arm/mm/highmem.c | |
4358 | +++ b/arch/arm/mm/highmem.c | |
4359 | @@ -34,6 +34,11 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) | |
4360 | return *ptep; | |
4361 | } | |
4362 | ||
4363 | +static unsigned int fixmap_idx(int type) | |
4364 | +{ | |
4365 | + return FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); | |
4366 | +} | |
e4b2b4a8 | 4367 | + |
4368 | void *kmap(struct page *page) |
4369 | { | |
4370 | might_sleep(); | |
4371 | @@ -54,12 +59,13 @@ EXPORT_SYMBOL(kunmap); | |
4372 | ||
4373 | void *kmap_atomic(struct page *page) | |
4374 | { | |
4375 | + pte_t pte = mk_pte(page, kmap_prot); | |
4376 | unsigned int idx; | |
4377 | unsigned long vaddr; | |
4378 | void *kmap; | |
4379 | int type; | |
4380 | ||
4381 | - preempt_disable(); | |
4382 | + preempt_disable_nort(); | |
4383 | pagefault_disable(); | |
4384 | if (!PageHighMem(page)) | |
4385 | return page_address(page); | |
4386 | @@ -79,7 +85,7 @@ void *kmap_atomic(struct page *page) | |
4387 | ||
4388 | type = kmap_atomic_idx_push(); | |
4389 | ||
4390 | - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); | |
4391 | + idx = fixmap_idx(type); | |
4392 | vaddr = __fix_to_virt(idx); | |
4393 | #ifdef CONFIG_DEBUG_HIGHMEM | |
4394 | /* | |
4395 | @@ -93,7 +99,10 @@ void *kmap_atomic(struct page *page) | |
4396 | * in place, so the contained TLB flush ensures the TLB is updated | |
4397 | * with the new mapping. | |
4398 | */ | |
4399 | - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); | |
4400 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
4401 | + current->kmap_pte[type] = pte; | |
4402 | +#endif | |
4403 | + set_fixmap_pte(idx, pte); | |
4404 | ||
4405 | return (void *)vaddr; | |
4406 | } | |
4407 | @@ -106,44 +115,75 @@ void __kunmap_atomic(void *kvaddr) | |
4408 | ||
4409 | if (kvaddr >= (void *)FIXADDR_START) { | |
4410 | type = kmap_atomic_idx(); | |
4411 | - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); | |
4412 | + idx = fixmap_idx(type); | |
4413 | ||
4414 | if (cache_is_vivt()) | |
4415 | __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); | |
4416 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
4417 | + current->kmap_pte[type] = __pte(0); | |
4418 | +#endif | |
4419 | #ifdef CONFIG_DEBUG_HIGHMEM | |
4420 | BUG_ON(vaddr != __fix_to_virt(idx)); | |
4421 | - set_fixmap_pte(idx, __pte(0)); | |
4422 | #else | |
4423 | (void) idx; /* to kill a warning */ | |
4424 | #endif | |
4425 | + set_fixmap_pte(idx, __pte(0)); | |
4426 | kmap_atomic_idx_pop(); | |
4427 | } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { | |
4428 | /* this address was obtained through kmap_high_get() */ | |
4429 | kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); | |
4430 | } | |
4431 | pagefault_enable(); | |
4432 | - preempt_enable(); | |
4433 | + preempt_enable_nort(); | |
4434 | } | |
4435 | EXPORT_SYMBOL(__kunmap_atomic); | |
4436 | ||
4437 | void *kmap_atomic_pfn(unsigned long pfn) | |
4438 | { | |
4439 | + pte_t pte = pfn_pte(pfn, kmap_prot); | |
4440 | unsigned long vaddr; | |
4441 | int idx, type; | |
4442 | struct page *page = pfn_to_page(pfn); | |
4443 | ||
4444 | - preempt_disable(); | |
4445 | + preempt_disable_nort(); | |
4446 | pagefault_disable(); | |
4447 | if (!PageHighMem(page)) | |
4448 | return page_address(page); | |
4449 | ||
4450 | type = kmap_atomic_idx_push(); | |
4451 | - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); | |
4452 | + idx = fixmap_idx(type); | |
4453 | vaddr = __fix_to_virt(idx); | |
4454 | #ifdef CONFIG_DEBUG_HIGHMEM | |
4455 | BUG_ON(!pte_none(get_fixmap_pte(vaddr))); | |
4456 | #endif | |
4457 | - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); | |
4458 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
4459 | + current->kmap_pte[type] = pte; | |
4460 | +#endif | |
4461 | + set_fixmap_pte(idx, pte); | |
4462 | ||
4463 | return (void *)vaddr; | |
4464 | } | |
4465 | +#if defined CONFIG_PREEMPT_RT_FULL | |
4466 | +void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) | |
4467 | +{ | |
4468 | + int i; | |
e4b2b4a8 | 4469 | + |
4470 | + /* |
4471 | + * Clear @prev's kmap_atomic mappings | |
4472 | + */ | |
4473 | + for (i = 0; i < prev_p->kmap_idx; i++) { | |
4474 | + int idx = fixmap_idx(i); | |
e4b2b4a8 | 4475 | + |
4476 | + set_fixmap_pte(idx, __pte(0)); |
4477 | + } | |
4478 | + /* | |
4479 | + * Restore @next_p's kmap_atomic mappings | |
4480 | + */ | |
4481 | + for (i = 0; i < next_p->kmap_idx; i++) { | |
4482 | + int idx = fixmap_idx(i); | |
e4b2b4a8 | 4483 | + |
4484 | + if (!pte_none(next_p->kmap_pte[i])) |
4485 | + set_fixmap_pte(idx, next_p->kmap_pte[i]); | |
4486 | + } | |
4487 | +} | |
4488 | +#endif | |
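With PREEMPT_RT_FULL, kmap_atomic() no longer disables preemption (preempt_disable_nort() compiles away there), so a task can be switched out while it still owns atomic kmaps. Every mapping is therefore mirrored in current->kmap_pte[], and switch_kmaps() replays them into the per-CPU fixmap slots on a context switch. A sketch of the assumed hook in the arm switch_to() path; the placement is an assumption, the call itself lives outside this hunk:

    /* invoked with preemption disabled, from the context-switch path (sketch) */
    #if defined(CONFIG_PREEMPT_RT_FULL) && defined(CONFIG_HIGHMEM)
            if (unlikely(prev_p->kmap_idx || next_p->kmap_idx))
                    switch_kmaps(prev_p, next_p);
    #endif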
4489 | diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c | |
4490 | index c2366510187a..6b60f582b738 100644 | |
4491 | --- a/arch/arm/plat-versatile/platsmp.c | |
4492 | +++ b/arch/arm/plat-versatile/platsmp.c | |
4493 | @@ -32,7 +32,7 @@ static void write_pen_release(int val) | |
4494 | sync_cache_w(&pen_release); | |
4495 | } | |
4496 | ||
4497 | -static DEFINE_SPINLOCK(boot_lock); | |
4498 | +static DEFINE_RAW_SPINLOCK(boot_lock); | |
4499 | ||
4500 | void versatile_secondary_init(unsigned int cpu) | |
4501 | { | |
4502 | @@ -45,8 +45,8 @@ void versatile_secondary_init(unsigned int cpu) | |
4503 | /* | |
4504 | * Synchronise with the boot thread. | |
4505 | */ | |
4506 | - spin_lock(&boot_lock); | |
4507 | - spin_unlock(&boot_lock); | |
4508 | + raw_spin_lock(&boot_lock); | |
4509 | + raw_spin_unlock(&boot_lock); | |
4510 | } | |
4511 | ||
4512 | int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4513 | @@ -57,7 +57,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4514 | * Set synchronisation state between this boot processor | |
4515 | * and the secondary one | |
4516 | */ | |
4517 | - spin_lock(&boot_lock); | |
4518 | + raw_spin_lock(&boot_lock); | |
4519 | ||
4520 | /* | |
4521 | * This is really belt and braces; we hold unintended secondary | |
4522 | @@ -87,7 +87,7 @@ int versatile_boot_secondary(unsigned int cpu, struct task_struct *idle) | |
4523 | * now the secondary core is starting up let it run its | |
4524 | * calibrations, then wait for it to finish | |
4525 | */ | |
4526 | - spin_unlock(&boot_lock); | |
4527 | + raw_spin_unlock(&boot_lock); | |
4528 | ||
4529 | return pen_release != -1 ? -ENOSYS : 0; | |
4530 | } | |
4531 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig | |
4532 | index c30cd78b6918..458d2033ffde 100644 | |
4533 | --- a/arch/arm64/Kconfig | |
4534 | +++ b/arch/arm64/Kconfig | |
4535 | @@ -103,6 +103,7 @@ config ARM64 | |
4536 | select HAVE_PERF_EVENTS | |
4537 | select HAVE_PERF_REGS | |
4538 | select HAVE_PERF_USER_STACK_DUMP | |
4539 | + select HAVE_PREEMPT_LAZY | |
4540 | select HAVE_REGS_AND_STACK_ACCESS_API | |
4541 | select HAVE_RCU_TABLE_FREE | |
4542 | select HAVE_SYSCALL_TRACEPOINTS | |
4543 | diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig | |
4544 | index 70c517aa4501..2a5f05b5a19a 100644 | |
4545 | --- a/arch/arm64/crypto/Kconfig | |
4546 | +++ b/arch/arm64/crypto/Kconfig | |
4547 | @@ -19,19 +19,19 @@ config CRYPTO_SHA512_ARM64 | |
4548 | ||
4549 | config CRYPTO_SHA1_ARM64_CE | |
4550 | tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)" | |
4551 | - depends on KERNEL_MODE_NEON | |
4552 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4553 | select CRYPTO_HASH | |
4554 | select CRYPTO_SHA1 | |
4555 | ||
4556 | config CRYPTO_SHA2_ARM64_CE | |
4557 | tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)" | |
4558 | - depends on KERNEL_MODE_NEON | |
4559 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4560 | select CRYPTO_HASH | |
4561 | select CRYPTO_SHA256_ARM64 | |
4562 | ||
4563 | config CRYPTO_GHASH_ARM64_CE | |
4564 | tristate "GHASH/AES-GCM using ARMv8 Crypto Extensions" | |
4565 | - depends on KERNEL_MODE_NEON | |
4566 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4567 | select CRYPTO_HASH | |
4568 | select CRYPTO_GF128MUL | |
4569 | select CRYPTO_AES | |
4570 | @@ -39,7 +39,7 @@ config CRYPTO_GHASH_ARM64_CE | |
4571 | ||
4572 | config CRYPTO_CRCT10DIF_ARM64_CE | |
4573 | tristate "CRCT10DIF digest algorithm using PMULL instructions" | |
4574 | - depends on KERNEL_MODE_NEON && CRC_T10DIF | |
4575 | + depends on KERNEL_MODE_NEON && CRC_T10DIF && !PREEMPT_RT_BASE | |
4576 | select CRYPTO_HASH | |
4577 | ||
4578 | config CRYPTO_CRC32_ARM64_CE | |
4579 | @@ -53,13 +53,13 @@ config CRYPTO_AES_ARM64 | |
4580 | ||
4581 | config CRYPTO_AES_ARM64_CE | |
4582 | tristate "AES core cipher using ARMv8 Crypto Extensions" | |
4583 | - depends on ARM64 && KERNEL_MODE_NEON | |
4584 | + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4585 | select CRYPTO_ALGAPI | |
4586 | select CRYPTO_AES_ARM64 | |
4587 | ||
4588 | config CRYPTO_AES_ARM64_CE_CCM | |
4589 | tristate "AES in CCM mode using ARMv8 Crypto Extensions" | |
4590 | - depends on ARM64 && KERNEL_MODE_NEON | |
4591 | + depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4592 | select CRYPTO_ALGAPI | |
4593 | select CRYPTO_AES_ARM64_CE | |
4594 | select CRYPTO_AES_ARM64 | |
4595 | @@ -67,7 +67,7 @@ config CRYPTO_AES_ARM64_CE_CCM | |
4596 | ||
4597 | config CRYPTO_AES_ARM64_CE_BLK | |
4598 | tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions" | |
4599 | - depends on KERNEL_MODE_NEON | |
4600 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4601 | select CRYPTO_BLKCIPHER | |
4602 | select CRYPTO_AES_ARM64_CE | |
4603 | select CRYPTO_AES_ARM64 | |
4604 | @@ -75,7 +75,7 @@ config CRYPTO_AES_ARM64_CE_BLK | |
4605 | ||
4606 | config CRYPTO_AES_ARM64_NEON_BLK | |
4607 | tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions" | |
4608 | - depends on KERNEL_MODE_NEON | |
4609 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4610 | select CRYPTO_BLKCIPHER | |
4611 | select CRYPTO_AES_ARM64 | |
4612 | select CRYPTO_AES | |
4613 | @@ -83,13 +83,13 @@ config CRYPTO_AES_ARM64_NEON_BLK | |
4614 | ||
4615 | config CRYPTO_CHACHA20_NEON | |
4616 | tristate "NEON accelerated ChaCha20 symmetric cipher" | |
4617 | - depends on KERNEL_MODE_NEON | |
4618 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4619 | select CRYPTO_BLKCIPHER | |
4620 | select CRYPTO_CHACHA20 | |
4621 | ||
4622 | config CRYPTO_AES_ARM64_BS | |
4623 | tristate "AES in ECB/CBC/CTR/XTS modes using bit-sliced NEON algorithm" | |
4624 | - depends on KERNEL_MODE_NEON | |
4625 | + depends on KERNEL_MODE_NEON && !PREEMPT_RT_BASE | |
4626 | select CRYPTO_BLKCIPHER | |
4627 | select CRYPTO_AES_ARM64_NEON_BLK | |
4628 | select CRYPTO_AES_ARM64 | |
4629 | diff --git a/arch/arm64/crypto/crc32-ce-glue.c b/arch/arm64/crypto/crc32-ce-glue.c | |
4630 | index 34b4e3d46aab..ae055cdad8cf 100644 | |
4631 | --- a/arch/arm64/crypto/crc32-ce-glue.c | |
4632 | +++ b/arch/arm64/crypto/crc32-ce-glue.c | |
4633 | @@ -208,7 +208,8 @@ static struct shash_alg crc32_pmull_algs[] = { { | |
4634 | ||
4635 | static int __init crc32_pmull_mod_init(void) | |
4636 | { | |
4637 | - if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && (elf_hwcap & HWCAP_PMULL)) { | |
4638 | + if (IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && | |
4639 | + !IS_ENABLED(CONFIG_PREEMPT_RT_BASE) && (elf_hwcap & HWCAP_PMULL)) { | |
4640 | crc32_pmull_algs[0].update = crc32_pmull_update; | |
4641 | crc32_pmull_algs[1].update = crc32c_pmull_update; | |
4642 | ||
4643 | diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h | |
4644 | index 55be59a35e3f..ba0cf1361f65 100644 | |
4645 | --- a/arch/arm64/include/asm/spinlock_types.h | |
4646 | +++ b/arch/arm64/include/asm/spinlock_types.h | |
4647 | @@ -16,10 +16,6 @@ | |
4648 | #ifndef __ASM_SPINLOCK_TYPES_H | |
4649 | #define __ASM_SPINLOCK_TYPES_H | |
4650 | ||
4651 | -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) | |
4652 | -# error "please don't include this file directly" | |
4653 | -#endif | |
4654 | - | |
4655 | #include <linux/types.h> | |
4656 | ||
4657 | #define TICKET_SHIFT 16 | |
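The "please don't include this file directly" guard is dropped here and, in the hunks below, from blackfin, hexagon, ia64, m32r, metag, mn10300, powerpc, s390, sh and tile as well. On RT, linux/spinlock_types.h carries the rtmutex-based spinlock_t, so the raw-lock headers must be able to reach arch_spinlock_t without that detour. A sketch of the include pattern this enables, with illustrative names:

    /* RT-style raw-lock header (sketch; debug fields elided) */
    #include <asm/spinlock_types.h>   /* now legal to include directly */

    typedef struct raw_spinlock {
            arch_spinlock_t raw_lock; /* the real arch lock word */
    } raw_spinlock_t;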
4658 | diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h | |
4659 | index fc786d344e46..b833258b7594 100644 | |
4660 | --- a/arch/arm64/include/asm/thread_info.h | |
4661 | +++ b/arch/arm64/include/asm/thread_info.h | |
4662 | @@ -43,6 +43,7 @@ struct thread_info { | |
4663 | u64 ttbr0; /* saved TTBR0_EL1 */ | |
4664 | #endif | |
4665 | int preempt_count; /* 0 => preemptable, <0 => bug */ | |
4666 | + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ | |
4667 | }; | |
4668 | ||
4669 | #define INIT_THREAD_INFO(tsk) \ | |
4670 | @@ -82,6 +83,7 @@ void arch_setup_new_exec(void); | |
4671 | #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ | |
4672 | #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ | |
4673 | #define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ | |
4674 | +#define TIF_NEED_RESCHED_LAZY 6 | |
4675 | #define TIF_NOHZ 7 | |
4676 | #define TIF_SYSCALL_TRACE 8 | |
4677 | #define TIF_SYSCALL_AUDIT 9 | |
4678 | @@ -98,6 +100,7 @@ void arch_setup_new_exec(void); | |
4679 | #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) | |
4680 | #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) | |
4681 | #define _TIF_FOREIGN_FPSTATE (1 << TIF_FOREIGN_FPSTATE) | |
4682 | +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) | |
4683 | #define _TIF_NOHZ (1 << TIF_NOHZ) | |
4684 | #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) | |
4685 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | |
4686 | @@ -109,8 +112,9 @@ void arch_setup_new_exec(void); | |
4687 | ||
4688 | #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ | |
4689 | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ | |
4690 | - _TIF_UPROBE | _TIF_FSCHECK) | |
4691 | + _TIF_UPROBE | _TIF_FSCHECK | _TIF_NEED_RESCHED_LAZY) | |
4692 | ||
4693 | +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) | |
4694 | #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | |
4695 | _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ | |
4696 | _TIF_NOHZ) | |
4697 | diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c | |
4698 | index b5e43b01b396..ae26a1664436 100644 | |
4699 | --- a/arch/arm64/kernel/asm-offsets.c | |
4700 | +++ b/arch/arm64/kernel/asm-offsets.c | |
4701 | @@ -39,6 +39,7 @@ int main(void) | |
4702 | BLANK(); | |
4703 | DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); | |
4704 | DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); | |
4705 | + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); | |
4706 | DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); | |
4707 | #ifdef CONFIG_ARM64_SW_TTBR0_PAN | |
4708 | DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); | |
4709 | diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S | |
4710 | index c1ffa95c0ad2..c60ecb5a3916 100644 | |
4711 | --- a/arch/arm64/kernel/entry.S | |
4712 | +++ b/arch/arm64/kernel/entry.S | |
4713 | @@ -637,11 +637,16 @@ el1_irq: | |
4714 | ||
4715 | #ifdef CONFIG_PREEMPT | |
4716 | ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count | |
4717 | - cbnz w24, 1f // preempt count != 0 | |
4718 | + cbnz w24, 2f // preempt count != 0 | |
4719 | ldr x0, [tsk, #TSK_TI_FLAGS] // get flags | |
4720 | - tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? | |
4721 | - bl el1_preempt | |
4722 | + tbnz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? | |
e4b2b4a8 | 4723 | + |
4724 | + ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count |
4725 | + cbnz w24, 2f // preempt lazy count != 0 | |
4726 | + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling? | |
4727 | 1: | |
4728 | + bl el1_preempt | |
4729 | +2: | |
4730 | #endif | |
4731 | #ifdef CONFIG_TRACE_IRQFLAGS | |
4732 | bl trace_hardirqs_on | |
4733 | @@ -655,6 +660,7 @@ el1_preempt: | |
4734 | 1: bl preempt_schedule_irq // irq en/disable is done inside | |
4735 | ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS | |
4736 | tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? | |
4737 | + tbnz x0, #TIF_NEED_RESCHED_LAZY, 1b // needs rescheduling? | |
4738 | ret x24 | |
4739 | #endif | |
4740 | ||
4741 | diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c | |
4742 | index 43442b3a463f..81bf9545a589 100644 | |
4743 | --- a/arch/arm64/kernel/signal.c | |
4744 | +++ b/arch/arm64/kernel/signal.c | |
4745 | @@ -756,7 +756,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, | |
4746 | /* Check valid user FS if needed */ | |
4747 | addr_limit_user_check(); | |
4748 | ||
4749 | - if (thread_flags & _TIF_NEED_RESCHED) { | |
4750 | + if (thread_flags & _TIF_NEED_RESCHED_MASK) { | |
4751 | schedule(); | |
4752 | } else { | |
4753 | local_irq_enable(); | |
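These hunks wire arm64 up for lazy preemption: an ordinary SCHED_OTHER wakeup sets TIF_NEED_RESCHED_LAZY, which is honoured only once both the hard preempt count and the lazy count are zero, while a hard TIF_NEED_RESCHED (set for real-time tasks) still preempts immediately. In C terms, the el1_irq return path above does roughly the following; preempt_lazy_count() is an assumed accessor for thread_info.preempt_lazy_count, not part of this hunk:

    /* sketch of the el1_irq decision logic */
    static void irq_return_preempt_check(void)
    {
            if (preempt_count() != 0)
                    return;                         /* preemption disabled */
            if (test_thread_flag(TIF_NEED_RESCHED))
                    preempt_schedule_irq();         /* hard resched: always honoured */
            else if (preempt_lazy_count() == 0 &&
                     test_thread_flag(TIF_NEED_RESCHED_LAZY))
                    preempt_schedule_irq();         /* lazy resched: deferrable */
    }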
4754 | diff --git a/arch/blackfin/include/asm/spinlock_types.h b/arch/blackfin/include/asm/spinlock_types.h | |
4755 | index 1a33608c958b..103b34d3dcf6 100644 | |
4756 | --- a/arch/blackfin/include/asm/spinlock_types.h | |
4757 | +++ b/arch/blackfin/include/asm/spinlock_types.h | |
4758 | @@ -7,10 +7,6 @@ | |
4759 | #ifndef __ASM_SPINLOCK_TYPES_H | |
4760 | #define __ASM_SPINLOCK_TYPES_H | |
4761 | ||
4762 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4763 | -# error "please don't include this file directly" | |
4764 | -#endif | |
4765 | - | |
4766 | #include <asm/rwlock.h> | |
4767 | ||
4768 | typedef struct { | |
4769 | diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h | |
4770 | index 7a906b5214a4..d8f596fec022 100644 | |
4771 | --- a/arch/hexagon/include/asm/spinlock_types.h | |
4772 | +++ b/arch/hexagon/include/asm/spinlock_types.h | |
4773 | @@ -21,10 +21,6 @@ | |
4774 | #ifndef _ASM_SPINLOCK_TYPES_H | |
4775 | #define _ASM_SPINLOCK_TYPES_H | |
4776 | ||
4777 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4778 | -# error "please don't include this file directly" | |
4779 | -#endif | |
4780 | - | |
4781 | typedef struct { | |
4782 | volatile unsigned int lock; | |
4783 | } arch_spinlock_t; | |
4784 | diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h | |
4785 | index 6e345fefcdca..681408d6816f 100644 | |
4786 | --- a/arch/ia64/include/asm/spinlock_types.h | |
4787 | +++ b/arch/ia64/include/asm/spinlock_types.h | |
4788 | @@ -2,10 +2,6 @@ | |
4789 | #ifndef _ASM_IA64_SPINLOCK_TYPES_H | |
4790 | #define _ASM_IA64_SPINLOCK_TYPES_H | |
4791 | ||
4792 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4793 | -# error "please don't include this file directly" | |
4794 | -#endif | |
4795 | - | |
4796 | typedef struct { | |
4797 | volatile unsigned int lock; | |
4798 | } arch_spinlock_t; | |
4799 | diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c | |
4800 | index 555b11180156..6866201a7603 100644 | |
4801 | --- a/arch/ia64/kernel/mca.c | |
4802 | +++ b/arch/ia64/kernel/mca.c | |
4803 | @@ -1824,7 +1824,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, | |
4804 | ti->cpu = cpu; | |
4805 | p->stack = ti; | |
4806 | p->state = TASK_UNINTERRUPTIBLE; | |
4807 | - cpumask_set_cpu(cpu, &p->cpus_allowed); | |
4808 | + cpumask_set_cpu(cpu, &p->cpus_mask); | |
4809 | INIT_LIST_HEAD(&p->tasks); | |
4810 | p->parent = p->real_parent = p->group_leader = p; | |
4811 | INIT_LIST_HEAD(&p->children); | |
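Here and in the mips, powerpc and tile hunks below, ->cpus_allowed is split into cpus_mask plus a cpus_ptr indirection so that migrate_disable() can pin a task to one CPU without clobbering the user-set affinity: writers update cpus_mask, hot readers dereference cpus_ptr. A sketch of the assumed arrangement (field comments are illustrative):

    struct task_struct {
            /* ... */
            const cpumask_t *cpus_ptr;  /* what the scheduler obeys right now */
            cpumask_t        cpus_mask; /* the stable, user-set affinity      */
            /* ... */
    };

    /* normally p->cpus_ptr == &p->cpus_mask; migrate_disable() points
     * cpus_ptr at cpumask_of(smp_processor_id()) and migrate_enable()
     * restores it */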
4812 | diff --git a/arch/m32r/include/asm/spinlock_types.h b/arch/m32r/include/asm/spinlock_types.h | |
4813 | index bb0d17b64198..fc6afa42fe11 100644 | |
4814 | --- a/arch/m32r/include/asm/spinlock_types.h | |
4815 | +++ b/arch/m32r/include/asm/spinlock_types.h | |
4816 | @@ -2,10 +2,6 @@ | |
4817 | #ifndef _ASM_M32R_SPINLOCK_TYPES_H | |
4818 | #define _ASM_M32R_SPINLOCK_TYPES_H | |
4819 | ||
4820 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4821 | -# error "please don't include this file directly" | |
4822 | -#endif | |
4823 | - | |
4824 | typedef struct { | |
4825 | volatile int slock; | |
4826 | } arch_spinlock_t; | |
4827 | diff --git a/arch/metag/include/asm/spinlock_types.h b/arch/metag/include/asm/spinlock_types.h | |
4828 | index cd197f1bed59..adc26e9797c5 100644 | |
4829 | --- a/arch/metag/include/asm/spinlock_types.h | |
4830 | +++ b/arch/metag/include/asm/spinlock_types.h | |
4831 | @@ -2,10 +2,6 @@ | |
4832 | #ifndef _ASM_METAG_SPINLOCK_TYPES_H | |
4833 | #define _ASM_METAG_SPINLOCK_TYPES_H | |
4834 | ||
4835 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4836 | -# error "please don't include this file directly" | |
4837 | -#endif | |
4838 | - | |
4839 | typedef struct { | |
4840 | volatile unsigned int lock; | |
4841 | } arch_spinlock_t; | |
4842 | diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig | |
4843 | index c82457b0e733..7bb1838508de 100644 | |
4844 | --- a/arch/mips/Kconfig | |
4845 | +++ b/arch/mips/Kconfig | |
4846 | @@ -2519,7 +2519,7 @@ config MIPS_ASID_BITS_VARIABLE | |
4847 | # | |
4848 | config HIGHMEM | |
4849 | bool "High Memory Support" | |
4850 | - depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA | |
4851 | + depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA && !PREEMPT_RT_FULL | |
4852 | ||
4853 | config CPU_SUPPORTS_HIGHMEM | |
4854 | bool | |
4855 | diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h | |
4856 | index e610473d61b8..1428b4febbc9 100644 | |
4857 | --- a/arch/mips/include/asm/switch_to.h | |
4858 | +++ b/arch/mips/include/asm/switch_to.h | |
4859 | @@ -42,7 +42,7 @@ extern struct task_struct *ll_task; | |
4860 | * inline to try to keep the overhead down. If we have been forced to run on | |
4861 | * a "CPU" with an FPU because of a previous high level of FP computation, | |
4862 | * but did not actually use the FPU during the most recent time-slice (CU1 | |
4863 | - * isn't set), we undo the restriction on cpus_allowed. | |
4864 | + * isn't set), we undo the restriction on cpus_mask. | |
4865 | * | |
4866 | * We're not calling set_cpus_allowed() here, because we have no need to | |
4867 | * force prompt migration - we're already switching the current CPU to a | |
4868 | @@ -57,7 +57,7 @@ do { \ | |
4869 | test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ | |
4870 | (!(KSTK_STATUS(prev) & ST0_CU1))) { \ | |
4871 | clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ | |
4872 | - prev->cpus_allowed = prev->thread.user_cpus_allowed; \ | |
4873 | + prev->cpus_mask = prev->thread.user_cpus_allowed; \ | |
4874 | } \ | |
4875 | next->thread.emulated_fp = 0; \ | |
4876 | } while(0) | |
4877 | diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c | |
4878 | index a7c0f97e4b0d..1a08428eedcf 100644 | |
4879 | --- a/arch/mips/kernel/mips-mt-fpaff.c | |
4880 | +++ b/arch/mips/kernel/mips-mt-fpaff.c | |
4881 | @@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, | |
4882 | if (retval) | |
4883 | goto out_unlock; | |
4884 | ||
4885 | - cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); | |
4886 | + cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr); | |
4887 | cpumask_and(&mask, &allowed, cpu_active_mask); | |
4888 | ||
4889 | out_unlock: | |
4890 | diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c | |
4891 | index 583aed906933..24ad7aaca5eb 100644 | |
4892 | --- a/arch/mips/kernel/traps.c | |
4893 | +++ b/arch/mips/kernel/traps.c | |
4894 | @@ -1193,12 +1193,12 @@ static void mt_ase_fp_affinity(void) | |
4895 | * restricted the allowed set to exclude any CPUs with FPUs, | |
4896 | * we'll skip the procedure. | |
4897 | */ | |
4898 | - if (cpumask_intersects(¤t->cpus_allowed, &mt_fpu_cpumask)) { | |
4899 | + if (cpumask_intersects(¤t->cpus_mask, &mt_fpu_cpumask)) { | |
4900 | cpumask_t tmask; | |
4901 | ||
4902 | current->thread.user_cpus_allowed | |
4903 | - = current->cpus_allowed; | |
4904 | - cpumask_and(&tmask, ¤t->cpus_allowed, | |
4905 | + = current->cpus_mask; | |
4906 | + cpumask_and(&tmask, ¤t->cpus_mask, | |
4907 | &mt_fpu_cpumask); | |
4908 | set_cpus_allowed_ptr(current, &tmask); | |
4909 | set_thread_flag(TIF_FPUBOUND); | |
4910 | diff --git a/arch/mn10300/include/asm/spinlock_types.h b/arch/mn10300/include/asm/spinlock_types.h | |
4911 | index 32abdc89bbc7..c45230a12d60 100644 | |
4912 | --- a/arch/mn10300/include/asm/spinlock_types.h | |
4913 | +++ b/arch/mn10300/include/asm/spinlock_types.h | |
4914 | @@ -2,10 +2,6 @@ | |
4915 | #ifndef _ASM_SPINLOCK_TYPES_H | |
4916 | #define _ASM_SPINLOCK_TYPES_H | |
4917 | ||
4918 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4919 | -# error "please don't include this file directly" | |
4920 | -#endif | |
4921 | - | |
4922 | typedef struct arch_spinlock { | |
4923 | unsigned int slock; | |
4924 | } arch_spinlock_t; | |
4925 | diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig | |
4926 | index fe418226df7f..b5658e925465 100644 | |
4927 | --- a/arch/powerpc/Kconfig | |
4928 | +++ b/arch/powerpc/Kconfig | |
4929 | @@ -111,10 +111,11 @@ config LOCKDEP_SUPPORT | |
4930 | ||
4931 | config RWSEM_GENERIC_SPINLOCK | |
4932 | bool | |
4933 | + default y if PREEMPT_RT_FULL | |
4934 | ||
4935 | config RWSEM_XCHGADD_ALGORITHM | |
4936 | bool | |
4937 | - default y | |
4938 | + default y if !PREEMPT_RT_FULL | |
4939 | ||
4940 | config GENERIC_LOCKBREAK | |
4941 | bool | |
4942 | @@ -215,6 +216,7 @@ config PPC | |
4943 | select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH | |
4944 | select HAVE_PERF_REGS | |
4945 | select HAVE_PERF_USER_STACK_DUMP | |
4946 | + select HAVE_PREEMPT_LAZY | |
4947 | select HAVE_RCU_TABLE_FREE if SMP | |
4948 | select HAVE_REGS_AND_STACK_ACCESS_API | |
4949 | select HAVE_SYSCALL_TRACEPOINTS | |
4950 | @@ -390,7 +392,7 @@ menu "Kernel options" | |
4951 | ||
4952 | config HIGHMEM | |
4953 | bool "High memory support" | |
4954 | - depends on PPC32 | |
4955 | + depends on PPC32 && !PREEMPT_RT_FULL | |
4956 | ||
4957 | source kernel/Kconfig.hz | |
4958 | source kernel/Kconfig.preempt | |
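On RT, rw_semaphore is replaced by an rtmutex-based implementation, so the xchg-add rwsem algorithm is switched off and the generic-spinlock variant selected; sparc and x86 get the same Kconfig flip below. The RT semaphore is assumed to have roughly this shape (a sketch, not the exact patch):

    /* rtmutex-based rwsem, writer-biased (sketch) */
    struct rw_semaphore {
            atomic_t        readers;  /* active reader count    */
            struct rt_mutex rtmutex;  /* PI-aware sleeping lock */
    };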
4959 | diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h | |
4960 | index 87adaf13b7e8..7305cb6a53e4 100644 | |
4961 | --- a/arch/powerpc/include/asm/spinlock_types.h | |
4962 | +++ b/arch/powerpc/include/asm/spinlock_types.h | |
4963 | @@ -2,10 +2,6 @@ | |
4964 | #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H | |
4965 | #define _ASM_POWERPC_SPINLOCK_TYPES_H | |
4966 | ||
4967 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
4968 | -# error "please don't include this file directly" | |
4969 | -#endif | |
4970 | - | |
4971 | typedef struct { | |
4972 | volatile unsigned int slock; | |
4973 | } arch_spinlock_t; | |
4974 | diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h | |
4975 | index a264c3ad366b..020afb8329a1 100644 | |
4976 | --- a/arch/powerpc/include/asm/thread_info.h | |
4977 | +++ b/arch/powerpc/include/asm/thread_info.h | |
4978 | @@ -36,6 +36,8 @@ struct thread_info { | |
4979 | int cpu; /* cpu we're on */ | |
4980 | int preempt_count; /* 0 => preemptable, | |
4981 | <0 => BUG */ | |
4982 | + int preempt_lazy_count; /* 0 => preemptable, | |
4983 | + <0 => BUG */ | |
4984 | unsigned long local_flags; /* private flags for thread */ | |
4985 | #ifdef CONFIG_LIVEPATCH | |
4986 | unsigned long *livepatch_sp; | |
4987 | @@ -81,8 +83,7 @@ static inline struct thread_info *current_thread_info(void) | |
4988 | #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ | |
4989 | #define TIF_SIGPENDING 1 /* signal pending */ | |
4990 | #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ | |
4991 | -#define TIF_POLLING_NRFLAG 3 /* true if poll_idle() is polling | |
4992 | - TIF_NEED_RESCHED */ | |
4993 | +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling necessary */ | |
4994 | #define TIF_32BIT 4 /* 32 bit binary */ | |
4995 | #define TIF_RESTORE_TM 5 /* need to restore TM FP/VEC/VSX */ | |
4996 | #define TIF_PATCH_PENDING 6 /* pending live patching update */ | |
4997 | @@ -101,6 +102,8 @@ static inline struct thread_info *current_thread_info(void) | |
4998 | #if defined(CONFIG_PPC64) | |
4999 | #define TIF_ELF2ABI 18 /* function descriptors must die! */ | |
5000 | #endif | |
5001 | +#define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling | |
5002 | + TIF_NEED_RESCHED */ | |
5003 | ||
5004 | /* as above, but as bit values */ | |
5005 | #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) | |
5006 | @@ -120,14 +123,16 @@ static inline struct thread_info *current_thread_info(void) | |
5007 | #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) | |
5008 | #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) | |
5009 | #define _TIF_NOHZ (1<<TIF_NOHZ) | |
5010 | +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) | |
5011 | #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ | |
5012 | _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ | |
5013 | _TIF_NOHZ) | |
5014 | ||
5015 | #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ | |
5016 | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ | |
5017 | - _TIF_RESTORE_TM | _TIF_PATCH_PENDING) | |
5018 | + _TIF_RESTORE_TM | _TIF_PATCH_PENDING | _TIF_NEED_RESCHED_LAZY) | |
5019 | #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) | |
5020 | +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) | |
5021 | ||
5022 | /* Bits in local_flags */ | |
5023 | /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ | |
5024 | diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c | |
5025 | index 2e5ea300258a..a2cb40098d7c 100644 | |
5026 | --- a/arch/powerpc/kernel/asm-offsets.c | |
5027 | +++ b/arch/powerpc/kernel/asm-offsets.c | |
5028 | @@ -156,6 +156,7 @@ int main(void) | |
5029 | OFFSET(TI_FLAGS, thread_info, flags); | |
5030 | OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); | |
5031 | OFFSET(TI_PREEMPT, thread_info, preempt_count); | |
5032 | + OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count); | |
5033 | OFFSET(TI_TASK, thread_info, task); | |
5034 | OFFSET(TI_CPU, thread_info, cpu); | |
5035 | ||
5036 | diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S | |
5037 | index e780e1fbf6c2..dc7fe90ff6a9 100644 | |
5038 | --- a/arch/powerpc/kernel/entry_32.S | |
5039 | +++ b/arch/powerpc/kernel/entry_32.S | |
5040 | @@ -866,7 +866,14 @@ resume_kernel: | |
5041 | cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ | |
5042 | bne restore | |
5043 | andi. r8,r8,_TIF_NEED_RESCHED | |
5044 | + bne+ 1f | |
5045 | + lwz r0,TI_PREEMPT_LAZY(r9) | |
5046 | + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ | |
5047 | + bne restore | |
5048 | + lwz r0,TI_FLAGS(r9) | |
5049 | + andi. r0,r0,_TIF_NEED_RESCHED_LAZY | |
5050 | beq+ restore | |
5051 | +1: | |
5052 | lwz r3,_MSR(r1) | |
5053 | andi. r0,r3,MSR_EE /* interrupts off? */ | |
5054 | beq restore /* don't schedule if so */ | |
5055 | @@ -877,11 +884,11 @@ resume_kernel: | |
5056 | */ | |
5057 | bl trace_hardirqs_off | |
5058 | #endif | |
5059 | -1: bl preempt_schedule_irq | |
5060 | +2: bl preempt_schedule_irq | |
5061 | CURRENT_THREAD_INFO(r9, r1) | |
5062 | lwz r3,TI_FLAGS(r9) | |
5063 | - andi. r0,r3,_TIF_NEED_RESCHED | |
5064 | - bne- 1b | |
5065 | + andi. r0,r3,_TIF_NEED_RESCHED_MASK | |
5066 | + bne- 2b | |
5067 | #ifdef CONFIG_TRACE_IRQFLAGS | |
5068 | /* And now, to properly rebalance the above, we tell lockdep they | |
5069 | * are being turned back on, which will happen when we return | |
5070 | @@ -1204,7 +1211,7 @@ global_dbcr0: | |
5071 | #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ | |
5072 | ||
5073 | do_work: /* r10 contains MSR_KERNEL here */ | |
5074 | - andi. r0,r9,_TIF_NEED_RESCHED | |
5075 | + andi. r0,r9,_TIF_NEED_RESCHED_MASK | |
5076 | beq do_user_signal | |
5077 | ||
5078 | do_resched: /* r10 contains MSR_KERNEL here */ | |
5079 | @@ -1225,7 +1232,7 @@ recheck: | |
5080 | MTMSRD(r10) /* disable interrupts */ | |
5081 | CURRENT_THREAD_INFO(r9, r1) | |
5082 | lwz r9,TI_FLAGS(r9) | |
5083 | - andi. r0,r9,_TIF_NEED_RESCHED | |
5084 | + andi. r0,r9,_TIF_NEED_RESCHED_MASK | |
5085 | bne- do_resched | |
5086 | andi. r0,r9,_TIF_USER_WORK_MASK | |
5087 | beq restore_user | |
5088 | diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S | |
5089 | index c194f4c8e66b..117c1f6cab66 100644 | |
5090 | --- a/arch/powerpc/kernel/entry_64.S | |
5091 | +++ b/arch/powerpc/kernel/entry_64.S | |
5092 | @@ -690,7 +690,7 @@ _GLOBAL(ret_from_except_lite) | |
5093 | bl restore_math | |
5094 | b restore | |
5095 | #endif | |
5096 | -1: andi. r0,r4,_TIF_NEED_RESCHED | |
5097 | +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK | |
5098 | beq 2f | |
5099 | bl restore_interrupts | |
5100 | SCHEDULE_USER | |
5101 | @@ -752,10 +752,18 @@ resume_kernel: | |
5102 | ||
5103 | #ifdef CONFIG_PREEMPT | |
5104 | /* Check if we need to preempt */ | |
5105 | + lwz r8,TI_PREEMPT(r9) | |
5106 | + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */ | |
5107 | + bne restore | |
5108 | andi. r0,r4,_TIF_NEED_RESCHED | |
5109 | + bne+ check_count | |
e4b2b4a8 | 5110 | + |
5111 | + andi. r0,r4,_TIF_NEED_RESCHED_LAZY |
5112 | beq+ restore | |
5113 | + lwz r8,TI_PREEMPT_LAZY(r9) | |
e4b2b4a8 | 5114 | + |
5115 | /* Check that preempt_count() == 0 and interrupts are enabled */ |
5116 | - lwz r8,TI_PREEMPT(r9) | |
5117 | +check_count: | |
5118 | cmpwi cr1,r8,0 | |
5119 | ld r0,SOFTE(r1) | |
5120 | cmpdi r0,0 | |
5121 | @@ -772,7 +780,7 @@ resume_kernel: | |
5122 | /* Re-test flags and eventually loop */ | |
5123 | CURRENT_THREAD_INFO(r9, r1) | |
5124 | ld r4,TI_FLAGS(r9) | |
5125 | - andi. r0,r4,_TIF_NEED_RESCHED | |
5126 | + andi. r0,r4,_TIF_NEED_RESCHED_MASK | |
5127 | bne 1b | |
5128 | ||
5129 | /* | |
5130 | diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c | |
5131 | index 0ce8b0e5d7ba..375adb3048fc 100644 | |
5132 | --- a/arch/powerpc/kernel/irq.c | |
5133 | +++ b/arch/powerpc/kernel/irq.c | |
5134 | @@ -693,6 +693,7 @@ void irq_ctx_init(void) | |
5135 | } | |
5136 | } | |
5137 | ||
5138 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
5139 | void do_softirq_own_stack(void) | |
5140 | { | |
5141 | struct thread_info *curtp, *irqtp; | |
5142 | @@ -710,6 +711,7 @@ void do_softirq_own_stack(void) | |
5143 | if (irqtp->flags) | |
5144 | set_bits(irqtp->flags, &curtp->flags); | |
5145 | } | |
5146 | +#endif | |
5147 | ||
5148 | irq_hw_number_t virq_to_hw(unsigned int virq) | |
5149 | { | |
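With PREEMPT_RT_FULL, softirqs run in thread context, so the switch-to-a-dedicated-softirq-stack helpers are compiled out; sh and sparc below get the same guard, and the matching asm entry points in misc_32.S/misc_64.S are stubbed out right after this. When the arch override is absent, the generic fallback in <linux/interrupt.h> is essentially:

    /* run softirqs on whatever (thread) stack is current, which is
     * exactly what RT's threaded softirqs want */
    static inline void do_softirq_own_stack(void)
    {
            __do_softirq();
    }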
5150 | diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S | |
5151 | index 3f7a9a2d2435..1795359d27b6 100644 | |
5152 | --- a/arch/powerpc/kernel/misc_32.S | |
5153 | +++ b/arch/powerpc/kernel/misc_32.S | |
5154 | @@ -41,6 +41,7 @@ | |
5155 | * We store the saved ksp_limit in the unused part | |
5156 | * of the STACK_FRAME_OVERHEAD | |
5157 | */ | |
5158 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
5159 | _GLOBAL(call_do_softirq) | |
5160 | mflr r0 | |
5161 | stw r0,4(r1) | |
5162 | @@ -57,6 +58,7 @@ _GLOBAL(call_do_softirq) | |
5163 | stw r10,THREAD+KSP_LIMIT(r2) | |
5164 | mtlr r0 | |
5165 | blr | |
5166 | +#endif | |
5167 | ||
5168 | /* | |
5169 | * void call_do_irq(struct pt_regs *regs, struct thread_info *irqtp); | |
5170 | diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S | |
5171 | index 3280953a82cf..dd2a80d190c4 100644 | |
5172 | --- a/arch/powerpc/kernel/misc_64.S | |
5173 | +++ b/arch/powerpc/kernel/misc_64.S | |
5174 | @@ -31,6 +31,7 @@ | |
5175 | ||
5176 | .text | |
5177 | ||
5178 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
5179 | _GLOBAL(call_do_softirq) | |
5180 | mflr r0 | |
5181 | std r0,16(r1) | |
5182 | @@ -41,6 +42,7 @@ _GLOBAL(call_do_softirq) | |
5183 | ld r0,16(r1) | |
5184 | mtlr r0 | |
5185 | blr | |
5186 | +#endif | |
5187 | ||
5188 | _GLOBAL(call_do_irq) | |
5189 | mflr r0 | |
5190 | diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig | |
5191 | index 648160334abf..9d24331fc9b4 100644 | |
5192 | --- a/arch/powerpc/kvm/Kconfig | |
5193 | +++ b/arch/powerpc/kvm/Kconfig | |
5194 | @@ -177,6 +177,7 @@ config KVM_E500MC | |
5195 | config KVM_MPIC | |
5196 | bool "KVM in-kernel MPIC emulation" | |
5197 | depends on KVM && E500 | |
5198 | + depends on !PREEMPT_RT_FULL | |
5199 | select HAVE_KVM_IRQCHIP | |
5200 | select HAVE_KVM_IRQFD | |
5201 | select HAVE_KVM_IRQ_ROUTING | |
5202 | diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c | |
5203 | index 1fbb5da17dd2..ca86366d5424 100644 | |
5204 | --- a/arch/powerpc/platforms/cell/spufs/sched.c | |
5205 | +++ b/arch/powerpc/platforms/cell/spufs/sched.c | |
5206 | @@ -141,7 +141,7 @@ void __spu_update_sched_info(struct spu_context *ctx) | |
5207 | * runqueue. The context will be rescheduled on the proper node | |
5208 | * if it is timesliced or preempted. | |
5209 | */ | |
5210 | - cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed); | |
5211 | + cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr); | |
5212 | ||
5213 | /* Save the current cpu id for spu interrupt routing. */ | |
5214 | ctx->last_ran = raw_smp_processor_id(); | |
5215 | diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c | |
5216 | index e48462447ff0..2670cee66064 100644 | |
5217 | --- a/arch/powerpc/platforms/ps3/device-init.c | |
5218 | +++ b/arch/powerpc/platforms/ps3/device-init.c | |
5219 | @@ -752,7 +752,7 @@ static int ps3_notification_read_write(struct ps3_notification_device *dev, | |
5220 | } | |
5221 | pr_debug("%s:%u: notification %s issued\n", __func__, __LINE__, op); | |
5222 | ||
5223 | - res = wait_event_interruptible(dev->done.wait, | |
5224 | + res = swait_event_interruptible(dev->done.wait, | |
5225 | dev->done.done || kthread_should_stop()); | |
5226 | if (kthread_should_stop()) | |
5227 | res = -EINTR; | |
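The wait moves from wait_event_interruptible() to swait_event_interruptible() because, on RT, completions are built on simple wait queues: their wake path runs under a raw spinlock and does a bounded amount of work, so it stays callable from contexts that must not sleep. A minimal usage sketch, assuming the swait API of this kernel generation (see <linux/swait.h>):

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);
    static bool demo_done;

    static int demo_wait(void)               /* waiter side */
    {
            return swait_event_interruptible(demo_wq, demo_done);
    }

    static void demo_complete(void)          /* waker side, atomic-safe */
    {
            demo_done = true;
            swake_up(&demo_wq);
    }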
5228 | diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h | |
5229 | index 1861a0c5dd47..74092ebaca3c 100644 | |
5230 | --- a/arch/s390/include/asm/spinlock_types.h | |
5231 | +++ b/arch/s390/include/asm/spinlock_types.h | |
5232 | @@ -2,10 +2,6 @@ | |
5233 | #ifndef __ASM_SPINLOCK_TYPES_H | |
5234 | #define __ASM_SPINLOCK_TYPES_H | |
5235 | ||
5236 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
5237 | -# error "please don't include this file directly" | |
5238 | -#endif | |
5239 | - | |
5240 | typedef struct { | |
5241 | int lock; | |
5242 | } __attribute__ ((aligned (4))) arch_spinlock_t; | |
5243 | diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h | |
5244 | index e82369f286a2..22ca9a98bbb8 100644 | |
5245 | --- a/arch/sh/include/asm/spinlock_types.h | |
5246 | +++ b/arch/sh/include/asm/spinlock_types.h | |
5247 | @@ -2,10 +2,6 @@ | |
5248 | #ifndef __ASM_SH_SPINLOCK_TYPES_H | |
5249 | #define __ASM_SH_SPINLOCK_TYPES_H | |
5250 | ||
5251 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
5252 | -# error "please don't include this file directly" | |
5253 | -#endif | |
5254 | - | |
5255 | typedef struct { | |
5256 | volatile unsigned int lock; | |
5257 | } arch_spinlock_t; | |
5258 | diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c | |
5259 | index 245dbeb20afe..e298c82d2a69 100644 | |
5260 | --- a/arch/sh/kernel/irq.c | |
5261 | +++ b/arch/sh/kernel/irq.c | |
5262 | @@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu) | |
5263 | hardirq_ctx[cpu] = NULL; | |
5264 | } | |
5265 | ||
5266 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
5267 | void do_softirq_own_stack(void) | |
5268 | { | |
5269 | struct thread_info *curctx; | |
5270 | @@ -175,6 +176,7 @@ void do_softirq_own_stack(void) | |
5271 | "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" | |
5272 | ); | |
5273 | } | |
5274 | +#endif | |
5275 | #else | |
5276 | static inline void handle_one_irq(unsigned int irq) | |
5277 | { | |
5278 | diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig | |
5279 | index 4e83f950713e..7f9d71523763 100644 | |
5280 | --- a/arch/sparc/Kconfig | |
5281 | +++ b/arch/sparc/Kconfig | |
5282 | @@ -206,12 +206,10 @@ config NR_CPUS | |
5283 | source kernel/Kconfig.hz | |
5284 | ||
5285 | config RWSEM_GENERIC_SPINLOCK | |
5286 | - bool | |
5287 | - default y if SPARC32 | |
5288 | + def_bool PREEMPT_RT_FULL | |
5289 | ||
5290 | config RWSEM_XCHGADD_ALGORITHM | |
5291 | - bool | |
5292 | - default y if SPARC64 | |
5293 | + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL | |
5294 | ||
5295 | config GENERIC_HWEIGHT | |
5296 | bool | |
5297 | diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c | |
5298 | index d66dde833f5e..f87b3f8f4d43 100644 | |
5299 | --- a/arch/sparc/kernel/irq_64.c | |
5300 | +++ b/arch/sparc/kernel/irq_64.c | |
5301 | @@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) | |
5302 | set_irq_regs(old_regs); | |
5303 | } | |
5304 | ||
5305 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
5306 | void do_softirq_own_stack(void) | |
5307 | { | |
5308 | void *orig_sp, *sp = softirq_stack[smp_processor_id()]; | |
5309 | @@ -869,6 +870,7 @@ void do_softirq_own_stack(void) | |
5310 | __asm__ __volatile__("mov %0, %%sp" | |
5311 | : : "r" (orig_sp)); | |
5312 | } | |
5313 | +#endif | |
5314 | ||
5315 | #ifdef CONFIG_HOTPLUG_CPU | |
5316 | void fixup_irqs(void) | |
5317 | diff --git a/arch/tile/include/asm/setup.h b/arch/tile/include/asm/setup.h | |
5318 | index 2a0347af0702..670fa2f4cfc3 100644 | |
5319 | --- a/arch/tile/include/asm/setup.h | |
5320 | +++ b/arch/tile/include/asm/setup.h | |
5321 | @@ -49,7 +49,7 @@ int hardwall_ipi_valid(int cpu); | |
5322 | ||
5323 | /* Hook hardwall code into changes in affinity. */ | |
5324 | #define arch_set_cpus_allowed(p, new_mask) do { \ | |
5325 | - if (!cpumask_equal(&p->cpus_allowed, new_mask)) \ | |
5326 | + if (!cpumask_equal(p->cpus_ptr, new_mask)) \ | |
5327 | hardwall_deactivate_all(p); \ | |
5328 | } while (0) | |
5329 | #endif | |
5330 | diff --git a/arch/tile/include/asm/spinlock_types.h b/arch/tile/include/asm/spinlock_types.h | |
5331 | index a71f59b49c50..9311c6ff2abc 100644 | |
5332 | --- a/arch/tile/include/asm/spinlock_types.h | |
5333 | +++ b/arch/tile/include/asm/spinlock_types.h | |
5334 | @@ -15,10 +15,6 @@ | |
5335 | #ifndef _ASM_TILE_SPINLOCK_TYPES_H | |
5336 | #define _ASM_TILE_SPINLOCK_TYPES_H | |
5337 | ||
5338 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
5339 | -# error "please don't include this file directly" | |
5340 | -#endif | |
5341 | - | |
5342 | #ifdef __tilegx__ | |
5343 | ||
5344 | /* Low 15 bits are "next"; high 15 bits are "current". */ | |
5345 | diff --git a/arch/tile/kernel/hardwall.c b/arch/tile/kernel/hardwall.c | |
5346 | index 2fd1694ac1d0..98f4fb696289 100644 | |
5347 | --- a/arch/tile/kernel/hardwall.c | |
5348 | +++ b/arch/tile/kernel/hardwall.c | |
5349 | @@ -590,12 +590,12 @@ static int hardwall_activate(struct hardwall_info *info) | |
5350 | * Get our affinity; if we're not bound to this tile uniquely, | |
5351 | * we can't access the network registers. | |
5352 | */ | |
5353 | - if (cpumask_weight(&p->cpus_allowed) != 1) | |
5354 | + if (p->nr_cpus_allowed != 1) | |
5355 | return -EPERM; | |
5356 | ||
5357 | /* Make sure we are bound to a cpu assigned to this resource. */ | |
5358 | cpu = smp_processor_id(); | |
5359 | - BUG_ON(cpumask_first(&p->cpus_allowed) != cpu); | |
5360 | + BUG_ON(cpumask_first(p->cpus_ptr) != cpu); | |
5361 | if (!cpumask_test_cpu(cpu, &info->cpumask)) | |
5362 | return -EINVAL; | |
5363 | ||
5364 | @@ -621,17 +621,17 @@ static int hardwall_activate(struct hardwall_info *info) | |
5365 | * Deactivate a task's hardwall. Must hold lock for hardwall_type. | |
5366 | * This method may be called from exit_thread(), so we don't want to | |
5367 | * rely on too many fields of struct task_struct still being valid. | |
5368 | - * We assume the cpus_allowed, pid, and comm fields are still valid. | |
5369 | + * We assume the nr_cpus_allowed, pid, and comm fields are still valid. | |
5370 | */ | |
5371 | static void _hardwall_deactivate(struct hardwall_type *hwt, | |
5372 | struct task_struct *task) | |
5373 | { | |
5374 | struct thread_struct *ts = &task->thread; | |
5375 | ||
5376 | - if (cpumask_weight(&task->cpus_allowed) != 1) { | |
5377 | + if (task->nr_cpus_allowed != 1) { | |
5378 | pr_err("pid %d (%s) releasing %s hardwall with an affinity mask containing %d cpus!\n", | |
5379 | task->pid, task->comm, hwt->name, | |
5380 | - cpumask_weight(&task->cpus_allowed)); | |
5381 | + task->nr_cpus_allowed); | |
5382 | BUG(); | |
5383 | } | |
5384 | ||
5385 | diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig | |
5386 | index 2af0af33362a..7764f936d6ab 100644 | |
5387 | --- a/arch/x86/Kconfig | |
5388 | +++ b/arch/x86/Kconfig | |
5389 | @@ -169,6 +169,7 @@ config X86 | |
5390 | select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI | |
5391 | select HAVE_PERF_REGS | |
5392 | select HAVE_PERF_USER_STACK_DUMP | |
5393 | + select HAVE_PREEMPT_LAZY | |
5394 | select HAVE_RCU_TABLE_FREE | |
5395 | select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE | |
5396 | select HAVE_REGS_AND_STACK_ACCESS_API | |
5397 | @@ -257,8 +258,11 @@ config ARCH_MAY_HAVE_PC_FDC | |
5398 | def_bool y | |
5399 | depends on ISA_DMA_API | |
5400 | ||
5401 | +config RWSEM_GENERIC_SPINLOCK | |
5402 | + def_bool PREEMPT_RT_FULL | |
e4b2b4a8 | 5403 | + |
5404 | config RWSEM_XCHGADD_ALGORITHM |
5405 | - def_bool y | |
5406 | + def_bool !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT_FULL | |
5407 | ||
5408 | config GENERIC_CALIBRATE_DELAY | |
5409 | def_bool y | |
5410 | @@ -933,7 +937,7 @@ config IOMMU_HELPER | |
5411 | config MAXSMP | |
5412 | bool "Enable Maximum number of SMP Processors and NUMA Nodes" | |
5413 | depends on X86_64 && SMP && DEBUG_KERNEL | |
5414 | - select CPUMASK_OFFSTACK | |
5415 | + select CPUMASK_OFFSTACK if !PREEMPT_RT_FULL | |
5416 | ---help--- | |
5417 | Enable maximum number of CPUS and NUMA Nodes for this architecture. | |
5418 | If unsure, say N. | |
5419 | diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c | |
5420 | index c690ddc78c03..7a3138d33e33 100644 | |
5421 | --- a/arch/x86/crypto/aesni-intel_glue.c | |
5422 | +++ b/arch/x86/crypto/aesni-intel_glue.c | |
5423 | @@ -387,14 +387,14 @@ static int ecb_encrypt(struct skcipher_request *req) | |
5424 | ||
5425 | err = skcipher_walk_virt(&walk, req, true); | |
5426 | ||
5427 | - kernel_fpu_begin(); | |
5428 | while ((nbytes = walk.nbytes)) { | |
5429 | + kernel_fpu_begin(); | |
5430 | aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, | |
5431 | nbytes & AES_BLOCK_MASK); | |
5432 | + kernel_fpu_end(); | |
5433 | nbytes &= AES_BLOCK_SIZE - 1; | |
5434 | err = skcipher_walk_done(&walk, nbytes); | |
5435 | } | |
5436 | - kernel_fpu_end(); | |
5437 | ||
5438 | return err; | |
5439 | } | |
5440 | @@ -409,14 +409,14 @@ static int ecb_decrypt(struct skcipher_request *req) | |
5441 | ||
5442 | err = skcipher_walk_virt(&walk, req, true); | |
5443 | ||
5444 | - kernel_fpu_begin(); | |
5445 | while ((nbytes = walk.nbytes)) { | |
5446 | + kernel_fpu_begin(); | |
5447 | aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, | |
5448 | nbytes & AES_BLOCK_MASK); | |
5449 | + kernel_fpu_end(); | |
5450 | nbytes &= AES_BLOCK_SIZE - 1; | |
5451 | err = skcipher_walk_done(&walk, nbytes); | |
5452 | } | |
5453 | - kernel_fpu_end(); | |
5454 | ||
5455 | return err; | |
5456 | } | |
5457 | @@ -431,14 +431,14 @@ static int cbc_encrypt(struct skcipher_request *req) | |
5458 | ||
5459 | err = skcipher_walk_virt(&walk, req, true); | |
5460 | ||
5461 | - kernel_fpu_begin(); | |
5462 | while ((nbytes = walk.nbytes)) { | |
5463 | + kernel_fpu_begin(); | |
5464 | aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, | |
5465 | nbytes & AES_BLOCK_MASK, walk.iv); | |
5466 | + kernel_fpu_end(); | |
5467 | nbytes &= AES_BLOCK_SIZE - 1; | |
5468 | err = skcipher_walk_done(&walk, nbytes); | |
5469 | } | |
5470 | - kernel_fpu_end(); | |
5471 | ||
5472 | return err; | |
5473 | } | |
5474 | @@ -453,14 +453,14 @@ static int cbc_decrypt(struct skcipher_request *req) | |
5475 | ||
5476 | err = skcipher_walk_virt(&walk, req, true); | |
5477 | ||
5478 | - kernel_fpu_begin(); | |
5479 | while ((nbytes = walk.nbytes)) { | |
5480 | + kernel_fpu_begin(); | |
5481 | aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, | |
5482 | nbytes & AES_BLOCK_MASK, walk.iv); | |
5483 | + kernel_fpu_end(); | |
5484 | nbytes &= AES_BLOCK_SIZE - 1; | |
5485 | err = skcipher_walk_done(&walk, nbytes); | |
5486 | } | |
5487 | - kernel_fpu_end(); | |
5488 | ||
5489 | return err; | |
5490 | } | |
5491 | @@ -510,18 +510,20 @@ static int ctr_crypt(struct skcipher_request *req) | |
5492 | ||
5493 | err = skcipher_walk_virt(&walk, req, true); | |
5494 | ||
5495 | - kernel_fpu_begin(); | |
5496 | while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { | |
5497 | + kernel_fpu_begin(); | |
5498 | aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, | |
5499 | nbytes & AES_BLOCK_MASK, walk.iv); | |
5500 | + kernel_fpu_end(); | |
5501 | nbytes &= AES_BLOCK_SIZE - 1; | |
5502 | err = skcipher_walk_done(&walk, nbytes); | |
5503 | } | |
5504 | if (walk.nbytes) { | |
5505 | + kernel_fpu_begin(); | |
5506 | ctr_crypt_final(ctx, &walk); | |
5507 | + kernel_fpu_end(); | |
5508 | err = skcipher_walk_done(&walk, 0); | |
5509 | } | |
5510 | - kernel_fpu_end(); | |
5511 | ||
5512 | return err; | |
5513 | } | |
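kernel_fpu_begin() disables preemption until kernel_fpu_end(), so wrapping the whole walk produced preempt-off sections that grew with the request size, and skcipher_walk_done() may sleep, which is illegal inside that window. Pulling begin/end into the loop bounds each FPU section to a single chunk. The shape, reduced to a sketch with a hypothetical process_chunk() helper:

    while ((nbytes = walk.nbytes)) {
            kernel_fpu_begin();               /* preempt off for this chunk only */
            process_chunk(ctx, &walk);        /* hypothetical: one SIMD batch */
            kernel_fpu_end();
            /* preemptible again, so walk_done() may sleep safely */
            err = skcipher_walk_done(&walk, nbytes & (AES_BLOCK_SIZE - 1));
    }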
5514 | diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c | |
5515 | index 60907c139c4e..0902db7d326a 100644 | |
5516 | --- a/arch/x86/crypto/camellia_aesni_avx2_glue.c | |
5517 | +++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c | |
5518 | @@ -206,6 +206,20 @@ struct crypt_priv { | |
5519 | bool fpu_enabled; | |
5520 | }; | |
5521 | ||
5522 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
5523 | +static void camellia_fpu_end_rt(struct crypt_priv *ctx) | |
5524 | +{ | |
5525 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 5526 | + |
5527 | + if (!fpu_enabled) |
5528 | + return; | |
5529 | + camellia_fpu_end(fpu_enabled); | |
5530 | + ctx->fpu_enabled = false; | |
5531 | +} | |
5532 | +#else | |
5533 | +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { } | |
5534 | +#endif | |
e4b2b4a8 | 5535 | + |
5536 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
5537 | { | |
5538 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | |
5539 | @@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5540 | } | |
5541 | ||
5542 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | |
5543 | + kernel_fpu_resched(); | |
5544 | camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst); | |
5545 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | |
5546 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | |
5547 | } | |
5548 | ||
5549 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | |
5550 | + kernel_fpu_resched(); | |
5551 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | |
5552 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5553 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5554 | } | |
5555 | + camellia_fpu_end_rt(ctx); | |
5556 | ||
5557 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5558 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | |
5559 | @@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5560 | } | |
5561 | ||
5562 | if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) { | |
5563 | + kernel_fpu_resched(); | |
5564 | camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst); | |
5565 | srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | |
5566 | nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS; | |
5567 | } | |
5568 | ||
5569 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | |
5570 | + kernel_fpu_resched(); | |
5571 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | |
5572 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5573 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5574 | } | |
5575 | + camellia_fpu_end_rt(ctx); | |
5576 | ||
5577 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5578 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | |
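kernel_fpu_resched() and camellia_fpu_end_rt() come from other hunks in this series: the former inserts a preemption point between SIMD batches, the latter closes the FPU section on RT before the scalar tail loop. The resched helper is assumed to look roughly like this:

    /* assumed shape, per the x86 FPU hunk elsewhere in this series */
    void kernel_fpu_resched(void)
    {
            WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

            if (should_resched(PREEMPT_OFFSET)) {
                    kernel_fpu_end();   /* leave the FPU section ...     */
                    cond_resched();     /* ... give the scheduler a turn */
                    kernel_fpu_begin(); /* ... and re-enter it           */
            }
    }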
5579 | diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c | |
5580 | index d96429da88eb..3b8e91841039 100644 | |
5581 | --- a/arch/x86/crypto/camellia_aesni_avx_glue.c | |
5582 | +++ b/arch/x86/crypto/camellia_aesni_avx_glue.c | |
5583 | @@ -210,6 +210,21 @@ struct crypt_priv { | |
5584 | bool fpu_enabled; | |
5585 | }; | |
5586 | ||
5587 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
5588 | +static void camellia_fpu_end_rt(struct crypt_priv *ctx) | |
5589 | +{ | |
5590 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 5591 | + |
5592 | + if (!fpu_enabled) |
5593 | + return; | |
5594 | + camellia_fpu_end(fpu_enabled); | |
5595 | + ctx->fpu_enabled = false; | |
5596 | +} | |
e4b2b4a8 | 5597 | + |
5598 | +#else |
5599 | +static void camellia_fpu_end_rt(struct crypt_priv *ctx) { } | |
5600 | +#endif | |
e4b2b4a8 | 5601 | + |
5602 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
5603 | { | |
5604 | const unsigned int bsize = CAMELLIA_BLOCK_SIZE; | |
5605 | @@ -225,10 +240,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5606 | } | |
5607 | ||
5608 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | |
5609 | + kernel_fpu_resched(); | |
5610 | camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst); | |
5611 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5612 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5613 | } | |
5614 | + camellia_fpu_end_rt(ctx); | |
5615 | ||
5616 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5617 | camellia_enc_blk(ctx->ctx, srcdst, srcdst); | |
5618 | @@ -249,10 +266,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5619 | } | |
5620 | ||
5621 | while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) { | |
5622 | + kernel_fpu_resched(); | |
5623 | camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst); | |
5624 | srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5625 | nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS; | |
5626 | } | |
5627 | + camellia_fpu_end_rt(ctx); | |
5628 | ||
5629 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5630 | camellia_dec_blk(ctx->ctx, srcdst, srcdst); | |
5631 | diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c | |
5632 | index 575292a33bdf..0a4b0a222b18 100644 | |
5633 | --- a/arch/x86/crypto/cast5_avx_glue.c | |
5634 | +++ b/arch/x86/crypto/cast5_avx_glue.c | |
5635 | @@ -59,7 +59,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) | |
5636 | static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |
5637 | bool enc) | |
5638 | { | |
5639 | - bool fpu_enabled = false; | |
5640 | + bool fpu_enabled; | |
5641 | struct cast5_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); | |
5642 | const unsigned int bsize = CAST5_BLOCK_SIZE; | |
5643 | unsigned int nbytes; | |
5644 | @@ -73,7 +73,7 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |
5645 | u8 *wsrc = walk->src.virt.addr; | |
5646 | u8 *wdst = walk->dst.virt.addr; | |
5647 | ||
5648 | - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); | |
5649 | + fpu_enabled = cast5_fpu_begin(false, nbytes); | |
5650 | ||
5651 | /* Process multi-block batch */ | |
5652 | if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { | |
5653 | @@ -102,10 +102,9 @@ static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, | |
5654 | } while (nbytes >= bsize); | |
5655 | ||
5656 | done: | |
5657 | + cast5_fpu_end(fpu_enabled); | |
5658 | err = blkcipher_walk_done(desc, walk, nbytes); | |
5659 | } | |
5660 | - | |
5661 | - cast5_fpu_end(fpu_enabled); | |
5662 | return err; | |
5663 | } | |
5664 | ||
5665 | @@ -226,7 +225,7 @@ static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, | |
5666 | static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |
5667 | struct scatterlist *src, unsigned int nbytes) | |
5668 | { | |
5669 | - bool fpu_enabled = false; | |
5670 | + bool fpu_enabled; | |
5671 | struct blkcipher_walk walk; | |
5672 | int err; | |
5673 | ||
5674 | @@ -235,12 +234,11 @@ static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |
5675 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | |
5676 | ||
5677 | while ((nbytes = walk.nbytes)) { | |
5678 | - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); | |
5679 | + fpu_enabled = cast5_fpu_begin(false, nbytes); | |
5680 | nbytes = __cbc_decrypt(desc, &walk); | |
5681 | + cast5_fpu_end(fpu_enabled); | |
5682 | err = blkcipher_walk_done(desc, &walk, nbytes); | |
5683 | } | |
5684 | - | |
5685 | - cast5_fpu_end(fpu_enabled); | |
5686 | return err; | |
5687 | } | |
5688 | ||
5689 | @@ -309,7 +307,7 @@ static unsigned int __ctr_crypt(struct blkcipher_desc *desc, | |
5690 | static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |
5691 | struct scatterlist *src, unsigned int nbytes) | |
5692 | { | |
5693 | - bool fpu_enabled = false; | |
5694 | + bool fpu_enabled; | |
5695 | struct blkcipher_walk walk; | |
5696 | int err; | |
5697 | ||
5698 | @@ -318,13 +316,12 @@ static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, | |
5699 | desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; | |
5700 | ||
5701 | while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { | |
5702 | - fpu_enabled = cast5_fpu_begin(fpu_enabled, nbytes); | |
5703 | + fpu_enabled = cast5_fpu_begin(false, nbytes); | |
5704 | nbytes = __ctr_crypt(desc, &walk); | |
5705 | + cast5_fpu_end(fpu_enabled); | |
5706 | err = blkcipher_walk_done(desc, &walk, nbytes); | |
5707 | } | |
5708 | ||
5709 | - cast5_fpu_end(fpu_enabled); | |
5710 | - | |
5711 | if (walk.nbytes) { | |
5712 | ctr_crypt_final(desc, &walk); | |
5713 | err = blkcipher_walk_done(desc, &walk, 0); | |
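The cast5 conversion shows the second recurring shape, used for blkcipher-walk loops: rather than carrying one fpu_enabled flag across the whole walk, each iteration opens its own FPU section with cast5_fpu_begin(false, nbytes) and closes it before blkcipher_walk_done(), which may sleep. A sketch of the cbc_decrypt loop from the hunk above:

    /* Sketch of the per-iteration FPU section; not the verbatim code. */
    while ((nbytes = walk.nbytes)) {
        bool fpu_enabled = cast5_fpu_begin(false, nbytes);

        nbytes = __cbc_decrypt(desc, &walk);
        cast5_fpu_end(fpu_enabled);     /* closed before a possible sleep */
        err = blkcipher_walk_done(desc, &walk, nbytes);
    }

Passing false rather than the previous flag is deliberate: the section was just ended, so each pass must make a fresh begin decision instead of treating the old section as still open.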
5714 | diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c | |
5715 | index 50e684768c55..8caf9ba8c1da 100644 | |
5716 | --- a/arch/x86/crypto/cast6_avx_glue.c | |
5717 | +++ b/arch/x86/crypto/cast6_avx_glue.c | |
5718 | @@ -205,19 +205,33 @@ struct crypt_priv { | |
5719 | bool fpu_enabled; | |
5720 | }; | |
5721 | ||
5722 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
5723 | +static void cast6_fpu_end_rt(struct crypt_priv *ctx) | |
5724 | +{ | |
5725 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 5726 | + |
b3bbd485 JK |
5727 | + if (!fpu_enabled) |
5728 | + return; | |
5729 | + cast6_fpu_end(fpu_enabled); | |
5730 | + ctx->fpu_enabled = false; | |
5731 | +} | |
e4b2b4a8 | 5732 | + |
b3bbd485 JK |
5733 | +#else |
5734 | +static void cast6_fpu_end_rt(struct crypt_priv *ctx) { } | |
5735 | +#endif | |
e4b2b4a8 | 5736 | + |
b3bbd485 JK |
5737 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
5738 | { | |
5739 | const unsigned int bsize = CAST6_BLOCK_SIZE; | |
5740 | struct crypt_priv *ctx = priv; | |
5741 | int i; | |
5742 | ||
5743 | - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | |
5744 | - | |
5745 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { | |
5746 | + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | |
5747 | cast6_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | |
5748 | + cast6_fpu_end_rt(ctx); | |
5749 | return; | |
5750 | } | |
5751 | - | |
5752 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5753 | __cast6_encrypt(ctx->ctx, srcdst, srcdst); | |
5754 | } | |
5755 | @@ -228,10 +242,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5756 | struct crypt_priv *ctx = priv; | |
5757 | int i; | |
5758 | ||
5759 | - ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | |
5760 | - | |
5761 | if (nbytes == bsize * CAST6_PARALLEL_BLOCKS) { | |
5762 | + ctx->fpu_enabled = cast6_fpu_begin(ctx->fpu_enabled, nbytes); | |
5763 | cast6_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | |
5764 | + cast6_fpu_end_rt(ctx); | |
5765 | return; | |
5766 | } | |
5767 | ||
5768 | diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c | |
5769 | index 1e6af1b35f7b..e7809fd2a4fd 100644 | |
5770 | --- a/arch/x86/crypto/chacha20_glue.c | |
5771 | +++ b/arch/x86/crypto/chacha20_glue.c | |
5772 | @@ -81,23 +81,24 @@ static int chacha20_simd(struct skcipher_request *req) | |
5773 | ||
5774 | crypto_chacha20_init(state, ctx, walk.iv); | |
5775 | ||
5776 | - kernel_fpu_begin(); | |
5777 | - | |
5778 | while (walk.nbytes >= CHACHA20_BLOCK_SIZE) { | |
5779 | + kernel_fpu_begin(); | |
e4b2b4a8 | 5780 | + |
b3bbd485 JK |
5781 | chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, |
5782 | rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE)); | |
5783 | + kernel_fpu_end(); | |
5784 | err = skcipher_walk_done(&walk, | |
5785 | walk.nbytes % CHACHA20_BLOCK_SIZE); | |
5786 | } | |
5787 | ||
5788 | if (walk.nbytes) { | |
5789 | + kernel_fpu_begin(); | |
5790 | chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr, | |
5791 | walk.nbytes); | |
5792 | + kernel_fpu_end(); | |
5793 | err = skcipher_walk_done(&walk, 0); | |
5794 | } | |
5795 | ||
5796 | - kernel_fpu_end(); | |
5797 | - | |
5798 | return err; | |
5799 | } | |
5800 | ||
5801 | diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c | |
5802 | index d61e57960fe0..c67560d9718a 100644 | |
5803 | --- a/arch/x86/crypto/glue_helper.c | |
5804 | +++ b/arch/x86/crypto/glue_helper.c | |
5805 | @@ -40,7 +40,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | |
5806 | void *ctx = crypto_blkcipher_ctx(desc->tfm); | |
5807 | const unsigned int bsize = 128 / 8; | |
5808 | unsigned int nbytes, i, func_bytes; | |
5809 | - bool fpu_enabled = false; | |
5810 | + bool fpu_enabled; | |
5811 | int err; | |
5812 | ||
5813 | err = blkcipher_walk_virt(desc, walk); | |
5814 | @@ -50,7 +50,7 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | |
5815 | u8 *wdst = walk->dst.virt.addr; | |
5816 | ||
5817 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | |
5818 | - desc, fpu_enabled, nbytes); | |
5819 | + desc, false, nbytes); | |
5820 | ||
5821 | for (i = 0; i < gctx->num_funcs; i++) { | |
5822 | func_bytes = bsize * gctx->funcs[i].num_blocks; | |
5823 | @@ -72,10 +72,10 @@ static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, | |
5824 | } | |
5825 | ||
5826 | done: | |
5827 | + glue_fpu_end(fpu_enabled); | |
5828 | err = blkcipher_walk_done(desc, walk, nbytes); | |
5829 | } | |
5830 | ||
5831 | - glue_fpu_end(fpu_enabled); | |
5832 | return err; | |
5833 | } | |
5834 | ||
5835 | @@ -192,7 +192,7 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, | |
5836 | struct scatterlist *src, unsigned int nbytes) | |
5837 | { | |
5838 | const unsigned int bsize = 128 / 8; | |
5839 | - bool fpu_enabled = false; | |
5840 | + bool fpu_enabled; | |
5841 | struct blkcipher_walk walk; | |
5842 | int err; | |
5843 | ||
5844 | @@ -201,12 +201,12 @@ int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, | |
5845 | ||
5846 | while ((nbytes = walk.nbytes)) { | |
5847 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | |
5848 | - desc, fpu_enabled, nbytes); | |
5849 | + desc, false, nbytes); | |
5850 | nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); | |
5851 | + glue_fpu_end(fpu_enabled); | |
5852 | err = blkcipher_walk_done(desc, &walk, nbytes); | |
5853 | } | |
5854 | ||
5855 | - glue_fpu_end(fpu_enabled); | |
5856 | return err; | |
5857 | } | |
5858 | EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); | |
5859 | @@ -275,7 +275,7 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |
5860 | struct scatterlist *src, unsigned int nbytes) | |
5861 | { | |
5862 | const unsigned int bsize = 128 / 8; | |
5863 | - bool fpu_enabled = false; | |
5864 | + bool fpu_enabled; | |
5865 | struct blkcipher_walk walk; | |
5866 | int err; | |
5867 | ||
5868 | @@ -284,13 +284,12 @@ int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, | |
5869 | ||
5870 | while ((nbytes = walk.nbytes) >= bsize) { | |
5871 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | |
5872 | - desc, fpu_enabled, nbytes); | |
5873 | + desc, false, nbytes); | |
5874 | nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); | |
5875 | + glue_fpu_end(fpu_enabled); | |
5876 | err = blkcipher_walk_done(desc, &walk, nbytes); | |
5877 | } | |
5878 | ||
5879 | - glue_fpu_end(fpu_enabled); | |
5880 | - | |
5881 | if (walk.nbytes) { | |
5882 | glue_ctr_crypt_final_128bit( | |
5883 | gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); | |
5884 | @@ -380,7 +379,7 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | |
5885 | void *tweak_ctx, void *crypt_ctx) | |
5886 | { | |
5887 | const unsigned int bsize = 128 / 8; | |
5888 | - bool fpu_enabled = false; | |
5889 | + bool fpu_enabled; | |
5890 | struct blkcipher_walk walk; | |
5891 | int err; | |
5892 | ||
5893 | @@ -393,21 +392,21 @@ int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, | |
5894 | ||
5895 | /* set minimum length to bsize, for tweak_fn */ | |
5896 | fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | |
5897 | - desc, fpu_enabled, | |
5898 | + desc, false, | |
5899 | nbytes < bsize ? bsize : nbytes); | |
5900 | - | |
5901 | /* calculate first value of T */ | |
5902 | tweak_fn(tweak_ctx, walk.iv, walk.iv); | |
5903 | + glue_fpu_end(fpu_enabled); | |
5904 | ||
5905 | while (nbytes) { | |
5906 | + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, | |
5907 | + desc, false, nbytes); | |
5908 | nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); | |
5909 | ||
5910 | + glue_fpu_end(fpu_enabled); | |
5911 | err = blkcipher_walk_done(desc, &walk, nbytes); | |
5912 | nbytes = walk.nbytes; | |
5913 | } | |
5914 | - | |
5915 | - glue_fpu_end(fpu_enabled); | |
5916 | - | |
5917 | return err; | |
5918 | } | |
5919 | EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); | |
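glue_xts_crypt_128bit() needs one extra step on top of that shape: the first tweak value is computed under its own short FPU section before the walk starts, and each walk pass then sections itself the same way as the ECB/CBC/CTR paths above. Sketched, not the verbatim code:

    fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, desc, false,
                                 nbytes < bsize ? bsize : nbytes);
    tweak_fn(tweak_ctx, walk.iv, walk.iv);  /* first tweak, under FPU */
    glue_fpu_end(fpu_enabled);

    while (nbytes) {
        fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit,
                                     desc, false, nbytes);
        nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk);
        glue_fpu_end(fpu_enabled);      /* again, before a possible sleep */
        err = blkcipher_walk_done(desc, &walk, nbytes);
        nbytes = walk.nbytes;
    }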
5920 | diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c | |
5921 | index 870f6d812a2d..5c806bf39f1d 100644 | |
5922 | --- a/arch/x86/crypto/serpent_avx2_glue.c | |
5923 | +++ b/arch/x86/crypto/serpent_avx2_glue.c | |
5924 | @@ -184,6 +184,21 @@ struct crypt_priv { | |
5925 | bool fpu_enabled; | |
5926 | }; | |
5927 | ||
5928 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
5929 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) | |
5930 | +{ | |
5931 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 5932 | + |
b3bbd485 JK |
5933 | + if (!fpu_enabled) |
5934 | + return; | |
5935 | + serpent_fpu_end(fpu_enabled); | |
5936 | + ctx->fpu_enabled = false; | |
5937 | +} | |
e4b2b4a8 | 5938 | + |
b3bbd485 JK |
5939 | +#else |
5940 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } | |
5941 | +#endif | |
e4b2b4a8 | 5942 | + |
b3bbd485 JK |
5943 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
5944 | { | |
5945 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | |
5946 | @@ -199,10 +214,12 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5947 | } | |
5948 | ||
5949 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | |
5950 | + kernel_fpu_resched(); | |
5951 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); | |
5952 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | |
5953 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | |
5954 | } | |
5955 | + serpent_fpu_end_rt(ctx); | |
5956 | ||
5957 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5958 | __serpent_encrypt(ctx->ctx, srcdst, srcdst); | |
5959 | @@ -223,10 +240,12 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
5960 | } | |
5961 | ||
5962 | while (nbytes >= SERPENT_PARALLEL_BLOCKS * bsize) { | |
5963 | + kernel_fpu_resched(); | |
5964 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); | |
5965 | srcdst += bsize * SERPENT_PARALLEL_BLOCKS; | |
5966 | nbytes -= bsize * SERPENT_PARALLEL_BLOCKS; | |
5967 | } | |
5968 | + serpent_fpu_end_rt(ctx); | |
5969 | ||
5970 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
5971 | __serpent_decrypt(ctx->ctx, srcdst, srcdst); | |
5972 | diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c | |
5973 | index 6f778d3daa22..46dcbdbd0518 100644 | |
5974 | --- a/arch/x86/crypto/serpent_avx_glue.c | |
5975 | +++ b/arch/x86/crypto/serpent_avx_glue.c | |
5976 | @@ -218,16 +218,31 @@ struct crypt_priv { | |
5977 | bool fpu_enabled; | |
5978 | }; | |
5979 | ||
5980 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
5981 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) | |
5982 | +{ | |
5983 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 5984 | + |
b3bbd485 JK |
5985 | + if (!fpu_enabled) |
5986 | + return; | |
5987 | + serpent_fpu_end(fpu_enabled); | |
5988 | + ctx->fpu_enabled = false; | |
5989 | +} | |
e4b2b4a8 | 5990 | + |
b3bbd485 JK |
5991 | +#else |
5992 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } | |
5993 | +#endif | |
e4b2b4a8 | 5994 | + |
b3bbd485 JK |
5995 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
5996 | { | |
5997 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | |
5998 | struct crypt_priv *ctx = priv; | |
5999 | int i; | |
6000 | ||
6001 | - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6002 | - | |
6003 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | |
6004 | + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6005 | serpent_ecb_enc_8way_avx(ctx->ctx, srcdst, srcdst); | |
6006 | + serpent_fpu_end_rt(ctx); | |
6007 | return; | |
6008 | } | |
6009 | ||
6010 | @@ -241,10 +256,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
6011 | struct crypt_priv *ctx = priv; | |
6012 | int i; | |
6013 | ||
6014 | - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6015 | - | |
6016 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | |
6017 | + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6018 | serpent_ecb_dec_8way_avx(ctx->ctx, srcdst, srcdst); | |
6019 | + serpent_fpu_end_rt(ctx); | |
6020 | return; | |
6021 | } | |
6022 | ||
6023 | diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c | |
6024 | index ac0e831943f5..d35f607d067f 100644 | |
6025 | --- a/arch/x86/crypto/serpent_sse2_glue.c | |
6026 | +++ b/arch/x86/crypto/serpent_sse2_glue.c | |
6027 | @@ -187,16 +187,31 @@ struct crypt_priv { | |
6028 | bool fpu_enabled; | |
6029 | }; | |
6030 | ||
6031 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6032 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) | |
6033 | +{ | |
6034 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 6035 | + |
b3bbd485 JK |
6036 | + if (!fpu_enabled) |
6037 | + return; | |
6038 | + serpent_fpu_end(fpu_enabled); | |
6039 | + ctx->fpu_enabled = false; | |
6040 | +} | |
e4b2b4a8 | 6041 | + |
b3bbd485 JK |
6042 | +#else |
6043 | +static void serpent_fpu_end_rt(struct crypt_priv *ctx) { } | |
6044 | +#endif | |
e4b2b4a8 | 6045 | + |
b3bbd485 JK |
6046 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
6047 | { | |
6048 | const unsigned int bsize = SERPENT_BLOCK_SIZE; | |
6049 | struct crypt_priv *ctx = priv; | |
6050 | int i; | |
6051 | ||
6052 | - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6053 | - | |
6054 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | |
6055 | + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6056 | serpent_enc_blk_xway(ctx->ctx, srcdst, srcdst); | |
6057 | + serpent_fpu_end_rt(ctx); | |
6058 | return; | |
6059 | } | |
6060 | ||
6061 | @@ -210,10 +225,10 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
6062 | struct crypt_priv *ctx = priv; | |
6063 | int i; | |
6064 | ||
6065 | - ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6066 | - | |
6067 | if (nbytes == bsize * SERPENT_PARALLEL_BLOCKS) { | |
6068 | + ctx->fpu_enabled = serpent_fpu_begin(ctx->fpu_enabled, nbytes); | |
6069 | serpent_dec_blk_xway(ctx->ctx, srcdst, srcdst); | |
6070 | + serpent_fpu_end_rt(ctx); | |
6071 | return; | |
6072 | } | |
6073 | ||
6074 | diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c | |
6075 | index b7a3904b953c..de00fe24927e 100644 | |
6076 | --- a/arch/x86/crypto/twofish_avx_glue.c | |
6077 | +++ b/arch/x86/crypto/twofish_avx_glue.c | |
6078 | @@ -218,6 +218,21 @@ struct crypt_priv { | |
6079 | bool fpu_enabled; | |
6080 | }; | |
6081 | ||
6082 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6083 | +static void twofish_fpu_end_rt(struct crypt_priv *ctx) | |
6084 | +{ | |
6085 | + bool fpu_enabled = ctx->fpu_enabled; | |
e4b2b4a8 | 6086 | + |
b3bbd485 JK |
6087 | + if (!fpu_enabled) |
6088 | + return; | |
6089 | + twofish_fpu_end(fpu_enabled); | |
6090 | + ctx->fpu_enabled = false; | |
6091 | +} | |
e4b2b4a8 | 6092 | + |
b3bbd485 JK |
6093 | +#else |
6094 | +static void twofish_fpu_end_rt(struct crypt_priv *ctx) { } | |
6095 | +#endif | |
e4b2b4a8 | 6096 | + |
b3bbd485 JK |
6097 | static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) |
6098 | { | |
6099 | const unsigned int bsize = TF_BLOCK_SIZE; | |
6100 | @@ -228,12 +243,16 @@ static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
6101 | ||
6102 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { | |
6103 | twofish_ecb_enc_8way(ctx->ctx, srcdst, srcdst); | |
6104 | + twofish_fpu_end_rt(ctx); | |
6105 | return; | |
6106 | } | |
6107 | ||
6108 | - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) | |
6109 | + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) { | |
6110 | + kernel_fpu_resched(); | |
6111 | twofish_enc_blk_3way(ctx->ctx, srcdst, srcdst); | |
6112 | + } | |
6113 | ||
6114 | + twofish_fpu_end_rt(ctx); | |
6115 | nbytes %= bsize * 3; | |
6116 | ||
6117 | for (i = 0; i < nbytes / bsize; i++, srcdst += bsize) | |
6118 | @@ -250,11 +269,15 @@ static void decrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes) | |
6119 | ||
6120 | if (nbytes == bsize * TWOFISH_PARALLEL_BLOCKS) { | |
6121 | twofish_ecb_dec_8way(ctx->ctx, srcdst, srcdst); | |
6122 | + twofish_fpu_end_rt(ctx); | |
6123 | return; | |
6124 | } | |
6125 | ||
6126 | - for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) | |
6127 | + for (i = 0; i < nbytes / (bsize * 3); i++, srcdst += bsize * 3) { | |
6128 | + kernel_fpu_resched(); | |
6129 | twofish_dec_blk_3way(ctx->ctx, srcdst, srcdst); | |
6130 | + } | |
6131 | + twofish_fpu_end_rt(ctx); | |
6132 | ||
6133 | nbytes %= bsize * 3; | |
6134 | ||
6135 | diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c | |
6136 | index 60e21ccfb6d6..0e27f35febe7 100644 | |
6137 | --- a/arch/x86/entry/common.c | |
6138 | +++ b/arch/x86/entry/common.c | |
6139 | @@ -133,7 +133,7 @@ static long syscall_trace_enter(struct pt_regs *regs) | |
6140 | ||
6141 | #define EXIT_TO_USERMODE_LOOP_FLAGS \ | |
6142 | (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ | |
6143 | - _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) | |
6144 | + _TIF_NEED_RESCHED_MASK | _TIF_USER_RETURN_NOTIFY | _TIF_PATCH_PENDING) | |
6145 | ||
6146 | static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) | |
6147 | { | |
6148 | @@ -148,9 +148,16 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) | |
6149 | /* We have work to do. */ | |
6150 | local_irq_enable(); | |
6151 | ||
6152 | - if (cached_flags & _TIF_NEED_RESCHED) | |
6153 | + if (cached_flags & _TIF_NEED_RESCHED_MASK) | |
6154 | schedule(); | |
6155 | ||
6156 | +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND | |
6157 | + if (unlikely(current->forced_info.si_signo)) { | |
6158 | + struct task_struct *t = current; | |
6159 | + force_sig_info(t->forced_info.si_signo, &t->forced_info, t); | |
6160 | + t->forced_info.si_signo = 0; | |
6161 | + } | |
6162 | +#endif | |
6163 | if (cached_flags & _TIF_UPROBE) | |
6164 | uprobe_notify_resume(regs); | |
6165 | ||
6166 | diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S | |
6167 | index 60c4c342316c..cd0c7c56e2dd 100644 | |
6168 | --- a/arch/x86/entry/entry_32.S | |
6169 | +++ b/arch/x86/entry/entry_32.S | |
6170 | @@ -350,8 +350,25 @@ END(ret_from_exception) | |
6171 | ENTRY(resume_kernel) | |
6172 | DISABLE_INTERRUPTS(CLBR_ANY) | |
6173 | .Lneed_resched: | |
6174 | + # preempt count == 0 + NEED_RS set? | |
6175 | cmpl $0, PER_CPU_VAR(__preempt_count) | |
6176 | +#ifndef CONFIG_PREEMPT_LAZY | |
6177 | jnz restore_all | |
6178 | +#else | |
6179 | + jz test_int_off | |
e4b2b4a8 | 6180 | + |
b3bbd485 JK |
6181 | + # at least preempt count == 0 ? |
6182 | + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) | |
6183 | + jne restore_all | |
e4b2b4a8 | 6184 | + |
b3bbd485 JK |
6185 | + movl PER_CPU_VAR(current_task), %ebp |
6186 | + cmpl $0,TASK_TI_preempt_lazy_count(%ebp) # non-zero preempt_lazy_count ? | |
6187 | + jnz restore_all | |
e4b2b4a8 | 6188 | + |
b3bbd485 JK |
6189 | + testl $_TIF_NEED_RESCHED_LAZY, TASK_TI_flags(%ebp) |
6190 | + jz restore_all | |
6191 | +test_int_off: | |
6192 | +#endif | |
6193 | testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? | |
6194 | jz restore_all | |
6195 | call preempt_schedule_irq | |
6196 | diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S | |
5dd41b01 | 6197 | index 164cd7529f0b..75d42cb8a7c9 100644 |
b3bbd485 JK |
6198 | --- a/arch/x86/entry/entry_64.S |
6199 | +++ b/arch/x86/entry/entry_64.S | |
6200 | @@ -633,7 +633,23 @@ retint_kernel: | |
5dd41b01 | 6201 | btl $9, EFLAGS(%rsp) /* were interrupts off? */ |
b3bbd485 JK |
6202 | jnc 1f |
6203 | 0: cmpl $0, PER_CPU_VAR(__preempt_count) | |
6204 | +#ifndef CONFIG_PREEMPT_LAZY | |
6205 | jnz 1f | |
6206 | +#else | |
6207 | + jz do_preempt_schedule_irq | |
e4b2b4a8 | 6208 | + |
b3bbd485 JK |
6209 | + # at least preempt count == 0 ? |
6210 | + cmpl $_PREEMPT_ENABLED,PER_CPU_VAR(__preempt_count) | |
6211 | + jnz 1f | |
e4b2b4a8 | 6212 | + |
b3bbd485 JK |
6213 | + movq PER_CPU_VAR(current_task), %rcx |
6214 | + cmpl $0, TASK_TI_preempt_lazy_count(%rcx) | |
6215 | + jnz 1f | |
e4b2b4a8 | 6216 | + |
b3bbd485 JK |
6217 | + bt $TIF_NEED_RESCHED_LAZY,TASK_TI_flags(%rcx) |
6218 | + jnc 1f | |
6219 | +do_preempt_schedule_irq: | |
6220 | +#endif | |
6221 | call preempt_schedule_irq | |
6222 | jmp 0b | |
6223 | 1: | |
6224 | @@ -988,6 +1004,7 @@ bad_gs: | |
6225 | jmp 2b | |
6226 | .previous | |
6227 | ||
6228 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
6229 | /* Call softirq on interrupt stack. Interrupts are off. */ | |
6230 | ENTRY(do_softirq_own_stack) | |
6231 | pushq %rbp | |
6232 | @@ -998,6 +1015,7 @@ ENTRY(do_softirq_own_stack) | |
6233 | leaveq | |
6234 | ret | |
6235 | ENDPROC(do_softirq_own_stack) | |
6236 | +#endif | |
6237 | ||
6238 | #ifdef CONFIG_XEN | |
6239 | idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 | |
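Both entry paths encode the same four-step decision. Expressed as C rather than asm (a model only; the asm above is authoritative):

    /*
     * C model of the retint_kernel/resume_kernel test. "raw" is the
     * per-cpu __preempt_count value, which on x86 folds NEED_RESCHED
     * in as an inverted bit (see asm/preempt.h below).
     */
    static bool irq_exit_should_preempt(u32 raw)
    {
        if (raw == 0)
            return true;    /* count 0 and hard NEED_RESCHED pending */
        if (raw != PREEMPT_ENABLED)
            return false;   /* a preempt_disable() section is live */
        if (current_thread_info()->preempt_lazy_count)
            return false;   /* a lazy-disabled section is live */
        return test_thread_flag(TIF_NEED_RESCHED_LAZY);
    }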
6240 | diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h | |
6241 | index a9caac9d4a72..18b31f22ca5d 100644 | |
6242 | --- a/arch/x86/include/asm/fpu/api.h | |
6243 | +++ b/arch/x86/include/asm/fpu/api.h | |
6244 | @@ -25,6 +25,7 @@ extern void __kernel_fpu_begin(void); | |
6245 | extern void __kernel_fpu_end(void); | |
6246 | extern void kernel_fpu_begin(void); | |
6247 | extern void kernel_fpu_end(void); | |
6248 | +extern void kernel_fpu_resched(void); | |
6249 | extern bool irq_fpu_usable(void); | |
6250 | ||
6251 | /* | |
6252 | diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h | |
6253 | index 7f2dbd91fc74..22992c837795 100644 | |
6254 | --- a/arch/x86/include/asm/preempt.h | |
6255 | +++ b/arch/x86/include/asm/preempt.h | |
6256 | @@ -86,17 +86,46 @@ static __always_inline void __preempt_count_sub(int val) | |
6257 | * a decrement which hits zero means we have no preempt_count and should | |
6258 | * reschedule. | |
6259 | */ | |
6260 | -static __always_inline bool __preempt_count_dec_and_test(void) | |
6261 | +static __always_inline bool ____preempt_count_dec_and_test(void) | |
6262 | { | |
6263 | GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); | |
6264 | } | |
6265 | ||
6266 | +static __always_inline bool __preempt_count_dec_and_test(void) | |
6267 | +{ | |
6268 | + if (____preempt_count_dec_and_test()) | |
6269 | + return true; | |
6270 | +#ifdef CONFIG_PREEMPT_LAZY | |
6271 | + if (current_thread_info()->preempt_lazy_count) | |
6272 | + return false; | |
6273 | + return test_thread_flag(TIF_NEED_RESCHED_LAZY); | |
6274 | +#else | |
6275 | + return false; | |
6276 | +#endif | |
6277 | +} | |
e4b2b4a8 | 6278 | + |
b3bbd485 JK |
6279 | /* |
6280 | * Returns true when we need to resched and can (barring IRQ state). | |
6281 | */ | |
6282 | static __always_inline bool should_resched(int preempt_offset) | |
6283 | { | |
6284 | +#ifdef CONFIG_PREEMPT_LAZY | |
6285 | + u32 tmp; | |
e4b2b4a8 | 6286 | + |
b3bbd485 JK |
6287 | + tmp = raw_cpu_read_4(__preempt_count); |
6288 | + if (tmp == preempt_offset) | |
6289 | + return true; | |
e4b2b4a8 | 6290 | + |
b3bbd485 JK |
6291 | + /* preempt count == 0 ? */ |
6292 | + tmp &= ~PREEMPT_NEED_RESCHED; | |
6293 | + if (tmp) | |
6294 | + return false; | |
6295 | + if (current_thread_info()->preempt_lazy_count) | |
6296 | + return false; | |
6297 | + return test_thread_flag(TIF_NEED_RESCHED_LAZY); | |
6298 | +#else | |
6299 | return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); | |
6300 | +#endif | |
6301 | } | |
6302 | ||
6303 | #ifdef CONFIG_PREEMPT | |
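The masking line in should_resched() is the subtle part: stripping the folded PREEMPT_NEED_RESCHED bit leaves exactly the preempt count, so any nonzero remainder means a preempt_disable() section is still live. A small worked model (the bit value is x86's, bit 31; illustrative here):

    /* Worked model of the masking step under PREEMPT_LAZY. On x86 the
     * folded bit is *set* while no hard reschedule is pending.
     */
    #define PREEMPT_NEED_RESCHED_BIT 0x80000000U

    static bool count_allows_lazy_check(u32 raw)
    {
        raw &= ~PREEMPT_NEED_RESCHED_BIT;   /* strip the folded bit */
        return raw == 0;                    /* nonzero: preempt disabled */
    }
    /* count_allows_lazy_check(0x80000001) == false: a section is live.
     * count_allows_lazy_check(0x80000000) == true: go on to check
     * preempt_lazy_count and TIF_NEED_RESCHED_LAZY, as in the hunk above.
     */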
6304 | diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h | |
6305 | index 5f9012ff52ed..39117e57caf2 100644 | |
6306 | --- a/arch/x86/include/asm/signal.h | |
6307 | +++ b/arch/x86/include/asm/signal.h | |
6308 | @@ -28,6 +28,19 @@ typedef struct { | |
6309 | #define SA_IA32_ABI 0x02000000u | |
6310 | #define SA_X32_ABI 0x01000000u | |
6311 | ||
6312 | +/* | |
6313 | + * Because some traps use the IST stack, we must keep preemption | |
6314 | + * disabled while calling do_trap(), but do_trap() may call | |
6315 | + * force_sig_info() which will grab the signal spin_locks for the | |
6316 | + * task, which in PREEMPT_RT_FULL are mutexes. By defining | |
6317 | + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set | |
6318 | + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the | |
6319 | + * trap. | |
6320 | + */ | |
6321 | +#if defined(CONFIG_PREEMPT_RT_FULL) | |
6322 | +#define ARCH_RT_DELAYS_SIGNAL_SEND | |
6323 | +#endif | |
e4b2b4a8 | 6324 | + |
b3bbd485 JK |
6325 | #ifndef CONFIG_COMPAT |
6326 | typedef sigset_t compat_sigset_t; | |
6327 | #endif | |
6328 | diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h | |
6329 | index 371b3a4af000..06613a805b25 100644 | |
6330 | --- a/arch/x86/include/asm/stackprotector.h | |
6331 | +++ b/arch/x86/include/asm/stackprotector.h | |
6332 | @@ -60,7 +60,7 @@ | |
6333 | */ | |
6334 | static __always_inline void boot_init_stack_canary(void) | |
6335 | { | |
6336 | - u64 canary; | |
6337 | + u64 uninitialized_var(canary); | |
6338 | u64 tsc; | |
6339 | ||
6340 | #ifdef CONFIG_X86_64 | |
6341 | @@ -71,8 +71,14 @@ static __always_inline void boot_init_stack_canary(void) | |
6342 | * of randomness. The TSC only matters for very early init, | |
6343 | * there it already has some randomness on most systems. Later | |
6344 | * on during the bootup the random pool has true entropy too. | |
6345 | + * For preempt-rt we need to weaken the randomness a bit, as | |
6346 | + * we can't call into the random generator from atomic context | |
6347 | + * due to locking constraints. We just leave canary | |
6348 | + * uninitialized and use the TSC based randomness on top of it. | |
6349 | */ | |
6350 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
6351 | get_random_bytes(&canary, sizeof(canary)); | |
6352 | +#endif | |
6353 | tsc = rdtsc(); | |
6354 | canary += tsc + (tsc << 32UL); | |
6355 | canary &= CANARY_MASK; | |
6356 | diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h | |
6357 | index 95ff2d7f553f..b1c9129f64fc 100644 | |
6358 | --- a/arch/x86/include/asm/thread_info.h | |
6359 | +++ b/arch/x86/include/asm/thread_info.h | |
6360 | @@ -56,11 +56,14 @@ struct task_struct; | |
6361 | struct thread_info { | |
6362 | unsigned long flags; /* low level flags */ | |
6363 | u32 status; /* thread synchronous flags */ | |
6364 | + int preempt_lazy_count; /* 0 => lazy preemptable | |
6365 | + <0 => BUG */ | |
6366 | }; | |
6367 | ||
6368 | #define INIT_THREAD_INFO(tsk) \ | |
6369 | { \ | |
6370 | .flags = 0, \ | |
6371 | + .preempt_lazy_count = 0, \ | |
6372 | } | |
6373 | ||
6374 | #define init_stack (init_thread_union.stack) | |
6375 | @@ -69,6 +72,10 @@ struct thread_info { | |
6376 | ||
6377 | #include <asm/asm-offsets.h> | |
6378 | ||
6379 | +#define GET_THREAD_INFO(reg) \ | |
6380 | + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ | |
6381 | + _ASM_SUB $(THREAD_SIZE),reg ; | |
e4b2b4a8 | 6382 | + |
b3bbd485 JK |
6383 | #endif |
6384 | ||
6385 | /* | |
6386 | @@ -85,6 +92,7 @@ struct thread_info { | |
6387 | #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ | |
6388 | #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ | |
6389 | #define TIF_SECCOMP 8 /* secure computing */ | |
6390 | +#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ | |
6391 | #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ | |
6392 | #define TIF_UPROBE 12 /* breakpointed or singlestepping */ | |
6393 | #define TIF_PATCH_PENDING 13 /* pending live patching update */ | |
6394 | @@ -112,6 +120,7 @@ struct thread_info { | |
6395 | #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) | |
6396 | #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) | |
6397 | #define _TIF_SECCOMP (1 << TIF_SECCOMP) | |
6398 | +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) | |
6399 | #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) | |
6400 | #define _TIF_UPROBE (1 << TIF_UPROBE) | |
6401 | #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) | |
6402 | @@ -153,6 +162,8 @@ struct thread_info { | |
6403 | #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) | |
6404 | #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) | |
6405 | ||
6406 | +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) | |
e4b2b4a8 | 6407 | + |
b3bbd485 JK |
6408 | #define STACK_WARN (THREAD_SIZE/8) |
6409 | ||
6410 | /* | |
6411 | diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c | |
6412 | index 96a8a68f9c79..c9af5afebc4a 100644 | |
6413 | --- a/arch/x86/kernel/apic/io_apic.c | |
6414 | +++ b/arch/x86/kernel/apic/io_apic.c | |
6415 | @@ -1688,19 +1688,20 @@ static bool io_apic_level_ack_pending(struct mp_chip_data *data) | |
6416 | return false; | |
6417 | } | |
6418 | ||
6419 | -static inline bool ioapic_irqd_mask(struct irq_data *data) | |
6420 | +static inline bool ioapic_prepare_move(struct irq_data *data) | |
6421 | { | |
6422 | /* If we are moving the irq we need to mask it */ | |
6423 | if (unlikely(irqd_is_setaffinity_pending(data))) { | |
6424 | - mask_ioapic_irq(data); | |
6425 | + if (!irqd_irq_masked(data)) | |
6426 | + mask_ioapic_irq(data); | |
6427 | return true; | |
6428 | } | |
6429 | return false; | |
6430 | } | |
6431 | ||
6432 | -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) | |
6433 | +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) | |
6434 | { | |
6435 | - if (unlikely(masked)) { | |
6436 | + if (unlikely(moveit)) { | |
6437 | /* Only migrate the irq if the ack has been received. | |
6438 | * | |
6439 | * On rare occasions the broadcast level triggered ack gets | |
6440 | @@ -1729,15 +1730,17 @@ static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) | |
6441 | */ | |
6442 | if (!io_apic_level_ack_pending(data->chip_data)) | |
6443 | irq_move_masked_irq(data); | |
6444 | - unmask_ioapic_irq(data); | |
6445 | + /* If the irq is masked in the core, leave it */ | |
6446 | + if (!irqd_irq_masked(data)) | |
6447 | + unmask_ioapic_irq(data); | |
6448 | } | |
6449 | } | |
6450 | #else | |
6451 | -static inline bool ioapic_irqd_mask(struct irq_data *data) | |
6452 | +static inline bool ioapic_prepare_move(struct irq_data *data) | |
6453 | { | |
6454 | return false; | |
6455 | } | |
6456 | -static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked) | |
6457 | +static inline void ioapic_finish_move(struct irq_data *data, bool moveit) | |
6458 | { | |
6459 | } | |
6460 | #endif | |
6461 | @@ -1746,11 +1749,11 @@ static void ioapic_ack_level(struct irq_data *irq_data) | |
6462 | { | |
6463 | struct irq_cfg *cfg = irqd_cfg(irq_data); | |
6464 | unsigned long v; | |
6465 | - bool masked; | |
6466 | + bool moveit; | |
6467 | int i; | |
6468 | ||
6469 | irq_complete_move(cfg); | |
6470 | - masked = ioapic_irqd_mask(irq_data); | |
6471 | + moveit = ioapic_prepare_move(irq_data); | |
6472 | ||
6473 | /* | |
6474 | * It appears there is an erratum which affects at least version 0x11 | |
6475 | @@ -1805,7 +1808,7 @@ static void ioapic_ack_level(struct irq_data *irq_data) | |
6476 | eoi_ioapic_pin(cfg->vector, irq_data->chip_data); | |
6477 | } | |
6478 | ||
6479 | - ioapic_irqd_unmask(irq_data, masked); | |
6480 | + ioapic_finish_move(irq_data, moveit); | |
6481 | } | |
6482 | ||
6483 | static void ioapic_ir_ack_level(struct irq_data *irq_data) | |
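The rename from ioapic_irqd_mask()/ioapic_irqd_unmask() to prepare/finish reflects the actual fix: the return value now means "a move is pending", not "I masked the line", so the finish side no longer unmasks an interrupt that the core (for example a threaded handler on RT) masked for its own reasons. The handshake, sketched with the ack-pending detail elided:

    /* Sketch of the masked-state handshake; not the verbatim code. */
    static bool prepare_move(struct irq_data *d)
    {
        if (!irqd_is_setaffinity_pending(d))
            return false;           /* nothing to move */
        if (!irqd_irq_masked(d))    /* mask only if not already masked */
            mask_ioapic_irq(d);
        return true;                /* "moveit", independent of masking */
    }

    static void finish_move(struct irq_data *d, bool moveit)
    {
        if (!moveit)
            return;
        irq_move_masked_irq(d);     /* (ack-pending check elided) */
        if (!irqd_irq_masked(d))    /* respect a core-level mask */
            unmask_ioapic_irq(d);
    }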
6484 | diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c | |
6485 | index 76417a9aab73..62c3e27c8e1c 100644 | |
6486 | --- a/arch/x86/kernel/asm-offsets.c | |
6487 | +++ b/arch/x86/kernel/asm-offsets.c | |
6488 | @@ -38,6 +38,7 @@ void common(void) { | |
6489 | ||
6490 | BLANK(); | |
6491 | OFFSET(TASK_TI_flags, task_struct, thread_info.flags); | |
6492 | + OFFSET(TASK_TI_preempt_lazy_count, task_struct, thread_info.preempt_lazy_count); | |
6493 | OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); | |
6494 | ||
6495 | BLANK(); | |
6496 | @@ -94,6 +95,7 @@ void common(void) { | |
6497 | ||
6498 | BLANK(); | |
6499 | DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); | |
6500 | + DEFINE(_PREEMPT_ENABLED, PREEMPT_ENABLED); | |
6501 | ||
6502 | /* TLB state for the entry code */ | |
6503 | OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); | |
6504 | diff --git a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c | |
6505 | index 7f85b76f43bc..9e74b805070f 100644 | |
6506 | --- a/arch/x86/kernel/cpu/mcheck/dev-mcelog.c | |
6507 | +++ b/arch/x86/kernel/cpu/mcheck/dev-mcelog.c | |
6508 | @@ -14,6 +14,7 @@ | |
6509 | #include <linux/slab.h> | |
6510 | #include <linux/kmod.h> | |
6511 | #include <linux/poll.h> | |
6512 | +#include <linux/swork.h> | |
6513 | ||
6514 | #include "mce-internal.h" | |
6515 | ||
6516 | @@ -86,13 +87,43 @@ static void mce_do_trigger(struct work_struct *work) | |
6517 | ||
6518 | static DECLARE_WORK(mce_trigger_work, mce_do_trigger); | |
6519 | ||
6520 | - | |
6521 | -void mce_work_trigger(void) | |
6522 | +static void __mce_work_trigger(struct swork_event *event) | |
6523 | { | |
6524 | if (mce_helper[0]) | |
6525 | schedule_work(&mce_trigger_work); | |
6526 | } | |
6527 | ||
6528 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6529 | +static bool notify_work_ready __read_mostly; | |
6530 | +static struct swork_event notify_work; | |
e4b2b4a8 | 6531 | + |
b3bbd485 JK |
6532 | +static int mce_notify_work_init(void) |
6533 | +{ | |
6534 | + int err; | |
e4b2b4a8 | 6535 | + |
b3bbd485 JK |
6536 | + err = swork_get(); |
6537 | + if (err) | |
6538 | + return err; | |
e4b2b4a8 | 6539 | + |
b3bbd485 JK |
6540 | + INIT_SWORK(¬ify_work, __mce_work_trigger); |
6541 | + notify_work_ready = true; | |
6542 | + return 0; | |
6543 | +} | |
e4b2b4a8 | 6544 | + |
b3bbd485 JK |
6545 | +void mce_work_trigger(void) |
6546 | +{ | |
6547 | + if (notify_work_ready) | |
6548 | + swork_queue(¬ify_work); | |
6549 | +} | |
e4b2b4a8 | 6550 | + |
b3bbd485 JK |
6551 | +#else |
6552 | +void mce_work_trigger(void) | |
6553 | +{ | |
6554 | + __mce_work_trigger(NULL); | |
6555 | +} | |
6556 | +static inline int mce_notify_work_init(void) { return 0; } | |
6557 | +#endif | |
e4b2b4a8 | 6558 | + |
b3bbd485 JK |
6559 | static ssize_t |
6560 | show_trigger(struct device *s, struct device_attribute *attr, char *buf) | |
6561 | { | |
6562 | @@ -356,7 +387,7 @@ static __init int dev_mcelog_init_device(void) | |
6563 | ||
6564 | return err; | |
6565 | } | |
6566 | - | |
6567 | + mce_notify_work_init(); | |
6568 | mce_register_decode_chain(&dev_mcelog_nb); | |
6569 | return 0; | |
6570 | } | |
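The swork ("simple work") API used here appears again in blk-core below. Its contract is small: swork_get() brings the worker thread up once, INIT_SWORK() binds a handler, and swork_queue() may be called from contexts, like the MCE notification path, where queueing regular work is not safe on RT. A minimal usage sketch with illustrative names:

    #include <linux/swork.h>

    static void my_handler(struct swork_event *sev)
    {
        /* runs later, in the preemptible swork thread */
    }

    static struct swork_event my_event;

    static int my_init(void)
    {
        int err = swork_get();      /* refcounted worker bring-up */

        if (err)
            return err;
        INIT_SWORK(&my_event, my_handler);
        return 0;
    }

    void my_trigger(void)           /* possibly from atomic context */
    {
        swork_queue(&my_event);
    }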
6571 | diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c | |
6572 | index 98e4e4dc4a3b..5cce2ee3b9f6 100644 | |
6573 | --- a/arch/x86/kernel/cpu/mcheck/mce.c | |
6574 | +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |
6575 | @@ -42,6 +42,7 @@ | |
6576 | #include <linux/debugfs.h> | |
6577 | #include <linux/irq_work.h> | |
6578 | #include <linux/export.h> | |
6579 | +#include <linux/jiffies.h> | |
6580 | #include <linux/jump_label.h> | |
6581 | ||
6582 | #include <asm/intel-family.h> | |
6583 | @@ -1365,7 +1366,7 @@ int memory_failure(unsigned long pfn, int vector, int flags) | |
6584 | static unsigned long check_interval = INITIAL_CHECK_INTERVAL; | |
6585 | ||
6586 | static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ | |
6587 | -static DEFINE_PER_CPU(struct timer_list, mce_timer); | |
6588 | +static DEFINE_PER_CPU(struct hrtimer, mce_timer); | |
6589 | ||
6590 | static unsigned long mce_adjust_timer_default(unsigned long interval) | |
6591 | { | |
6592 | @@ -1374,27 +1375,19 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) | |
6593 | ||
6594 | static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; | |
6595 | ||
6596 | -static void __start_timer(struct timer_list *t, unsigned long interval) | |
6597 | +static void __start_timer(struct hrtimer *t, unsigned long iv) | |
6598 | { | |
6599 | - unsigned long when = jiffies + interval; | |
6600 | - unsigned long flags; | |
6601 | - | |
6602 | - local_irq_save(flags); | |
6603 | - | |
6604 | - if (!timer_pending(t) || time_before(when, t->expires)) | |
6605 | - mod_timer(t, round_jiffies(when)); | |
6606 | + if (!iv) | |
6607 | + return; | |
6608 | ||
6609 | - local_irq_restore(flags); | |
6610 | + hrtimer_start_range_ns(t, ns_to_ktime(jiffies_to_usecs(iv) * 1000ULL), | |
6611 | + 0, HRTIMER_MODE_REL_PINNED); | |
6612 | } | |
6613 | ||
6614 | -static void mce_timer_fn(unsigned long data) | |
6615 | +static enum hrtimer_restart mce_timer_fn(struct hrtimer *timer) | |
6616 | { | |
6617 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6618 | - int cpu = smp_processor_id(); | |
6619 | unsigned long iv; | |
6620 | ||
6621 | - WARN_ON(cpu != data); | |
6622 | - | |
6623 | iv = __this_cpu_read(mce_next_interval); | |
6624 | ||
6625 | if (mce_available(this_cpu_ptr(&cpu_info))) { | |
6626 | @@ -1417,7 +1410,11 @@ static void mce_timer_fn(unsigned long data) | |
6627 | ||
6628 | done: | |
6629 | __this_cpu_write(mce_next_interval, iv); | |
6630 | - __start_timer(t, iv); | |
6631 | + if (!iv) | |
6632 | + return HRTIMER_NORESTART; | |
e4b2b4a8 | 6633 | + |
b3bbd485 JK |
6634 | + hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv))); |
6635 | + return HRTIMER_RESTART; | |
6636 | } | |
6637 | ||
6638 | /* | |
6639 | @@ -1425,7 +1422,7 @@ static void mce_timer_fn(unsigned long data) | |
6640 | */ | |
6641 | void mce_timer_kick(unsigned long interval) | |
6642 | { | |
6643 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6644 | + struct hrtimer *t = this_cpu_ptr(&mce_timer); | |
6645 | unsigned long iv = __this_cpu_read(mce_next_interval); | |
6646 | ||
6647 | __start_timer(t, interval); | |
6648 | @@ -1440,7 +1437,7 @@ static void mce_timer_delete_all(void) | |
6649 | int cpu; | |
6650 | ||
6651 | for_each_online_cpu(cpu) | |
6652 | - del_timer_sync(&per_cpu(mce_timer, cpu)); | |
6653 | + hrtimer_cancel(&per_cpu(mce_timer, cpu)); | |
6654 | } | |
6655 | ||
6656 | /* | |
6657 | @@ -1769,7 +1766,7 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c) | |
6658 | } | |
6659 | } | |
6660 | ||
6661 | -static void mce_start_timer(struct timer_list *t) | |
6662 | +static void mce_start_timer(struct hrtimer *t) | |
6663 | { | |
6664 | unsigned long iv = check_interval * HZ; | |
6665 | ||
6666 | @@ -1782,18 +1779,19 @@ static void mce_start_timer(struct timer_list *t) | |
6667 | ||
6668 | static void __mcheck_cpu_setup_timer(void) | |
6669 | { | |
6670 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6671 | - unsigned int cpu = smp_processor_id(); | |
6672 | + struct hrtimer *t = this_cpu_ptr(&mce_timer); | |
6673 | ||
6674 | - setup_pinned_timer(t, mce_timer_fn, cpu); | |
6675 | + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
6676 | + t->function = mce_timer_fn; | |
6677 | } | |
6678 | ||
6679 | static void __mcheck_cpu_init_timer(void) | |
6680 | { | |
6681 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6682 | - unsigned int cpu = smp_processor_id(); | |
6683 | + struct hrtimer *t = this_cpu_ptr(&mce_timer); | |
e4b2b4a8 | 6684 | + |
b3bbd485 JK |
6685 | + hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
6686 | + t->function = mce_timer_fn; | |
6687 | ||
6688 | - setup_pinned_timer(t, mce_timer_fn, cpu); | |
6689 | mce_start_timer(t); | |
6690 | } | |
6691 | ||
6692 | @@ -2309,7 +2307,7 @@ static int mce_cpu_dead(unsigned int cpu) | |
6693 | ||
6694 | static int mce_cpu_online(unsigned int cpu) | |
6695 | { | |
6696 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6697 | + struct hrtimer *t = this_cpu_ptr(&mce_timer); | |
6698 | int ret; | |
6699 | ||
6700 | mce_device_create(cpu); | |
6701 | @@ -2326,10 +2324,10 @@ static int mce_cpu_online(unsigned int cpu) | |
6702 | ||
6703 | static int mce_cpu_pre_down(unsigned int cpu) | |
6704 | { | |
6705 | - struct timer_list *t = this_cpu_ptr(&mce_timer); | |
6706 | + struct hrtimer *t = this_cpu_ptr(&mce_timer); | |
6707 | ||
6708 | mce_disable_cpu(); | |
6709 | - del_timer_sync(t); | |
6710 | + hrtimer_cancel(t); | |
6711 | mce_threshold_remove_device(cpu); | |
6712 | mce_device_remove(cpu); | |
6713 | return 0; | |
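The timer_list to hrtimer conversion is mostly mechanical, with one behavioral difference worth calling out: a periodic timer_list callback re-arms via mod_timer(), while an hrtimer callback forwards itself and returns HRTIMER_RESTART. The shape mce_timer_fn() now has, sketched with next_interval() as a stand-in for the mce_next_interval bookkeeping:

    /* Sketch of the periodic-hrtimer shape used above. */
    static enum hrtimer_restart periodic_fn(struct hrtimer *timer)
    {
        unsigned long iv = next_interval();     /* stand-in */

        if (!iv)
            return HRTIMER_NORESTART;           /* polling switched off */
        hrtimer_forward_now(timer, ns_to_ktime(jiffies_to_nsecs(iv)));
        return HRTIMER_RESTART;                 /* re-armed in place */
    }

    static void setup_timer_rt(struct hrtimer *t)
    {
        hrtimer_init(t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        t->function = periodic_fn;  /* replaces setup_pinned_timer() */
    }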
6714 | diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c | |
6715 | index 2ea85b32421a..6914dc569d1e 100644 | |
6716 | --- a/arch/x86/kernel/fpu/core.c | |
6717 | +++ b/arch/x86/kernel/fpu/core.c | |
6718 | @@ -138,6 +138,18 @@ void kernel_fpu_end(void) | |
6719 | } | |
6720 | EXPORT_SYMBOL_GPL(kernel_fpu_end); | |
6721 | ||
6722 | +void kernel_fpu_resched(void) | |
6723 | +{ | |
6724 | + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); | |
e4b2b4a8 | 6725 | + |
b3bbd485 JK |
6726 | + if (should_resched(PREEMPT_OFFSET)) { |
6727 | + kernel_fpu_end(); | |
6728 | + cond_resched(); | |
6729 | + kernel_fpu_begin(); | |
6730 | + } | |
6731 | +} | |
6732 | +EXPORT_SYMBOL_GPL(kernel_fpu_resched); | |
e4b2b4a8 | 6733 | + |
b3bbd485 JK |
6734 | /* |
6735 | * Save the FPU state (mark it for reload if necessary): | |
6736 | * | |
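kernel_fpu_resched() is what the cipher glue loops above drop into their hot paths; the WARN_ON_FPU() means it must only be called between kernel_fpu_begin() and kernel_fpu_end(). Usage shape, as a sketch with simd_process() standing in for a vectorized routine:

    /* Sketch: keeping preempt-off time bounded in a long SIMD loop. */
    kernel_fpu_begin();
    while (nbytes >= CHUNK) {
        kernel_fpu_resched();   /* if resched due: end, cond_resched(), begin */
        simd_process(buf);      /* stand-in for the vectorized routine */
        buf += CHUNK;
        nbytes -= CHUNK;
    }
    kernel_fpu_end();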
6737 | diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c | |
6738 | index 95600a99ae93..9192d76085ba 100644 | |
6739 | --- a/arch/x86/kernel/irq_32.c | |
6740 | +++ b/arch/x86/kernel/irq_32.c | |
6741 | @@ -130,6 +130,7 @@ void irq_ctx_init(int cpu) | |
6742 | cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); | |
6743 | } | |
6744 | ||
6745 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
6746 | void do_softirq_own_stack(void) | |
6747 | { | |
6748 | struct irq_stack *irqstk; | |
6749 | @@ -146,6 +147,7 @@ void do_softirq_own_stack(void) | |
6750 | ||
6751 | call_on_stack(__do_softirq, isp); | |
6752 | } | |
6753 | +#endif | |
6754 | ||
6755 | bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) | |
6756 | { | |
6757 | diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c | |
6758 | index 5224c6099184..9b2b1f0409c5 100644 | |
6759 | --- a/arch/x86/kernel/process_32.c | |
6760 | +++ b/arch/x86/kernel/process_32.c | |
6761 | @@ -38,6 +38,7 @@ | |
6762 | #include <linux/io.h> | |
6763 | #include <linux/kdebug.h> | |
6764 | #include <linux/syscalls.h> | |
6765 | +#include <linux/highmem.h> | |
6766 | ||
6767 | #include <asm/pgtable.h> | |
6768 | #include <asm/ldt.h> | |
6769 | @@ -198,6 +199,35 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) | |
6770 | } | |
6771 | EXPORT_SYMBOL_GPL(start_thread); | |
6772 | ||
6773 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6774 | +static void switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) | |
6775 | +{ | |
6776 | + int i; | |
e4b2b4a8 | 6777 | + |
b3bbd485 JK |
6778 | + /* |
6779 | + * Clear @prev's kmap_atomic mappings | |
6780 | + */ | |
6781 | + for (i = 0; i < prev_p->kmap_idx; i++) { | |
6782 | + int idx = i + KM_TYPE_NR * smp_processor_id(); | |
6783 | + pte_t *ptep = kmap_pte - idx; | |
e4b2b4a8 | 6784 | + |
b3bbd485 JK |
6785 | + kpte_clear_flush(ptep, __fix_to_virt(FIX_KMAP_BEGIN + idx)); |
6786 | + } | |
6787 | + /* | |
6788 | + * Restore @next_p's kmap_atomic mappings | |
6789 | + */ | |
6790 | + for (i = 0; i < next_p->kmap_idx; i++) { | |
6791 | + int idx = i + KM_TYPE_NR * smp_processor_id(); | |
e4b2b4a8 | 6792 | + |
b3bbd485 JK |
6793 | + if (!pte_none(next_p->kmap_pte[i])) |
6794 | + set_pte(kmap_pte - idx, next_p->kmap_pte[i]); | |
6795 | + } | |
6796 | +} | |
6797 | +#else | |
6798 | +static inline void | |
6799 | +switch_kmaps(struct task_struct *prev_p, struct task_struct *next_p) { } | |
6800 | +#endif | |
e4b2b4a8 | 6801 | + |
b3bbd485 JK |
6802 | |
6803 | /* | |
6804 | * switch_to(x,y) should switch tasks from x to y. | |
6805 | @@ -273,6 +303,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) | |
6806 | task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) | |
6807 | __switch_to_xtra(prev_p, next_p, tss); | |
6808 | ||
6809 | + switch_kmaps(prev_p, next_p); | |
e4b2b4a8 | 6810 | + |
b3bbd485 JK |
6811 | /* |
6812 | * Leave lazy mode, flushing any hypercalls made here. | |
6813 | * This must be done before restoring TLS segments so | |
6814 | diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c | |
5dd41b01 | 6815 | index 13dfb55b84db..dd66f629d1d0 100644 |
b3bbd485 JK |
6816 | --- a/arch/x86/kvm/lapic.c |
6817 | +++ b/arch/x86/kvm/lapic.c | |
5dd41b01 | 6818 | @@ -2136,7 +2136,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) |
b3bbd485 JK |
6819 | apic->vcpu = vcpu; |
6820 | ||
6821 | hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, | |
6822 | - HRTIMER_MODE_ABS_PINNED); | |
6823 | + HRTIMER_MODE_ABS_PINNED_HARD); | |
6824 | apic->lapic_timer.timer.function = apic_timer_fn; | |
6825 | ||
6826 | /* | |
6827 | diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c | |
6828 | index 3856828ee1dc..407658146ae1 100644 | |
6829 | --- a/arch/x86/kvm/x86.c | |
6830 | +++ b/arch/x86/kvm/x86.c | |
6831 | @@ -6287,6 +6287,13 @@ int kvm_arch_init(void *opaque) | |
6832 | goto out; | |
6833 | } | |
6834 | ||
6835 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6836 | + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { | |
6837 | + printk(KERN_ERR "RT requires X86_FEATURE_CONSTANT_TSC\n"); | |
6838 | + return -EOPNOTSUPP; | |
6839 | + } | |
6840 | +#endif | |
e4b2b4a8 | 6841 | + |
b3bbd485 JK |
6842 | r = kvm_mmu_module_init(); |
6843 | if (r) | |
6844 | goto out_free_percpu; | |
6845 | diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c | |
6846 | index 6d18b70ed5a9..f752724c22e8 100644 | |
6847 | --- a/arch/x86/mm/highmem_32.c | |
6848 | +++ b/arch/x86/mm/highmem_32.c | |
6849 | @@ -32,10 +32,11 @@ EXPORT_SYMBOL(kunmap); | |
6850 | */ | |
6851 | void *kmap_atomic_prot(struct page *page, pgprot_t prot) | |
6852 | { | |
6853 | + pte_t pte = mk_pte(page, prot); | |
6854 | unsigned long vaddr; | |
6855 | int idx, type; | |
6856 | ||
6857 | - preempt_disable(); | |
6858 | + preempt_disable_nort(); | |
6859 | pagefault_disable(); | |
6860 | ||
6861 | if (!PageHighMem(page)) | |
6862 | @@ -45,7 +46,10 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) | |
6863 | idx = type + KM_TYPE_NR*smp_processor_id(); | |
6864 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | |
6865 | BUG_ON(!pte_none(*(kmap_pte-idx))); | |
6866 | - set_pte(kmap_pte-idx, mk_pte(page, prot)); | |
6867 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6868 | + current->kmap_pte[type] = pte; | |
6869 | +#endif | |
6870 | + set_pte(kmap_pte-idx, pte); | |
6871 | arch_flush_lazy_mmu_mode(); | |
6872 | ||
6873 | return (void *)vaddr; | |
6874 | @@ -88,6 +92,9 @@ void __kunmap_atomic(void *kvaddr) | |
6875 | * is a bad idea also, in case the page changes cacheability | |
6876 | * attributes or becomes a protected page in a hypervisor. | |
6877 | */ | |
6878 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6879 | + current->kmap_pte[type] = __pte(0); | |
6880 | +#endif | |
6881 | kpte_clear_flush(kmap_pte-idx, vaddr); | |
6882 | kmap_atomic_idx_pop(); | |
6883 | arch_flush_lazy_mmu_mode(); | |
6884 | @@ -100,7 +107,7 @@ void __kunmap_atomic(void *kvaddr) | |
6885 | #endif | |
6886 | ||
6887 | pagefault_enable(); | |
6888 | - preempt_enable(); | |
6889 | + preempt_enable_nort(); | |
6890 | } | |
6891 | EXPORT_SYMBOL(__kunmap_atomic); | |
6892 | ||
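Together with switch_kmaps() in process_32.c above, these hunks let a task be preempted while holding kmap_atomic() slots on RT: every mapped PTE is mirrored in current->kmap_pte[], cleared again on unmap, and replayed into the fixmap on context switch. The per-slot bookkeeping reduces to roughly this (map_slot() is an illustrative name):

    /* Sketch of the RT bookkeeping around one kmap_atomic slot (x86-32);
     * kmap_pte, KM_TYPE_NR and FIX_KMAP_BEGIN as in the hunks above.
     */
    void *map_slot(struct page *page, pgprot_t prot)
    {
        pte_t pte = mk_pte(page, prot);
        int type = kmap_atomic_idx_push();
        int idx = type + KM_TYPE_NR * smp_processor_id();

    #ifdef CONFIG_PREEMPT_RT_FULL
        current->kmap_pte[type] = pte;  /* remembered for switch_kmaps() */
    #endif
        set_pte(kmap_pte - idx, pte);
        return (void *)__fix_to_virt(FIX_KMAP_BEGIN + idx);
    }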
6893 | diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c | |
6894 | index ada98b39b8ad..585f6829653b 100644 | |
6895 | --- a/arch/x86/mm/iomap_32.c | |
6896 | +++ b/arch/x86/mm/iomap_32.c | |
6897 | @@ -56,6 +56,7 @@ EXPORT_SYMBOL_GPL(iomap_free); | |
6898 | ||
6899 | void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) | |
6900 | { | |
6901 | + pte_t pte = pfn_pte(pfn, prot); | |
6902 | unsigned long vaddr; | |
6903 | int idx, type; | |
6904 | ||
6905 | @@ -65,7 +66,12 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) | |
6906 | type = kmap_atomic_idx_push(); | |
6907 | idx = type + KM_TYPE_NR * smp_processor_id(); | |
6908 | vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); | |
6909 | - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); | |
6910 | + WARN_ON(!pte_none(*(kmap_pte - idx))); | |
e4b2b4a8 | 6911 | + |
b3bbd485 JK |
6912 | +#ifdef CONFIG_PREEMPT_RT_FULL |
6913 | + current->kmap_pte[type] = pte; | |
6914 | +#endif | |
6915 | + set_pte(kmap_pte - idx, pte); | |
6916 | arch_flush_lazy_mmu_mode(); | |
6917 | ||
6918 | return (void *)vaddr; | |
6919 | @@ -113,6 +119,9 @@ iounmap_atomic(void __iomem *kvaddr) | |
6920 | * is a bad idea also, in case the page changes cacheability | |
6921 | * attributes or becomes a protected page in a hypervisor. | |
6922 | */ | |
6923 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6924 | + current->kmap_pte[type] = __pte(0); | |
6925 | +#endif | |
6926 | kpte_clear_flush(kmap_pte-idx, vaddr); | |
6927 | kmap_atomic_idx_pop(); | |
6928 | } | |
6929 | diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h | |
6930 | index bb1fe6c1816e..8a22f1e7b6c9 100644 | |
6931 | --- a/arch/xtensa/include/asm/spinlock_types.h | |
6932 | +++ b/arch/xtensa/include/asm/spinlock_types.h | |
6933 | @@ -2,10 +2,6 @@ | |
6934 | #ifndef __ASM_SPINLOCK_TYPES_H | |
6935 | #define __ASM_SPINLOCK_TYPES_H | |
6936 | ||
6937 | -#ifndef __LINUX_SPINLOCK_TYPES_H | |
6938 | -# error "please don't include this file directly" | |
6939 | -#endif | |
6940 | - | |
6941 | typedef struct { | |
6942 | volatile unsigned int slock; | |
6943 | } arch_spinlock_t; | |
6944 | diff --git a/block/blk-core.c b/block/blk-core.c | |
5dd41b01 | 6945 | index 6aa2bc4e9652..f005077ae291 100644 |
b3bbd485 JK |
6946 | --- a/block/blk-core.c |
6947 | +++ b/block/blk-core.c | |
6948 | @@ -116,6 +116,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq) | |
6949 | ||
6950 | INIT_LIST_HEAD(&rq->queuelist); | |
6951 | INIT_LIST_HEAD(&rq->timeout_list); | |
6952 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
6953 | + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work); | |
6954 | +#endif | |
6955 | rq->cpu = -1; | |
6956 | rq->q = q; | |
6957 | rq->__sector = (sector_t) -1; | |
6958 | @@ -280,7 +283,7 @@ EXPORT_SYMBOL(blk_start_queue_async); | |
6959 | void blk_start_queue(struct request_queue *q) | |
6960 | { | |
6961 | lockdep_assert_held(q->queue_lock); | |
6962 | - WARN_ON(!in_interrupt() && !irqs_disabled()); | |
6963 | + WARN_ON_NONRT(!in_interrupt() && !irqs_disabled()); | |
6964 | WARN_ON_ONCE(q->mq_ops); | |
6965 | ||
6966 | queue_flag_clear(QUEUE_FLAG_STOPPED, q); | |
5dd41b01 | 6967 | @@ -812,12 +815,21 @@ void blk_queue_exit(struct request_queue *q) |
b3bbd485 JK |
6968 | percpu_ref_put(&q->q_usage_counter); |
6969 | } | |
6970 | ||
6971 | +static void blk_queue_usage_counter_release_swork(struct swork_event *sev) | |
6972 | +{ | |
6973 | + struct request_queue *q = | |
6974 | + container_of(sev, struct request_queue, mq_pcpu_wake); | |
e4b2b4a8 | 6975 | + |
b3bbd485 JK |
6976 | + wake_up_all(&q->mq_freeze_wq); |
6977 | +} | |
e4b2b4a8 | 6978 | + |
b3bbd485 JK |
6979 | static void blk_queue_usage_counter_release(struct percpu_ref *ref) |
6980 | { | |
6981 | struct request_queue *q = | |
6982 | container_of(ref, struct request_queue, q_usage_counter); | |
6983 | ||
6984 | - wake_up_all(&q->mq_freeze_wq); | |
6985 | + if (wq_has_sleeper(&q->mq_freeze_wq)) | |
6986 | + swork_queue(&q->mq_pcpu_wake); | |
6987 | } | |
6988 | ||
6989 | static void blk_rq_timed_out_timer(unsigned long data) | |
5dd41b01 | 6990 | @@ -894,6 +906,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) |
b3bbd485 JK |
6991 | __set_bit(QUEUE_FLAG_BYPASS, &q->queue_flags); |
6992 | ||
6993 | init_waitqueue_head(&q->mq_freeze_wq); | |
6994 | + INIT_SWORK(&q->mq_pcpu_wake, blk_queue_usage_counter_release_swork); | |
6995 | ||
6996 | /* | |
6997 | * Init percpu_ref in atomic mode so that it's faster to shutdown. | |
5dd41b01 | 6998 | @@ -3313,7 +3326,7 @@ static void queue_unplugged(struct request_queue *q, unsigned int depth, |
b3bbd485 JK |
6999 | blk_run_queue_async(q); |
7000 | else | |
7001 | __blk_run_queue(q); | |
7002 | - spin_unlock(q->queue_lock); | |
7003 | + spin_unlock_irq(q->queue_lock); | |
7004 | } | |
7005 | ||
7006 | static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) | |
5dd41b01 | 7007 | @@ -3361,7 +3374,6 @@ EXPORT_SYMBOL(blk_check_plugged); |
b3bbd485 JK |
7008 | void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
7009 | { | |
7010 | struct request_queue *q; | |
7011 | - unsigned long flags; | |
7012 | struct request *rq; | |
7013 | LIST_HEAD(list); | |
7014 | unsigned int depth; | |
5dd41b01 | 7015 | @@ -3381,11 +3393,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
b3bbd485 JK |
7016 | q = NULL; |
7017 | depth = 0; | |
7018 | ||
7019 | - /* | |
7020 | - * Save and disable interrupts here, to avoid doing it for every | |
7021 | - * queue lock we have to take. | |
7022 | - */ | |
7023 | - local_irq_save(flags); | |
7024 | while (!list_empty(&list)) { | |
7025 | rq = list_entry_rq(list.next); | |
7026 | list_del_init(&rq->queuelist); | |
5dd41b01 | 7027 | @@ -3398,7 +3405,7 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
b3bbd485 JK |
7028 | queue_unplugged(q, depth, from_schedule); |
7029 | q = rq->q; | |
7030 | depth = 0; | |
7031 | - spin_lock(q->queue_lock); | |
7032 | + spin_lock_irq(q->queue_lock); | |
7033 | } | |
7034 | ||
7035 | /* | |
5dd41b01 | 7036 | @@ -3425,8 +3432,6 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) |
b3bbd485 JK |
7037 | */ |
7038 | if (q) | |
7039 | queue_unplugged(q, depth, from_schedule); | |
7040 | - | |
7041 | - local_irq_restore(flags); | |
7042 | } | |
7043 | ||
7044 | void blk_finish_plug(struct blk_plug *plug) | |
5dd41b01 | 7045 | @@ -3638,6 +3643,8 @@ int __init blk_dev_init(void) |
b3bbd485 JK |
7046 | if (!kblockd_workqueue) |
7047 | panic("Failed to create kblockd\n"); | |
7048 | ||
7049 | + BUG_ON(swork_get()); | |
e4b2b4a8 | 7050 | + |
b3bbd485 JK |
7051 | request_cachep = kmem_cache_create("blkdev_requests", |
7052 | sizeof(struct request), 0, SLAB_PANIC, NULL); | |
7053 | ||
7054 | diff --git a/block/blk-ioc.c b/block/blk-ioc.c | |
7055 | index f23311e4b201..ca9ea624f159 100644 | |
7056 | --- a/block/blk-ioc.c | |
7057 | +++ b/block/blk-ioc.c | |
7058 | @@ -9,6 +9,7 @@ | |
7059 | #include <linux/blkdev.h> | |
7060 | #include <linux/slab.h> | |
7061 | #include <linux/sched/task.h> | |
7062 | +#include <linux/delay.h> | |
7063 | ||
7064 | #include "blk.h" | |
7065 | ||
7066 | @@ -118,7 +119,7 @@ static void ioc_release_fn(struct work_struct *work) | |
7067 | spin_unlock(q->queue_lock); | |
7068 | } else { | |
7069 | spin_unlock_irqrestore(&ioc->lock, flags); | |
7070 | - cpu_relax(); | |
7071 | + cpu_chill(); | |
7072 | spin_lock_irqsave_nested(&ioc->lock, flags, 1); | |
7073 | } | |
7074 | } | |
7075 | @@ -202,7 +203,7 @@ void put_io_context_active(struct io_context *ioc) | |
7076 | spin_unlock(icq->q->queue_lock); | |
7077 | } else { | |
7078 | spin_unlock_irqrestore(&ioc->lock, flags); | |
7079 | - cpu_relax(); | |
7080 | + cpu_chill(); | |
7081 | goto retry; | |
7082 | } | |
7083 | } | |
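cpu_chill() replaces cpu_relax() in these retry loops because on RT the lock owner is a preemptible task: busy-spinning on the same CPU could keep the owner from ever running and release the lock. cpu_chill() (declared in linux/delay.h by the RT patch, hence the new include) sleeps for a tick instead. A sketch of the lock-inversion retry pattern, with hypothetical lock names:

```c
#include <linux/spinlock.h>
#include <linux/delay.h>	/* cpu_chill() comes from the RT patch */

static DEFINE_SPINLOCK(outer);	/* illustrative locks, not from the hunk */
static DEFINE_SPINLOCK(inner);

static void demo_take_both(void)
{
	spin_lock(&outer);
	while (!spin_trylock(&inner)) {
		/* Drop the outer lock and sleep briefly so a preempted
		 * inner-lock owner can run; plain cpu_relax() here can
		 * livelock on RT. */
		spin_unlock(&outer);
		cpu_chill();
		spin_lock(&outer);
	}
	/* ... work under both locks ... */
	spin_unlock(&inner);
	spin_unlock(&outer);
}
```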
7084 | diff --git a/block/blk-mq.c b/block/blk-mq.c | |
5dd41b01 | 7085 | index eac444804736..a6314b82273e 100644 |
b3bbd485 JK |
7086 | --- a/block/blk-mq.c |
7087 | +++ b/block/blk-mq.c | |
7088 | @@ -339,6 +339,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, | |
7089 | /* tag was already set */ | |
7090 | rq->extra_len = 0; | |
7091 | ||
7092 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
7093 | + INIT_WORK(&rq->work, __blk_mq_complete_request_remote_work); | |
7094 | +#endif | |
7095 | INIT_LIST_HEAD(&rq->timeout_list); | |
7096 | rq->timeout = 0; | |
7097 | ||
7098 | @@ -533,12 +536,24 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) | |
7099 | } | |
7100 | EXPORT_SYMBOL(blk_mq_end_request); | |
7101 | ||
7102 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
e4b2b4a8 | 7103 | + |
b3bbd485 JK |
7104 | +void __blk_mq_complete_request_remote_work(struct work_struct *work) |
7105 | +{ | |
7106 | + struct request *rq = container_of(work, struct request, work); | |
e4b2b4a8 | 7107 | + |
b3bbd485 JK |
7108 | + rq->q->softirq_done_fn(rq); |
7109 | +} | |
e4b2b4a8 | 7110 | + |
b3bbd485 | 7111 | +#else |
e4b2b4a8 | 7112 | + |
b3bbd485 JK |
7113 | static void __blk_mq_complete_request_remote(void *data) |
7114 | { | |
7115 | struct request *rq = data; | |
7116 | ||
7117 | rq->q->softirq_done_fn(rq); | |
7118 | } | |
7119 | +#endif | |
7120 | ||
7121 | static void __blk_mq_complete_request(struct request *rq) | |
7122 | { | |
7123 | @@ -558,19 +573,27 @@ static void __blk_mq_complete_request(struct request *rq) | |
7124 | return; | |
7125 | } | |
7126 | ||
7127 | - cpu = get_cpu(); | |
7128 | + cpu = get_cpu_light(); | |
7129 | if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) | |
7130 | shared = cpus_share_cache(cpu, ctx->cpu); | |
7131 | ||
7132 | if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { | |
7133 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
7134 | + /* | |
7135 | + * We could force QUEUE_FLAG_SAME_FORCE, then we would not get in |
7136 | + * here. But we can try to invoke it on the CPU like this. |
7137 | + */ | |
7138 | + schedule_work_on(ctx->cpu, &rq->work); | |
7139 | +#else | |
7140 | rq->csd.func = __blk_mq_complete_request_remote; | |
7141 | rq->csd.info = rq; | |
7142 | rq->csd.flags = 0; | |
7143 | smp_call_function_single_async(ctx->cpu, &rq->csd); | |
7144 | +#endif | |
7145 | } else { | |
7146 | rq->q->softirq_done_fn(rq); | |
7147 | } | |
7148 | - put_cpu(); | |
7149 | + put_cpu_light(); | |
7150 | } | |
7151 | ||
7152 | /** | |
7153 | @@ -1238,14 +1261,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, | |
7154 | return; | |
7155 | ||
7156 | if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { | |
7157 | - int cpu = get_cpu(); | |
7158 | + int cpu = get_cpu_light(); | |
7159 | if (cpumask_test_cpu(cpu, hctx->cpumask)) { | |
7160 | __blk_mq_run_hw_queue(hctx); | |
7161 | - put_cpu(); | |
7162 | + put_cpu_light(); | |
7163 | return; | |
7164 | } | |
7165 | ||
7166 | - put_cpu(); | |
7167 | + put_cpu_light(); | |
7168 | } | |
7169 | ||
7170 | kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), | |
7171 | @@ -2863,10 +2886,9 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, | |
7172 | kt = nsecs; | |
7173 | ||
7174 | mode = HRTIMER_MODE_REL; | |
7175 | - hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode); | |
7176 | + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode, current); | |
7177 | hrtimer_set_expires(&hs.timer, kt); | |
7178 | ||
7179 | - hrtimer_init_sleeper(&hs, current); | |
7180 | do { | |
7181 | if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) | |
7182 | break; | |
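get_cpu()/put_cpu() disable preemption outright, turning the section into one that must not sleep; on RT the code between them ends up taking sleeping locks. get_cpu_light() only disables migration, keeping the CPU number stable while leaving the task preemptible. Roughly how the RT patch defines the pair; this is a sketch from memory, not a quote of the patch:

```c
/* Approximate definitions; the RT patch carries these in
 * include/linux/smp.h and falls back to get_cpu()/put_cpu() on !RT. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
# define put_cpu_light()	migrate_enable()
#else
# define get_cpu_light()	get_cpu()
# define put_cpu_light()	put_cpu()
#endif
```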
7183 | diff --git a/block/blk-mq.h b/block/blk-mq.h | |
7184 | index 877237e09083..d944750bade0 100644 | |
7185 | --- a/block/blk-mq.h | |
7186 | +++ b/block/blk-mq.h | |
7187 | @@ -98,12 +98,12 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, | |
7188 | */ | |
7189 | static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) | |
7190 | { | |
7191 | - return __blk_mq_get_ctx(q, get_cpu()); | |
7192 | + return __blk_mq_get_ctx(q, get_cpu_light()); | |
7193 | } | |
7194 | ||
7195 | static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx) | |
7196 | { | |
7197 | - put_cpu(); | |
7198 | + put_cpu_light(); | |
7199 | } | |
7200 | ||
7201 | struct blk_mq_alloc_data { | |
7202 | diff --git a/block/blk-softirq.c b/block/blk-softirq.c | |
7203 | index 01e2b353a2b9..e8c0d4945f5a 100644 | |
7204 | --- a/block/blk-softirq.c | |
7205 | +++ b/block/blk-softirq.c | |
7206 | @@ -53,6 +53,7 @@ static void trigger_softirq(void *data) | |
7207 | raise_softirq_irqoff(BLOCK_SOFTIRQ); | |
7208 | ||
7209 | local_irq_restore(flags); | |
7210 | + preempt_check_resched_rt(); | |
7211 | } | |
7212 | ||
7213 | /* | |
7214 | @@ -91,6 +92,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu) | |
7215 | this_cpu_ptr(&blk_cpu_done)); | |
7216 | raise_softirq_irqoff(BLOCK_SOFTIRQ); | |
7217 | local_irq_enable(); | |
7218 | + preempt_check_resched_rt(); | |
7219 | ||
7220 | return 0; | |
7221 | } | |
7222 | @@ -143,6 +145,7 @@ void __blk_complete_request(struct request *req) | |
7223 | goto do_local; | |
7224 | ||
7225 | local_irq_restore(flags); | |
7226 | + preempt_check_resched_rt(); | |
7227 | } | |
7228 | ||
7229 | /** | |
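raise_softirq_irqoff() can wake the softirq thread on RT, but with interrupts still disabled the wakeup cannot take effect; the preempt_check_resched_rt() added after local_irq_restore()/local_irq_enable() gives the freshly woken thread a chance to run immediately. On !RT it compiles away. The usage pattern in miniature (demo_complete is a made-up caller):

```c
#include <linux/interrupt.h>

/* The irq-off region queues work for the softirq thread; the resched
 * check after re-enabling irqs lets that thread run right away
 * instead of waiting for the next natural preemption point.
 * preempt_check_resched_rt() is an RT-patch helper, a no-op on !RT. */
static void demo_complete(void)
{
	unsigned long flags;

	local_irq_save(flags);
	raise_softirq_irqoff(BLOCK_SOFTIRQ);
	local_irq_restore(flags);
	preempt_check_resched_rt();
}
```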
7230 | diff --git a/block/bounce.c b/block/bounce.c | |
7231 | index 1d05c422c932..0101ffefddc4 100644 | |
7232 | --- a/block/bounce.c | |
7233 | +++ b/block/bounce.c | |
7234 | @@ -66,11 +66,11 @@ static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) | |
7235 | unsigned long flags; | |
7236 | unsigned char *vto; | |
7237 | ||
7238 | - local_irq_save(flags); | |
7239 | + local_irq_save_nort(flags); | |
7240 | vto = kmap_atomic(to->bv_page); | |
7241 | memcpy(vto + to->bv_offset, vfrom, to->bv_len); | |
7242 | kunmap_atomic(vto); | |
7243 | - local_irq_restore(flags); | |
7244 | + local_irq_restore_nort(flags); | |
7245 | } | |
7246 | ||
7247 | #else /* CONFIG_HIGHMEM */ | |
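The _nort ("no RT") variants keep interrupts enabled on PREEMPT_RT while behaving exactly like local_irq_save()/restore() on stock kernels: the bounce copy only needs protection around the local kmap_atomic user, so RT can afford to stay preemptible here. Approximately, as a sketch of the RT-patch definitions rather than a verbatim quote:

```c
/* Approximate RT-patch definitions (include/linux/irqflags.h). On RT
 * the flags are saved but interrupts stay enabled; on !RT these are
 * the ordinary irq-off primitives. */
#ifdef CONFIG_PREEMPT_RT_FULL
# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
#else
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif
```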
7248 | diff --git a/crypto/algapi.c b/crypto/algapi.c | |
7249 | index 50eb828db767..7bce92a6599a 100644 | |
7250 | --- a/crypto/algapi.c | |
7251 | +++ b/crypto/algapi.c | |
7252 | @@ -731,13 +731,13 @@ EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); | |
7253 | ||
7254 | int crypto_register_notifier(struct notifier_block *nb) | |
7255 | { | |
7256 | - return blocking_notifier_chain_register(&crypto_chain, nb); | |
7257 | + return srcu_notifier_chain_register(&crypto_chain, nb); | |
7258 | } | |
7259 | EXPORT_SYMBOL_GPL(crypto_register_notifier); | |
7260 | ||
7261 | int crypto_unregister_notifier(struct notifier_block *nb) | |
7262 | { | |
7263 | - return blocking_notifier_chain_unregister(&crypto_chain, nb); | |
7264 | + return srcu_notifier_chain_unregister(&crypto_chain, nb); | |
7265 | } | |
7266 | EXPORT_SYMBOL_GPL(crypto_unregister_notifier); | |
7267 | ||
7268 | diff --git a/crypto/api.c b/crypto/api.c | |
5dd41b01 | 7269 | index e485aed11ad0..089e648d2fa9 100644 |
b3bbd485 JK |
7270 | --- a/crypto/api.c |
7271 | +++ b/crypto/api.c | |
7272 | @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_list); | |
7273 | DECLARE_RWSEM(crypto_alg_sem); | |
7274 | EXPORT_SYMBOL_GPL(crypto_alg_sem); | |
7275 | ||
7276 | -BLOCKING_NOTIFIER_HEAD(crypto_chain); | |
7277 | +SRCU_NOTIFIER_HEAD(crypto_chain); | |
7278 | EXPORT_SYMBOL_GPL(crypto_chain); | |
7279 | ||
7280 | static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); | |
7281 | @@ -236,10 +236,10 @@ int crypto_probing_notify(unsigned long val, void *v) | |
7282 | { | |
7283 | int ok; | |
7284 | ||
7285 | - ok = blocking_notifier_call_chain(&crypto_chain, val, v); | |
7286 | + ok = srcu_notifier_call_chain(&crypto_chain, val, v); | |
7287 | if (ok == NOTIFY_DONE) { | |
7288 | request_module("cryptomgr"); | |
7289 | - ok = blocking_notifier_call_chain(&crypto_chain, val, v); | |
7290 | + ok = srcu_notifier_call_chain(&crypto_chain, val, v); | |
7291 | } | |
7292 | ||
7293 | return ok; | |
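Switching crypto_chain from a blocking to an SRCU notifier head means callers traverse the chain under SRCU instead of an rw_semaphore, so notifications avoid the rwsem slowpath, which is a sleeping lock with priority-inheritance cost on RT. The SRCU notifier API is plain mainline; a self-contained usage sketch with illustrative names:

```c
#include <linux/notifier.h>

SRCU_NOTIFIER_HEAD(demo_chain);		/* mirrors SRCU_NOTIFIER_HEAD(crypto_chain) */

static int demo_event(struct notifier_block *nb, unsigned long val, void *v)
{
	return NOTIFY_DONE;
}

static struct notifier_block demo_nb = { .notifier_call = demo_event };

static void demo(void)
{
	srcu_notifier_chain_register(&demo_chain, &demo_nb);
	/* Callers walk the chain inside srcu_read_lock(); only
	 * registration and unregistration take a mutex. */
	srcu_notifier_call_chain(&demo_chain, 0, NULL);
	srcu_notifier_chain_unregister(&demo_chain, &demo_nb);
}
```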
7294 | diff --git a/crypto/cryptd.c b/crypto/cryptd.c | |
7295 | index 248f6ba41688..54b7985c8caa 100644 | |
7296 | --- a/crypto/cryptd.c | |
7297 | +++ b/crypto/cryptd.c | |
7298 | @@ -37,6 +37,7 @@ | |
7299 | struct cryptd_cpu_queue { | |
7300 | struct crypto_queue queue; | |
7301 | struct work_struct work; | |
7302 | + spinlock_t qlock; | |
7303 | }; | |
7304 | ||
7305 | struct cryptd_queue { | |
7306 | @@ -115,6 +116,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue, | |
7307 | cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); | |
7308 | crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); | |
7309 | INIT_WORK(&cpu_queue->work, cryptd_queue_worker); | |
7310 | + spin_lock_init(&cpu_queue->qlock); | |
7311 | } | |
7312 | return 0; | |
7313 | } | |
7314 | @@ -139,8 +141,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, | |
7315 | atomic_t *refcnt; | |
7316 | bool may_backlog; | |
7317 | ||
7318 | - cpu = get_cpu(); | |
7319 | - cpu_queue = this_cpu_ptr(queue->cpu_queue); | |
7320 | + cpu_queue = raw_cpu_ptr(queue->cpu_queue); | |
7321 | + spin_lock_bh(&cpu_queue->qlock); | |
7322 | + cpu = smp_processor_id(); | |
e4b2b4a8 | 7323 | + |
b3bbd485 JK |
7324 | err = crypto_enqueue_request(&cpu_queue->queue, request); |
7325 | ||
7326 | refcnt = crypto_tfm_ctx(request->tfm); | |
7327 | @@ -157,7 +161,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, | |
7328 | atomic_inc(refcnt); | |
7329 | ||
7330 | out_put_cpu: | |
7331 | - put_cpu(); | |
7332 | + spin_unlock_bh(&cpu_queue->qlock); | |
7333 | ||
7334 | return err; | |
7335 | } | |
7336 | @@ -173,16 +177,11 @@ static void cryptd_queue_worker(struct work_struct *work) | |
7337 | cpu_queue = container_of(work, struct cryptd_cpu_queue, work); | |
7338 | /* | |
7339 | * Only handle one request at a time to avoid hogging crypto workqueue. | |
7340 | - * preempt_disable/enable is used to prevent being preempted by | |
7341 | - * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent | |
7342 | - * cryptd_enqueue_request() being accessed from software interrupts. | |
7343 | */ | |
7344 | - local_bh_disable(); | |
7345 | - preempt_disable(); | |
7346 | + spin_lock_bh(&cpu_queue->qlock); | |
7347 | backlog = crypto_get_backlog(&cpu_queue->queue); | |
7348 | req = crypto_dequeue_request(&cpu_queue->queue); | |
7349 | - preempt_enable(); | |
7350 | - local_bh_enable(); | |
7351 | + spin_unlock_bh(&cpu_queue->qlock); | |
7352 | ||
7353 | if (!req) | |
7354 | return; | |
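cryptd previously relied on get_cpu() plus local_bh_disable()/preempt_disable() to keep enqueue and worker from interleaving; both are preemption-based schemes that break down on RT. The per-CPU queue now carries an explicit spinlock, so protection comes from the lock rather than from disabling preemption. The enqueue side in miniature, with hypothetical names and allocation/init elided:

```c
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>

struct demo_cpu_queue {
	struct list_head items;		/* assume INIT_LIST_HEAD() done */
	spinlock_t qlock;		/* mirrors cryptd_cpu_queue::qlock */
};

static struct demo_cpu_queue __percpu *demo_queues;	/* assume alloc_percpu() done */

static void demo_enqueue(struct list_head *item)
{
	struct demo_cpu_queue *q;

	/* raw_cpu_ptr() may race with migration, and that is fine:
	 * whichever CPU's queue we land on, the spinlock serialises us
	 * against that queue's worker. No preempt_disable() needed. */
	q = raw_cpu_ptr(demo_queues);
	spin_lock_bh(&q->qlock);
	list_add_tail(item, &q->items);
	spin_unlock_bh(&q->qlock);
}
```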
7355 | diff --git a/crypto/internal.h b/crypto/internal.h | |
7356 | index f07320423191..333d985088fe 100644 | |
7357 | --- a/crypto/internal.h | |
7358 | +++ b/crypto/internal.h | |
7359 | @@ -47,7 +47,7 @@ struct crypto_larval { | |
7360 | ||
7361 | extern struct list_head crypto_alg_list; | |
7362 | extern struct rw_semaphore crypto_alg_sem; | |
7363 | -extern struct blocking_notifier_head crypto_chain; | |
7364 | +extern struct srcu_notifier_head crypto_chain; | |
7365 | ||
7366 | #ifdef CONFIG_PROC_FS | |
7367 | void __init crypto_init_proc(void); | |
7368 | @@ -143,7 +143,7 @@ static inline int crypto_is_moribund(struct crypto_alg *alg) | |
7369 | ||
7370 | static inline void crypto_notify(unsigned long val, void *v) | |
7371 | { | |
7372 | - blocking_notifier_call_chain(&crypto_chain, val, v); | |
7373 | + srcu_notifier_call_chain(&crypto_chain, val, v); | |
7374 | } | |
7375 | ||
7376 | #endif /* _CRYPTO_INTERNAL_H */ | |
7377 | diff --git a/crypto/scompress.c b/crypto/scompress.c | |
7378 | index 2075e2c4e7df..c6b4e265c6bf 100644 | |
7379 | --- a/crypto/scompress.c | |
7380 | +++ b/crypto/scompress.c | |
7381 | @@ -24,6 +24,7 @@ | |
7382 | #include <linux/cryptouser.h> | |
7383 | #include <net/netlink.h> | |
7384 | #include <linux/scatterlist.h> | |
7385 | +#include <linux/locallock.h> | |
7386 | #include <crypto/scatterwalk.h> | |
7387 | #include <crypto/internal/acompress.h> | |
7388 | #include <crypto/internal/scompress.h> | |
7389 | @@ -34,6 +35,7 @@ static void * __percpu *scomp_src_scratches; | |
7390 | static void * __percpu *scomp_dst_scratches; | |
7391 | static int scomp_scratch_users; | |
7392 | static DEFINE_MUTEX(scomp_lock); | |
7393 | +static DEFINE_LOCAL_IRQ_LOCK(scomp_scratches_lock); | |
7394 | ||
7395 | #ifdef CONFIG_NET | |
7396 | static int crypto_scomp_report(struct sk_buff *skb, struct crypto_alg *alg) | |
7397 | @@ -193,7 +195,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) | |
7398 | void **tfm_ctx = acomp_tfm_ctx(tfm); | |
7399 | struct crypto_scomp *scomp = *tfm_ctx; | |
7400 | void **ctx = acomp_request_ctx(req); | |
7401 | - const int cpu = get_cpu(); | |
7402 | + const int cpu = local_lock_cpu(scomp_scratches_lock); | |
7403 | u8 *scratch_src = *per_cpu_ptr(scomp_src_scratches, cpu); | |
7404 | u8 *scratch_dst = *per_cpu_ptr(scomp_dst_scratches, cpu); | |
7405 | int ret; | |
7406 | @@ -228,7 +230,7 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) | |
7407 | 1); | |
7408 | } | |
7409 | out: | |
7410 | - put_cpu(); | |
7411 | + local_unlock_cpu(scomp_scratches_lock); | |
7412 | return ret; | |
7413 | } | |
7414 | ||
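The scompress change shows the local-lock pattern the RT patch uses wherever get_cpu() guarded per-CPU scratch memory. DEFINE_LOCAL_IRQ_LOCK and local_lock_cpu()/local_unlock_cpu() come from the RT-only linux/locallock.h added above; local_lock_cpu() returns the CPU it pinned. A sketch with illustrative names:

```c
#include <linux/locallock.h>	/* RT patch only */
#include <linux/percpu.h>

static DEFINE_LOCAL_IRQ_LOCK(demo_scratch_lock);
static DEFINE_PER_CPU(void *, demo_scratch);	/* assume buffers allocated */

static void demo_compress(void)
{
	/* On RT this takes a per-CPU sleeping lock and pins the task to
	 * the CPU; on !RT it degrades to preempt_disable(). Either way
	 * the scratch buffer is exclusively ours until the unlock. */
	int cpu = local_lock_cpu(demo_scratch_lock);
	void *buf = per_cpu(demo_scratch, cpu);

	(void)buf;		/* ... run the (de)compressor on buf ... */
	local_unlock_cpu(demo_scratch_lock);
}
```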
7415 | diff --git a/drivers/acpi/acpica/acglobal.h b/drivers/acpi/acpica/acglobal.h | |
7416 | index 95eed442703f..50bc5b61d899 100644 | |
7417 | --- a/drivers/acpi/acpica/acglobal.h | |
7418 | +++ b/drivers/acpi/acpica/acglobal.h | |
7419 | @@ -116,7 +116,7 @@ ACPI_GLOBAL(u8, acpi_gbl_global_lock_pending); | |
e4b2b4a8 JK |
7420 | * interrupt level |
7421 | */ | |
7422 | ACPI_GLOBAL(acpi_spinlock, acpi_gbl_gpe_lock); /* For GPE data structs and registers */ | |
7423 | -ACPI_GLOBAL(acpi_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */ | |
7424 | +ACPI_GLOBAL(acpi_raw_spinlock, acpi_gbl_hardware_lock); /* For ACPI H/W except GPE registers */ | |
7425 | ACPI_GLOBAL(acpi_spinlock, acpi_gbl_reference_count_lock); | |
1a6e0f06 | 7426 | |
e4b2b4a8 | 7427 | /* Mutex for _OSI support */ |
b3bbd485 JK |
7428 | diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c |
7429 | index acb417b58bbb..ea49e08c263f 100644 | |
7430 | --- a/drivers/acpi/acpica/hwregs.c | |
7431 | +++ b/drivers/acpi/acpica/hwregs.c | |
7432 | @@ -428,14 +428,14 @@ acpi_status acpi_hw_clear_acpi_status(void) | |
e4b2b4a8 JK |
7433 | ACPI_BITMASK_ALL_FIXED_STATUS, |
7434 | ACPI_FORMAT_UINT64(acpi_gbl_xpm1a_status.address))); | |
1a6e0f06 | 7435 | |
e4b2b4a8 JK |
7436 | - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); |
7437 | + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); | |
1a6e0f06 | 7438 | |
e4b2b4a8 | 7439 | /* Clear the fixed events in PM1 A/B */ |
1a6e0f06 | 7440 | |
e4b2b4a8 JK |
7441 | status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, |
7442 | ACPI_BITMASK_ALL_FIXED_STATUS); | |
1a6e0f06 | 7443 | |
e4b2b4a8 JK |
7444 | - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); |
7445 | + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); | |
1a6e0f06 | 7446 | |
e4b2b4a8 JK |
7447 | if (ACPI_FAILURE(status)) { |
7448 | goto exit; | |
b3bbd485 JK |
7449 | diff --git a/drivers/acpi/acpica/hwxface.c b/drivers/acpi/acpica/hwxface.c |
7450 | index 34684ae89981..fb84983e1839 100644 | |
7451 | --- a/drivers/acpi/acpica/hwxface.c | |
7452 | +++ b/drivers/acpi/acpica/hwxface.c | |
7453 | @@ -373,7 +373,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) | |
e4b2b4a8 JK |
7454 | return_ACPI_STATUS(AE_BAD_PARAMETER); |
7455 | } | |
1a6e0f06 | 7456 | |
e4b2b4a8 JK |
7457 | - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); |
7458 | + raw_spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); | |
1a6e0f06 | 7459 | |
e4b2b4a8 JK |
7460 | /* |
7461 | * At this point, we know that the parent register is one of the | |
b3bbd485 | 7462 | @@ -434,7 +434,7 @@ acpi_status acpi_write_bit_register(u32 register_id, u32 value) |
1a6e0f06 | 7463 | |
e4b2b4a8 JK |
7464 | unlock_and_exit: |
7465 | ||
7466 | - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); | |
7467 | + raw_spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); | |
7468 | return_ACPI_STATUS(status); | |
7469 | } | |
7470 | ||
b3bbd485 JK |
7471 | diff --git a/drivers/acpi/acpica/utmutex.c b/drivers/acpi/acpica/utmutex.c |
7472 | index 586354788018..3a3c2a86437f 100644 | |
7473 | --- a/drivers/acpi/acpica/utmutex.c | |
7474 | +++ b/drivers/acpi/acpica/utmutex.c | |
7475 | @@ -88,7 +88,7 @@ acpi_status acpi_ut_mutex_initialize(void) | |
e4b2b4a8 JK |
7476 | return_ACPI_STATUS (status); |
7477 | } | |
7478 | ||
7479 | - status = acpi_os_create_lock (&acpi_gbl_hardware_lock); | |
7480 | + status = acpi_os_create_raw_lock (&acpi_gbl_hardware_lock); | |
7481 | if (ACPI_FAILURE (status)) { | |
7482 | return_ACPI_STATUS (status); | |
7483 | } | |
b3bbd485 | 7484 | @@ -145,7 +145,7 @@ void acpi_ut_mutex_terminate(void) |
e4b2b4a8 JK |
7485 | /* Delete the spinlocks */ |
7486 | ||
7487 | acpi_os_delete_lock(acpi_gbl_gpe_lock); | |
7488 | - acpi_os_delete_lock(acpi_gbl_hardware_lock); | |
7489 | + acpi_os_delete_raw_lock(acpi_gbl_hardware_lock); | |
7490 | acpi_os_delete_lock(acpi_gbl_reference_count_lock); | |
7491 | ||
7492 | /* Delete the reader/writer lock */ | |
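The ACPICA hunks convert acpi_gbl_hardware_lock to a raw spinlock: raw_spinlock_t remains a true busy-wait spinlock on PREEMPT_RT, while spinlock_t becomes an rtmutex that may sleep. Low-level register banging that can run with interrupts hard-disabled must not sleep, hence the raw flavour. The pattern in a sketch (demo_hw_lock is an illustrative name):

```c
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_hw_lock);

static void demo_reg_update(void)
{
	unsigned long flags;

	/* Safe in any context on RT: never sleeps, never gets
	 * converted to an rtmutex. */
	raw_spin_lock_irqsave(&demo_hw_lock, flags);
	/* ... read-modify-write a PM1 status/enable register ... */
	raw_spin_unlock_irqrestore(&demo_hw_lock, flags);
}
```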
b3bbd485 JK |
7493 | diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c |
7494 | index cc2f2e35f4c2..0f0bc86e02df 100644 | |
7495 | --- a/drivers/ata/libata-sff.c | |
7496 | +++ b/drivers/ata/libata-sff.c | |
7497 | @@ -679,9 +679,9 @@ unsigned int ata_sff_data_xfer_noirq(struct ata_queued_cmd *qc, unsigned char *b | |
e4b2b4a8 JK |
7498 | unsigned long flags; |
7499 | unsigned int consumed; | |
7500 | ||
7501 | - local_irq_save(flags); | |
7502 | + local_irq_save_nort(flags); | |
7503 | consumed = ata_sff_data_xfer32(qc, buf, buflen, rw); | |
7504 | - local_irq_restore(flags); | |
7505 | + local_irq_restore_nort(flags); | |
7506 | ||
7507 | return consumed; | |
7508 | } | |
b3bbd485 JK |
7509 | diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c |
7510 | index cdd6f256da59..2269d379c92f 100644 | |
7511 | --- a/drivers/base/power/wakeup.c | |
7512 | +++ b/drivers/base/power/wakeup.c | |
7513 | @@ -52,7 +52,7 @@ static void split_counters(unsigned int *cnt, unsigned int *inpr) | |
7514 | /* A preserved old value of the events counter. */ | |
7515 | static unsigned int saved_count; | |
7516 | ||
7517 | -static DEFINE_SPINLOCK(events_lock); | |
7518 | +static DEFINE_RAW_SPINLOCK(events_lock); | |
7519 | ||
7520 | static void pm_wakeup_timer_fn(unsigned long data); | |
7521 | ||
7522 | @@ -180,9 +180,9 @@ void wakeup_source_add(struct wakeup_source *ws) | |
7523 | ws->active = false; | |
7524 | ws->last_time = ktime_get(); | |
7525 | ||
7526 | - spin_lock_irqsave(&events_lock, flags); | |
7527 | + raw_spin_lock_irqsave(&events_lock, flags); | |
7528 | list_add_rcu(&ws->entry, &wakeup_sources); | |
7529 | - spin_unlock_irqrestore(&events_lock, flags); | |
7530 | + raw_spin_unlock_irqrestore(&events_lock, flags); | |
7531 | } | |
7532 | EXPORT_SYMBOL_GPL(wakeup_source_add); | |
7533 | ||
7534 | @@ -197,9 +197,9 @@ void wakeup_source_remove(struct wakeup_source *ws) | |
7535 | if (WARN_ON(!ws)) | |
7536 | return; | |
7537 | ||
7538 | - spin_lock_irqsave(&events_lock, flags); | |
7539 | + raw_spin_lock_irqsave(&events_lock, flags); | |
7540 | list_del_rcu(&ws->entry); | |
7541 | - spin_unlock_irqrestore(&events_lock, flags); | |
7542 | + raw_spin_unlock_irqrestore(&events_lock, flags); | |
7543 | synchronize_srcu(&wakeup_srcu); | |
7544 | } | |
7545 | EXPORT_SYMBOL_GPL(wakeup_source_remove); | |
7546 | @@ -844,7 +844,7 @@ bool pm_wakeup_pending(void) | |
7547 | unsigned long flags; | |
7548 | bool ret = false; | |
7549 | ||
7550 | - spin_lock_irqsave(&events_lock, flags); | |
7551 | + raw_spin_lock_irqsave(&events_lock, flags); | |
7552 | if (events_check_enabled) { | |
7553 | unsigned int cnt, inpr; | |
7554 | ||
7555 | @@ -852,7 +852,7 @@ bool pm_wakeup_pending(void) | |
7556 | ret = (cnt != saved_count || inpr > 0); | |
7557 | events_check_enabled = !ret; | |
7558 | } | |
7559 | - spin_unlock_irqrestore(&events_lock, flags); | |
7560 | + raw_spin_unlock_irqrestore(&events_lock, flags); | |
7561 | ||
7562 | if (ret) { | |
7563 | pr_info("PM: Wakeup pending, aborting suspend\n"); | |
7564 | @@ -941,13 +941,13 @@ bool pm_save_wakeup_count(unsigned int count) | |
7565 | unsigned long flags; | |
7566 | ||
7567 | events_check_enabled = false; | |
7568 | - spin_lock_irqsave(&events_lock, flags); | |
7569 | + raw_spin_lock_irqsave(&events_lock, flags); | |
7570 | split_counters(&cnt, &inpr); | |
7571 | if (cnt == count && inpr == 0) { | |
7572 | saved_count = count; | |
7573 | events_check_enabled = true; | |
7574 | } | |
7575 | - spin_unlock_irqrestore(&events_lock, flags); | |
7576 | + raw_spin_unlock_irqrestore(&events_lock, flags); | |
7577 | return events_check_enabled; | |
7578 | } | |
7579 | ||
7580 | diff --git a/drivers/block/brd.c b/drivers/block/brd.c | |
7581 | index 2d7178f7754e..c1cf87718c2e 100644 | |
7582 | --- a/drivers/block/brd.c | |
7583 | +++ b/drivers/block/brd.c | |
7584 | @@ -60,7 +60,6 @@ struct brd_device { | |
e4b2b4a8 JK |
7585 | /* |
7586 | * Look up and return a brd's page for a given sector. | |
7587 | */ | |
7588 | -static DEFINE_MUTEX(brd_mutex); | |
7589 | static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) | |
7590 | { | |
7591 | pgoff_t idx; | |
b3bbd485 JK |
7592 | diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c |
7593 | index 5b8992beffec..40345483a022 100644 | |
7594 | --- a/drivers/block/zram/zcomp.c | |
7595 | +++ b/drivers/block/zram/zcomp.c | |
7596 | @@ -116,12 +116,20 @@ ssize_t zcomp_available_show(const char *comp, char *buf) | |
1a6e0f06 JK |
7597 | |
7598 | struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) | |
7599 | { | |
7600 | - return *get_cpu_ptr(comp->stream); | |
7601 | + struct zcomp_strm *zstrm; | |
7602 | + | |
e4b2b4a8 | 7603 | + zstrm = *get_local_ptr(comp->stream); |
1a6e0f06 JK |
7604 | + spin_lock(&zstrm->zcomp_lock); |
7605 | + return zstrm; | |
7606 | } | |
7607 | ||
7608 | void zcomp_stream_put(struct zcomp *comp) | |
7609 | { | |
7610 | - put_cpu_ptr(comp->stream); | |
7611 | + struct zcomp_strm *zstrm; | |
7612 | + | |
7613 | + zstrm = *this_cpu_ptr(comp->stream); | |
7614 | + spin_unlock(&zstrm->zcomp_lock); | |
e4b2b4a8 | 7615 | + put_local_ptr(zstrm); |
1a6e0f06 JK |
7616 | } |
7617 | ||
7618 | int zcomp_compress(struct zcomp_strm *zstrm, | |
b3bbd485 | 7619 | @@ -171,6 +179,7 @@ int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) |
e4b2b4a8 JK |
7620 | pr_err("Can't allocate a compression stream\n"); |
7621 | return -ENOMEM; | |
7622 | } | |
7623 | + spin_lock_init(&zstrm->zcomp_lock); | |
7624 | *per_cpu_ptr(comp->stream, cpu) = zstrm; | |
7625 | return 0; | |
7626 | } | |
b3bbd485 JK |
7627 | diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h |
7628 | index 41c1002a7d7d..d424eafcbf8e 100644 | |
7629 | --- a/drivers/block/zram/zcomp.h | |
7630 | +++ b/drivers/block/zram/zcomp.h | |
7631 | @@ -14,6 +14,7 @@ struct zcomp_strm { | |
1a6e0f06 JK |
7632 | /* compression/decompression buffer */ |
7633 | void *buffer; | |
7634 | struct crypto_comp *tfm; | |
7635 | + spinlock_t zcomp_lock; | |
7636 | }; | |
7637 | ||
7638 | /* dynamic per-device compression frontend */ | |
b3bbd485 JK |
7639 | diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c |
7640 | index 1e2648e4c286..c5d61209eb05 100644 | |
7641 | --- a/drivers/block/zram/zram_drv.c | |
7642 | +++ b/drivers/block/zram/zram_drv.c | |
7643 | @@ -761,6 +761,30 @@ static DEVICE_ATTR_RO(io_stat); | |
e4b2b4a8 JK |
7644 | static DEVICE_ATTR_RO(mm_stat); |
7645 | static DEVICE_ATTR_RO(debug_stat); | |
1a6e0f06 | 7646 | |
e4b2b4a8 JK |
7647 | +#ifdef CONFIG_PREEMPT_RT_BASE |
7648 | +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) | |
7649 | +{ | |
7650 | + size_t index; | |
7651 | + | |
7652 | + for (index = 0; index < num_pages; index++) | |
7653 | + spin_lock_init(&zram->table[index].lock); | |
7654 | +} | |
7655 | + | |
7656 | +static void zram_slot_lock(struct zram *zram, u32 index) | |
7657 | +{ | |
7658 | + spin_lock(&zram->table[index].lock); | |
7659 | + __set_bit(ZRAM_ACCESS, &zram->table[index].value); | |
7660 | +} | |
7661 | + | |
7662 | +static void zram_slot_unlock(struct zram *zram, u32 index) | |
7663 | +{ | |
7664 | + __clear_bit(ZRAM_ACCESS, &zram->table[index].value); | |
7665 | + spin_unlock(&zram->table[index].lock); | |
7666 | +} | |
7667 | + | |
7668 | +#else | |
7669 | +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } | |
1a6e0f06 | 7670 | + |
e4b2b4a8 JK |
7671 | static void zram_slot_lock(struct zram *zram, u32 index) |
7672 | { | |
7673 | bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); | |
b3bbd485 | 7674 | @@ -770,6 +794,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) |
e4b2b4a8 JK |
7675 | { |
7676 | bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); | |
7677 | } | |
7678 | +#endif | |
1a6e0f06 | 7679 | |
e4b2b4a8 JK |
7680 | static void zram_meta_free(struct zram *zram, u64 disksize) |
7681 | { | |
b3bbd485 | 7682 | @@ -799,6 +824,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) |
e4b2b4a8 JK |
7683 | return false; |
7684 | } | |
7685 | ||
7686 | + zram_meta_init_table_locks(zram, num_pages); | |
7687 | return true; | |
7688 | } | |
7689 | ||
b3bbd485 | 7690 | @@ -850,6 +876,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, |
1a6e0f06 JK |
7691 | unsigned long handle; |
7692 | unsigned int size; | |
e4b2b4a8 | 7693 | void *src, *dst; |
1a6e0f06 JK |
7694 | + struct zcomp_strm *zstrm; |
7695 | ||
e4b2b4a8 JK |
7696 | if (zram_wb_enabled(zram)) { |
7697 | zram_slot_lock(zram, index); | |
b3bbd485 | 7698 | @@ -884,6 +911,7 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, |
1a6e0f06 | 7699 | |
e4b2b4a8 | 7700 | size = zram_get_obj_size(zram, index); |
1a6e0f06 JK |
7701 | |
7702 | + zstrm = zcomp_stream_get(zram->comp); | |
e4b2b4a8 | 7703 | src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); |
1a6e0f06 | 7704 | if (size == PAGE_SIZE) { |
e4b2b4a8 | 7705 | dst = kmap_atomic(page); |
b3bbd485 | 7706 | @@ -891,14 +919,13 @@ static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index, |
e4b2b4a8 JK |
7707 | kunmap_atomic(dst); |
7708 | ret = 0; | |
1a6e0f06 JK |
7709 | } else { |
7710 | - struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); | |
e4b2b4a8 JK |
7711 | |
7712 | dst = kmap_atomic(page); | |
7713 | ret = zcomp_decompress(zstrm, src, size, dst); | |
7714 | kunmap_atomic(dst); | |
1a6e0f06 JK |
7715 | - zcomp_stream_put(zram->comp); |
7716 | } | |
e4b2b4a8 | 7717 | zs_unmap_object(zram->mem_pool, handle); |
1a6e0f06 | 7718 | + zcomp_stream_put(zram->comp); |
e4b2b4a8 | 7719 | zram_slot_unlock(zram, index); |
1a6e0f06 JK |
7720 | |
7721 | /* Should NEVER happen. Return bio error if it does. */ | |
b3bbd485 JK |
7722 | diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h |
7723 | index 31762db861e3..a417c96b8f3f 100644 | |
7724 | --- a/drivers/block/zram/zram_drv.h | |
7725 | +++ b/drivers/block/zram/zram_drv.h | |
7726 | @@ -77,6 +77,9 @@ struct zram_table_entry { | |
e4b2b4a8 JK |
7727 | unsigned long element; |
7728 | }; | |
1a6e0f06 JK |
7729 | unsigned long value; |
7730 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
7731 | + spinlock_t lock; | |
7732 | +#endif | |
7733 | }; | |
7734 | ||
7735 | struct zram_stats { | |
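bit_spin_lock() spins with preemption disabled and has no RT-aware variant, so zram's per-entry ZRAM_ACCESS bit lock cannot survive on RT. The patch embeds a real spinlock_t in each table entry (one extra word per page slot) and keeps the bit only as a status mirror. The RT lock path in miniature, with stand-in names and spin_lock_init() assumed done at table setup:

```c
#include <linux/bitops.h>
#include <linux/spinlock.h>

struct demo_table_entry {
	unsigned long value;
	spinlock_t lock;	/* RT only: replaces bit_spin_lock(ZRAM_ACCESS) */
};

#define DEMO_ACCESS_BIT	0	/* stand-in for ZRAM_ACCESS */

static void demo_slot_lock(struct demo_table_entry *e)
{
	spin_lock(&e->lock);			/* sleeping lock on RT */
	__set_bit(DEMO_ACCESS_BIT, &e->value);	/* keep the flag coherent */
}

static void demo_slot_unlock(struct demo_table_entry *e)
{
	__clear_bit(DEMO_ACCESS_BIT, &e->value);
	spin_unlock(&e->lock);
}
```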
b3bbd485 JK |
7736 | diff --git a/drivers/char/random.c b/drivers/char/random.c |
7737 | index ea4dbfa30657..c72a7f0b4494 100644 | |
7738 | --- a/drivers/char/random.c | |
7739 | +++ b/drivers/char/random.c | |
e4b2b4a8 JK |
7740 | @@ -265,6 +265,7 @@ |
7741 | #include <linux/syscalls.h> | |
7742 | #include <linux/completion.h> | |
7743 | #include <linux/uuid.h> | |
7744 | +#include <linux/locallock.h> | |
7745 | #include <crypto/chacha20.h> | |
7746 | ||
7747 | #include <asm/processor.h> | |
b3bbd485 | 7748 | @@ -856,7 +857,7 @@ static int crng_fast_load(const char *cp, size_t len) |
e4b2b4a8 JK |
7749 | invalidate_batched_entropy(); |
7750 | crng_init = 1; | |
7751 | wake_up_interruptible(&crng_init_wait); | |
7752 | - pr_notice("random: fast init done\n"); | |
7753 | + /* pr_notice("random: fast init done\n"); */ | |
7754 | } | |
7755 | return 1; | |
7756 | } | |
b3bbd485 | 7757 | @@ -941,17 +942,21 @@ static void crng_reseed(struct crng_state *crng, struct entropy_store *r) |
e4b2b4a8 JK |
7758 | crng_init = 2; |
7759 | process_random_ready_list(); | |
7760 | wake_up_interruptible(&crng_init_wait); | |
7761 | - pr_notice("random: crng init done\n"); | |
7762 | + /* pr_notice("random: crng init done\n"); */ | |
7763 | if (unseeded_warning.missed) { | |
7764 | +#if 0 | |
7765 | pr_notice("random: %d get_random_xx warning(s) missed " | |
7766 | "due to ratelimiting\n", | |
7767 | unseeded_warning.missed); | |
7768 | +#endif | |
7769 | unseeded_warning.missed = 0; | |
7770 | } | |
7771 | if (urandom_warning.missed) { | |
7772 | +#if 0 | |
7773 | pr_notice("random: %d urandom warning(s) missed " | |
7774 | "due to ratelimiting\n", | |
7775 | urandom_warning.missed); | |
7776 | +#endif | |
7777 | urandom_warning.missed = 0; | |
7778 | } | |
7779 | } | |
b3bbd485 | 7780 | @@ -1122,8 +1127,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) |
1a6e0f06 JK |
7781 | } sample; |
7782 | long delta, delta2, delta3; | |
7783 | ||
7784 | - preempt_disable(); | |
7785 | - | |
7786 | sample.jiffies = jiffies; | |
7787 | sample.cycles = random_get_entropy(); | |
7788 | sample.num = num; | |
b3bbd485 | 7789 | @@ -1164,7 +1167,6 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) |
1a6e0f06 JK |
7790 | */ |
7791 | credit_entropy_bits(r, min_t(int, fls(delta>>1), 11)); | |
7792 | } | |
7793 | - preempt_enable(); | |
7794 | } | |
7795 | ||
7796 | void add_input_randomness(unsigned int type, unsigned int code, | |
b3bbd485 | 7797 | @@ -1221,28 +1223,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) |
e4b2b4a8 | 7798 | return *ptr; |
1a6e0f06 JK |
7799 | } |
7800 | ||
7801 | -void add_interrupt_randomness(int irq, int irq_flags) | |
7802 | +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) | |
7803 | { | |
7804 | struct entropy_store *r; | |
7805 | struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); | |
7806 | - struct pt_regs *regs = get_irq_regs(); | |
7807 | unsigned long now = jiffies; | |
7808 | cycles_t cycles = random_get_entropy(); | |
7809 | __u32 c_high, j_high; | |
7810 | - __u64 ip; | |
7811 | unsigned long seed; | |
7812 | int credit = 0; | |
7813 | ||
7814 | if (cycles == 0) | |
7815 | - cycles = get_reg(fast_pool, regs); | |
7816 | + cycles = get_reg(fast_pool, NULL); | |
7817 | c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0; | |
7818 | j_high = (sizeof(now) > 4) ? now >> 32 : 0; | |
7819 | fast_pool->pool[0] ^= cycles ^ j_high ^ irq; | |
7820 | fast_pool->pool[1] ^= now ^ c_high; | |
7821 | - ip = regs ? instruction_pointer(regs) : _RET_IP_; | |
7822 | + if (!ip) | |
7823 | + ip = _RET_IP_; | |
7824 | fast_pool->pool[2] ^= ip; | |
7825 | fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 : | |
7826 | - get_reg(fast_pool, regs); | |
7827 | + get_reg(fast_pool, NULL); | |
7828 | ||
7829 | fast_mix(fast_pool); | |
7830 | add_interrupt_bench(cycles); | |
b3bbd485 | 7831 | @@ -2200,6 +2201,7 @@ static rwlock_t batched_entropy_reset_lock = __RW_LOCK_UNLOCKED(batched_entropy_ |
e4b2b4a8 JK |
7832 | * at any point prior. |
7833 | */ | |
7834 | static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u64); | |
7835 | +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u64_lock); | |
7836 | u64 get_random_u64(void) | |
7837 | { | |
7838 | u64 ret; | |
b3bbd485 | 7839 | @@ -2220,7 +2222,7 @@ u64 get_random_u64(void) |
e4b2b4a8 JK |
7840 | warn_unseeded_randomness(&previous); |
7841 | ||
7842 | use_lock = READ_ONCE(crng_init) < 2; | |
7843 | - batch = &get_cpu_var(batched_entropy_u64); | |
7844 | + batch = &get_locked_var(batched_entropy_u64_lock, batched_entropy_u64); | |
7845 | if (use_lock) | |
7846 | read_lock_irqsave(&batched_entropy_reset_lock, flags); | |
7847 | if (batch->position % ARRAY_SIZE(batch->entropy_u64) == 0) { | |
b3bbd485 | 7848 | @@ -2230,12 +2232,13 @@ u64 get_random_u64(void) |
e4b2b4a8 JK |
7849 | ret = batch->entropy_u64[batch->position++]; |
7850 | if (use_lock) | |
7851 | read_unlock_irqrestore(&batched_entropy_reset_lock, flags); | |
7852 | - put_cpu_var(batched_entropy_u64); | |
7853 | + put_locked_var(batched_entropy_u64_lock, batched_entropy_u64); | |
7854 | return ret; | |
7855 | } | |
7856 | EXPORT_SYMBOL(get_random_u64); | |
7857 | ||
7858 | static DEFINE_PER_CPU(struct batched_entropy, batched_entropy_u32); | |
7859 | +static DEFINE_LOCAL_IRQ_LOCK(batched_entropy_u32_lock); | |
7860 | u32 get_random_u32(void) | |
7861 | { | |
7862 | u32 ret; | |
b3bbd485 | 7863 | @@ -2250,7 +2253,7 @@ u32 get_random_u32(void) |
e4b2b4a8 JK |
7864 | warn_unseeded_randomness(&previous); |
7865 | ||
7866 | use_lock = READ_ONCE(crng_init) < 2; | |
7867 | - batch = &get_cpu_var(batched_entropy_u32); | |
7868 | + batch = &get_locked_var(batched_entropy_u32_lock, batched_entropy_u32); | |
7869 | if (use_lock) | |
7870 | read_lock_irqsave(&batched_entropy_reset_lock, flags); | |
7871 | if (batch->position % ARRAY_SIZE(batch->entropy_u32) == 0) { | |
b3bbd485 | 7872 | @@ -2260,7 +2263,7 @@ u32 get_random_u32(void) |
e4b2b4a8 JK |
7873 | ret = batch->entropy_u32[batch->position++]; |
7874 | if (use_lock) | |
7875 | read_unlock_irqrestore(&batched_entropy_reset_lock, flags); | |
7876 | - put_cpu_var(batched_entropy_u32); | |
7877 | + put_locked_var(batched_entropy_u32_lock, batched_entropy_u32); | |
7878 | return ret; | |
7879 | } | |
7880 | EXPORT_SYMBOL(get_random_u32); | |
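Two RT themes meet in the random.c hunks: the batched-entropy per-CPU state moves from get_cpu_var() to a local lock (the same pattern as in scompress above), and add_interrupt_randomness() grows an explicit instruction-pointer argument. On RT most interrupt handling runs in a thread, where get_irq_regs() no longer describes the interrupted context, so callers snapshot the IP while still in hard-irq context and pass it down; 0 makes the core fall back to _RET_IP_. A caller sketch against the patched three-argument prototype:

```c
#include <asm/irq_regs.h>
#include <linux/ptrace.h>
#include <linux/random.h>

static void demo_irq_entropy(int irq)
{
	/* Valid only while still in hard-irq context. */
	struct pt_regs *regs = get_irq_regs();
	u64 ip = regs ? instruction_pointer(regs) : 0;

	add_interrupt_randomness(irq, 0, ip);	/* patched 3-arg form */
}
```

This mirrors what the vmbus_isr() hunk below does for the Hyper-V callback vector.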
b3bbd485 JK |
7881 | diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c |
7882 | index 50b59a69dc33..cbdb0a6c5337 100644 | |
7883 | --- a/drivers/char/tpm/tpm_tis.c | |
7884 | +++ b/drivers/char/tpm/tpm_tis.c | |
7885 | @@ -52,6 +52,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da | |
e4b2b4a8 JK |
7886 | return container_of(data, struct tpm_tis_tcg_phy, priv); |
7887 | } | |
7888 | ||
7889 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
7890 | +/* | |
7891 | + * Flushes previous write operations to the chip so that a |
7892 | + * subsequent ioread*() won't stall a CPU. |
7893 | + */ | |
7894 | +static inline void tpm_tis_flush(void __iomem *iobase) | |
7895 | +{ | |
7896 | + ioread8(iobase + TPM_ACCESS(0)); | |
7897 | +} | |
7898 | +#else | |
7899 | +#define tpm_tis_flush(iobase) do { } while (0) | |
7900 | +#endif | |
7901 | + | |
7902 | +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) | |
7903 | +{ | |
7904 | + iowrite8(b, iobase + addr); | |
7905 | + tpm_tis_flush(iobase); | |
7906 | +} | |
7907 | + | |
7908 | +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) | |
7909 | +{ | |
7910 | + iowrite32(b, iobase + addr); | |
7911 | + tpm_tis_flush(iobase); | |
7912 | +} | |
7913 | + | |
7914 | static bool interrupts = true; | |
7915 | module_param(interrupts, bool, 0444); | |
7916 | MODULE_PARM_DESC(interrupts, "Enable interrupts"); | |
b3bbd485 | 7917 | @@ -149,7 +174,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, |
e4b2b4a8 JK |
7918 | struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); |
7919 | ||
7920 | while (len--) | |
7921 | - iowrite8(*value++, phy->iobase + addr); | |
7922 | + tpm_tis_iowrite8(*value++, phy->iobase, addr); | |
7923 | ||
7924 | return 0; | |
7925 | } | |
b3bbd485 | 7926 | @@ -176,7 +201,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) |
e4b2b4a8 JK |
7927 | { |
7928 | struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); | |
7929 | ||
7930 | - iowrite32(value, phy->iobase + addr); | |
7931 | + tpm_tis_iowrite32(value, phy->iobase, addr); | |
7932 | ||
7933 | return 0; | |
7934 | } | |
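MMIO writes are posted: they can accumulate in a write buffer until some later read forces them out, and that read then stalls for the entire drain, which hurts when it happens in a latency-critical section on RT. Reading any chip register straight after each write (TPM_ACCESS(0) in the driver) keeps every individual stall short. The idiom as a sketch, using offset 0 as a stand-in register:

```c
#include <linux/io.h>

static inline void demo_iowrite8_flushed(u8 b, void __iomem *iobase, u32 addr)
{
	iowrite8(b, iobase + addr);
	(void)ioread8(iobase + 0);	/* read-back acts as the flush */
}
```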
b3bbd485 JK |
7935 | diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c |
7936 | index 9de47d4d2d9e..05f4b88bb955 100644 | |
7937 | --- a/drivers/clocksource/tcb_clksrc.c | |
7938 | +++ b/drivers/clocksource/tcb_clksrc.c | |
e4b2b4a8 | 7939 | @@ -25,8 +25,7 @@ |
1a6e0f06 JK |
7940 | * this 32 bit free-running counter. the second channel is not used. |
7941 | * | |
7942 | * - The third channel may be used to provide a 16-bit clockevent | |
7943 | - * source, used in either periodic or oneshot mode. This runs | |
7944 | - * at 32 KiHZ, and can handle delays of up to two seconds. | |
7945 | + * source, used in either periodic or oneshot mode. | |
7946 | * | |
7947 | * A boot clocksource and clockevent source are also currently needed, | |
7948 | * unless the relevant platforms (ARM/AT91, AVR32/AT32) are changed so | |
b3bbd485 | 7949 | @@ -126,6 +125,8 @@ static struct clocksource clksrc = { |
1a6e0f06 JK |
7950 | struct tc_clkevt_device { |
7951 | struct clock_event_device clkevt; | |
7952 | struct clk *clk; | |
7953 | + bool clk_enabled; | |
7954 | + u32 freq; | |
7955 | void __iomem *regs; | |
7956 | }; | |
7957 | ||
b3bbd485 | 7958 | @@ -134,15 +135,26 @@ static struct tc_clkevt_device *to_tc_clkevt(struct clock_event_device *clkevt) |
1a6e0f06 JK |
7959 | return container_of(clkevt, struct tc_clkevt_device, clkevt); |
7960 | } | |
7961 | ||
7962 | -/* For now, we always use the 32K clock ... this optimizes for NO_HZ, | |
7963 | - * because using one of the divided clocks would usually mean the | |
7964 | - * tick rate can never be less than several dozen Hz (vs 0.5 Hz). | |
7965 | - * | |
7966 | - * A divided clock could be good for high resolution timers, since | |
7967 | - * 30.5 usec resolution can seem "low". | |
7968 | - */ | |
7969 | static u32 timer_clock; | |
7970 | ||
7971 | +static void tc_clk_disable(struct clock_event_device *d) | |
7972 | +{ | |
7973 | + struct tc_clkevt_device *tcd = to_tc_clkevt(d); | |
7974 | + | |
7975 | + clk_disable(tcd->clk); | |
7976 | + tcd->clk_enabled = false; | |
7977 | +} | |
7978 | + | |
7979 | +static void tc_clk_enable(struct clock_event_device *d) | |
7980 | +{ | |
7981 | + struct tc_clkevt_device *tcd = to_tc_clkevt(d); | |
7982 | + | |
7983 | + if (tcd->clk_enabled) | |
7984 | + return; | |
7985 | + clk_enable(tcd->clk); | |
7986 | + tcd->clk_enabled = true; | |
7987 | +} | |
7988 | + | |
7989 | static int tc_shutdown(struct clock_event_device *d) | |
7990 | { | |
7991 | struct tc_clkevt_device *tcd = to_tc_clkevt(d); | |
b3bbd485 | 7992 | @@ -150,8 +162,14 @@ static int tc_shutdown(struct clock_event_device *d) |
1a6e0f06 | 7993 | |
e4b2b4a8 JK |
7994 | writel(0xff, regs + ATMEL_TC_REG(2, IDR)); |
7995 | writel(ATMEL_TC_CLKDIS, regs + ATMEL_TC_REG(2, CCR)); | |
1a6e0f06 JK |
7996 | + return 0; |
7997 | +} | |
7998 | + | |
7999 | +static int tc_shutdown_clk_off(struct clock_event_device *d) | |
8000 | +{ | |
8001 | + tc_shutdown(d); | |
8002 | if (!clockevent_state_detached(d)) | |
8003 | - clk_disable(tcd->clk); | |
8004 | + tc_clk_disable(d); | |
8005 | ||
8006 | return 0; | |
8007 | } | |
b3bbd485 | 8008 | @@ -164,9 +182,9 @@ static int tc_set_oneshot(struct clock_event_device *d) |
1a6e0f06 JK |
8009 | if (clockevent_state_oneshot(d) || clockevent_state_periodic(d)) |
8010 | tc_shutdown(d); | |
8011 | ||
8012 | - clk_enable(tcd->clk); | |
8013 | + tc_clk_enable(d); | |
8014 | ||
8015 | - /* slow clock, count up to RC, then irq and stop */ | |
8016 | + /* count up to RC, then irq and stop */ | |
e4b2b4a8 | 8017 | writel(timer_clock | ATMEL_TC_CPCSTOP | ATMEL_TC_WAVE | |
1a6e0f06 | 8018 | ATMEL_TC_WAVESEL_UP_AUTO, regs + ATMEL_TC_REG(2, CMR)); |
e4b2b4a8 | 8019 | writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER)); |
b3bbd485 | 8020 | @@ -186,12 +204,12 @@ static int tc_set_periodic(struct clock_event_device *d) |
1a6e0f06 JK |
8021 | /* By not making the gentime core emulate periodic mode on top |
8022 | * of oneshot, we get lower overhead and improved accuracy. | |
8023 | */ | |
8024 | - clk_enable(tcd->clk); | |
8025 | + tc_clk_enable(d); | |
8026 | ||
8027 | - /* slow clock, count up to RC, then irq and restart */ | |
8028 | + /* count up to RC, then irq and restart */ | |
e4b2b4a8 | 8029 | writel(timer_clock | ATMEL_TC_WAVE | ATMEL_TC_WAVESEL_UP_AUTO, |
1a6e0f06 | 8030 | regs + ATMEL_TC_REG(2, CMR)); |
e4b2b4a8 JK |
8031 | - writel((32768 + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC)); |
8032 | + writel((tcd->freq + HZ / 2) / HZ, tcaddr + ATMEL_TC_REG(2, RC)); | |
1a6e0f06 JK |
8033 | |
8034 | /* Enable clock and interrupts on RC compare */ | |
e4b2b4a8 | 8035 | writel(ATMEL_TC_CPCS, regs + ATMEL_TC_REG(2, IER)); |
b3bbd485 | 8036 | @@ -218,9 +236,13 @@ static struct tc_clkevt_device clkevt = { |
1a6e0f06 JK |
8037 | .features = CLOCK_EVT_FEAT_PERIODIC | |
8038 | CLOCK_EVT_FEAT_ONESHOT, | |
8039 | /* Should be lower than at91rm9200's system timer */ | |
8040 | +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK | |
8041 | .rating = 125, | |
8042 | +#else | |
8043 | + .rating = 200, | |
8044 | +#endif | |
8045 | .set_next_event = tc_next_event, | |
8046 | - .set_state_shutdown = tc_shutdown, | |
8047 | + .set_state_shutdown = tc_shutdown_clk_off, | |
8048 | .set_state_periodic = tc_set_periodic, | |
8049 | .set_state_oneshot = tc_set_oneshot, | |
8050 | }, | |
b3bbd485 | 8051 | @@ -240,8 +262,9 @@ static irqreturn_t ch2_irq(int irq, void *handle) |
1a6e0f06 JK |
8052 | return IRQ_NONE; |
8053 | } | |
8054 | ||
8055 | -static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) | |
8056 | +static int __init setup_clkevents(struct atmel_tc *tc, int divisor_idx) | |
8057 | { | |
8058 | + unsigned divisor = atmel_tc_divisors[divisor_idx]; | |
8059 | int ret; | |
8060 | struct clk *t2_clk = tc->clk[2]; | |
8061 | int irq = tc->irq[2]; | |
b3bbd485 | 8062 | @@ -262,7 +285,11 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) |
1a6e0f06 JK |
8063 | clkevt.regs = tc->regs; |
8064 | clkevt.clk = t2_clk; | |
8065 | ||
8066 | - timer_clock = clk32k_divisor_idx; | |
8067 | + timer_clock = divisor_idx; | |
8068 | + if (!divisor) | |
8069 | + clkevt.freq = 32768; | |
8070 | + else | |
8071 | + clkevt.freq = clk_get_rate(t2_clk) / divisor; | |
8072 | ||
8073 | clkevt.clkevt.cpumask = cpumask_of(0); | |
8074 | ||
b3bbd485 | 8075 | @@ -273,7 +300,7 @@ static int __init setup_clkevents(struct atmel_tc *tc, int clk32k_divisor_idx) |
1a6e0f06 JK |
8076 | return ret; |
8077 | } | |
8078 | ||
8079 | - clockevents_config_and_register(&clkevt.clkevt, 32768, 1, 0xffff); | |
8080 | + clockevents_config_and_register(&clkevt.clkevt, clkevt.freq, 1, 0xffff); | |
8081 | ||
8082 | return ret; | |
8083 | } | |
b3bbd485 | 8084 | @@ -410,7 +437,11 @@ static int __init tcb_clksrc_init(void) |
1a6e0f06 JK |
8085 | goto err_disable_t1; |
8086 | ||
8087 | /* channel 2: periodic and oneshot timer support */ | |
8088 | +#ifdef CONFIG_ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK | |
8089 | ret = setup_clkevents(tc, clk32k_divisor_idx); | |
8090 | +#else | |
8091 | + ret = setup_clkevents(tc, best_divisor_idx); | |
8092 | +#endif | |
8093 | if (ret) | |
8094 | goto err_unregister_clksrc; | |
8095 | ||
b3bbd485 | 8096 | diff --git a/drivers/clocksource/timer-atmel-pit.c b/drivers/clocksource/timer-atmel-pit.c |
5dd41b01 | 8097 | index 2fab18fae4fc..98460c1bdec0 100644 |
b3bbd485 JK |
8098 | --- a/drivers/clocksource/timer-atmel-pit.c |
8099 | +++ b/drivers/clocksource/timer-atmel-pit.c | |
8100 | @@ -46,6 +46,7 @@ struct pit_data { | |
1a6e0f06 JK |
8101 | u32 cycle; |
8102 | u32 cnt; | |
8103 | unsigned int irq; | |
8104 | + bool irq_requested; | |
8105 | struct clk *mck; | |
8106 | }; | |
8107 | ||
b3bbd485 | 8108 | @@ -96,15 +97,29 @@ static int pit_clkevt_shutdown(struct clock_event_device *dev) |
1a6e0f06 JK |
8109 | |
8110 | /* disable irq, leaving the clocksource active */ | |
8111 | pit_write(data->base, AT91_PIT_MR, (data->cycle - 1) | AT91_PIT_PITEN); | |
8112 | + if (data->irq_requested) { | |
8113 | + free_irq(data->irq, data); | |
8114 | + data->irq_requested = false; | |
8115 | + } | |
8116 | return 0; | |
8117 | } | |
8118 | ||
8119 | +static irqreturn_t at91sam926x_pit_interrupt(int irq, void *dev_id); | |
8120 | /* | |
8121 | * Clockevent device: interrupts every 1/HZ (== pit_cycles * MCK/16) | |
8122 | */ | |
8123 | static int pit_clkevt_set_periodic(struct clock_event_device *dev) | |
8124 | { | |
8125 | struct pit_data *data = clkevt_to_pit_data(dev); | |
8126 | + int ret; | |
8127 | + | |
8128 | + ret = request_irq(data->irq, at91sam926x_pit_interrupt, | |
8129 | + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL, | |
8130 | + "at91_tick", data); | |
8131 | + if (ret) | |
8132 | + panic(pr_fmt("Unable to setup IRQ\n")); | |
8133 | + | |
8134 | + data->irq_requested = true; | |
8135 | ||
8136 | /* update clocksource counter */ | |
8137 | data->cnt += data->cycle * PIT_PICNT(pit_read(data->base, AT91_PIT_PIVR)); | |
5dd41b01 JK |
8138 | @@ -233,16 +248,6 @@ static int __init at91sam926x_pit_dt_init(struct device_node *node) |
8139 | goto exit; | |
1a6e0f06 JK |
8140 | } |
8141 | ||
8142 | - /* Set up irq handler */ | |
8143 | - ret = request_irq(data->irq, at91sam926x_pit_interrupt, | |
8144 | - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL, | |
8145 | - "at91_tick", data); | |
8146 | - if (ret) { | |
8147 | - pr_err("Unable to setup IRQ\n"); | |
5dd41b01 JK |
8148 | - clocksource_unregister(&data->clksrc); |
8149 | - goto exit; | |
1a6e0f06 JK |
8150 | - } |
8151 | - | |
8152 | /* Set up and register clockevents */ | |
8153 | data->clkevt.name = "pit"; | |
8154 | data->clkevt.features = CLOCK_EVT_FEAT_PERIODIC; | |
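Instead of installing the tick handler once at early init, the PIT (and, in the next file, the AT91 system timer) now requests its IRQ when the clockevent enters periodic mode and frees it again on shutdown. With RT's threaded interrupts this keeps a stale handler (and its thread) from lingering while another clockevent is in use. The request/release pairing in miniature; all names here are illustrative stand-ins:

```c
#include <linux/interrupt.h>

static int demo_irq;
static bool demo_irq_requested;

static irqreturn_t demo_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int demo_set_periodic(void *demo_data)
{
	int ret = request_irq(demo_irq, demo_handler,
			      IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL,
			      "demo_tick", demo_data);
	if (ret)
		return ret;
	demo_irq_requested = true;
	/* ... program the timer for 1/HZ periods ... */
	return 0;
}

static int demo_shutdown(void *demo_data)
{
	/* ... stop the timer, leaving any clocksource running ... */
	if (demo_irq_requested) {
		free_irq(demo_irq, demo_data);
		demo_irq_requested = false;
	}
	return 0;
}
```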
b3bbd485 JK |
8155 | diff --git a/drivers/clocksource/timer-atmel-st.c b/drivers/clocksource/timer-atmel-st.c |
8156 | index d2e660f475af..c63b96cfc23e 100644 | |
8157 | --- a/drivers/clocksource/timer-atmel-st.c | |
8158 | +++ b/drivers/clocksource/timer-atmel-st.c | |
8159 | @@ -115,18 +115,29 @@ static void clkdev32k_disable_and_flush_irq(void) | |
1a6e0f06 JK |
8160 | last_crtr = read_CRTR(); |
8161 | } | |
8162 | ||
8163 | +static int atmel_st_irq; | |
8164 | + | |
8165 | static int clkevt32k_shutdown(struct clock_event_device *evt) | |
8166 | { | |
8167 | clkdev32k_disable_and_flush_irq(); | |
8168 | irqmask = 0; | |
8169 | regmap_write(regmap_st, AT91_ST_IER, irqmask); | |
8170 | + free_irq(atmel_st_irq, regmap_st); | |
8171 | return 0; | |
8172 | } | |
8173 | ||
8174 | static int clkevt32k_set_oneshot(struct clock_event_device *dev) | |
8175 | { | |
8176 | + int ret; | |
8177 | + | |
8178 | clkdev32k_disable_and_flush_irq(); | |
8179 | ||
8180 | + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt, | |
8181 | + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL, | |
8182 | + "at91_tick", regmap_st); | |
8183 | + if (ret) | |
8184 | + panic(pr_fmt("Unable to setup IRQ\n")); | |
8185 | + | |
8186 | /* | |
8187 | * ALM for oneshot irqs, set by next_event() | |
8188 | * before 32 seconds have passed. | |
b3bbd485 | 8189 | @@ -139,8 +150,16 @@ static int clkevt32k_set_oneshot(struct clock_event_device *dev) |
1a6e0f06 JK |
8190 | |
8191 | static int clkevt32k_set_periodic(struct clock_event_device *dev) | |
8192 | { | |
8193 | + int ret; | |
8194 | + | |
8195 | clkdev32k_disable_and_flush_irq(); | |
8196 | ||
8197 | + ret = request_irq(atmel_st_irq, at91rm9200_timer_interrupt, | |
8198 | + IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL, | |
8199 | + "at91_tick", regmap_st); | |
8200 | + if (ret) | |
8201 | + panic(pr_fmt("Unable to setup IRQ\n")); | |
8202 | + | |
8203 | /* PIT for periodic irqs; fixed rate of 1/HZ */ | |
8204 | irqmask = AT91_ST_PITS; | |
8205 | regmap_write(regmap_st, AT91_ST_PIMR, timer_latch); | |
b3bbd485 | 8206 | @@ -198,7 +217,7 @@ static int __init atmel_st_timer_init(struct device_node *node) |
1a6e0f06 JK |
8207 | { |
8208 | struct clk *sclk; | |
8209 | unsigned int sclk_rate, val; | |
8210 | - int irq, ret; | |
8211 | + int ret; | |
8212 | ||
8213 | regmap_st = syscon_node_to_regmap(node); | |
8214 | if (IS_ERR(regmap_st)) { | |
b3bbd485 | 8215 | @@ -212,21 +231,12 @@ static int __init atmel_st_timer_init(struct device_node *node) |
1a6e0f06 JK |
8216 | regmap_read(regmap_st, AT91_ST_SR, &val); |
8217 | ||
8218 | /* Get the interrupts property */ | |
8219 | - irq = irq_of_parse_and_map(node, 0); | |
8220 | - if (!irq) { | |
8221 | + atmel_st_irq = irq_of_parse_and_map(node, 0); | |
8222 | + if (!atmel_st_irq) { | |
8223 | pr_err("Unable to get IRQ from DT\n"); | |
8224 | return -EINVAL; | |
8225 | } | |
8226 | ||
8227 | - /* Make IRQs happen for the system timer */ | |
8228 | - ret = request_irq(irq, at91rm9200_timer_interrupt, | |
8229 | - IRQF_SHARED | IRQF_TIMER | IRQF_IRQPOLL, | |
8230 | - "at91_tick", regmap_st); | |
8231 | - if (ret) { | |
8232 | - pr_err("Unable to setup IRQ\n"); | |
8233 | - return ret; | |
8234 | - } | |
8235 | - | |
8236 | sclk = of_clk_get(node, 0); | |
8237 | if (IS_ERR(sclk)) { | |
8238 | pr_err("Unable to get slow clock\n"); | |
b3bbd485 JK |
8239 | diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c |
8240 | index a782ce87715c..19d265948526 100644 | |
8241 | --- a/drivers/connector/cn_proc.c | |
8242 | +++ b/drivers/connector/cn_proc.c | |
1a6e0f06 JK |
8243 | @@ -32,6 +32,7 @@ |
8244 | #include <linux/pid_namespace.h> | |
8245 | ||
8246 | #include <linux/cn_proc.h> | |
8247 | +#include <linux/locallock.h> | |
8248 | ||
8249 | /* | |
8250 | * Size of a cn_msg followed by a proc_event structure. Since the | |
b3bbd485 | 8251 | @@ -54,10 +55,11 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC }; |
1a6e0f06 JK |
8252 | |
8253 | /* proc_event_counts is used as the sequence number of the netlink message */ | |
8254 | static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 }; | |
8255 | +static DEFINE_LOCAL_IRQ_LOCK(send_msg_lock); | |
8256 | ||
8257 | static inline void send_msg(struct cn_msg *msg) | |
8258 | { | |
8259 | - preempt_disable(); | |
8260 | + local_lock(send_msg_lock); | |
8261 | ||
8262 | msg->seq = __this_cpu_inc_return(proc_event_counts) - 1; | |
8263 | ((struct proc_event *)msg->data)->cpu = smp_processor_id(); | |
b3bbd485 | 8264 | @@ -70,7 +72,7 @@ static inline void send_msg(struct cn_msg *msg) |
1a6e0f06 JK |
8265 | */ |
8266 | cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT); | |
8267 | ||
8268 | - preempt_enable(); | |
8269 | + local_unlock(send_msg_lock); | |
8270 | } | |
8271 | ||
8272 | void proc_fork_connector(struct task_struct *task) | |
b3bbd485 JK |
8273 | diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 |
8274 | index 35f71825b7f3..bb4a6160d0f7 100644 | |
8275 | --- a/drivers/cpufreq/Kconfig.x86 | |
8276 | +++ b/drivers/cpufreq/Kconfig.x86 | |
8277 | @@ -125,7 +125,7 @@ config X86_POWERNOW_K7_ACPI | |
1a6e0f06 JK |
8278 | |
8279 | config X86_POWERNOW_K8 | |
8280 | tristate "AMD Opteron/Athlon64 PowerNow!" | |
8281 | - depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ | |
8282 | + depends on ACPI && ACPI_PROCESSOR && X86_ACPI_CPUFREQ && !PREEMPT_RT_BASE | |
8283 | help | |
8284 | This adds the CPUFreq driver for K8/early Opteron/Athlon64 processors. | |
8285 | Support for K10 and newer processors is now in acpi-cpufreq. | |
b3bbd485 JK |
8286 | diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c |
8287 | index c3eefa126e3b..47093745a53c 100644 | |
8288 | --- a/drivers/firmware/efi/efi.c | |
8289 | +++ b/drivers/firmware/efi/efi.c | |
8290 | @@ -74,7 +74,7 @@ static unsigned long *efi_tables[] = { | |
8291 | &efi.mem_attr_table, | |
8292 | }; | |
8293 | ||
8294 | -static bool disable_runtime; | |
8295 | +static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT_BASE); | |
8296 | static int __init setup_noefi(char *arg) | |
8297 | { | |
8298 | disable_runtime = true; | |
8299 | @@ -100,6 +100,9 @@ static int __init parse_efi_cmdline(char *str) | |
8300 | if (parse_option_str(str, "noruntime")) | |
8301 | disable_runtime = true; | |
8302 | ||
8303 | + if (parse_option_str(str, "runtime")) | |
8304 | + disable_runtime = false; | |
8305 | + | |
8306 | return 0; | |
8307 | } | |
8308 | early_param("efi", parse_efi_cmdline); | |
8309 | diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c | |
8310 | index c597ce277a04..c1108d3921f8 100644 | |
8311 | --- a/drivers/gpu/drm/i915/i915_gem_timeline.c | |
8312 | +++ b/drivers/gpu/drm/i915/i915_gem_timeline.c | |
8313 | @@ -33,11 +33,8 @@ static void __intel_timeline_init(struct intel_timeline *tl, | |
e4b2b4a8 JK |
8314 | { |
8315 | tl->fence_context = context; | |
8316 | tl->common = parent; | |
8317 | -#ifdef CONFIG_DEBUG_SPINLOCK | |
8318 | - __raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass); | |
8319 | -#else | |
8320 | spin_lock_init(&tl->lock); | |
8321 | -#endif | |
8322 | + lockdep_set_class_and_name(&tl->lock, lockclass, lockname); | |
8323 | init_request_active(&tl->last_request, NULL); | |
8324 | INIT_LIST_HEAD(&tl->requests); | |
8325 | i915_syncmap_init(&tl->sync); | |
b3bbd485 JK |
8326 | diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c |
8327 | index 20a471ad0ad2..5d34d48a8b7b 100644 | |
8328 | --- a/drivers/gpu/drm/i915/i915_irq.c | |
8329 | +++ b/drivers/gpu/drm/i915/i915_irq.c | |
8330 | @@ -867,6 +867,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, | |
1a6e0f06 JK |
8331 | spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); |
8332 | ||
8333 | /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ | |
8334 | + preempt_disable_rt(); | |
8335 | ||
8336 | /* Get optional system timestamp before query. */ | |
8337 | if (stime) | |
b3bbd485 | 8338 | @@ -918,6 +919,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, |
1a6e0f06 JK |
8339 | *etime = ktime_get(); |
8340 | ||
8341 | /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ | |
8342 | + preempt_enable_rt(); | |
8343 | ||
8344 | spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); | |
8345 | ||
b3bbd485 JK |
8346 | diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c |
8347 | index 41e31a454604..7e0cadf51b31 100644 | |
8348 | --- a/drivers/gpu/drm/i915/intel_sprite.c | |
8349 | +++ b/drivers/gpu/drm/i915/intel_sprite.c | |
e4b2b4a8 | 8350 | @@ -36,6 +36,7 @@ |
c7c16703 JK |
8351 | #include <drm/drm_rect.h> |
8352 | #include <drm/drm_atomic.h> | |
8353 | #include <drm/drm_plane_helper.h> | |
8354 | +#include <linux/locallock.h> | |
1a6e0f06 | 8355 | #include "intel_drv.h" |
c7c16703 | 8356 | #include "intel_frontbuffer.h" |
1a6e0f06 | 8357 | #include <drm/i915_drm.h> |
b3bbd485 | 8358 | @@ -67,7 +68,7 @@ int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode, |
1a6e0f06 JK |
8359 | } |
8360 | ||
e4b2b4a8 JK |
8361 | #define VBLANK_EVASION_TIME_US 100 |
8362 | - | |
1a6e0f06 | 8363 | +static DEFINE_LOCAL_IRQ_LOCK(pipe_update_lock); |
1a6e0f06 JK |
8364 | /** |
8365 | * intel_pipe_update_start() - start update of a set of display registers | |
8366 | * @crtc: the crtc of which the registers are going to be updated | |
b3bbd485 | 8367 | @@ -102,7 +103,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc) |
e4b2b4a8 | 8368 | VBLANK_EVASION_TIME_US); |
1a6e0f06 JK |
8369 | max = vblank_start - 1; |
8370 | ||
8371 | - local_irq_disable(); | |
8372 | + local_lock_irq(pipe_update_lock); | |
8373 | ||
8374 | if (min <= 0 || max <= 0) | |
8375 | return; | |
b3bbd485 | 8376 | @@ -132,11 +133,11 @@ void intel_pipe_update_start(struct intel_crtc *crtc) |
1a6e0f06 JK |
8377 | break; |
8378 | } | |
8379 | ||
8380 | - local_irq_enable(); | |
8381 | + local_unlock_irq(pipe_update_lock); | |
8382 | ||
8383 | timeout = schedule_timeout(timeout); | |
8384 | ||
8385 | - local_irq_disable(); | |
8386 | + local_lock_irq(pipe_update_lock); | |
8387 | } | |
8388 | ||
8389 | finish_wait(wq, &wait); | |
b3bbd485 | 8390 | @@ -201,7 +202,7 @@ void intel_pipe_update_end(struct intel_crtc *crtc) |
1a6e0f06 JK |
8391 | crtc->base.state->event = NULL; |
8392 | } | |
8393 | ||
8394 | - local_irq_enable(); | |
8395 | + local_unlock_irq(pipe_update_lock); | |
8396 | ||
e4b2b4a8 JK |
8397 | if (intel_vgpu_active(dev_priv)) |
8398 | return; | |
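The intel_sprite.c change is the canonical local-lock substitution from the RT series: a bare local_irq_disable() protects per-CPU state only implicitly, which breaks once spinlocks can sleep, so it becomes a named per-CPU lock from <linux/locallock.h>. That header is RT-only; on non-RT kernels the macros fall back to plain interrupt disabling. A sketch of the idiom:

    #include <linux/locallock.h>    /* provided by the RT patch set */

    static DEFINE_LOCAL_IRQ_LOCK(update_lock);

    static void pipe_update_critical(void)
    {
            /* !RT: local_irq_disable(). RT: a per-CPU rtmutex, so the
             * section stays preemptible but all users of update_lock
             * on this CPU are still serialized. */
            local_lock_irq(update_lock);

            /* ... program registers that must not be interleaved ... */

            local_unlock_irq(update_lock);
    }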
b3bbd485 JK |
8399 | diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c |
8400 | index ddfe91efa61e..3157bcf6428f 100644 | |
8401 | --- a/drivers/gpu/drm/radeon/radeon_display.c | |
8402 | +++ b/drivers/gpu/drm/radeon/radeon_display.c | |
8403 | @@ -1839,6 +1839,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, | |
1a6e0f06 JK |
8404 | struct radeon_device *rdev = dev->dev_private; |
8405 | ||
8406 | /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ | |
8407 | + preempt_disable_rt(); | |
8408 | ||
8409 | /* Get optional system timestamp before query. */ | |
8410 | if (stime) | |
b3bbd485 | 8411 | @@ -1931,6 +1932,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, |
1a6e0f06 JK |
8412 | *etime = ktime_get(); |
8413 | ||
8414 | /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ | |
8415 | + preempt_enable_rt(); | |
8416 | ||
8417 | /* Decode into vertical and horizontal scanout position. */ | |
8418 | *vpos = position & 0x1fff; | |
b3bbd485 JK |
8419 | diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h |
8420 | index 49569f8fe038..a3608cd52805 100644 | |
8421 | --- a/drivers/hv/hyperv_vmbus.h | |
8422 | +++ b/drivers/hv/hyperv_vmbus.h | |
8423 | @@ -30,6 +30,7 @@ | |
8424 | #include <linux/atomic.h> | |
8425 | #include <linux/hyperv.h> | |
8426 | #include <linux/interrupt.h> | |
8427 | +#include <linux/irq.h> | |
8428 | ||
8429 | /* | |
8430 | * Timeout for services such as KVP and fcopy. | |
8431 | diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c | |
8432 | index 2cd134dd94d2..cedf225d4182 100644 | |
8433 | --- a/drivers/hv/vmbus_drv.c | |
8434 | +++ b/drivers/hv/vmbus_drv.c | |
8435 | @@ -966,6 +966,8 @@ static void vmbus_isr(void) | |
e4b2b4a8 | 8436 | void *page_addr = hv_cpu->synic_event_page; |
1a6e0f06 JK |
8437 | struct hv_message *msg; |
8438 | union hv_synic_event_flags *event; | |
8439 | + struct pt_regs *regs = get_irq_regs(); | |
8440 | + u64 ip = regs ? instruction_pointer(regs) : 0; | |
8441 | bool handled = false; | |
8442 | ||
e4b2b4a8 | 8443 | if (unlikely(page_addr == NULL)) |
b3bbd485 | 8444 | @@ -1009,7 +1011,7 @@ static void vmbus_isr(void) |
e4b2b4a8 | 8445 | tasklet_schedule(&hv_cpu->msg_dpc); |
1a6e0f06 JK |
8446 | } |
8447 | ||
8448 | - add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0); | |
8449 | + add_interrupt_randomness(HYPERVISOR_CALLBACK_VECTOR, 0, ip); | |
8450 | } | |
8451 | ||
8452 | ||
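The vmbus change captures the interrupted instruction pointer at the top of the ISR because the RT series defers the entropy mixing, and get_irq_regs() is only meaningful while the interrupt is actually being handled. The three-argument add_interrupt_randomness() is the RT-series variant shown in the hunk. A sketch under those assumptions; MY_VECTOR is hypothetical:

    #include <asm/irq_regs.h>
    #include <asm/ptrace.h>
    #include <linux/random.h>

    #define MY_VECTOR 0xf3          /* hypothetical vector number */

    static void my_isr(void)
    {
            struct pt_regs *regs = get_irq_regs();
            /* Grab the IP now; regs is stale once the ISR returns. */
            u64 ip = regs ? instruction_pointer(regs) : 0;

            /* ... acknowledge and handle the interrupt ... */

            add_interrupt_randomness(MY_VECTOR, 0, ip);  /* RT signature */
    }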
b3bbd485 JK |
8453 | diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c |
8454 | index 36f76e28a0bf..394f142f90c7 100644 | |
8455 | --- a/drivers/ide/alim15x3.c | |
8456 | +++ b/drivers/ide/alim15x3.c | |
8457 | @@ -234,7 +234,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) | |
1a6e0f06 JK |
8458 | |
8459 | isa_dev = pci_get_device(PCI_VENDOR_ID_AL, PCI_DEVICE_ID_AL_M1533, NULL); | |
8460 | ||
8461 | - local_irq_save(flags); | |
8462 | + local_irq_save_nort(flags); | |
8463 | ||
8464 | if (m5229_revision < 0xC2) { | |
8465 | /* | |
b3bbd485 | 8466 | @@ -325,7 +325,7 @@ static int init_chipset_ali15x3(struct pci_dev *dev) |
1a6e0f06 JK |
8467 | } |
8468 | pci_dev_put(north); | |
8469 | pci_dev_put(isa_dev); | |
8470 | - local_irq_restore(flags); | |
8471 | + local_irq_restore_nort(flags); | |
8472 | return 0; | |
8473 | } | |
8474 | ||
b3bbd485 JK |
8475 | diff --git a/drivers/ide/hpt366.c b/drivers/ide/hpt366.c |
8476 | index 4b5dc0162e67..590cc7d64622 100644 | |
8477 | --- a/drivers/ide/hpt366.c | |
8478 | +++ b/drivers/ide/hpt366.c | |
8479 | @@ -1236,7 +1236,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, | |
1a6e0f06 JK |
8480 | |
8481 | dma_old = inb(base + 2); | |
8482 | ||
8483 | - local_irq_save(flags); | |
8484 | + local_irq_save_nort(flags); | |
8485 | ||
8486 | dma_new = dma_old; | |
8487 | pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); | |
b3bbd485 | 8488 | @@ -1247,7 +1247,7 @@ static int init_dma_hpt366(ide_hwif_t *hwif, |
1a6e0f06 JK |
8489 | if (dma_new != dma_old) |
8490 | outb(dma_new, base + 2); | |
8491 | ||
8492 | - local_irq_restore(flags); | |
8493 | + local_irq_restore_nort(flags); | |
8494 | ||
8495 | printk(KERN_INFO " %s: BM-DMA at 0x%04lx-0x%04lx\n", | |
8496 | hwif->name, base, base + 7); | |
b3bbd485 JK |
8497 | diff --git a/drivers/ide/ide-io-std.c b/drivers/ide/ide-io-std.c |
8498 | index 19763977568c..4169433faab5 100644 | |
8499 | --- a/drivers/ide/ide-io-std.c | |
8500 | +++ b/drivers/ide/ide-io-std.c | |
8501 | @@ -175,7 +175,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, | |
1a6e0f06 JK |
8502 | unsigned long uninitialized_var(flags); |
8503 | ||
8504 | if ((io_32bit & 2) && !mmio) { | |
8505 | - local_irq_save(flags); | |
8506 | + local_irq_save_nort(flags); | |
8507 | ata_vlb_sync(io_ports->nsect_addr); | |
8508 | } | |
8509 | ||
b3bbd485 | 8510 | @@ -186,7 +186,7 @@ void ide_input_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, |
1a6e0f06 JK |
8511 | insl(data_addr, buf, words); |
8512 | ||
8513 | if ((io_32bit & 2) && !mmio) | |
8514 | - local_irq_restore(flags); | |
8515 | + local_irq_restore_nort(flags); | |
8516 | ||
8517 | if (((len + 1) & 3) < 2) | |
8518 | return; | |
b3bbd485 | 8519 | @@ -219,7 +219,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, |
1a6e0f06 JK |
8520 | unsigned long uninitialized_var(flags); |
8521 | ||
8522 | if ((io_32bit & 2) && !mmio) { | |
8523 | - local_irq_save(flags); | |
8524 | + local_irq_save_nort(flags); | |
8525 | ata_vlb_sync(io_ports->nsect_addr); | |
8526 | } | |
8527 | ||
b3bbd485 | 8528 | @@ -230,7 +230,7 @@ void ide_output_data(ide_drive_t *drive, struct ide_cmd *cmd, void *buf, |
1a6e0f06 JK |
8529 | outsl(data_addr, buf, words); |
8530 | ||
8531 | if ((io_32bit & 2) && !mmio) | |
8532 | - local_irq_restore(flags); | |
8533 | + local_irq_restore_nort(flags); | |
8534 | ||
8535 | if (((len + 1) & 3) < 2) | |
8536 | return; | |
b3bbd485 JK |
8537 | diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c |
8538 | index 3a234701d92c..420e4e645856 100644 | |
8539 | --- a/drivers/ide/ide-io.c | |
8540 | +++ b/drivers/ide/ide-io.c | |
8541 | @@ -660,7 +660,7 @@ void ide_timer_expiry (unsigned long data) | |
8542 | /* disable_irq_nosync ?? */ | |
8543 | disable_irq(hwif->irq); | |
8544 | /* local CPU only, as if we were handling an interrupt */ | |
8545 | - local_irq_disable(); | |
8546 | + local_irq_disable_nort(); | |
8547 | if (hwif->polling) { | |
8548 | startstop = handler(drive); | |
8549 | } else if (drive_is_ready(drive)) { | |
8550 | diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c | |
8551 | index 210a0887dd29..7bf05b6147e8 100644 | |
8552 | --- a/drivers/ide/ide-iops.c | |
8553 | +++ b/drivers/ide/ide-iops.c | |
8554 | @@ -129,12 +129,12 @@ int __ide_wait_stat(ide_drive_t *drive, u8 good, u8 bad, | |
8555 | if ((stat & ATA_BUSY) == 0) | |
8556 | break; | |
8557 | ||
8558 | - local_irq_restore(flags); | |
8559 | + local_irq_restore_nort(flags); | |
8560 | *rstat = stat; | |
8561 | return -EBUSY; | |
8562 | } | |
8563 | } | |
8564 | - local_irq_restore(flags); | |
8565 | + local_irq_restore_nort(flags); | |
8566 | } | |
8567 | /* | |
8568 | * Allow status to settle, then read it again. | |
8569 | diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c | |
8570 | index eaf39e5db08b..be4c941eaa83 100644 | |
8571 | --- a/drivers/ide/ide-probe.c | |
8572 | +++ b/drivers/ide/ide-probe.c | |
8573 | @@ -196,10 +196,10 @@ static void do_identify(ide_drive_t *drive, u8 cmd, u16 *id) | |
1a6e0f06 JK |
8574 | int bswap = 1; |
8575 | ||
8576 | /* local CPU only; some systems need this */ | |
8577 | - local_irq_save(flags); | |
8578 | + local_irq_save_nort(flags); | |
8579 | /* read 512 bytes of id info */ | |
8580 | hwif->tp_ops->input_data(drive, NULL, id, SECTOR_SIZE); | |
8581 | - local_irq_restore(flags); | |
8582 | + local_irq_restore_nort(flags); | |
8583 | ||
8584 | drive->dev_flags |= IDE_DFLAG_ID_READ; | |
8585 | #ifdef DEBUG | |
b3bbd485 JK |
8586 | diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c |
8587 | index 4efe4c6e956c..7eae3aa1def7 100644 | |
8588 | --- a/drivers/ide/ide-taskfile.c | |
8589 | +++ b/drivers/ide/ide-taskfile.c | |
8590 | @@ -251,7 +251,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, | |
1a6e0f06 JK |
8591 | |
8592 | page_is_high = PageHighMem(page); | |
8593 | if (page_is_high) | |
8594 | - local_irq_save(flags); | |
8595 | + local_irq_save_nort(flags); | |
8596 | ||
8597 | buf = kmap_atomic(page) + offset; | |
8598 | ||
b3bbd485 | 8599 | @@ -272,7 +272,7 @@ void ide_pio_bytes(ide_drive_t *drive, struct ide_cmd *cmd, |
1a6e0f06 JK |
8600 | kunmap_atomic(buf); |
8601 | ||
8602 | if (page_is_high) | |
8603 | - local_irq_restore(flags); | |
8604 | + local_irq_restore_nort(flags); | |
8605 | ||
8606 | len -= nr_bytes; | |
8607 | } | |
b3bbd485 | 8608 | @@ -415,7 +415,7 @@ static ide_startstop_t pre_task_out_intr(ide_drive_t *drive, |
1a6e0f06 JK |
8609 | } |
8610 | ||
8611 | if ((drive->dev_flags & IDE_DFLAG_UNMASK) == 0) | |
8612 | - local_irq_disable(); | |
8613 | + local_irq_disable_nort(); | |
8614 | ||
8615 | ide_set_handler(drive, &task_pio_intr, WAIT_WORSTCASE); | |
8616 | ||
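All of the IDE hunks above are the same mechanical substitution: sections that disable interrupts purely for timing accuracy ("local CPU only") switch to the RT-series local_irq_save_nort()/local_irq_restore_nort()/local_irq_disable_nort() helpers. On a stock kernel they behave exactly like the stock primitives; on PREEMPT_RT they leave interrupts enabled, because interrupts-off here is a latency optimization, not a correctness requirement. A sketch, assuming those RT-only helpers:

    #include <linux/interrupt.h>
    #include <linux/io.h>

    static void timed_pio_read(void __iomem *port)
    {
            unsigned long flags;

            local_irq_save_nort(flags);     /* RT-series helper */
            readb(port);                    /* timing-sensitive PIO */
            local_irq_restore_nort(flags);
    }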
b3bbd485 JK |
8617 | diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c |
8618 | index b197e925fe36..95ac319c8e69 100644 | |
8619 | --- a/drivers/infiniband/hw/hfi1/affinity.c | |
8620 | +++ b/drivers/infiniband/hw/hfi1/affinity.c | |
8621 | @@ -593,7 +593,7 @@ int hfi1_get_proc_affinity(int node) | |
e4b2b4a8 JK |
8622 | struct hfi1_affinity_node *entry; |
8623 | cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; | |
8624 | const struct cpumask *node_mask, | |
8625 | - *proc_mask = ¤t->cpus_allowed; | |
8626 | + *proc_mask = current->cpus_ptr; | |
8627 | struct hfi1_affinity_node_list *affinity = &node_affinity; | |
8628 | struct cpu_mask_set *set = &affinity->proc; | |
8629 | ||
b3bbd485 | 8630 | @@ -601,7 +601,7 @@ int hfi1_get_proc_affinity(int node) |
e4b2b4a8 JK |
8631 | * check whether process/context affinity has already |
8632 | * been set | |
8633 | */ | |
8634 | - if (cpumask_weight(proc_mask) == 1) { | |
8635 | + if (current->nr_cpus_allowed == 1) { | |
8636 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", | |
8637 | current->pid, current->comm, | |
8638 | cpumask_pr_args(proc_mask)); | |
b3bbd485 | 8639 | @@ -612,7 +612,7 @@ int hfi1_get_proc_affinity(int node) |
e4b2b4a8 JK |
8640 | cpu = cpumask_first(proc_mask); |
8641 | cpumask_set_cpu(cpu, &set->used); | |
8642 | goto done; | |
8643 | - } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { | |
8644 | + } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { | |
8645 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", | |
8646 | current->pid, current->comm, | |
8647 | cpumask_pr_args(proc_mask)); | |
b3bbd485 JK |
8648 | diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c |
8649 | index 6781bcdb10b3..d069ad261572 100644 | |
8650 | --- a/drivers/infiniband/hw/hfi1/sdma.c | |
8651 | +++ b/drivers/infiniband/hw/hfi1/sdma.c | |
8652 | @@ -856,14 +856,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, | |
e4b2b4a8 JK |
8653 | { |
8654 | struct sdma_rht_node *rht_node; | |
8655 | struct sdma_engine *sde = NULL; | |
8656 | - const struct cpumask *current_mask = ¤t->cpus_allowed; | |
8657 | unsigned long cpu_id; | |
8658 | ||
8659 | /* | |
8660 | * To ensure that always the same sdma engine(s) will be | |
8661 | * selected make sure the process is pinned to this CPU only. | |
8662 | */ | |
8663 | - if (cpumask_weight(current_mask) != 1) | |
8664 | + if (current->nr_cpus_allowed != 1) | |
8665 | goto out; | |
8666 | ||
8667 | cpu_id = smp_processor_id(); | |
b3bbd485 JK |
8668 | diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c |
8669 | index 40efc9151ec4..12924aad90cc 100644 | |
8670 | --- a/drivers/infiniband/hw/qib/qib_file_ops.c | |
8671 | +++ b/drivers/infiniband/hw/qib/qib_file_ops.c | |
8672 | @@ -1167,7 +1167,7 @@ static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt) | |
e4b2b4a8 JK |
8673 | static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) |
8674 | { | |
8675 | struct qib_filedata *fd = fp->private_data; | |
8676 | - const unsigned int weight = cpumask_weight(¤t->cpus_allowed); | |
8677 | + const unsigned int weight = current->nr_cpus_allowed; | |
8678 | const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); | |
8679 | int local_cpu; | |
8680 | ||
b3bbd485 | 8681 | @@ -1648,9 +1648,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) |
e4b2b4a8 JK |
8682 | ret = find_free_ctxt(i_minor - 1, fp, uinfo); |
8683 | else { | |
8684 | int unit; | |
8685 | - const unsigned int cpu = cpumask_first(¤t->cpus_allowed); | |
8686 | - const unsigned int weight = | |
8687 | - cpumask_weight(¤t->cpus_allowed); | |
8688 | + const unsigned int cpu = cpumask_first(current->cpus_ptr); | |
8689 | + const unsigned int weight = current->nr_cpus_allowed; | |
8690 | ||
8691 | if (weight == 1 && !test_bit(cpu, qib_cpulist)) | |
8692 | if (!find_hca(cpu, &unit) && unit >= 0) | |
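The hfi1 and qib hunks stop taking the address of current->cpus_allowed and stop recomputing its weight: the RT series (like later mainline kernels) adds current->cpus_ptr, which always points at the currently effective mask, plus the cached current->nr_cpus_allowed, so the mask can be temporarily swapped out by migrate_disable(). Sketch:

    #include <linux/cpumask.h>
    #include <linux/sched.h>

    /* True if the caller is pinned to exactly one CPU; uses the cached
     * count instead of cpumask_weight(&current->cpus_allowed). */
    static bool task_is_pinned(void)
    {
            return current->nr_cpus_allowed == 1;
    }

    static unsigned int task_first_cpu(void)
    {
            return cpumask_first(current->cpus_ptr);
    }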
b3bbd485 JK |
8693 | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c |
8694 | index 9b3f47ae2016..8327b598d909 100644 | |
8695 | --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |
8696 | +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c | |
8697 | @@ -898,7 +898,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) | |
1a6e0f06 JK |
8698 | |
8699 | ipoib_dbg_mcast(priv, "restarting multicast task\n"); | |
8700 | ||
8701 | - local_irq_save(flags); | |
8702 | + local_irq_save_nort(flags); | |
8703 | netif_addr_lock(dev); | |
8704 | spin_lock(&priv->lock); | |
8705 | ||
b3bbd485 | 8706 | @@ -980,7 +980,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) |
1a6e0f06 JK |
8707 | |
8708 | spin_unlock(&priv->lock); | |
8709 | netif_addr_unlock(dev); | |
8710 | - local_irq_restore(flags); | |
8711 | + local_irq_restore_nort(flags); | |
8712 | ||
e4b2b4a8 JK |
8713 | ipoib_mcast_remove_list(&remove_list); |
8714 | ||
b3bbd485 JK |
8715 | diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c |
8716 | index cedc665364cd..4a4fdef151aa 100644 | |
8717 | --- a/drivers/input/gameport/gameport.c | |
8718 | +++ b/drivers/input/gameport/gameport.c | |
8719 | @@ -91,13 +91,13 @@ static int gameport_measure_speed(struct gameport *gameport) | |
e4b2b4a8 JK |
8720 | tx = ~0; |
8721 | ||
8722 | for (i = 0; i < 50; i++) { | |
8723 | - local_irq_save(flags); | |
8724 | + local_irq_save_nort(flags); | |
8725 | t1 = ktime_get_ns(); | |
8726 | for (t = 0; t < 50; t++) | |
8727 | gameport_read(gameport); | |
8728 | t2 = ktime_get_ns(); | |
8729 | t3 = ktime_get_ns(); | |
8730 | - local_irq_restore(flags); | |
8731 | + local_irq_restore_nort(flags); | |
8732 | udelay(i * 10); | |
8733 | t = (t2 - t1) - (t3 - t2); | |
8734 | if (t < tx) | |
b3bbd485 | 8735 | @@ -124,12 +124,12 @@ static int old_gameport_measure_speed(struct gameport *gameport) |
e4b2b4a8 JK |
8736 | tx = 1 << 30; |
8737 | ||
8738 | for(i = 0; i < 50; i++) { | |
8739 | - local_irq_save(flags); | |
8740 | + local_irq_save_nort(flags); | |
8741 | GET_TIME(t1); | |
8742 | for (t = 0; t < 50; t++) gameport_read(gameport); | |
8743 | GET_TIME(t2); | |
8744 | GET_TIME(t3); | |
8745 | - local_irq_restore(flags); | |
8746 | + local_irq_restore_nort(flags); | |
8747 | udelay(i * 10); | |
8748 | if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; | |
8749 | } | |
b3bbd485 | 8750 | @@ -148,11 +148,11 @@ static int old_gameport_measure_speed(struct gameport *gameport) |
e4b2b4a8 JK |
8751 | tx = 1 << 30; |
8752 | ||
8753 | for(i = 0; i < 50; i++) { | |
8754 | - local_irq_save(flags); | |
8755 | + local_irq_save_nort(flags); | |
8756 | t1 = rdtsc(); | |
8757 | for (t = 0; t < 50; t++) gameport_read(gameport); | |
8758 | t2 = rdtsc(); | |
8759 | - local_irq_restore(flags); | |
8760 | + local_irq_restore_nort(flags); | |
8761 | udelay(i * 10); | |
8762 | if (t2 - t1 < tx) tx = t2 - t1; | |
8763 | } | |
b3bbd485 | 8764 | diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c |
5dd41b01 | 8765 | index efa6cd2500b9..7d9d41f803d1 100644 |
b3bbd485 JK |
8766 | --- a/drivers/iommu/amd_iommu.c |
8767 | +++ b/drivers/iommu/amd_iommu.c | |
e4b2b4a8 JK |
8768 | @@ -81,11 +81,12 @@ |
8769 | */ | |
8770 | #define AMD_IOMMU_PGSIZES ((~0xFFFUL) & ~(2ULL << 38)) | |
8771 | ||
8772 | -static DEFINE_RWLOCK(amd_iommu_devtable_lock); | |
8773 | +static DEFINE_SPINLOCK(amd_iommu_devtable_lock); | |
8774 | +static DEFINE_SPINLOCK(pd_bitmap_lock); | |
8775 | +static DEFINE_SPINLOCK(iommu_table_lock); | |
8776 | ||
8777 | /* List of all available dev_data structures */ | |
8778 | -static LIST_HEAD(dev_data_list); | |
8779 | -static DEFINE_SPINLOCK(dev_data_list_lock); | |
8780 | +static LLIST_HEAD(dev_data_list); | |
8781 | ||
8782 | LIST_HEAD(ioapic_map); | |
8783 | LIST_HEAD(hpet_map); | |
b3bbd485 | 8784 | @@ -204,40 +205,33 @@ static struct dma_ops_domain* to_dma_ops_domain(struct protection_domain *domain |
e4b2b4a8 JK |
8785 | static struct iommu_dev_data *alloc_dev_data(u16 devid) |
8786 | { | |
8787 | struct iommu_dev_data *dev_data; | |
8788 | - unsigned long flags; | |
8789 | ||
8790 | dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL); | |
8791 | if (!dev_data) | |
8792 | return NULL; | |
8793 | ||
8794 | dev_data->devid = devid; | |
8795 | - | |
8796 | - spin_lock_irqsave(&dev_data_list_lock, flags); | |
8797 | - list_add_tail(&dev_data->dev_data_list, &dev_data_list); | |
8798 | - spin_unlock_irqrestore(&dev_data_list_lock, flags); | |
8799 | - | |
8800 | ratelimit_default_init(&dev_data->rs); | |
8801 | ||
8802 | + llist_add(&dev_data->dev_data_list, &dev_data_list); | |
8803 | return dev_data; | |
8804 | } | |
8805 | ||
8806 | static struct iommu_dev_data *search_dev_data(u16 devid) | |
8807 | { | |
8808 | struct iommu_dev_data *dev_data; | |
8809 | - unsigned long flags; | |
8810 | + struct llist_node *node; | |
b3bbd485 JK |
8811 | + |
8812 | + if (llist_empty(&dev_data_list)) | |
8813 | + return NULL; | |
e4b2b4a8 JK |
8814 | |
8815 | - spin_lock_irqsave(&dev_data_list_lock, flags); | |
8816 | - list_for_each_entry(dev_data, &dev_data_list, dev_data_list) { | |
e4b2b4a8 JK |
8817 | + node = dev_data_list.first; |
8818 | + llist_for_each_entry(dev_data, node, dev_data_list) { | |
8819 | if (dev_data->devid == devid) | |
8820 | - goto out_unlock; | |
8821 | + return dev_data; | |
8822 | } | |
8823 | ||
8824 | - dev_data = NULL; | |
8825 | - | |
8826 | -out_unlock: | |
8827 | - spin_unlock_irqrestore(&dev_data_list_lock, flags); | |
8828 | - | |
8829 | - return dev_data; | |
8830 | + return NULL; | |
8831 | } | |
8832 | ||
8833 | static int __last_alias(struct pci_dev *pdev, u16 alias, void *data) | |
5dd41b01 | 8834 | @@ -1062,9 +1056,9 @@ static int iommu_queue_command_sync(struct amd_iommu *iommu, |
e4b2b4a8 JK |
8835 | unsigned long flags; |
8836 | int ret; | |
8837 | ||
8838 | - spin_lock_irqsave(&iommu->lock, flags); | |
8839 | + raw_spin_lock_irqsave(&iommu->lock, flags); | |
8840 | ret = __iommu_queue_command_sync(iommu, cmd, sync); | |
8841 | - spin_unlock_irqrestore(&iommu->lock, flags); | |
8842 | + raw_spin_unlock_irqrestore(&iommu->lock, flags); | |
8843 | ||
8844 | return ret; | |
8845 | } | |
5dd41b01 | 8846 | @@ -1090,7 +1084,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) |
e4b2b4a8 JK |
8847 | |
8848 | build_completion_wait(&cmd, (u64)&iommu->cmd_sem); | |
8849 | ||
8850 | - spin_lock_irqsave(&iommu->lock, flags); | |
8851 | + raw_spin_lock_irqsave(&iommu->lock, flags); | |
8852 | ||
8853 | iommu->cmd_sem = 0; | |
8854 | ||
5dd41b01 | 8855 | @@ -1101,7 +1095,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) |
e4b2b4a8 JK |
8856 | ret = wait_on_sem(&iommu->cmd_sem); |
8857 | ||
8858 | out_unlock: | |
8859 | - spin_unlock_irqrestore(&iommu->lock, flags); | |
8860 | + raw_spin_unlock_irqrestore(&iommu->lock, flags); | |
8861 | ||
8862 | return ret; | |
8863 | } | |
5dd41b01 | 8864 | @@ -1610,29 +1604,26 @@ static void del_domain_from_list(struct protection_domain *domain) |
e4b2b4a8 JK |
8865 | |
8866 | static u16 domain_id_alloc(void) | |
8867 | { | |
8868 | - unsigned long flags; | |
8869 | int id; | |
8870 | ||
8871 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8872 | + spin_lock(&pd_bitmap_lock); | |
8873 | id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); | |
8874 | BUG_ON(id == 0); | |
8875 | if (id > 0 && id < MAX_DOMAIN_ID) | |
8876 | __set_bit(id, amd_iommu_pd_alloc_bitmap); | |
8877 | else | |
8878 | id = 0; | |
8879 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8880 | + spin_unlock(&pd_bitmap_lock); | |
8881 | ||
8882 | return id; | |
8883 | } | |
8884 | ||
8885 | static void domain_id_free(int id) | |
8886 | { | |
8887 | - unsigned long flags; | |
8888 | - | |
8889 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8890 | + spin_lock(&pd_bitmap_lock); | |
8891 | if (id > 0 && id < MAX_DOMAIN_ID) | |
8892 | __clear_bit(id, amd_iommu_pd_alloc_bitmap); | |
8893 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8894 | + spin_unlock(&pd_bitmap_lock); | |
8895 | } | |
8896 | ||
8897 | #define DEFINE_FREE_PT_FN(LVL, FN) \ | |
5dd41b01 | 8898 | @@ -1952,10 +1943,10 @@ static int __attach_device(struct iommu_dev_data *dev_data, |
e4b2b4a8 JK |
8899 | int ret; |
8900 | ||
8901 | /* | |
8902 | - * Must be called with IRQs disabled. Warn here to detect early | |
8903 | - * when its not. | |
8904 | + * Must be called with IRQs disabled on a non RT kernel. Warn here to | |
8905 | + * detect early when its not. | |
8906 | */ | |
8907 | - WARN_ON(!irqs_disabled()); | |
8908 | + WARN_ON_NONRT(!irqs_disabled()); | |
8909 | ||
8910 | /* lock domain */ | |
8911 | spin_lock(&domain->lock); | |
5dd41b01 | 8912 | @@ -2101,9 +2092,9 @@ static int attach_device(struct device *dev, |
e4b2b4a8 JK |
8913 | } |
8914 | ||
8915 | skip_ats_check: | |
8916 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8917 | + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8918 | ret = __attach_device(dev_data, domain); | |
8919 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8920 | + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8921 | ||
8922 | /* | |
8923 | * We might boot into a crash-kernel here. The crashed kernel | |
5dd41b01 | 8924 | @@ -2123,10 +2114,10 @@ static void __detach_device(struct iommu_dev_data *dev_data) |
e4b2b4a8 JK |
8925 | struct protection_domain *domain; |
8926 | ||
8927 | /* | |
8928 | - * Must be called with IRQs disabled. Warn here to detect early | |
8929 | - * when its not. | |
8930 | + * Must be called with IRQs disabled on a non RT kernel. Warn here to | |
8931 | + * detect early when its not. | |
8932 | */ | |
8933 | - WARN_ON(!irqs_disabled()); | |
8934 | + WARN_ON_NONRT(!irqs_disabled()); | |
8935 | ||
8936 | if (WARN_ON(!dev_data->domain)) | |
8937 | return; | |
5dd41b01 | 8938 | @@ -2153,9 +2144,9 @@ static void detach_device(struct device *dev) |
e4b2b4a8 JK |
8939 | domain = dev_data->domain; |
8940 | ||
8941 | /* lock device table */ | |
8942 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8943 | + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8944 | __detach_device(dev_data); | |
8945 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8946 | + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8947 | ||
8948 | if (!dev_is_pci(dev)) | |
8949 | return; | |
5dd41b01 | 8950 | @@ -2819,7 +2810,7 @@ static void cleanup_domain(struct protection_domain *domain) |
e4b2b4a8 JK |
8951 | struct iommu_dev_data *entry; |
8952 | unsigned long flags; | |
8953 | ||
8954 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8955 | + spin_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
8956 | ||
8957 | while (!list_empty(&domain->dev_list)) { | |
8958 | entry = list_first_entry(&domain->dev_list, | |
5dd41b01 | 8959 | @@ -2827,7 +2818,7 @@ static void cleanup_domain(struct protection_domain *domain) |
e4b2b4a8 JK |
8960 | __detach_device(entry); |
8961 | } | |
8962 | ||
8963 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8964 | + spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
8965 | } | |
8966 | ||
8967 | static void protection_domain_free(struct protection_domain *domain) | |
5dd41b01 | 8968 | @@ -3594,14 +3585,62 @@ static void set_dte_irq_entry(u16 devid, struct irq_remap_table *table) |
e4b2b4a8 JK |
8969 | amd_iommu_dev_table[devid].data[2] = dte; |
8970 | } | |
8971 | ||
8972 | -static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) | |
8973 | +static struct irq_remap_table *get_irq_table(u16 devid) | |
8974 | +{ | |
8975 | + struct irq_remap_table *table; | |
8976 | + | |
8977 | + if (WARN_ONCE(!amd_iommu_rlookup_table[devid], | |
8978 | + "%s: no iommu for devid %x\n", __func__, devid)) | |
8979 | + return NULL; | |
8980 | + | |
8981 | + table = irq_lookup_table[devid]; | |
8982 | + if (WARN_ONCE(!table, "%s: no table for devid %x\n", __func__, devid)) | |
8983 | + return NULL; | |
8984 | + | |
8985 | + return table; | |
8986 | +} | |
8987 | + | |
8988 | +static struct irq_remap_table *__alloc_irq_table(void) | |
8989 | +{ | |
8990 | + struct irq_remap_table *table; | |
8991 | + | |
8992 | + table = kzalloc(sizeof(*table), GFP_KERNEL); | |
8993 | + if (!table) | |
8994 | + return NULL; | |
8995 | + | |
8996 | + table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_KERNEL); | |
8997 | + if (!table->table) { | |
8998 | + kfree(table); | |
8999 | + return NULL; | |
9000 | + } | |
9001 | + raw_spin_lock_init(&table->lock); | |
9002 | + | |
9003 | + if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) | |
9004 | + memset(table->table, 0, | |
9005 | + MAX_IRQS_PER_TABLE * sizeof(u32)); | |
9006 | + else | |
9007 | + memset(table->table, 0, | |
9008 | + (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); | |
9009 | + return table; | |
9010 | +} | |
9011 | + | |
9012 | +static void set_remap_table_entry(struct amd_iommu *iommu, u16 devid, | |
9013 | + struct irq_remap_table *table) | |
9014 | +{ | |
9015 | + irq_lookup_table[devid] = table; | |
9016 | + set_dte_irq_entry(devid, table); | |
9017 | + iommu_flush_dte(iommu, devid); | |
9018 | +} | |
9019 | + | |
9020 | +static struct irq_remap_table *alloc_irq_table(u16 devid) | |
9021 | { | |
9022 | struct irq_remap_table *table = NULL; | |
9023 | + struct irq_remap_table *new_table = NULL; | |
9024 | struct amd_iommu *iommu; | |
9025 | unsigned long flags; | |
9026 | u16 alias; | |
9027 | ||
9028 | - write_lock_irqsave(&amd_iommu_devtable_lock, flags); | |
9029 | + spin_lock_irqsave(&iommu_table_lock, flags); | |
9030 | ||
9031 | iommu = amd_iommu_rlookup_table[devid]; | |
9032 | if (!iommu) | |
5dd41b01 | 9033 | @@ -3614,60 +3653,45 @@ static struct irq_remap_table *get_irq_table(u16 devid, bool ioapic) |
e4b2b4a8 JK |
9034 | alias = amd_iommu_alias_table[devid]; |
9035 | table = irq_lookup_table[alias]; | |
9036 | if (table) { | |
9037 | - irq_lookup_table[devid] = table; | |
9038 | - set_dte_irq_entry(devid, table); | |
9039 | - iommu_flush_dte(iommu, devid); | |
9040 | - goto out; | |
9041 | + set_remap_table_entry(iommu, devid, table); | |
9042 | + goto out_wait; | |
9043 | } | |
9044 | + spin_unlock_irqrestore(&iommu_table_lock, flags); | |
9045 | ||
9046 | /* Nothing there yet, allocate new irq remapping table */ | |
9047 | - table = kzalloc(sizeof(*table), GFP_ATOMIC); | |
9048 | - if (!table) | |
9049 | - goto out_unlock; | |
9050 | - | |
9051 | - /* Initialize table spin-lock */ | |
9052 | - spin_lock_init(&table->lock); | |
9053 | + new_table = __alloc_irq_table(); | |
9054 | + if (!new_table) | |
9055 | + return NULL; | |
9056 | ||
9057 | - if (ioapic) | |
9058 | - /* Keep the first 32 indexes free for IOAPIC interrupts */ | |
9059 | - table->min_index = 32; | |
9060 | + spin_lock_irqsave(&iommu_table_lock, flags); | |
9061 | ||
9062 | - table->table = kmem_cache_alloc(amd_iommu_irq_cache, GFP_ATOMIC); | |
9063 | - if (!table->table) { | |
9064 | - kfree(table); | |
9065 | - table = NULL; | |
9066 | + table = irq_lookup_table[devid]; | |
9067 | + if (table) | |
9068 | goto out_unlock; | |
9069 | - } | |
9070 | - | |
9071 | - if (!AMD_IOMMU_GUEST_IR_GA(amd_iommu_guest_ir)) | |
9072 | - memset(table->table, 0, | |
9073 | - MAX_IRQS_PER_TABLE * sizeof(u32)); | |
9074 | - else | |
9075 | - memset(table->table, 0, | |
9076 | - (MAX_IRQS_PER_TABLE * (sizeof(u64) * 2))); | |
b3bbd485 | 9077 | |
e4b2b4a8 JK |
9078 | - if (ioapic) { |
9079 | - int i; | |
b3bbd485 | 9080 | - |
e4b2b4a8 JK |
9081 | - for (i = 0; i < 32; ++i) |
9082 | - iommu->irte_ops->set_allocated(table, i); | |
9083 | + table = irq_lookup_table[alias]; | |
9084 | + if (table) { | |
9085 | + set_remap_table_entry(iommu, devid, table); | |
9086 | + goto out_wait; | |
9087 | } | |
9088 | ||
9089 | - irq_lookup_table[devid] = table; | |
9090 | - set_dte_irq_entry(devid, table); | |
9091 | - iommu_flush_dte(iommu, devid); | |
9092 | - if (devid != alias) { | |
9093 | - irq_lookup_table[alias] = table; | |
9094 | - set_dte_irq_entry(alias, table); | |
9095 | - iommu_flush_dte(iommu, alias); | |
9096 | - } | |
9097 | + table = new_table; | |
9098 | + new_table = NULL; | |
9099 | ||
9100 | -out: | |
9101 | + set_remap_table_entry(iommu, devid, table); | |
9102 | + if (devid != alias) | |
9103 | + set_remap_table_entry(iommu, alias, table); | |
9104 | + | |
9105 | +out_wait: | |
9106 | iommu_completion_wait(iommu); | |
9107 | ||
9108 | out_unlock: | |
9109 | - write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); | |
9110 | + spin_unlock_irqrestore(&iommu_table_lock, flags); | |
9111 | ||
9112 | + if (new_table) { | |
9113 | + kmem_cache_free(amd_iommu_irq_cache, new_table->table); | |
9114 | + kfree(new_table); | |
9115 | + } | |
9116 | return table; | |
9117 | } | |
9118 | ||
5dd41b01 | 9119 | @@ -3681,11 +3705,11 @@ static int alloc_irq_index(u16 devid, int count) |
e4b2b4a8 JK |
9120 | if (!iommu) |
9121 | return -ENODEV; | |
9122 | ||
9123 | - table = get_irq_table(devid, false); | |
9124 | + table = alloc_irq_table(devid); | |
9125 | if (!table) | |
9126 | return -ENODEV; | |
9127 | ||
9128 | - spin_lock_irqsave(&table->lock, flags); | |
9129 | + raw_spin_lock_irqsave(&table->lock, flags); | |
9130 | ||
9131 | /* Scan table for free entries */ | |
9132 | for (c = 0, index = table->min_index; | |
5dd41b01 | 9133 | @@ -3708,7 +3732,7 @@ static int alloc_irq_index(u16 devid, int count) |
e4b2b4a8 JK |
9134 | index = -ENOSPC; |
9135 | ||
9136 | out: | |
9137 | - spin_unlock_irqrestore(&table->lock, flags); | |
9138 | + raw_spin_unlock_irqrestore(&table->lock, flags); | |
9139 | ||
9140 | return index; | |
9141 | } | |
5dd41b01 | 9142 | @@ -3725,11 +3749,11 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, |
e4b2b4a8 JK |
9143 | if (iommu == NULL) |
9144 | return -EINVAL; | |
9145 | ||
9146 | - table = get_irq_table(devid, false); | |
9147 | + table = get_irq_table(devid); | |
9148 | if (!table) | |
9149 | return -ENOMEM; | |
9150 | ||
9151 | - spin_lock_irqsave(&table->lock, flags); | |
9152 | + raw_spin_lock_irqsave(&table->lock, flags); | |
9153 | ||
9154 | entry = (struct irte_ga *)table->table; | |
9155 | entry = &entry[index]; | |
5dd41b01 | 9156 | @@ -3740,7 +3764,7 @@ static int modify_irte_ga(u16 devid, int index, struct irte_ga *irte, |
e4b2b4a8 JK |
9157 | if (data) |
9158 | data->ref = entry; | |
9159 | ||
9160 | - spin_unlock_irqrestore(&table->lock, flags); | |
9161 | + raw_spin_unlock_irqrestore(&table->lock, flags); | |
9162 | ||
9163 | iommu_flush_irt(iommu, devid); | |
9164 | iommu_completion_wait(iommu); | |
5dd41b01 | 9165 | @@ -3758,13 +3782,13 @@ static int modify_irte(u16 devid, int index, union irte *irte) |
e4b2b4a8 JK |
9166 | if (iommu == NULL) |
9167 | return -EINVAL; | |
9168 | ||
9169 | - table = get_irq_table(devid, false); | |
9170 | + table = get_irq_table(devid); | |
9171 | if (!table) | |
9172 | return -ENOMEM; | |
9173 | ||
9174 | - spin_lock_irqsave(&table->lock, flags); | |
9175 | + raw_spin_lock_irqsave(&table->lock, flags); | |
9176 | table->table[index] = irte->val; | |
9177 | - spin_unlock_irqrestore(&table->lock, flags); | |
9178 | + raw_spin_unlock_irqrestore(&table->lock, flags); | |
9179 | ||
9180 | iommu_flush_irt(iommu, devid); | |
9181 | iommu_completion_wait(iommu); | |
5dd41b01 | 9182 | @@ -3782,13 +3806,13 @@ static void free_irte(u16 devid, int index) |
e4b2b4a8 JK |
9183 | if (iommu == NULL) |
9184 | return; | |
9185 | ||
9186 | - table = get_irq_table(devid, false); | |
9187 | + table = get_irq_table(devid); | |
9188 | if (!table) | |
9189 | return; | |
9190 | ||
9191 | - spin_lock_irqsave(&table->lock, flags); | |
9192 | + raw_spin_lock_irqsave(&table->lock, flags); | |
9193 | iommu->irte_ops->clear_allocated(table, index); | |
9194 | - spin_unlock_irqrestore(&table->lock, flags); | |
9195 | + raw_spin_unlock_irqrestore(&table->lock, flags); | |
9196 | ||
9197 | iommu_flush_irt(iommu, devid); | |
9198 | iommu_completion_wait(iommu); | |
5dd41b01 | 9199 | @@ -3869,10 +3893,8 @@ static void irte_ga_set_affinity(void *entry, u16 devid, u16 index, |
e4b2b4a8 JK |
9200 | u8 vector, u32 dest_apicid) |
9201 | { | |
9202 | struct irte_ga *irte = (struct irte_ga *) entry; | |
9203 | - struct iommu_dev_data *dev_data = search_dev_data(devid); | |
9204 | ||
9205 | - if (!dev_data || !dev_data->use_vapic || | |
9206 | - !irte->lo.fields_remap.guest_mode) { | |
9207 | + if (!irte->lo.fields_remap.guest_mode) { | |
9208 | irte->hi.fields.vector = vector; | |
9209 | irte->lo.fields_remap.destination = dest_apicid; | |
9210 | modify_irte_ga(devid, index, irte, NULL); | |
5dd41b01 | 9211 | @@ -4078,7 +4100,7 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, |
e4b2b4a8 JK |
9212 | struct amd_ir_data *data = NULL; |
9213 | struct irq_cfg *cfg; | |
9214 | int i, ret, devid; | |
9215 | - int index = -1; | |
9216 | + int index; | |
9217 | ||
9218 | if (!info) | |
9219 | return -EINVAL; | |
5dd41b01 | 9220 | @@ -4102,10 +4124,26 @@ static int irq_remapping_alloc(struct irq_domain *domain, unsigned int virq, |
e4b2b4a8 JK |
9221 | return ret; |
9222 | ||
9223 | if (info->type == X86_IRQ_ALLOC_TYPE_IOAPIC) { | |
9224 | - if (get_irq_table(devid, true)) | |
9225 | + struct irq_remap_table *table; | |
9226 | + struct amd_iommu *iommu; | |
9227 | + | |
9228 | + table = alloc_irq_table(devid); | |
9229 | + if (table) { | |
9230 | + if (!table->min_index) { | |
9231 | + /* | |
9232 | + * Keep the first 32 indexes free for IOAPIC | |
9233 | + * interrupts. | |
9234 | + */ | |
9235 | + table->min_index = 32; | |
9236 | + iommu = amd_iommu_rlookup_table[devid]; | |
9237 | + for (i = 0; i < 32; ++i) | |
9238 | + iommu->irte_ops->set_allocated(table, i); | |
9239 | + } | |
9240 | + WARN_ON(table->min_index != 32); | |
9241 | index = info->ioapic_pin; | |
9242 | - else | |
9243 | - ret = -ENOMEM; | |
9244 | + } else { | |
9245 | + index = -ENOMEM; | |
9246 | + } | |
9247 | } else { | |
9248 | index = alloc_irq_index(devid, nr_irqs); | |
9249 | } | |
5dd41b01 | 9250 | @@ -4349,7 +4387,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) |
e4b2b4a8 JK |
9251 | { |
9252 | unsigned long flags; | |
9253 | struct amd_iommu *iommu; | |
9254 | - struct irq_remap_table *irt; | |
9255 | + struct irq_remap_table *table; | |
9256 | struct amd_ir_data *ir_data = (struct amd_ir_data *)data; | |
9257 | int devid = ir_data->irq_2_irte.devid; | |
9258 | struct irte_ga *entry = (struct irte_ga *) ir_data->entry; | |
5dd41b01 | 9259 | @@ -4363,11 +4401,11 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) |
e4b2b4a8 JK |
9260 | if (!iommu) |
9261 | return -ENODEV; | |
9262 | ||
9263 | - irt = get_irq_table(devid, false); | |
9264 | - if (!irt) | |
9265 | + table = get_irq_table(devid); | |
9266 | + if (!table) | |
9267 | return -ENODEV; | |
9268 | ||
9269 | - spin_lock_irqsave(&irt->lock, flags); | |
9270 | + raw_spin_lock_irqsave(&table->lock, flags); | |
9271 | ||
9272 | if (ref->lo.fields_vapic.guest_mode) { | |
9273 | if (cpu >= 0) | |
5dd41b01 | 9274 | @@ -4376,7 +4414,7 @@ int amd_iommu_update_ga(int cpu, bool is_run, void *data) |
e4b2b4a8 JK |
9275 | barrier(); |
9276 | } | |
9277 | ||
9278 | - spin_unlock_irqrestore(&irt->lock, flags); | |
9279 | + raw_spin_unlock_irqrestore(&table->lock, flags); | |
9280 | ||
9281 | iommu_flush_irt(iommu, devid); | |
9282 | iommu_completion_wait(iommu); | |
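Two independent cleanups are folded into the amd_iommu.c diff above: the global dev_data list becomes a lock-free llist (the list is add-only, so readers can walk it without the old dev_data_list_lock), and the per-IOMMU and IRQ-remap-table locks become raw spinlocks. A sketch of the llist half; the struct and function names are hypothetical:

    #include <linux/llist.h>
    #include <linux/slab.h>

    struct dev_entry {
            u16 devid;
            struct llist_node node;
    };

    static LLIST_HEAD(dev_entries);         /* add-only, no lock */

    static struct dev_entry *dev_entry_alloc(u16 devid)
    {
            struct dev_entry *e = kzalloc(sizeof(*e), GFP_KERNEL);

            if (e) {
                    e->devid = devid;
                    llist_add(&e->node, &dev_entries);  /* atomic push */
            }
            return e;
    }

    static struct dev_entry *dev_entry_search(u16 devid)
    {
            struct dev_entry *e;

            /* Safe without a lock: entries are never removed. */
            llist_for_each_entry(e, dev_entries.first, node)
                    if (e->devid == devid)
                            return e;
            return NULL;
    }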
b3bbd485 JK |
9283 | diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c |
9284 | index 6fe2d0346073..e3cd81b32a33 100644 | |
9285 | --- a/drivers/iommu/amd_iommu_init.c | |
9286 | +++ b/drivers/iommu/amd_iommu_init.c | |
9287 | @@ -1474,7 +1474,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) | |
e4b2b4a8 JK |
9288 | { |
9289 | int ret; | |
9290 | ||
9291 | - spin_lock_init(&iommu->lock); | |
9292 | + raw_spin_lock_init(&iommu->lock); | |
9293 | ||
9294 | /* Add IOMMU to internal data structures */ | |
9295 | list_add_tail(&iommu->list, &amd_iommu_list); | |
b3bbd485 JK |
9296 | diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h |
9297 | index f6b24c7d8b70..16b1404da58c 100644 | |
9298 | --- a/drivers/iommu/amd_iommu_types.h | |
9299 | +++ b/drivers/iommu/amd_iommu_types.h | |
9300 | @@ -406,7 +406,7 @@ extern bool amd_iommu_iotlb_sup; | |
e4b2b4a8 JK |
9301 | #define IRQ_TABLE_ALIGNMENT 128 |
9302 | ||
9303 | struct irq_remap_table { | |
9304 | - spinlock_t lock; | |
9305 | + raw_spinlock_t lock; | |
9306 | unsigned min_index; | |
9307 | u32 *table; | |
9308 | }; | |
b3bbd485 | 9309 | @@ -488,7 +488,7 @@ struct amd_iommu { |
e4b2b4a8 JK |
9310 | int index; |
9311 | ||
9312 | /* locks the accesses to the hardware */ | |
9313 | - spinlock_t lock; | |
9314 | + raw_spinlock_t lock; | |
9315 | ||
9316 | /* Pointer to PCI device of this IOMMU */ | |
9317 | struct pci_dev *dev; | |
b3bbd485 | 9318 | @@ -625,7 +625,7 @@ struct devid_map { |
e4b2b4a8 JK |
9319 | */ |
9320 | struct iommu_dev_data { | |
9321 | struct list_head list; /* For domain->dev_list */ | |
9322 | - struct list_head dev_data_list; /* For global dev_data_list */ | |
9323 | + struct llist_node dev_data_list; /* For global dev_data_list */ | |
9324 | struct protection_domain *domain; /* Domain the device is bound to */ | |
9325 | u16 devid; /* PCI Device ID */ | |
9326 | u16 alias; /* Alias Device ID */ | |
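The type changes above are why the earlier spin_lock_irqsave() calls became raw_spin_lock_irqsave(): on PREEMPT_RT a plain spinlock_t is a sleeping rtmutex, so locks taken on genuinely atomic paths (here, hardware command queuing and IRQ remap table updates) must be raw_spinlock_t, which still busy-waits with interrupts off. A minimal sketch:

    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(hw_lock);

    static void program_hw_atomically(void)
    {
            unsigned long flags;

            /* Keep the section short and bounded: on RT this is one of
             * the few places that really runs with IRQs disabled. */
            raw_spin_lock_irqsave(&hw_lock, flags);
            /* ... write a command into the hardware queue ... */
            raw_spin_unlock_irqrestore(&hw_lock, flags);
    }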
b3bbd485 JK |
9327 | diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c |
9328 | index 33edfa794ae9..b30900025c62 100644 | |
9329 | --- a/drivers/iommu/iova.c | |
9330 | +++ b/drivers/iommu/iova.c | |
9331 | @@ -570,7 +570,7 @@ void queue_iova(struct iova_domain *iovad, | |
e4b2b4a8 JK |
9332 | unsigned long pfn, unsigned long pages, |
9333 | unsigned long data) | |
9334 | { | |
9335 | - struct iova_fq *fq = get_cpu_ptr(iovad->fq); | |
9336 | + struct iova_fq *fq = raw_cpu_ptr(iovad->fq); | |
9337 | unsigned long flags; | |
9338 | unsigned idx; | |
9339 | ||
b3bbd485 | 9340 | @@ -600,8 +600,6 @@ void queue_iova(struct iova_domain *iovad, |
e4b2b4a8 JK |
9341 | if (atomic_cmpxchg(&iovad->fq_timer_on, 0, 1) == 0) |
9342 | mod_timer(&iovad->fq_timer, | |
9343 | jiffies + msecs_to_jiffies(IOVA_FQ_TIMEOUT)); | |
9344 | - | |
9345 | - put_cpu_ptr(iovad->fq); | |
9346 | } | |
9347 | EXPORT_SYMBOL_GPL(queue_iova); | |
9348 | ||
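queue_iova() drops get_cpu_ptr()/put_cpu_ptr(), which disable preemption across the whole function and are therefore illegal around a sleeping lock on RT, in favor of raw_cpu_ptr(). Migrating CPUs after the lookup is harmless because every access to the queue is made under the queue's own lock; the per-CPU split is a locality optimization, not a correctness requirement. Sketch:

    #include <linux/percpu.h>
    #include <linux/spinlock.h>

    struct flush_queue {
            spinlock_t lock;
            unsigned int fill;
    };

    static DEFINE_PER_CPU(struct flush_queue, flush_queues);

    static void queue_entry(void)
    {
            /* No preempt_disable(): worst case we enqueue on the queue
             * of the CPU we just left, under its lock. */
            struct flush_queue *fq = raw_cpu_ptr(&flush_queues);
            unsigned long flags;

            spin_lock_irqsave(&fq->lock, flags);
            fq->fill++;
            spin_unlock_irqrestore(&fq->lock, flags);
    }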
b3bbd485 JK |
9349 | diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c |
9350 | index 2ea39a83737f..a3e23d0fc4af 100644 | |
9351 | --- a/drivers/irqchip/irq-gic-v3-its.c | |
9352 | +++ b/drivers/irqchip/irq-gic-v3-its.c | |
9353 | @@ -148,7 +148,7 @@ static struct { | |
9354 | } vpe_proxy; | |
9355 | ||
9356 | static LIST_HEAD(its_nodes); | |
9357 | -static DEFINE_SPINLOCK(its_lock); | |
9358 | +static DEFINE_RAW_SPINLOCK(its_lock); | |
9359 | static struct rdists *gic_rdists; | |
9360 | static struct irq_domain *its_parent; | |
9361 | ||
9362 | @@ -165,6 +165,7 @@ static DEFINE_RAW_SPINLOCK(vmovp_lock); | |
9363 | static DEFINE_IDA(its_vpeid_ida); | |
9364 | ||
9365 | #define gic_data_rdist() (raw_cpu_ptr(gic_rdists->rdist)) | |
9366 | +#define gic_data_rdist_cpu(cpu) (per_cpu_ptr(gic_rdists->rdist, cpu)) | |
9367 | #define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base) | |
9368 | #define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() + SZ_128K) | |
9369 | ||
9370 | @@ -1432,7 +1433,7 @@ static void its_free_prop_table(struct page *prop_page) | |
9371 | get_order(LPI_PROPBASE_SZ)); | |
9372 | } | |
9373 | ||
9374 | -static int __init its_alloc_lpi_tables(void) | |
9375 | +static int __init its_alloc_lpi_prop_table(void) | |
9376 | { | |
9377 | phys_addr_t paddr; | |
9378 | ||
9379 | @@ -1758,30 +1759,47 @@ static void its_free_pending_table(struct page *pt) | |
9380 | get_order(max_t(u32, LPI_PENDBASE_SZ, SZ_64K))); | |
9381 | } | |
9382 | ||
9383 | -static void its_cpu_init_lpis(void) | |
9384 | +static int __init allocate_lpi_tables(void) | |
9385 | { | |
9386 | - void __iomem *rbase = gic_data_rdist_rd_base(); | |
9387 | - struct page *pend_page; | |
9388 | - u64 val, tmp; | |
9389 | + int err, cpu; | |
9390 | ||
9391 | - /* If we didn't allocate the pending table yet, do it now */ | |
9392 | - pend_page = gic_data_rdist()->pend_page; | |
9393 | - if (!pend_page) { | |
9394 | - phys_addr_t paddr; | |
9395 | + err = its_alloc_lpi_prop_table(); | |
9396 | + if (err) | |
9397 | + return err; | |
9398 | + | |
9399 | + /* | |
9400 | + * We allocate all the pending tables anyway, as we may have a | |
9401 | + * mix of RDs that have had LPIs enabled, and some that | |
9402 | + * don't. We'll free the unused ones as each CPU comes online. | |
9403 | + */ | |
9404 | + for_each_possible_cpu(cpu) { | |
9405 | + struct page *pend_page; | |
9406 | ||
9407 | pend_page = its_allocate_pending_table(GFP_NOWAIT); | |
9408 | if (!pend_page) { | |
9409 | - pr_err("Failed to allocate PENDBASE for CPU%d\n", | |
9410 | - smp_processor_id()); | |
9411 | - return; | |
9412 | + pr_err("Failed to allocate PENDBASE for CPU%d\n", cpu); | |
9413 | + return -ENOMEM; | |
9414 | } | |
9415 | ||
9416 | - paddr = page_to_phys(pend_page); | |
9417 | - pr_info("CPU%d: using LPI pending table @%pa\n", | |
9418 | - smp_processor_id(), &paddr); | |
9419 | - gic_data_rdist()->pend_page = pend_page; | |
9420 | + gic_data_rdist_cpu(cpu)->pend_page = pend_page; | |
9421 | } | |
9422 | ||
9423 | + return 0; | |
9424 | +} | |
9425 | + | |
9426 | +static void its_cpu_init_lpis(void) | |
9427 | +{ | |
9428 | + void __iomem *rbase = gic_data_rdist_rd_base(); | |
9429 | + struct page *pend_page; | |
9430 | + phys_addr_t paddr; | |
9431 | + u64 val, tmp; | |
9432 | + | |
9433 | + if (gic_data_rdist()->lpi_enabled) | |
9434 | + return; | |
9435 | + | |
9436 | + pend_page = gic_data_rdist()->pend_page; | |
9437 | + paddr = page_to_phys(pend_page); | |
9438 | + | |
9439 | /* Disable LPIs */ | |
9440 | val = readl_relaxed(rbase + GICR_CTLR); | |
9441 | val &= ~GICR_CTLR_ENABLE_LPIS; | |
9442 | @@ -1843,6 +1861,10 @@ static void its_cpu_init_lpis(void) | |
9443 | ||
9444 | /* Make sure the GIC has seen the above */ | |
9445 | dsb(sy); | |
9446 | + gic_data_rdist()->lpi_enabled = true; | |
9447 | + pr_info("GICv3: CPU%d: using LPI pending table @%pa\n", | |
9448 | + smp_processor_id(), | |
9449 | + &paddr); | |
9450 | } | |
9451 | ||
9452 | static void its_cpu_init_collection(void) | |
9453 | @@ -1850,7 +1872,7 @@ static void its_cpu_init_collection(void) | |
9454 | struct its_node *its; | |
9455 | int cpu; | |
9456 | ||
9457 | - spin_lock(&its_lock); | |
9458 | + raw_spin_lock(&its_lock); | |
9459 | cpu = smp_processor_id(); | |
9460 | ||
9461 | list_for_each_entry(its, &its_nodes, entry) { | |
9462 | @@ -1892,7 +1914,7 @@ static void its_cpu_init_collection(void) | |
9463 | its_send_invall(its, &its->collections[cpu]); | |
9464 | } | |
9465 | ||
9466 | - spin_unlock(&its_lock); | |
9467 | + raw_spin_unlock(&its_lock); | |
9468 | } | |
9469 | ||
9470 | static struct its_device *its_find_device(struct its_node *its, u32 dev_id) | |
9471 | @@ -3041,9 +3063,9 @@ static int __init its_probe_one(struct resource *res, | |
9472 | if (err) | |
9473 | goto out_free_tables; | |
9474 | ||
9475 | - spin_lock(&its_lock); | |
9476 | + raw_spin_lock(&its_lock); | |
9477 | list_add(&its->entry, &its_nodes); | |
9478 | - spin_unlock(&its_lock); | |
9479 | + raw_spin_unlock(&its_lock); | |
9480 | ||
9481 | return 0; | |
9482 | ||
9483 | @@ -3278,7 +3300,8 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists, | |
9484 | } | |
9485 | ||
9486 | gic_rdists = rdists; | |
9487 | - err = its_alloc_lpi_tables(); | |
9488 | + | |
9489 | + err = allocate_lpi_tables(); | |
9490 | if (err) | |
9491 | return err; | |
9492 | ||
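Besides turning its_lock into a raw spinlock, the GICv3-ITS rework above moves the LPI pending-table allocations out of the per-CPU bring-up path: allocating for every possible CPU at init time means its_cpu_init_lpis() runs allocation-free, which matters because it can be called with interrupts off and, on RT, must not take sleeping allocator locks. A sketch of that pre-allocation shape; pend_page here is a hypothetical mirror of the redistributor field:

    #include <linux/cpumask.h>
    #include <linux/gfp.h>
    #include <linux/percpu.h>

    static DEFINE_PER_CPU(struct page *, pend_page);

    static int __init alloc_all_pending_tables(void)
    {
            int cpu;

            /* Boot CPU, process context: GFP_KERNEL is fine here,
             * unlike in the secondary-CPU init path. */
            for_each_possible_cpu(cpu) {
                    struct page *p = alloc_pages(GFP_KERNEL, 0);

                    if (!p)
                            return -ENOMEM;
                    per_cpu(pend_page, cpu) = p;
            }
            return 0;
    }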
9493 | diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig | |
9494 | index 3f9ddb9fafa7..09da5b6b44a1 100644 | |
9495 | --- a/drivers/leds/trigger/Kconfig | |
9496 | +++ b/drivers/leds/trigger/Kconfig | |
9497 | @@ -69,7 +69,7 @@ config LEDS_TRIGGER_BACKLIGHT | |
e4b2b4a8 JK |
9498 | |
9499 | config LEDS_TRIGGER_CPU | |
9500 | bool "LED CPU Trigger" | |
9501 | - depends on LEDS_TRIGGERS | |
9502 | + depends on LEDS_TRIGGERS && !PREEMPT_RT_BASE | |
9503 | help | |
9504 | This allows LEDs to be controlled by active CPUs. This shows | |
9505 | the active CPUs across an array of LEDs so you can see which | |
b3bbd485 JK |
9506 | diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig |
9507 | index 4d200883c505..98b64ed5cb81 100644 | |
9508 | --- a/drivers/md/bcache/Kconfig | |
9509 | +++ b/drivers/md/bcache/Kconfig | |
e4b2b4a8 JK |
9510 | @@ -1,6 +1,7 @@ |
9511 | ||
9512 | config BCACHE | |
9513 | tristate "Block device as cache" | |
9514 | + depends on !PREEMPT_RT_FULL | |
9515 | ---help--- | |
9516 | Allows a block device to be used as cache for other devices; uses | |
9517 | a btree for indexing and the layout is optimized for SSDs. | |
b3bbd485 JK |
9518 | diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c |
9519 | index eadfcfd106ff..8824aeda85cf 100644 | |
9520 | --- a/drivers/md/dm-rq.c | |
9521 | +++ b/drivers/md/dm-rq.c | |
9522 | @@ -671,7 +671,7 @@ static void dm_old_request_fn(struct request_queue *q) | |
e4b2b4a8 JK |
9523 | /* Establish tio->ti before queuing work (map_tio_request) */ |
9524 | tio->ti = ti; | |
9525 | kthread_queue_work(&md->kworker, &tio->work); | |
9526 | - BUG_ON(!irqs_disabled()); | |
9527 | + BUG_ON_NONRT(!irqs_disabled()); | |
9528 | } | |
9529 | } | |
9530 | ||
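BUG_ON_NONRT() here, like WARN_ON_NONRT() in the amd_iommu hunks, is an RT-series assertion macro: it checks the condition only on non-RT kernels, because invariants such as "interrupts are disabled under this spinlock" are deliberately relaxed on RT, where spinlocks no longer disable interrupts. A sketch, assuming those RT-only macros:

    #include <linux/bug.h>
    #include <linux/irqflags.h>

    static void must_run_irqs_off(void)
    {
            /* On !RT the caller holds an irq-disabling spinlock, so this
             * must hold; on RT the check would be a false positive and
             * compiles away. */
            BUG_ON_NONRT(!irqs_disabled());     /* RT-series macro */

            /* ... */
    }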
b3bbd485 | 9531 | diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c |
5dd41b01 | 9532 | index dbf51b4c21b3..5cfccaf87687 100644 |
b3bbd485 JK |
9533 | --- a/drivers/md/raid5.c |
9534 | +++ b/drivers/md/raid5.c | |
9535 | @@ -410,7 +410,7 @@ void raid5_release_stripe(struct stripe_head *sh) | |
e4b2b4a8 JK |
9536 | md_wakeup_thread(conf->mddev->thread); |
9537 | return; | |
9538 | slow_path: | |
9539 | - local_irq_save(flags); | |
9540 | + local_irq_save_nort(flags); | |
9541 | /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ | |
9542 | if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { | |
9543 | INIT_LIST_HEAD(&list); | |
b3bbd485 | 9544 | @@ -419,7 +419,7 @@ void raid5_release_stripe(struct stripe_head *sh) |
e4b2b4a8 JK |
9545 | spin_unlock(&conf->device_lock); |
9546 | release_inactive_stripe_list(conf, &list, hash); | |
9547 | } | |
9548 | - local_irq_restore(flags); | |
9549 | + local_irq_restore_nort(flags); | |
9550 | } | |
9551 | ||
9552 | static inline void remove_hash(struct stripe_head *sh) | |
b3bbd485 | 9553 | @@ -2067,8 +2067,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
e4b2b4a8 JK |
9554 | struct raid5_percpu *percpu; |
9555 | unsigned long cpu; | |
9556 | ||
9557 | - cpu = get_cpu(); | |
9558 | + cpu = get_cpu_light(); | |
9559 | percpu = per_cpu_ptr(conf->percpu, cpu); | |
9560 | + spin_lock(&percpu->lock); | |
9561 | if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { | |
9562 | ops_run_biofill(sh); | |
9563 | overlap_clear++; | |
b3bbd485 | 9564 | @@ -2127,7 +2128,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) |
e4b2b4a8 JK |
9565 | if (test_and_clear_bit(R5_Overlap, &dev->flags)) |
9566 | wake_up(&sh->raid_conf->wait_for_overlap); | |
9567 | } | |
9568 | - put_cpu(); | |
9569 | + spin_unlock(&percpu->lock); | |
9570 | + put_cpu_light(); | |
9571 | } | |
9572 | ||
9573 | static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) | |
b3bbd485 | 9574 | @@ -6781,6 +6783,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) |
e4b2b4a8 JK |
9575 | __func__, cpu); |
9576 | return -ENOMEM; | |
9577 | } | |
9578 | + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); | |
9579 | return 0; | |
9580 | } | |
9581 | ||
b3bbd485 | 9582 | @@ -6791,7 +6794,6 @@ static int raid5_alloc_percpu(struct r5conf *conf) |
e4b2b4a8 JK |
9583 | conf->percpu = alloc_percpu(struct raid5_percpu); |
9584 | if (!conf->percpu) | |
9585 | return -ENOMEM; | |
9586 | - | |
9587 | err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); | |
9588 | if (!err) { | |
9589 | conf->scribble_disks = max(conf->raid_disks, | |
b3bbd485 JK |
9590 | diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h |
9591 | index 2e6123825095..37a6021418a2 100644 | |
9592 | --- a/drivers/md/raid5.h | |
9593 | +++ b/drivers/md/raid5.h | |
9594 | @@ -624,6 +624,7 @@ struct r5conf { | |
e4b2b4a8 JK |
9595 | int recovery_disabled; |
9596 | /* per cpu variables */ | |
9597 | struct raid5_percpu { | |
9598 | + spinlock_t lock; /* Protection for -RT */ | |
9599 | struct page *spare_page; /* Used when checking P/Q in raid6 */ | |
9600 | struct flex_array *scribble; /* space for constructing buffer | |
9601 | * lists and performing address | |
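The raid5 change is a recurring RT pattern for per-CPU scratch data: get_cpu() disables preemption for the whole, potentially long raid_run_ops() call, so it becomes the RT-series get_cpu_light(), which only pins the task to its CPU, and exclusive access to the scratch area is provided instead by the new spinlock in struct raid5_percpu. A sketch under those assumptions:

    #include <linux/percpu.h>
    #include <linux/spinlock.h>

    struct percpu_scratch {
            spinlock_t lock;        /* serializes users on this CPU */
            void *buffer;
    };

    static DEFINE_PER_CPU(struct percpu_scratch, scratch);

    static void run_long_ops(void)
    {
            int cpu = get_cpu_light();  /* RT-series: pin, stay preemptible */
            struct percpu_scratch *p = per_cpu_ptr(&scratch, cpu);

            spin_lock(&p->lock);        /* may sleep on RT; that is fine */
            /* ... long-running work on p->buffer ... */
            spin_unlock(&p->lock);
            put_cpu_light();
    }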
b3bbd485 JK |
9602 | diff --git a/drivers/mfd/atmel-smc.c b/drivers/mfd/atmel-smc.c |
9603 | index 7d77948567d7..0adbd2e796fe 100644 | |
9604 | --- a/drivers/mfd/atmel-smc.c | |
9605 | +++ b/drivers/mfd/atmel-smc.c | |
e4b2b4a8 JK |
9606 | @@ -12,6 +12,7 @@ |
9607 | */ | |
9608 | ||
9609 | #include <linux/mfd/syscon/atmel-smc.h> | |
9610 | +#include <linux/string.h> | |
9611 | ||
9612 | /** | |
9613 | * atmel_smc_cs_conf_init - initialize a SMC CS conf | |
b3bbd485 JK |
9614 | diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig |
9615 | index 8136dc7e863d..86e83b9629d7 100644 | |
9616 | --- a/drivers/misc/Kconfig | |
9617 | +++ b/drivers/misc/Kconfig | |
9618 | @@ -54,6 +54,7 @@ config AD525X_DPOT_SPI | |
e4b2b4a8 JK |
9619 | config ATMEL_TCLIB |
9620 | bool "Atmel AT32/AT91 Timer/Counter Library" | |
9621 | depends on (AVR32 || ARCH_AT91) | |
9622 | + default y if PREEMPT_RT_FULL | |
9623 | help | |
9624 | Select this if you want a library to allocate the Timer/Counter | |
9625 | blocks found on many Atmel processors. This facilitates using | |
b3bbd485 | 9626 | @@ -69,8 +70,7 @@ config ATMEL_TCB_CLKSRC |
e4b2b4a8 JK |
9627 | are combined to make a single 32-bit timer. |
9628 | ||
9629 | When GENERIC_CLOCKEVENTS is defined, the third timer channel | |
9630 | - may be used as a clock event device supporting oneshot mode | |
9631 | - (delays of up to two seconds) based on the 32 KiHz clock. | |
9632 | + may be used as a clock event device supporting oneshot mode. | |
9633 | ||
9634 | config ATMEL_TCB_CLKSRC_BLOCK | |
9635 | int | |
b3bbd485 | 9636 | @@ -84,6 +84,15 @@ config ATMEL_TCB_CLKSRC_BLOCK |
e4b2b4a8 JK |
9637 | TC can be used for other purposes, such as PWM generation and |
9638 | interval timing. | |
9639 | ||
9640 | +config ATMEL_TCB_CLKSRC_USE_SLOW_CLOCK | |
9641 | + bool "TC Block use 32 KiHz clock" | |
9642 | + depends on ATMEL_TCB_CLKSRC | |
9643 | + default y if !PREEMPT_RT_FULL | |
9644 | + help | |
9645 | + Select this to use 32 KiHz base clock rate as TC block clock | |
9646 | + source for clock events. | |
9647 | + | |
9648 | + | |
9649 | config DUMMY_IRQ | |
9650 | tristate "Dummy IRQ handler" | |
9651 | default n | |
b3bbd485 JK |
9652 | diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c |
9653 | index f1f54a818489..ce102378df02 100644 | |
9654 | --- a/drivers/mmc/host/mmci.c | |
9655 | +++ b/drivers/mmc/host/mmci.c | |
9656 | @@ -1200,15 +1200,12 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) | |
e4b2b4a8 JK |
9657 | struct sg_mapping_iter *sg_miter = &host->sg_miter; |
9658 | struct variant_data *variant = host->variant; | |
9659 | void __iomem *base = host->base; | |
9660 | - unsigned long flags; | |
9661 | u32 status; | |
9662 | ||
9663 | status = readl(base + MMCISTATUS); | |
9664 | ||
9665 | dev_dbg(mmc_dev(host->mmc), "irq1 (pio) %08x\n", status); | |
9666 | ||
9667 | - local_irq_save(flags); | |
9668 | - | |
9669 | do { | |
9670 | unsigned int remain, len; | |
9671 | char *buffer; | |
b3bbd485 | 9672 | @@ -1248,8 +1245,6 @@ static irqreturn_t mmci_pio_irq(int irq, void *dev_id) |
e4b2b4a8 JK |
9673 | |
9674 | sg_miter_stop(sg_miter); | |
9675 | ||
9676 | - local_irq_restore(flags); | |
9677 | - | |
9678 | /* | |
9679 | * If we have less than the fifo 'half-full' threshold to transfer, | |
9680 | * trigger a PIO interrupt as soon as any data is available. | |
b3bbd485 JK |
9681 | diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c |
9682 | index 402d9090ad29..9bc02563b853 100644 | |
9683 | --- a/drivers/net/ethernet/3com/3c59x.c | |
9684 | +++ b/drivers/net/ethernet/3com/3c59x.c | |
9685 | @@ -842,9 +842,9 @@ static void poll_vortex(struct net_device *dev) | |
e4b2b4a8 JK |
9686 | { |
9687 | struct vortex_private *vp = netdev_priv(dev); | |
9688 | unsigned long flags; | |
9689 | - local_irq_save(flags); | |
9690 | + local_irq_save_nort(flags); | |
9691 | (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev); | |
9692 | - local_irq_restore(flags); | |
9693 | + local_irq_restore_nort(flags); | |
9694 | } | |
9695 | #endif | |
9696 | ||
b3bbd485 | 9697 | @@ -1908,12 +1908,12 @@ static void vortex_tx_timeout(struct net_device *dev) |
e4b2b4a8 JK |
9698 | * Block interrupts because vortex_interrupt does a bare spin_lock() |
9699 | */ | |
9700 | unsigned long flags; | |
9701 | - local_irq_save(flags); | |
9702 | + local_irq_save_nort(flags); | |
9703 | if (vp->full_bus_master_tx) | |
9704 | boomerang_interrupt(dev->irq, dev); | |
9705 | else | |
9706 | vortex_interrupt(dev->irq, dev); | |
9707 | - local_irq_restore(flags); | |
9708 | + local_irq_restore_nort(flags); | |
9709 | } | |
9710 | } | |
9711 | ||
b3bbd485 | 9712 | diff --git a/drivers/net/ethernet/marvell/mvpp2.c b/drivers/net/ethernet/marvell/mvpp2.c |
5dd41b01 | 9713 | index 00e6f1d155a6..9c69ab2c5b07 100644 |
b3bbd485 JK |
9714 | --- a/drivers/net/ethernet/marvell/mvpp2.c |
9715 | +++ b/drivers/net/ethernet/marvell/mvpp2.c | |
5dd41b01 | 9716 | @@ -831,9 +831,8 @@ struct mvpp2_pcpu_stats { |
e4b2b4a8 JK |
9717 | /* Per-CPU port control */ |
9718 | struct mvpp2_port_pcpu { | |
9719 | struct hrtimer tx_done_timer; | |
9720 | + struct net_device *dev; | |
9721 | bool timer_scheduled; | |
9722 | - /* Tasklet for egress finalization */ | |
9723 | - struct tasklet_struct tx_done_tasklet; | |
9724 | }; | |
9725 | ||
9726 | struct mvpp2_queue_vector { | |
5dd41b01 | 9727 | @@ -5955,46 +5954,34 @@ static void mvpp2_link_event(struct net_device *dev) |
e4b2b4a8 JK |
9728 | } |
9729 | } | |
9730 | ||
9731 | -static void mvpp2_timer_set(struct mvpp2_port_pcpu *port_pcpu) | |
9732 | -{ | |
9733 | - ktime_t interval; | |
9734 | - | |
9735 | - if (!port_pcpu->timer_scheduled) { | |
9736 | - port_pcpu->timer_scheduled = true; | |
9737 | - interval = MVPP2_TXDONE_HRTIMER_PERIOD_NS; | |
9738 | - hrtimer_start(&port_pcpu->tx_done_timer, interval, | |
9739 | - HRTIMER_MODE_REL_PINNED); | |
9740 | - } | |
9741 | -} | |
9742 | - | |
9743 | -static void mvpp2_tx_proc_cb(unsigned long data) | |
9744 | +static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer) | |
9745 | { | |
9746 | - struct net_device *dev = (struct net_device *)data; | |
9747 | - struct mvpp2_port *port = netdev_priv(dev); | |
9748 | - struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu); | |
9749 | + struct net_device *dev; | |
9750 | + struct mvpp2_port *port; | |
9751 | + struct mvpp2_port_pcpu *port_pcpu; | |
9752 | unsigned int tx_todo, cause; | |
9753 | ||
9754 | + port_pcpu = container_of(timer, struct mvpp2_port_pcpu, tx_done_timer); | |
9755 | + dev = port_pcpu->dev; | |
9756 | + | |
9757 | if (!netif_running(dev)) | |
9758 | - return; | |
9759 | + return HRTIMER_NORESTART; | |
9760 | + | |
9761 | port_pcpu->timer_scheduled = false; | |
9762 | + port = netdev_priv(dev); | |
9763 | ||
9764 | /* Process all the Tx queues */ | |
9765 | cause = (1 << port->ntxqs) - 1; | |
9766 | tx_todo = mvpp2_tx_done(port, cause, smp_processor_id()); | |
9767 | ||
9768 | /* Set the timer in case not all the packets were processed */ | |
9769 | - if (tx_todo) | |
9770 | - mvpp2_timer_set(port_pcpu); | |
9771 | -} | |
9772 | - | |
9773 | -static enum hrtimer_restart mvpp2_hr_timer_cb(struct hrtimer *timer) | |
9774 | -{ | |
9775 | - struct mvpp2_port_pcpu *port_pcpu = container_of(timer, | |
9776 | - struct mvpp2_port_pcpu, | |
9777 | - tx_done_timer); | |
9778 | - | |
9779 | - tasklet_schedule(&port_pcpu->tx_done_tasklet); | |
9780 | + if (tx_todo && !port_pcpu->timer_scheduled) { | |
9781 | + port_pcpu->timer_scheduled = true; | |
9782 | + hrtimer_forward_now(&port_pcpu->tx_done_timer, | |
9783 | + MVPP2_TXDONE_HRTIMER_PERIOD_NS); | |
9784 | ||
9785 | + return HRTIMER_RESTART; | |
9786 | + } | |
9787 | return HRTIMER_NORESTART; | |
9788 | } | |
9789 | ||
5dd41b01 | 9790 | @@ -6484,7 +6471,12 @@ static int mvpp2_tx(struct sk_buff *skb, struct net_device *dev) |
e4b2b4a8 JK |
9791 | txq_pcpu->count > 0) { |
9792 | struct mvpp2_port_pcpu *port_pcpu = this_cpu_ptr(port->pcpu); | |
9793 | ||
9794 | - mvpp2_timer_set(port_pcpu); | |
9795 | + if (!port_pcpu->timer_scheduled) { | |
9796 | + port_pcpu->timer_scheduled = true; | |
9797 | + hrtimer_start(&port_pcpu->tx_done_timer, | |
9798 | + MVPP2_TXDONE_HRTIMER_PERIOD_NS, | |
9799 | + HRTIMER_MODE_REL_PINNED_SOFT); | |
9800 | + } | |
9801 | } | |
9802 | ||
9803 | return NETDEV_TX_OK; | |
5dd41b01 | 9804 | @@ -6875,7 +6867,6 @@ static int mvpp2_stop(struct net_device *dev) |
e4b2b4a8 JK |
9805 | |
9806 | hrtimer_cancel(&port_pcpu->tx_done_timer); | |
9807 | port_pcpu->timer_scheduled = false; | |
9808 | - tasklet_kill(&port_pcpu->tx_done_tasklet); | |
9809 | } | |
9810 | } | |
9811 | mvpp2_cleanup_rxqs(port); | |
5dd41b01 | 9812 | @@ -7648,13 +7639,10 @@ static int mvpp2_port_probe(struct platform_device *pdev, |
e4b2b4a8 JK |
9813 | port_pcpu = per_cpu_ptr(port->pcpu, cpu); |
9814 | ||
9815 | hrtimer_init(&port_pcpu->tx_done_timer, CLOCK_MONOTONIC, | |
9816 | - HRTIMER_MODE_REL_PINNED); | |
9817 | + HRTIMER_MODE_REL_PINNED_SOFT); | |
9818 | port_pcpu->tx_done_timer.function = mvpp2_hr_timer_cb; | |
9819 | port_pcpu->timer_scheduled = false; | |
9820 | - | |
9821 | - tasklet_init(&port_pcpu->tx_done_tasklet, | |
9822 | - mvpp2_tx_proc_cb, | |
9823 | - (unsigned long)dev); | |
9824 | + port_pcpu->dev = dev; | |
9825 | } | |
9826 | } | |
9827 | ||
b3bbd485 JK |
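The mvpp2 conversion above folds the tasklet into the hrtimer itself: with the _SOFT modes the callback already runs in softirq context, so it can do the tx-done work directly and re-arm by returning HRTIMER_RESTART after hrtimer_forward_now(), instead of calling hrtimer_start() from inside the callback. The pattern in isolation (a sketch; more_work_pending() and the 1 ms period are placeholders):

	static enum hrtimer_restart poll_cb(struct hrtimer *t)
	{
		if (!more_work_pending())
			return HRTIMER_NORESTART;

		/* push the expiry one period past 'now', then restart */
		hrtimer_forward_now(t, ms_to_ktime(1));
		return HRTIMER_RESTART;
	}

	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_SOFT);
	timer.function = poll_cb;
	hrtimer_start(&timer, ms_to_ktime(1), HRTIMER_MODE_REL_PINNED_SOFT);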
9828 | diff --git a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c |
9829 | index 56f6e3b71f48..a50350d01a80 100644 | |
9830 | --- a/drivers/net/wireless/intersil/orinoco/orinoco_usb.c | |
9831 | +++ b/drivers/net/wireless/intersil/orinoco/orinoco_usb.c | |
9832 | @@ -697,7 +697,7 @@ static void ezusb_req_ctx_wait(struct ezusb_priv *upriv, | |
e4b2b4a8 JK |
9833 | while (!ctx->done.done && msecs--) |
9834 | udelay(1000); | |
9835 | } else { | |
9836 | - wait_event_interruptible(ctx->done.wait, | |
9837 | + swait_event_interruptible(ctx->done.wait, | |
9838 | ctx->done.done); | |
9839 | } | |
9840 | break; | |
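swait_event_interruptible() is needed here because on RT struct completion is built on a simple wait queue; ctx->done is a completion, so its ->wait member is no longer a wait_queue_head_t. Roughly (sketch of the RT definition):

	struct completion {
		unsigned int		done;
		struct swait_queue_head	wait;	/* was wait_queue_head_t */
	};

	/* open-coded wait on a completion's internals: */
	swait_event_interruptible(ctx->done.wait, ctx->done.done);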
b3bbd485 | 9841 | diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c |
5dd41b01 | 9842 | index aafa7aa18fbd..388f6d71ba71 100644 |
b3bbd485 JK |
9843 | --- a/drivers/net/wireless/mac80211_hwsim.c |
9844 | +++ b/drivers/net/wireless/mac80211_hwsim.c | |
9845 | @@ -537,7 +537,7 @@ struct mac80211_hwsim_data { | |
e4b2b4a8 JK |
9846 | unsigned int rx_filter; |
9847 | bool started, idle, scanning; | |
9848 | struct mutex mutex; | |
9849 | - struct tasklet_hrtimer beacon_timer; | |
9850 | + struct hrtimer beacon_timer; | |
9851 | enum ps_mode { | |
9852 | PS_DISABLED, PS_ENABLED, PS_AUTO_POLL, PS_MANUAL_POLL | |
9853 | } ps; | |
b3bbd485 | 9854 | @@ -1423,7 +1423,7 @@ static void mac80211_hwsim_stop(struct ieee80211_hw *hw) |
e4b2b4a8 JK |
9855 | { |
9856 | struct mac80211_hwsim_data *data = hw->priv; | |
9857 | data->started = false; | |
9858 | - tasklet_hrtimer_cancel(&data->beacon_timer); | |
9859 | + hrtimer_cancel(&data->beacon_timer); | |
9860 | wiphy_debug(hw->wiphy, "%s\n", __func__); | |
9861 | } | |
9862 | ||
b3bbd485 | 9863 | @@ -1546,14 +1546,12 @@ static enum hrtimer_restart |
e4b2b4a8 JK |
9864 | mac80211_hwsim_beacon(struct hrtimer *timer) |
9865 | { | |
9866 | struct mac80211_hwsim_data *data = | |
9867 | - container_of(timer, struct mac80211_hwsim_data, | |
9868 | - beacon_timer.timer); | |
9869 | + container_of(timer, struct mac80211_hwsim_data, beacon_timer); | |
9870 | struct ieee80211_hw *hw = data->hw; | |
9871 | u64 bcn_int = data->beacon_int; | |
9872 | - ktime_t next_bcn; | |
9873 | ||
9874 | if (!data->started) | |
9875 | - goto out; | |
9876 | + return HRTIMER_NORESTART; | |
9877 | ||
9878 | ieee80211_iterate_active_interfaces_atomic( | |
9879 | hw, IEEE80211_IFACE_ITER_NORMAL, | |
b3bbd485 | 9880 | @@ -1565,11 +1563,9 @@ mac80211_hwsim_beacon(struct hrtimer *timer) |
e4b2b4a8 JK |
9881 | data->bcn_delta = 0; |
9882 | } | |
9883 | ||
9884 | - next_bcn = ktime_add(hrtimer_get_expires(timer), | |
9885 | - ns_to_ktime(bcn_int * 1000)); | |
9886 | - tasklet_hrtimer_start(&data->beacon_timer, next_bcn, HRTIMER_MODE_ABS); | |
9887 | -out: | |
9888 | - return HRTIMER_NORESTART; | |
9889 | + hrtimer_forward(&data->beacon_timer, hrtimer_get_expires(timer), | |
9890 | + ns_to_ktime(bcn_int * NSEC_PER_USEC)); | |
9891 | + return HRTIMER_RESTART; | |
9892 | } | |
9893 | ||
9894 | static const char * const hwsim_chanwidths[] = { | |
b3bbd485 | 9895 | @@ -1643,15 +1639,15 @@ static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) |
e4b2b4a8 JK |
9896 | mutex_unlock(&data->mutex); |
9897 | ||
9898 | if (!data->started || !data->beacon_int) | |
9899 | - tasklet_hrtimer_cancel(&data->beacon_timer); | |
9900 | - else if (!hrtimer_is_queued(&data->beacon_timer.timer)) { | |
9901 | + hrtimer_cancel(&data->beacon_timer); | |
9902 | + else if (!hrtimer_is_queued(&data->beacon_timer)) { | |
9903 | u64 tsf = mac80211_hwsim_get_tsf(hw, NULL); | |
9904 | u32 bcn_int = data->beacon_int; | |
9905 | u64 until_tbtt = bcn_int - do_div(tsf, bcn_int); | |
9906 | ||
9907 | - tasklet_hrtimer_start(&data->beacon_timer, | |
9908 | - ns_to_ktime(until_tbtt * 1000), | |
9909 | - HRTIMER_MODE_REL); | |
9910 | + hrtimer_start(&data->beacon_timer, | |
9911 | + ns_to_ktime(until_tbtt * 1000), | |
9912 | + HRTIMER_MODE_REL_SOFT); | |
9913 | } | |
9914 | ||
9915 | return 0; | |
b3bbd485 | 9916 | @@ -1714,7 +1710,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw, |
e4b2b4a8 JK |
9917 | info->enable_beacon, info->beacon_int); |
9918 | vp->bcn_en = info->enable_beacon; | |
9919 | if (data->started && | |
9920 | - !hrtimer_is_queued(&data->beacon_timer.timer) && | |
9921 | + !hrtimer_is_queued(&data->beacon_timer) && | |
9922 | info->enable_beacon) { | |
9923 | u64 tsf, until_tbtt; | |
9924 | u32 bcn_int; | |
b3bbd485 | 9925 | @@ -1722,9 +1718,9 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw, |
e4b2b4a8 JK |
9926 | tsf = mac80211_hwsim_get_tsf(hw, vif); |
9927 | bcn_int = data->beacon_int; | |
9928 | until_tbtt = bcn_int - do_div(tsf, bcn_int); | |
9929 | - tasklet_hrtimer_start(&data->beacon_timer, | |
9930 | - ns_to_ktime(until_tbtt * 1000), | |
9931 | - HRTIMER_MODE_REL); | |
9932 | + hrtimer_start(&data->beacon_timer, | |
9933 | + ns_to_ktime(until_tbtt * 1000), | |
9934 | + HRTIMER_MODE_REL_SOFT); | |
9935 | } else if (!info->enable_beacon) { | |
9936 | unsigned int count = 0; | |
9937 | ieee80211_iterate_active_interfaces_atomic( | |
b3bbd485 | 9938 | @@ -1733,7 +1729,7 @@ static void mac80211_hwsim_bss_info_changed(struct ieee80211_hw *hw, |
e4b2b4a8 JK |
9939 | wiphy_debug(hw->wiphy, " beaconing vifs remaining: %u", |
9940 | count); | |
9941 | if (count == 0) { | |
9942 | - tasklet_hrtimer_cancel(&data->beacon_timer); | |
9943 | + hrtimer_cancel(&data->beacon_timer); | |
9944 | data->beacon_int = 0; | |
9945 | } | |
9946 | } | |
5dd41b01 | 9947 | @@ -2722,9 +2718,9 @@ static int mac80211_hwsim_new_radio(struct genl_info *info, |
e4b2b4a8 JK |
9948 | data->debugfs, |
9949 | data, &hwsim_simulate_radar); | |
9950 | ||
9951 | - tasklet_hrtimer_init(&data->beacon_timer, | |
9952 | - mac80211_hwsim_beacon, | |
9953 | - CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | |
9954 | + hrtimer_init(&data->beacon_timer, CLOCK_MONOTONIC, | |
9955 | + HRTIMER_MODE_ABS_SOFT); | |
9956 | + data->beacon_timer.function = mac80211_hwsim_beacon; | |
9957 | ||
9958 | spin_lock_bh(&hwsim_radio_lock); | |
9959 | list_add_tail(&data->list, &hwsim_radios); | |
b3bbd485 JK |
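Unlike the mvpp2 timer, the hwsim beacon timer above runs in absolute mode, so its callback advances it with hrtimer_forward() from the previous expiry rather than from 'now'; that keeps the beacon cadence drift-free even when the callback runs late. In isolation (a sketch; period_ns is a placeholder):

	/* next expiry = last programmed expiry + period, not now + period */
	hrtimer_forward(timer, hrtimer_get_expires(timer),
			ns_to_ktime(period_ns));
	return HRTIMER_RESTART;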
9960 | diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c |
9961 | index 620f5b995a12..7fd1548a2905 100644 | |
9962 | --- a/drivers/pci/switch/switchtec.c | |
9963 | +++ b/drivers/pci/switch/switchtec.c | |
9964 | @@ -308,10 +308,11 @@ struct switchtec_user { | |
e4b2b4a8 JK |
9965 | |
9966 | enum mrpc_state state; | |
9967 | ||
9968 | - struct completion comp; | |
9969 | + wait_queue_head_t cmd_comp; | |
9970 | struct kref kref; | |
9971 | struct list_head list; | |
9972 | ||
9973 | + bool cmd_done; | |
9974 | u32 cmd; | |
9975 | u32 status; | |
9976 | u32 return_code; | |
b3bbd485 | 9977 | @@ -333,7 +334,7 @@ static struct switchtec_user *stuser_create(struct switchtec_dev *stdev) |
e4b2b4a8 JK |
9978 | stuser->stdev = stdev; |
9979 | kref_init(&stuser->kref); | |
9980 | INIT_LIST_HEAD(&stuser->list); | |
9981 | - init_completion(&stuser->comp); | |
9982 | + init_waitqueue_head(&stuser->cmd_comp); | |
9983 | stuser->event_cnt = atomic_read(&stdev->event_cnt); | |
9984 | ||
9985 | dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser); | |
b3bbd485 | 9986 | @@ -416,7 +417,7 @@ static int mrpc_queue_cmd(struct switchtec_user *stuser) |
e4b2b4a8 JK |
9987 | kref_get(&stuser->kref); |
9988 | stuser->read_len = sizeof(stuser->data); | |
9989 | stuser_set_state(stuser, MRPC_QUEUED); | |
9990 | - init_completion(&stuser->comp); | |
9991 | + stuser->cmd_done = false; | |
9992 | list_add_tail(&stuser->list, &stdev->mrpc_queue); | |
9993 | ||
9994 | mrpc_cmd_submit(stdev); | |
b3bbd485 | 9995 | @@ -453,7 +454,8 @@ static void mrpc_complete_cmd(struct switchtec_dev *stdev) |
e4b2b4a8 JK |
9996 | stuser->read_len); |
9997 | ||
9998 | out: | |
9999 | - complete_all(&stuser->comp); | |
10000 | + stuser->cmd_done = true; | |
10001 | + wake_up_interruptible(&stuser->cmd_comp); | |
10002 | list_del_init(&stuser->list); | |
10003 | stuser_put(stuser); | |
10004 | stdev->mrpc_busy = 0; | |
b3bbd485 | 10005 | @@ -723,10 +725,11 @@ static ssize_t switchtec_dev_read(struct file *filp, char __user *data, |
e4b2b4a8 JK |
10006 | mutex_unlock(&stdev->mrpc_mutex); |
10007 | ||
10008 | if (filp->f_flags & O_NONBLOCK) { | |
10009 | - if (!try_wait_for_completion(&stuser->comp)) | |
10010 | + if (!READ_ONCE(stuser->cmd_done)) | |
10011 | return -EAGAIN; | |
10012 | } else { | |
10013 | - rc = wait_for_completion_interruptible(&stuser->comp); | |
10014 | + rc = wait_event_interruptible(stuser->cmd_comp, | |
10015 | + stuser->cmd_done); | |
10016 | if (rc < 0) | |
10017 | return rc; | |
10018 | } | |
b3bbd485 | 10019 | @@ -774,7 +777,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait) |
e4b2b4a8 JK |
10020 | struct switchtec_dev *stdev = stuser->stdev; |
10021 | int ret = 0; | |
10022 | ||
10023 | - poll_wait(filp, &stuser->comp.wait, wait); | |
10024 | + poll_wait(filp, &stuser->cmd_comp, wait); | |
10025 | poll_wait(filp, &stdev->event_wq, wait); | |
10026 | ||
10027 | if (lock_mutex_and_test_alive(stdev)) | |
b3bbd485 | 10028 | @@ -782,7 +785,7 @@ static unsigned int switchtec_dev_poll(struct file *filp, poll_table *wait) |
e4b2b4a8 JK |
10029 | |
10030 | mutex_unlock(&stdev->mrpc_mutex); | |
10031 | ||
10032 | - if (try_wait_for_completion(&stuser->comp)) | |
10033 | + if (READ_ONCE(stuser->cmd_done)) | |
10034 | ret |= POLLIN | POLLRDNORM; | |
10035 | ||
10036 | if (stuser->event_cnt != atomic_read(&stdev->event_cnt)) | |
b3bbd485 | 10037 | @@ -1259,7 +1262,8 @@ static void stdev_kill(struct switchtec_dev *stdev) |
e4b2b4a8 JK |
10038 | |
10039 | /* Wake up and kill any users waiting on an MRPC request */ | |
10040 | list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) { | |
10041 | - complete_all(&stuser->comp); | |
10042 | + stuser->cmd_done = true; | |
10043 | + wake_up_interruptible(&stuser->cmd_comp); | |
10044 | list_del_init(&stuser->list); | |
10045 | stuser_put(stuser); | |
10046 | } | |
b3bbd485 JK |
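The switchtec change replaces a struct completion with an explicit done flag plus a regular waitqueue because poll_wait() needs a wait_queue_head_t, and on RT the completion's internal queue is an swait queue that poll cannot attach to. Reduced to its essentials (a sketch; st stands in for the stuser above):

	init_waitqueue_head(&st->cmd_comp);
	st->cmd_done = false;

	/* completion side */
	st->cmd_done = true;
	wake_up_interruptible(&st->cmd_comp);

	/* blocking wait */
	rc = wait_event_interruptible(st->cmd_comp, st->cmd_done);

	/* non-blocking / poll side */
	if (READ_ONCE(st->cmd_done))
		ret |= POLLIN | POLLRDNORM;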
10047 | diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c |
10048 | index 85f9a3eba387..08ea05ddcd82 100644 | |
10049 | --- a/drivers/scsi/fcoe/fcoe.c | |
10050 | +++ b/drivers/scsi/fcoe/fcoe.c | |
10051 | @@ -1464,11 +1464,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, | |
e4b2b4a8 JK |
10052 | static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) |
10053 | { | |
10054 | struct fcoe_percpu_s *fps; | |
10055 | - int rc; | |
10056 | + int rc, cpu = get_cpu_light(); | |
10057 | ||
10058 | - fps = &get_cpu_var(fcoe_percpu); | |
10059 | + fps = &per_cpu(fcoe_percpu, cpu); | |
10060 | rc = fcoe_get_paged_crc_eof(skb, tlen, fps); | |
10061 | - put_cpu_var(fcoe_percpu); | |
10062 | + put_cpu_light(); | |
10063 | ||
10064 | return rc; | |
10065 | } | |
b3bbd485 | 10066 | @@ -1655,11 +1655,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, |
e4b2b4a8 JK |
10067 | return 0; |
10068 | } | |
10069 | ||
10070 | - stats = per_cpu_ptr(lport->stats, get_cpu()); | |
10071 | + stats = per_cpu_ptr(lport->stats, get_cpu_light()); | |
10072 | stats->InvalidCRCCount++; | |
10073 | if (stats->InvalidCRCCount < 5) | |
10074 | printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); | |
10075 | - put_cpu(); | |
10076 | + put_cpu_light(); | |
10077 | return -EINVAL; | |
10078 | } | |
10079 | ||
b3bbd485 | 10080 | @@ -1702,7 +1702,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) |
e4b2b4a8 JK |
10081 | */ |
10082 | hp = (struct fcoe_hdr *) skb_network_header(skb); | |
10083 | ||
10084 | - stats = per_cpu_ptr(lport->stats, get_cpu()); | |
10085 | + stats = per_cpu_ptr(lport->stats, get_cpu_light()); | |
10086 | if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { | |
10087 | if (stats->ErrorFrames < 5) | |
10088 | printk(KERN_WARNING "fcoe: FCoE version " | |
b3bbd485 | 10089 | @@ -1734,13 +1734,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) |
e4b2b4a8 JK |
10090 | goto drop; |
10091 | ||
10092 | if (!fcoe_filter_frames(lport, fp)) { | |
10093 | - put_cpu(); | |
10094 | + put_cpu_light(); | |
10095 | fc_exch_recv(lport, fp); | |
10096 | return; | |
10097 | } | |
10098 | drop: | |
10099 | stats->ErrorFrames++; | |
10100 | - put_cpu(); | |
10101 | + put_cpu_light(); | |
10102 | kfree_skb(skb); | |
10103 | } | |
10104 | ||
b3bbd485 JK |
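get_cpu() disables preemption for the whole section; get_cpu_light() only pins the task to its CPU on RT (via migrate_disable()), leaving the section preemptible, which is all the per-CPU statistics here actually require. Roughly, as defined in the RT tree (sketch):

	#ifdef CONFIG_PREEMPT_RT_FULL
	# define get_cpu_light()	({ migrate_disable(); smp_processor_id(); })
	# define put_cpu_light()	migrate_enable()
	#else
	# define get_cpu_light()	get_cpu()
	# define put_cpu_light()	put_cpu()
	#endif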
10105 | diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c |
10106 | index 03019e07abb9..9ec11316bfe6 100644 | |
10107 | --- a/drivers/scsi/fcoe/fcoe_ctlr.c | |
10108 | +++ b/drivers/scsi/fcoe/fcoe_ctlr.c | |
10109 | @@ -835,7 +835,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) | |
e4b2b4a8 JK |
10110 | |
10111 | INIT_LIST_HEAD(&del_list); | |
10112 | ||
10113 | - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); | |
10114 | + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); | |
10115 | ||
10116 | list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { | |
10117 | deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; | |
b3bbd485 | 10118 | @@ -871,7 +871,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) |
e4b2b4a8 JK |
10119 | sel_time = fcf->time; |
10120 | } | |
10121 | } | |
10122 | - put_cpu(); | |
10123 | + put_cpu_light(); | |
10124 | ||
10125 | list_for_each_entry_safe(fcf, next, &del_list, list) { | |
10126 | /* Removes fcf from current list */ | |
b3bbd485 JK |
10127 | diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c |
10128 | index 42bcf7f3a0f9..2ce045d6860c 100644 | |
10129 | --- a/drivers/scsi/libfc/fc_exch.c | |
10130 | +++ b/drivers/scsi/libfc/fc_exch.c | |
10131 | @@ -833,10 +833,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, | |
e4b2b4a8 JK |
10132 | } |
10133 | memset(ep, 0, sizeof(*ep)); | |
10134 | ||
10135 | - cpu = get_cpu(); | |
10136 | + cpu = get_cpu_light(); | |
10137 | pool = per_cpu_ptr(mp->pool, cpu); | |
10138 | spin_lock_bh(&pool->lock); | |
10139 | - put_cpu(); | |
10140 | + put_cpu_light(); | |
10141 | ||
10142 | /* peek cache of free slot */ | |
10143 | if (pool->left != FC_XID_UNKNOWN) { | |
b3bbd485 JK |
10144 | diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c |
10145 | index 70be4425ae0b..a23ef685deac 100644 | |
10146 | --- a/drivers/scsi/libsas/sas_ata.c | |
10147 | +++ b/drivers/scsi/libsas/sas_ata.c | |
10148 | @@ -190,7 +190,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) | |
e4b2b4a8 JK |
10149 | /* TODO: audit callers to ensure they are ready for qc_issue to |
10150 | * unconditionally re-enable interrupts | |
10151 | */ | |
10152 | - local_irq_save(flags); | |
10153 | + local_irq_save_nort(flags); | |
10154 | spin_unlock(ap->lock); | |
10155 | ||
10156 | /* If the device fell off, no sense in issuing commands */ | |
b3bbd485 | 10157 | @@ -252,7 +252,7 @@ static unsigned int sas_ata_qc_issue(struct ata_queued_cmd *qc) |
e4b2b4a8 JK |
10158 | |
10159 | out: | |
10160 | spin_lock(ap->lock); | |
10161 | - local_irq_restore(flags); | |
10162 | + local_irq_restore_nort(flags); | |
10163 | return ret; | |
10164 | } | |
10165 | ||
b3bbd485 JK |
10166 | diff --git a/drivers/scsi/qla2xxx/qla_inline.h b/drivers/scsi/qla2xxx/qla_inline.h |
10167 | index 3f5a0f0f8b62..c75783143dc1 100644 | |
10168 | --- a/drivers/scsi/qla2xxx/qla_inline.h | |
10169 | +++ b/drivers/scsi/qla2xxx/qla_inline.h | |
10170 | @@ -59,12 +59,12 @@ qla2x00_poll(struct rsp_que *rsp) | |
e4b2b4a8 JK |
10171 | { |
10172 | unsigned long flags; | |
10173 | struct qla_hw_data *ha = rsp->hw; | |
10174 | - local_irq_save(flags); | |
10175 | + local_irq_save_nort(flags); | |
10176 | if (IS_P3P_TYPE(ha)) | |
10177 | qla82xx_poll(0, rsp); | |
10178 | else | |
10179 | ha->isp_ops->intr_handler(0, rsp); | |
10180 | - local_irq_restore(flags); | |
10181 | + local_irq_restore_nort(flags); | |
10182 | } | |
10183 | ||
10184 | static inline uint8_t * | |
b3bbd485 JK |
10185 | diff --git a/drivers/staging/greybus/audio_manager.c b/drivers/staging/greybus/audio_manager.c |
10186 | index aa6508b44fab..045696ce85c7 100644 | |
10187 | --- a/drivers/staging/greybus/audio_manager.c | |
10188 | +++ b/drivers/staging/greybus/audio_manager.c | |
e4b2b4a8 JK |
10189 | @@ -10,7 +10,7 @@ |
10190 | #include <linux/sysfs.h> | |
10191 | #include <linux/module.h> | |
10192 | #include <linux/init.h> | |
10193 | -#include <linux/rwlock.h> | |
10194 | +#include <linux/spinlock.h> | |
10195 | #include <linux/idr.h> | |
10196 | ||
10197 | #include "audio_manager.h" | |
b3bbd485 JK |
10198 | diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c |
10199 | index 9c7bc1ca341a..3d35dad1de2c 100644 | |
10200 | --- a/drivers/target/target_core_tmr.c | |
10201 | +++ b/drivers/target/target_core_tmr.c | |
10202 | @@ -114,8 +114,6 @@ static bool __target_check_io_state(struct se_cmd *se_cmd, | |
e4b2b4a8 JK |
10203 | { |
10204 | struct se_session *sess = se_cmd->se_sess; | |
10205 | ||
10206 | - assert_spin_locked(&sess->sess_cmd_lock); | |
10207 | - WARN_ON_ONCE(!irqs_disabled()); | |
10208 | /* | |
10209 | * If command already reached CMD_T_COMPLETE state within | |
10210 | * target_complete_cmd() or CMD_T_FABRIC_STOP due to shutdown, | |
b3bbd485 JK |
10211 | diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c |
10212 | index 0d0be7d8b9d6..f652e58e2988 100644 | |
10213 | --- a/drivers/target/target_core_transport.c | |
10214 | +++ b/drivers/target/target_core_transport.c | |
10215 | @@ -2967,9 +2967,6 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop, | |
e4b2b4a8 JK |
10216 | __acquires(&cmd->t_state_lock) |
10217 | { | |
10218 | ||
10219 | - assert_spin_locked(&cmd->t_state_lock); | |
10220 | - WARN_ON_ONCE(!irqs_disabled()); | |
10221 | - | |
10222 | if (fabric_stop) | |
10223 | cmd->transport_state |= CMD_T_FABRIC_STOP; | |
10224 | ||
b3bbd485 | 10225 | @@ -3239,9 +3236,6 @@ static int __transport_check_aborted_status(struct se_cmd *cmd, int send_status) |
e4b2b4a8 JK |
10226 | { |
10227 | int ret; | |
10228 | ||
10229 | - assert_spin_locked(&cmd->t_state_lock); | |
10230 | - WARN_ON_ONCE(!irqs_disabled()); | |
10231 | - | |
10232 | if (!(cmd->transport_state & CMD_T_ABORTED)) | |
10233 | return 0; | |
10234 | /* | |
b3bbd485 JK |
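The assertions dropped from the two target_core files assume that holding a spinlock implies hard interrupts are off; on RT, spinlock_t is a sleeping lock and spin_lock_irqsave() does not disable hard interrupts, so WARN_ON_ONCE(!irqs_disabled()) would fire spuriously. Where a lock-held check is still wanted, a lockdep annotation stays valid on both kernels (sketch):

	/* checks ownership via lockdep, not the hardware IRQ state */
	lockdep_assert_held(&cmd->t_state_lock);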
10235 | diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c |
10236 | index d93eee2f101b..0287333b1f3c 100644 | |
10237 | --- a/drivers/thermal/x86_pkg_temp_thermal.c | |
10238 | +++ b/drivers/thermal/x86_pkg_temp_thermal.c | |
e4b2b4a8 JK |
10239 | @@ -29,6 +29,7 @@ |
10240 | #include <linux/pm.h> | |
10241 | #include <linux/thermal.h> | |
10242 | #include <linux/debugfs.h> | |
10243 | +#include <linux/swork.h> | |
10244 | #include <asm/cpu_device_id.h> | |
10245 | #include <asm/mce.h> | |
10246 | ||
b3bbd485 | 10247 | @@ -329,7 +330,7 @@ static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work) |
e4b2b4a8 JK |
10248 | schedule_delayed_work_on(cpu, work, ms); |
10249 | } | |
10250 | ||
10251 | -static int pkg_thermal_notify(u64 msr_val) | |
10252 | +static void pkg_thermal_notify_work(struct swork_event *event) | |
10253 | { | |
10254 | int cpu = smp_processor_id(); | |
10255 | struct pkg_device *pkgdev; | |
b3bbd485 | 10256 | @@ -348,9 +349,47 @@ static int pkg_thermal_notify(u64 msr_val) |
e4b2b4a8 JK |
10257 | } |
10258 | ||
10259 | spin_unlock_irqrestore(&pkg_temp_lock, flags); | |
10260 | +} | |
10261 | + | |
10262 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
10263 | +static struct swork_event notify_work; | |
10264 | + | |
10265 | +static int pkg_thermal_notify_work_init(void) | |
10266 | +{ | |
10267 | + int err; | |
10268 | + | |
10269 | + err = swork_get(); | |
10270 | + if (err) | |
10271 | + return err; | |
10272 | + | |
10273 | + INIT_SWORK(¬ify_work, pkg_thermal_notify_work); | |
b3bbd485 JK |
10274 | return 0; |
10275 | } | |
10276 | ||
e4b2b4a8 JK |
10277 | +static void pkg_thermal_notify_work_cleanup(void) |
10278 | +{ | |
10279 | + swork_put(); | |
10280 | +} | |
10281 | + | |
10282 | +static int pkg_thermal_notify(u64 msr_val) | |
10283 | +{ | |
10284 | + swork_queue(¬ify_work); | |
10285 | + return 0; | |
10286 | +} | |
10287 | + | |
10288 | +#else /* !CONFIG_PREEMPT_RT_FULL */ | |
10289 | + | |
10290 | +static int pkg_thermal_notify_work_init(void) { return 0; } | |
10291 | + | |
10292 | +static void pkg_thermal_notify_work_cleanup(void) { } | |
10293 | + | |
10294 | +static int pkg_thermal_notify(u64 msr_val) | |
10295 | +{ | |
10296 | + pkg_thermal_notify_work(NULL); | |
b3bbd485 JK |
10297 | + return 0; |
10298 | +} | |
e4b2b4a8 | 10299 | +#endif /* CONFIG_PREEMPT_RT_FULL */ |
b3bbd485 | 10300 | + |
e4b2b4a8 JK |
10301 | static int pkg_temp_thermal_device_add(unsigned int cpu) |
10302 | { | |
b3bbd485 JK |
10303 | int pkgid = topology_logical_package_id(cpu); |
10304 | @@ -515,10 +554,15 @@ static int __init pkg_temp_thermal_init(void) | |
e4b2b4a8 JK |
10305 | if (!x86_match_cpu(pkg_temp_thermal_ids)) |
10306 | return -ENODEV; | |
10307 | ||
10308 | + if (pkg_thermal_notify_work_init()) | 
10309 | + return -ENODEV; | |
10310 | + | |
10311 | max_packages = topology_max_packages(); | |
10312 | packages = kzalloc(max_packages * sizeof(struct pkg_device *), GFP_KERNEL); | |
10313 | - if (!packages) | |
10314 | - return -ENOMEM; | |
10315 | + if (!packages) { | |
10316 | + ret = -ENOMEM; | |
10317 | + goto err; | |
10318 | + } | |
10319 | ||
10320 | ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online", | |
10321 | pkg_thermal_cpu_online, pkg_thermal_cpu_offline); | |
b3bbd485 | 10322 | @@ -536,6 +580,7 @@ static int __init pkg_temp_thermal_init(void) |
e4b2b4a8 JK |
10323 | return 0; |
10324 | ||
10325 | err: | |
10326 | + pkg_thermal_notify_work_cleanup(); | |
10327 | kfree(packages); | |
10328 | return ret; | |
10329 | } | |
b3bbd485 | 10330 | @@ -549,6 +594,7 @@ static void __exit pkg_temp_thermal_exit(void) |
e4b2b4a8 JK |
10331 | cpuhp_remove_state(pkg_thermal_hp_state); |
10332 | debugfs_remove_recursive(debugfs); | |
10333 | kfree(packages); | |
10334 | + pkg_thermal_notify_work_cleanup(); | |
10335 | } | |
10336 | module_exit(pkg_temp_thermal_exit) | |
10337 | ||
b3bbd485 JK |
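swork ("simple work") is the RT tree's helper for deferring work to a dedicated kernel thread, used here because the thermal-interrupt notify path cannot take sleeping locks on RT. The lifecycle the hunk relies on, in isolation (a sketch; swork_get()/swork_put() reference-count the shared worker thread):

	static struct swork_event ev;

	static void ev_fn(struct swork_event *sev)
	{
		/* runs in thread context, may sleep */
	}

	err = swork_get();	/* bring the worker thread up */
	INIT_SWORK(&ev, ev_fn);
	swork_queue(&ev);	/* ev_fn() runs asynchronously */
	swork_put();		/* drop the worker reference on cleanup */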
10338 | diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c |
10339 | index d29b512a7d9f..bc8cbb995b29 100644 | |
10340 | --- a/drivers/tty/serial/8250/8250_core.c | |
10341 | +++ b/drivers/tty/serial/8250/8250_core.c | |
10342 | @@ -58,7 +58,16 @@ static struct uart_driver serial8250_reg; | |
e4b2b4a8 JK |
10343 | |
10344 | static unsigned int skip_txen_test; /* force skip of txen test at init time */ | |
10345 | ||
10346 | -#define PASS_LIMIT 512 | |
10347 | +/* | |
10348 | + * On -rt we can have more delays, and legitimately | 
10349 | + * so - so don't drop work spuriously and spam the | |
10350 | + * syslog: | |
10351 | + */ | |
10352 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
10353 | +# define PASS_LIMIT 1000000 | |
10354 | +#else | |
10355 | +# define PASS_LIMIT 512 | |
10356 | +#endif | |
10357 | ||
10358 | #include <asm/serial.h> | |
10359 | /* | |
b3bbd485 JK |
10360 | diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c |
10361 | index ecf3d631bc09..6e029f34f37f 100644 | |
10362 | --- a/drivers/tty/serial/8250/8250_port.c | |
10363 | +++ b/drivers/tty/serial/8250/8250_port.c | |
e4b2b4a8 JK |
10364 | @@ -35,6 +35,7 @@ |
10365 | #include <linux/nmi.h> | |
10366 | #include <linux/mutex.h> | |
10367 | #include <linux/slab.h> | |
10368 | +#include <linux/kdb.h> | |
10369 | #include <linux/uaccess.h> | |
10370 | #include <linux/pm_runtime.h> | |
10371 | #include <linux/ktime.h> | |
b3bbd485 | 10372 | @@ -3224,9 +3225,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, |
e4b2b4a8 JK |
10373 | |
10374 | serial8250_rpm_get(up); | |
10375 | ||
10376 | - if (port->sysrq) | |
10377 | + if (port->sysrq || oops_in_progress) | |
10378 | locked = 0; | |
10379 | - else if (oops_in_progress) | |
10380 | + else if (in_kdb_printk()) | |
10381 | locked = spin_trylock_irqsave(&port->lock, flags); | |
10382 | else | |
10383 | spin_lock_irqsave(&port->lock, flags); | |
b3bbd485 JK |
10384 | diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c |
10385 | index c9f701aca677..81d6b15fb80a 100644 | |
10386 | --- a/drivers/tty/serial/amba-pl011.c | |
10387 | +++ b/drivers/tty/serial/amba-pl011.c | |
10388 | @@ -2236,13 +2236,19 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) | |
e4b2b4a8 JK |
10389 | |
10390 | clk_enable(uap->clk); | |
10391 | ||
10392 | - local_irq_save(flags); | |
10393 | + /* | |
10394 | + * local_irq_save(flags); | |
10395 | + * | |
10396 | + * This local_irq_save() is nonsense. If we come in via sysrq | |
10397 | + * handling then interrupts are already disabled. Aside from | 
10398 | + * that, the port.sysrq check is racy on SMP regardless. | 
10399 | + */ | |
10400 | if (uap->port.sysrq) | |
10401 | locked = 0; | |
10402 | else if (oops_in_progress) | |
10403 | - locked = spin_trylock(&uap->port.lock); | |
10404 | + locked = spin_trylock_irqsave(&uap->port.lock, flags); | |
10405 | else | |
10406 | - spin_lock(&uap->port.lock); | |
10407 | + spin_lock_irqsave(&uap->port.lock, flags); | |
10408 | ||
10409 | /* | |
10410 | * First save the CR then disable the interrupts | |
b3bbd485 | 10411 | @@ -2268,8 +2274,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) |
e4b2b4a8 JK |
10412 | pl011_write(old_cr, uap, REG_CR); |
10413 | ||
10414 | if (locked) | |
10415 | - spin_unlock(&uap->port.lock); | |
10416 | - local_irq_restore(flags); | |
10417 | + spin_unlock_irqrestore(&uap->port.lock, flags); | |
10418 | ||
10419 | clk_disable(uap->clk); | |
10420 | } | |
b3bbd485 JK |
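This pl011 hunk and the omap-serial one below apply the same console-write recipe: fold the bare local_irq_save() into the lock by switching to the irqsave variants, and keep the trylock only for the paths that may already hold the lock or must not block. The resulting skeleton (a sketch mirroring the hunks):

	unsigned long flags;
	int locked = 1;

	if (port->sysrq)
		locked = 0;	/* lock already held by the sysrq path */
	else if (oops_in_progress)
		locked = spin_trylock_irqsave(&port->lock, flags);
	else
		spin_lock_irqsave(&port->lock, flags);

	/* ... emit characters ... */

	if (locked)
		spin_unlock_irqrestore(&port->lock, flags);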
10421 | diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c |
10422 | index 26a22b100df1..69117e355bcd 100644 | |
10423 | --- a/drivers/tty/serial/omap-serial.c | |
10424 | +++ b/drivers/tty/serial/omap-serial.c | |
10425 | @@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s, | |
e4b2b4a8 JK |
10426 | |
10427 | pm_runtime_get_sync(up->dev); | |
10428 | ||
10429 | - local_irq_save(flags); | |
10430 | - if (up->port.sysrq) | |
10431 | - locked = 0; | |
10432 | - else if (oops_in_progress) | |
10433 | - locked = spin_trylock(&up->port.lock); | |
10434 | + if (up->port.sysrq || oops_in_progress) | |
10435 | + locked = spin_trylock_irqsave(&up->port.lock, flags); | |
10436 | else | |
10437 | - spin_lock(&up->port.lock); | |
10438 | + spin_lock_irqsave(&up->port.lock, flags); | |
10439 | ||
10440 | /* | |
10441 | * First save the IER then disable the interrupts | |
b3bbd485 | 10442 | @@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s, |
e4b2b4a8 JK |
10443 | pm_runtime_mark_last_busy(up->dev); |
10444 | pm_runtime_put_autosuspend(up->dev); | |
10445 | if (locked) | |
10446 | - spin_unlock(&up->port.lock); | |
10447 | - local_irq_restore(flags); | |
10448 | + spin_unlock_irqrestore(&up->port.lock, flags); | |
10449 | } | |
10450 | ||
10451 | static int __init | |
b3bbd485 JK |
10452 | diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c |
10453 | index d0b2e0ed9bab..91f4f2bd55b0 100644 | |
10454 | --- a/drivers/usb/core/hcd.c | |
10455 | +++ b/drivers/usb/core/hcd.c | |
10456 | @@ -1775,9 +1775,9 @@ static void __usb_hcd_giveback_urb(struct urb *urb) | |
e4b2b4a8 JK |
10457 | * and no one may trigger the above deadlock situation when |
10458 | * running complete() in tasklet. | |
10459 | */ | |
10460 | - local_irq_save(flags); | |
10461 | + local_irq_save_nort(flags); | |
10462 | urb->complete(urb); | |
10463 | - local_irq_restore(flags); | |
10464 | + local_irq_restore_nort(flags); | |
10465 | ||
10466 | usb_anchor_resume_wakeups(anchor); | |
10467 | atomic_dec(&urb->use_count); | |
b3bbd485 JK |
10468 | diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c |
10469 | index 17467545391b..42ec6f2db6a9 100644 | |
10470 | --- a/drivers/usb/gadget/function/f_fs.c | |
10471 | +++ b/drivers/usb/gadget/function/f_fs.c | |
10472 | @@ -1623,7 +1623,7 @@ static void ffs_data_put(struct ffs_data *ffs) | |
e4b2b4a8 JK |
10473 | pr_info("%s(): freeing\n", __func__); |
10474 | ffs_data_clear(ffs); | |
10475 | BUG_ON(waitqueue_active(&ffs->ev.waitq) || | |
10476 | - waitqueue_active(&ffs->ep0req_completion.wait) || | |
10477 | + swait_active(&ffs->ep0req_completion.wait) || | |
10478 | waitqueue_active(&ffs->wait)); | |
10479 | destroy_workqueue(ffs->io_completion_wq); | |
10480 | kfree(ffs->dev_name); | |
b3bbd485 JK |
10481 | diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c |
10482 | index 45b334ceaf2e..5f24e6d3b6eb 100644 | |
10483 | --- a/drivers/usb/gadget/function/f_ncm.c | |
10484 | +++ b/drivers/usb/gadget/function/f_ncm.c | |
10485 | @@ -77,9 +77,7 @@ struct f_ncm { | |
e4b2b4a8 JK |
10486 | struct sk_buff *skb_tx_ndp; |
10487 | u16 ndp_dgram_count; | |
10488 | bool timer_force_tx; | |
10489 | - struct tasklet_struct tx_tasklet; | |
10490 | struct hrtimer task_timer; | |
10491 | - | |
10492 | bool timer_stopping; | |
10493 | }; | |
10494 | ||
b3bbd485 | 10495 | @@ -1108,7 +1106,7 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port, |
e4b2b4a8 JK |
10496 | |
10497 | /* Delay the timer. */ | |
10498 | hrtimer_start(&ncm->task_timer, TX_TIMEOUT_NSECS, | |
10499 | - HRTIMER_MODE_REL); | |
10500 | + HRTIMER_MODE_REL_SOFT); | |
10501 | ||
10502 | /* Add the datagram position entries */ | |
10503 | ntb_ndp = skb_put_zero(ncm->skb_tx_ndp, dgram_idx_len); | |
b3bbd485 | 10504 | @@ -1152,17 +1150,15 @@ static struct sk_buff *ncm_wrap_ntb(struct gether *port, |
e4b2b4a8 JK |
10505 | } |
10506 | ||
10507 | /* | |
10508 | - * This transmits the NTB if there are frames waiting. | |
10509 | + * The transmit should only be run if no skb data has been sent | |
10510 | + * for a certain duration. | |
10511 | */ | |
10512 | -static void ncm_tx_tasklet(unsigned long data) | |
10513 | +static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data) | |
10514 | { | |
10515 | - struct f_ncm *ncm = (void *)data; | |
10516 | - | |
10517 | - if (ncm->timer_stopping) | |
10518 | - return; | |
10519 | + struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer); | |
10520 | ||
10521 | /* Only send if data is available. */ | |
10522 | - if (ncm->skb_tx_data) { | |
10523 | + if (!ncm->timer_stopping && ncm->skb_tx_data) { | |
10524 | ncm->timer_force_tx = true; | |
10525 | ||
10526 | /* XXX This allowance of a NULL skb argument to ndo_start_xmit | |
b3bbd485 | 10527 | @@ -1175,16 +1171,6 @@ static void ncm_tx_tasklet(unsigned long data) |
e4b2b4a8 JK |
10528 | |
10529 | ncm->timer_force_tx = false; | |
10530 | } | |
10531 | -} | |
10532 | - | |
10533 | -/* | |
10534 | - * The transmit should only be run if no skb data has been sent | |
10535 | - * for a certain duration. | |
10536 | - */ | |
10537 | -static enum hrtimer_restart ncm_tx_timeout(struct hrtimer *data) | |
10538 | -{ | |
10539 | - struct f_ncm *ncm = container_of(data, struct f_ncm, task_timer); | |
10540 | - tasklet_schedule(&ncm->tx_tasklet); | |
10541 | return HRTIMER_NORESTART; | |
10542 | } | |
10543 | ||
b3bbd485 | 10544 | @@ -1517,8 +1503,7 @@ static int ncm_bind(struct usb_configuration *c, struct usb_function *f) |
e4b2b4a8 JK |
10545 | ncm->port.open = ncm_open; |
10546 | ncm->port.close = ncm_close; | |
10547 | ||
10548 | - tasklet_init(&ncm->tx_tasklet, ncm_tx_tasklet, (unsigned long) ncm); | |
10549 | - hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
10550 | + hrtimer_init(&ncm->task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); | |
10551 | ncm->task_timer.function = ncm_tx_timeout; | |
10552 | ||
10553 | DBG(cdev, "CDC Network: %s speed IN/%s OUT/%s NOTIFY/%s\n", | |
b3bbd485 | 10554 | @@ -1627,7 +1612,6 @@ static void ncm_unbind(struct usb_configuration *c, struct usb_function *f) |
e4b2b4a8 JK |
10555 | DBG(c->cdev, "ncm unbind\n"); |
10556 | ||
10557 | hrtimer_cancel(&ncm->task_timer); | |
10558 | - tasklet_kill(&ncm->tx_tasklet); | |
10559 | ||
10560 | ncm_string_defs[0].id = 0; | |
10561 | usb_free_all_descriptors(f); | |
b3bbd485 JK |
10562 | diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c |
10563 | index 5c28bee327e1..ed49dba4704d 100644 | |
10564 | --- a/drivers/usb/gadget/legacy/inode.c | |
10565 | +++ b/drivers/usb/gadget/legacy/inode.c | |
10566 | @@ -347,7 +347,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len) | |
e4b2b4a8 JK |
10567 | spin_unlock_irq (&epdata->dev->lock); |
10568 | ||
10569 | if (likely (value == 0)) { | |
10570 | - value = wait_event_interruptible (done.wait, done.done); | |
10571 | + value = swait_event_interruptible (done.wait, done.done); | |
10572 | if (value != 0) { | |
10573 | spin_lock_irq (&epdata->dev->lock); | |
10574 | if (likely (epdata->ep != NULL)) { | |
b3bbd485 | 10575 | @@ -356,7 +356,7 @@ ep_io (struct ep_data *epdata, void *buf, unsigned len) |
e4b2b4a8 JK |
10576 | usb_ep_dequeue (epdata->ep, epdata->req); |
10577 | spin_unlock_irq (&epdata->dev->lock); | |
10578 | ||
10579 | - wait_event (done.wait, done.done); | |
10580 | + swait_event (done.wait, done.done); | |
10581 | if (epdata->status == -ECONNRESET) | |
10582 | epdata->status = -EINTR; | |
10583 | } else { | |
b3bbd485 JK |
10584 | diff --git a/fs/aio.c b/fs/aio.c |
10585 | index 3a749c3a92e3..24c6ceadaae6 100644 | |
10586 | --- a/fs/aio.c | |
10587 | +++ b/fs/aio.c | |
e4b2b4a8 JK |
10588 | @@ -40,6 +40,7 @@ |
10589 | #include <linux/ramfs.h> | |
10590 | #include <linux/percpu-refcount.h> | |
10591 | #include <linux/mount.h> | |
10592 | +#include <linux/swork.h> | |
10593 | ||
10594 | #include <asm/kmap_types.h> | |
10595 | #include <linux/uaccess.h> | |
b3bbd485 | 10596 | @@ -117,6 +118,7 @@ struct kioctx { |
e4b2b4a8 JK |
10597 | |
10598 | struct rcu_head free_rcu; | |
10599 | struct work_struct free_work; /* see free_ioctx() */ | |
10600 | + struct swork_event free_swork; /* see free_ioctx() */ | |
10601 | ||
10602 | /* | |
10603 | * signals when all in-flight requests are done | |
b3bbd485 | 10604 | @@ -259,6 +261,7 @@ static int __init aio_setup(void) |
e4b2b4a8 JK |
10605 | .mount = aio_mount, |
10606 | .kill_sb = kill_anon_super, | |
10607 | }; | |
10608 | + BUG_ON(swork_get()); | |
10609 | aio_mnt = kern_mount(&aio_fs); | |
10610 | if (IS_ERR(aio_mnt)) | |
10611 | panic("Failed to create aio fs mount."); | |
b3bbd485 | 10612 | @@ -633,9 +636,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref) |
e4b2b4a8 JK |
10613 | * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - |
10614 | * now it's safe to cancel any that need to be. | |
10615 | */ | |
10616 | -static void free_ioctx_users(struct percpu_ref *ref) | |
10617 | +static void free_ioctx_users_work(struct swork_event *sev) | |
10618 | { | |
10619 | - struct kioctx *ctx = container_of(ref, struct kioctx, users); | |
10620 | + struct kioctx *ctx = container_of(sev, struct kioctx, free_swork); | |
10621 | struct aio_kiocb *req; | |
10622 | ||
10623 | spin_lock_irq(&ctx->ctx_lock); | |
b3bbd485 | 10624 | @@ -653,6 +656,14 @@ static void free_ioctx_users(struct percpu_ref *ref) |
e4b2b4a8 JK |
10625 | percpu_ref_put(&ctx->reqs); |
10626 | } | |
10627 | ||
10628 | +static void free_ioctx_users(struct percpu_ref *ref) | |
10629 | +{ | |
10630 | + struct kioctx *ctx = container_of(ref, struct kioctx, users); | |
10631 | + | |
10632 | + INIT_SWORK(&ctx->free_swork, free_ioctx_users_work); | |
10633 | + swork_queue(&ctx->free_swork); | |
10634 | +} | |
10635 | + | |
10636 | static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) | |
10637 | { | |
10638 | unsigned i, new_nr; | |
b3bbd485 JK |
10639 | diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h |
10640 | index ce696d6c4641..b120fbd41483 100644 | |
10641 | --- a/fs/autofs4/autofs_i.h | |
10642 | +++ b/fs/autofs4/autofs_i.h | |
e4b2b4a8 JK |
10643 | @@ -20,6 +20,7 @@ |
10644 | #include <linux/sched.h> | |
10645 | #include <linux/mount.h> | |
10646 | #include <linux/namei.h> | |
10647 | +#include <linux/delay.h> | |
10648 | #include <linux/uaccess.h> | |
10649 | #include <linux/mutex.h> | |
10650 | #include <linux/spinlock.h> | |
b3bbd485 JK |
10651 | diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c |
10652 | index 57725d4a8c59..62220508bace 100644 | |
10653 | --- a/fs/autofs4/expire.c | |
10654 | +++ b/fs/autofs4/expire.c | |
10655 | @@ -148,7 +148,7 @@ static struct dentry *get_next_positive_dentry(struct dentry *prev, | |
e4b2b4a8 JK |
10656 | parent = p->d_parent; |
10657 | if (!spin_trylock(&parent->d_lock)) { | |
10658 | spin_unlock(&p->d_lock); | |
10659 | - cpu_relax(); | |
10660 | + cpu_chill(); | |
10661 | goto relock; | |
10662 | } | |
10663 | spin_unlock(&p->d_lock); | |
b3bbd485 JK |
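cpu_chill() replaces cpu_relax() in trylock retry loops such as the one above: busy-spinning on an rtmutex-based spinlock can livelock if the spinner outranks the lock holder, so on RT the task sleeps for a millisecond instead of spinning. Simplified (a sketch; the real helper also juggles PF_NOFREEZE):

	#ifdef CONFIG_PREEMPT_RT_FULL
	void cpu_chill(void)
	{
		ktime_t chill_time = ktime_set(0, NSEC_PER_MSEC);

		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD);
	}
	#else
	# define cpu_chill()	cpu_relax()
	#endif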
10664 | diff --git a/fs/buffer.c b/fs/buffer.c |
10665 | index b96f3b98a6ef..4ca5f222537a 100644 | |
10666 | --- a/fs/buffer.c | |
10667 | +++ b/fs/buffer.c | |
10668 | @@ -302,8 +302,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) | |
e4b2b4a8 JK |
10669 | * decide that the page is now completely done. |
10670 | */ | |
10671 | first = page_buffers(page); | |
10672 | - local_irq_save(flags); | |
10673 | - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | |
10674 | + flags = bh_uptodate_lock_irqsave(first); | |
10675 | clear_buffer_async_read(bh); | |
10676 | unlock_buffer(bh); | |
10677 | tmp = bh; | |
b3bbd485 | 10678 | @@ -316,8 +315,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
10679 | } |
10680 | tmp = tmp->b_this_page; | |
10681 | } while (tmp != bh); | |
10682 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
10683 | - local_irq_restore(flags); | |
10684 | + bh_uptodate_unlock_irqrestore(first, flags); | |
10685 | ||
10686 | /* | |
10687 | * If none of the buffers had errors and they are all | |
b3bbd485 | 10688 | @@ -329,9 +327,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
10689 | return; |
10690 | ||
10691 | still_busy: | |
10692 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
10693 | - local_irq_restore(flags); | |
10694 | - return; | |
10695 | + bh_uptodate_unlock_irqrestore(first, flags); | |
10696 | } | |
10697 | ||
10698 | /* | |
b3bbd485 | 10699 | @@ -358,8 +354,7 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
10700 | } |
10701 | ||
10702 | first = page_buffers(page); | |
10703 | - local_irq_save(flags); | |
10704 | - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | |
10705 | + flags = bh_uptodate_lock_irqsave(first); | |
10706 | ||
10707 | clear_buffer_async_write(bh); | |
10708 | unlock_buffer(bh); | |
b3bbd485 | 10709 | @@ -371,15 +366,12 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
10710 | } |
10711 | tmp = tmp->b_this_page; | |
10712 | } | |
10713 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
10714 | - local_irq_restore(flags); | |
10715 | + bh_uptodate_unlock_irqrestore(first, flags); | |
10716 | end_page_writeback(page); | |
10717 | return; | |
10718 | ||
10719 | still_busy: | |
10720 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
10721 | - local_irq_restore(flags); | |
10722 | - return; | |
10723 | + bh_uptodate_unlock_irqrestore(first, flags); | |
10724 | } | |
10725 | EXPORT_SYMBOL(end_buffer_async_write); | |
10726 | ||
b3bbd485 | 10727 | @@ -3417,6 +3409,7 @@ struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) |
e4b2b4a8 JK |
10728 | struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags); |
10729 | if (ret) { | |
10730 | INIT_LIST_HEAD(&ret->b_assoc_buffers); | |
10731 | + buffer_head_init_locks(ret); | |
10732 | preempt_disable(); | |
10733 | __this_cpu_inc(bh_accounting.nr); | |
10734 | recalc_bh_state(); | |
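bh_uptodate_lock_irqsave() hides the old bit_spin_lock-under-IRQ-off sequence behind a helper; on RT it takes a real per-buffer_head spinlock instead (set up by buffer_head_init_locks() in the hunk above), since a bit spinlock cannot be turned into a sleeping lock. Roughly, as the RT tree defines the pair (sketch):

	static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh)
	{
		unsigned long flags;

	#ifndef CONFIG_PREEMPT_RT_BASE
		local_irq_save(flags);
		bit_spin_lock(BH_Uptodate_Lock, &bh->b_state);
	#else
		spin_lock_irqsave(&bh->b_uptodate_lock, flags);
	#endif
		return flags;
	}

	static inline void bh_uptodate_unlock_irqrestore(struct buffer_head *bh,
							 unsigned long flags)
	{
	#ifndef CONFIG_PREEMPT_RT_BASE
		bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state);
		local_irq_restore(flags);
	#else
		spin_unlock_irqrestore(&bh->b_uptodate_lock, flags);
	#endif
	}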
b3bbd485 | 10735 | diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c |
5dd41b01 | 10736 | index ef24b4527459..3ce6331a1101 100644 |
b3bbd485 JK |
10737 | --- a/fs/cifs/readdir.c |
10738 | +++ b/fs/cifs/readdir.c | |
10739 | @@ -80,7 +80,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, | |
e4b2b4a8 JK |
10740 | struct inode *inode; |
10741 | struct super_block *sb = parent->d_sb; | |
10742 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); | |
10743 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
10744 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
10745 | ||
10746 | cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); | |
10747 | ||
b3bbd485 | 10748 | diff --git a/fs/dcache.c b/fs/dcache.c |
5dd41b01 | 10749 | index 28b2e770bb69..b08506ef464a 100644 |
b3bbd485 JK |
10750 | --- a/fs/dcache.c |
10751 | +++ b/fs/dcache.c | |
e4b2b4a8 JK |
10752 | @@ -19,6 +19,7 @@ |
10753 | #include <linux/mm.h> | |
10754 | #include <linux/fs.h> | |
10755 | #include <linux/fsnotify.h> | |
10756 | +#include <linux/delay.h> | |
10757 | #include <linux/slab.h> | |
10758 | #include <linux/init.h> | |
10759 | #include <linux/hash.h> | |
5dd41b01 | 10760 | @@ -808,6 +809,8 @@ static inline bool fast_dput(struct dentry *dentry) |
e4b2b4a8 JK |
10761 | */ |
10762 | void dput(struct dentry *dentry) | |
10763 | { | |
10764 | + struct dentry *parent; | |
10765 | + | |
10766 | if (unlikely(!dentry)) | |
10767 | return; | |
10768 | ||
5dd41b01 | 10769 | @@ -844,9 +847,18 @@ void dput(struct dentry *dentry) |
e4b2b4a8 JK |
10770 | return; |
10771 | ||
10772 | kill_it: | |
10773 | - dentry = dentry_kill(dentry); | |
10774 | - if (dentry) { | |
10775 | - cond_resched(); | |
10776 | + parent = dentry_kill(dentry); | |
10777 | + if (parent) { | |
10778 | + int r; | |
10779 | + | |
10780 | + if (parent == dentry) { | |
10781 | + /* the task with the highest priority won't schedule */ | |
10782 | + r = cond_resched(); | |
10783 | + if (!r) | |
10784 | + cpu_chill(); | |
10785 | + } else { | |
10786 | + dentry = parent; | |
10787 | + } | |
10788 | goto repeat; | |
10789 | } | |
10790 | } | |
5dd41b01 | 10791 | @@ -2414,7 +2426,7 @@ void d_delete(struct dentry * dentry) |
e4b2b4a8 JK |
10792 | if (dentry->d_lockref.count == 1) { |
10793 | if (!spin_trylock(&inode->i_lock)) { | |
10794 | spin_unlock(&dentry->d_lock); | |
10795 | - cpu_relax(); | |
10796 | + cpu_chill(); | |
10797 | goto again; | |
10798 | } | |
10799 | dentry->d_flags &= ~DCACHE_CANT_MOUNT; | |
5dd41b01 | 10800 | @@ -2459,9 +2471,10 @@ EXPORT_SYMBOL(d_rehash); |
e4b2b4a8 JK |
10801 | static inline unsigned start_dir_add(struct inode *dir) |
10802 | { | |
10803 | ||
10804 | + preempt_disable_rt(); | |
10805 | for (;;) { | |
10806 | - unsigned n = dir->i_dir_seq; | |
10807 | - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) | |
10808 | + unsigned n = dir->__i_dir_seq; | |
10809 | + if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n) | |
10810 | return n; | |
10811 | cpu_relax(); | |
10812 | } | |
5dd41b01 | 10813 | @@ -2469,26 +2482,30 @@ static inline unsigned start_dir_add(struct inode *dir) |
e4b2b4a8 JK |
10814 | |
10815 | static inline void end_dir_add(struct inode *dir, unsigned n) | |
10816 | { | |
10817 | - smp_store_release(&dir->i_dir_seq, n + 2); | |
10818 | + smp_store_release(&dir->__i_dir_seq, n + 2); | |
10819 | + preempt_enable_rt(); | |
10820 | } | |
10821 | ||
10822 | static void d_wait_lookup(struct dentry *dentry) | |
10823 | { | |
10824 | - if (d_in_lookup(dentry)) { | |
10825 | - DECLARE_WAITQUEUE(wait, current); | |
10826 | - add_wait_queue(dentry->d_wait, &wait); | |
10827 | - do { | |
10828 | - set_current_state(TASK_UNINTERRUPTIBLE); | |
10829 | - spin_unlock(&dentry->d_lock); | |
10830 | - schedule(); | |
10831 | - spin_lock(&dentry->d_lock); | |
10832 | - } while (d_in_lookup(dentry)); | |
10833 | - } | |
10834 | + struct swait_queue __wait; | |
10835 | + | |
10836 | + if (!d_in_lookup(dentry)) | |
10837 | + return; | |
10838 | + | |
10839 | + INIT_LIST_HEAD(&__wait.task_list); | |
10840 | + do { | |
10841 | + prepare_to_swait(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); | |
10842 | + spin_unlock(&dentry->d_lock); | |
10843 | + schedule(); | |
10844 | + spin_lock(&dentry->d_lock); | |
10845 | + } while (d_in_lookup(dentry)); | |
10846 | + finish_swait(dentry->d_wait, &__wait); | |
10847 | } | |
10848 | ||
10849 | struct dentry *d_alloc_parallel(struct dentry *parent, | |
10850 | const struct qstr *name, | |
10851 | - wait_queue_head_t *wq) | |
10852 | + struct swait_queue_head *wq) | |
10853 | { | |
10854 | unsigned int hash = name->hash; | |
10855 | struct hlist_bl_head *b = in_lookup_hash(parent, hash); | |
5dd41b01 | 10856 | @@ -2502,7 +2519,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, |
e4b2b4a8 JK |
10857 | |
10858 | retry: | |
10859 | rcu_read_lock(); | |
10860 | - seq = smp_load_acquire(&parent->d_inode->i_dir_seq); | |
10861 | + seq = smp_load_acquire(&parent->d_inode->__i_dir_seq); | |
10862 | r_seq = read_seqbegin(&rename_lock); | |
10863 | dentry = __d_lookup_rcu(parent, name, &d_seq); | |
10864 | if (unlikely(dentry)) { | |
5dd41b01 | 10865 | @@ -2530,7 +2547,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, |
e4b2b4a8 JK |
10866 | } |
10867 | ||
10868 | hlist_bl_lock(b); | |
10869 | - if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { | |
10870 | + if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) { | |
10871 | hlist_bl_unlock(b); | |
10872 | rcu_read_unlock(); | |
10873 | goto retry; | |
5dd41b01 | 10874 | @@ -2603,7 +2620,7 @@ void __d_lookup_done(struct dentry *dentry) |
e4b2b4a8 JK |
10875 | hlist_bl_lock(b); |
10876 | dentry->d_flags &= ~DCACHE_PAR_LOOKUP; | |
10877 | __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); | |
10878 | - wake_up_all(dentry->d_wait); | |
10879 | + swake_up_all(dentry->d_wait); | |
10880 | dentry->d_wait = NULL; | |
10881 | hlist_bl_unlock(b); | |
10882 | INIT_HLIST_NODE(&dentry->d_u.d_alias); | |
5dd41b01 | 10883 | @@ -3638,6 +3655,8 @@ __setup("dhash_entries=", set_dhash_entries); |
e4b2b4a8 JK |
10884 | |
10885 | static void __init dcache_init_early(void) | |
10886 | { | |
10887 | + unsigned int loop; | |
10888 | + | |
10889 | /* If hashes are distributed across NUMA nodes, defer | |
10890 | * hash allocation until vmalloc space is available. | |
10891 | */ | |
5dd41b01 | 10892 | @@ -3654,10 +3673,14 @@ static void __init dcache_init_early(void) |
e4b2b4a8 JK |
10893 | &d_hash_mask, |
10894 | 0, | |
10895 | 0); | |
10896 | + | |
10897 | + for (loop = 0; loop < (1U << d_hash_shift); loop++) | |
10898 | + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); | |
10899 | } | |
10900 | ||
10901 | static void __init dcache_init(void) | |
10902 | { | |
10903 | + unsigned int loop; | |
10904 | /* | |
10905 | * A constructor could be added for stable state like the lists, | |
10906 | * but it is probably not worth it because of the cache nature | |
5dd41b01 | 10907 | @@ -3680,6 +3703,10 @@ static void __init dcache_init(void) |
e4b2b4a8 JK |
10908 | &d_hash_mask, |
10909 | 0, | |
10910 | 0); | |
10911 | + | |
10912 | + for (loop = 0; loop < (1U << d_hash_shift); loop++) | |
10913 | + INIT_HLIST_BL_HEAD(dentry_hashtable + loop); | |
10914 | + | |
10915 | } | |
10916 | ||
10917 | /* SLAB cache for __getname() consumers */ | |
b3bbd485 JK |
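The d_wait_lookup() rewrite above is the open-coded form of an swait wait: the waiter supplies its own swait_queue node (initialized by hand in the hunk) and brackets the schedule loop with prepare_to_swait()/finish_swait(). The skeleton (a sketch; condition is a placeholder):

	struct swait_queue wait;

	INIT_LIST_HEAD(&wait.task_list);
	do {
		prepare_to_swait(&wq, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&dentry->d_lock);	/* drop locks before sleeping */
		schedule();
		spin_lock(&dentry->d_lock);
	} while (!condition);
	finish_swait(&wq, &wait);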
10918 | diff --git a/fs/eventpoll.c b/fs/eventpoll.c |
10919 | index 2fabd19cdeea..b768c32631eb 100644 | |
10920 | --- a/fs/eventpoll.c | |
10921 | +++ b/fs/eventpoll.c | |
10922 | @@ -587,12 +587,12 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) | |
e4b2b4a8 JK |
10923 | */ |
10924 | static void ep_poll_safewake(wait_queue_head_t *wq) | |
10925 | { | |
10926 | - int this_cpu = get_cpu(); | |
10927 | + int this_cpu = get_cpu_light(); | |
10928 | ||
10929 | ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, | |
10930 | ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); | |
10931 | ||
10932 | - put_cpu(); | |
10933 | + put_cpu_light(); | |
10934 | } | |
10935 | ||
10936 | static void ep_remove_wait_queue(struct eppoll_entry *pwq) | |
b3bbd485 JK |
10937 | diff --git a/fs/exec.c b/fs/exec.c |
10938 | index 0da4d748b4e6..609aee4dbfa9 100644 | |
10939 | --- a/fs/exec.c | |
10940 | +++ b/fs/exec.c | |
10941 | @@ -1024,12 +1024,14 @@ static int exec_mmap(struct mm_struct *mm) | |
e4b2b4a8 JK |
10942 | } |
10943 | } | |
10944 | task_lock(tsk); | |
10945 | + preempt_disable_rt(); | |
10946 | active_mm = tsk->active_mm; | |
10947 | tsk->mm = mm; | |
10948 | tsk->active_mm = mm; | |
10949 | activate_mm(active_mm, mm); | |
10950 | tsk->mm->vmacache_seqnum = 0; | |
10951 | vmacache_flush(tsk); | |
10952 | + preempt_enable_rt(); | |
10953 | task_unlock(tsk); | |
10954 | if (old_mm) { | |
10955 | up_read(&old_mm->mmap_sem); | |
b3bbd485 JK |
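preempt_disable_rt()/preempt_enable_rt() make the mm/active_mm update atomic only where it matters: on RT they map to the real preempt_disable()/preempt_enable(), while on mainline they compile down to a barrier. Roughly (sketch):

	#ifdef CONFIG_PREEMPT_RT_BASE
	# define preempt_disable_rt()	preempt_disable()
	# define preempt_enable_rt()	preempt_enable()
	#else
	# define preempt_disable_rt()	barrier()
	# define preempt_enable_rt()	barrier()
	#endif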
10956 | diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c |
10957 | index db7590178dfc..d76364124443 100644 | |
10958 | --- a/fs/ext4/page-io.c | |
10959 | +++ b/fs/ext4/page-io.c | |
10960 | @@ -95,8 +95,7 @@ static void ext4_finish_bio(struct bio *bio) | |
e4b2b4a8 JK |
10961 | * We check all buffers in the page under BH_Uptodate_Lock |
10962 | * to avoid races with other end io clearing async_write flags | |
10963 | */ | |
10964 | - local_irq_save(flags); | |
10965 | - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | |
10966 | + flags = bh_uptodate_lock_irqsave(head); | |
10967 | do { | |
10968 | if (bh_offset(bh) < bio_start || | |
10969 | bh_offset(bh) + bh->b_size > bio_end) { | |
b3bbd485 | 10970 | @@ -108,8 +107,7 @@ static void ext4_finish_bio(struct bio *bio) |
e4b2b4a8 JK |
10971 | if (bio->bi_status) |
10972 | buffer_io_error(bh); | |
10973 | } while ((bh = bh->b_this_page) != head); | |
10974 | - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | |
10975 | - local_irq_restore(flags); | |
10976 | + bh_uptodate_unlock_irqrestore(head, flags); | |
10977 | if (!under_io) { | |
10978 | #ifdef CONFIG_EXT4_FS_ENCRYPTION | |
10979 | if (data_page) | |
b3bbd485 JK |
10980 | diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c |
10981 | index 29868c35c19a..76d354eee035 100644 | |
10982 | --- a/fs/fuse/dir.c | |
10983 | +++ b/fs/fuse/dir.c | |
10984 | @@ -1188,7 +1188,7 @@ static int fuse_direntplus_link(struct file *file, | |
e4b2b4a8 JK |
10985 | struct inode *dir = d_inode(parent); |
10986 | struct fuse_conn *fc; | |
10987 | struct inode *inode; | |
10988 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
10989 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
10990 | ||
10991 | if (!o->nodeid) { | |
10992 | /* | |
b3bbd485 JK |
10993 | diff --git a/fs/inode.c b/fs/inode.c |
10994 | index cfc36d11bcb3..b77ce179798a 100644 | |
10995 | --- a/fs/inode.c | |
10996 | +++ b/fs/inode.c | |
10997 | @@ -154,7 +154,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |
e4b2b4a8 JK |
10998 | inode->i_bdev = NULL; |
10999 | inode->i_cdev = NULL; | |
11000 | inode->i_link = NULL; | |
11001 | - inode->i_dir_seq = 0; | |
11002 | + inode->__i_dir_seq = 0; | |
11003 | inode->i_rdev = 0; | |
11004 | inode->dirtied_when = 0; | |
11005 | ||
b3bbd485 JK |
11006 | diff --git a/fs/libfs.c b/fs/libfs.c |
11007 | index 3aabe553fc45..b5d63bf1ad8e 100644 | |
11008 | --- a/fs/libfs.c | |
11009 | +++ b/fs/libfs.c | |
11010 | @@ -90,7 +90,7 @@ static struct dentry *next_positive(struct dentry *parent, | |
e4b2b4a8 JK |
11011 | struct list_head *from, |
11012 | int count) | |
11013 | { | |
11014 | - unsigned *seq = &parent->d_inode->i_dir_seq, n; | |
11015 | + unsigned *seq = &parent->d_inode->__i_dir_seq, n; | |
11016 | struct dentry *res; | |
11017 | struct list_head *p; | |
11018 | bool skipped; | |
b3bbd485 | 11019 | @@ -123,8 +123,9 @@ static struct dentry *next_positive(struct dentry *parent, |
e4b2b4a8 JK |
11020 | static void move_cursor(struct dentry *cursor, struct list_head *after) |
11021 | { | |
11022 | struct dentry *parent = cursor->d_parent; | |
11023 | - unsigned n, *seq = &parent->d_inode->i_dir_seq; | |
11024 | + unsigned n, *seq = &parent->d_inode->__i_dir_seq; | |
11025 | spin_lock(&parent->d_lock); | |
11026 | + preempt_disable_rt(); | |
11027 | for (;;) { | |
11028 | n = *seq; | |
11029 | if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) | |
b3bbd485 | 11030 | @@ -137,6 +138,7 @@ static void move_cursor(struct dentry *cursor, struct list_head *after) |
e4b2b4a8 JK |
11031 | else |
11032 | list_add_tail(&cursor->d_child, &parent->d_subdirs); | |
11033 | smp_store_release(seq, n + 2); | |
11034 | + preempt_enable_rt(); | |
11035 | spin_unlock(&parent->d_lock); | |
11036 | } | |
11037 | ||
b3bbd485 | 11038 | diff --git a/fs/locks.c b/fs/locks.c |
5dd41b01 | 11039 | index 665e3ce9ab47..47b66bfc4fa3 100644 |
b3bbd485 JK |
11040 | --- a/fs/locks.c |
11041 | +++ b/fs/locks.c | |
11042 | @@ -945,7 +945,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) | |
e4b2b4a8 JK |
11043 | return -ENOMEM; |
11044 | } | |
11045 | ||
11046 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11047 | + percpu_down_read(&file_rwsem); | |
11048 | spin_lock(&ctx->flc_lock); | |
11049 | if (request->fl_flags & FL_ACCESS) | |
11050 | goto find_conflict; | |
b3bbd485 | 11051 | @@ -986,7 +986,7 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request) |
e4b2b4a8 JK |
11052 | |
11053 | out: | |
11054 | spin_unlock(&ctx->flc_lock); | |
11055 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11056 | + percpu_up_read(&file_rwsem); | |
11057 | if (new_fl) | |
11058 | locks_free_lock(new_fl); | |
11059 | locks_dispose_list(&dispose); | |
b3bbd485 | 11060 | @@ -1023,7 +1023,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, |
e4b2b4a8 JK |
11061 | new_fl2 = locks_alloc_lock(); |
11062 | } | |
11063 | ||
11064 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11065 | + percpu_down_read(&file_rwsem); | |
11066 | spin_lock(&ctx->flc_lock); | |
11067 | /* | |
11068 | * New lock request. Walk all POSIX locks and look for conflicts. If | |
b3bbd485 | 11069 | @@ -1195,7 +1195,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request, |
e4b2b4a8 JK |
11070 | } |
11071 | out: | |
11072 | spin_unlock(&ctx->flc_lock); | |
11073 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11074 | + percpu_up_read(&file_rwsem); | |
11075 | /* | |
11076 | * Free any unused locks. | |
11077 | */ | |
b3bbd485 | 11078 | @@ -1470,7 +1470,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) |
e4b2b4a8 JK |
11079 | return error; |
11080 | } | |
11081 | ||
11082 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11083 | + percpu_down_read(&file_rwsem); | |
11084 | spin_lock(&ctx->flc_lock); | |
11085 | ||
11086 | time_out_leases(inode, &dispose); | |
b3bbd485 | 11087 | @@ -1522,13 +1522,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) |
e4b2b4a8 JK |
11088 | locks_insert_block(fl, new_fl); |
11089 | trace_break_lease_block(inode, new_fl); | |
11090 | spin_unlock(&ctx->flc_lock); | |
11091 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11092 | + percpu_up_read(&file_rwsem); | |
11093 | ||
11094 | locks_dispose_list(&dispose); | |
11095 | error = wait_event_interruptible_timeout(new_fl->fl_wait, | |
11096 | !new_fl->fl_next, break_time); | |
11097 | ||
11098 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11099 | + percpu_down_read(&file_rwsem); | |
11100 | spin_lock(&ctx->flc_lock); | |
11101 | trace_break_lease_unblock(inode, new_fl); | |
11102 | locks_delete_block(new_fl); | |
b3bbd485 | 11103 | @@ -1545,7 +1545,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) |
e4b2b4a8 JK |
11104 | } |
11105 | out: | |
11106 | spin_unlock(&ctx->flc_lock); | |
11107 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11108 | + percpu_up_read(&file_rwsem); | |
11109 | locks_dispose_list(&dispose); | |
11110 | locks_free_lock(new_fl); | |
11111 | return error; | |
b3bbd485 | 11112 | @@ -1619,7 +1619,7 @@ int fcntl_getlease(struct file *filp) |
e4b2b4a8 JK |
11113 | |
11114 | ctx = smp_load_acquire(&inode->i_flctx); | |
11115 | if (ctx && !list_empty_careful(&ctx->flc_lease)) { | |
11116 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11117 | + percpu_down_read(&file_rwsem); | |
11118 | spin_lock(&ctx->flc_lock); | |
11119 | time_out_leases(inode, &dispose); | |
11120 | list_for_each_entry(fl, &ctx->flc_lease, fl_list) { | |
b3bbd485 | 11121 | @@ -1629,7 +1629,7 @@ int fcntl_getlease(struct file *filp) |
e4b2b4a8 JK |
11122 | break; |
11123 | } | |
11124 | spin_unlock(&ctx->flc_lock); | |
11125 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11126 | + percpu_up_read(&file_rwsem); | |
11127 | ||
11128 | locks_dispose_list(&dispose); | |
11129 | } | |
b3bbd485 | 11130 | @@ -1704,7 +1704,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr |
e4b2b4a8 JK |
11131 | return -EINVAL; |
11132 | } | |
11133 | ||
11134 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11135 | + percpu_down_read(&file_rwsem); | |
11136 | spin_lock(&ctx->flc_lock); | |
11137 | time_out_leases(inode, &dispose); | |
11138 | error = check_conflicting_open(dentry, arg, lease->fl_flags); | |
b3bbd485 | 11139 | @@ -1775,7 +1775,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr |
e4b2b4a8 JK |
11140 | lease->fl_lmops->lm_setup(lease, priv); |
11141 | out: | |
11142 | spin_unlock(&ctx->flc_lock); | |
11143 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11144 | + percpu_up_read(&file_rwsem); | |
11145 | locks_dispose_list(&dispose); | |
11146 | if (is_deleg) | |
11147 | inode_unlock(inode); | |
b3bbd485 | 11148 | @@ -1798,7 +1798,7 @@ static int generic_delete_lease(struct file *filp, void *owner) |
e4b2b4a8 JK |
11149 | return error; |
11150 | } | |
11151 | ||
11152 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11153 | + percpu_down_read(&file_rwsem); | |
11154 | spin_lock(&ctx->flc_lock); | |
11155 | list_for_each_entry(fl, &ctx->flc_lease, fl_list) { | |
11156 | if (fl->fl_file == filp && | |
b3bbd485 | 11157 | @@ -1811,7 +1811,7 @@ static int generic_delete_lease(struct file *filp, void *owner) |
e4b2b4a8 JK |
11158 | if (victim) |
11159 | error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose); | |
11160 | spin_unlock(&ctx->flc_lock); | |
11161 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11162 | + percpu_up_read(&file_rwsem); | |
11163 | locks_dispose_list(&dispose); | |
11164 | return error; | |
11165 | } | |
5dd41b01 | 11166 | @@ -2542,13 +2542,13 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx) |
e4b2b4a8 JK |
11167 | if (list_empty(&ctx->flc_lease)) |
11168 | return; | |
11169 | ||
11170 | - percpu_down_read_preempt_disable(&file_rwsem); | |
11171 | + percpu_down_read(&file_rwsem); | |
11172 | spin_lock(&ctx->flc_lock); | |
11173 | list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) | |
11174 | if (filp == fl->fl_file) | |
11175 | lease_modify(fl, F_UNLCK, &dispose); | |
11176 | spin_unlock(&ctx->flc_lock); | |
11177 | - percpu_up_read_preempt_enable(&file_rwsem); | |
11178 | + percpu_up_read(&file_rwsem); | |
11179 | ||
11180 | locks_dispose_list(&dispose); | |
11181 | } | |
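
The dropped *_preempt_disable variants returned with preemption off, which on RT would forbid taking the flc_lock right afterwards (spinlocks become sleeping locks on RT); the plain percpu_down_read()/percpu_up_read() pair keeps the caller sleepable. A loose userspace analogy of the two acquire flavours follows; the pthread rwlock and the may_sleep flag are stand-ins, not kernel interfaces.

    /* Loose analogy for the two acquire flavours. Illustrative only. */
    #include <assert.h>
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t file_rwsem = PTHREAD_RWLOCK_INITIALIZER;
    static __thread bool may_sleep = true;

    static void down_read_preempt_disable(void)
    {
        pthread_rwlock_rdlock(&file_rwsem);
        may_sleep = false;                    /* models preempt_disable() */
    }

    static void down_read_plain(void)
    {
        pthread_rwlock_rdlock(&file_rwsem);   /* caller stays sleepable */
    }

    static void take_inner_sleeping_lock(void)
    {
        /* On RT the inner flc_lock may sleep, so this must hold. */
        assert(may_sleep);
    }

    int main(void)
    {
        down_read_plain();                    /* the RT-safe flavour */
        take_inner_sleeping_lock();
        pthread_rwlock_unlock(&file_rwsem);

        down_read_preempt_disable();          /* old flavour: no sleeping below */
        may_sleep = true;                     /* models preempt_enable() */
        pthread_rwlock_unlock(&file_rwsem);
        puts("ok");
        return 0;
    }
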
b3bbd485 JK |
11182 | diff --git a/fs/namei.c b/fs/namei.c |
11183 | index 0b46b858cd42..f5c6c2ec44ce 100644 | |
11184 | --- a/fs/namei.c | |
11185 | +++ b/fs/namei.c | |
11186 | @@ -1627,7 +1627,7 @@ static struct dentry *lookup_slow(const struct qstr *name, | |
e4b2b4a8 JK |
11187 | { |
11188 | struct dentry *dentry = ERR_PTR(-ENOENT), *old; | |
11189 | struct inode *inode = dir->d_inode; | |
11190 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11191 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11192 | ||
11193 | inode_lock_shared(inode); | |
11194 | /* Don't go there if it's already dead */ | |
b3bbd485 | 11195 | @@ -3100,7 +3100,7 @@ static int lookup_open(struct nameidata *nd, struct path *path, |
e4b2b4a8 JK |
11196 | struct dentry *dentry; |
11197 | int error, create_error = 0; | |
11198 | umode_t mode = op->mode; | |
11199 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11200 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11201 | ||
11202 | if (unlikely(IS_DEADDIR(dir_inode))) | |
11203 | return -ENOENT; | |
b3bbd485 JK |
11204 | diff --git a/fs/namespace.c b/fs/namespace.c |
11205 | index 9dc146e7b5e0..85bfe5e55adf 100644 | |
11206 | --- a/fs/namespace.c | |
11207 | +++ b/fs/namespace.c | |
e4b2b4a8 JK |
11208 | @@ -14,6 +14,7 @@ |
11209 | #include <linux/mnt_namespace.h> | |
11210 | #include <linux/user_namespace.h> | |
11211 | #include <linux/namei.h> | |
11212 | +#include <linux/delay.h> | |
11213 | #include <linux/security.h> | |
11214 | #include <linux/cred.h> | |
11215 | #include <linux/idr.h> | |
b3bbd485 | 11216 | @@ -353,8 +354,11 @@ int __mnt_want_write(struct vfsmount *m) |
e4b2b4a8 JK |
11217 | * incremented count after it has set MNT_WRITE_HOLD. |
11218 | */ | |
11219 | smp_mb(); | |
11220 | - while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) | |
11221 | - cpu_relax(); | |
11222 | + while (ACCESS_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { | |
11223 | + preempt_enable(); | |
11224 | + cpu_chill(); | |
11225 | + preempt_disable(); | |
11226 | + } | |
11227 | /* | |
11228 | * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will | |
11229 | * be set to match its requirements. So we must not load that until | |
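
cpu_chill() is declared in the include/linux/delay.h hunk further down; on RT it sleeps briefly instead of spinning, so a task waiting for MNT_WRITE_HOLD to clear cannot livelock against a preempted flag holder, and preemption has to be re-enabled around the sleep. The shape of the loop, as a userspace sketch with nanosleep() standing in for cpu_chill() (all names illustrative):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static _Atomic bool write_hold;     /* models MNT_WRITE_HOLD */

    static void cpu_chill_model(void)
    {
        struct timespec ts = { 0, 1000000 };    /* ~1ms nap */
        nanosleep(&ts, NULL);
    }

    static void wait_for_write_hold(void)
    {
        while (atomic_load(&write_hold))
            cpu_chill_model();          /* sleep instead of cpu_relax() */
    }

    int main(void)
    {
        atomic_store(&write_hold, false);
        wait_for_write_hold();
        puts("proceed");
        return 0;
    }
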
b3bbd485 JK |
11230 | diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c |
11231 | index 606dd3871f66..fa41eb75b4d8 100644 | |
11232 | --- a/fs/nfs/delegation.c | |
11233 | +++ b/fs/nfs/delegation.c | |
11234 | @@ -150,11 +150,11 @@ static int nfs_delegation_claim_opens(struct inode *inode, | |
e4b2b4a8 JK |
11235 | sp = state->owner; |
11236 | /* Block nfs4_proc_unlck */ | |
11237 | mutex_lock(&sp->so_delegreturn_mutex); | |
11238 | - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | |
11239 | + seq = read_seqbegin(&sp->so_reclaim_seqlock); | |
11240 | err = nfs4_open_delegation_recall(ctx, state, stateid, type); | |
11241 | if (!err) | |
11242 | err = nfs_delegation_claim_locks(ctx, state, stateid); | |
11243 | - if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) | |
11244 | + if (!err && read_seqretry(&sp->so_reclaim_seqlock, seq)) | |
11245 | err = -EAGAIN; | |
11246 | mutex_unlock(&sp->so_delegreturn_mutex); | |
11247 | put_nfs_open_context(ctx); | |
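
The seqcount_t to seqlock_t switch here (and in the nfs4_fs.h/nfs4state.c hunks below) pairs the sequence count with a real lock for the write side, which on RT is a sleeping, priority-inheritance-aware lock; readers move from raw_seqcount_begin(), which tolerates an odd count, to read_seqbegin(), which waits for a stable one. Roughly, a seqlock has the shape below; this is a userspace sketch with simplified memory ordering, and the *_model names are not the kernel API.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct seqlock_model {
        _Atomic unsigned seqcount;
        pthread_mutex_t lock;           /* serialises writers */
    };

    static unsigned read_seqbegin_model(struct seqlock_model *sl)
    {
        unsigned n;

        do {    /* unlike raw_seqcount_begin(), wait out a writer */
            n = atomic_load_explicit(&sl->seqcount, memory_order_acquire);
        } while (n & 1);
        return n;
    }

    static int read_seqretry_model(struct seqlock_model *sl, unsigned start)
    {
        return atomic_load_explicit(&sl->seqcount,
                                    memory_order_acquire) != start;
    }

    static void write_seqlock_model(struct seqlock_model *sl)
    {
        pthread_mutex_lock(&sl->lock);
        atomic_fetch_add(&sl->seqcount, 1);     /* odd: write in flight */
    }

    static void write_sequnlock_model(struct seqlock_model *sl)
    {
        atomic_fetch_add(&sl->seqcount, 1);     /* even: stable again */
        pthread_mutex_unlock(&sl->lock);
    }

    int main(void)
    {
        struct seqlock_model sl = { 0, PTHREAD_MUTEX_INITIALIZER };
        unsigned s;

        write_seqlock_model(&sl);
        write_sequnlock_model(&sl);
        s = read_seqbegin_model(&sl);
        printf("retry=%d\n", read_seqretry_model(&sl, s));
        return 0;
    }
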
b3bbd485 JK |
11248 | diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c |
11249 | index bf2c43635062..f43f5da4a8c3 100644 | |
11250 | --- a/fs/nfs/dir.c | |
11251 | +++ b/fs/nfs/dir.c | |
11252 | @@ -452,7 +452,7 @@ static | |
e4b2b4a8 JK |
11253 | void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) |
11254 | { | |
11255 | struct qstr filename = QSTR_INIT(entry->name, entry->len); | |
11256 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11257 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11258 | struct dentry *dentry; | |
11259 | struct dentry *alias; | |
11260 | struct inode *dir = d_inode(parent); | |
b3bbd485 | 11261 | @@ -1443,7 +1443,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, |
e4b2b4a8 JK |
11262 | struct file *file, unsigned open_flags, |
11263 | umode_t mode, int *opened) | |
11264 | { | |
11265 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11266 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11267 | struct nfs_open_context *ctx; | |
11268 | struct dentry *res; | |
11269 | struct iattr attr = { .ia_valid = ATTR_OPEN }; | |
b3bbd485 | 11270 | @@ -1763,7 +1763,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) |
e4b2b4a8 JK |
11271 | |
11272 | trace_nfs_rmdir_enter(dir, dentry); | |
11273 | if (d_really_is_positive(dentry)) { | |
11274 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11275 | + down(&NFS_I(d_inode(dentry))->rmdir_sem); | |
11276 | +#else | |
11277 | down_write(&NFS_I(d_inode(dentry))->rmdir_sem); | |
11278 | +#endif | |
11279 | error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); | |
11280 | /* Ensure the VFS deletes this inode */ | |
11281 | switch (error) { | |
b3bbd485 | 11282 | @@ -1773,7 +1777,11 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) |
e4b2b4a8 JK |
11283 | case -ENOENT: |
11284 | nfs_dentry_handle_enoent(dentry); | |
11285 | } | |
11286 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11287 | + up(&NFS_I(d_inode(dentry))->rmdir_sem); | |
11288 | +#else | |
11289 | up_write(&NFS_I(d_inode(dentry))->rmdir_sem); | |
11290 | +#endif | |
11291 | } else | |
11292 | error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); | |
11293 | trace_nfs_rmdir_exit(dir, dentry, error); | |
b3bbd485 JK |
11294 | diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c |
11295 | index 134d9f560240..ff64167f9811 100644 | |
11296 | --- a/fs/nfs/inode.c | |
11297 | +++ b/fs/nfs/inode.c | |
11298 | @@ -2014,7 +2014,11 @@ static void init_once(void *foo) | |
e4b2b4a8 JK |
11299 | atomic_long_set(&nfsi->nrequests, 0); |
11300 | atomic_long_set(&nfsi->commit_info.ncommit, 0); | |
11301 | atomic_set(&nfsi->commit_info.rpcs_out, 0); | |
11302 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11303 | + sema_init(&nfsi->rmdir_sem, 1); | |
11304 | +#else | |
11305 | init_rwsem(&nfsi->rmdir_sem); | |
11306 | +#endif | |
11307 | mutex_init(&nfsi->commit_mutex); | |
11308 | nfs4_init_once(nfsi); | |
11309 | } | |
b3bbd485 JK |
11310 | diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h |
11311 | index a73144b3cb8c..0c403d280b96 100644 | |
11312 | --- a/fs/nfs/nfs4_fs.h | |
11313 | +++ b/fs/nfs/nfs4_fs.h | |
11314 | @@ -112,7 +112,7 @@ struct nfs4_state_owner { | |
e4b2b4a8 JK |
11315 | unsigned long so_flags; |
11316 | struct list_head so_states; | |
11317 | struct nfs_seqid_counter so_seqid; | |
11318 | - seqcount_t so_reclaim_seqcount; | |
11319 | + seqlock_t so_reclaim_seqlock; | |
11320 | struct mutex so_delegreturn_mutex; | |
11321 | }; | |
11322 | ||
b3bbd485 | 11323 | diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c |
5dd41b01 | 11324 | index a3b67d3b1dfb..4ce6ec109c2b 100644 |
b3bbd485 JK |
11325 | --- a/fs/nfs/nfs4proc.c |
11326 | +++ b/fs/nfs/nfs4proc.c | |
5dd41b01 | 11327 | @@ -2700,7 +2700,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, |
e4b2b4a8 JK |
11328 | unsigned int seq; |
11329 | int ret; | |
11330 | ||
11331 | - seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | |
11332 | + seq = raw_seqcount_begin(&sp->so_reclaim_seqlock.seqcount); | |
11333 | ||
11334 | ret = _nfs4_proc_open(opendata); | |
11335 | if (ret != 0) | |
5dd41b01 | 11336 | @@ -2738,7 +2738,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, |
e4b2b4a8 JK |
11337 | |
11338 | if (d_inode(dentry) == state->inode) { | |
11339 | nfs_inode_attach_open_context(ctx); | |
11340 | - if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) | |
11341 | + if (read_seqretry(&sp->so_reclaim_seqlock, seq)) | |
11342 | nfs4_schedule_stateid_recovery(server, state); | |
11343 | } | |
11344 | out: | |
b3bbd485 | 11345 | diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c |
5dd41b01 | 11346 | index e1d88bca815e..c51bcc176026 100644 |
b3bbd485 JK |
11347 | --- a/fs/nfs/nfs4state.c |
11348 | +++ b/fs/nfs/nfs4state.c | |
11349 | @@ -494,7 +494,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, | |
e4b2b4a8 JK |
11350 | nfs4_init_seqid_counter(&sp->so_seqid); |
11351 | atomic_set(&sp->so_count, 1); | |
11352 | INIT_LIST_HEAD(&sp->so_lru); | |
11353 | - seqcount_init(&sp->so_reclaim_seqcount); | |
11354 | + seqlock_init(&sp->so_reclaim_seqlock); | |
11355 | mutex_init(&sp->so_delegreturn_mutex); | |
11356 | return sp; | |
11357 | } | |
5dd41b01 | 11358 | @@ -1521,8 +1521,12 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs |
e4b2b4a8 JK |
11359 | * recovering after a network partition or a reboot from a |
11360 | * server that doesn't support a grace period. | |
11361 | */ | |
11362 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11363 | + write_seqlock(&sp->so_reclaim_seqlock); | |
11364 | +#else | |
11365 | + write_seqcount_begin(&sp->so_reclaim_seqlock.seqcount); | |
11366 | +#endif | |
11367 | spin_lock(&sp->so_lock); | |
11368 | - raw_write_seqcount_begin(&sp->so_reclaim_seqcount); | |
11369 | restart: | |
11370 | list_for_each_entry(state, &sp->so_states, open_states) { | |
11371 | if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) | |
5dd41b01 | 11372 | @@ -1591,14 +1595,20 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs |
e4b2b4a8 JK |
11373 | spin_lock(&sp->so_lock); |
11374 | goto restart; | |
11375 | } | |
11376 | - raw_write_seqcount_end(&sp->so_reclaim_seqcount); | |
11377 | spin_unlock(&sp->so_lock); | |
11378 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11379 | + write_sequnlock(&sp->so_reclaim_seqlock); | |
11380 | +#else | |
11381 | + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount); | |
11382 | +#endif | |
11383 | return 0; | |
11384 | out_err: | |
11385 | nfs4_put_open_state(state); | |
11386 | - spin_lock(&sp->so_lock); | |
11387 | - raw_write_seqcount_end(&sp->so_reclaim_seqcount); | |
11388 | - spin_unlock(&sp->so_lock); | |
11389 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11390 | + write_sequnlock(&sp->so_reclaim_seqlock); | |
11391 | +#else | |
11392 | + write_seqcount_end(&sp->so_reclaim_seqlock.seqcount); | |
11393 | +#endif | |
11394 | return status; | |
11395 | } | |
11396 | ||
b3bbd485 JK |
11397 | diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c |
11398 | index 630b4a3c1a93..0dc1d3e6a62f 100644 | |
11399 | --- a/fs/nfs/unlink.c | |
11400 | +++ b/fs/nfs/unlink.c | |
e4b2b4a8 JK |
11401 | @@ -13,7 +13,7 @@ |
11402 | #include <linux/sunrpc/clnt.h> | |
11403 | #include <linux/nfs_fs.h> | |
11404 | #include <linux/sched.h> | |
11405 | -#include <linux/wait.h> | |
11406 | +#include <linux/swait.h> | |
11407 | #include <linux/namei.h> | |
11408 | #include <linux/fsnotify.h> | |
11409 | ||
b3bbd485 | 11410 | @@ -52,6 +52,29 @@ static void nfs_async_unlink_done(struct rpc_task *task, void *calldata) |
e4b2b4a8 JK |
11411 | rpc_restart_call_prepare(task); |
11412 | } | |
11413 | ||
11414 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11415 | +static void nfs_down_anon(struct semaphore *sema) | |
11416 | +{ | |
11417 | + down(sema); | |
11418 | +} | |
11419 | + | |
11420 | +static void nfs_up_anon(struct semaphore *sema) | |
11421 | +{ | |
11422 | + up(sema); | |
11423 | +} | |
11424 | + | |
11425 | +#else | |
11426 | +static void nfs_down_anon(struct rw_semaphore *rwsem) | |
11427 | +{ | |
11428 | + down_read_non_owner(rwsem); | |
11429 | +} | |
11430 | + | |
11431 | +static void nfs_up_anon(struct rw_semaphore *rwsem) | |
11432 | +{ | |
11433 | + up_read_non_owner(rwsem); | |
11434 | +} | |
11435 | +#endif | |
11436 | + | |
11437 | /** | |
11438 | * nfs_async_unlink_release - Release the sillydelete data. | |
11439 | * @task: rpc_task of the sillydelete | |
b3bbd485 | 11440 | @@ -65,7 +88,7 @@ static void nfs_async_unlink_release(void *calldata) |
e4b2b4a8 JK |
11441 | struct dentry *dentry = data->dentry; |
11442 | struct super_block *sb = dentry->d_sb; | |
11443 | ||
11444 | - up_read_non_owner(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem); | |
11445 | + nfs_up_anon(&NFS_I(d_inode(dentry->d_parent))->rmdir_sem); | |
11446 | d_lookup_done(dentry); | |
11447 | nfs_free_unlinkdata(data); | |
11448 | dput(dentry); | |
b3bbd485 | 11449 | @@ -118,10 +141,10 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) |
e4b2b4a8 JK |
11450 | struct inode *dir = d_inode(dentry->d_parent); |
11451 | struct dentry *alias; | |
11452 | ||
11453 | - down_read_non_owner(&NFS_I(dir)->rmdir_sem); | |
11454 | + nfs_down_anon(&NFS_I(dir)->rmdir_sem); | |
11455 | alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq); | |
11456 | if (IS_ERR(alias)) { | |
11457 | - up_read_non_owner(&NFS_I(dir)->rmdir_sem); | |
11458 | + nfs_up_anon(&NFS_I(dir)->rmdir_sem); | |
11459 | return 0; | |
11460 | } | |
11461 | if (!d_in_lookup(alias)) { | |
b3bbd485 | 11462 | @@ -143,7 +166,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) |
e4b2b4a8 JK |
11463 | ret = 0; |
11464 | spin_unlock(&alias->d_lock); | |
11465 | dput(alias); | |
11466 | - up_read_non_owner(&NFS_I(dir)->rmdir_sem); | |
11467 | + nfs_up_anon(&NFS_I(dir)->rmdir_sem); | |
11468 | /* | |
11469 | * If we'd displaced old cached devname, free it. At that | |
11470 | * point dentry is definitely not a root, so we won't need | |
b3bbd485 | 11471 | @@ -183,7 +206,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) |
e4b2b4a8 JK |
11472 | goto out_free_name; |
11473 | } | |
11474 | data->res.dir_attr = &data->dir_attr; | |
11475 | - init_waitqueue_head(&data->wq); | |
11476 | + init_swait_queue_head(&data->wq); | |
11477 | ||
11478 | status = -EBUSY; | |
11479 | spin_lock(&dentry->d_lock); | |
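
The nfs_down_anon()/nfs_up_anon() wrappers hide the lock flavour from the call sites: non-RT keeps the rw_semaphore with the *_non_owner primitives (the lock is released by the rpc callback, not the task that took it), while RT falls back to a counting semaphore, which has no owner and therefore no priority-inheritance bookkeeping to violate. A userspace sketch of the same build-time selection; PREEMPT_RT_MODEL and the stand-in lock types are hypothetical, and the call sites never change.

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdio.h>

    #ifdef PREEMPT_RT_MODEL
    /* RT: a counting semaphore has no owner, so acquire and release may
     * happen in different contexts (rpc callback vs. syscall). */
    static sem_t rmdir_sem;
    static void lock_init(void)     { sem_init(&rmdir_sem, 0, 1); }
    static void nfs_down_anon(void) { sem_wait(&rmdir_sem); }
    static void nfs_up_anon(void)   { sem_post(&rmdir_sem); }
    #else
    static pthread_rwlock_t rmdir_sem = PTHREAD_RWLOCK_INITIALIZER;
    static void lock_init(void)     { }
    static void nfs_down_anon(void) { pthread_rwlock_rdlock(&rmdir_sem); }
    static void nfs_up_anon(void)   { pthread_rwlock_unlock(&rmdir_sem); }
    #endif

    int main(void)
    {
        lock_init();
        nfs_down_anon();
        /* ... sillydelete work ... */
        nfs_up_anon();
        puts("done");
        return 0;
    }
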
b3bbd485 JK |
11480 | diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c |
11481 | index cc91856b5e2d..a982d7c3ad91 100644 | |
11482 | --- a/fs/ntfs/aops.c | |
11483 | +++ b/fs/ntfs/aops.c | |
11484 | @@ -93,13 +93,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) | |
e4b2b4a8 JK |
11485 | ofs = 0; |
11486 | if (file_ofs < init_size) | |
11487 | ofs = init_size - file_ofs; | |
11488 | - local_irq_save(flags); | |
11489 | + local_irq_save_nort(flags); | |
11490 | kaddr = kmap_atomic(page); | |
11491 | memset(kaddr + bh_offset(bh) + ofs, 0, | |
11492 | bh->b_size - ofs); | |
11493 | flush_dcache_page(page); | |
11494 | kunmap_atomic(kaddr); | |
11495 | - local_irq_restore(flags); | |
11496 | + local_irq_restore_nort(flags); | |
11497 | } | |
11498 | } else { | |
11499 | clear_buffer_uptodate(bh); | |
b3bbd485 | 11500 | @@ -108,8 +108,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
11501 | "0x%llx.", (unsigned long long)bh->b_blocknr); |
11502 | } | |
11503 | first = page_buffers(page); | |
11504 | - local_irq_save(flags); | |
11505 | - bit_spin_lock(BH_Uptodate_Lock, &first->b_state); | |
11506 | + flags = bh_uptodate_lock_irqsave(first); | |
11507 | clear_buffer_async_read(bh); | |
11508 | unlock_buffer(bh); | |
11509 | tmp = bh; | |
b3bbd485 | 11510 | @@ -124,8 +123,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
11511 | } |
11512 | tmp = tmp->b_this_page; | |
11513 | } while (tmp != bh); | |
11514 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
11515 | - local_irq_restore(flags); | |
11516 | + bh_uptodate_unlock_irqrestore(first, flags); | |
11517 | /* | |
11518 | * If none of the buffers had errors then we can set the page uptodate, | |
11519 | * but we first have to perform the post read mst fixups, if the | |
b3bbd485 | 11520 | @@ -146,13 +144,13 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
11521 | recs = PAGE_SIZE / rec_size; |
11522 | /* Should have been verified before we got here... */ | |
11523 | BUG_ON(!recs); | |
11524 | - local_irq_save(flags); | |
11525 | + local_irq_save_nort(flags); | |
11526 | kaddr = kmap_atomic(page); | |
11527 | for (i = 0; i < recs; i++) | |
11528 | post_read_mst_fixup((NTFS_RECORD*)(kaddr + | |
11529 | i * rec_size), rec_size); | |
11530 | kunmap_atomic(kaddr); | |
11531 | - local_irq_restore(flags); | |
11532 | + local_irq_restore_nort(flags); | |
11533 | flush_dcache_page(page); | |
11534 | if (likely(page_uptodate && !PageError(page))) | |
11535 | SetPageUptodate(page); | |
b3bbd485 | 11536 | @@ -160,9 +158,7 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) |
e4b2b4a8 JK |
11537 | unlock_page(page); |
11538 | return; | |
11539 | still_busy: | |
11540 | - bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); | |
11541 | - local_irq_restore(flags); | |
11542 | - return; | |
11543 | + bh_uptodate_unlock_irqrestore(first, flags); | |
11544 | } | |
11545 | ||
11546 | /** | |
b3bbd485 JK |
11547 | diff --git a/fs/proc/array.c b/fs/proc/array.c |
11548 | index 4ac811e1a26c..9dcb40690cde 100644 | |
11549 | --- a/fs/proc/array.c | |
11550 | +++ b/fs/proc/array.c | |
11551 | @@ -386,9 +386,9 @@ static inline void task_context_switch_counts(struct seq_file *m, | |
e4b2b4a8 JK |
11552 | static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) |
11553 | { | |
11554 | seq_printf(m, "Cpus_allowed:\t%*pb\n", | |
11555 | - cpumask_pr_args(&task->cpus_allowed)); | |
11556 | + cpumask_pr_args(task->cpus_ptr)); | |
11557 | seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", | |
11558 | - cpumask_pr_args(&task->cpus_allowed)); | |
11559 | + cpumask_pr_args(task->cpus_ptr)); | |
11560 | } | |
11561 | ||
11562 | int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, | |
b3bbd485 | 11563 | diff --git a/fs/proc/base.c b/fs/proc/base.c |
5dd41b01 | 11564 | index 9063738ff1f0..4085e56e261c 100644 |
b3bbd485 JK |
11565 | --- a/fs/proc/base.c |
11566 | +++ b/fs/proc/base.c | |
5dd41b01 | 11567 | @@ -1900,7 +1900,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, |
e4b2b4a8 JK |
11568 | |
11569 | child = d_hash_and_lookup(dir, &qname); | |
11570 | if (!child) { | |
11571 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11572 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11573 | child = d_alloc_parallel(dir, &qname, &wq); | |
11574 | if (IS_ERR(child)) | |
11575 | goto end_instantiate; | |
b3bbd485 JK |
11576 | diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c |
11577 | index 82ac5f682b73..c35714621a38 100644 | |
11578 | --- a/fs/proc/proc_sysctl.c | |
11579 | +++ b/fs/proc/proc_sysctl.c | |
11580 | @@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file, | |
e4b2b4a8 JK |
11581 | |
11582 | child = d_lookup(dir, &qname); | |
11583 | if (!child) { | |
11584 | - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); | |
11585 | + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); | |
11586 | child = d_alloc_parallel(dir, &qname, &wq); | |
11587 | if (IS_ERR(child)) | |
11588 | return false; | |
b3bbd485 JK |
11589 | diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c |
11590 | index 23a9c28ad8ea..6a73c4fa88e7 100644 | |
11591 | --- a/fs/squashfs/decompressor_multi_percpu.c | |
11592 | +++ b/fs/squashfs/decompressor_multi_percpu.c | |
11593 | @@ -10,6 +10,7 @@ | |
11594 | #include <linux/slab.h> | |
11595 | #include <linux/percpu.h> | |
11596 | #include <linux/buffer_head.h> | |
11597 | +#include <linux/locallock.h> | |
11598 | ||
11599 | #include "squashfs_fs.h" | |
11600 | #include "squashfs_fs_sb.h" | |
11601 | @@ -25,6 +26,8 @@ struct squashfs_stream { | |
11602 | void *stream; | |
11603 | }; | |
11604 | ||
11605 | +static DEFINE_LOCAL_IRQ_LOCK(stream_lock); | |
11606 | + | |
11607 | void *squashfs_decompressor_create(struct squashfs_sb_info *msblk, | |
11608 | void *comp_opts) | |
11609 | { | |
11610 | @@ -79,10 +82,15 @@ int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh, | |
11611 | { | |
11612 | struct squashfs_stream __percpu *percpu = | |
11613 | (struct squashfs_stream __percpu *) msblk->stream; | |
11614 | - struct squashfs_stream *stream = get_cpu_ptr(percpu); | |
11615 | - int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, | |
11616 | - offset, length, output); | |
11617 | - put_cpu_ptr(stream); | |
11618 | + struct squashfs_stream *stream; | |
11619 | + int res; | |
11620 | + | |
11621 | + stream = get_locked_ptr(stream_lock, percpu); | |
11622 | + | |
11623 | + res = msblk->decompressor->decompress(msblk, stream->stream, bh, b, | |
11624 | + offset, length, output); | |
11625 | + | |
11626 | + put_locked_ptr(stream_lock, stream); | |
11627 | ||
11628 | if (res < 0) | |
11629 | ERROR("%s decompression failed, data probably corrupt\n", | |
11630 | diff --git a/fs/timerfd.c b/fs/timerfd.c | |
11631 | index 040612ec9598..b3d9d435926c 100644 | |
11632 | --- a/fs/timerfd.c | |
11633 | +++ b/fs/timerfd.c | |
11634 | @@ -471,7 +471,10 @@ static int do_timerfd_settime(int ufd, int flags, | |
e4b2b4a8 JK |
11635 | break; |
11636 | } | |
11637 | spin_unlock_irq(&ctx->wqh.lock); | |
11638 | - cpu_relax(); | |
11639 | + if (isalarm(ctx)) | |
11640 | + hrtimer_wait_for_timer(&ctx->t.alarm.timer); | |
11641 | + else | |
11642 | + hrtimer_wait_for_timer(&ctx->t.tmr); | |
11643 | } | |
11644 | ||
11645 | /* | |
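
Busy-waiting on a running timer callback with cpu_relax() can deadlock on RT, because the callback runs in a preemptible softirq thread that may never get the CPU back from the spinner; hrtimer_wait_for_timer() (declared in the hrtimer.h hunk below) sleeps until the callback has finished. The same idea in userspace, with a condition variable standing in for the kernel wait queue (names illustrative):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t done = PTHREAD_COND_INITIALIZER;
    static bool callback_running = true;

    static void *timer_callback(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&m);
        callback_running = false;       /* callback body finished */
        pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&m);
        return NULL;
    }

    static void wait_for_timer_model(void)
    {
        pthread_mutex_lock(&m);
        while (callback_running)        /* sleep, don't cpu_relax() */
            pthread_cond_wait(&done, &m);
        pthread_mutex_unlock(&m);
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, timer_callback, NULL);
        wait_for_timer_model();
        pthread_join(t, NULL);
        puts("timer quiesced");
        return 0;
    }
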
b3bbd485 JK |
11646 | diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c |
11647 | index b0cccf8a81a8..eaa4383defec 100644 | |
11648 | --- a/fs/xfs/xfs_aops.c | |
11649 | +++ b/fs/xfs/xfs_aops.c | |
11650 | @@ -120,8 +120,7 @@ xfs_finish_page_writeback( | |
e4b2b4a8 JK |
11651 | ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); |
11652 | ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); | |
11653 | ||
11654 | - local_irq_save(flags); | |
11655 | - bit_spin_lock(BH_Uptodate_Lock, &head->b_state); | |
11656 | + flags = bh_uptodate_lock_irqsave(head); | |
11657 | do { | |
11658 | if (off >= bvec->bv_offset && | |
11659 | off < bvec->bv_offset + bvec->bv_len) { | |
b3bbd485 | 11660 | @@ -143,8 +142,7 @@ xfs_finish_page_writeback( |
e4b2b4a8 JK |
11661 | } |
11662 | off += bh->b_size; | |
11663 | } while ((bh = bh->b_this_page) != head); | |
11664 | - bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); | |
11665 | - local_irq_restore(flags); | |
11666 | + bh_uptodate_unlock_irqrestore(head, flags); | |
11667 | ||
11668 | if (!busy) | |
11669 | end_page_writeback(bvec->bv_page); | |
b3bbd485 JK |
11670 | diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h |
11671 | index 1b473efd9eb6..89ee5e1dac48 100644 | |
11672 | --- a/include/acpi/platform/aclinux.h | |
11673 | +++ b/include/acpi/platform/aclinux.h | |
e4b2b4a8 JK |
11674 | @@ -134,6 +134,7 @@ |
11675 | ||
11676 | #define acpi_cache_t struct kmem_cache | |
11677 | #define acpi_spinlock spinlock_t * | |
11678 | +#define acpi_raw_spinlock raw_spinlock_t * | |
11679 | #define acpi_cpu_flags unsigned long | |
11680 | ||
11681 | /* Use native linux version of acpi_os_allocate_zeroed */ | |
11682 | @@ -152,6 +153,20 @@ | |
11683 | #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_get_thread_id | |
11684 | #define ACPI_USE_ALTERNATE_PROTOTYPE_acpi_os_create_lock | |
11685 | ||
11686 | +#define acpi_os_create_raw_lock(__handle) \ | |
11687 | +({ \ | |
11688 | + raw_spinlock_t *lock = ACPI_ALLOCATE(sizeof(*lock)); \ | |
11689 | + \ | |
11690 | + if (lock) { \ | |
11691 | + *(__handle) = lock; \ | |
11692 | + raw_spin_lock_init(*(__handle)); \ | |
11693 | + } \ | |
11694 | + lock ? AE_OK : AE_NO_MEMORY; \ | |
11695 | + }) | |
11696 | + | |
11697 | +#define acpi_os_delete_raw_lock(__handle) kfree(__handle) | |
11698 | + | |
11699 | + | |
11700 | /* | |
11701 | * OSL interfaces used by debugger/disassembler | |
11702 | */ | |
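
acpi_os_create_raw_lock() follows the usual allocate-initialise-publish pattern behind an opaque handle, returning AE_OK or AE_NO_MEMORY. The same shape in userspace, using a GNU C statement expression as the kernel macro does; the pthread spinlock and *_model names are stand-ins, build with gcc or clang.

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef enum { AE_OK_MODEL, AE_NO_MEMORY_MODEL } acpi_status_model;

    #define create_raw_lock_model(__handle)                         \
    ({                                                              \
        pthread_spinlock_t *lock = malloc(sizeof(*lock));           \
        if (lock) {                                                 \
            *(__handle) = lock;                                     \
            pthread_spin_init(lock, PTHREAD_PROCESS_PRIVATE);       \
        }                                                           \
        lock ? AE_OK_MODEL : AE_NO_MEMORY_MODEL;                    \
    })

    #define delete_raw_lock_model(__handle) free(__handle)

    int main(void)
    {
        pthread_spinlock_t *handle;

        if (create_raw_lock_model(&handle) != AE_OK_MODEL)
            return 1;
        pthread_spin_lock(handle);
        pthread_spin_unlock(handle);
        delete_raw_lock_model(handle);
        puts("ok");
        return 0;
    }
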
b3bbd485 JK |
11703 | diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h |
11704 | index ae1a33aa8955..c6d04eca8345 100644 | |
11705 | --- a/include/asm-generic/bug.h | |
11706 | +++ b/include/asm-generic/bug.h | |
11707 | @@ -234,6 +234,20 @@ void __warn(const char *file, int line, void *caller, unsigned taint, | |
e4b2b4a8 JK |
11708 | # define WARN_ON_SMP(x) ({0;}) |
11709 | #endif | |
11710 | ||
11711 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11712 | +# define BUG_ON_RT(c) BUG_ON(c) | |
11713 | +# define BUG_ON_NONRT(c) do { } while (0) | |
11714 | +# define WARN_ON_RT(condition) WARN_ON(condition) | |
11715 | +# define WARN_ON_NONRT(condition) do { } while (0) | |
11716 | +# define WARN_ON_ONCE_NONRT(condition) do { } while (0) | |
11717 | +#else | |
11718 | +# define BUG_ON_RT(c) do { } while (0) | |
11719 | +# define BUG_ON_NONRT(c) BUG_ON(c) | |
11720 | +# define WARN_ON_RT(condition) do { } while (0) | |
11721 | +# define WARN_ON_NONRT(condition) WARN_ON(condition) | |
11722 | +# define WARN_ON_ONCE_NONRT(condition) WARN_ON_ONCE(condition) | |
11723 | +#endif | |
11724 | + | |
11725 | #endif /* __ASSEMBLY__ */ | |
11726 | ||
11727 | #endif | |
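
These macros let assertions that are only meaningful on one flavour compile away on the other: for example, a check that a path never runs in atomic context holds on non-RT but is deliberately skipped on RT, where the same path runs preemptibly by design. A userspace sketch of the pattern; PREEMPT_RT_MODEL and assert() stand in for the config option and BUG().

    #include <assert.h>
    #include <stdio.h>

    #ifdef PREEMPT_RT_MODEL
    # define BUG_ON_RT(c)     assert(!(c))
    # define BUG_ON_NONRT(c)  do { } while (0)
    #else
    # define BUG_ON_RT(c)     do { } while (0)
    # define BUG_ON_NONRT(c)  assert(!(c))
    #endif

    int main(void)
    {
        int in_atomic_model = 0;

        /* Non-RT insists this path is never atomic; RT skips the
         * check because the path is preemptible there by design. */
        BUG_ON_NONRT(in_atomic_model);
        BUG_ON_RT(0);
        puts("ok");
        return 0;
    }
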
b3bbd485 JK |
11728 | diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h |
11729 | index 994cbb0f7ffc..0d4b7e3489a9 100644 | |
11730 | --- a/include/linux/blk-mq.h | |
11731 | +++ b/include/linux/blk-mq.h | |
11732 | @@ -226,7 +226,7 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) | |
11733 | return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; | |
11734 | } | |
11735 | ||
11736 | - | |
11737 | +void __blk_mq_complete_request_remote_work(struct work_struct *work); | |
11738 | int blk_mq_request_started(struct request *rq); | |
11739 | void blk_mq_start_request(struct request *rq); | |
11740 | void blk_mq_end_request(struct request *rq, blk_status_t error); | |
11741 | diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h | |
11742 | index 4d4af0e94059..cbf9d5730dd3 100644 | |
11743 | --- a/include/linux/blkdev.h | |
11744 | +++ b/include/linux/blkdev.h | |
e4b2b4a8 JK |
11745 | @@ -27,6 +27,7 @@ |
11746 | #include <linux/percpu-refcount.h> | |
11747 | #include <linux/scatterlist.h> | |
11748 | #include <linux/blkzoned.h> | |
11749 | +#include <linux/swork.h> | |
11750 | ||
11751 | struct module; | |
11752 | struct scsi_ioctl_command; | |
b3bbd485 | 11753 | @@ -134,6 +135,9 @@ typedef __u32 __bitwise req_flags_t; |
e4b2b4a8 JK |
11754 | */ |
11755 | struct request { | |
11756 | struct list_head queuelist; | |
11757 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11758 | + struct work_struct work; | |
11759 | +#endif | |
11760 | union { | |
11761 | struct __call_single_data csd; | |
11762 | u64 fifo_time; | |
b3bbd485 | 11763 | @@ -596,6 +600,7 @@ struct request_queue { |
e4b2b4a8 JK |
11764 | #endif |
11765 | struct rcu_head rcu_head; | |
11766 | wait_queue_head_t mq_freeze_wq; | |
11767 | + struct swork_event mq_pcpu_wake; | |
11768 | struct percpu_ref q_usage_counter; | |
11769 | struct list_head all_q_node; | |
11770 | ||
b3bbd485 JK |
11771 | diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h |
11772 | index a19519f4241d..40dd5ef9c154 100644 | |
11773 | --- a/include/linux/bottom_half.h | |
11774 | +++ b/include/linux/bottom_half.h | |
e4b2b4a8 JK |
11775 | @@ -4,6 +4,39 @@ |
11776 | ||
11777 | #include <linux/preempt.h> | |
11778 | ||
11779 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11780 | + | |
11781 | +extern void __local_bh_disable(void); | |
11782 | +extern void _local_bh_enable(void); | |
11783 | +extern void __local_bh_enable(void); | |
11784 | + | |
11785 | +static inline void local_bh_disable(void) | |
11786 | +{ | |
11787 | + __local_bh_disable(); | |
11788 | +} | |
11789 | + | |
11790 | +static inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) | |
11791 | +{ | |
11792 | + __local_bh_disable(); | |
11793 | +} | |
11794 | + | |
11795 | +static inline void local_bh_enable(void) | |
11796 | +{ | |
11797 | + __local_bh_enable(); | |
11798 | +} | |
11799 | + | |
11800 | +static inline void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) | |
11801 | +{ | |
11802 | + __local_bh_enable(); | |
11803 | +} | |
11804 | + | |
11805 | +static inline void local_bh_enable_ip(unsigned long ip) | |
11806 | +{ | |
11807 | + __local_bh_enable(); | |
11808 | +} | |
11809 | + | |
11810 | +#else | |
11811 | + | |
11812 | #ifdef CONFIG_TRACE_IRQFLAGS | |
11813 | extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); | |
11814 | #else | |
b3bbd485 | 11815 | @@ -31,5 +64,6 @@ static inline void local_bh_enable(void) |
e4b2b4a8 JK |
11816 | { |
11817 | __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); | |
11818 | } | |
11819 | +#endif | |
11820 | ||
11821 | #endif /* _LINUX_BH_H */ | |
b3bbd485 JK |
11822 | diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h |
11823 | index afa37f807f12..48505fade7e1 100644 | |
11824 | --- a/include/linux/buffer_head.h | |
11825 | +++ b/include/linux/buffer_head.h | |
11826 | @@ -76,8 +76,50 @@ struct buffer_head { | |
e4b2b4a8 JK |
11827 | struct address_space *b_assoc_map; /* mapping this buffer is |
11828 | associated with */ | |
11829 | atomic_t b_count; /* users using this buffer_head */ | |
11830 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11831 | + spinlock_t b_uptodate_lock; | |
11832 | +#if IS_ENABLED(CONFIG_JBD2) | |
11833 | + spinlock_t b_state_lock; | |
11834 | + spinlock_t b_journal_head_lock; | |
11835 | +#endif | |
11836 | +#endif | |
11837 | }; | |
11838 | ||
11839 | +static inline unsigned long bh_uptodate_lock_irqsave(struct buffer_head *bh) | |
11840 | +{ | |
11841 | + unsigned long flags; | |
11842 | + | |
11843 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
11844 | + local_irq_save(flags); | |
11845 | + bit_spin_lock(BH_Uptodate_Lock, &bh->b_state); | |
11846 | +#else | |
11847 | + spin_lock_irqsave(&bh->b_uptodate_lock, flags); | |
11848 | +#endif | |
11849 | + return flags; | |
11850 | +} | |
11851 | + | |
11852 | +static inline void | |
11853 | +bh_uptodate_unlock_irqrestore(struct buffer_head *bh, unsigned long flags) | |
11854 | +{ | |
11855 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
11856 | + bit_spin_unlock(BH_Uptodate_Lock, &bh->b_state); | |
11857 | + local_irq_restore(flags); | |
11858 | +#else | |
11859 | + spin_unlock_irqrestore(&bh->b_uptodate_lock, flags); | |
11860 | +#endif | |
11861 | +} | |
11862 | + | |
11863 | +static inline void buffer_head_init_locks(struct buffer_head *bh) | |
11864 | +{ | |
11865 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
11866 | + spin_lock_init(&bh->b_uptodate_lock); | |
11867 | +#if IS_ENABLED(CONFIG_JBD2) | |
11868 | + spin_lock_init(&bh->b_state_lock); | |
11869 | + spin_lock_init(&bh->b_journal_head_lock); | |
11870 | +#endif | |
11871 | +#endif | |
11872 | +} | |
11873 | + | |
11874 | /* | |
11875 | * macro tricks to expand the set_buffer_foo(), clear_buffer_foo() | |
11876 | * and buffer_foo() functions. | |
b3bbd485 | 11877 | diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h |
5dd41b01 | 11878 | index e7905d9353e8..4ecf7875e04f 100644 |
b3bbd485 JK |
11879 | --- a/include/linux/cgroup-defs.h |
11880 | +++ b/include/linux/cgroup-defs.h | |
e4b2b4a8 JK |
11881 | @@ -19,6 +19,7 @@ |
11882 | #include <linux/percpu-rwsem.h> | |
11883 | #include <linux/workqueue.h> | |
11884 | #include <linux/bpf-cgroup.h> | |
11885 | +#include <linux/swork.h> | |
11886 | ||
11887 | #ifdef CONFIG_CGROUPS | |
11888 | ||
b3bbd485 | 11889 | @@ -152,6 +153,7 @@ struct cgroup_subsys_state { |
e4b2b4a8 JK |
11890 | /* percpu_ref killing and RCU release */ |
11891 | struct rcu_head rcu_head; | |
11892 | struct work_struct destroy_work; | |
11893 | + struct swork_event destroy_swork; | |
11894 | ||
11895 | /* | |
11896 | * PI: the parent css. Placed here for cache proximity to following | |
b3bbd485 JK |
11897 | diff --git a/include/linux/completion.h b/include/linux/completion.h |
11898 | index 7828451e161a..f5838b10cf84 100644 | |
11899 | --- a/include/linux/completion.h | |
11900 | +++ b/include/linux/completion.h | |
e4b2b4a8 JK |
11901 | @@ -9,7 +9,7 @@ |
11902 | * See kernel/sched/completion.c for details. | |
11903 | */ | |
11904 | ||
11905 | -#include <linux/wait.h> | |
11906 | +#include <linux/swait.h> | |
11907 | #ifdef CONFIG_LOCKDEP_COMPLETIONS | |
11908 | #include <linux/lockdep.h> | |
11909 | #endif | |
11910 | @@ -28,7 +28,7 @@ | |
11911 | */ | |
11912 | struct completion { | |
11913 | unsigned int done; | |
11914 | - wait_queue_head_t wait; | |
11915 | + struct swait_queue_head wait; | |
11916 | #ifdef CONFIG_LOCKDEP_COMPLETIONS | |
11917 | struct lockdep_map_cross map; | |
11918 | #endif | |
b3bbd485 | 11919 | @@ -67,11 +67,11 @@ static inline void complete_release_commit(struct completion *x) {} |
e4b2b4a8 JK |
11920 | |
11921 | #ifdef CONFIG_LOCKDEP_COMPLETIONS | |
11922 | #define COMPLETION_INITIALIZER(work) \ | |
11923 | - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ | |
11924 | + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait), \ | |
11925 | STATIC_CROSS_LOCKDEP_MAP_INIT("(complete)" #work, &(work)) } | |
11926 | #else | |
11927 | #define COMPLETION_INITIALIZER(work) \ | |
11928 | - { 0, __WAIT_QUEUE_HEAD_INITIALIZER((work).wait) } | |
11929 | + { 0, __SWAIT_QUEUE_HEAD_INITIALIZER((work).wait) } | |
11930 | #endif | |
11931 | ||
11932 | #define COMPLETION_INITIALIZER_ONSTACK(work) \ | |
b3bbd485 | 11933 | @@ -117,7 +117,7 @@ static inline void complete_release_commit(struct completion *x) {} |
e4b2b4a8 JK |
11934 | static inline void __init_completion(struct completion *x) |
11935 | { | |
11936 | x->done = 0; | |
11937 | - init_waitqueue_head(&x->wait); | |
11938 | + init_swait_queue_head(&x->wait); | |
11939 | } | |
11940 | ||
11941 | /** | |
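
Completions only ever need plain FIFO wakeups, so they can sit on the simple (swait) queues, whose raw-spinlock-protected wake path stays usable from any context on RT; the done counter and the wait/complete protocol are unchanged by the hunk above. A userspace model of that protocol, with a condition variable standing in for the swait queue:

    #include <pthread.h>
    #include <stdio.h>

    struct completion_model {
        unsigned int done;
        pthread_mutex_t lock;
        pthread_cond_t wait;
    };

    static void wait_for_completion_model(struct completion_model *x)
    {
        pthread_mutex_lock(&x->lock);
        while (!x->done)
            pthread_cond_wait(&x->wait, &x->lock);
        x->done--;
        pthread_mutex_unlock(&x->lock);
    }

    static void complete_model(struct completion_model *x)
    {
        pthread_mutex_lock(&x->lock);
        x->done++;
        pthread_cond_signal(&x->wait);  /* plain FIFO wakeup is enough */
        pthread_mutex_unlock(&x->lock);
    }

    int main(void)
    {
        struct completion_model c = {
            0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
        };

        complete_model(&c);
        wait_for_completion_model(&c);
        puts("completed");
        return 0;
    }
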
b3bbd485 JK |
11942 | diff --git a/include/linux/cpu.h b/include/linux/cpu.h |
11943 | index 2a378d261914..b418d3c5159d 100644 | |
11944 | --- a/include/linux/cpu.h | |
11945 | +++ b/include/linux/cpu.h | |
11946 | @@ -120,6 +120,8 @@ extern void cpu_hotplug_disable(void); | |
e4b2b4a8 JK |
11947 | extern void cpu_hotplug_enable(void); |
11948 | void clear_tasks_mm_cpumask(int cpu); | |
11949 | int cpu_down(unsigned int cpu); | |
11950 | +extern void pin_current_cpu(void); | |
11951 | +extern void unpin_current_cpu(void); | |
11952 | ||
11953 | #else /* CONFIG_HOTPLUG_CPU */ | |
11954 | ||
b3bbd485 | 11955 | @@ -130,6 +132,9 @@ static inline void cpus_read_unlock(void) { } |
e4b2b4a8 JK |
11956 | static inline void lockdep_assert_cpus_held(void) { } |
11957 | static inline void cpu_hotplug_disable(void) { } | |
11958 | static inline void cpu_hotplug_enable(void) { } | |
11959 | +static inline void pin_current_cpu(void) { } | |
11960 | +static inline void unpin_current_cpu(void) { } | |
11961 | + | |
11962 | #endif /* !CONFIG_HOTPLUG_CPU */ | |
11963 | ||
11964 | /* Wrappers which go away once all code is converted */ | |
b3bbd485 JK |
11965 | diff --git a/include/linux/dcache.h b/include/linux/dcache.h |
11966 | index 006f4ccda5f5..d413993f7f17 100644 | |
11967 | --- a/include/linux/dcache.h | |
11968 | +++ b/include/linux/dcache.h | |
11969 | @@ -107,7 +107,7 @@ struct dentry { | |
e4b2b4a8 JK |
11970 | |
11971 | union { | |
11972 | struct list_head d_lru; /* LRU list */ | |
11973 | - wait_queue_head_t *d_wait; /* in-lookup ones only */ | |
11974 | + struct swait_queue_head *d_wait; /* in-lookup ones only */ | |
11975 | }; | |
11976 | struct list_head d_child; /* child of parent list */ | |
11977 | struct list_head d_subdirs; /* our children */ | |
b3bbd485 | 11978 | @@ -238,7 +238,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op |
e4b2b4a8 JK |
11979 | extern struct dentry * d_alloc(struct dentry *, const struct qstr *); |
11980 | extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); | |
11981 | extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, | |
11982 | - wait_queue_head_t *); | |
11983 | + struct swait_queue_head *); | |
11984 | extern struct dentry * d_splice_alias(struct inode *, struct dentry *); | |
11985 | extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); | |
11986 | extern struct dentry * d_exact_alias(struct dentry *, struct inode *); | |
b3bbd485 JK |
11987 | diff --git a/include/linux/delay.h b/include/linux/delay.h |
11988 | index b78bab4395d8..7c4bc414a504 100644 | |
11989 | --- a/include/linux/delay.h | |
11990 | +++ b/include/linux/delay.h | |
11991 | @@ -64,4 +64,10 @@ static inline void ssleep(unsigned int seconds) | |
e4b2b4a8 JK |
11992 | msleep(seconds * 1000); |
11993 | } | |
11994 | ||
11995 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
11996 | +extern void cpu_chill(void); | |
11997 | +#else | |
11998 | +# define cpu_chill() cpu_relax() | |
11999 | +#endif | |
12000 | + | |
12001 | #endif /* defined(_LINUX_DELAY_H) */ | |
b3bbd485 JK |
12002 | diff --git a/include/linux/fs.h b/include/linux/fs.h |
12003 | index cc613f20e5a6..b806e2116f5c 100644 | |
12004 | --- a/include/linux/fs.h | |
12005 | +++ b/include/linux/fs.h | |
12006 | @@ -655,7 +655,7 @@ struct inode { | |
e4b2b4a8 JK |
12007 | struct block_device *i_bdev; |
12008 | struct cdev *i_cdev; | |
12009 | char *i_link; | |
12010 | - unsigned i_dir_seq; | |
12011 | + unsigned __i_dir_seq; | |
12012 | }; | |
12013 | ||
12014 | __u32 i_generation; | |
b3bbd485 JK |
12015 | diff --git a/include/linux/highmem.h b/include/linux/highmem.h |
12016 | index 776f90f3a1cd..5f0bd7a3e6a7 100644 | |
12017 | --- a/include/linux/highmem.h | |
12018 | +++ b/include/linux/highmem.h | |
e4b2b4a8 JK |
12019 | @@ -8,6 +8,7 @@ |
12020 | #include <linux/mm.h> | |
12021 | #include <linux/uaccess.h> | |
12022 | #include <linux/hardirq.h> | |
12023 | +#include <linux/sched.h> | |
12024 | ||
12025 | #include <asm/cacheflush.h> | |
12026 | ||
b3bbd485 | 12027 | @@ -66,7 +67,7 @@ static inline void kunmap(struct page *page) |
e4b2b4a8 JK |
12028 | |
12029 | static inline void *kmap_atomic(struct page *page) | |
12030 | { | |
12031 | - preempt_disable(); | |
12032 | + preempt_disable_nort(); | |
12033 | pagefault_disable(); | |
12034 | return page_address(page); | |
12035 | } | |
b3bbd485 | 12036 | @@ -75,7 +76,7 @@ static inline void *kmap_atomic(struct page *page) |
e4b2b4a8 JK |
12037 | static inline void __kunmap_atomic(void *addr) |
12038 | { | |
12039 | pagefault_enable(); | |
12040 | - preempt_enable(); | |
12041 | + preempt_enable_nort(); | |
12042 | } | |
12043 | ||
12044 | #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) | |
b3bbd485 | 12045 | @@ -87,32 +88,51 @@ static inline void __kunmap_atomic(void *addr) |
e4b2b4a8 JK |
12046 | |
12047 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) | |
12048 | ||
12049 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
12050 | DECLARE_PER_CPU(int, __kmap_atomic_idx); | |
12051 | +#endif | |
12052 | ||
12053 | static inline int kmap_atomic_idx_push(void) | |
12054 | { | |
12055 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
12056 | int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; | |
12057 | ||
12058 | -#ifdef CONFIG_DEBUG_HIGHMEM | |
12059 | +# ifdef CONFIG_DEBUG_HIGHMEM | |
12060 | WARN_ON_ONCE(in_irq() && !irqs_disabled()); | |
12061 | BUG_ON(idx >= KM_TYPE_NR); | |
12062 | -#endif | |
12063 | +# endif | |
12064 | return idx; | |
12065 | +#else | |
12066 | + current->kmap_idx++; | |
12067 | + BUG_ON(current->kmap_idx > KM_TYPE_NR); | |
12068 | + return current->kmap_idx - 1; | |
12069 | +#endif | |
12070 | } | |
12071 | ||
12072 | static inline int kmap_atomic_idx(void) | |
12073 | { | |
12074 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
12075 | return __this_cpu_read(__kmap_atomic_idx) - 1; | |
12076 | +#else | |
12077 | + return current->kmap_idx - 1; | |
12078 | +#endif | |
12079 | } | |
12080 | ||
12081 | static inline void kmap_atomic_idx_pop(void) | |
12082 | { | |
12083 | -#ifdef CONFIG_DEBUG_HIGHMEM | |
12084 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
12085 | +# ifdef CONFIG_DEBUG_HIGHMEM | |
12086 | int idx = __this_cpu_dec_return(__kmap_atomic_idx); | |
12087 | ||
12088 | BUG_ON(idx < 0); | |
12089 | -#else | |
12090 | +# else | |
12091 | __this_cpu_dec(__kmap_atomic_idx); | |
12092 | +# endif | |
12093 | +#else | |
12094 | + current->kmap_idx--; | |
12095 | +# ifdef CONFIG_DEBUG_HIGHMEM | |
12096 | + BUG_ON(current->kmap_idx < 0); | |
12097 | +# endif | |
12098 | #endif | |
12099 | } | |
12100 | ||
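
With kmap_atomic() no longer disabling preemption on RT (the preempt_disable_nort() hunk above), a per-CPU nesting index could be corrupted when the task migrates between push and pop, so the index moves into task_struct as current->kmap_idx. A sketch of the per-task variant, with __thread standing in for the task_struct field (illustrative only):

    #include <assert.h>
    #include <stdio.h>

    #define KM_TYPE_NR_MODEL 20

    static __thread int kmap_idx;       /* models current->kmap_idx */

    static int kmap_atomic_idx_push_model(void)
    {
        kmap_idx++;
        assert(kmap_idx <= KM_TYPE_NR_MODEL);
        return kmap_idx - 1;
    }

    static void kmap_atomic_idx_pop_model(void)
    {
        kmap_idx--;
        assert(kmap_idx >= 0);
    }

    int main(void)
    {
        int idx = kmap_atomic_idx_push_model();

        printf("slot %d\n", idx);
        kmap_atomic_idx_pop_model();
        return 0;
    }
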
b3bbd485 JK |
12101 | diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h |
12102 | index 012c37fdb688..3bd606859b0a 100644 | |
12103 | --- a/include/linux/hrtimer.h | |
12104 | +++ b/include/linux/hrtimer.h | |
e4b2b4a8 JK |
12105 | @@ -22,19 +22,42 @@ |
12106 | #include <linux/percpu.h> | |
12107 | #include <linux/timer.h> | |
12108 | #include <linux/timerqueue.h> | |
12109 | +#include <linux/wait.h> | |
12110 | ||
12111 | struct hrtimer_clock_base; | |
12112 | struct hrtimer_cpu_base; | |
12113 | ||
12114 | /* | |
12115 | * Mode arguments of xxx_hrtimer functions: | |
12116 | + * | |
12117 | + * HRTIMER_MODE_ABS - Time value is absolute | |
12118 | + * HRTIMER_MODE_REL - Time value is relative to now | |
12119 | + * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered | |
12120 | + * when starting the timer) | |
12121 | + * HRTIMER_MODE_SOFT - Timer callback function will be executed in | |
12122 | + * soft irq context | |
12123 | */ | |
12124 | enum hrtimer_mode { | |
12125 | - HRTIMER_MODE_ABS = 0x0, /* Time value is absolute */ | |
12126 | - HRTIMER_MODE_REL = 0x1, /* Time value is relative to now */ | |
12127 | - HRTIMER_MODE_PINNED = 0x02, /* Timer is bound to CPU */ | |
12128 | - HRTIMER_MODE_ABS_PINNED = 0x02, | |
12129 | - HRTIMER_MODE_REL_PINNED = 0x03, | |
12130 | + HRTIMER_MODE_ABS = 0x00, | |
12131 | + HRTIMER_MODE_REL = 0x01, | |
12132 | + HRTIMER_MODE_PINNED = 0x02, | |
12133 | + HRTIMER_MODE_SOFT = 0x04, | |
12134 | + HRTIMER_MODE_HARD = 0x08, | |
12135 | + | |
12136 | + HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, | |
12137 | + HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, | |
12138 | + | |
12139 | + HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, | |
12140 | + HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, | |
12141 | + | |
12142 | + HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, | |
12143 | + HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, | |
12144 | + | |
12145 | + HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, | |
12146 | + HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, | |
12147 | + | |
12148 | + HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, | |
12149 | + HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, | |
12150 | }; | |
12151 | ||
12152 | /* | |
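
The reworked modes are composable bit flags rather than a handful of fixed values, so callers and the core can test individual properties with a mask. A small sketch; the values mirror the enum above and the *_MODEL names are stand-ins.

    #include <stdio.h>

    enum hrtimer_mode_model {
        MODE_ABS    = 0x00,
        MODE_REL    = 0x01,
        MODE_PINNED = 0x02,
        MODE_SOFT   = 0x04,
        MODE_HARD   = 0x08,
        MODE_REL_PINNED_SOFT = MODE_REL | MODE_PINNED | MODE_SOFT,
    };

    int main(void)
    {
        enum hrtimer_mode_model mode = MODE_REL_PINNED_SOFT;

        printf("relative: %d\n", !!(mode & MODE_REL));
        printf("pinned:   %d\n", !!(mode & MODE_PINNED));
        printf("softirq:  %d\n", !!(mode & MODE_SOFT));
        return 0;
    }
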
b3bbd485 | 12153 | @@ -87,6 +110,7 @@ enum hrtimer_restart { |
e4b2b4a8 JK |
12154 | * @base: pointer to the timer base (per cpu and per clock) |
12155 | * @state: state information (See bit values above) | |
12156 | * @is_rel: Set if the timer was armed relative | |
12157 | + * @is_soft: Set if hrtimer will be expired in soft interrupt context. | |
12158 | * | |
12159 | * The hrtimer structure must be initialized by hrtimer_init() | |
12160 | */ | |
b3bbd485 | 12161 | @@ -97,6 +121,7 @@ struct hrtimer { |
e4b2b4a8 JK |
12162 | struct hrtimer_clock_base *base; |
12163 | u8 state; | |
12164 | u8 is_rel; | |
12165 | + u8 is_soft; | |
12166 | }; | |
12167 | ||
12168 | /** | |
b3bbd485 | 12169 | @@ -112,9 +137,9 @@ struct hrtimer_sleeper { |
e4b2b4a8 JK |
12170 | }; |
12171 | ||
12172 | #ifdef CONFIG_64BIT | |
12173 | -# define HRTIMER_CLOCK_BASE_ALIGN 64 | |
12174 | +# define __hrtimer_clock_base_align ____cacheline_aligned | |
12175 | #else | |
12176 | -# define HRTIMER_CLOCK_BASE_ALIGN 32 | |
12177 | +# define __hrtimer_clock_base_align | |
12178 | #endif | |
12179 | ||
12180 | /** | |
b3bbd485 | 12181 | @@ -123,48 +148,57 @@ struct hrtimer_sleeper { |
e4b2b4a8 JK |
12182 | * @index: clock type index for per_cpu support when moving a |
12183 | * timer to a base on another cpu. | |
12184 | * @clockid: clock id for per_cpu support | |
12185 | + * @seq: seqcount around __run_hrtimer | |
12186 | + * @running: pointer to the currently running hrtimer | |
12187 | * @active: red black tree root node for the active timers | |
12188 | * @get_time: function to retrieve the current time of the clock | |
12189 | * @offset: offset of this clock to the monotonic base | |
12190 | */ | |
12191 | struct hrtimer_clock_base { | |
12192 | struct hrtimer_cpu_base *cpu_base; | |
12193 | - int index; | |
12194 | + unsigned int index; | |
12195 | clockid_t clockid; | |
12196 | + seqcount_t seq; | |
12197 | + struct hrtimer *running; | |
12198 | struct timerqueue_head active; | |
12199 | ktime_t (*get_time)(void); | |
12200 | ktime_t offset; | |
12201 | -} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN))); | |
12202 | +} __hrtimer_clock_base_align; | |
12203 | ||
12204 | enum hrtimer_base_type { | |
12205 | HRTIMER_BASE_MONOTONIC, | |
12206 | HRTIMER_BASE_REALTIME, | |
12207 | HRTIMER_BASE_BOOTTIME, | |
12208 | HRTIMER_BASE_TAI, | |
12209 | + HRTIMER_BASE_MONOTONIC_SOFT, | |
12210 | + HRTIMER_BASE_REALTIME_SOFT, | |
12211 | + HRTIMER_BASE_BOOTTIME_SOFT, | |
12212 | + HRTIMER_BASE_TAI_SOFT, | |
12213 | HRTIMER_MAX_CLOCK_BASES, | |
12214 | }; | |
12215 | ||
12216 | -/* | |
12217 | +/** | |
12218 | * struct hrtimer_cpu_base - the per cpu clock bases | |
12219 | * @lock: lock protecting the base and associated clock bases | |
12220 | * and timers | |
12221 | - * @seq: seqcount around __run_hrtimer | |
12222 | - * @running: pointer to the currently running hrtimer | |
12223 | * @cpu: cpu number | |
12224 | * @active_bases: Bitfield to mark bases with active timers | |
12225 | * @clock_was_set_seq: Sequence counter of clock was set events | |
12226 | - * @migration_enabled: The migration of hrtimers to other cpus is enabled | |
12227 | - * @nohz_active: The nohz functionality is enabled | |
12228 | - * @expires_next: absolute time of the next event which was scheduled | |
12229 | - * via clock_set_next_event() | |
12230 | - * @next_timer: Pointer to the first expiring timer | |
12231 | - * @in_hrtirq: hrtimer_interrupt() is currently executing | |
12232 | * @hres_active: State of high resolution mode | |
12233 | + * @in_hrtirq: hrtimer_interrupt() is currently executing | |
12234 | * @hang_detected: The last hrtimer interrupt detected a hang | |
12235 | + * @softirq_activated: displays, if the softirq is raised - update of softirq | |
12236 | + * related settings is not required then. | |
12237 | * @nr_events: Total number of hrtimer interrupt events | |
12238 | * @nr_retries: Total number of hrtimer interrupt retries | |
12239 | * @nr_hangs: Total number of hrtimer interrupt hangs | |
12240 | * @max_hang_time: Maximum time spent in hrtimer_interrupt | |
12241 | + * @expires_next: absolute time of the next event, is required for remote | |
12242 | + * hrtimer enqueue; it is the total first expiry time (hard | |
12243 | + * and soft hrtimer are taken into account) | |
12244 | + * @next_timer: Pointer to the first expiring timer | |
12245 | + * @softirq_expires_next: Time to check, if soft queues needs also to be expired | |
12246 | + * @softirq_next_timer: Pointer to the first expiring softirq based timer | |
12247 | * @clock_base: array of clock bases for this cpu | |
12248 | * | |
12249 | * Note: next_timer is just an optimization for __remove_hrtimer(). | |
b3bbd485 | 12250 | @@ -173,31 +207,31 @@ enum hrtimer_base_type { |
e4b2b4a8 JK |
12251 | */ |
12252 | struct hrtimer_cpu_base { | |
12253 | raw_spinlock_t lock; | |
12254 | - seqcount_t seq; | |
12255 | - struct hrtimer *running; | |
12256 | unsigned int cpu; | |
12257 | unsigned int active_bases; | |
12258 | unsigned int clock_was_set_seq; | |
12259 | - bool migration_enabled; | |
12260 | - bool nohz_active; | |
12261 | + unsigned int hres_active : 1, | |
12262 | + in_hrtirq : 1, | |
12263 | + hang_detected : 1, | |
12264 | + softirq_activated : 1; | |
12265 | #ifdef CONFIG_HIGH_RES_TIMERS | |
12266 | - unsigned int in_hrtirq : 1, | |
12267 | - hres_active : 1, | |
12268 | - hang_detected : 1; | |
12269 | - ktime_t expires_next; | |
12270 | - struct hrtimer *next_timer; | |
12271 | unsigned int nr_events; | |
12272 | - unsigned int nr_retries; | |
12273 | - unsigned int nr_hangs; | |
12274 | + unsigned short nr_retries; | |
12275 | + unsigned short nr_hangs; | |
12276 | unsigned int max_hang_time; | |
12277 | #endif | |
12278 | + ktime_t expires_next; | |
12279 | + struct hrtimer *next_timer; | |
12280 | + ktime_t softirq_expires_next; | |
12281 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
12282 | + wait_queue_head_t wait; | |
12283 | +#endif | |
12284 | + struct hrtimer *softirq_next_timer; | |
12285 | struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; | |
12286 | } ____cacheline_aligned; | |
12287 | ||
12288 | static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) | |
12289 | { | |
12290 | - BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN); | |
12291 | - | |
12292 | timer->node.expires = time; | |
12293 | timer->_softexpires = time; | |
12294 | } | |
b3bbd485 | 12295 | @@ -266,16 +300,17 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) |
e4b2b4a8 JK |
12296 | return timer->base->get_time(); |
12297 | } | |
12298 | ||
12299 | +static inline int hrtimer_is_hres_active(struct hrtimer *timer) | |
12300 | +{ | |
12301 | + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? | |
12302 | + timer->base->cpu_base->hres_active : 0; | |
12303 | +} | |
12304 | + | |
12305 | #ifdef CONFIG_HIGH_RES_TIMERS | |
12306 | struct clock_event_device; | |
12307 | ||
12308 | extern void hrtimer_interrupt(struct clock_event_device *dev); | |
12309 | ||
12310 | -static inline int hrtimer_is_hres_active(struct hrtimer *timer) | |
12311 | -{ | |
12312 | - return timer->base->cpu_base->hres_active; | |
12313 | -} | |
12314 | - | |
12315 | /* | |
12316 | * The resolution of the clocks. The resolution value is returned in | |
12317 | * the clock_getres() system call to give application programmers an | |
b3bbd485 | 12318 | @@ -298,11 +333,6 @@ extern unsigned int hrtimer_resolution; |
e4b2b4a8 JK |
12319 | |
12320 | #define hrtimer_resolution (unsigned int)LOW_RES_NSEC | |
12321 | ||
12322 | -static inline int hrtimer_is_hres_active(struct hrtimer *timer) | |
12323 | -{ | |
12324 | - return 0; | |
12325 | -} | |
12326 | - | |
12327 | static inline void clock_was_set_delayed(void) { } | |
12328 | ||
12329 | #endif | |
b3bbd485 | 12330 | @@ -344,10 +374,17 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); |
e4b2b4a8 JK |
12331 | /* Initialize timers: */ |
12332 | extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, | |
12333 | enum hrtimer_mode mode); | |
12334 | +extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, | |
12335 | + enum hrtimer_mode mode, | |
12336 | + struct task_struct *task); | |
12337 | ||
12338 | #ifdef CONFIG_DEBUG_OBJECTS_TIMERS | |
12339 | extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, | |
12340 | enum hrtimer_mode mode); | |
12341 | +extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, | |
12342 | + clockid_t clock_id, | |
12343 | + enum hrtimer_mode mode, | |
12344 | + struct task_struct *task); | |
12345 | ||
12346 | extern void destroy_hrtimer_on_stack(struct hrtimer *timer); | |
12347 | #else | |
b3bbd485 | 12348 | @@ -357,6 +394,15 @@ static inline void hrtimer_init_on_stack(struct hrtimer *timer, |
e4b2b4a8 JK |
12349 | { |
12350 | hrtimer_init(timer, which_clock, mode); | |
12351 | } | |
12352 | + | |
12353 | +static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, | |
12354 | + clockid_t clock_id, | |
12355 | + enum hrtimer_mode mode, | |
12356 | + struct task_struct *task) | |
12357 | +{ | |
12358 | + hrtimer_init_sleeper(sl, clock_id, mode, task); | |
12359 | +} | |
12360 | + | |
12361 | static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } | |
12362 | #endif | |
12363 | ||
b3bbd485 | 12364 | @@ -365,11 +411,12 @@ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, |
e4b2b4a8 JK |
12365 | u64 range_ns, const enum hrtimer_mode mode); |
12366 | ||
12367 | /** | |
12368 | - * hrtimer_start - (re)start an hrtimer on the current CPU | |
12369 | + * hrtimer_start - (re)start an hrtimer | |
12370 | * @timer: the timer to be added | |
12371 | * @tim: expiry time | |
12372 | - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or | |
12373 | - * relative (HRTIMER_MODE_REL) | |
12374 | + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or | |
12375 | + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); | |
12376 | + * softirq based mode is considered for debug purposes only! |
12377 | */ | |
12378 | static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, | |
12379 | const enum hrtimer_mode mode) | |
b3bbd485 | 12380 | @@ -396,6 +443,13 @@ static inline void hrtimer_restart(struct hrtimer *timer) |
e4b2b4a8 JK |
12381 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
12382 | } | |
12383 | ||
12384 | +/* Softirq preemption could deadlock timer removal */ | |
12385 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
12386 | + extern void hrtimer_wait_for_timer(const struct hrtimer *timer); | |
12387 | +#else | |
12388 | +# define hrtimer_wait_for_timer(timer) do { cpu_relax(); } while (0) | |
12389 | +#endif | |
12390 | + | |
12391 | /* Query timers: */ | |
12392 | extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); | |
12393 | ||
b3bbd485 | 12394 | @@ -420,9 +474,9 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) |
e4b2b4a8 JK |
12395 | * Helper function to check, whether the timer is running the callback |
12396 | * function | |
12397 | */ | |
12398 | -static inline int hrtimer_callback_running(struct hrtimer *timer) | |
12399 | +static inline int hrtimer_callback_running(const struct hrtimer *timer) | |
12400 | { | |
12401 | - return timer->base->cpu_base->running == timer; | |
12402 | + return timer->base->running == timer; | |
12403 | } | |
12404 | ||
12405 | /* Forward a hrtimer so it expires after now: */ | |
b3bbd485 | 12406 | @@ -458,15 +512,12 @@ extern long hrtimer_nanosleep(const struct timespec64 *rqtp, |
e4b2b4a8 JK |
12407 | const enum hrtimer_mode mode, |
12408 | const clockid_t clockid); | |
12409 | ||
12410 | -extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, | |
12411 | - struct task_struct *tsk); | |
12412 | - | |
12413 | extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, | |
12414 | const enum hrtimer_mode mode); | |
12415 | extern int schedule_hrtimeout_range_clock(ktime_t *expires, | |
12416 | u64 delta, | |
12417 | const enum hrtimer_mode mode, | |
12418 | - int clock); | |
12419 | + clockid_t clock_id); | |
12420 | extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); | |
12421 | ||
12422 | /* Soft interrupt function to run the hrtimer queues: */ | |
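As a usage sketch of the mode pairing that the updated hrtimer_start() kernel-doc above describes (not part of the patch; my_timer, my_timer_fn and the 100ms period are illustrative):

    #include <linux/hrtimer.h>

    static struct hrtimer my_timer;

    static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
    {
            /* expiry callback; return HRTIMER_RESTART to re-arm */
            return HRTIMER_NORESTART;
    }

    static void my_arm_timer(void)
    {
            hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            my_timer.function = my_timer_fn;
            /* the mode passed here should match the mode used at init */
            hrtimer_start(&my_timer, ms_to_ktime(100), HRTIMER_MODE_REL);
    }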
b3bbd485 JK |
12423 | diff --git a/include/linux/idr.h b/include/linux/idr.h |
12424 | index 7c3a365f7e12..a922d984d9b6 100644 | |
12425 | --- a/include/linux/idr.h | |
12426 | +++ b/include/linux/idr.h | |
12427 | @@ -167,10 +167,7 @@ static inline bool idr_is_empty(const struct idr *idr) | |
e4b2b4a8 JK |
12428 | * Each idr_preload() should be matched with an invocation of this |
12429 | * function. See idr_preload() for details. | |
12430 | */ | |
12431 | -static inline void idr_preload_end(void) | |
12432 | -{ | |
12433 | - preempt_enable(); | |
12434 | -} | |
12435 | +void idr_preload_end(void); | |
12436 | ||
12437 | /** | |
12438 | * idr_find - return pointer for given id | |
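The hunk above turns idr_preload_end() into an out-of-line function (on RT it can no longer be a bare preempt_enable()); the calling convention is unchanged. A minimal sketch of the preload pattern, with illustrative names:

    #include <linux/idr.h>
    #include <linux/spinlock.h>

    static DEFINE_IDR(my_idr);
    static DEFINE_SPINLOCK(my_idr_lock);

    static int my_store(void *ptr)
    {
            int id;

            idr_preload(GFP_KERNEL);        /* may sleep; preallocates */
            spin_lock(&my_idr_lock);
            id = idr_alloc(&my_idr, ptr, 0, 0, GFP_NOWAIT);
            spin_unlock(&my_idr_lock);
            idr_preload_end();              /* must pair with idr_preload() */

            return id;                      /* negative errno on failure */
    }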
b3bbd485 JK |
12439 | diff --git a/include/linux/init_task.h b/include/linux/init_task.h |
12440 | index 8062e6cc607c..ee3ff961b84c 100644 | |
12441 | --- a/include/linux/init_task.h | |
12442 | +++ b/include/linux/init_task.h | |
12443 | @@ -163,6 +163,12 @@ extern struct cred init_cred; | |
e4b2b4a8 JK |
12444 | # define INIT_PERF_EVENTS(tsk) |
12445 | #endif | |
12446 | ||
12447 | +#if defined(CONFIG_POSIX_TIMERS) && defined(CONFIG_PREEMPT_RT_BASE) | |
12448 | +# define INIT_TIMER_LIST .posix_timer_list = NULL, | |
12449 | +#else | |
12450 | +# define INIT_TIMER_LIST | |
12451 | +#endif | |
12452 | + | |
12453 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN | |
12454 | # define INIT_VTIME(tsk) \ | |
12455 | .vtime.seqcount = SEQCNT_ZERO(tsk.vtime.seqcount), \ | |
b3bbd485 | 12456 | @@ -234,7 +240,8 @@ extern struct cred init_cred; |
e4b2b4a8 JK |
12457 | .static_prio = MAX_PRIO-20, \ |
12458 | .normal_prio = MAX_PRIO-20, \ | |
12459 | .policy = SCHED_NORMAL, \ | |
12460 | - .cpus_allowed = CPU_MASK_ALL, \ | |
12461 | + .cpus_ptr = &tsk.cpus_mask, \ | |
12462 | + .cpus_mask = CPU_MASK_ALL, \ | |
12463 | .nr_cpus_allowed= NR_CPUS, \ | |
12464 | .mm = NULL, \ | |
12465 | .active_mm = &init_mm, \ | |
b3bbd485 | 12466 | @@ -276,6 +283,7 @@ extern struct cred init_cred; |
e4b2b4a8 JK |
12467 | INIT_CPU_TIMERS(tsk) \ |
12468 | .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ | |
12469 | .timer_slack_ns = 50000, /* 50 usec default slack */ \ | |
12470 | + INIT_TIMER_LIST \ | |
12471 | .pids = { \ | |
12472 | [PIDTYPE_PID] = INIT_PID_LINK(PIDTYPE_PID), \ | |
12473 | [PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID), \ | |
b3bbd485 JK |
12474 | diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h |
12475 | index 69c238210325..0f25fa19b2d8 100644 | |
12476 | --- a/include/linux/interrupt.h | |
12477 | +++ b/include/linux/interrupt.h | |
e4b2b4a8 JK |
12478 | @@ -15,6 +15,7 @@ |
12479 | #include <linux/hrtimer.h> | |
12480 | #include <linux/kref.h> | |
12481 | #include <linux/workqueue.h> | |
12482 | +#include <linux/swork.h> | |
12483 | ||
12484 | #include <linux/atomic.h> | |
12485 | #include <asm/ptrace.h> | |
12486 | @@ -63,6 +64,7 @@ | |
12487 | * interrupt handler after suspending interrupts. For system | |
12488 | * wakeup devices users need to implement wakeup detection in | |
12489 | * their interrupt handlers. | |
12490 | + * IRQF_NO_SOFTIRQ_CALL - Do not process softirqs in the irq thread context (RT) | |
12491 | */ | |
12492 | #define IRQF_SHARED 0x00000080 | |
12493 | #define IRQF_PROBE_SHARED 0x00000100 | |
12494 | @@ -76,6 +78,7 @@ | |
12495 | #define IRQF_NO_THREAD 0x00010000 | |
12496 | #define IRQF_EARLY_RESUME 0x00020000 | |
12497 | #define IRQF_COND_SUSPEND 0x00040000 | |
12498 | +#define IRQF_NO_SOFTIRQ_CALL 0x00080000 | |
12499 | ||
12500 | #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) | |
12501 | ||
b3bbd485 | 12502 | @@ -207,7 +210,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); |
e4b2b4a8 JK |
12503 | #ifdef CONFIG_LOCKDEP |
12504 | # define local_irq_enable_in_hardirq() do { } while (0) | |
12505 | #else | |
12506 | -# define local_irq_enable_in_hardirq() local_irq_enable() | |
12507 | +# define local_irq_enable_in_hardirq() local_irq_enable_nort() | |
12508 | #endif | |
12509 | ||
12510 | extern void disable_irq_nosync(unsigned int irq); | |
b3bbd485 | 12511 | @@ -227,6 +230,7 @@ extern void resume_device_irqs(void); |
e4b2b4a8 JK |
12512 | * struct irq_affinity_notify - context for notification of IRQ affinity changes |
12513 | * @irq: Interrupt to which notification applies | |
12514 | * @kref: Reference count, for internal use | |
12515 | + * @swork: Swork item, for internal use | |
12516 | * @work: Work item, for internal use | |
12517 | * @notify: Function to be called on change. This will be | |
12518 | * called in process context. | |
b3bbd485 | 12519 | @@ -238,7 +242,11 @@ extern void resume_device_irqs(void); |
e4b2b4a8 JK |
12520 | struct irq_affinity_notify { |
12521 | unsigned int irq; | |
12522 | struct kref kref; | |
12523 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
12524 | + struct swork_event swork; | |
12525 | +#else | |
12526 | struct work_struct work; | |
12527 | +#endif | |
12528 | void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask); | |
12529 | void (*release)(struct kref *ref); | |
12530 | }; | |
b3bbd485 | 12531 | @@ -429,9 +437,13 @@ extern int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, |
e4b2b4a8 JK |
12532 | bool state); |
12533 | ||
12534 | #ifdef CONFIG_IRQ_FORCED_THREADING | |
12535 | +# ifndef CONFIG_PREEMPT_RT_BASE | |
12536 | extern bool force_irqthreads; | |
12537 | +# else | |
12538 | +# define force_irqthreads (true) | |
12539 | +# endif | |
12540 | #else | |
12541 | -#define force_irqthreads (0) | |
12542 | +#define force_irqthreads (false) | |
12543 | #endif | |
12544 | ||
12545 | #ifndef __ARCH_SET_SOFTIRQ_PENDING | |
b3bbd485 | 12546 | @@ -488,9 +500,10 @@ struct softirq_action |
e4b2b4a8 JK |
12547 | void (*action)(struct softirq_action *); |
12548 | }; | |
12549 | ||
12550 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
12551 | asmlinkage void do_softirq(void); | |
12552 | asmlinkage void __do_softirq(void); | |
12553 | - | |
12554 | +static inline void thread_do_softirq(void) { do_softirq(); } | |
12555 | #ifdef __ARCH_HAS_DO_SOFTIRQ | |
12556 | void do_softirq_own_stack(void); | |
12557 | #else | |
b3bbd485 | 12558 | @@ -499,13 +512,25 @@ static inline void do_softirq_own_stack(void) |
e4b2b4a8 JK |
12559 | __do_softirq(); |
12560 | } | |
12561 | #endif | |
12562 | +#else | |
12563 | +extern void thread_do_softirq(void); | |
12564 | +#endif | |
12565 | ||
12566 | extern void open_softirq(int nr, void (*action)(struct softirq_action *)); | |
12567 | extern void softirq_init(void); | |
12568 | extern void __raise_softirq_irqoff(unsigned int nr); | |
12569 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
12570 | +extern void __raise_softirq_irqoff_ksoft(unsigned int nr); | |
12571 | +#else | |
12572 | +static inline void __raise_softirq_irqoff_ksoft(unsigned int nr) | |
12573 | +{ | |
12574 | + __raise_softirq_irqoff(nr); | |
12575 | +} | |
12576 | +#endif | |
12577 | ||
12578 | extern void raise_softirq_irqoff(unsigned int nr); | |
12579 | extern void raise_softirq(unsigned int nr); | |
12580 | +extern void softirq_check_pending_idle(void); | |
12581 | ||
12582 | DECLARE_PER_CPU(struct task_struct *, ksoftirqd); | |
12583 | ||
b3bbd485 | 12584 | @@ -527,8 +552,9 @@ static inline struct task_struct *this_cpu_ksoftirqd(void) |
e4b2b4a8 JK |
12585 | to be executed on some cpu at least once after this. |
12586 | * If the tasklet is already scheduled, but its execution is still not | |
12587 | started, it will be executed only once. | |
12588 | - * If this tasklet is already running on another CPU (or schedule is called | |
12589 | - from tasklet itself), it is rescheduled for later. | |
12590 | + * If this tasklet is already running on another CPU, it is rescheduled | |
12591 | + for later. | |
12592 | + * Schedule must not be called from the tasklet itself (a lockup occurs) | |
12593 | * Tasklet is strictly serialized wrt itself, but not | |
12594 | wrt another tasklets. If client needs some intertask synchronization, | |
12595 | he makes it with spinlocks. | |
b3bbd485 | 12596 | @@ -553,27 +579,36 @@ struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(1), func, data } |
e4b2b4a8 JK |
12597 | enum |
12598 | { | |
12599 | TASKLET_STATE_SCHED, /* Tasklet is scheduled for execution */ | |
12600 | - TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ | |
12601 | + TASKLET_STATE_RUN, /* Tasklet is running (SMP only) */ | |
12602 | + TASKLET_STATE_PENDING /* Tasklet is pending */ | |
12603 | }; | |
12604 | ||
12605 | -#ifdef CONFIG_SMP | |
12606 | +#define TASKLET_STATEF_SCHED (1 << TASKLET_STATE_SCHED) | |
12607 | +#define TASKLET_STATEF_RUN (1 << TASKLET_STATE_RUN) | |
12608 | +#define TASKLET_STATEF_PENDING (1 << TASKLET_STATE_PENDING) | |
12609 | + | |
12610 | +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) | |
12611 | static inline int tasklet_trylock(struct tasklet_struct *t) | |
12612 | { | |
12613 | return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); | |
12614 | } | |
12615 | ||
12616 | +static inline int tasklet_tryunlock(struct tasklet_struct *t) | |
12617 | +{ | |
12618 | + return cmpxchg(&t->state, TASKLET_STATEF_RUN, 0) == TASKLET_STATEF_RUN; | |
12619 | +} | |
12620 | + | |
12621 | static inline void tasklet_unlock(struct tasklet_struct *t) | |
12622 | { | |
12623 | smp_mb__before_atomic(); | |
12624 | clear_bit(TASKLET_STATE_RUN, &(t)->state); | |
12625 | } | |
12626 | ||
12627 | -static inline void tasklet_unlock_wait(struct tasklet_struct *t) | |
12628 | -{ | |
12629 | - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } | |
12630 | -} | |
12631 | +extern void tasklet_unlock_wait(struct tasklet_struct *t); | |
12632 | + | |
12633 | #else | |
12634 | #define tasklet_trylock(t) 1 | |
12635 | +#define tasklet_tryunlock(t) 1 | |
12636 | #define tasklet_unlock_wait(t) do { } while (0) | |
12637 | #define tasklet_unlock(t) do { } while (0) | |
12638 | #endif | |
b3bbd485 | 12639 | @@ -607,41 +642,17 @@ static inline void tasklet_disable(struct tasklet_struct *t) |
e4b2b4a8 JK |
12640 | smp_mb(); |
12641 | } | |
12642 | ||
12643 | -static inline void tasklet_enable(struct tasklet_struct *t) | |
12644 | -{ | |
12645 | - smp_mb__before_atomic(); | |
12646 | - atomic_dec(&t->count); | |
12647 | -} | |
12648 | - | |
12649 | +extern void tasklet_enable(struct tasklet_struct *t); | |
12650 | extern void tasklet_kill(struct tasklet_struct *t); | |
12651 | extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); | |
12652 | extern void tasklet_init(struct tasklet_struct *t, | |
12653 | void (*func)(unsigned long), unsigned long data); | |
12654 | ||
12655 | -struct tasklet_hrtimer { | |
12656 | - struct hrtimer timer; | |
12657 | - struct tasklet_struct tasklet; | |
12658 | - enum hrtimer_restart (*function)(struct hrtimer *); | |
12659 | -}; | |
12660 | - | |
12661 | -extern void | |
12662 | -tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, | |
12663 | - enum hrtimer_restart (*function)(struct hrtimer *), | |
12664 | - clockid_t which_clock, enum hrtimer_mode mode); | |
12665 | - | |
12666 | -static inline | |
12667 | -void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, | |
12668 | - const enum hrtimer_mode mode) | |
12669 | -{ | |
12670 | - hrtimer_start(&ttimer->timer, time, mode); | |
12671 | -} | |
12672 | - | |
12673 | -static inline | |
12674 | -void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) | |
12675 | -{ | |
12676 | - hrtimer_cancel(&ttimer->timer); | |
12677 | - tasklet_kill(&ttimer->tasklet); | |
12678 | -} | |
12679 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
12680 | +extern void softirq_early_init(void); | |
12681 | +#else | |
12682 | +static inline void softirq_early_init(void) { } | |
12683 | +#endif | |
12684 | ||
12685 | /* | |
12686 | * Autoprobing for irqs: | |
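To make the amended tasklet scheduling rule concrete, a sketch of the usual pattern (illustrative names): the tasklet is scheduled from the interrupt handler, never from the tasklet callback itself.

    #include <linux/interrupt.h>

    static void my_tasklet_fn(unsigned long data)
    {
            /* softirq context; strictly serialized against itself */
    }

    static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

    static irqreturn_t my_irq_handler(int irq, void *dev)
    {
            tasklet_schedule(&my_tasklet);  /* safe from hardirq context */
            return IRQ_HANDLED;
    }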
b3bbd485 JK |
12687 | diff --git a/include/linux/irq.h b/include/linux/irq.h |
12688 | index 0d53626405bf..ddd23c6e2e55 100644 | |
12689 | --- a/include/linux/irq.h | |
12690 | +++ b/include/linux/irq.h | |
12691 | @@ -74,6 +74,7 @@ enum irqchip_irq_state; | |
12692 | * IRQ_IS_POLLED - Always polled by another interrupt. Exclude | |
12693 | * it from the spurious interrupt detection | |
12694 | * mechanism and from core side polling. | |
12695 | + * IRQ_NO_SOFTIRQ_CALL - No softirq processing in the irq thread context (RT) | |
12696 | * IRQ_DISABLE_UNLAZY - Disable lazy irq disable | |
12697 | */ | |
12698 | enum { | |
12699 | @@ -101,13 +102,14 @@ enum { | |
12700 | IRQ_PER_CPU_DEVID = (1 << 17), | |
12701 | IRQ_IS_POLLED = (1 << 18), | |
12702 | IRQ_DISABLE_UNLAZY = (1 << 19), | |
12703 | + IRQ_NO_SOFTIRQ_CALL = (1 << 20), | |
12704 | }; | |
12705 | ||
12706 | #define IRQF_MODIFY_MASK \ | |
12707 | (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \ | |
12708 | IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL | IRQ_NO_BALANCING | \ | |
12709 | IRQ_PER_CPU | IRQ_NESTED_THREAD | IRQ_NOTHREAD | IRQ_PER_CPU_DEVID | \ | |
12710 | - IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY) | |
12711 | + IRQ_IS_POLLED | IRQ_DISABLE_UNLAZY | IRQ_NO_SOFTIRQ_CALL) | |
12712 | ||
12713 | #define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) | |
12714 | ||
12715 | diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h | |
12716 | index 9270d73ea682..1e66fac6f1d2 100644 | |
12717 | --- a/include/linux/irq_work.h | |
12718 | +++ b/include/linux/irq_work.h | |
12719 | @@ -17,6 +17,7 @@ | |
12720 | #define IRQ_WORK_BUSY 2UL | |
12721 | #define IRQ_WORK_FLAGS 3UL | |
12722 | #define IRQ_WORK_LAZY 4UL /* Doesn't want IPI, wait for tick */ | |
12723 | +#define IRQ_WORK_HARD_IRQ 8UL /* Run hard IRQ context, even on RT */ | |
12724 | ||
12725 | struct irq_work { | |
12726 | unsigned long flags; | |
12727 | @@ -52,4 +53,10 @@ static inline bool irq_work_needs_cpu(void) { return false; } | |
12728 | static inline void irq_work_run(void) { } | |
12729 | #endif | |
12730 | ||
12731 | +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) | |
12732 | +void irq_work_tick_soft(void); | |
12733 | +#else | |
12734 | +static inline void irq_work_tick_soft(void) { } | |
12735 | +#endif | |
12736 | + | |
12737 | #endif /* _LINUX_IRQ_WORK_H */ | |
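A sketch of how the new IRQ_WORK_HARD_IRQ flag is meant to be consumed (illustrative names; setting ->flags before the first queueing is an assumption that mirrors how the RT tree uses the flag elsewhere):

    #include <linux/irq_work.h>

    static void my_hard_cb(struct irq_work *w)
    {
            /* with IRQ_WORK_HARD_IRQ set, this runs from the hardirq
             * path even on RT instead of being deferred to the
             * softirq tick */
    }

    static struct irq_work my_work;

    static void my_setup(void)
    {
            init_irq_work(&my_work, my_hard_cb);
            my_work.flags = IRQ_WORK_HARD_IRQ;
            irq_work_queue(&my_work);
    }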
12738 | diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h | |
12739 | index bacb499c512c..688f2565294c 100644 | |
12740 | --- a/include/linux/irqchip/arm-gic-v3.h | |
12741 | +++ b/include/linux/irqchip/arm-gic-v3.h | |
12742 | @@ -568,6 +568,7 @@ struct rdists { | |
12743 | void __iomem *rd_base; | |
12744 | struct page *pend_page; | |
12745 | phys_addr_t phys_base; | |
12746 | + bool lpi_enabled; | |
12747 | } __percpu *rdist; | |
12748 | struct page *prop_page; | |
12749 | int id_bits; | |
12750 | diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h | |
12751 | index b6084898d330..d334476cdca6 100644 | |
12752 | --- a/include/linux/irqdesc.h | |
12753 | +++ b/include/linux/irqdesc.h | |
12754 | @@ -70,6 +70,7 @@ struct irq_desc { | |
e4b2b4a8 JK |
12755 | unsigned int irqs_unhandled; |
12756 | atomic_t threads_handled; | |
12757 | int threads_handled_last; | |
12758 | + u64 random_ip; | |
12759 | raw_spinlock_t lock; | |
12760 | struct cpumask *percpu_enabled; | |
12761 | const struct cpumask *percpu_affinity; | |
b3bbd485 JK |
12762 | diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h |
12763 | index 46cb57d5eb13..2e023bfe45af 100644 | |
12764 | --- a/include/linux/irqflags.h | |
12765 | +++ b/include/linux/irqflags.h | |
12766 | @@ -34,16 +34,6 @@ do { \ | |
e4b2b4a8 JK |
12767 | current->hardirq_context--; \ |
12768 | crossrelease_hist_end(XHLOCK_HARD); \ | |
12769 | } while (0) | |
12770 | -# define lockdep_softirq_enter() \ | |
12771 | -do { \ | |
12772 | - current->softirq_context++; \ | |
12773 | - crossrelease_hist_start(XHLOCK_SOFT); \ | |
12774 | -} while (0) | |
12775 | -# define lockdep_softirq_exit() \ | |
12776 | -do { \ | |
12777 | - current->softirq_context--; \ | |
12778 | - crossrelease_hist_end(XHLOCK_SOFT); \ | |
12779 | -} while (0) | |
12780 | # define INIT_TRACE_IRQFLAGS .softirqs_enabled = 1, | |
12781 | #else | |
12782 | # define trace_hardirqs_on() do { } while (0) | |
b3bbd485 | 12783 | @@ -56,9 +46,23 @@ do { \ |
e4b2b4a8 JK |
12784 | # define trace_softirqs_enabled(p) 0 |
12785 | # define trace_hardirq_enter() do { } while (0) | |
12786 | # define trace_hardirq_exit() do { } while (0) | |
12787 | +# define INIT_TRACE_IRQFLAGS | |
12788 | +#endif | |
12789 | + | |
12790 | +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT_FULL) | |
12791 | +# define lockdep_softirq_enter() \ | |
12792 | +do { \ | |
12793 | + current->softirq_context++; \ | |
12794 | + crossrelease_hist_start(XHLOCK_SOFT); \ | |
12795 | +} while (0) | |
12796 | +# define lockdep_softirq_exit() \ | |
12797 | +do { \ | |
12798 | + current->softirq_context--; \ | |
12799 | + crossrelease_hist_end(XHLOCK_SOFT); \ | |
12800 | +} while (0) | |
12801 | +#else | |
12802 | # define lockdep_softirq_enter() do { } while (0) | |
12803 | # define lockdep_softirq_exit() do { } while (0) | |
12804 | -# define INIT_TRACE_IRQFLAGS | |
12805 | #endif | |
12806 | ||
12807 | #if defined(CONFIG_IRQSOFF_TRACER) || \ | |
b3bbd485 | 12808 | @@ -165,4 +169,23 @@ do { \ |
e4b2b4a8 JK |
12809 | |
12810 | #define irqs_disabled_flags(flags) raw_irqs_disabled_flags(flags) | |
12811 | ||
12812 | +/* | |
12813 | + * local_irq* variants depending on RT/!RT | |
12814 | + */ | |
12815 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
12816 | +# define local_irq_disable_nort() do { } while (0) | |
12817 | +# define local_irq_enable_nort() do { } while (0) | |
12818 | +# define local_irq_save_nort(flags) local_save_flags(flags) | |
12819 | +# define local_irq_restore_nort(flags) (void)(flags) | |
12820 | +# define local_irq_disable_rt() local_irq_disable() | |
12821 | +# define local_irq_enable_rt() local_irq_enable() | |
12822 | +#else | |
12823 | +# define local_irq_disable_nort() local_irq_disable() | |
12824 | +# define local_irq_enable_nort() local_irq_enable() | |
12825 | +# define local_irq_save_nort(flags) local_irq_save(flags) | |
12826 | +# define local_irq_restore_nort(flags) local_irq_restore(flags) | |
12827 | +# define local_irq_disable_rt() do { } while (0) | |
12828 | +# define local_irq_enable_rt() do { } while (0) | |
12829 | +#endif | |
12830 | + | |
12831 | #endif | |
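A sketch of the intended use of the _nort variants (my_update_hw is illustrative): a section that needs hard interrupts off on !RT, but on RT is already serialized by a sleeping lock, so actually disabling interrupts would be unnecessary and harmful to latency:

    #include <linux/irqflags.h>

    static void my_update_hw(void)
    {
            unsigned long flags;

            local_irq_save_nort(flags);     /* real local_irq_save() on !RT,
                                             * only a flags snapshot on RT */
            /* ... short, non-sleeping hardware access ... */
            local_irq_restore_nort(flags);
    }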
b3bbd485 JK |
12832 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h |
12833 | index 29290bfb94a8..32379bfab9f0 100644 | |
12834 | --- a/include/linux/jbd2.h | |
12835 | +++ b/include/linux/jbd2.h | |
12836 | @@ -347,32 +347,56 @@ static inline struct journal_head *bh2jh(struct buffer_head *bh) | |
e4b2b4a8 JK |
12837 | |
12838 | static inline void jbd_lock_bh_state(struct buffer_head *bh) | |
12839 | { | |
12840 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12841 | bit_spin_lock(BH_State, &bh->b_state); | |
12842 | +#else | |
12843 | + spin_lock(&bh->b_state_lock); | |
12844 | +#endif | |
12845 | } | |
12846 | ||
12847 | static inline int jbd_trylock_bh_state(struct buffer_head *bh) | |
12848 | { | |
12849 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12850 | return bit_spin_trylock(BH_State, &bh->b_state); | |
12851 | +#else | |
12852 | + return spin_trylock(&bh->b_state_lock); | |
12853 | +#endif | |
12854 | } | |
12855 | ||
12856 | static inline int jbd_is_locked_bh_state(struct buffer_head *bh) | |
12857 | { | |
12858 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12859 | return bit_spin_is_locked(BH_State, &bh->b_state); | |
12860 | +#else | |
12861 | + return spin_is_locked(&bh->b_state_lock); | |
12862 | +#endif | |
12863 | } | |
12864 | ||
12865 | static inline void jbd_unlock_bh_state(struct buffer_head *bh) | |
12866 | { | |
12867 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12868 | bit_spin_unlock(BH_State, &bh->b_state); | |
12869 | +#else | |
12870 | + spin_unlock(&bh->b_state_lock); | |
12871 | +#endif | |
12872 | } | |
12873 | ||
12874 | static inline void jbd_lock_bh_journal_head(struct buffer_head *bh) | |
12875 | { | |
12876 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12877 | bit_spin_lock(BH_JournalHead, &bh->b_state); | |
12878 | +#else | |
12879 | + spin_lock(&bh->b_journal_head_lock); | |
12880 | +#endif | |
12881 | } | |
12882 | ||
12883 | static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) | |
12884 | { | |
12885 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12886 | bit_spin_unlock(BH_JournalHead, &bh->b_state); | |
12887 | +#else | |
12888 | + spin_unlock(&bh->b_journal_head_lock); | |
12889 | +#endif | |
12890 | } | |
12891 | ||
12892 | #define J_ASSERT(assert) BUG_ON(!(assert)) | |
b3bbd485 JK |
12893 | diff --git a/include/linux/kdb.h b/include/linux/kdb.h |
12894 | index 68bd88223417..e033b25b0b72 100644 | |
12895 | --- a/include/linux/kdb.h | |
12896 | +++ b/include/linux/kdb.h | |
12897 | @@ -167,6 +167,7 @@ extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, | |
e4b2b4a8 JK |
12898 | extern __printf(1, 2) int kdb_printf(const char *, ...); |
12899 | typedef __printf(1, 2) int (*kdb_printf_t)(const char *, ...); | |
12900 | ||
12901 | +#define in_kdb_printk() (kdb_trap_printk) | |
12902 | extern void kdb_init(int level); | |
12903 | ||
12904 | /* Access to kdb specific polling devices */ | |
b3bbd485 | 12905 | @@ -201,6 +202,7 @@ extern int kdb_register_flags(char *, kdb_func_t, char *, char *, |
e4b2b4a8 JK |
12906 | extern int kdb_unregister(char *); |
12907 | #else /* ! CONFIG_KGDB_KDB */ | |
12908 | static inline __printf(1, 2) int kdb_printf(const char *fmt, ...) { return 0; } | |
12909 | +#define in_kdb_printk() (0) | |
12910 | static inline void kdb_init(int level) {} | |
12911 | static inline int kdb_register(char *cmd, kdb_func_t func, char *usage, | |
12912 | char *help, short minlen) { return 0; } | |
b3bbd485 JK |
12913 | diff --git a/include/linux/kernel.h b/include/linux/kernel.h |
12914 | index 4b484ab9e163..74feebf9d82c 100644 | |
12915 | --- a/include/linux/kernel.h | |
12916 | +++ b/include/linux/kernel.h | |
12917 | @@ -225,6 +225,9 @@ extern int _cond_resched(void); | |
e4b2b4a8 JK |
12918 | */ |
12919 | # define might_sleep() \ | |
12920 | do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) | |
12921 | + | |
12922 | +# define might_sleep_no_state_check() \ | |
12923 | + do { ___might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) | |
12924 | # define sched_annotate_sleep() (current->task_state_change = 0) | |
12925 | #else | |
12926 | static inline void ___might_sleep(const char *file, int line, | |
b3bbd485 | 12927 | @@ -232,6 +235,7 @@ extern int _cond_resched(void); |
e4b2b4a8 JK |
12928 | static inline void __might_sleep(const char *file, int line, |
12929 | int preempt_offset) { } | |
12930 | # define might_sleep() do { might_resched(); } while (0) | |
12931 | +# define might_sleep_no_state_check() do { might_resched(); } while (0) | |
12932 | # define sched_annotate_sleep() do { } while (0) | |
12933 | #endif | |
12934 | ||
b3bbd485 | 12935 | @@ -531,6 +535,7 @@ extern enum system_states { |
e4b2b4a8 JK |
12936 | SYSTEM_HALT, |
12937 | SYSTEM_POWER_OFF, | |
12938 | SYSTEM_RESTART, | |
12939 | + SYSTEM_SUSPEND, | |
12940 | } system_state; | |
12941 | ||
12942 | #define TAINT_PROPRIETARY_MODULE 0 | |
b3bbd485 JK |
12943 | diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h |
12944 | index 3fc2cc57ba1b..0b5de7d9ffcf 100644 | |
12945 | --- a/include/linux/list_bl.h | |
12946 | +++ b/include/linux/list_bl.h | |
e4b2b4a8 JK |
12947 | @@ -3,6 +3,7 @@ |
12948 | #define _LINUX_LIST_BL_H | |
12949 | ||
12950 | #include <linux/list.h> | |
12951 | +#include <linux/spinlock.h> | |
12952 | #include <linux/bit_spinlock.h> | |
12953 | ||
12954 | /* | |
12955 | @@ -33,13 +34,24 @@ | |
12956 | ||
12957 | struct hlist_bl_head { | |
12958 | struct hlist_bl_node *first; | |
12959 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
12960 | + raw_spinlock_t lock; | |
12961 | +#endif | |
12962 | }; | |
12963 | ||
12964 | struct hlist_bl_node { | |
12965 | struct hlist_bl_node *next, **pprev; | |
12966 | }; | |
12967 | -#define INIT_HLIST_BL_HEAD(ptr) \ | |
12968 | - ((ptr)->first = NULL) | |
12969 | + | |
12970 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
12971 | +#define INIT_HLIST_BL_HEAD(h) \ | |
12972 | +do { \ | |
12973 | + (h)->first = NULL; \ | |
12974 | + raw_spin_lock_init(&(h)->lock); \ | |
12975 | +} while (0) | |
12976 | +#else | |
12977 | +#define INIT_HLIST_BL_HEAD(h) (h)->first = NULL | |
12978 | +#endif | |
12979 | ||
12980 | static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) | |
12981 | { | |
b3bbd485 | 12982 | @@ -119,12 +131,26 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) |
e4b2b4a8 JK |
12983 | |
12984 | static inline void hlist_bl_lock(struct hlist_bl_head *b) | |
12985 | { | |
12986 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12987 | bit_spin_lock(0, (unsigned long *)b); | |
12988 | +#else | |
12989 | + raw_spin_lock(&b->lock); | |
12990 | +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | |
12991 | + __set_bit(0, (unsigned long *)b); | |
12992 | +#endif | |
12993 | +#endif | |
12994 | } | |
12995 | ||
12996 | static inline void hlist_bl_unlock(struct hlist_bl_head *b) | |
12997 | { | |
12998 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
12999 | __bit_spin_unlock(0, (unsigned long *)b); | |
13000 | +#else | |
13001 | +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) | |
13002 | + __clear_bit(0, (unsigned long *)b); | |
13003 | +#endif | |
13004 | + raw_spin_unlock(&b->lock); | |
13005 | +#endif | |
13006 | } | |
13007 | ||
13008 | static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) | |
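Callers of the bit-locked list API are unaffected by the RT substitution; a sketch with illustrative names:

    #include <linux/list_bl.h>

    static struct hlist_bl_head my_head;

    static void my_setup(void)
    {
            INIT_HLIST_BL_HEAD(&my_head);   /* on RT also inits ->lock */
    }

    static void my_add(struct hlist_bl_node *n)
    {
            hlist_bl_lock(&my_head);        /* bit 0 of ->first on !RT,
                                             * the raw spinlock on RT */
            hlist_bl_add_head(n, &my_head);
            hlist_bl_unlock(&my_head);
    }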
b3bbd485 JK |
13009 | diff --git a/include/linux/locallock.h b/include/linux/locallock.h |
13010 | new file mode 100644 | |
13011 | index 000000000000..921eab83cd34 | |
13012 | --- /dev/null | |
13013 | +++ b/include/linux/locallock.h | |
13014 | @@ -0,0 +1,281 @@ | |
e4b2b4a8 JK |
13015 | +#ifndef _LINUX_LOCALLOCK_H |
13016 | +#define _LINUX_LOCALLOCK_H | |
13017 | + | |
13018 | +#include <linux/percpu.h> | |
13019 | +#include <linux/spinlock.h> | |
13020 | + | |
13021 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
13022 | + | |
13023 | +#ifdef CONFIG_DEBUG_SPINLOCK | |
13024 | +# define LL_WARN(cond) WARN_ON(cond) | |
13025 | +#else | |
13026 | +# define LL_WARN(cond) do { } while (0) | |
13027 | +#endif | |
13028 | + | |
13029 | +/* | |
13030 | + * per cpu lock based substitute for local_irq_*() | |
13031 | + */ | |
13032 | +struct local_irq_lock { | |
13033 | + spinlock_t lock; | |
13034 | + struct task_struct *owner; | |
13035 | + int nestcnt; | |
13036 | + unsigned long flags; | |
13037 | +}; | |
13038 | + | |
13039 | +#define DEFINE_LOCAL_IRQ_LOCK(lvar) \ | |
13040 | + DEFINE_PER_CPU(struct local_irq_lock, lvar) = { \ | |
13041 | + .lock = __SPIN_LOCK_UNLOCKED((lvar).lock) } | |
13042 | + | |
13043 | +#define DECLARE_LOCAL_IRQ_LOCK(lvar) \ | |
13044 | + DECLARE_PER_CPU(struct local_irq_lock, lvar) | |
13045 | + | |
13046 | +#define local_irq_lock_init(lvar) \ | |
13047 | + do { \ | |
13048 | + int __cpu; \ | |
13049 | + for_each_possible_cpu(__cpu) \ | |
13050 | + spin_lock_init(&per_cpu(lvar, __cpu).lock); \ | |
13051 | + } while (0) | |
13052 | + | |
13053 | +static inline void __local_lock(struct local_irq_lock *lv) | |
13054 | +{ | |
13055 | + if (lv->owner != current) { | |
13056 | + spin_lock(&lv->lock); | |
13057 | + LL_WARN(lv->owner); | |
13058 | + LL_WARN(lv->nestcnt); | |
13059 | + lv->owner = current; | |
13060 | + } | |
13061 | + lv->nestcnt++; | |
13062 | +} | |
13063 | + | |
13064 | +#define local_lock(lvar) \ | |
13065 | + do { __local_lock(&get_local_var(lvar)); } while (0) | |
13066 | + | |
13067 | +#define local_lock_on(lvar, cpu) \ | |
13068 | + do { __local_lock(&per_cpu(lvar, cpu)); } while (0) | |
13069 | + | |
13070 | +static inline int __local_trylock(struct local_irq_lock *lv) | |
13071 | +{ | |
13072 | + if (lv->owner != current && spin_trylock(&lv->lock)) { | |
13073 | + LL_WARN(lv->owner); | |
13074 | + LL_WARN(lv->nestcnt); | |
13075 | + lv->owner = current; | |
13076 | + lv->nestcnt = 1; | |
13077 | + return 1; | |
13078 | + } else if (lv->owner == current) { | |
13079 | + lv->nestcnt++; | |
13080 | + return 1; | |
13081 | + } | |
13082 | + return 0; | |
13083 | +} | |
13084 | + | |
13085 | +#define local_trylock(lvar) \ | |
13086 | + ({ \ | |
13087 | + int __locked; \ | |
13088 | + __locked = __local_trylock(&get_local_var(lvar)); \ | |
13089 | + if (!__locked) \ | |
13090 | + put_local_var(lvar); \ | |
13091 | + __locked; \ | |
13092 | + }) | |
13093 | + | |
13094 | +static inline void __local_unlock(struct local_irq_lock *lv) | |
13095 | +{ | |
13096 | + LL_WARN(lv->nestcnt == 0); | |
13097 | + LL_WARN(lv->owner != current); | |
13098 | + if (--lv->nestcnt) | |
13099 | + return; | |
13100 | + | |
13101 | + lv->owner = NULL; | |
13102 | + spin_unlock(&lv->lock); | |
13103 | +} | |
13104 | + | |
13105 | +#define local_unlock(lvar) \ | |
13106 | + do { \ | |
13107 | + __local_unlock(this_cpu_ptr(&lvar)); \ | |
13108 | + put_local_var(lvar); \ | |
13109 | + } while (0) | |
13110 | + | |
13111 | +#define local_unlock_on(lvar, cpu) \ | |
13112 | + do { __local_unlock(&per_cpu(lvar, cpu)); } while (0) | |
13113 | + | |
13114 | +static inline void __local_lock_irq(struct local_irq_lock *lv) | |
13115 | +{ | |
13116 | + spin_lock_irqsave(&lv->lock, lv->flags); | |
13117 | + LL_WARN(lv->owner); | |
13118 | + LL_WARN(lv->nestcnt); | |
13119 | + lv->owner = current; | |
13120 | + lv->nestcnt = 1; | |
13121 | +} | |
13122 | + | |
13123 | +#define local_lock_irq(lvar) \ | |
13124 | + do { __local_lock_irq(&get_local_var(lvar)); } while (0) | |
13125 | + | |
13126 | +#define local_lock_irq_on(lvar, cpu) \ | |
13127 | + do { __local_lock_irq(&per_cpu(lvar, cpu)); } while (0) | |
13128 | + | |
13129 | +static inline void __local_unlock_irq(struct local_irq_lock *lv) | |
13130 | +{ | |
13131 | + LL_WARN(!lv->nestcnt); | |
13132 | + LL_WARN(lv->owner != current); | |
13133 | + lv->owner = NULL; | |
13134 | + lv->nestcnt = 0; | |
13135 | + spin_unlock_irq(&lv->lock); | |
13136 | +} | |
13137 | + | |
13138 | +#define local_unlock_irq(lvar) \ | |
13139 | + do { \ | |
13140 | + __local_unlock_irq(this_cpu_ptr(&lvar)); \ | |
13141 | + put_local_var(lvar); \ | |
13142 | + } while (0) | |
13143 | + | |
13144 | +#define local_unlock_irq_on(lvar, cpu) \ | |
13145 | + do { \ | |
13146 | + __local_unlock_irq(&per_cpu(lvar, cpu)); \ | |
13147 | + } while (0) | |
13148 | + | |
13149 | +static inline int __local_lock_irqsave(struct local_irq_lock *lv) | |
13150 | +{ | |
13151 | + if (lv->owner != current) { | |
13152 | + __local_lock_irq(lv); | |
13153 | + return 0; | |
13154 | + } else { | |
13155 | + lv->nestcnt++; | |
13156 | + return 1; | |
13157 | + } | |
13158 | +} | |
13159 | + | |
13160 | +#define local_lock_irqsave(lvar, _flags) \ | |
13161 | + do { \ | |
13162 | + if (__local_lock_irqsave(&get_local_var(lvar))) \ | |
13163 | + put_local_var(lvar); \ | |
13164 | + _flags = __this_cpu_read(lvar.flags); \ | |
13165 | + } while (0) | |
13166 | + | |
13167 | +#define local_lock_irqsave_on(lvar, _flags, cpu) \ | |
13168 | + do { \ | |
13169 | + __local_lock_irqsave(&per_cpu(lvar, cpu)); \ | |
13170 | + _flags = per_cpu(lvar, cpu).flags; \ | |
13171 | + } while (0) | |
13172 | + | |
13173 | +static inline int __local_unlock_irqrestore(struct local_irq_lock *lv, | |
13174 | + unsigned long flags) | |
13175 | +{ | |
13176 | + LL_WARN(!lv->nestcnt); | |
13177 | + LL_WARN(lv->owner != current); | |
13178 | + if (--lv->nestcnt) | |
13179 | + return 0; | |
13180 | + | |
13181 | + lv->owner = NULL; | |
13182 | + spin_unlock_irqrestore(&lv->lock, lv->flags); | |
13183 | + return 1; | |
13184 | +} | |
13185 | + | |
13186 | +#define local_unlock_irqrestore(lvar, flags) \ | |
13187 | + do { \ | |
13188 | + if (__local_unlock_irqrestore(this_cpu_ptr(&lvar), flags)) \ | |
13189 | + put_local_var(lvar); \ | |
13190 | + } while (0) | |
13191 | + | |
13192 | +#define local_unlock_irqrestore_on(lvar, flags, cpu) \ | |
13193 | + do { \ | |
13194 | + __local_unlock_irqrestore(&per_cpu(lvar, cpu), flags); \ | |
13195 | + } while (0) | |
13196 | + | |
13197 | +#define local_spin_trylock_irq(lvar, lock) \ | |
13198 | + ({ \ | |
13199 | + int __locked; \ | |
13200 | + local_lock_irq(lvar); \ | |
13201 | + __locked = spin_trylock(lock); \ | |
13202 | + if (!__locked) \ | |
13203 | + local_unlock_irq(lvar); \ | |
13204 | + __locked; \ | |
13205 | + }) | |
13206 | + | |
13207 | +#define local_spin_lock_irq(lvar, lock) \ | |
13208 | + do { \ | |
13209 | + local_lock_irq(lvar); \ | |
13210 | + spin_lock(lock); \ | |
13211 | + } while (0) | |
13212 | + | |
13213 | +#define local_spin_unlock_irq(lvar, lock) \ | |
13214 | + do { \ | |
13215 | + spin_unlock(lock); \ | |
13216 | + local_unlock_irq(lvar); \ | |
13217 | + } while (0) | |
13218 | + | |
13219 | +#define local_spin_lock_irqsave(lvar, lock, flags) \ | |
13220 | + do { \ | |
13221 | + local_lock_irqsave(lvar, flags); \ | |
13222 | + spin_lock(lock); \ | |
13223 | + } while (0) | |
13224 | + | |
13225 | +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ | |
13226 | + do { \ | |
13227 | + spin_unlock(lock); \ | |
13228 | + local_unlock_irqrestore(lvar, flags); \ | |
13229 | + } while (0) | |
13230 | + | |
13231 | +#define get_locked_var(lvar, var) \ | |
13232 | + (*({ \ | |
13233 | + local_lock(lvar); \ | |
13234 | + this_cpu_ptr(&var); \ | |
13235 | + })) | |
13236 | + | |
13237 | +#define put_locked_var(lvar, var) local_unlock(lvar) |
13238 | + | |
b3bbd485 JK |
13239 | +#define get_locked_ptr(lvar, var) \ |
13240 | + ({ \ | |
13241 | + local_lock(lvar); \ | |
13242 | + this_cpu_ptr(var); \ | |
13243 | + }) | |
13244 | + | |
13245 | +#define put_locked_ptr(lvar, var) local_unlock(lvar) |
13246 | + | |
e4b2b4a8 JK |
13247 | +#define local_lock_cpu(lvar) \ |
13248 | + ({ \ | |
13249 | + local_lock(lvar); \ | |
13250 | + smp_processor_id(); \ | |
13251 | + }) | |
13252 | + | |
13253 | +#define local_unlock_cpu(lvar) local_unlock(lvar) | |
13254 | + | |
13255 | +#else /* PREEMPT_RT_BASE */ | |
13256 | + | |
13257 | +#define DEFINE_LOCAL_IRQ_LOCK(lvar) __typeof__(const int) lvar | |
13258 | +#define DECLARE_LOCAL_IRQ_LOCK(lvar) extern __typeof__(const int) lvar | |
13259 | + | |
13260 | +static inline void local_irq_lock_init(int lvar) { } | |
13261 | + | |
13262 | +#define local_trylock(lvar) \ | |
13263 | + ({ \ | |
13264 | + preempt_disable(); \ | |
13265 | + 1; \ | |
13266 | + }) | |
13267 | + | |
13268 | +#define local_lock(lvar) preempt_disable() | |
13269 | +#define local_unlock(lvar) preempt_enable() | |
13270 | +#define local_lock_irq(lvar) local_irq_disable() | |
13271 | +#define local_lock_irq_on(lvar, cpu) local_irq_disable() | |
13272 | +#define local_unlock_irq(lvar) local_irq_enable() | |
13273 | +#define local_unlock_irq_on(lvar, cpu) local_irq_enable() | |
13274 | +#define local_lock_irqsave(lvar, flags) local_irq_save(flags) | |
13275 | +#define local_unlock_irqrestore(lvar, flags) local_irq_restore(flags) | |
13276 | + | |
13277 | +#define local_spin_trylock_irq(lvar, lock) spin_trylock_irq(lock) | |
13278 | +#define local_spin_lock_irq(lvar, lock) spin_lock_irq(lock) | |
13279 | +#define local_spin_unlock_irq(lvar, lock) spin_unlock_irq(lock) | |
13280 | +#define local_spin_lock_irqsave(lvar, lock, flags) \ | |
13281 | + spin_lock_irqsave(lock, flags) | |
13282 | +#define local_spin_unlock_irqrestore(lvar, lock, flags) \ | |
13283 | + spin_unlock_irqrestore(lock, flags) | |
13284 | + | |
13285 | +#define get_locked_var(lvar, var) get_cpu_var(var) | |
13286 | +#define put_locked_var(lvar, var) put_cpu_var(var) | |
b3bbd485 JK |
13287 | +#define get_locked_ptr(lvar, var) get_cpu_ptr(var) |
13288 | +#define put_locked_ptr(lvar, var) put_cpu_ptr(var) | |
e4b2b4a8 JK |
13289 | + |
13290 | +#define local_lock_cpu(lvar) get_cpu() | |
13291 | +#define local_unlock_cpu(lvar) put_cpu() | |
13292 | + | |
13293 | +#endif | |
13294 | + | |
13295 | +#endif | |
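A usage sketch of the new API (illustrative names): per-CPU data that used to hide behind local_irq_save() keeps working unchanged on !RT, while on RT the same section becomes preemptible and is serialized by the per-CPU spinlock instead:

    #include <linux/locallock.h>
    #include <linux/list.h>
    #include <linux/percpu.h>

    static DEFINE_LOCAL_IRQ_LOCK(my_lock);
    static DEFINE_PER_CPU(struct list_head, my_list);

    static void my_queue(struct list_head *item)
    {
            unsigned long flags;

            local_lock_irqsave(my_lock, flags);     /* local_irq_save() on !RT */
            list_add(item, this_cpu_ptr(&my_list)); /* per-CPU lists assumed
                                                     * initialized elsewhere */
            local_unlock_irqrestore(my_lock, flags);
    }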
b3bbd485 JK |
13296 | diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h |
13297 | index e41ef532c4ce..63317710311e 100644 | |
13298 | --- a/include/linux/mm_types.h | |
13299 | +++ b/include/linux/mm_types.h | |
e4b2b4a8 JK |
13300 | @@ -12,6 +12,7 @@ |
13301 | #include <linux/completion.h> | |
13302 | #include <linux/cpumask.h> | |
13303 | #include <linux/uprobes.h> | |
13304 | +#include <linux/rcupdate.h> | |
13305 | #include <linux/page-flags-layout.h> | |
13306 | #include <linux/workqueue.h> | |
13307 | ||
b3bbd485 | 13308 | @@ -496,6 +497,9 @@ struct mm_struct { |
e4b2b4a8 JK |
13309 | bool tlb_flush_batched; |
13310 | #endif | |
13311 | struct uprobes_state uprobes_state; | |
13312 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
13313 | + struct rcu_head delayed_drop; | |
13314 | +#endif | |
13315 | #ifdef CONFIG_HUGETLB_PAGE | |
13316 | atomic_long_t hugetlb_usage; | |
13317 | #endif | |
b3bbd485 JK |
13318 | diff --git a/include/linux/mutex.h b/include/linux/mutex.h |
13319 | index 153274f78402..dbb52857b25b 100644 | |
13320 | --- a/include/linux/mutex.h | |
13321 | +++ b/include/linux/mutex.h | |
e4b2b4a8 | 13322 | @@ -23,6 +23,17 @@ |
1a6e0f06 | 13323 | |
e4b2b4a8 | 13324 | struct ww_acquire_ctx; |
1a6e0f06 | 13325 | |
e4b2b4a8 JK |
13326 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
13327 | +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ | |
13328 | + , .dep_map = { .name = #lockname } | |
13329 | +#else | |
13330 | +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) | |
13331 | +#endif | |
13332 | + | |
13333 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
13334 | +# include <linux/mutex_rt.h> | |
13335 | +#else | |
13336 | + | |
13337 | /* | |
13338 | * Simple, straightforward mutexes with strict semantics: | |
13339 | * | |
b3bbd485 | 13340 | @@ -114,13 +125,6 @@ do { \ |
e4b2b4a8 JK |
13341 | __mutex_init((mutex), #mutex, &__key); \ |
13342 | } while (0) | |
1a6e0f06 | 13343 | |
e4b2b4a8 JK |
13344 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC |
13345 | -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ | |
13346 | - , .dep_map = { .name = #lockname } | |
13347 | -#else | |
13348 | -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) | |
13349 | -#endif | |
13350 | - | |
13351 | #define __MUTEX_INITIALIZER(lockname) \ | |
13352 | { .owner = ATOMIC_LONG_INIT(0) \ | |
13353 | , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ | |
b3bbd485 | 13354 | @@ -228,4 +232,6 @@ mutex_trylock_recursive(struct mutex *lock) |
e4b2b4a8 JK |
13355 | return mutex_trylock(lock); |
13356 | } | |
1a6e0f06 | 13357 | |
e4b2b4a8 JK |
13358 | +#endif /* !PREEMPT_RT_FULL */ |
13359 | + | |
13360 | #endif /* __LINUX_MUTEX_H */ | |
b3bbd485 JK |
13361 | diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h |
13362 | new file mode 100644 | |
13363 | index 000000000000..3fcb5edb1d2b | |
13364 | --- /dev/null | |
13365 | +++ b/include/linux/mutex_rt.h | |
e4b2b4a8 JK |
13366 | @@ -0,0 +1,130 @@ |
13367 | +#ifndef __LINUX_MUTEX_RT_H | |
13368 | +#define __LINUX_MUTEX_RT_H | |
13369 | + | |
13370 | +#ifndef __LINUX_MUTEX_H | |
13371 | +#error "Please include mutex.h" | |
13372 | +#endif | |
13373 | + | |
13374 | +#include <linux/rtmutex.h> | |
13375 | + | |
13376 | +/* FIXME: Just for __lockfunc */ | |
13377 | +#include <linux/spinlock.h> | |
13378 | + | |
13379 | +struct mutex { | |
13380 | + struct rt_mutex lock; | |
13381 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
13382 | + struct lockdep_map dep_map; | |
13383 | +#endif | |
13384 | +}; | |
13385 | + | |
13386 | +#define __MUTEX_INITIALIZER(mutexname) \ | |
13387 | + { \ | |
13388 | + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ | |
13389 | + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ | |
13390 | + } | |
13391 | + | |
13392 | +#define DEFINE_MUTEX(mutexname) \ | |
13393 | + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) | |
13394 | + | |
13395 | +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); | |
13396 | +extern void __lockfunc _mutex_lock(struct mutex *lock); | |
13397 | +extern void __lockfunc _mutex_lock_io(struct mutex *lock); | |
13398 | +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass); | |
13399 | +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); | |
13400 | +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); | |
13401 | +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); | |
13402 | +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); | |
13403 | +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); | |
13404 | +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); | |
13405 | +extern int __lockfunc _mutex_trylock(struct mutex *lock); | |
13406 | +extern void __lockfunc _mutex_unlock(struct mutex *lock); | |
13407 | + | |
13408 | +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) | |
13409 | +#define mutex_lock(l) _mutex_lock(l) | |
13410 | +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) | |
13411 | +#define mutex_lock_killable(l) _mutex_lock_killable(l) | |
13412 | +#define mutex_trylock(l) _mutex_trylock(l) | |
13413 | +#define mutex_unlock(l) _mutex_unlock(l) | |
13414 | +#define mutex_lock_io(l) _mutex_lock_io(l) |
13415 | + | |
13416 | +#define __mutex_owner(l) ((l)->lock.owner) | |
13417 | + | |
13418 | +#ifdef CONFIG_DEBUG_MUTEXES | |
13419 | +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) | |
13420 | +#else | |
13421 | +static inline void mutex_destroy(struct mutex *lock) {} | |
13422 | +#endif | |
13423 | + | |
13424 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
13425 | +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) | |
13426 | +# define mutex_lock_interruptible_nested(l, s) \ | |
13427 | + _mutex_lock_interruptible_nested(l, s) | |
13428 | +# define mutex_lock_killable_nested(l, s) \ | |
13429 | + _mutex_lock_killable_nested(l, s) | |
13430 | +# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) | |
13431 | + | |
13432 | +# define mutex_lock_nest_lock(lock, nest_lock) \ | |
13433 | +do { \ | |
13434 | + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ | |
13435 | + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ | |
13436 | +} while (0) | |
13437 | + | |
13438 | +#else | |
13439 | +# define mutex_lock_nested(l, s) _mutex_lock(l) | |
13440 | +# define mutex_lock_interruptible_nested(l, s) \ | |
13441 | + _mutex_lock_interruptible(l) | |
13442 | +# define mutex_lock_killable_nested(l, s) \ | |
13443 | + _mutex_lock_killable(l) | |
13444 | +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) | |
13445 | +# define mutex_lock_io_nested(l, s) _mutex_lock_io(l) | |
13446 | +#endif | |
13447 | + | |
13448 | +# define mutex_init(mutex) \ | |
13449 | +do { \ | |
13450 | + static struct lock_class_key __key; \ | |
13451 | + \ | |
13452 | + rt_mutex_init(&(mutex)->lock); \ | |
13453 | + __mutex_do_init((mutex), #mutex, &__key); \ | |
13454 | +} while (0) | |
13455 | + | |
13456 | +# define __mutex_init(mutex, name, key) \ | |
13457 | +do { \ | |
13458 | + rt_mutex_init(&(mutex)->lock); \ | |
13459 | + __mutex_do_init((mutex), name, key); \ | |
13460 | +} while (0) | |
13461 | + | |
13462 | +/** | |
13463 | + * These values are chosen such that FAIL and SUCCESS match the | |
13464 | + * values of the regular mutex_trylock(). | |
13465 | + */ | |
13466 | +enum mutex_trylock_recursive_enum { | |
13467 | + MUTEX_TRYLOCK_FAILED = 0, | |
13468 | + MUTEX_TRYLOCK_SUCCESS = 1, | |
13469 | + MUTEX_TRYLOCK_RECURSIVE, | |
13470 | +}; | |
13471 | +/** | |
13472 | + * mutex_trylock_recursive - trylock variant that allows recursive locking | |
13473 | + * @lock: mutex to be locked | |
13474 | + * | |
13475 | + * This function should not be used, _ever_. It is purely for hysterical GEM | |
13476 | + * raisins, and once those are gone this will be removed. | |
13477 | + * | |
13478 | + * Returns: | |
13479 | + * MUTEX_TRYLOCK_FAILED - trylock failed, | |
13480 | + * MUTEX_TRYLOCK_SUCCESS - lock acquired, | |
13481 | + * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. | |
13482 | + */ | |
13483 | +int __rt_mutex_owner_current(struct rt_mutex *lock); | |
13484 | + | |
13485 | +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum | |
13486 | +mutex_trylock_recursive(struct mutex *lock) | |
13487 | +{ | |
13488 | + if (unlikely(__rt_mutex_owner_current(&lock->lock))) | |
13489 | + return MUTEX_TRYLOCK_RECURSIVE; | |
13490 | + | |
13491 | + return mutex_trylock(lock); | |
13492 | +} | |
13493 | + | |
13494 | +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); | |
13495 | + | |
13496 | +#endif | |
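Callers see no API change from the RT mutex substitution; the same code compiles against either implementation, with the RT variant adding priority inheritance on contention. Illustrative sketch:

    #include <linux/mutex.h>

    static DEFINE_MUTEX(my_mutex);

    static void my_critical(void)
    {
            mutex_lock(&my_mutex);  /* expands to _mutex_lock() -> rt_mutex
                                     * on RT */
            /* ... sleeping is allowed here ... */
            mutex_unlock(&my_mutex);
    }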
b3bbd485 | 13497 | diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h |
5dd41b01 | 13498 | index a516dbe5869f..3ceccf72757e 100644 |
b3bbd485 JK |
13499 | --- a/include/linux/netdevice.h |
13500 | +++ b/include/linux/netdevice.h | |
13501 | @@ -409,7 +409,19 @@ typedef enum rx_handler_result rx_handler_result_t; | |
e4b2b4a8 | 13502 | typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); |
1a6e0f06 | 13503 | |
e4b2b4a8 JK |
13504 | void __napi_schedule(struct napi_struct *n); |
13505 | + | |
13506 | +/* | |
13507 | + * When PREEMPT_RT_FULL is defined, all device interrupt handlers | |
13508 | + * run as threads, and they can also be preempted (without PREEMPT_RT | |
13509 | + * interrupt threads cannot be preempted). This means that a call to |
13510 | + * __napi_schedule_irqoff() from an interrupt handler can be preempted, |
13511 | + * which can corrupt the napi->poll_list. |
13512 | + */ | |
13513 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
13514 | +#define __napi_schedule_irqoff(n) __napi_schedule(n) | |
13515 | +#else | |
13516 | void __napi_schedule_irqoff(struct napi_struct *n); | |
13517 | +#endif | |
1a6e0f06 | 13518 | |
e4b2b4a8 JK |
13519 | static inline bool napi_disable_pending(struct napi_struct *n) |
13520 | { | |
b3bbd485 | 13521 | @@ -571,7 +583,11 @@ struct netdev_queue { |
e4b2b4a8 JK |
13522 | * write-mostly part |
13523 | */ | |
13524 | spinlock_t _xmit_lock ____cacheline_aligned_in_smp; | |
13525 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
13526 | + struct task_struct *xmit_lock_owner; | |
13527 | +#else | |
13528 | int xmit_lock_owner; | |
13529 | +#endif | |
1a6e0f06 | 13530 | /* |
e4b2b4a8 | 13531 | * Time (in jiffies) of last Tx |
1a6e0f06 | 13532 | */ |
5dd41b01 | 13533 | @@ -2440,14 +2456,53 @@ void netdev_freemem(struct net_device *dev); |
e4b2b4a8 JK |
13534 | void synchronize_net(void); |
13535 | int init_dummy_netdev(struct net_device *dev); | |
1a6e0f06 | 13536 | |
e4b2b4a8 JK |
13537 | -DECLARE_PER_CPU(int, xmit_recursion); |
13538 | #define XMIT_RECURSION_LIMIT 10 | |
13539 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
13540 | +static inline int dev_recursion_level(void) | |
13541 | +{ | |
13542 | + return current->xmit_recursion; | |
13543 | +} | |
13544 | + | |
13545 | +static inline int xmit_rec_read(void) | |
13546 | +{ | |
13547 | + return current->xmit_recursion; | |
13548 | +} | |
13549 | + | |
13550 | +static inline void xmit_rec_inc(void) | |
13551 | +{ | |
13552 | + current->xmit_recursion++; | |
13553 | +} | |
13554 | + | |
13555 | +static inline void xmit_rec_dec(void) | |
13556 | +{ | |
13557 | + current->xmit_recursion--; | |
13558 | +} | |
13559 | + | |
13560 | +#else | |
13561 | + | |
13562 | +DECLARE_PER_CPU(int, xmit_recursion); | |
1a6e0f06 | 13563 | |
e4b2b4a8 JK |
13564 | static inline int dev_recursion_level(void) |
13565 | { | |
13566 | return this_cpu_read(xmit_recursion); | |
13567 | } | |
1a6e0f06 | 13568 | |
e4b2b4a8 JK |
13569 | +static inline int xmit_rec_read(void) |
13570 | +{ | |
13571 | + return __this_cpu_read(xmit_recursion); | |
13572 | +} | |
13573 | + | |
13574 | +static inline void xmit_rec_inc(void) | |
13575 | +{ | |
13576 | + __this_cpu_inc(xmit_recursion); | |
13577 | +} | |
13578 | + | |
13579 | +static inline void xmit_rec_dec(void) | |
13580 | +{ | |
13581 | + __this_cpu_dec(xmit_recursion); | |
13582 | +} | |
13583 | +#endif | |
13584 | + | |
13585 | struct net_device *dev_get_by_index(struct net *net, int ifindex); | |
13586 | struct net_device *__dev_get_by_index(struct net *net, int ifindex); | |
13587 | struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); | |
5dd41b01 | 13588 | @@ -2799,6 +2854,7 @@ struct softnet_data { |
e4b2b4a8 JK |
13589 | unsigned int dropped; |
13590 | struct sk_buff_head input_pkt_queue; | |
13591 | struct napi_struct backlog; | |
13592 | + struct sk_buff_head tofree_queue; | |
1a6e0f06 | 13593 | |
e4b2b4a8 | 13594 | }; |
1a6e0f06 | 13595 | |
5dd41b01 | 13596 | @@ -3522,10 +3578,48 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) |
e4b2b4a8 | 13597 | return (1 << debug_value) - 1; |
1a6e0f06 JK |
13598 | } |
13599 | ||
e4b2b4a8 JK |
13600 | +#ifdef CONFIG_PREEMPT_RT_FULL |
13601 | +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu) | |
13602 | +{ | |
13603 | + txq->xmit_lock_owner = current; | |
13604 | +} | |
13605 | + | |
13606 | +static inline void netdev_queue_clear_owner(struct netdev_queue *txq) | |
13607 | +{ | |
13608 | + txq->xmit_lock_owner = NULL; | |
13609 | +} | |
13610 | + | |
13611 | +static inline bool netdev_queue_has_owner(struct netdev_queue *txq) | |
13612 | +{ | |
13613 | + if (txq->xmit_lock_owner != NULL) | |
13614 | + return true; | |
13615 | + return false; | |
13616 | +} | |
13617 | + | |
13618 | +#else | |
13619 | + | |
13620 | +static inline void netdev_queue_set_owner(struct netdev_queue *txq, int cpu) | |
13621 | +{ | |
13622 | + txq->xmit_lock_owner = cpu; | |
13623 | +} | |
13624 | + | |
13625 | +static inline void netdev_queue_clear_owner(struct netdev_queue *txq) | |
13626 | +{ | |
13627 | + txq->xmit_lock_owner = -1; | |
13628 | +} | |
13629 | + | |
13630 | +static inline bool netdev_queue_has_owner(struct netdev_queue *txq) | |
13631 | +{ | |
13632 | + if (txq->xmit_lock_owner != -1) | |
13633 | + return true; | |
13634 | + return false; | |
13635 | +} | |
13636 | +#endif | |
13637 | + | |
13638 | static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) | |
13639 | { | |
13640 | spin_lock(&txq->_xmit_lock); | |
13641 | - txq->xmit_lock_owner = cpu; | |
13642 | + netdev_queue_set_owner(txq, cpu); | |
13643 | } | |
1a6e0f06 | 13644 | |
e4b2b4a8 | 13645 | static inline bool __netif_tx_acquire(struct netdev_queue *txq) |
5dd41b01 | 13646 | @@ -3542,32 +3636,32 @@ static inline void __netif_tx_release(struct netdev_queue *txq) |
e4b2b4a8 JK |
13647 | static inline void __netif_tx_lock_bh(struct netdev_queue *txq) |
13648 | { | |
13649 | spin_lock_bh(&txq->_xmit_lock); | |
13650 | - txq->xmit_lock_owner = smp_processor_id(); | |
13651 | + netdev_queue_set_owner(txq, smp_processor_id()); | |
13652 | } | |
1a6e0f06 | 13653 | |
e4b2b4a8 JK |
13654 | static inline bool __netif_tx_trylock(struct netdev_queue *txq) |
13655 | { | |
13656 | bool ok = spin_trylock(&txq->_xmit_lock); | |
13657 | if (likely(ok)) | |
13658 | - txq->xmit_lock_owner = smp_processor_id(); | |
13659 | + netdev_queue_set_owner(txq, smp_processor_id()); | |
13660 | return ok; | |
13661 | } | |
1a6e0f06 | 13662 | |
e4b2b4a8 JK |
13663 | static inline void __netif_tx_unlock(struct netdev_queue *txq) |
13664 | { | |
13665 | - txq->xmit_lock_owner = -1; | |
13666 | + netdev_queue_clear_owner(txq); | |
13667 | spin_unlock(&txq->_xmit_lock); | |
13668 | } | |
1a6e0f06 | 13669 | |
e4b2b4a8 JK |
13670 | static inline void __netif_tx_unlock_bh(struct netdev_queue *txq) |
13671 | { | |
13672 | - txq->xmit_lock_owner = -1; | |
13673 | + netdev_queue_clear_owner(txq); | |
13674 | spin_unlock_bh(&txq->_xmit_lock); | |
13675 | } | |
1a6e0f06 | 13676 | |
e4b2b4a8 JK |
13677 | static inline void txq_trans_update(struct netdev_queue *txq) |
13678 | { | |
13679 | - if (txq->xmit_lock_owner != -1) | |
13680 | + if (netdev_queue_has_owner(txq)) | |
13681 | txq->trans_start = jiffies; | |
13682 | } | |
1a6e0f06 | 13683 | |
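A sketch of how the recursion helpers are consumed on the transmit path (modeled on dev_queue_xmit(); my_xmit and the error handling are illustrative):

    #include <linux/netdevice.h>

    static int my_xmit(struct sk_buff *skb, struct net_device *dev)
    {
            if (xmit_rec_read() > XMIT_RECURSION_LIMIT)
                    return -ENETDOWN;       /* recursion: drop, don't loop */

            xmit_rec_inc();                 /* per-task on RT, per-CPU on !RT */
            /* ... hand the skb to the qdisc / driver ... */
            xmit_rec_dec();
            return 0;
    }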
b3bbd485 JK |
13684 | diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h |
13685 | index 54f346a45cd0..79723e76af66 100644 | |
13686 | --- a/include/linux/netfilter/x_tables.h | |
13687 | +++ b/include/linux/netfilter/x_tables.h | |
e4b2b4a8 JK |
13688 | @@ -6,6 +6,7 @@ |
13689 | #include <linux/netdevice.h> | |
13690 | #include <linux/static_key.h> | |
13691 | #include <linux/netfilter.h> | |
13692 | +#include <linux/locallock.h> | |
13693 | #include <uapi/linux/netfilter/x_tables.h> | |
1a6e0f06 | 13694 | |
e4b2b4a8 | 13695 | /* Test a struct->invflags and a boolean for inequality */ |
b3bbd485 | 13696 | @@ -341,6 +342,8 @@ void xt_free_table_info(struct xt_table_info *info); |
e4b2b4a8 JK |
13697 | */ |
13698 | DECLARE_PER_CPU(seqcount_t, xt_recseq); | |
1a6e0f06 | 13699 | |
e4b2b4a8 JK |
13700 | +DECLARE_LOCAL_IRQ_LOCK(xt_write_lock); |
13701 | + | |
13702 | /* xt_tee_enabled - true if x_tables needs to handle reentrancy | |
13703 | * | |
13704 | * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. | |
b3bbd485 | 13705 | @@ -361,6 +364,9 @@ static inline unsigned int xt_write_recseq_begin(void) |
e4b2b4a8 JK |
13706 | { |
13707 | unsigned int addend; | |
1a6e0f06 | 13708 | |
e4b2b4a8 JK |
13709 | + /* RT protection */ |
13710 | + local_lock(xt_write_lock); | |
13711 | + | |
13712 | /* | |
13713 | * Low order bit of sequence is set if we already | |
13714 | * called xt_write_recseq_begin(). | |
b3bbd485 | 13715 | @@ -391,6 +397,7 @@ static inline void xt_write_recseq_end(unsigned int addend) |
e4b2b4a8 JK |
13716 | /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ |
13717 | smp_wmb(); | |
13718 | __this_cpu_add(xt_recseq.sequence, addend); | |
13719 | + local_unlock(xt_write_lock); | |
13720 | } | |
1a6e0f06 | 13721 | |
e4b2b4a8 | 13722 | /* |
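On RT the write-side seqcount alone is unsafe: a writer preempted mid-sequence would leave readers spinning on a sequence number that never settles, so the patch brackets the section with a local lock (per-CPU and sleepable on RT, a plain mapping otherwise). The write-side pattern, restated from the two hunks above:

    unsigned int addend;

    addend = xt_write_recseq_begin();   /* takes xt_write_lock on RT */
    /* ... traverse the ruleset, update per-CPU counters ... */
    xt_write_recseq_end(addend);        /* releases xt_write_lock on RT */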
b3bbd485 JK |
13723 | diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h |
13724 | index f0015f801a78..c38288622819 100644 | |
13725 | --- a/include/linux/nfs_fs.h | |
13726 | +++ b/include/linux/nfs_fs.h | |
13727 | @@ -162,7 +162,11 @@ struct nfs_inode { | |
1a6e0f06 | 13728 | |
e4b2b4a8 JK |
13729 | /* Readers: in-flight sillydelete RPC calls */ |
13730 | /* Writers: rmdir */ | |
13731 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
13732 | + struct semaphore rmdir_sem; | |
13733 | +#else | |
13734 | struct rw_semaphore rmdir_sem; | |
13735 | +#endif | |
13736 | struct mutex commit_mutex; | |
1a6e0f06 | 13737 | |
e4b2b4a8 | 13738 | #if IS_ENABLED(CONFIG_NFS_V4) |
b3bbd485 JK |
13739 | diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h |
13740 | index 6959968dc36a..802e849b57ac 100644 | |
13741 | --- a/include/linux/nfs_xdr.h | |
13742 | +++ b/include/linux/nfs_xdr.h | |
13743 | @@ -1530,7 +1530,7 @@ struct nfs_unlinkdata { | |
e4b2b4a8 JK |
13744 | struct nfs_removeargs args; |
13745 | struct nfs_removeres res; | |
13746 | struct dentry *dentry; | |
13747 | - wait_queue_head_t wq; | |
13748 | + struct swait_queue_head wq; | |
13749 | struct rpc_cred *cred; | |
13750 | struct nfs_fattr dir_attr; | |
13751 | long timeout; | |
b3bbd485 JK |
13752 | diff --git a/include/linux/notifier.h b/include/linux/notifier.h |
13753 | index 6d731110e0db..e758627da14d 100644 | |
13754 | --- a/include/linux/notifier.h | |
13755 | +++ b/include/linux/notifier.h | |
e4b2b4a8 JK |
13756 | @@ -7,7 +7,7 @@ |
13757 | * | |
13758 | * Alan Cox <Alan.Cox@linux.org> | |
13759 | */ | |
13760 | - | |
13761 | + | |
13762 | #ifndef _LINUX_NOTIFIER_H | |
13763 | #define _LINUX_NOTIFIER_H | |
13764 | #include <linux/errno.h> | |
13765 | @@ -43,9 +43,7 @@ | |
13766 | * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. | |
13767 | * As compensation, srcu_notifier_chain_unregister() is rather expensive. | |
13768 | * SRCU notifier chains should be used when the chain will be called very | |
13769 | - * often but notifier_blocks will seldom be removed. Also, SRCU notifier | |
13770 | - * chains are slightly more difficult to use because they require special | |
13771 | - * runtime initialization. | |
13772 | + * often but notifier_blocks will seldom be removed. | |
13773 | */ | |
1a6e0f06 | 13774 | |
e4b2b4a8 | 13775 | struct notifier_block; |
b3bbd485 | 13776 | @@ -91,7 +89,7 @@ struct srcu_notifier_head { |
e4b2b4a8 JK |
13777 | (name)->head = NULL; \ |
13778 | } while (0) | |
1a6e0f06 | 13779 | |
e4b2b4a8 JK |
13780 | -/* srcu_notifier_heads must be initialized and cleaned up dynamically */ |
13781 | +/* srcu_notifier_heads must be cleaned up dynamically */ | |
13782 | extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); | |
13783 | #define srcu_cleanup_notifier_head(name) \ | |
13784 | cleanup_srcu_struct(&(name)->srcu); | |
b3bbd485 | 13785 | @@ -104,7 +102,13 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); |
e4b2b4a8 JK |
13786 | .head = NULL } |
13787 | #define RAW_NOTIFIER_INIT(name) { \ | |
13788 | .head = NULL } | |
13789 | -/* srcu_notifier_heads cannot be initialized statically */ | |
13790 | + | |
13791 | +#define SRCU_NOTIFIER_INIT(name, pcpu) \ | |
13792 | + { \ | |
13793 | + .mutex = __MUTEX_INITIALIZER(name.mutex), \ | |
13794 | + .head = NULL, \ | |
13795 | + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ | |
13796 | + } | |
1a6e0f06 | 13797 | |
e4b2b4a8 JK |
13798 | #define ATOMIC_NOTIFIER_HEAD(name) \ |
13799 | struct atomic_notifier_head name = \ | |
b3bbd485 | 13800 | @@ -116,6 +120,26 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); |
e4b2b4a8 JK |
13801 | struct raw_notifier_head name = \ |
13802 | RAW_NOTIFIER_INIT(name) | |
1a6e0f06 | 13803 | |
e4b2b4a8 JK |
13804 | +#ifdef CONFIG_TREE_SRCU |
13805 | +#define _SRCU_NOTIFIER_HEAD(name, mod) \ | |
13806 | + static DEFINE_PER_CPU(struct srcu_data, \ | |
13807 | + name##_head_srcu_data); \ | |
13808 | + mod struct srcu_notifier_head name = \ | |
13809 | + SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) | |
13810 | + | |
13811 | +#else | |
13812 | +#define _SRCU_NOTIFIER_HEAD(name, mod) \ | |
13813 | + mod struct srcu_notifier_head name = \ | |
13814 | + SRCU_NOTIFIER_INIT(name, name) | |
13815 | + | |
13816 | +#endif | |
13817 | + | |
13818 | +#define SRCU_NOTIFIER_HEAD(name) \ | |
13819 | + _SRCU_NOTIFIER_HEAD(name, ) | |
13820 | + | |
13821 | +#define SRCU_NOTIFIER_HEAD_STATIC(name) \ | |
13822 | + _SRCU_NOTIFIER_HEAD(name, static) | |
13823 | + | |
13824 | #ifdef __KERNEL__ | |
1a6e0f06 | 13825 | |
e4b2b4a8 | 13826 | extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, |
b3bbd485 | 13827 | @@ -185,12 +209,12 @@ static inline int notifier_to_errno(int ret) |
c7c16703 | 13828 | |
e4b2b4a8 JK |
13829 | /* |
13830 | * Declared notifiers so far. I can imagine quite a few more chains | |
13831 | - * over time (eg laptop power reset chains, reboot chain (to clean | |
13832 | + * over time (eg laptop power reset chains, reboot chain (to clean | |
13833 | * device units up), device [un]mount chain, module load/unload chain, | |
13834 | - * low memory chain, screenblank chain (for plug in modular screenblankers) | |
13835 | + * low memory chain, screenblank chain (for plug in modular screenblankers) | |
13836 | * VC switch chains (for loadable kernel svgalib VC switch helpers) etc... | |
13837 | */ | |
13838 | - | |
13839 | + | |
13840 | /* CPU notfiers are defined in include/linux/cpu.h. */ | |
c7c16703 | 13841 | |
e4b2b4a8 | 13842 | /* netdevice notifiers are defined in include/linux/netdevice.h */ |
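SRCU_NOTIFIER_INIT() plus the per-CPU srcu_data definition make statically defined SRCU notifier heads possible, which is why the two comments about mandatory runtime initialization are trimmed. A hedged sketch with hypothetical names; the register/call helpers are the existing SRCU notifier API:

    static int example_callback(struct notifier_block *nb,
                                unsigned long action, void *data)
    {
            return NOTIFY_OK;
    }

    static struct notifier_block example_nb = {
            .notifier_call = example_callback,
    };

    static SRCU_NOTIFIER_HEAD_STATIC(example_chain);

    /* srcu_notifier_chain_register(&example_chain, &example_nb); */
    /* srcu_notifier_call_chain(&example_chain, 0, NULL); */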
b3bbd485 JK |
13843 | diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h |
13844 | index 79b99d653e03..fb44e237316d 100644 | |
13845 | --- a/include/linux/percpu-rwsem.h | |
13846 | +++ b/include/linux/percpu-rwsem.h | |
13847 | @@ -29,7 +29,7 @@ static struct percpu_rw_semaphore name = { \ | |
e4b2b4a8 JK |
13848 | extern int __percpu_down_read(struct percpu_rw_semaphore *, int); |
13849 | extern void __percpu_up_read(struct percpu_rw_semaphore *); | |
1a6e0f06 | 13850 | |
e4b2b4a8 JK |
13851 | -static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore *sem) |
13852 | +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) | |
13853 | { | |
13854 | might_sleep(); | |
1a6e0f06 | 13855 | |
b3bbd485 | 13856 | @@ -47,16 +47,10 @@ static inline void percpu_down_read_preempt_disable(struct percpu_rw_semaphore * |
e4b2b4a8 JK |
13857 | __this_cpu_inc(*sem->read_count); |
13858 | if (unlikely(!rcu_sync_is_idle(&sem->rss))) | |
13859 | __percpu_down_read(sem, false); /* Unconditional memory barrier */ | |
13860 | - barrier(); | |
13861 | /* | |
13862 | - * The barrier() prevents the compiler from | |
13863 | + * The preempt_enable() prevents the compiler from | |
13864 | * bleeding the critical section out. | |
13865 | */ | |
13866 | -} | |
1a6e0f06 | 13867 | - |
e4b2b4a8 JK |
13868 | -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) |
13869 | -{ | |
13870 | - percpu_down_read_preempt_disable(sem); | |
13871 | preempt_enable(); | |
13872 | } | |
1a6e0f06 | 13873 | |
b3bbd485 | 13874 | @@ -83,13 +77,9 @@ static inline int percpu_down_read_trylock(struct percpu_rw_semaphore *sem) |
e4b2b4a8 JK |
13875 | return ret; |
13876 | } | |
1a6e0f06 | 13877 | |
e4b2b4a8 JK |
13878 | -static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem) |
13879 | +static inline void percpu_up_read(struct percpu_rw_semaphore *sem) | |
1a6e0f06 | 13880 | { |
e4b2b4a8 JK |
13881 | - /* |
13882 | - * The barrier() prevents the compiler from | |
13883 | - * bleeding the critical section out. | |
13884 | - */ | |
13885 | - barrier(); | |
13886 | + preempt_disable(); | |
13887 | /* | |
13888 | * Same as in percpu_down_read(). | |
13889 | */ | |
b3bbd485 | 13890 | @@ -102,12 +92,6 @@ static inline void percpu_up_read_preempt_enable(struct percpu_rw_semaphore *sem |
e4b2b4a8 | 13891 | rwsem_release(&sem->rw_sem.dep_map, 1, _RET_IP_); |
1a6e0f06 | 13892 | } |
1f39f580 | 13893 | |
e4b2b4a8 JK |
13894 | -static inline void percpu_up_read(struct percpu_rw_semaphore *sem) |
13895 | -{ | |
13896 | - preempt_disable(); | |
13897 | - percpu_up_read_preempt_enable(sem); | |
13898 | -} | |
13899 | - | |
13900 | extern void percpu_down_write(struct percpu_rw_semaphore *); | |
13901 | extern void percpu_up_write(struct percpu_rw_semaphore *); | |
1f39f580 | 13902 | |
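The *_preempt_disable()/*_preempt_enable() split variants disappear: percpu_down_read() now re-enables preemption before returning and percpu_up_read() disables it only around the release, so the whole read-side section runs preemptible. Callers keep the plain pair, for any struct percpu_rw_semaphore sem:

    percpu_down_read(&sem);     /* returns with preemption enabled */
    /* ... read-side work, now preemptible throughout ... */
    percpu_up_read(&sem);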
b3bbd485 JK |
13903 | diff --git a/include/linux/percpu.h b/include/linux/percpu.h |
13904 | index 296bbe49d5d1..4414796e3941 100644 | |
13905 | --- a/include/linux/percpu.h | |
13906 | +++ b/include/linux/percpu.h | |
13907 | @@ -19,6 +19,35 @@ | |
13908 | #define PERCPU_MODULE_RESERVE 0 | |
13909 | #endif | |
13910 | ||
13911 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
13912 | + | |
13913 | +#define get_local_var(var) (*({ \ | |
13914 | + migrate_disable(); \ | |
13915 | + this_cpu_ptr(&var); })) | |
13916 | + | |
13917 | +#define put_local_var(var) do { \ | |
13918 | + (void)&(var); \ | |
13919 | + migrate_enable(); \ | |
13920 | +} while (0) | |
13921 | + | |
13922 | +# define get_local_ptr(var) ({ \ | |
13923 | + migrate_disable(); \ | |
13924 | + this_cpu_ptr(var); }) | |
13925 | + | |
13926 | +# define put_local_ptr(var) do { \ | |
13927 | + (void)(var); \ | |
13928 | + migrate_enable(); \ | |
13929 | +} while (0) | |
13930 | + | |
13931 | +#else | |
13932 | + | |
13933 | +#define get_local_var(var) get_cpu_var(var) | |
13934 | +#define put_local_var(var) put_cpu_var(var) | |
13935 | +#define get_local_ptr(var) get_cpu_ptr(var) | |
13936 | +#define put_local_ptr(var) put_cpu_ptr(var) | |
13937 | + | |
13938 | +#endif | |
13939 | + | |
13940 | /* minimum unit size, also is the maximum supported allocation size */ | |
13941 | #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10) | |
13942 | ||
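get_local_var() and get_local_ptr() give RT a preemptible stand-in for get_cpu_var()/get_cpu_ptr(): on PREEMPT_RT_FULL they only pin the task to its current CPU via migrate_disable(), and elsewhere they fall back to the classic preemption-disabling forms. A minimal sketch with a hypothetical per-CPU counter:

    static DEFINE_PER_CPU(int, example_counter);

    static void bump_counter(void)
    {
            get_local_var(example_counter)++;   /* migrate_disable() on RT */
            put_local_var(example_counter);     /* migrate_enable() on RT */
    }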
13943 | diff --git a/include/linux/pid.h b/include/linux/pid.h | |
13944 | index dfd684ce0787..bc954a99aa70 100644 | |
13945 | --- a/include/linux/pid.h | |
13946 | +++ b/include/linux/pid.h | |
e4b2b4a8 JK |
13947 | @@ -3,6 +3,7 @@ |
13948 | #define _LINUX_PID_H | |
1f39f580 | 13949 | |
e4b2b4a8 JK |
13950 | #include <linux/rculist.h> |
13951 | +#include <linux/atomic.h> | |
1f39f580 | 13952 | |
e4b2b4a8 JK |
13953 | enum pid_type |
13954 | { | |
b3bbd485 | 13955 | diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h |
5dd41b01 | 13956 | index 437a539898ae..de5c49b0dccf 100644 |
b3bbd485 JK |
13957 | --- a/include/linux/posix-timers.h |
13958 | +++ b/include/linux/posix-timers.h | |
13959 | @@ -101,8 +101,8 @@ struct k_itimer { | |
e4b2b4a8 JK |
13960 | struct { |
13961 | struct alarm alarmtimer; | |
13962 | } alarm; | |
13963 | - struct rcu_head rcu; | |
13964 | } it; | |
13965 | + struct rcu_head rcu; | |
13966 | }; | |
1f39f580 | 13967 | |
e4b2b4a8 | 13968 | void run_posix_cpu_timers(struct task_struct *task); |
b3bbd485 JK |
13969 | diff --git a/include/linux/preempt.h b/include/linux/preempt.h |
13970 | index 5bd3f151da78..6728662a81e8 100644 | |
13971 | --- a/include/linux/preempt.h | |
13972 | +++ b/include/linux/preempt.h | |
e4b2b4a8 JK |
13973 | @@ -51,7 +51,11 @@ |
13974 | #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) | |
13975 | #define NMI_OFFSET (1UL << NMI_SHIFT) | |
1f39f580 | 13976 | |
e4b2b4a8 JK |
13977 | -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) |
13978 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
13979 | +# define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) | |
13980 | +#else | |
13981 | +# define SOFTIRQ_DISABLE_OFFSET (0) | |
13982 | +#endif | |
1f39f580 | 13983 | |
e4b2b4a8 JK |
13984 | /* We use the MSB mostly because its available */ |
13985 | #define PREEMPT_NEED_RESCHED 0x80000000 | |
13986 | @@ -81,9 +85,15 @@ | |
13987 | #include <asm/preempt.h> | |
1f39f580 | 13988 | |
e4b2b4a8 JK |
13989 | #define hardirq_count() (preempt_count() & HARDIRQ_MASK) |
13990 | -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) | |
13991 | #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | |
13992 | | NMI_MASK)) | |
13993 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
13994 | +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) | |
13995 | +# define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) | |
13996 | +#else | |
13997 | +# define softirq_count() (0UL) | |
13998 | +extern int in_serving_softirq(void); | |
13999 | +#endif | |
1f39f580 | 14000 | |
e4b2b4a8 JK |
14001 | /* |
14002 | * Are we doing bottom half or hardware interrupt processing? | |
14003 | @@ -101,7 +111,6 @@ | |
14004 | #define in_irq() (hardirq_count()) | |
14005 | #define in_softirq() (softirq_count()) | |
14006 | #define in_interrupt() (irq_count()) | |
14007 | -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) | |
14008 | #define in_nmi() (preempt_count() & NMI_MASK) | |
14009 | #define in_task() (!(preempt_count() & \ | |
14010 | (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) | |
14011 | @@ -118,7 +127,11 @@ | |
14012 | /* | |
14013 | * The preempt_count offset after spin_lock() | |
14014 | */ | |
14015 | +#if !defined(CONFIG_PREEMPT_RT_FULL) | |
14016 | #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET | |
14017 | +#else | |
14018 | +#define PREEMPT_LOCK_OFFSET 0 | |
14019 | +#endif | |
1f39f580 | 14020 | |
e4b2b4a8 JK |
14021 | /* |
14022 | * The preempt_count offset needed for things like: | |
b3bbd485 | 14023 | @@ -167,6 +180,20 @@ extern void preempt_count_sub(int val); |
e4b2b4a8 JK |
14024 | #define preempt_count_inc() preempt_count_add(1) |
14025 | #define preempt_count_dec() preempt_count_sub(1) | |
1f39f580 | 14026 | |
e4b2b4a8 JK |
14027 | +#ifdef CONFIG_PREEMPT_LAZY |
14028 | +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) | |
14029 | +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) | |
14030 | +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) | |
14031 | +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) | |
14032 | +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) | |
14033 | +#else | |
14034 | +#define add_preempt_lazy_count(val) do { } while (0) | |
14035 | +#define sub_preempt_lazy_count(val) do { } while (0) | |
14036 | +#define inc_preempt_lazy_count() do { } while (0) | |
14037 | +#define dec_preempt_lazy_count() do { } while (0) | |
14038 | +#define preempt_lazy_count() (0) | |
14039 | +#endif | |
14040 | + | |
14041 | #ifdef CONFIG_PREEMPT_COUNT | |
1f39f580 | 14042 | |
e4b2b4a8 | 14043 | #define preempt_disable() \ |
b3bbd485 | 14044 | @@ -175,16 +202,53 @@ do { \ |
e4b2b4a8 JK |
14045 | barrier(); \ |
14046 | } while (0) | |
1f39f580 | 14047 | |
e4b2b4a8 JK |
14048 | +#define preempt_lazy_disable() \ |
14049 | +do { \ | |
14050 | + inc_preempt_lazy_count(); \ | |
14051 | + barrier(); \ | |
14052 | +} while (0) | |
14053 | + | |
14054 | #define sched_preempt_enable_no_resched() \ | |
14055 | do { \ | |
14056 | barrier(); \ | |
14057 | preempt_count_dec(); \ | |
14058 | } while (0) | |
1f39f580 | 14059 | |
e4b2b4a8 JK |
14060 | -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() |
14061 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
14062 | +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() | |
14063 | +# define preempt_check_resched_rt() preempt_check_resched() | |
14064 | +#else | |
14065 | +# define preempt_enable_no_resched() preempt_enable() | |
14066 | +# define preempt_check_resched_rt() barrier(); | |
14067 | +#endif | |
1f39f580 | 14068 | |
e4b2b4a8 | 14069 | #define preemptible() (preempt_count() == 0 && !irqs_disabled()) |
1f39f580 | 14070 | |
b3bbd485 | 14071 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
14072 | + |
14073 | +extern void migrate_disable(void); | |
14074 | +extern void migrate_enable(void); | |
14075 | + | |
14076 | +int __migrate_disabled(struct task_struct *p); | |
14077 | + | |
14078 | +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) | |
14079 | + | |
14080 | +extern void migrate_disable(void); | |
14081 | +extern void migrate_enable(void); | |
14082 | +static inline int __migrate_disabled(struct task_struct *p) | |
14083 | +{ | |
14084 | + return 0; | |
14085 | +} | |
14086 | + | |
14087 | +#else | |
b3bbd485 JK |
14088 | +#define migrate_disable() preempt_disable() |
14089 | +#define migrate_enable() preempt_enable() | |
e4b2b4a8 JK |
14090 | +static inline int __migrate_disabled(struct task_struct *p) |
14091 | +{ | |
14092 | + return 0; | |
14093 | +} | |
14094 | +#endif | |
14095 | + | |
14096 | #ifdef CONFIG_PREEMPT | |
14097 | #define preempt_enable() \ | |
14098 | do { \ | |
b3bbd485 | 14099 | @@ -206,6 +270,13 @@ do { \ |
e4b2b4a8 JK |
14100 | __preempt_schedule(); \ |
14101 | } while (0) | |
1f39f580 | 14102 | |
e4b2b4a8 JK |
14103 | +#define preempt_lazy_enable() \ |
14104 | +do { \ | |
14105 | + dec_preempt_lazy_count(); \ | |
14106 | + barrier(); \ | |
14107 | + preempt_check_resched(); \ | |
14108 | +} while (0) | |
14109 | + | |
14110 | #else /* !CONFIG_PREEMPT */ | |
14111 | #define preempt_enable() \ | |
14112 | do { \ | |
b3bbd485 | 14113 | @@ -213,6 +284,12 @@ do { \ |
e4b2b4a8 JK |
14114 | preempt_count_dec(); \ |
14115 | } while (0) | |
1f39f580 | 14116 | |
e4b2b4a8 JK |
14117 | +#define preempt_lazy_enable() \ |
14118 | +do { \ | |
14119 | + dec_preempt_lazy_count(); \ | |
14120 | + barrier(); \ | |
14121 | +} while (0) | |
14122 | + | |
14123 | #define preempt_enable_notrace() \ | |
14124 | do { \ | |
14125 | barrier(); \ | |
b3bbd485 | 14126 | @@ -251,8 +328,16 @@ do { \ |
e4b2b4a8 JK |
14127 | #define preempt_disable_notrace() barrier() |
14128 | #define preempt_enable_no_resched_notrace() barrier() | |
14129 | #define preempt_enable_notrace() barrier() | |
14130 | +#define preempt_check_resched_rt() barrier() | |
14131 | #define preemptible() 0 | |
1f39f580 | 14132 | |
e4b2b4a8 JK |
14133 | +#define migrate_disable() barrier() |
14134 | +#define migrate_enable() barrier() | |
14135 | + | |
14136 | +static inline int __migrate_disabled(struct task_struct *p) | |
14137 | +{ | |
14138 | + return 0; | |
14139 | +} | |
14140 | #endif /* CONFIG_PREEMPT_COUNT */ | |
1f39f580 | 14141 | |
e4b2b4a8 | 14142 | #ifdef MODULE |
b3bbd485 | 14143 | @@ -271,10 +356,22 @@ do { \ |
e4b2b4a8 JK |
14144 | } while (0) |
14145 | #define preempt_fold_need_resched() \ | |
14146 | do { \ | |
14147 | - if (tif_need_resched()) \ | |
14148 | + if (tif_need_resched_now()) \ | |
14149 | set_preempt_need_resched(); \ | |
14150 | } while (0) | |
1f39f580 | 14151 | |
e4b2b4a8 JK |
14152 | +#ifdef CONFIG_PREEMPT_RT_FULL |
14153 | +# define preempt_disable_rt() preempt_disable() | |
14154 | +# define preempt_enable_rt() preempt_enable() | |
14155 | +# define preempt_disable_nort() barrier() | |
14156 | +# define preempt_enable_nort() barrier() | |
14157 | +#else | |
14158 | +# define preempt_disable_rt() barrier() | |
14159 | +# define preempt_enable_rt() barrier() | |
14160 | +# define preempt_disable_nort() preempt_disable() | |
14161 | +# define preempt_enable_nort() preempt_enable() | |
14162 | +#endif | |
14163 | + | |
14164 | #ifdef CONFIG_PREEMPT_NOTIFIERS | |
1f39f580 | 14165 | |
e4b2b4a8 | 14166 | struct preempt_notifier; |
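migrate_disable()/migrate_enable() are the centerpiece of this hunk: the task is pinned to its CPU but stays preemptible and may take sleeping locks, while on non-RT builds they collapse to preempt_disable()/preempt_enable() (or plain barriers without CONFIG_PREEMPT_COUNT). The _rt/_nort helpers then let shared code pick one behaviour per configuration. A sketch, with some_counter as a hypothetical per-CPU variable:

    migrate_disable();
    /* No migration from here on; preemption (and, on RT, sleeping
     * locks) remain allowed inside the section. */
    this_cpu_inc(some_counter);
    migrate_enable();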
b3bbd485 JK |
14167 | diff --git a/include/linux/printk.h b/include/linux/printk.h |
14168 | index 6106befed756..1dba9cb7b91b 100644 | |
14169 | --- a/include/linux/printk.h | |
14170 | +++ b/include/linux/printk.h | |
14171 | @@ -142,9 +142,11 @@ struct va_format { | |
e4b2b4a8 JK |
14172 | #ifdef CONFIG_EARLY_PRINTK |
14173 | extern asmlinkage __printf(1, 2) | |
14174 | void early_printk(const char *fmt, ...); | |
14175 | +extern void printk_kill(void); | |
14176 | #else | |
14177 | static inline __printf(1, 2) __cold | |
14178 | void early_printk(const char *s, ...) { } | |
14179 | +static inline void printk_kill(void) { } | |
14180 | #endif | |
1f39f580 | 14181 | |
e4b2b4a8 | 14182 | #ifdef CONFIG_PRINTK_NMI |
b3bbd485 JK |
14183 | diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h |
14184 | index 567ebb5eaab0..9da7ea957399 100644 | |
14185 | --- a/include/linux/radix-tree.h | |
14186 | +++ b/include/linux/radix-tree.h | |
14187 | @@ -328,6 +328,8 @@ unsigned int radix_tree_gang_lookup_slot(const struct radix_tree_root *, | |
e4b2b4a8 JK |
14188 | int radix_tree_preload(gfp_t gfp_mask); |
14189 | int radix_tree_maybe_preload(gfp_t gfp_mask); | |
14190 | int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); | |
14191 | +void radix_tree_preload_end(void); | |
14192 | + | |
14193 | void radix_tree_init(void); | |
14194 | void *radix_tree_tag_set(struct radix_tree_root *, | |
14195 | unsigned long index, unsigned int tag); | |
b3bbd485 | 14196 | @@ -347,11 +349,6 @@ unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *, |
e4b2b4a8 JK |
14197 | unsigned int max_items, unsigned int tag); |
14198 | int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag); | |
1f39f580 | 14199 | |
e4b2b4a8 JK |
14200 | -static inline void radix_tree_preload_end(void) |
14201 | -{ | |
14202 | - preempt_enable(); | |
14203 | -} | |
14204 | - | |
14205 | int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); | |
14206 | int radix_tree_split(struct radix_tree_root *, unsigned long index, | |
14207 | unsigned new_order); | |
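radix_tree_preload_end() moves out of line because its body is no longer a bare preempt_enable(); the RT implementation has to undo whatever radix_tree_preload() now acquires. The caller-side preload pattern is unchanged:

    if (radix_tree_preload(GFP_KERNEL))
            return -ENOMEM;
    spin_lock(&tree_lock);              /* tree_lock is hypothetical */
    /* radix_tree_insert(&root, index, item); */
    spin_unlock(&tree_lock);
    radix_tree_preload_end();           /* now an out-of-line call */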
b3bbd485 JK |
14208 | diff --git a/include/linux/random.h b/include/linux/random.h |
14209 | index 4024f7d9c77d..462d752a739b 100644 | |
14210 | --- a/include/linux/random.h | |
14211 | +++ b/include/linux/random.h | |
14212 | @@ -32,7 +32,7 @@ static inline void add_latent_entropy(void) {} | |
1f39f580 | 14213 | |
e4b2b4a8 JK |
14214 | extern void add_input_randomness(unsigned int type, unsigned int code, |
14215 | unsigned int value) __latent_entropy; | |
14216 | -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; | |
14217 | +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy; | |
1f39f580 | 14218 | |
e4b2b4a8 JK |
14219 | extern void get_random_bytes(void *buf, int nbytes); |
14220 | extern int wait_for_random_bytes(void); | |
b3bbd485 JK |
14221 | diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h |
14222 | index d574361943ea..0a9f442409b9 100644 | |
14223 | --- a/include/linux/rbtree.h | |
14224 | +++ b/include/linux/rbtree.h | |
e4b2b4a8 | 14225 | @@ -31,7 +31,7 @@ |
1f39f580 | 14226 | |
e4b2b4a8 JK |
14227 | #include <linux/kernel.h> |
14228 | #include <linux/stddef.h> | |
14229 | -#include <linux/rcupdate.h> | |
14230 | +#include <linux/rcu_assign_pointer.h> | |
1f39f580 | 14231 | |
e4b2b4a8 JK |
14232 | struct rb_node { |
14233 | unsigned long __rb_parent_color; | |
b3bbd485 JK |
14234 | diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h |
14235 | index 6bfd2b581f75..af8a61be2d8d 100644 | |
14236 | --- a/include/linux/rbtree_augmented.h | |
14237 | +++ b/include/linux/rbtree_augmented.h | |
14238 | @@ -26,6 +26,7 @@ | |
14239 | ||
14240 | #include <linux/compiler.h> | |
14241 | #include <linux/rbtree.h> | |
14242 | +#include <linux/rcupdate.h> | |
14243 | ||
14244 | /* | |
14245 | * Please note - only struct rb_augment_callbacks and the prototypes for | |
14246 | diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h | |
14247 | index ece43e882b56..7d012faa509a 100644 | |
14248 | --- a/include/linux/rbtree_latch.h | |
14249 | +++ b/include/linux/rbtree_latch.h | |
e4b2b4a8 | 14250 | @@ -35,6 +35,7 @@ |
1f39f580 | 14251 | |
e4b2b4a8 JK |
14252 | #include <linux/rbtree.h> |
14253 | #include <linux/seqlock.h> | |
14254 | +#include <linux/rcupdate.h> | |
1f39f580 | 14255 | |
e4b2b4a8 JK |
14256 | struct latch_tree_node { |
14257 | struct rb_node node[2]; | |
b3bbd485 JK |
14258 | diff --git a/include/linux/rcu_assign_pointer.h b/include/linux/rcu_assign_pointer.h |
14259 | new file mode 100644 | |
14260 | index 000000000000..7066962a4379 | |
14261 | --- /dev/null | |
14262 | +++ b/include/linux/rcu_assign_pointer.h | |
e4b2b4a8 JK |
14263 | @@ -0,0 +1,54 @@ |
14264 | +#ifndef __LINUX_RCU_ASSIGN_POINTER_H__ | |
14265 | +#define __LINUX_RCU_ASSIGN_POINTER_H__ | |
14266 | +#include <linux/compiler.h> | |
14267 | +#include <asm/barrier.h> | |
14268 | + | |
14269 | +/** | |
14270 | + * RCU_INITIALIZER() - statically initialize an RCU-protected global variable | |
14271 | + * @v: The value to statically initialize with. | |
14272 | + */ | |
14273 | +#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) | |
14274 | + | |
14275 | +/** | |
14276 | + * rcu_assign_pointer() - assign to RCU-protected pointer | |
14277 | + * @p: pointer to assign to | |
14278 | + * @v: value to assign (publish) | |
14279 | + * | |
14280 | + * Assigns the specified value to the specified RCU-protected | |
14281 | + * pointer, ensuring that any concurrent RCU readers will see | |
14282 | + * any prior initialization. | |
14283 | + * | |
14284 | + * Inserts memory barriers on architectures that require them | |
14285 | + * (which is most of them), and also prevents the compiler from | |
14286 | + * reordering the code that initializes the structure after the pointer | |
14287 | + * assignment. More importantly, this call documents which pointers | |
14288 | + * will be dereferenced by RCU read-side code. | |
14289 | + * | |
14290 | + * In some special cases, you may use RCU_INIT_POINTER() instead | |
14291 | + * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due | |
14292 | + * to the fact that it does not constrain either the CPU or the compiler. | |
14293 | + * That said, using RCU_INIT_POINTER() when you should have used | |
14294 | + * rcu_assign_pointer() is a very bad thing that results in | |
14295 | + * impossible-to-diagnose memory corruption. So please be careful. | |
14296 | + * See the RCU_INIT_POINTER() comment header for details. | |
14297 | + * | |
14298 | + * Note that rcu_assign_pointer() evaluates each of its arguments only | |
14299 | + * once, appearances notwithstanding. One of the "extra" evaluations | |
14300 | + * is in typeof() and the other visible only to sparse (__CHECKER__), | |
14301 | + * neither of which actually execute the argument. As with most cpp | |
14302 | + * macros, this execute-arguments-only-once property is important, so | |
14303 | + * please be careful when making changes to rcu_assign_pointer() and the | |
14304 | + * other macros that it invokes. | |
14305 | + */ | |
14306 | +#define rcu_assign_pointer(p, v) \ | |
14307 | +({ \ | |
14308 | + uintptr_t _r_a_p__v = (uintptr_t)(v); \ | |
14309 | + \ | |
14310 | + if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ | |
14311 | + WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ | |
14312 | + else \ | |
14313 | + smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ | |
14314 | + _r_a_p__v; \ | |
14315 | +}) | |
14316 | + | |
14317 | +#endif | |
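Carving rcu_assign_pointer() and RCU_INITIALIZER() out into this small header lets rbtree.h, patched earlier, publish pointers without pulling in all of rcupdate.h, breaking an include cycle. Usage is untouched; a standard publish sequence against a hypothetical RCU-protected global_foo pointer:

    struct foo *p = kzalloc(sizeof(*p), GFP_KERNEL);

    if (!p)
            return -ENOMEM;
    p->a = 1;
    rcu_assign_pointer(global_foo, p);  /* readers observe p->a initialized */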
b3bbd485 JK |
14318 | diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h |
14319 | index a6ddc42f87a5..70996e134818 100644 | |
14320 | --- a/include/linux/rcupdate.h | |
14321 | +++ b/include/linux/rcupdate.h | |
e4b2b4a8 JK |
14322 | @@ -42,6 +42,7 @@ |
14323 | #include <linux/lockdep.h> | |
14324 | #include <asm/processor.h> | |
14325 | #include <linux/cpumask.h> | |
14326 | +#include <linux/rcu_assign_pointer.h> | |
1f39f580 | 14327 | |
e4b2b4a8 JK |
14328 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) |
14329 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) | |
b3bbd485 | 14330 | @@ -55,7 +56,11 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); |
e4b2b4a8 JK |
14331 | #define call_rcu call_rcu_sched |
14332 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | |
1f39f580 | 14333 | |
e4b2b4a8 JK |
14334 | +#ifdef CONFIG_PREEMPT_RT_FULL |
14335 | +#define call_rcu_bh call_rcu | |
14336 | +#else | |
14337 | void call_rcu_bh(struct rcu_head *head, rcu_callback_t func); | |
14338 | +#endif | |
14339 | void call_rcu_sched(struct rcu_head *head, rcu_callback_t func); | |
14340 | void synchronize_sched(void); | |
14341 | void rcu_barrier_tasks(void); | |
b3bbd485 | 14342 | @@ -74,6 +79,11 @@ void synchronize_rcu(void); |
e4b2b4a8 JK |
14343 | * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. |
14344 | */ | |
14345 | #define rcu_preempt_depth() (current->rcu_read_lock_nesting) | |
14346 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
14347 | +#define sched_rcu_preempt_depth() rcu_preempt_depth() | |
14348 | +#else | |
14349 | +static inline int sched_rcu_preempt_depth(void) { return 0; } | |
14350 | +#endif | |
1f39f580 | 14351 | |
e4b2b4a8 | 14352 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
1f39f580 | 14353 | |
b3bbd485 | 14354 | @@ -99,6 +109,8 @@ static inline int rcu_preempt_depth(void) |
e4b2b4a8 | 14355 | return 0; |
1f39f580 JK |
14356 | } |
14357 | ||
e4b2b4a8 JK |
14358 | +#define sched_rcu_preempt_depth() rcu_preempt_depth() |
14359 | + | |
14360 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | |
1f39f580 | 14361 | |
e4b2b4a8 | 14362 | /* Internal to kernel */ |
b3bbd485 | 14363 | @@ -255,7 +267,14 @@ extern struct lockdep_map rcu_sched_lock_map; |
e4b2b4a8 JK |
14364 | extern struct lockdep_map rcu_callback_map; |
14365 | int debug_lockdep_rcu_enabled(void); | |
14366 | int rcu_read_lock_held(void); | |
14367 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
14368 | +static inline int rcu_read_lock_bh_held(void) | |
14369 | +{ | |
14370 | + return rcu_read_lock_held(); | |
14371 | +} | |
14372 | +#else | |
14373 | int rcu_read_lock_bh_held(void); | |
14374 | +#endif | |
14375 | int rcu_read_lock_sched_held(void); | |
1f39f580 | 14376 | |
e4b2b4a8 | 14377 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
b3bbd485 JK |
14378 | @@ -364,54 +383,6 @@ static inline void rcu_preempt_sleep_check(void) { } |
14379 | ((typeof(*p) __force __kernel *)(________p1)); \ | |
e4b2b4a8 | 14380 | }) |
1f39f580 | 14381 | |
b3bbd485 | 14382 | -/** |
e4b2b4a8 JK |
14383 | - * RCU_INITIALIZER() - statically initialize an RCU-protected global variable |
14384 | - * @v: The value to statically initialize with. | |
14385 | - */ | |
14386 | -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) | |
14387 | - | |
14388 | -/** | |
14389 | - * rcu_assign_pointer() - assign to RCU-protected pointer | |
14390 | - * @p: pointer to assign to | |
14391 | - * @v: value to assign (publish) | |
14392 | - * | |
14393 | - * Assigns the specified value to the specified RCU-protected | |
14394 | - * pointer, ensuring that any concurrent RCU readers will see | |
14395 | - * any prior initialization. | |
14396 | - * | |
14397 | - * Inserts memory barriers on architectures that require them | |
14398 | - * (which is most of them), and also prevents the compiler from | |
14399 | - * reordering the code that initializes the structure after the pointer | |
14400 | - * assignment. More importantly, this call documents which pointers | |
14401 | - * will be dereferenced by RCU read-side code. | |
14402 | - * | |
14403 | - * In some special cases, you may use RCU_INIT_POINTER() instead | |
14404 | - * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due | |
14405 | - * to the fact that it does not constrain either the CPU or the compiler. | |
14406 | - * That said, using RCU_INIT_POINTER() when you should have used | |
14407 | - * rcu_assign_pointer() is a very bad thing that results in | |
14408 | - * impossible-to-diagnose memory corruption. So please be careful. | |
14409 | - * See the RCU_INIT_POINTER() comment header for details. | |
14410 | - * | |
14411 | - * Note that rcu_assign_pointer() evaluates each of its arguments only | |
14412 | - * once, appearances notwithstanding. One of the "extra" evaluations | |
14413 | - * is in typeof() and the other visible only to sparse (__CHECKER__), | |
14414 | - * neither of which actually execute the argument. As with most cpp | |
14415 | - * macros, this execute-arguments-only-once property is important, so | |
14416 | - * please be careful when making changes to rcu_assign_pointer() and the | |
14417 | - * other macros that it invokes. | |
14418 | - */ | |
14419 | -#define rcu_assign_pointer(p, v) \ | |
14420 | -({ \ | |
14421 | - uintptr_t _r_a_p__v = (uintptr_t)(v); \ | |
14422 | - \ | |
14423 | - if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ | |
14424 | - WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ | |
14425 | - else \ | |
14426 | - smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ | |
14427 | - _r_a_p__v; \ | |
14428 | -}) | |
14429 | - | |
b3bbd485 | 14430 | /** |
e4b2b4a8 JK |
14431 | * rcu_swap_protected() - swap an RCU and a regular pointer |
14432 | * @rcu_ptr: RCU pointer | |
b3bbd485 | 14433 | @@ -707,10 +678,14 @@ static inline void rcu_read_unlock(void) |
e4b2b4a8 JK |
14434 | static inline void rcu_read_lock_bh(void) |
14435 | { | |
14436 | local_bh_disable(); | |
14437 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
14438 | + rcu_read_lock(); | |
14439 | +#else | |
14440 | __acquire(RCU_BH); | |
14441 | rcu_lock_acquire(&rcu_bh_lock_map); | |
14442 | RCU_LOCKDEP_WARN(!rcu_is_watching(), | |
14443 | "rcu_read_lock_bh() used illegally while idle"); | |
14444 | +#endif | |
1f39f580 | 14445 | } |
1f39f580 | 14446 | |
e4b2b4a8 | 14447 | /* |
b3bbd485 | 14448 | @@ -720,10 +695,14 @@ static inline void rcu_read_lock_bh(void) |
e4b2b4a8 JK |
14449 | */ |
14450 | static inline void rcu_read_unlock_bh(void) | |
1a6e0f06 | 14451 | { |
e4b2b4a8 JK |
14452 | +#ifdef CONFIG_PREEMPT_RT_FULL |
14453 | + rcu_read_unlock(); | |
14454 | +#else | |
14455 | RCU_LOCKDEP_WARN(!rcu_is_watching(), | |
14456 | "rcu_read_unlock_bh() used illegally while idle"); | |
14457 | rcu_lock_release(&rcu_bh_lock_map); | |
14458 | __release(RCU_BH); | |
14459 | +#endif | |
14460 | local_bh_enable(); | |
1a6e0f06 | 14461 | } |
1a6e0f06 | 14462 | |
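Because RT runs softirqs in preemptible thread context, a BH section gives no implicit RCU protection of its own; rcu_read_lock_bh() therefore nests a real rcu_read_lock() inside local_bh_disable(), and the rcutree.h hunks below fold the bh flavours of synchronize/barrier onto the plain ones. Callers are unchanged:

    rcu_read_lock_bh();     /* on RT: local_bh_disable() + rcu_read_lock() */
    /* ... walk an RCU-bh protected structure ... */
    rcu_read_unlock_bh();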
b3bbd485 JK |
14463 | diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h |
14464 | index 37d6fd3b7ff8..a082fde7d6bc 100644 | |
14465 | --- a/include/linux/rcutree.h | |
14466 | +++ b/include/linux/rcutree.h | |
14467 | @@ -44,7 +44,11 @@ static inline void rcu_virt_note_context_switch(int cpu) | |
e4b2b4a8 | 14468 | rcu_note_context_switch(false); |
1a6e0f06 JK |
14469 | } |
14470 | ||
e4b2b4a8 JK |
14471 | +#ifdef CONFIG_PREEMPT_RT_FULL |
14472 | +# define synchronize_rcu_bh synchronize_rcu | |
14473 | +#else | |
14474 | void synchronize_rcu_bh(void); | |
14475 | +#endif | |
14476 | void synchronize_sched_expedited(void); | |
14477 | void synchronize_rcu_expedited(void); | |
1a6e0f06 | 14478 | |
b3bbd485 | 14479 | @@ -72,7 +76,11 @@ static inline void synchronize_rcu_bh_expedited(void) |
1a6e0f06 JK |
14480 | } |
14481 | ||
e4b2b4a8 JK |
14482 | void rcu_barrier(void); |
14483 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
14484 | +# define rcu_barrier_bh rcu_barrier | |
14485 | +#else | |
14486 | void rcu_barrier_bh(void); | |
14487 | +#endif | |
14488 | void rcu_barrier_sched(void); | |
14489 | unsigned long get_state_synchronize_rcu(void); | |
14490 | void cond_synchronize_rcu(unsigned long oldstate); | |
b3bbd485 JK |
14491 | diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h |
14492 | index 5caa062a02b2..abce5f5325e1 100644 | |
14493 | --- a/include/linux/ring_buffer.h | |
14494 | +++ b/include/linux/ring_buffer.h | |
14495 | @@ -34,10 +34,12 @@ struct ring_buffer_event { | |
e4b2b4a8 JK |
14496 | * array[0] = time delta (28 .. 59) |
14497 | * size = 8 bytes | |
14498 | * | |
14499 | - * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock | |
14500 | - * array[0] = tv_nsec | |
14501 | - * array[1..2] = tv_sec | |
14502 | - * size = 16 bytes | |
14503 | + * @RINGBUF_TYPE_TIME_STAMP: Absolute timestamp | |
14504 | + * Same format as TIME_EXTEND except that the | |
14505 | + * value is an absolute timestamp, not a delta | |
14506 | + * event.time_delta contains bottom 27 bits | |
14507 | + * array[0] = top (28 .. 59) bits | |
14508 | + * size = 8 bytes | |
14509 | * | |
14510 | * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: | |
14511 | * Data record | |
b3bbd485 | 14512 | @@ -54,12 +56,12 @@ enum ring_buffer_type { |
e4b2b4a8 JK |
14513 | RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, |
14514 | RINGBUF_TYPE_PADDING, | |
14515 | RINGBUF_TYPE_TIME_EXTEND, | |
14516 | - /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ | |
14517 | RINGBUF_TYPE_TIME_STAMP, | |
14518 | }; | |
1a6e0f06 | 14519 | |
e4b2b4a8 JK |
14520 | unsigned ring_buffer_event_length(struct ring_buffer_event *event); |
14521 | void *ring_buffer_event_data(struct ring_buffer_event *event); | |
14522 | +u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event); | |
1a6e0f06 | 14523 | |
e4b2b4a8 JK |
14524 | /* |
14525 | * ring_buffer_discard_commit will remove an event that has not | |
b3bbd485 | 14526 | @@ -115,6 +117,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, |
e4b2b4a8 JK |
14527 | int ring_buffer_write(struct ring_buffer *buffer, |
14528 | unsigned long length, void *data); | |
14529 | ||
14530 | +void ring_buffer_nest_start(struct ring_buffer *buffer); | |
14531 | +void ring_buffer_nest_end(struct ring_buffer *buffer); | |
14532 | + | |
14533 | struct ring_buffer_event * | |
14534 | ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, | |
14535 | unsigned long *lost_events); | |
b3bbd485 | 14536 | @@ -179,6 +184,8 @@ void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, |
e4b2b4a8 JK |
14537 | int cpu, u64 *ts); |
14538 | void ring_buffer_set_clock(struct ring_buffer *buffer, | |
14539 | u64 (*clock)(void)); | |
14540 | +void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs); | |
14541 | +bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer); | |
14542 | ||
14543 | size_t ring_buffer_page_len(void *page); | |
14544 | ||
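ring_buffer_nest_start()/ring_buffer_nest_end() declare an intentional nested write, so an event emitted from inside another event's write path is not discarded by the recursion protection. A sketch, with the reserve/commit pair left as comments since their surrounding context is caller-specific:

    ring_buffer_nest_start(buffer);
    /* event = ring_buffer_lock_reserve(buffer, length); */
    /* ... fill in the nested event's payload ... */
    /* ring_buffer_unlock_commit(buffer, event); */
    ring_buffer_nest_end(buffer);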
b3bbd485 JK |
14545 | diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h |
14546 | index 6fd615a0eea9..138bd1e183e0 100644 | |
14547 | --- a/include/linux/rtmutex.h | |
14548 | +++ b/include/linux/rtmutex.h | |
e4b2b4a8 JK |
14549 | @@ -14,11 +14,15 @@ |
14550 | #define __LINUX_RT_MUTEX_H | |
1a6e0f06 | 14551 | |
e4b2b4a8 JK |
14552 | #include <linux/linkage.h> |
14553 | +#include <linux/spinlock_types_raw.h> | |
14554 | #include <linux/rbtree.h> | |
14555 | -#include <linux/spinlock_types.h> | |
1a6e0f06 | 14556 | |
e4b2b4a8 | 14557 | extern int max_lock_depth; /* for sysctl */ |
1a6e0f06 | 14558 | |
e4b2b4a8 JK |
14559 | +#ifdef CONFIG_DEBUG_MUTEXES |
14560 | +#include <linux/debug_locks.h> | |
14561 | +#endif | |
14562 | + | |
14563 | /** | |
14564 | * The rt_mutex structure | |
14565 | * | |
b3bbd485 | 14566 | @@ -31,8 +35,8 @@ struct rt_mutex { |
e4b2b4a8 JK |
14567 | raw_spinlock_t wait_lock; |
14568 | struct rb_root_cached waiters; | |
14569 | struct task_struct *owner; | |
14570 | -#ifdef CONFIG_DEBUG_RT_MUTEXES | |
14571 | int save_state; | |
14572 | +#ifdef CONFIG_DEBUG_RT_MUTEXES | |
14573 | const char *name, *file; | |
14574 | int line; | |
14575 | void *magic; | |
b3bbd485 | 14576 | @@ -82,16 +86,23 @@ do { \ |
e4b2b4a8 JK |
14577 | #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) |
14578 | #endif | |
1a6e0f06 | 14579 | |
e4b2b4a8 JK |
14580 | -#define __RT_MUTEX_INITIALIZER(mutexname) \ |
14581 | - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ | |
14582 | +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ | |
14583 | + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ | |
14584 | , .waiters = RB_ROOT_CACHED \ | |
14585 | , .owner = NULL \ | |
14586 | __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ | |
14587 | - __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} | |
14588 | + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) | |
14589 | + | |
14590 | +#define __RT_MUTEX_INITIALIZER(mutexname) \ | |
14591 | + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) } | |
1a6e0f06 | 14592 | |
e4b2b4a8 JK |
14593 | #define DEFINE_RT_MUTEX(mutexname) \ |
14594 | struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) | |
1a6e0f06 | 14595 | |
e4b2b4a8 JK |
14596 | +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ |
14597 | + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ | |
14598 | + , .save_state = 1 } | |
14599 | + | |
14600 | /** | |
14601 | * rt_mutex_is_locked - is the mutex locked | |
14602 | * @lock: the mutex to be queried | |
b3bbd485 JK |
14603 | @@ -115,6 +126,7 @@ extern void rt_mutex_lock(struct rt_mutex *lock); |
14604 | #endif | |
1a6e0f06 | 14605 | |
e4b2b4a8 JK |
14606 | extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); |
14607 | +extern int rt_mutex_lock_killable(struct rt_mutex *lock); | |
14608 | extern int rt_mutex_timed_lock(struct rt_mutex *lock, | |
14609 | struct hrtimer_sleeper *timeout); | |
1a6e0f06 | 14610 | |
b3bbd485 JK |
14611 | diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h |
14612 | new file mode 100644 | |
14613 | index 000000000000..a9c4c2ac4d1f | |
14614 | --- /dev/null | |
14615 | +++ b/include/linux/rwlock_rt.h | |
e4b2b4a8 JK |
14616 | @@ -0,0 +1,119 @@ |
14617 | +#ifndef __LINUX_RWLOCK_RT_H | |
14618 | +#define __LINUX_RWLOCK_RT_H | |
14619 | + | |
14620 | +#ifndef __LINUX_SPINLOCK_H | |
14621 | +#error Do not include directly. Use spinlock.h | |
1a6e0f06 | 14622 | +#endif |
e4b2b4a8 JK |
14623 | + |
14624 | +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); | |
14625 | +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); | |
14626 | +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); | |
14627 | +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); | |
14628 | +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); | |
14629 | +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); | |
14630 | +extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock); | |
14631 | +extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock); | |
14632 | +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); | |
14633 | + | |
14634 | +#define read_can_lock(rwlock) rt_read_can_lock(rwlock) | |
14635 | +#define write_can_lock(rwlock) rt_write_can_lock(rwlock) | |
14636 | + | |
14637 | +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) | |
14638 | +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) | |
14639 | + | |
14640 | +static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags) | |
14641 | +{ | |
14642 | + /* XXX ARCH_IRQ_ENABLED */ | |
14643 | + *flags = 0; | |
14644 | + return rt_write_trylock(lock); | |
1a6e0f06 JK |
14645 | +} |
14646 | + | |
e4b2b4a8 JK |
14647 | +#define write_trylock_irqsave(lock, flags) \ |
14648 | + __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags))) | |
14649 | + | |
14650 | +#define read_lock_irqsave(lock, flags) \ | |
14651 | + do { \ | |
14652 | + typecheck(unsigned long, flags); \ | |
14653 | + rt_read_lock(lock); \ | |
14654 | + flags = 0; \ | |
14655 | + } while (0) | |
14656 | + | |
14657 | +#define write_lock_irqsave(lock, flags) \ | |
14658 | + do { \ | |
14659 | + typecheck(unsigned long, flags); \ | |
14660 | + rt_write_lock(lock); \ | |
14661 | + flags = 0; \ | |
14662 | + } while (0) | |
1a6e0f06 | 14663 | + |
e4b2b4a8 | 14664 | +#define read_lock(lock) rt_read_lock(lock) |
1a6e0f06 | 14665 | + |
e4b2b4a8 JK |
14666 | +#define read_lock_bh(lock) \ |
14667 | + do { \ | |
14668 | + local_bh_disable(); \ | |
14669 | + rt_read_lock(lock); \ | |
14670 | + } while (0) | |
1a6e0f06 | 14671 | + |
e4b2b4a8 | 14672 | +#define read_lock_irq(lock) read_lock(lock) |
1a6e0f06 | 14673 | + |
e4b2b4a8 | 14674 | +#define write_lock(lock) rt_write_lock(lock) |
1a6e0f06 | 14675 | + |
e4b2b4a8 JK |
14676 | +#define write_lock_bh(lock) \ |
14677 | + do { \ | |
14678 | + local_bh_disable(); \ | |
14679 | + rt_write_lock(lock); \ | |
14680 | + } while (0) | |
1a6e0f06 | 14681 | + |
e4b2b4a8 | 14682 | +#define write_lock_irq(lock) write_lock(lock) |
1a6e0f06 | 14683 | + |
e4b2b4a8 | 14684 | +#define read_unlock(lock) rt_read_unlock(lock) |
1a6e0f06 | 14685 | + |
e4b2b4a8 JK |
14686 | +#define read_unlock_bh(lock) \ |
14687 | + do { \ | |
14688 | + rt_read_unlock(lock); \ | |
14689 | + local_bh_enable(); \ | |
14690 | + } while (0) | |
1a6e0f06 | 14691 | + |
e4b2b4a8 | 14692 | +#define read_unlock_irq(lock) read_unlock(lock) |
1a6e0f06 | 14693 | + |
e4b2b4a8 JK |
14694 | +#define write_unlock(lock) rt_write_unlock(lock) |
14695 | + | |
14696 | +#define write_unlock_bh(lock) \ | |
14697 | + do { \ | |
14698 | + rt_write_unlock(lock); \ | |
14699 | + local_bh_enable(); \ | |
14700 | + } while (0) | |
14701 | + | |
14702 | +#define write_unlock_irq(lock) write_unlock(lock) | |
14703 | + | |
14704 | +#define read_unlock_irqrestore(lock, flags) \ | |
14705 | + do { \ | |
14706 | + typecheck(unsigned long, flags); \ | |
14707 | + (void) flags; \ | |
14708 | + rt_read_unlock(lock); \ | |
14709 | + } while (0) | |
14710 | + | |
14711 | +#define write_unlock_irqrestore(lock, flags) \ | |
14712 | + do { \ | |
14713 | + typecheck(unsigned long, flags); \ | |
14714 | + (void) flags; \ | |
14715 | + rt_write_unlock(lock); \ | |
14716 | + } while (0) | |
14717 | + | |
14718 | +#define rwlock_init(rwl) \ | |
14719 | +do { \ | |
14720 | + static struct lock_class_key __key; \ | |
14721 | + \ | |
14722 | + __rt_rwlock_init(rwl, #rwl, &__key); \ | |
14723 | +} while (0) | |
1a6e0f06 | 14724 | + |
1a6e0f06 | 14725 | +/* |
e4b2b4a8 | 14726 | + * Internal functions made global for CPU pinning |
1a6e0f06 | 14727 | + */ |
e4b2b4a8 JK |
14728 | +void __read_rt_lock(struct rt_rw_lock *lock); |
14729 | +int __read_rt_trylock(struct rt_rw_lock *lock); | |
14730 | +void __write_rt_lock(struct rt_rw_lock *lock); | |
14731 | +int __write_rt_trylock(struct rt_rw_lock *lock); | |
14732 | +void __read_rt_unlock(struct rt_rw_lock *lock); | |
14733 | +void __write_rt_unlock(struct rt_rw_lock *lock); | |
14734 | + | |
1a6e0f06 | 14735 | +#endif |
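With rwlock_t remapped to the reader-biased rt_rw_lock (defined in rwlock_types_rt.h just below), the _irqsave variants no longer disable interrupts: flags is typechecked, zeroed, and otherwise ignored. Source-level usage stays identical, sketched with a hypothetical lock:

    static DEFINE_RWLOCK(example_lock);

    static void example_update(void)
    {
            unsigned long flags;

            write_lock_irqsave(&example_lock, flags);   /* may sleep on RT */
            /* ... modify the protected data ... */
            write_unlock_irqrestore(&example_lock, flags);
    }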
b3bbd485 JK |
14736 | diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h |
14737 | index cc0072e93e36..5317cd957292 100644 | |
14738 | --- a/include/linux/rwlock_types.h | |
14739 | +++ b/include/linux/rwlock_types.h | |
e4b2b4a8 JK |
14740 | @@ -1,6 +1,10 @@ |
14741 | #ifndef __LINUX_RWLOCK_TYPES_H | |
14742 | #define __LINUX_RWLOCK_TYPES_H | |
1a6e0f06 | 14743 | |
e4b2b4a8 JK |
14744 | +#if !defined(__LINUX_SPINLOCK_TYPES_H) |
14745 | +# error "Do not include directly, include spinlock_types.h" | |
14746 | +#endif | |
14747 | + | |
1a6e0f06 | 14748 | /* |
e4b2b4a8 JK |
14749 | * include/linux/rwlock_types.h - generic rwlock type definitions |
14750 | * and initializers | |
b3bbd485 JK |
14751 | diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h |
14752 | new file mode 100644 | |
14753 | index 000000000000..546a1f8f1274 | |
14754 | --- /dev/null | |
14755 | +++ b/include/linux/rwlock_types_rt.h | |
e4b2b4a8 JK |
14756 | @@ -0,0 +1,55 @@ |
14757 | +#ifndef __LINUX_RWLOCK_TYPES_RT_H | |
14758 | +#define __LINUX_RWLOCK_TYPES_RT_H | |
14759 | + | |
14760 | +#ifndef __LINUX_SPINLOCK_TYPES_H | |
14761 | +#error "Do not include directly. Include spinlock_types.h instead" | |
14762 | +#endif | |
14763 | + | |
14764 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
14765 | +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } | |
14766 | +#else | |
14767 | +# define RW_DEP_MAP_INIT(lockname) | |
14768 | +#endif | |
14769 | + | |
14770 | +typedef struct rt_rw_lock rwlock_t; | |
14771 | + | |
14772 | +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name) | |
14773 | + | |
14774 | +#define DEFINE_RWLOCK(name) \ | |
14775 | + rwlock_t name = __RW_LOCK_UNLOCKED(name) | |
14776 | + | |
14777 | +/* | |
14778 | + * A reader biased implementation primarily for CPU pinning. | |
14779 | + * | |
14780 | + * Can be selected as general replacement for the single reader RT rwlock | |
14781 | + * variant | |
14782 | + */ | |
14783 | +struct rt_rw_lock { | |
14784 | + struct rt_mutex rtmutex; | |
14785 | + atomic_t readers; | |
14786 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
14787 | + struct lockdep_map dep_map; | |
14788 | +#endif | |
14789 | +}; | |
14790 | + | |
14791 | +#define READER_BIAS (1U << 31) | |
14792 | +#define WRITER_BIAS (1U << 30) | |
14793 | + | |
14794 | +#define __RWLOCK_RT_INITIALIZER(name) \ | |
14795 | +{ \ | |
14796 | + .readers = ATOMIC_INIT(READER_BIAS), \ | |
14797 | + .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \ | |
14798 | + RW_DEP_MAP_INIT(name) \ | |
14799 | +} | |
14800 | + | |
14801 | +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, | |
14802 | + struct lock_class_key *key); | |
14803 | + | |
14804 | +#define rwlock_biased_rt_init(rwlock) \ | |
14805 | + do { \ | |
14806 | + static struct lock_class_key __key; \ | |
14807 | + \ | |
14808 | + __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \ | |
14809 | + } while (0) | |
14810 | + | |
14811 | +#endif | |
b3bbd485 JK |
14812 | diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h |
14813 | index c427ffaa4904..513df11a364e 100644 | |
14814 | --- a/include/linux/rwsem.h | |
14815 | +++ b/include/linux/rwsem.h | |
e4b2b4a8 JK |
14816 | @@ -20,6 +20,10 @@ |
14817 | #include <linux/osq_lock.h> | |
14818 | #endif | |
1a6e0f06 | 14819 | |
e4b2b4a8 JK |
14820 | +#ifdef CONFIG_PREEMPT_RT_FULL |
14821 | +#include <linux/rwsem_rt.h> | |
14822 | +#else /* PREEMPT_RT_FULL */ | |
14823 | + | |
14824 | struct rw_semaphore; | |
1a6e0f06 | 14825 | |
e4b2b4a8 | 14826 | #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK |
b3bbd485 | 14827 | @@ -114,6 +118,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) |
e4b2b4a8 | 14828 | return !list_empty(&sem->wait_list); |
1a6e0f06 JK |
14829 | } |
14830 | ||
e4b2b4a8 JK |
14831 | +#endif /* !PREEMPT_RT_FULL */ |
14832 | + | |
14833 | +/* | |
14834 | + * The functions below are the same for all rwsem implementations including | |
14835 | + * the RT specific variant. | |
14836 | + */ | |
14837 | + | |
1a6e0f06 | 14838 | /* |
e4b2b4a8 | 14839 | * lock for reading |
1a6e0f06 | 14840 | */ |
b3bbd485 JK |
14841 | diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem_rt.h |
14842 | new file mode 100644 | |
14843 | index 000000000000..2ffbf093ae92 | |
14844 | --- /dev/null | |
14845 | +++ b/include/linux/rwsem_rt.h | |
e4b2b4a8 JK |
14846 | @@ -0,0 +1,67 @@ |
14847 | +#ifndef _LINUX_RWSEM_RT_H | |
14848 | +#define _LINUX_RWSEM_RT_H | |
14849 | + | |
14850 | +#ifndef _LINUX_RWSEM_H | |
14851 | +#error "Include rwsem.h" | |
14852 | +#endif | |
14853 | + | |
14854 | +#include <linux/rtmutex.h> | |
14855 | +#include <linux/swait.h> | |
14856 | + | |
14857 | +#define READER_BIAS (1U << 31) | |
14858 | +#define WRITER_BIAS (1U << 30) | |
14859 | + | |
14860 | +struct rw_semaphore { | |
14861 | + atomic_t readers; | |
14862 | + struct rt_mutex rtmutex; | |
14863 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
14864 | + struct lockdep_map dep_map; | |
14865 | +#endif | |
14866 | +}; | |
14867 | + | |
14868 | +#define __RWSEM_INITIALIZER(name) \ | |
14869 | +{ \ | |
14870 | + .readers = ATOMIC_INIT(READER_BIAS), \ | |
14871 | + .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \ | |
14872 | + RW_DEP_MAP_INIT(name) \ | |
14873 | +} | |
14874 | + | |
14875 | +#define DECLARE_RWSEM(lockname) \ | |
14876 | + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) | |
14877 | + | |
14878 | +extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, | |
14879 | + struct lock_class_key *key); | |
14880 | + | |
14881 | +#define __init_rwsem(sem, name, key) \ | |
14882 | +do { \ | |
14883 | + rt_mutex_init(&(sem)->rtmutex); \ | |
14884 | + __rwsem_init((sem), (name), (key)); \ | |
14885 | +} while (0) | |
14886 | + | |
14887 | +#define init_rwsem(sem) \ | |
14888 | +do { \ | |
14889 | + static struct lock_class_key __key; \ | |
14890 | + \ | |
14891 | + __init_rwsem((sem), #sem, &__key); \ | |
14892 | +} while (0) | |
14893 | + | |
14894 | +static inline int rwsem_is_locked(struct rw_semaphore *sem) | |
1a6e0f06 | 14895 | +{ |
e4b2b4a8 JK |
14896 | + return atomic_read(&sem->readers) != READER_BIAS; |
14897 | +} | |
1a6e0f06 | 14898 | + |
e4b2b4a8 JK |
14899 | +static inline int rwsem_is_contended(struct rw_semaphore *sem) |
14900 | +{ | |
14901 | + return atomic_read(&sem->readers) > 0; | |
1a6e0f06 JK |
14902 | +} |
14903 | + | |
e4b2b4a8 JK |
14904 | +extern void __down_read(struct rw_semaphore *sem); |
14905 | +extern int __down_read_trylock(struct rw_semaphore *sem); | |
14906 | +extern void __down_write(struct rw_semaphore *sem); | |
14907 | +extern int __must_check __down_write_killable(struct rw_semaphore *sem); | |
14908 | +extern int __down_write_trylock(struct rw_semaphore *sem); | |
14909 | +extern void __up_read(struct rw_semaphore *sem); | |
14910 | +extern void __up_write(struct rw_semaphore *sem); | |
14911 | +extern void __downgrade_write(struct rw_semaphore *sem); | |
14912 | + | |
14913 | +#endif | |
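The RT rw_semaphore is an rtmutex plus a biased reader count: readers starts at READER_BIAS, so rwsem_is_locked() reduces to "the count has moved away from the bias". The public API declared in rwsem.h is unchanged; a sketch with a hypothetical semaphore:

    static DECLARE_RWSEM(example_sem);

    static void example_reader(void)
    {
            down_read(&example_sem);    /* sleeping lock; rtmutex-backed on RT */
            /* ... read-side section ... */
            up_read(&example_sem);
    }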
b3bbd485 JK |
14914 | diff --git a/include/linux/sched.h b/include/linux/sched.h |
14915 | index e04919aa8201..a6ffb552be01 100644 | |
14916 | --- a/include/linux/sched.h | |
14917 | +++ b/include/linux/sched.h | |
e4b2b4a8 JK |
14918 | @@ -27,6 +27,7 @@ |
14919 | #include <linux/signal_types.h> | |
14920 | #include <linux/mm_types_task.h> | |
14921 | #include <linux/task_io_accounting.h> | |
14922 | +#include <asm/kmap_types.h> | |
1a6e0f06 | 14923 | |
e4b2b4a8 JK |
14924 | /* task_struct member predeclarations (sorted alphabetically): */ |
14925 | struct audit_context; | |
b3bbd485 | 14926 | @@ -93,7 +94,6 @@ struct task_group; |
1a6e0f06 | 14927 | |
e4b2b4a8 JK |
14928 | /* Convenience macros for the sake of wake_up(): */ |
14929 | #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) | |
14930 | -#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) | |
1a6e0f06 | 14931 | |
e4b2b4a8 JK |
14932 | /* get_task_state(): */ |
14933 | #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ | |
b3bbd485 | 14934 | @@ -101,12 +101,8 @@ struct task_group; |
e4b2b4a8 JK |
14935 | __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ |
14936 | TASK_PARKED) | |
1a6e0f06 | 14937 | |
e4b2b4a8 JK |
14938 | -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) |
14939 | - | |
14940 | #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) | |
c7c16703 | 14941 | |
e4b2b4a8 JK |
14942 | -#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) |
14943 | - | |
14944 | #define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ | |
14945 | (task->flags & PF_FROZEN) == 0 && \ | |
14946 | (task->state & TASK_NOLOAD) == 0) | |
b3bbd485 | 14947 | @@ -134,6 +130,11 @@ struct task_group; |
e4b2b4a8 JK |
14948 | smp_store_mb(current->state, (state_value)); \ |
14949 | } while (0) | |
c7c16703 | 14950 | |
e4b2b4a8 JK |
14951 | +#define __set_current_state_no_track(state_value) \ |
14952 | + current->state = (state_value); | |
14953 | +#define set_current_state_no_track(state_value) \ | |
14954 | + smp_store_mb(current->state, (state_value)); | |
14955 | + | |
14956 | #define set_special_state(state_value) \ | |
14957 | do { \ | |
14958 | unsigned long flags; /* may shadow */ \ | |
b3bbd485 | 14959 | @@ -187,6 +188,9 @@ struct task_group; |
e4b2b4a8 JK |
14960 | #define set_current_state(state_value) \ |
14961 | smp_store_mb(current->state, (state_value)) | |
14962 | ||
14963 | +#define __set_current_state_no_track(state_value) __set_current_state(state_value) | |
14964 | +#define set_current_state_no_track(state_value) set_current_state(state_value) | |
14965 | + | |
14966 | /* | |
14967 | * set_special_state() should be used for those states when the blocking task | |
14968 | * can not use the regular condition based wait-loop. In that case we must | |
b3bbd485 | 14969 | @@ -566,6 +570,8 @@ struct task_struct { |
e4b2b4a8 JK |
14970 | #endif |
14971 | /* -1 unrunnable, 0 runnable, >0 stopped: */ | |
14972 | volatile long state; | |
14973 | + /* saved state for "spinlock sleepers" */ | |
14974 | + volatile long saved_state; | |
c7c16703 | 14975 | |
c7c16703 | 14976 | /* |
e4b2b4a8 | 14977 | * This begins the randomizable portion of task_struct. Only |
b3bbd485 | 14978 | @@ -618,7 +624,25 @@ struct task_struct { |
e4b2b4a8 JK |
14979 | |
14980 | unsigned int policy; | |
14981 | int nr_cpus_allowed; | |
14982 | - cpumask_t cpus_allowed; | |
14983 | + const cpumask_t *cpus_ptr; | |
14984 | + cpumask_t cpus_mask; | |
b3bbd485 | 14985 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
14986 | + int migrate_disable; |
14987 | + int migrate_disable_update; | |
14988 | + int pinned_on_cpu; | |
14989 | +# ifdef CONFIG_SCHED_DEBUG | |
14990 | + int migrate_disable_atomic; | |
14991 | +# endif | |
14992 | + | |
14993 | +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) | |
e4b2b4a8 | 14994 | +# ifdef CONFIG_SCHED_DEBUG |
b3bbd485 | 14995 | + int migrate_disable; |
e4b2b4a8 JK |
14996 | + int migrate_disable_atomic; |
14997 | +# endif | |
14998 | +#endif | |
14999 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15000 | + int sleeping_lock; | |
15001 | +#endif | |
c7c16703 | 15002 | |
e4b2b4a8 JK |
15003 | #ifdef CONFIG_PREEMPT_RCU |
15004 | int rcu_read_lock_nesting; | |
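The new fields back the migrate_disable() machinery, and cpus_ptr lets readers keep working while a task is temporarily pinned (it normally points at the task's own cpus_mask). A usage sketch, assuming the rest of the series provides migrate_disable()/migrate_enable():

    #include <linux/smp.h>

    /* Stay on the current CPU without disabling preemption. */
    static int this_cpu_work_sketch(void)
    {
        int cpu;

        migrate_disable();          /* may be preempted, never migrated */
        cpu = smp_processor_id();   /* stable until migrate_enable() */
        /* ... operate on per-CPU data belonging to 'cpu' ... */
        migrate_enable();
        return cpu;
    }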
b3bbd485 | 15005 | @@ -777,6 +801,9 @@ struct task_struct { |
e4b2b4a8 JK |
15006 | #ifdef CONFIG_POSIX_TIMERS |
15007 | struct task_cputime cputime_expires; | |
15008 | struct list_head cpu_timers[3]; | |
15009 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
15010 | + struct task_struct *posix_timer_list; | |
15011 | +#endif | |
15012 | #endif | |
c7c16703 | 15013 | |
e4b2b4a8 | 15014 | /* Process credentials: */ |
b3bbd485 | 15015 | @@ -820,11 +847,17 @@ struct task_struct { |
e4b2b4a8 JK |
15016 | /* Signal handlers: */ |
15017 | struct signal_struct *signal; | |
15018 | struct sighand_struct *sighand; | |
15019 | + struct sigqueue *sigqueue_cache; | |
15020 | + | |
15021 | sigset_t blocked; | |
15022 | sigset_t real_blocked; | |
15023 | /* Restored if set_restore_sigmask() was used: */ | |
15024 | sigset_t saved_sigmask; | |
15025 | struct sigpending pending; | |
15026 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15027 | + /* TODO: move me into ->restart_block ? */ | |
15028 | + struct siginfo forced_info; | |
15029 | +#endif | |
15030 | unsigned long sas_ss_sp; | |
15031 | size_t sas_ss_size; | |
15032 | unsigned int sas_ss_flags; | |
b3bbd485 | 15033 | @@ -849,6 +882,7 @@ struct task_struct { |
e4b2b4a8 JK |
15034 | raw_spinlock_t pi_lock; |
15035 | ||
15036 | struct wake_q_node wake_q; | |
15037 | + struct wake_q_node wake_q_sleeper; | |
15038 | ||
15039 | #ifdef CONFIG_RT_MUTEXES | |
15040 | /* PI waiters blocked on a rt_mutex held by this task: */ | |
b3bbd485 | 15041 | @@ -1116,8 +1150,22 @@ struct task_struct { |
e4b2b4a8 JK |
15042 | unsigned int sequential_io; |
15043 | unsigned int sequential_io_avg; | |
15044 | #endif | |
15045 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
15046 | + struct rcu_head put_rcu; | |
15047 | + int softirq_nestcnt; | |
15048 | + unsigned int softirqs_raised; | |
15049 | +#endif | |
15050 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15051 | +# if defined CONFIG_HIGHMEM || defined CONFIG_X86_32 | |
15052 | + int kmap_idx; | |
15053 | + pte_t kmap_pte[KM_TYPE_NR]; | |
15054 | +# endif | |
15055 | +#endif | |
15056 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP | |
15057 | unsigned long task_state_change; | |
b3bbd485 | 15058 | +#endif |
e4b2b4a8 JK |
15059 | +#ifdef CONFIG_PREEMPT_RT_FULL |
15060 | + int xmit_recursion; | |
b3bbd485 | 15061 | #endif |
e4b2b4a8 JK |
15062 | int pagefault_disabled; |
15063 | #ifdef CONFIG_MMU | |
b3bbd485 | 15064 | @@ -1332,6 +1380,7 @@ extern struct pid *cad_pid; |
e4b2b4a8 JK |
15065 | /* |
15066 | * Per process flags | |
15067 | */ | |
15068 | +#define PF_IN_SOFTIRQ 0x00000001 /* Task is serving softirq */ | |
15069 | #define PF_IDLE 0x00000002 /* I am an IDLE thread */ | |
15070 | #define PF_EXITING 0x00000004 /* Getting shut down */ | |
15071 | #define PF_EXITPIDONE 0x00000008 /* PI exit done on shut down */ | |
b3bbd485 | 15072 | @@ -1355,7 +1404,7 @@ extern struct pid *cad_pid; |
e4b2b4a8 JK |
15073 | #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ |
15074 | #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ | |
15075 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | |
15076 | -#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ | |
15077 | +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ | |
15078 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | |
15079 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | |
15080 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ | |
b3bbd485 | 15081 | @@ -1535,6 +1584,7 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *n |
e4b2b4a8 JK |
15082 | |
15083 | extern int wake_up_state(struct task_struct *tsk, unsigned int state); | |
15084 | extern int wake_up_process(struct task_struct *tsk); | |
15085 | +extern int wake_up_lock_sleeper(struct task_struct *tsk); | |
15086 | extern void wake_up_new_task(struct task_struct *tsk); | |
15087 | ||
15088 | #ifdef CONFIG_SMP | |
b3bbd485 | 15089 | @@ -1611,6 +1661,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) |
e4b2b4a8 JK |
15090 | return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); |
15091 | } | |
15092 | ||
15093 | +#ifdef CONFIG_PREEMPT_LAZY | |
15094 | +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) | |
15095 | +{ | |
15096 | + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); | |
15097 | +} | |
15098 | + | |
15099 | +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) | |
15100 | +{ | |
15101 | + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); | |
15102 | +} | |
15103 | + | |
15104 | +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) | |
15105 | +{ | |
15106 | + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); | |
15107 | +} | |
15108 | + | |
15109 | +static inline int need_resched_lazy(void) | |
15110 | +{ | |
15111 | + return test_thread_flag(TIF_NEED_RESCHED_LAZY); | |
15112 | +} | |
15113 | + | |
15114 | +static inline int need_resched_now(void) | |
15115 | +{ | |
15116 | + return test_thread_flag(TIF_NEED_RESCHED); | |
15117 | +} | |
15118 | + | |
15119 | +#else | |
15120 | +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } | |
15121 | +static inline int need_resched_lazy(void) { return 0; } | |
15122 | + | |
15123 | +static inline int need_resched_now(void) | |
15124 | +{ | |
15125 | + return test_thread_flag(TIF_NEED_RESCHED); | |
15126 | +} | |
15127 | + | |
15128 | +#endif | |
15129 | + | |
15130 | + | |
15131 | +static inline bool __task_is_stopped_or_traced(struct task_struct *task) | |
15132 | +{ | |
15133 | + if (task->state & (__TASK_STOPPED | __TASK_TRACED)) | |
15134 | + return true; | |
15135 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15136 | + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) | |
15137 | + return true; | |
15138 | +#endif | |
15139 | + return false; | |
15140 | +} | |
15141 | + | |
15142 | +static inline bool task_is_stopped_or_traced(struct task_struct *task) | |
15143 | +{ | |
15144 | + bool traced_stopped; | |
15145 | + | |
15146 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15147 | + unsigned long flags; | |
15148 | + | |
15149 | + raw_spin_lock_irqsave(&task->pi_lock, flags); | |
15150 | + traced_stopped = __task_is_stopped_or_traced(task); | |
15151 | + raw_spin_unlock_irqrestore(&task->pi_lock, flags); | |
15152 | +#else | |
15153 | + traced_stopped = __task_is_stopped_or_traced(task); | |
15154 | +#endif | |
15155 | + return traced_stopped; | |
15156 | +} | |
15157 | + | |
15158 | +static inline bool task_is_traced(struct task_struct *task) | |
15159 | +{ | |
15160 | + bool traced = false; | |
15161 | + | |
15162 | + if (task->state & __TASK_TRACED) | |
15163 | + return true; | |
15164 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
15165 | + /* in case the task is sleeping on tasklist_lock */ | |
15166 | + raw_spin_lock_irq(&task->pi_lock); | |
15167 | + if (task->state & __TASK_TRACED) | |
15168 | + traced = true; | |
15169 | + else if (task->saved_state & __TASK_TRACED) | |
15170 | + traced = true; | |
15171 | + raw_spin_unlock_irq(&task->pi_lock); | |
15172 | +#endif | |
15173 | + return traced; | |
15174 | +} | |
15175 | + | |
15176 | /* | |
15177 | * cond_resched() and cond_resched_lock(): latency reduction via | |
15178 | * explicit rescheduling in places that are safe. The return | |
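The macro-to-function conversion matters on RT because a tracee blocked on a sleeping spinlock has its real state parked in ->saved_state; a raw test of ->state would claim it is not traced. A hypothetical caller-side sketch:

    #include <linux/sched.h>

    /* Correct on RT: task_is_stopped_or_traced() also inspects
     * ->saved_state under pi_lock. */
    static int check_tracee_sketch(struct task_struct *child)
    {
        if (!task_is_stopped_or_traced(child))
            return -ESRCH;   /* not stopped yet; caller must wait */
        return 0;
    }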
b3bbd485 | 15179 | @@ -1636,12 +1769,16 @@ extern int __cond_resched_lock(spinlock_t *lock); |
e4b2b4a8 JK |
15180 | __cond_resched_lock(lock); \ |
15181 | }) | |
15182 | ||
15183 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
15184 | extern int __cond_resched_softirq(void); | |
15185 | ||
15186 | #define cond_resched_softirq() ({ \ | |
15187 | ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ | |
15188 | __cond_resched_softirq(); \ | |
15189 | }) | |
15190 | +#else | |
15191 | +# define cond_resched_softirq() cond_resched() | |
15192 | +#endif | |
c7c16703 | 15193 | |
e4b2b4a8 JK |
15194 | static inline void cond_resched_rcu(void) |
15195 | { | |
b3bbd485 | 15196 | @@ -1671,6 +1808,23 @@ static __always_inline bool need_resched(void) |
e4b2b4a8 JK |
15197 | return unlikely(tif_need_resched()); |
15198 | } | |
c7c16703 | 15199 | |
e4b2b4a8 JK |
15200 | +#ifdef CONFIG_PREEMPT_RT_FULL |
15201 | +static inline void sleeping_lock_inc(void) | |
15202 | +{ | |
15203 | + current->sleeping_lock++; | |
15204 | +} | |
15205 | + | |
15206 | +static inline void sleeping_lock_dec(void) | |
15207 | +{ | |
15208 | + current->sleeping_lock--; | |
15209 | +} | |
15210 | + | |
15211 | +#else | |
15212 | + | |
15213 | +static inline void sleeping_lock_inc(void) { } | |
15214 | +static inline void sleeping_lock_dec(void) { } | |
15215 | +#endif | |
15216 | + | |
15217 | /* | |
15218 | * Wrappers for p->thread_info->cpu access. No-op on UP. | |
15219 | */ | |
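sleeping_lock_inc()/dec() give the rest of the kernel a way to tell a block caused by an RT sleeping spinlock apart from a voluntary sleep; elsewhere in the series, RCU and the softirq code consult the counter. A sketch of the expected bracketing, assuming CONFIG_PREEMPT_RT_FULL (the function is hypothetical):

    #include <linux/sched.h>

    /* Mark the schedule() inside as a lock sleep, not a real sleep. */
    static void rt_spin_slowpath_sketch(void)
    {
        sleeping_lock_inc();
        /* ... enqueue as waiter, schedule() until the lock is free ... */
        sleeping_lock_dec();
    }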
b3bbd485 JK |
15220 | diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h |
15221 | index 3d49b91b674d..d8f2fa8f500c 100644 | |
15222 | --- a/include/linux/sched/mm.h | |
15223 | +++ b/include/linux/sched/mm.h | |
15224 | @@ -43,6 +43,17 @@ static inline void mmdrop(struct mm_struct *mm) | |
15225 | __mmdrop(mm); | |
15226 | } | |
15227 | ||
15228 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
15229 | +extern void __mmdrop_delayed(struct rcu_head *rhp); | |
15230 | +static inline void mmdrop_delayed(struct mm_struct *mm) | |
15231 | +{ | |
15232 | + if (atomic_dec_and_test(&mm->mm_count)) | |
15233 | + call_rcu(&mm->delayed_drop, __mmdrop_delayed); | |
15234 | +} | |
15235 | +#else | |
15236 | +# define mmdrop_delayed(mm) mmdrop(mm) | |
15237 | +#endif | |
15238 | + | |
15239 | static inline void mmdrop_async_fn(struct work_struct *work) | |
15240 | { | |
15241 | struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); | |
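On RT the final mmdrop() can run from the last schedule() of a dying task, where taking sleeping locks is not allowed, so the drop is deferred to an RCU callback. A sketch, assuming the series also adds the delayed_drop rcu_head to struct mm_struct:

    #include <linux/sched/mm.h>

    /* Drop the last mm_count reference from atomic context on RT. */
    static void finish_switch_sketch(struct mm_struct *prev_mm)
    {
        if (prev_mm)
            mmdrop_delayed(prev_mm);   /* plain mmdrop() on !RT kernels */
    }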
15242 | diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h | |
15243 | index a74ec619ac51..8e7f741370c5 100644 | |
15244 | --- a/include/linux/sched/task.h | |
15245 | +++ b/include/linux/sched/task.h | |
15246 | @@ -88,6 +88,15 @@ extern void sched_exec(void); | |
15247 | ||
15248 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) | |
15249 | ||
15250 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
15251 | +extern void __put_task_struct_cb(struct rcu_head *rhp); | |
15252 | + | |
15253 | +static inline void put_task_struct(struct task_struct *t) | |
15254 | +{ | |
15255 | + if (atomic_dec_and_test(&t->usage)) | |
15256 | + call_rcu(&t->put_rcu, __put_task_struct_cb); | |
15257 | +} | |
15258 | +#else | |
15259 | extern void __put_task_struct(struct task_struct *t); | |
15260 | ||
15261 | static inline void put_task_struct(struct task_struct *t) | |
15262 | @@ -95,7 +104,7 @@ static inline void put_task_struct(struct task_struct *t) | |
15263 | if (atomic_dec_and_test(&t->usage)) | |
15264 | __put_task_struct(t); | |
15265 | } | |
15266 | - | |
15267 | +#endif | |
15268 | struct task_struct *task_rcu_dereference(struct task_struct **ptask); | |
15269 | ||
15270 | #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT | |
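Call sites keep the same put_task_struct(); only the release path changes. On RT, freeing a task_struct takes locks that became sleeping locks, so the final free is pushed into an RCU callback via the put_rcu head added to task_struct earlier in this series:

    #include <linux/sched/task.h>

    static void drop_ref_sketch(struct task_struct *p)
    {
        /* On RT the last put defers __put_task_struct() via call_rcu(),
         * so this is safe even from preempt-disabled context. */
        put_task_struct(p);
    }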
15271 | diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h | |
15272 | index 10b19a192b2d..ce3ccff3d9d8 100644 | |
15273 | --- a/include/linux/sched/wake_q.h | |
15274 | +++ b/include/linux/sched/wake_q.h | |
15275 | @@ -47,8 +47,29 @@ static inline void wake_q_init(struct wake_q_head *head) | |
15276 | head->lastp = &head->first; | |
15277 | } | |
15278 | ||
15279 | -extern void wake_q_add(struct wake_q_head *head, | |
15280 | - struct task_struct *task); | |
15281 | -extern void wake_up_q(struct wake_q_head *head); | |
15282 | +extern void __wake_q_add(struct wake_q_head *head, | |
15283 | + struct task_struct *task, bool sleeper); | |
15284 | +static inline void wake_q_add(struct wake_q_head *head, | |
15285 | + struct task_struct *task) | |
15286 | +{ | |
15287 | + __wake_q_add(head, task, false); | |
15288 | +} | |
15289 | + | |
15290 | +static inline void wake_q_add_sleeper(struct wake_q_head *head, | |
15291 | + struct task_struct *task) | |
15292 | +{ | |
15293 | + __wake_q_add(head, task, true); | |
15294 | +} | |
15295 | + | |
15296 | +extern void __wake_up_q(struct wake_q_head *head, bool sleeper); | |
15297 | +static inline void wake_up_q(struct wake_q_head *head) | |
15298 | +{ | |
15299 | + __wake_up_q(head, false); | |
15300 | +} | |
15301 | + | |
15302 | +static inline void wake_up_q_sleeper(struct wake_q_head *head) | |
15303 | +{ | |
15304 | + __wake_up_q(head, true); | |
15305 | +} | |
15306 | ||
15307 | #endif /* _LINUX_SCHED_WAKE_Q_H */ | |
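The split lets a task sit on two wake queues at once: the normal one (via wake_q) and a "sleeper" one (via wake_q_sleeper), which rt_mutex unlock paths use so the wakeup goes through wake_up_lock_sleeper() and handles the tracked ->saved_state. A usage sketch (function hypothetical):

    #include <linux/sched/wake_q.h>

    static void rt_unlock_wake_sketch(struct task_struct *waiter)
    {
        DEFINE_WAKE_Q(wake_q);

        wake_q_add_sleeper(&wake_q, waiter);  /* uses ->wake_q_sleeper */
        /* ... drop the lock's internal wait_lock ... */
        wake_up_q_sleeper(&wake_q);           /* wake_up_lock_sleeper() */
    }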
15308 | diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h | |
15309 | index f189a8a3bbb8..107079a2d7ed 100644 | |
15310 | --- a/include/linux/seqlock.h | |
15311 | +++ b/include/linux/seqlock.h | |
15312 | @@ -221,20 +221,30 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) | |
e4b2b4a8 JK |
15313 | return __read_seqcount_retry(s, start); |
15314 | } | |
c7c16703 | 15315 | |
e4b2b4a8 JK |
15316 | - |
15317 | - | |
15318 | -static inline void raw_write_seqcount_begin(seqcount_t *s) | |
15319 | +static inline void __raw_write_seqcount_begin(seqcount_t *s) | |
15320 | { | |
15321 | s->sequence++; | |
15322 | smp_wmb(); | |
15323 | } | |
c7c16703 | 15324 | |
e4b2b4a8 JK |
15325 | -static inline void raw_write_seqcount_end(seqcount_t *s) |
15326 | +static inline void raw_write_seqcount_begin(seqcount_t *s) | |
15327 | +{ | |
15328 | + preempt_disable_rt(); | |
15329 | + __raw_write_seqcount_begin(s); | |
15330 | +} | |
15331 | + | |
15332 | +static inline void __raw_write_seqcount_end(seqcount_t *s) | |
15333 | { | |
15334 | smp_wmb(); | |
15335 | s->sequence++; | |
15336 | } | |
c7c16703 | 15337 | |
e4b2b4a8 JK |
15338 | +static inline void raw_write_seqcount_end(seqcount_t *s) |
15339 | +{ | |
15340 | + __raw_write_seqcount_end(s); | |
15341 | + preempt_enable_rt(); | |
15342 | +} | |
15343 | + | |
15344 | /** | |
15345 | * raw_write_seqcount_barrier - do a seq write barrier | |
15346 | * @s: pointer to seqcount_t | |
b3bbd485 | 15347 | @@ -429,10 +439,33 @@ typedef struct { |
e4b2b4a8 JK |
15348 | /* |
15349 | * Read side functions for starting and finalizing a read side section. | |
15350 | */ | |
15351 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
15352 | static inline unsigned read_seqbegin(const seqlock_t *sl) | |
15353 | { | |
15354 | return read_seqcount_begin(&sl->seqcount); | |
15355 | } | |
15356 | +#else | |
15357 | +/* | |
15358 | + * Starvation safe read side for RT | |
15359 | + */ | |
15360 | +static inline unsigned read_seqbegin(seqlock_t *sl) | |
15361 | +{ | |
15362 | + unsigned ret; | |
15363 | + | |
15364 | +repeat: | |
15365 | + ret = ACCESS_ONCE(sl->seqcount.sequence); | |
15366 | + if (unlikely(ret & 1)) { | |
15367 | + /* | |
15368 | + * Take the lock and let the writer proceed (i.e. possibly |
15369 | + * boost it), otherwise we could loop here forever. | |
15370 | + */ | |
15371 | + spin_unlock_wait(&sl->lock); | |
15372 | + goto repeat; | |
15373 | + } | |
b3bbd485 | 15374 | + smp_rmb(); |
e4b2b4a8 JK |
15375 | + return ret; |
15376 | +} | |
15377 | +#endif | |
c7c16703 | 15378 | |
e4b2b4a8 JK |
15379 | static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) |
15380 | { | |
b3bbd485 | 15381 | @@ -447,36 +480,45 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) |
e4b2b4a8 JK |
15382 | static inline void write_seqlock(seqlock_t *sl) |
15383 | { | |
15384 | spin_lock(&sl->lock); | |
15385 | - write_seqcount_begin(&sl->seqcount); | |
15386 | + __raw_write_seqcount_begin(&sl->seqcount); | |
15387 | +} | |
15388 | + | |
15389 | +static inline int try_write_seqlock(seqlock_t *sl) | |
15390 | +{ | |
15391 | + if (spin_trylock(&sl->lock)) { | |
15392 | + __raw_write_seqcount_begin(&sl->seqcount); | |
15393 | + return 1; | |
15394 | + } | |
15395 | + return 0; | |
c7c16703 | 15396 | } |
c7c16703 | 15397 | |
e4b2b4a8 JK |
15398 | static inline void write_sequnlock(seqlock_t *sl) |
15399 | { | |
15400 | - write_seqcount_end(&sl->seqcount); | |
15401 | + __raw_write_seqcount_end(&sl->seqcount); | |
15402 | spin_unlock(&sl->lock); | |
15403 | } | |
c7c16703 | 15404 | |
e4b2b4a8 JK |
15405 | static inline void write_seqlock_bh(seqlock_t *sl) |
15406 | { | |
15407 | spin_lock_bh(&sl->lock); | |
15408 | - write_seqcount_begin(&sl->seqcount); | |
15409 | + __raw_write_seqcount_begin(&sl->seqcount); | |
c7c16703 | 15410 | } |
e4b2b4a8 JK |
15411 | |
15412 | static inline void write_sequnlock_bh(seqlock_t *sl) | |
1a6e0f06 | 15413 | { |
e4b2b4a8 JK |
15414 | - write_seqcount_end(&sl->seqcount); |
15415 | + __raw_write_seqcount_end(&sl->seqcount); | |
15416 | spin_unlock_bh(&sl->lock); | |
15417 | } | |
1a6e0f06 | 15418 | |
e4b2b4a8 JK |
15419 | static inline void write_seqlock_irq(seqlock_t *sl) |
15420 | { | |
15421 | spin_lock_irq(&sl->lock); | |
15422 | - write_seqcount_begin(&sl->seqcount); | |
15423 | + __raw_write_seqcount_begin(&sl->seqcount); | |
15424 | } | |
1a6e0f06 | 15425 | |
e4b2b4a8 | 15426 | static inline void write_sequnlock_irq(seqlock_t *sl) |
1a6e0f06 | 15427 | { |
e4b2b4a8 JK |
15428 | - write_seqcount_end(&sl->seqcount); |
15429 | + __raw_write_seqcount_end(&sl->seqcount); | |
15430 | spin_unlock_irq(&sl->lock); | |
15431 | } | |
15432 | ||
b3bbd485 | 15433 | @@ -485,7 +527,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) |
e4b2b4a8 JK |
15434 | unsigned long flags; |
15435 | ||
15436 | spin_lock_irqsave(&sl->lock, flags); | |
15437 | - write_seqcount_begin(&sl->seqcount); | |
15438 | + __raw_write_seqcount_begin(&sl->seqcount); | |
15439 | return flags; | |
15440 | } | |
15441 | ||
b3bbd485 | 15442 | @@ -495,7 +537,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) |
e4b2b4a8 JK |
15443 | static inline void |
15444 | write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) | |
1a6e0f06 | 15445 | { |
e4b2b4a8 JK |
15446 | - write_seqcount_end(&sl->seqcount); |
15447 | + __raw_write_seqcount_end(&sl->seqcount); | |
15448 | spin_unlock_irqrestore(&sl->lock, flags); | |
15449 | } | |
1a6e0f06 | 15450 | |
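Two things change for RT here: writers bracket the sequence bump with preempt_disable_rt() (bare seqcounts) or hold the sleeping sl->lock for the whole write section (seqlocks), and an RT reader that observes an odd count briefly blocks on sl->lock, PI-boosting a preempted writer instead of spinning forever. Reader call sites are unchanged; a minimal sketch:

    #include <linux/seqlock.h>

    /* The classic retry loop works unmodified on RT. */
    static u64 read_stamp_sketch(seqlock_t *sl, const u64 *stamp)
    {
        unsigned int seq;
        u64 val;

        do {
            seq = read_seqbegin(sl);
            val = *stamp;
        } while (read_seqretry(sl, seq));
        return val;
    }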
b3bbd485 JK |
15451 | diff --git a/include/linux/signal.h b/include/linux/signal.h |
15452 | index 042968dd98f0..a7d20f85cc0e 100644 | |
15453 | --- a/include/linux/signal.h | |
15454 | +++ b/include/linux/signal.h | |
15455 | @@ -243,6 +243,7 @@ static inline void init_sigpending(struct sigpending *sig) | |
1a6e0f06 JK |
15456 | } |
15457 | ||
e4b2b4a8 JK |
15458 | extern void flush_sigqueue(struct sigpending *queue); |
15459 | +extern void flush_task_sigqueue(struct task_struct *tsk); | |
15460 | ||
15461 | /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ | |
15462 | static inline int valid_signal(unsigned long sig) | |
b3bbd485 JK |
15463 | diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h |
15464 | index f64e88444082..07576a062ac0 100644 | |
15465 | --- a/include/linux/skbuff.h | |
15466 | +++ b/include/linux/skbuff.h | |
15467 | @@ -287,6 +287,7 @@ struct sk_buff_head { | |
e4b2b4a8 JK |
15468 | |
15469 | __u32 qlen; | |
15470 | spinlock_t lock; | |
15471 | + raw_spinlock_t raw_lock; | |
1a6e0f06 JK |
15472 | }; |
15473 | ||
e4b2b4a8 | 15474 | struct sk_buff; |
b3bbd485 | 15475 | @@ -1672,6 +1673,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) |
e4b2b4a8 JK |
15476 | __skb_queue_head_init(list); |
15477 | } | |
1a6e0f06 | 15478 | |
e4b2b4a8 JK |
15479 | +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) |
15480 | +{ | |
15481 | + raw_spin_lock_init(&list->raw_lock); | |
15482 | + __skb_queue_head_init(list); | |
15483 | +} | |
15484 | + | |
15485 | static inline void skb_queue_head_init_class(struct sk_buff_head *list, | |
15486 | struct lock_class_key *class) | |
15487 | { | |
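The extra raw_lock gives queues manipulated from contexts that cannot take a sleeping lock on RT an alternative; a given queue's users must then pick one lock and use it consistently. An initialization sketch (queue name hypothetical):

    #include <linux/skbuff.h>

    static struct sk_buff_head irq_skb_queue_sketch;

    static void init_queue_sketch(void)
    {
        /* All users of this queue must lock ->raw_lock, never ->lock. */
        skb_queue_head_init_raw(&irq_skb_queue_sketch);
    }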
b3bbd485 JK |
15488 | diff --git a/include/linux/smp.h b/include/linux/smp.h |
15489 | index 9fb239e12b82..5801e516ba63 100644 | |
15490 | --- a/include/linux/smp.h | |
15491 | +++ b/include/linux/smp.h | |
15492 | @@ -202,6 +202,9 @@ static inline int get_boot_cpu_id(void) | |
e4b2b4a8 JK |
15493 | #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) |
15494 | #define put_cpu() preempt_enable() | |
1a6e0f06 | 15495 | |
e4b2b4a8 JK |
15496 | +#define get_cpu_light() ({ migrate_disable(); smp_processor_id(); }) |
15497 | +#define put_cpu_light() migrate_enable() | |
15498 | + | |
15499 | /* | |
15500 | * Callback to arch code if there's nosmp or maxcpus=0 on the | |
15501 | * boot command line: | |
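get_cpu_light() replaces get_cpu() where the critical section may sleep on RT: migration is blocked but preemption stays enabled, so smp_processor_id() remains valid while other tasks on the same CPU can still run. It therefore provides no exclusion; per-CPU data still needs a (local) lock. Sketch:

    #include <linux/smp.h>
    #include <linux/percpu.h>

    static void percpu_touch_sketch(int __percpu *counter)
    {
        int cpu = get_cpu_light();

        /* NOTE: no exclusion against other tasks on this CPU; a real
         * user pairs this with a local lock around the access. */
        (*per_cpu_ptr(counter, cpu))++;
        put_cpu_light();
    }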
b3bbd485 JK |
15502 | diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h |
15503 | index 341e1a12bfc7..7c8f0a985b9e 100644 | |
15504 | --- a/include/linux/spinlock.h | |
15505 | +++ b/include/linux/spinlock.h | |
15506 | @@ -286,7 +286,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) | |
e4b2b4a8 JK |
15507 | #define raw_spin_can_lock(lock) (!raw_spin_is_locked(lock)) |
15508 | ||
15509 | /* Include rwlock functions */ | |
15510 | -#include <linux/rwlock.h> | |
1a6e0f06 | 15511 | +#ifdef CONFIG_PREEMPT_RT_FULL |
e4b2b4a8 | 15512 | +# include <linux/rwlock_rt.h> |
1a6e0f06 | 15513 | +#else |
e4b2b4a8 | 15514 | +# include <linux/rwlock.h> |
1a6e0f06 | 15515 | +#endif |
1a6e0f06 | 15516 | |
e4b2b4a8 JK |
15517 | /* |
15518 | * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: | |
b3bbd485 | 15519 | @@ -297,6 +301,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) |
e4b2b4a8 JK |
15520 | # include <linux/spinlock_api_up.h> |
15521 | #endif | |
1a6e0f06 | 15522 | |
e4b2b4a8 JK |
15523 | +#ifdef CONFIG_PREEMPT_RT_FULL |
15524 | +# include <linux/spinlock_rt.h> | |
15525 | +#else /* PREEMPT_RT_FULL */ | |
15526 | + | |
15527 | /* | |
15528 | * Map the spin_lock functions to the raw variants for PREEMPT_RT=n | |
15529 | */ | |
b3bbd485 | 15530 | @@ -421,4 +429,6 @@ extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock); |
e4b2b4a8 JK |
15531 | #define atomic_dec_and_lock(atomic, lock) \ |
15532 | __cond_lock(lock, _atomic_dec_and_lock(atomic, lock)) | |
1a6e0f06 | 15533 | |
e4b2b4a8 JK |
15534 | +#endif /* !PREEMPT_RT_FULL */ |
15535 | + | |
15536 | #endif /* __LINUX_SPINLOCK_H */ | |
b3bbd485 JK |
15537 | diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h |
15538 | index 42dfab89e740..29d99ae5a8ab 100644 | |
15539 | --- a/include/linux/spinlock_api_smp.h | |
15540 | +++ b/include/linux/spinlock_api_smp.h | |
15541 | @@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) | |
15542 | return 0; | |
15543 | } | |
15544 | ||
15545 | -#include <linux/rwlock_api_smp.h> | |
15546 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
15547 | +# include <linux/rwlock_api_smp.h> | |
15548 | +#endif | |
15549 | ||
15550 | #endif /* __LINUX_SPINLOCK_API_SMP_H */ | |
15551 | diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h | |
15552 | new file mode 100644 | |
15553 | index 000000000000..c95e1f5145ac | |
15554 | --- /dev/null | |
15555 | +++ b/include/linux/spinlock_rt.h | |
e4b2b4a8 JK |
15556 | @@ -0,0 +1,159 @@ |
15557 | +#ifndef __LINUX_SPINLOCK_RT_H | |
15558 | +#define __LINUX_SPINLOCK_RT_H | |
15559 | + | |
15560 | +#ifndef __LINUX_SPINLOCK_H | |
15561 | +#error Do not include directly. Use spinlock.h | |
15562 | +#endif | |
15563 | + | |
15564 | +#include <linux/bug.h> | |
15565 | + | |
15566 | +extern void | |
15567 | +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key); | |
15568 | + | |
15569 | +#define spin_lock_init(slock) \ | |
15570 | +do { \ | |
15571 | + static struct lock_class_key __key; \ | |
15572 | + \ | |
15573 | + rt_mutex_init(&(slock)->lock); \ | |
15574 | + __rt_spin_lock_init(slock, #slock, &__key); \ | |
15575 | +} while (0) | |
15576 | + | |
15577 | +extern void __lockfunc rt_spin_lock(spinlock_t *lock); | |
15578 | +extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); | |
15579 | +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); | |
15580 | +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); | |
15581 | +extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); | |
15582 | +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); | |
15583 | +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); | |
15584 | +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); | |
15585 | +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); | |
15586 | + | |
15587 | +/* | |
15588 | + * lockdep-less calls, for derived types like rwlock: | |
15589 | + * (for trylock they can use rt_mutex_trylock() directly). |
15590 | + * Migrate disable handling must be done at the call site. | |
15591 | + */ | |
15592 | +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); | |
15593 | +extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock); | |
15594 | +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); | |
15595 | + | |
15596 | +#define spin_lock(lock) rt_spin_lock(lock) | |
15597 | + | |
15598 | +#define spin_lock_bh(lock) \ | |
15599 | + do { \ | |
15600 | + local_bh_disable(); \ | |
15601 | + rt_spin_lock(lock); \ | |
15602 | + } while (0) | |
15603 | + | |
15604 | +#define spin_lock_irq(lock) spin_lock(lock) | |
15605 | + | |
15606 | +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) | |
15607 | + | |
15608 | +#define spin_trylock(lock) \ | |
15609 | +({ \ | |
15610 | + int __locked; \ | |
15611 | + __locked = spin_do_trylock(lock); \ | |
15612 | + __locked; \ | |
15613 | +}) | |
15614 | + | |
15615 | +#ifdef CONFIG_LOCKDEP | |
15616 | +# define spin_lock_nested(lock, subclass) \ | |
15617 | + do { \ | |
15618 | + rt_spin_lock_nested(lock, subclass); \ | |
15619 | + } while (0) | |
15620 | + | |
15621 | +#define spin_lock_bh_nested(lock, subclass) \ | |
15622 | + do { \ | |
15623 | + local_bh_disable(); \ | |
15624 | + rt_spin_lock_nested(lock, subclass); \ | |
15625 | + } while (0) | |
15626 | + | |
15627 | +# define spin_lock_irqsave_nested(lock, flags, subclass) \ | |
15628 | + do { \ | |
15629 | + typecheck(unsigned long, flags); \ | |
15630 | + flags = 0; \ | |
15631 | + rt_spin_lock_nested(lock, subclass); \ | |
15632 | + } while (0) | |
15633 | +#else | |
15634 | +# define spin_lock_nested(lock, subclass) spin_lock(lock) | |
15635 | +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(lock) | |
15636 | + | |
15637 | +# define spin_lock_irqsave_nested(lock, flags, subclass) \ | |
15638 | + do { \ | |
15639 | + typecheck(unsigned long, flags); \ | |
15640 | + flags = 0; \ | |
15641 | + spin_lock(lock); \ | |
15642 | + } while (0) | |
15643 | +#endif | |
15644 | + | |
15645 | +#define spin_lock_irqsave(lock, flags) \ | |
15646 | + do { \ | |
15647 | + typecheck(unsigned long, flags); \ | |
15648 | + flags = 0; \ | |
15649 | + spin_lock(lock); \ | |
15650 | + } while (0) | |
15651 | + | |
15652 | +static inline unsigned long spin_lock_trace_flags(spinlock_t *lock) | |
1a6e0f06 | 15653 | +{ |
e4b2b4a8 JK |
15654 | + unsigned long flags = 0; |
15655 | +#ifdef CONFIG_TRACE_IRQFLAGS | |
15656 | + flags = rt_spin_lock_trace_flags(lock); | |
15657 | +#else | |
15658 | + spin_lock(lock); /* lock_local */ | |
15659 | +#endif | |
15660 | + return flags; | |
1a6e0f06 JK |
15661 | +} |
15662 | + | |
e4b2b4a8 JK |
15663 | +/* FIXME: we need rt_spin_lock_nest_lock */ |
15664 | +#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) | |
15665 | + | |
15666 | +#define spin_unlock(lock) rt_spin_unlock(lock) | |
15667 | + | |
15668 | +#define spin_unlock_bh(lock) \ | |
15669 | + do { \ | |
15670 | + rt_spin_unlock(lock); \ | |
15671 | + local_bh_enable(); \ | |
15672 | + } while (0) | |
15673 | + | |
15674 | +#define spin_unlock_irq(lock) spin_unlock(lock) | |
15675 | + | |
15676 | +#define spin_unlock_irqrestore(lock, flags) \ | |
15677 | + do { \ | |
15678 | + typecheck(unsigned long, flags); \ | |
15679 | + (void) flags; \ | |
15680 | + spin_unlock(lock); \ | |
15681 | + } while (0) | |
15682 | + | |
15683 | +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) | |
15684 | +#define spin_trylock_irq(lock) spin_trylock(lock) | |
15685 | + | |
15686 | +#define spin_trylock_irqsave(lock, flags) \ | |
15687 | + rt_spin_trylock_irqsave(lock, &(flags)) | |
15688 | + | |
15689 | +#define spin_unlock_wait(lock) rt_spin_unlock_wait(lock) | |
15690 | + | |
15691 | +#ifdef CONFIG_GENERIC_LOCKBREAK | |
15692 | +# define spin_is_contended(lock) ((lock)->break_lock) | |
15693 | +#else | |
15694 | +# define spin_is_contended(lock) (((void)(lock), 0)) | |
15695 | +#endif | |
15696 | + | |
15697 | +static inline int spin_can_lock(spinlock_t *lock) | |
1a6e0f06 | 15698 | +{ |
e4b2b4a8 | 15699 | + return !rt_mutex_is_locked(&lock->lock); |
1a6e0f06 JK |
15700 | +} |
15701 | + | |
e4b2b4a8 | 15702 | +static inline int spin_is_locked(spinlock_t *lock) |
1a6e0f06 | 15703 | +{ |
e4b2b4a8 | 15704 | + return rt_mutex_is_locked(&lock->lock); |
1a6e0f06 JK |
15705 | +} |
15706 | + | |
e4b2b4a8 | 15707 | +static inline void assert_spin_locked(spinlock_t *lock) |
1a6e0f06 | 15708 | +{ |
e4b2b4a8 | 15709 | + BUG_ON(!spin_is_locked(lock)); |
1a6e0f06 | 15710 | +} |
1a6e0f06 | 15711 | + |
e4b2b4a8 JK |
15712 | +#define atomic_dec_and_lock(atomic, lock) \ |
15713 | + atomic_dec_and_spin_lock(atomic, lock) | |
15714 | + | |
15715 | +#endif | |
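The new header keeps the spinlock_t API source-compatible while mapping it onto rt_mutex: spin_lock() may now sleep, the _irq/_irqsave variants no longer disable interrupts (flags is forced to 0 by the macros above), and only genuinely atomic paths keep raw_spinlock_t. Call sites look unchanged; a sketch:

    #include <linux/spinlock.h>

    static void stats_update_sketch(spinlock_t *lock, unsigned long *total)
    {
        unsigned long flags;

        spin_lock_irqsave(lock, flags);   /* sleeping rt_mutex on RT */
        (*total)++;
        spin_unlock_irqrestore(lock, flags);
    }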
b3bbd485 JK |
15716 | diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h |
15717 | index 73548eb13a5d..10bac715ea96 100644 | |
15718 | --- a/include/linux/spinlock_types.h | |
15719 | +++ b/include/linux/spinlock_types.h | |
e4b2b4a8 JK |
15720 | @@ -9,80 +9,15 @@ |
15721 | * Released under the General Public License (GPL). | |
15722 | */ | |
1a6e0f06 | 15723 | |
e4b2b4a8 JK |
15724 | -#if defined(CONFIG_SMP) |
15725 | -# include <asm/spinlock_types.h> | |
15726 | -#else | |
15727 | -# include <linux/spinlock_types_up.h> | |
15728 | -#endif | |
15729 | - | |
15730 | -#include <linux/lockdep.h> | |
15731 | - | |
15732 | -typedef struct raw_spinlock { | |
15733 | - arch_spinlock_t raw_lock; | |
15734 | -#ifdef CONFIG_GENERIC_LOCKBREAK | |
15735 | - unsigned int break_lock; | |
15736 | -#endif | |
15737 | -#ifdef CONFIG_DEBUG_SPINLOCK | |
15738 | - unsigned int magic, owner_cpu; | |
15739 | - void *owner; | |
15740 | -#endif | |
15741 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
15742 | - struct lockdep_map dep_map; | |
15743 | -#endif | |
15744 | -} raw_spinlock_t; | |
15745 | - | |
15746 | -#define SPINLOCK_MAGIC 0xdead4ead | |
15747 | - | |
15748 | -#define SPINLOCK_OWNER_INIT ((void *)-1L) | |
15749 | - | |
15750 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
15751 | -# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } | |
15752 | -#else | |
15753 | -# define SPIN_DEP_MAP_INIT(lockname) | |
15754 | -#endif | |
15755 | +#include <linux/spinlock_types_raw.h> | |
1a6e0f06 | 15756 | |
e4b2b4a8 JK |
15757 | -#ifdef CONFIG_DEBUG_SPINLOCK |
15758 | -# define SPIN_DEBUG_INIT(lockname) \ | |
15759 | - .magic = SPINLOCK_MAGIC, \ | |
15760 | - .owner_cpu = -1, \ | |
15761 | - .owner = SPINLOCK_OWNER_INIT, | |
15762 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
15763 | +# include <linux/spinlock_types_nort.h> | |
15764 | +# include <linux/rwlock_types.h> | |
15765 | #else | |
15766 | -# define SPIN_DEBUG_INIT(lockname) | |
b3bbd485 JK |
15767 | +# include <linux/rtmutex.h> |
15768 | +# include <linux/spinlock_types_rt.h> | |
15769 | +# include <linux/rwlock_types_rt.h> | |
15770 | #endif | |
15771 | ||
e4b2b4a8 JK |
15772 | -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ |
15773 | - { \ | |
15774 | - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ | |
15775 | - SPIN_DEBUG_INIT(lockname) \ | |
15776 | - SPIN_DEP_MAP_INIT(lockname) } | |
15777 | - | |
15778 | -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ | |
15779 | - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) | |
15780 | - | |
15781 | -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) | |
15782 | - | |
15783 | -typedef struct spinlock { | |
15784 | - union { | |
15785 | - struct raw_spinlock rlock; | |
15786 | - | |
15787 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
15788 | -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) | |
15789 | - struct { | |
15790 | - u8 __padding[LOCK_PADSIZE]; | |
15791 | - struct lockdep_map dep_map; | |
15792 | - }; | |
b3bbd485 | 15793 | -#endif |
e4b2b4a8 JK |
15794 | - }; |
15795 | -} spinlock_t; | |
15796 | - | |
15797 | -#define __SPIN_LOCK_INITIALIZER(lockname) \ | |
15798 | - { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } | |
15799 | - | |
15800 | -#define __SPIN_LOCK_UNLOCKED(lockname) \ | |
15801 | - (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) | |
15802 | - | |
15803 | -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) | |
15804 | - | |
15805 | -#include <linux/rwlock_types.h> | |
b3bbd485 | 15806 | - |
e4b2b4a8 | 15807 | #endif /* __LINUX_SPINLOCK_TYPES_H */ |
b3bbd485 JK |
15808 | diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h |
15809 | new file mode 100644 | |
15810 | index 000000000000..f1dac1fb1d6a | |
15811 | --- /dev/null | |
15812 | +++ b/include/linux/spinlock_types_nort.h | |
e4b2b4a8 JK |
15813 | @@ -0,0 +1,33 @@ |
15814 | +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H | |
15815 | +#define __LINUX_SPINLOCK_TYPES_NORT_H | |
1a6e0f06 | 15816 | + |
e4b2b4a8 JK |
15817 | +#ifndef __LINUX_SPINLOCK_TYPES_H |
15818 | +#error "Do not include directly. Include spinlock_types.h instead" | |
15819 | +#endif | |
1a6e0f06 | 15820 | + |
e4b2b4a8 JK |
15821 | +/* |
15822 | + * The non RT version maps spinlocks to raw_spinlocks | |
15823 | + */ | |
15824 | +typedef struct spinlock { | |
15825 | + union { | |
15826 | + struct raw_spinlock rlock; | |
1a6e0f06 | 15827 | + |
e4b2b4a8 JK |
15828 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
15829 | +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) | |
15830 | + struct { | |
15831 | + u8 __padding[LOCK_PADSIZE]; | |
15832 | + struct lockdep_map dep_map; | |
15833 | + }; | |
1a6e0f06 | 15834 | +#endif |
e4b2b4a8 JK |
15835 | + }; |
15836 | +} spinlock_t; | |
1a6e0f06 | 15837 | + |
e4b2b4a8 JK |
15838 | +#define __SPIN_LOCK_INITIALIZER(lockname) \ |
15839 | + { { .rlock = __RAW_SPIN_LOCK_INITIALIZER(lockname) } } | |
1a6e0f06 | 15840 | + |
e4b2b4a8 JK |
15841 | +#define __SPIN_LOCK_UNLOCKED(lockname) \ |
15842 | + (spinlock_t ) __SPIN_LOCK_INITIALIZER(lockname) | |
1a6e0f06 | 15843 | + |
e4b2b4a8 | 15844 | +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) |
1a6e0f06 | 15845 | + |
e4b2b4a8 | 15846 | +#endif |
b3bbd485 JK |
15847 | diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h |
15848 | new file mode 100644 | |
15849 | index 000000000000..03235b475b77 | |
15850 | --- /dev/null | |
15851 | +++ b/include/linux/spinlock_types_raw.h | |
e4b2b4a8 JK |
15852 | @@ -0,0 +1,58 @@ |
15853 | +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H | |
15854 | +#define __LINUX_SPINLOCK_TYPES_RAW_H | |
1a6e0f06 | 15855 | + |
e4b2b4a8 | 15856 | +#include <linux/types.h> |
1a6e0f06 | 15857 | + |
e4b2b4a8 JK |
15858 | +#if defined(CONFIG_SMP) |
15859 | +# include <asm/spinlock_types.h> | |
15860 | +#else | |
15861 | +# include <linux/spinlock_types_up.h> | |
15862 | +#endif | |
1a6e0f06 | 15863 | + |
e4b2b4a8 JK |
15864 | +#include <linux/lockdep.h> |
15865 | + | |
15866 | +typedef struct raw_spinlock { | |
15867 | + arch_spinlock_t raw_lock; | |
15868 | +#ifdef CONFIG_GENERIC_LOCKBREAK | |
15869 | + unsigned int break_lock; | |
15870 | +#endif | |
15871 | +#ifdef CONFIG_DEBUG_SPINLOCK | |
15872 | + unsigned int magic, owner_cpu; | |
15873 | + void *owner; | |
15874 | +#endif | |
15875 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
15876 | + struct lockdep_map dep_map; | |
15877 | +#endif | |
15878 | +} raw_spinlock_t; | |
15879 | + | |
15880 | +#define SPINLOCK_MAGIC 0xdead4ead | |
15881 | + | |
15882 | +#define SPINLOCK_OWNER_INIT ((void *)-1L) | |
1a6e0f06 | 15883 | + |
e4b2b4a8 JK |
15884 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
15885 | +# define SPIN_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } | |
1a6e0f06 | 15886 | +#else |
e4b2b4a8 JK |
15887 | +# define SPIN_DEP_MAP_INIT(lockname) |
15888 | +#endif | |
1a6e0f06 | 15889 | + |
e4b2b4a8 JK |
15890 | +#ifdef CONFIG_DEBUG_SPINLOCK |
15891 | +# define SPIN_DEBUG_INIT(lockname) \ | |
15892 | + .magic = SPINLOCK_MAGIC, \ | |
15893 | + .owner_cpu = -1, \ | |
15894 | + .owner = SPINLOCK_OWNER_INIT, | |
15895 | +#else | |
15896 | +# define SPIN_DEBUG_INIT(lockname) | |
1a6e0f06 | 15897 | +#endif |
e4b2b4a8 JK |
15898 | + |
15899 | +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ | |
15900 | + { \ | |
15901 | + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ | |
15902 | + SPIN_DEBUG_INIT(lockname) \ | |
15903 | + SPIN_DEP_MAP_INIT(lockname) } | |
15904 | + | |
15905 | +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ | |
15906 | + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) | |
15907 | + | |
15908 | +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) | |
15909 | + | |
1a6e0f06 | 15910 | +#endif |
b3bbd485 JK |
15911 | diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h |
15912 | new file mode 100644 | |
15913 | index 000000000000..3e3d8c5f7a9a | |
15914 | --- /dev/null | |
15915 | +++ b/include/linux/spinlock_types_rt.h | |
e4b2b4a8 JK |
15916 | @@ -0,0 +1,48 @@ |
15917 | +#ifndef __LINUX_SPINLOCK_TYPES_RT_H | |
15918 | +#define __LINUX_SPINLOCK_TYPES_RT_H | |
15919 | + | |
15920 | +#ifndef __LINUX_SPINLOCK_TYPES_H | |
15921 | +#error "Do not include directly. Include spinlock_types.h instead" | |
1a6e0f06 | 15922 | +#endif |
1a6e0f06 | 15923 | + |
e4b2b4a8 JK |
15924 | +#include <linux/cache.h> |
15925 | + | |
15926 | +/* | |
15927 | + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: | |
15928 | + */ | |
15929 | +typedef struct spinlock { | |
15930 | + struct rt_mutex lock; | |
15931 | + unsigned int break_lock; | |
15932 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
15933 | + struct lockdep_map dep_map; | |
1a6e0f06 | 15934 | +#endif |
e4b2b4a8 | 15935 | +} spinlock_t; |
1a6e0f06 | 15936 | + |
e4b2b4a8 JK |
15937 | +#ifdef CONFIG_DEBUG_RT_MUTEXES |
15938 | +# define __RT_SPIN_INITIALIZER(name) \ | |
15939 | + { \ | |
15940 | + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ | |
15941 | + .save_state = 1, \ | |
15942 | + .file = __FILE__, \ | |
15943 | + .line = __LINE__ , \ | |
15944 | + } | |
1a6e0f06 | 15945 | +#else |
e4b2b4a8 JK |
15946 | +# define __RT_SPIN_INITIALIZER(name) \ |
15947 | + { \ | |
15948 | + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ | |
15949 | + .save_state = 1, \ | |
15950 | + } | |
1a6e0f06 | 15951 | +#endif |
1a6e0f06 | 15952 | + |
e4b2b4a8 JK |
15953 | +/* |
15954 | +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) | |
15955 | +*/ | |
15956 | + | |
15957 | +#define __SPIN_LOCK_UNLOCKED(name) \ | |
15958 | + { .lock = __RT_SPIN_INITIALIZER(name.lock), \ | |
15959 | + SPIN_DEP_MAP_INIT(name) } | |
15960 | + | |
15961 | +#define DEFINE_SPINLOCK(name) \ | |
15962 | + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) | |
15963 | + | |
1a6e0f06 | 15964 | +#endif |
b3bbd485 JK |
15965 | diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h |
15966 | index c09b6407ae1b..b0243ba07fb7 100644 | |
15967 | --- a/include/linux/spinlock_types_up.h | |
15968 | +++ b/include/linux/spinlock_types_up.h | |
e4b2b4a8 JK |
15969 | @@ -1,10 +1,6 @@ |
15970 | #ifndef __LINUX_SPINLOCK_TYPES_UP_H | |
15971 | #define __LINUX_SPINLOCK_TYPES_UP_H | |
1a6e0f06 | 15972 | |
e4b2b4a8 JK |
15973 | -#ifndef __LINUX_SPINLOCK_TYPES_H |
15974 | -# error "please don't include this file directly" | |
15975 | -#endif | |
15976 | - | |
15977 | /* | |
15978 | * include/linux/spinlock_types_up.h - spinlock type definitions for UP | |
15979 | * | |
b3bbd485 JK |
15980 | diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h |
15981 | index 261471f407a5..f41d2fb09f87 100644 | |
15982 | --- a/include/linux/srcutiny.h | |
15983 | +++ b/include/linux/srcutiny.h | |
15984 | @@ -43,7 +43,7 @@ struct srcu_struct { | |
1a6e0f06 | 15985 | |
e4b2b4a8 | 15986 | void srcu_drive_gp(struct work_struct *wp); |
1a6e0f06 | 15987 | |
e4b2b4a8 JK |
15988 | -#define __SRCU_STRUCT_INIT(name) \ |
15989 | +#define __SRCU_STRUCT_INIT(name, __ignored) \ | |
15990 | { \ | |
15991 | .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ | |
15992 | .srcu_cb_tail = &name.srcu_cb_head, \ | |
b3bbd485 | 15993 | @@ -56,9 +56,9 @@ void srcu_drive_gp(struct work_struct *wp); |
e4b2b4a8 | 15994 | * Tree SRCU, which needs some per-CPU data. |
1a6e0f06 | 15995 | */ |
e4b2b4a8 JK |
15996 | #define DEFINE_SRCU(name) \ |
15997 | - struct srcu_struct name = __SRCU_STRUCT_INIT(name) | |
15998 | + struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) | |
15999 | #define DEFINE_STATIC_SRCU(name) \ | |
16000 | - static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | |
16001 | + static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name) | |
16002 | ||
16003 | void synchronize_srcu(struct srcu_struct *sp); | |
16004 | ||
b3bbd485 JK |
16005 | diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h |
16006 | index a949f4f9e4d7..745d4ca4dd50 100644 | |
16007 | --- a/include/linux/srcutree.h | |
16008 | +++ b/include/linux/srcutree.h | |
16009 | @@ -40,7 +40,7 @@ struct srcu_data { | |
e4b2b4a8 JK |
16010 | unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ |
16011 | ||
16012 | /* Update-side state. */ | |
16013 | - raw_spinlock_t __private lock ____cacheline_internodealigned_in_smp; | |
16014 | + spinlock_t __private lock ____cacheline_internodealigned_in_smp; | |
16015 | struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ | |
16016 | unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ | |
16017 | unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ | |
b3bbd485 | 16018 | @@ -58,7 +58,7 @@ struct srcu_data { |
e4b2b4a8 JK |
16019 | * Node in SRCU combining tree, similar in function to rcu_data. |
16020 | */ | |
16021 | struct srcu_node { | |
16022 | - raw_spinlock_t __private lock; | |
16023 | + spinlock_t __private lock; | |
16024 | unsigned long srcu_have_cbs[4]; /* GP seq for children */ | |
16025 | /* having CBs, but only */ | |
16026 | /* is > ->srcu_gq_seq. */ | |
b3bbd485 | 16027 | @@ -78,7 +78,7 @@ struct srcu_struct { |
e4b2b4a8 JK |
16028 | struct srcu_node *level[RCU_NUM_LVLS + 1]; |
16029 | /* First node at each level. */ | |
16030 | struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ | |
16031 | - raw_spinlock_t __private lock; /* Protect counters */ | |
16032 | + spinlock_t __private lock; /* Protect counters */ | |
16033 | struct mutex srcu_gp_mutex; /* Serialize GP work. */ | |
16034 | unsigned int srcu_idx; /* Current rdr array element. */ | |
16035 | unsigned long srcu_gp_seq; /* Grace-period seq #. */ | |
b3bbd485 | 16036 | @@ -104,10 +104,10 @@ struct srcu_struct { |
e4b2b4a8 JK |
16037 | #define SRCU_STATE_SCAN1 1 |
16038 | #define SRCU_STATE_SCAN2 2 | |
1a6e0f06 | 16039 | |
e4b2b4a8 JK |
16040 | -#define __SRCU_STRUCT_INIT(name) \ |
16041 | +#define __SRCU_STRUCT_INIT(name, pcpu_name) \ | |
16042 | { \ | |
16043 | - .sda = &name##_srcu_data, \ | |
16044 | - .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ | |
16045 | + .sda = &pcpu_name, \ | |
16046 | + .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ | |
16047 | .srcu_gp_seq_needed = 0 - 1, \ | |
16048 | __SRCU_DEP_MAP_INIT(name) \ | |
16049 | } | |
b3bbd485 | 16050 | @@ -133,7 +133,7 @@ struct srcu_struct { |
1a6e0f06 | 16051 | */ |
e4b2b4a8 JK |
16052 | #define __DEFINE_SRCU(name, is_static) \ |
16053 | static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ | |
16054 | - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) | |
16055 | + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name, name##_srcu_data) | |
16056 | #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) | |
16057 | #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) | |
16058 | ||
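Both SRCU flavours now take the per-CPU data symbol as an explicit second macro argument, which Tiny SRCU simply ignores. For Tree SRCU, DEFINE_STATIC_SRCU(my_srcu) expands roughly to the following (the name my_srcu is hypothetical):

    #include <linux/srcutree.h>

    static DEFINE_PER_CPU(struct srcu_data, my_srcu_srcu_data);
    static struct srcu_struct my_srcu =
        __SRCU_STRUCT_INIT(my_srcu, my_srcu_srcu_data);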
b3bbd485 JK |
16059 | diff --git a/include/linux/suspend.h b/include/linux/suspend.h |
16060 | index 8544357d92d0..616ea66cd283 100644 | |
16061 | --- a/include/linux/suspend.h | |
16062 | +++ b/include/linux/suspend.h | |
16063 | @@ -196,6 +196,12 @@ struct platform_s2idle_ops { | |
e4b2b4a8 | 16064 | void (*end)(void); |
1a6e0f06 JK |
16065 | }; |
16066 | ||
e4b2b4a8 JK |
16067 | +#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATION) |
16068 | +extern bool pm_in_action; | |
16069 | +#else | |
16070 | +# define pm_in_action false | |
16071 | +#endif | |
16072 | + | |
16073 | #ifdef CONFIG_SUSPEND | |
16074 | extern suspend_state_t mem_sleep_current; | |
16075 | extern suspend_state_t mem_sleep_default; | |
b3bbd485 JK |
16076 | diff --git a/include/linux/swait.h b/include/linux/swait.h |
16077 | index c98aaf677466..853f3e61a9f4 100644 | |
16078 | --- a/include/linux/swait.h | |
16079 | +++ b/include/linux/swait.h | |
e4b2b4a8 JK |
16080 | @@ -5,6 +5,7 @@ |
16081 | #include <linux/list.h> | |
16082 | #include <linux/stddef.h> | |
16083 | #include <linux/spinlock.h> | |
16084 | +#include <linux/wait.h> | |
16085 | #include <asm/current.h> | |
1a6e0f06 | 16086 | |
e4b2b4a8 | 16087 | /* |
b3bbd485 | 16088 | @@ -147,6 +148,7 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq) |
e4b2b4a8 JK |
16089 | extern void swake_up(struct swait_queue_head *q); |
16090 | extern void swake_up_all(struct swait_queue_head *q); | |
16091 | extern void swake_up_locked(struct swait_queue_head *q); | |
16092 | +extern void swake_up_all_locked(struct swait_queue_head *q); | |
1a6e0f06 | 16093 | |
e4b2b4a8 JK |
16094 | extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); |
16095 | extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state); | |
b3bbd485 JK |
16096 | diff --git a/include/linux/swap.h b/include/linux/swap.h |
16097 | index f02fb5db8914..6c775168df67 100644 | |
16098 | --- a/include/linux/swap.h | |
16099 | +++ b/include/linux/swap.h | |
e4b2b4a8 JK |
16100 | @@ -12,6 +12,7 @@ |
16101 | #include <linux/fs.h> | |
16102 | #include <linux/atomic.h> | |
16103 | #include <linux/page-flags.h> | |
16104 | +#include <linux/locallock.h> | |
16105 | #include <asm/page.h> | |
16106 | ||
16107 | struct notifier_block; | |
b3bbd485 | 16108 | @@ -297,7 +298,8 @@ struct vma_swap_readahead { |
e4b2b4a8 JK |
16109 | void *workingset_eviction(struct address_space *mapping, struct page *page); |
16110 | bool workingset_refault(void *shadow); | |
16111 | void workingset_activation(struct page *page); | |
16112 | -void workingset_update_node(struct radix_tree_node *node, void *private); | |
16113 | +void __workingset_update_node(struct radix_tree_node *node, void *private); | |
16114 | +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock); | |
1a6e0f06 | 16115 | |
e4b2b4a8 JK |
16116 | /* linux/mm/page_alloc.c */ |
16117 | extern unsigned long totalram_pages; | |
b3bbd485 | 16118 | @@ -310,6 +312,7 @@ extern unsigned long nr_free_pagecache_pages(void); |
1a6e0f06 | 16119 | |
1a6e0f06 | 16120 | |
e4b2b4a8 JK |
16121 | /* linux/mm/swap.c */ |
16122 | +DECLARE_LOCAL_IRQ_LOCK(swapvec_lock); | |
16123 | extern void lru_cache_add(struct page *); | |
16124 | extern void lru_cache_add_anon(struct page *page); | |
16125 | extern void lru_cache_add_file(struct page *page); | |
b3bbd485 JK |
16126 | diff --git a/include/linux/swork.h b/include/linux/swork.h |
16127 | new file mode 100644 | |
16128 | index 000000000000..f175fa9a6016 | |
16129 | --- /dev/null | |
16130 | +++ b/include/linux/swork.h | |
e4b2b4a8 JK |
16131 | @@ -0,0 +1,24 @@ |
16132 | +#ifndef _LINUX_SWORK_H | |
16133 | +#define _LINUX_SWORK_H | |
16134 | + | |
16135 | +#include <linux/list.h> | |
16136 | + | |
16137 | +struct swork_event { | |
16138 | + struct list_head item; | |
16139 | + unsigned long flags; | |
16140 | + void (*func)(struct swork_event *); | |
16141 | +}; | |
16142 | + | |
16143 | +static inline void INIT_SWORK(struct swork_event *event, | |
16144 | + void (*func)(struct swork_event *)) | |
16145 | +{ | |
16146 | + event->flags = 0; | |
16147 | + event->func = func; | |
16148 | +} | |
16149 | + | |
16150 | +bool swork_queue(struct swork_event *sev); | |
16151 | + | |
16152 | +int swork_get(void); | |
16153 | +void swork_put(void); | |
16154 | + | |
16155 | +#endif /* _LINUX_SWORK_H */ | |
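swork ("simple work") is a minimal deferral mechanism added for RT: events are queued to a dedicated kernel thread that runs them in schedulable context. A usage sketch of the whole API (the _sketch names are hypothetical):

    #include <linux/swork.h>

    static struct swork_event ev_sketch;

    static void ev_fn_sketch(struct swork_event *sev)
    {
        /* runs in the swork thread; may sleep */
    }

    static int ev_setup_sketch(void)
    {
        int err = swork_get();     /* bring the worker thread up */

        if (err)
            return err;
        INIT_SWORK(&ev_sketch, ev_fn_sketch);
        swork_queue(&ev_sketch);
        return 0;                  /* balance with swork_put() on teardown */
    }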
b3bbd485 JK |
16156 | diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h |
16157 | index cf2862bd134a..fd05d83740df 100644 | |
16158 | --- a/include/linux/thread_info.h | |
16159 | +++ b/include/linux/thread_info.h | |
16160 | @@ -86,7 +86,17 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) | |
e4b2b4a8 JK |
16161 | #define test_thread_flag(flag) \ |
16162 | test_ti_thread_flag(current_thread_info(), flag) | |
16163 | ||
16164 | -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) | |
16165 | +#ifdef CONFIG_PREEMPT_LAZY | |
16166 | +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ | |
16167 | + test_thread_flag(TIF_NEED_RESCHED_LAZY)) | |
16168 | +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) | |
16169 | +#define tif_need_resched_lazy() (test_thread_flag(TIF_NEED_RESCHED_LAZY)) | |
16170 | + | |
1a6e0f06 | 16171 | +#else |
e4b2b4a8 JK |
16172 | +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) |
16173 | +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) | |
16174 | +#define tif_need_resched_lazy() 0 | |
1a6e0f06 | 16175 | +#endif |
1a6e0f06 | 16176 | |
e4b2b4a8 JK |
16177 | #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES |
16178 | static inline int arch_within_stack_frames(const void * const stack, | |
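With CONFIG_PREEMPT_LAZY, tif_need_resched() folds both flags so existing checks still see any pending request, while new code can distinguish a hard request from a lazy one. A sketch of the intended distinction (function hypothetical):

    #include <linux/thread_info.h>

    static bool must_preempt_now_sketch(void)
    {
        if (tif_need_resched_now())
            return true;     /* hard request: preempt immediately */
        return false;        /* lazy requests wait for a resched point */
    }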
b3bbd485 JK |
16179 | diff --git a/include/linux/timer.h b/include/linux/timer.h |
16180 | index e0ea1fe87572..df3085ddf662 100644 | |
16181 | --- a/include/linux/timer.h | |
16182 | +++ b/include/linux/timer.h | |
16183 | @@ -213,7 +213,7 @@ extern void add_timer(struct timer_list *timer); | |
1a6e0f06 | 16184 | |
e4b2b4a8 | 16185 | extern int try_to_del_timer_sync(struct timer_list *timer); |
1a6e0f06 | 16186 | |
e4b2b4a8 JK |
16187 | -#ifdef CONFIG_SMP |
16188 | +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) | |
16189 | extern int del_timer_sync(struct timer_list *timer); | |
16190 | #else | |
16191 | # define del_timer_sync(t) del_timer(t) | |
b3bbd485 JK |
16192 | diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h |
16193 | index 2bcb4dc6df1a..edd1e42e8a2f 100644 | |
16194 | --- a/include/linux/trace_events.h | |
16195 | +++ b/include/linux/trace_events.h | |
16196 | @@ -62,6 +62,9 @@ struct trace_entry { | |
e4b2b4a8 JK |
16197 | unsigned char flags; |
16198 | unsigned char preempt_count; | |
16199 | int pid; | |
16200 | + unsigned short migrate_disable; | |
16201 | + unsigned short padding; | |
16202 | + unsigned char preempt_lazy_count; | |
16203 | }; | |
1a6e0f06 | 16204 | |
e4b2b4a8 | 16205 | #define TRACE_EVENT_TYPE_MAX \ |
b3bbd485 | 16206 | @@ -402,11 +405,13 @@ enum event_trigger_type { |
e4b2b4a8 JK |
16207 | |
16208 | extern int filter_match_preds(struct event_filter *filter, void *rec); | |
16209 | ||
16210 | -extern enum event_trigger_type event_triggers_call(struct trace_event_file *file, | |
16211 | - void *rec); | |
16212 | -extern void event_triggers_post_call(struct trace_event_file *file, | |
16213 | - enum event_trigger_type tt, | |
16214 | - void *rec); | |
16215 | +extern enum event_trigger_type | |
16216 | +event_triggers_call(struct trace_event_file *file, void *rec, | |
16217 | + struct ring_buffer_event *event); | |
16218 | +extern void | |
16219 | +event_triggers_post_call(struct trace_event_file *file, | |
16220 | + enum event_trigger_type tt, | |
16221 | + void *rec, struct ring_buffer_event *event); | |
1a6e0f06 | 16222 | |
e4b2b4a8 | 16223 | bool trace_event_ignore_this_pid(struct trace_event_file *trace_file); |
1a6e0f06 | 16224 | |
b3bbd485 | 16225 | @@ -426,7 +431,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) |
1a6e0f06 | 16226 | |
e4b2b4a8 JK |
16227 | if (!(eflags & EVENT_FILE_FL_TRIGGER_COND)) { |
16228 | if (eflags & EVENT_FILE_FL_TRIGGER_MODE) | |
16229 | - event_triggers_call(file, NULL); | |
16230 | + event_triggers_call(file, NULL, NULL); | |
16231 | if (eflags & EVENT_FILE_FL_SOFT_DISABLED) | |
16232 | return true; | |
16233 | if (eflags & EVENT_FILE_FL_PID_FILTER) | |
b3bbd485 JK |
16234 | diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h |
16235 | index 251e655d407f..57e8e32ef2b0 100644 | |
16236 | --- a/include/linux/uaccess.h | |
16237 | +++ b/include/linux/uaccess.h | |
16238 | @@ -185,6 +185,7 @@ static __always_inline void pagefault_disabled_dec(void) | |
e4b2b4a8 JK |
16239 | */ |
16240 | static inline void pagefault_disable(void) | |
1a6e0f06 | 16241 | { |
e4b2b4a8 JK |
16242 | + migrate_disable(); |
16243 | pagefault_disabled_inc(); | |
16244 | /* | |
16245 | * make sure to have issued the store before a pagefault | |
b3bbd485 | 16246 | @@ -201,6 +202,7 @@ static inline void pagefault_enable(void) |
e4b2b4a8 JK |
16247 | */ |
16248 | barrier(); | |
16249 | pagefault_disabled_dec(); | |
16250 | + migrate_enable(); | |
16251 | } | |
1a6e0f06 | 16252 | |
e4b2b4a8 | 16253 | /* |
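pagefault_disable() sections now also disable migration on RT: the section can be preempted there (it is no longer implicitly atomic), but it stays on one CPU. The usual inatomic-usercopy pattern is unchanged; a sketch:

    #include <linux/uaccess.h>

    static unsigned long peek_user_sketch(const void __user *ptr)
    {
        unsigned long val = 0;

        pagefault_disable();
        if (__copy_from_user_inatomic(&val, ptr, sizeof(val)))
            val = 0;         /* would have faulted: report failure */
        pagefault_enable();
        return val;
    }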
b3bbd485 JK |
16254 | diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h |
16255 | index 1e0cb72e0598..87ab0996a9b0 100644 | |
16256 | --- a/include/linux/vmstat.h | |
16257 | +++ b/include/linux/vmstat.h | |
16258 | @@ -33,7 +33,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); | |
e4b2b4a8 JK |
16259 | */ |
16260 | static inline void __count_vm_event(enum vm_event_item item) | |
16261 | { | |
16262 | + preempt_disable_rt(); | |
16263 | raw_cpu_inc(vm_event_states.event[item]); | |
16264 | + preempt_enable_rt(); | |
1a6e0f06 JK |
16265 | } |
16266 | ||
e4b2b4a8 | 16267 | static inline void count_vm_event(enum vm_event_item item) |
b3bbd485 | 16268 | @@ -43,7 +45,9 @@ static inline void count_vm_event(enum vm_event_item item) |
e4b2b4a8 JK |
16269 | |
16270 | static inline void __count_vm_events(enum vm_event_item item, long delta) | |
1a6e0f06 | 16271 | { |
e4b2b4a8 JK |
16272 | + preempt_disable_rt(); |
16273 | raw_cpu_add(vm_event_states.event[item], delta); | |
16274 | + preempt_enable_rt(); | |
1a6e0f06 JK |
16275 | } |
16276 | ||
e4b2b4a8 | 16277 | static inline void count_vm_events(enum vm_event_item item, long delta) |
b3bbd485 JK |
16278 | diff --git a/include/linux/wait.h b/include/linux/wait.h |
16279 | index 158715445ffb..3451706a3074 100644 | |
16280 | --- a/include/linux/wait.h | |
16281 | +++ b/include/linux/wait.h | |
e4b2b4a8 JK |
16282 | @@ -10,6 +10,7 @@ |
16283 | ||
16284 | #include <asm/current.h> | |
16285 | #include <uapi/linux/wait.h> | |
16286 | +#include <linux/atomic.h> | |
16287 | ||
16288 | typedef struct wait_queue_entry wait_queue_entry_t; | |
16289 | ||
b3bbd485 | 16290 | @@ -486,8 +487,8 @@ do { \ |
e4b2b4a8 JK |
16291 | int __ret = 0; \ |
16292 | struct hrtimer_sleeper __t; \ | |
16293 | \ | |
16294 | - hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); \ | |
16295 | - hrtimer_init_sleeper(&__t, current); \ | |
16296 | + hrtimer_init_sleeper_on_stack(&__t, CLOCK_MONOTONIC, HRTIMER_MODE_REL, \ | |
16297 | + current); \ | |
16298 | if ((timeout) != KTIME_MAX) \ | |
16299 | hrtimer_start_range_ns(&__t.timer, timeout, \ | |
16300 | current->timer_slack_ns, \ | |
b3bbd485 JK |
16301 | diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h |
16302 | index 304f7aa9cc01..00d3813cef26 100644 | |
16303 | --- a/include/net/gen_stats.h | |
16304 | +++ b/include/net/gen_stats.h | |
e4b2b4a8 JK |
16305 | @@ -6,6 +6,7 @@ |
16306 | #include <linux/socket.h> | |
16307 | #include <linux/rtnetlink.h> | |
16308 | #include <linux/pkt_sched.h> | |
16309 | +#include <net/net_seq_lock.h> | |
16310 | ||
16311 | struct gnet_stats_basic_cpu { | |
16312 | struct gnet_stats_basic_packed bstats; | |
b3bbd485 | 16313 | @@ -36,11 +37,11 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, |
e4b2b4a8 JK |
16314 | spinlock_t *lock, struct gnet_dump *d, |
16315 | int padattr); | |
1a6e0f06 | 16316 | |
e4b2b4a8 JK |
16317 | -int gnet_stats_copy_basic(const seqcount_t *running, |
16318 | +int gnet_stats_copy_basic(net_seqlock_t *running, | |
16319 | struct gnet_dump *d, | |
16320 | struct gnet_stats_basic_cpu __percpu *cpu, | |
16321 | struct gnet_stats_basic_packed *b); | |
16322 | -void __gnet_stats_copy_basic(const seqcount_t *running, | |
16323 | +void __gnet_stats_copy_basic(net_seqlock_t *running, | |
16324 | struct gnet_stats_basic_packed *bstats, | |
16325 | struct gnet_stats_basic_cpu __percpu *cpu, | |
16326 | struct gnet_stats_basic_packed *b); | |
b3bbd485 | 16327 | @@ -57,13 +58,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, |
e4b2b4a8 JK |
16328 | struct gnet_stats_basic_cpu __percpu *cpu_bstats, |
16329 | struct net_rate_estimator __rcu **rate_est, | |
16330 | spinlock_t *stats_lock, | |
16331 | - seqcount_t *running, struct nlattr *opt); | |
16332 | + net_seqlock_t *running, struct nlattr *opt); | |
16333 | void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); | |
16334 | int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, | |
16335 | struct gnet_stats_basic_cpu __percpu *cpu_bstats, | |
16336 | struct net_rate_estimator __rcu **ptr, | |
16337 | spinlock_t *stats_lock, | |
16338 | - seqcount_t *running, struct nlattr *opt); | |
16339 | + net_seqlock_t *running, struct nlattr *opt); | |
16340 | bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); | |
16341 | bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, | |
16342 | struct gnet_stats_rate_est64 *sample); | |
b3bbd485 JK |
16343 | diff --git a/include/net/neighbour.h b/include/net/neighbour.h |
16344 | index a964366a7ef5..51c854583987 100644 | |
16345 | --- a/include/net/neighbour.h | |
16346 | +++ b/include/net/neighbour.h | |
16347 | @@ -450,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) | |
1a6e0f06 | 16348 | } |
e4b2b4a8 | 16349 | #endif |
1a6e0f06 | 16350 | |
e4b2b4a8 JK |
16351 | -static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) |
16352 | +static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb) | |
16353 | { | |
16354 | unsigned int seq; | |
16355 | unsigned int hh_len; | |
b3bbd485 | 16356 | @@ -474,7 +474,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb |
1a6e0f06 | 16357 | |
e4b2b4a8 JK |
16358 | static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) |
16359 | { | |
16360 | - const struct hh_cache *hh = &n->hh; | |
16361 | + struct hh_cache *hh = &n->hh; | |
1a6e0f06 | 16362 | |
e4b2b4a8 JK |
16363 | if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) |
16364 | return neigh_hh_output(hh, skb); | |
b3bbd485 | 16365 | @@ -515,7 +515,7 @@ struct neighbour_cb { |
1a6e0f06 | 16366 | |
e4b2b4a8 | 16367 | #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) |
1a6e0f06 | 16368 | |
e4b2b4a8 JK |
16369 | -static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, |
16370 | +static inline void neigh_ha_snapshot(char *dst, struct neighbour *n, | |
16371 | const struct net_device *dev) | |
16372 | { | |
16373 | unsigned int seq; | |
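Editor's note: the const qualifiers are dropped because hh->hh_lock and n->ha_lock are seqlocks, and on RT the read side may fall back to taking the embedded spinlock when a writer is active (spinning would livelock against a preempted writer) - and taking the lock writes to the object. A rough, hedged sketch of that read-side fallback, assuming the RT seqlock layout; this is not the mainline implementation:

	static inline unsigned int rt_read_seqbegin(seqlock_t *sl)
	{
		unsigned int ret;

		do {
			ret = READ_ONCE(sl->seqcount.sequence);
			if (likely(!(ret & 1)))
				break;
			/* Writer active: block on the lock (writes to *sl)
			 * instead of spinning against a preemptible writer. */
			spin_lock(&sl->lock);
			spin_unlock(&sl->lock);
		} while (1);

		return ret;
	}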
b3bbd485 JK |
16374 | diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h |
16375 | new file mode 100644 | |
16376 | index 000000000000..a7034298a82a | |
16377 | --- /dev/null | |
16378 | +++ b/include/net/net_seq_lock.h | |
e4b2b4a8 JK |
16379 | @@ -0,0 +1,15 @@ |
16380 | +#ifndef __NET_NET_SEQ_LOCK_H__ | |
16381 | +#define __NET_NET_SEQ_LOCK_H__ | |
16382 | + | |
1a6e0f06 | 16383 | +#ifdef CONFIG_PREEMPT_RT_BASE |
e4b2b4a8 JK |
16384 | +# define net_seqlock_t seqlock_t |
16385 | +# define net_seq_begin(__r) read_seqbegin(__r) | |
16386 | +# define net_seq_retry(__r, __s) read_seqretry(__r, __s) | |
16387 | + | |
1a6e0f06 | 16388 | +#else |
e4b2b4a8 JK |
16389 | +# define net_seqlock_t seqcount_t |
16390 | +# define net_seq_begin(__r) read_seqcount_begin(__r) | |
16391 | +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s) | |
1a6e0f06 JK |
16392 | +#endif |
16393 | + | |
e4b2b4a8 | 16394 | +#endif |
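Editor's note: with this header a stats reader is written once against the net_seq_*() wrappers and works for both configurations - a plain seqcount on mainline, a full seqlock (whose writer may sleep) on RT. A minimal sketch of such a reader:

	#include <net/net_seq_lock.h>

	/* Snapshot a 64-bit counter published under 'running'; retries
	 * while a writer (the qdisc dispatch path) is active. */
	static u64 read_counter(net_seqlock_t *running, const u64 *counter)
	{
		unsigned int seq;
		u64 snap;

		do {
			seq = net_seq_begin(running);
			snap = *counter;
		} while (net_seq_retry(running, seq));

		return snap;
	}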
b3bbd485 JK |
16395 | diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h |
16396 | index f59acacaa265..6ac7c3659973 100644 | |
16397 | --- a/include/net/sch_generic.h | |
16398 | +++ b/include/net/sch_generic.h | |
e4b2b4a8 JK |
16399 | @@ -10,6 +10,7 @@ |
16400 | #include <linux/percpu.h> | |
16401 | #include <linux/dynamic_queue_limits.h> | |
16402 | #include <linux/list.h> | |
16403 | +#include <net/net_seq_lock.h> | |
16404 | #include <linux/refcount.h> | |
16405 | #include <linux/workqueue.h> | |
16406 | #include <net/gen_stats.h> | |
b3bbd485 | 16407 | @@ -90,7 +91,7 @@ struct Qdisc { |
e4b2b4a8 JK |
16408 | struct sk_buff *gso_skb ____cacheline_aligned_in_smp; |
16409 | struct qdisc_skb_head q; | |
16410 | struct gnet_stats_basic_packed bstats; | |
16411 | - seqcount_t running; | |
16412 | + net_seqlock_t running; | |
16413 | struct gnet_stats_queue qstats; | |
16414 | unsigned long state; | |
16415 | struct Qdisc *next_sched; | |
b3bbd485 | 16416 | @@ -109,13 +110,22 @@ static inline void qdisc_refcount_inc(struct Qdisc *qdisc) |
e4b2b4a8 JK |
16417 | refcount_inc(&qdisc->refcnt); |
16418 | } | |
1a6e0f06 | 16419 | |
e4b2b4a8 JK |
16420 | -static inline bool qdisc_is_running(const struct Qdisc *qdisc) |
16421 | +static inline bool qdisc_is_running(struct Qdisc *qdisc) | |
1a6e0f06 | 16422 | { |
e4b2b4a8 JK |
16423 | +#ifdef CONFIG_PREEMPT_RT_BASE |
16424 | + return spin_is_locked(&qdisc->running.lock) ? true : false; | |
1a6e0f06 | 16425 | +#else |
e4b2b4a8 | 16426 | return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; |
1a6e0f06 | 16427 | +#endif |
e4b2b4a8 | 16428 | } |
1a6e0f06 | 16429 | |
e4b2b4a8 JK |
16430 | static inline bool qdisc_run_begin(struct Qdisc *qdisc) |
16431 | { | |
1a6e0f06 | 16432 | +#ifdef CONFIG_PREEMPT_RT_BASE |
e4b2b4a8 JK |
16433 | + if (try_write_seqlock(&qdisc->running)) |
16434 | + return true; | |
16435 | + return false; | |
1a6e0f06 | 16436 | +#else |
e4b2b4a8 JK |
16437 | if (qdisc_is_running(qdisc)) |
16438 | return false; | |
16439 | /* Variant of write_seqcount_begin() telling lockdep a trylock | |
b3bbd485 | 16440 | @@ -124,11 +134,16 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) |
e4b2b4a8 JK |
16441 | raw_write_seqcount_begin(&qdisc->running); |
16442 | seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); | |
16443 | return true; | |
1a6e0f06 | 16444 | +#endif |
e4b2b4a8 | 16445 | } |
1a6e0f06 | 16446 | |
e4b2b4a8 JK |
16447 | static inline void qdisc_run_end(struct Qdisc *qdisc) |
16448 | { | |
1a6e0f06 | 16449 | +#ifdef CONFIG_PREEMPT_RT_BASE |
e4b2b4a8 | 16450 | + write_sequnlock(&qdisc->running); |
1a6e0f06 | 16451 | +#else |
e4b2b4a8 | 16452 | write_seqcount_end(&qdisc->running); |
1a6e0f06 | 16453 | +#endif |
e4b2b4a8 | 16454 | } |
1a6e0f06 | 16455 | |
e4b2b4a8 | 16456 | static inline bool qdisc_may_bulk(const struct Qdisc *qdisc) |
b3bbd485 | 16457 | @@ -337,7 +352,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) |
e4b2b4a8 JK |
16458 | return qdisc_lock(root); |
16459 | } | |
1a6e0f06 | 16460 | |
e4b2b4a8 JK |
16461 | -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) |
16462 | +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) | |
16463 | { | |
16464 | struct Qdisc *root = qdisc_root_sleeping(qdisc); | |
1a6e0f06 | 16465 | |
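Editor's note: on RT the qdisc 'running' state becomes a full seqlock so the owner may sleep and contenders block instead of spinning; try_write_seqlock() (an RT-patch helper) keeps qdisc_run_begin() a trylock. The caller contract is unchanged on both configurations, e.g.:

	/* Sketch of how the transmit path uses the begin/end pair: */
	static void dispatch_qdisc(struct Qdisc *q)
	{
		if (!qdisc_run_begin(q))
			return;		/* another CPU owns this qdisc */

		/* ... dequeue skbs and hand them to the driver ... */

		qdisc_run_end(q);
	}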
b3bbd485 JK |
16466 | diff --git a/include/net/xfrm.h b/include/net/xfrm.h |
16467 | index db99efb2d1d0..a7b95ffbbf8b 100644 | |
16468 | --- a/include/net/xfrm.h | |
16469 | +++ b/include/net/xfrm.h | |
16470 | @@ -217,7 +217,7 @@ struct xfrm_state { | |
e4b2b4a8 JK |
16471 | struct xfrm_stats stats; |
16472 | ||
16473 | struct xfrm_lifetime_cur curlft; | |
16474 | - struct tasklet_hrtimer mtimer; | |
16475 | + struct hrtimer mtimer; | |
16476 | ||
16477 | struct xfrm_state_offload xso; | |
16478 | ||
b3bbd485 JK |
16479 | diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h |
16480 | index c6f728037c53..a57e4ee989d6 100644 | |
16481 | --- a/include/trace/events/timer.h | |
16482 | +++ b/include/trace/events/timer.h | |
16483 | @@ -148,7 +148,11 @@ DEFINE_EVENT(timer_class, timer_cancel, | |
e4b2b4a8 JK |
16484 | { HRTIMER_MODE_ABS, "ABS" }, \ |
16485 | { HRTIMER_MODE_REL, "REL" }, \ | |
16486 | { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ | |
16487 | - { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }) | |
16488 | + { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ | |
16489 | + { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ | |
16490 | + { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ | |
16491 | + { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ | |
16492 | + { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }) | |
1a6e0f06 | 16493 | |
e4b2b4a8 JK |
16494 | /** |
16495 | * hrtimer_init - called when the hrtimer is initialized | |
b3bbd485 | 16496 | @@ -186,15 +190,16 @@ TRACE_EVENT(hrtimer_init, |
e4b2b4a8 JK |
16497 | */ |
16498 | TRACE_EVENT(hrtimer_start, | |
16499 | ||
16500 | - TP_PROTO(struct hrtimer *hrtimer), | |
16501 | + TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), | |
16502 | ||
16503 | - TP_ARGS(hrtimer), | |
16504 | + TP_ARGS(hrtimer, mode), | |
16505 | ||
16506 | TP_STRUCT__entry( | |
16507 | __field( void *, hrtimer ) | |
16508 | __field( void *, function ) | |
16509 | __field( s64, expires ) | |
16510 | __field( s64, softexpires ) | |
16511 | + __field( enum hrtimer_mode, mode ) | |
16512 | ), | |
16513 | ||
16514 | TP_fast_assign( | |
b3bbd485 | 16515 | @@ -202,12 +207,14 @@ TRACE_EVENT(hrtimer_start, |
e4b2b4a8 JK |
16516 | __entry->function = hrtimer->function; |
16517 | __entry->expires = hrtimer_get_expires(hrtimer); | |
16518 | __entry->softexpires = hrtimer_get_softexpires(hrtimer); | |
16519 | + __entry->mode = mode; | |
16520 | ), | |
16521 | ||
16522 | - TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu", | |
16523 | - __entry->hrtimer, __entry->function, | |
16524 | + TP_printk("hrtimer=%p function=%pf expires=%llu softexpires=%llu " | |
16525 | + "mode=%s", __entry->hrtimer, __entry->function, | |
16526 | (unsigned long long) __entry->expires, | |
16527 | - (unsigned long long) __entry->softexpires) | |
16528 | + (unsigned long long) __entry->softexpires, | |
16529 | + decode_hrtimer_mode(__entry->mode)) | |
16530 | ); | |
1a6e0f06 | 16531 | |
e4b2b4a8 | 16532 | /** |
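Editor's note: with the new field, the start tracepoint records the mode and decode_hrtimer_mode() renders it symbolically, so a trace line now looks like this (pointer and times illustrative, not real output):

	hrtimer_start: hrtimer=ffff8800cabb9708 function=hrtimer_wakeup expires=104000000000 softexpires=104000000000 mode=REL|PINNED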
b3bbd485 JK |
16533 | diff --git a/init/Kconfig b/init/Kconfig |
16534 | index 46075327c165..a7aff2c1a203 100644 | |
16535 | --- a/init/Kconfig | |
16536 | +++ b/init/Kconfig | |
16537 | @@ -744,6 +744,7 @@ config CFS_BANDWIDTH | |
e4b2b4a8 JK |
16538 | config RT_GROUP_SCHED |
16539 | bool "Group scheduling for SCHED_RR/FIFO" | |
16540 | depends on CGROUP_SCHED | |
16541 | + depends on !PREEMPT_RT_FULL | |
16542 | default n | |
16543 | help | |
16544 | This feature lets you explicitly allocate real CPU bandwidth | |
b3bbd485 | 16545 | @@ -1533,6 +1534,7 @@ choice |
1a6e0f06 | 16546 | |
e4b2b4a8 JK |
16547 | config SLAB |
16548 | bool "SLAB" | |
16549 | + depends on !PREEMPT_RT_FULL | |
16550 | select HAVE_HARDENED_USERCOPY_ALLOCATOR | |
16551 | help | |
16552 | The regular slab allocator that is established and known to work | |
b3bbd485 | 16553 | @@ -1553,6 +1555,7 @@ config SLUB |
e4b2b4a8 JK |
16554 | config SLOB |
16555 | depends on EXPERT | |
16556 | bool "SLOB (Simple Allocator)" | |
16557 | + depends on !PREEMPT_RT_FULL | |
16558 | help | |
16559 | SLOB replaces the stock allocator with a drastically simpler | |
16560 | allocator. SLOB is generally more space efficient but | |
b3bbd485 | 16561 | @@ -1594,7 +1597,7 @@ config SLAB_FREELIST_HARDENED |
1a6e0f06 | 16562 | |
e4b2b4a8 JK |
16563 | config SLUB_CPU_PARTIAL |
16564 | default y | |
16565 | - depends on SLUB && SMP | |
16566 | + depends on SLUB && SMP && !PREEMPT_RT_FULL | |
16567 | bool "SLUB per cpu partial cache" | |
16568 | help | |
16569 | Per cpu partial caches accelerate object allocation and freeing | 
b3bbd485 JK |
16570 | diff --git a/init/Makefile b/init/Makefile |
16571 | index 1dbb23787290..eabf3f1b14be 100644 | |
16572 | --- a/init/Makefile | |
16573 | +++ b/init/Makefile | |
16574 | @@ -36,4 +36,4 @@ silent_chk_compile.h = : | |
16575 | include/generated/compile.h: FORCE | |
16576 | @$($(quiet)chk_compile.h) | |
16577 | $(Q)$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \ | |
16578 | - "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CC) $(KBUILD_CFLAGS)" | |
16579 | + "$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" "$(CONFIG_PREEMPT_RT_FULL)" "$(CC) $(KBUILD_CFLAGS)" | |
16580 | diff --git a/init/main.c b/init/main.c | |
16581 | index c4a45145e102..c86f3d3b9a72 100644 | |
16582 | --- a/init/main.c | |
16583 | +++ b/init/main.c | |
16584 | @@ -543,6 +543,7 @@ asmlinkage __visible void __init start_kernel(void) | |
e4b2b4a8 JK |
16585 | setup_command_line(command_line); |
16586 | setup_nr_cpu_ids(); | |
16587 | setup_per_cpu_areas(); | |
16588 | + softirq_early_init(); | |
16589 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | |
16590 | boot_cpu_hotplug_init(); | |
1a6e0f06 | 16591 | |
b3bbd485 JK |
16592 | diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks |
16593 | index 84d882f3e299..af27c4000812 100644 | |
16594 | --- a/kernel/Kconfig.locks | |
16595 | +++ b/kernel/Kconfig.locks | |
16596 | @@ -225,11 +225,11 @@ config ARCH_SUPPORTS_ATOMIC_RMW | |
16597 | ||
16598 | config MUTEX_SPIN_ON_OWNER | |
16599 | def_bool y | |
16600 | - depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW | |
16601 | + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL | |
16602 | ||
16603 | config RWSEM_SPIN_ON_OWNER | |
16604 | def_bool y | |
16605 | - depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW | |
16606 | + depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW && !PREEMPT_RT_FULL | |
16607 | ||
16608 | config LOCK_SPIN_ON_OWNER | |
16609 | def_bool y | |
16610 | diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt | |
16611 | index 3f9c97419f02..11dbe26a8279 100644 | |
16612 | --- a/kernel/Kconfig.preempt | |
16613 | +++ b/kernel/Kconfig.preempt | |
16614 | @@ -1,3 +1,16 @@ | |
16615 | +config PREEMPT | |
16616 | + bool | |
16617 | + select PREEMPT_COUNT | |
16618 | + | |
16619 | +config PREEMPT_RT_BASE | |
16620 | + bool | |
16621 | + select PREEMPT | |
16622 | + | |
16623 | +config HAVE_PREEMPT_LAZY | |
16624 | + bool | |
16625 | + | |
16626 | +config PREEMPT_LAZY | |
16627 | + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT_FULL | |
16628 | ||
16629 | choice | |
16630 | prompt "Preemption Model" | |
16631 | @@ -33,9 +46,9 @@ config PREEMPT_VOLUNTARY | |
16632 | ||
16633 | Select this if you are building a kernel for a desktop system. | |
16634 | ||
16635 | -config PREEMPT | |
16636 | +config PREEMPT__LL | |
16637 | bool "Preemptible Kernel (Low-Latency Desktop)" | |
16638 | - select PREEMPT_COUNT | |
16639 | + select PREEMPT | |
16640 | select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK | |
16641 | help | |
16642 | This option reduces the latency of the kernel by making | |
16643 | @@ -52,6 +65,22 @@ config PREEMPT | |
16644 | embedded system with latency requirements in the milliseconds | |
16645 | range. | |
16646 | ||
16647 | +config PREEMPT_RTB | |
16648 | + bool "Preemptible Kernel (Basic RT)" | |
16649 | + select PREEMPT_RT_BASE | |
16650 | + help | |
16651 | + This option is basically the same as (Low-Latency Desktop) but | |
16652 | + enables changes which are preliminary for the full preemptible | |
16653 | + RT kernel. | |
16654 | + | |
16655 | +config PREEMPT_RT_FULL | |
16656 | + bool "Fully Preemptible Kernel (RT)" | |
16657 | + depends on IRQ_FORCED_THREADING | |
16658 | + select PREEMPT_RT_BASE | |
16659 | + select PREEMPT_RCU | |
16660 | + help | |
16661 | + All and everything | |
16662 | + | |
16663 | endchoice | |
16664 | ||
16665 | config PREEMPT_COUNT | |
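Editor's note: PREEMPT itself becomes a hidden helper symbol here - PREEMPT__LL carries the old prompt, and both RT variants pull PREEMPT in via select - so existing #ifdef CONFIG_PREEMPT code keeps working for every preemptible model. Sketched from the C side (the comments are assumptions about intent):

	#ifdef CONFIG_PREEMPT_RT_FULL
	/* fully preemptible: sleeping spinlocks, forced-threaded irqs */
	#elif defined(CONFIG_PREEMPT_RT_BASE)
	/* RT infrastructure without the full conversion */
	#elif defined(CONFIG_PREEMPT)
	/* stock low-latency desktop preemption */
	#endif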
16666 | diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c | |
5dd41b01 | 16667 | index 3fc11b8851ac..a04c3aded76b 100644 |
b3bbd485 JK |
16668 | --- a/kernel/cgroup/cgroup.c |
16669 | +++ b/kernel/cgroup/cgroup.c | |
5dd41b01 | 16670 | @@ -4515,10 +4515,10 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) |
e4b2b4a8 | 16671 | queue_work(cgroup_destroy_wq, &css->destroy_work); |
1a6e0f06 JK |
16672 | } |
16673 | ||
e4b2b4a8 JK |
16674 | -static void css_release_work_fn(struct work_struct *work) |
16675 | +static void css_release_work_fn(struct swork_event *sev) | |
1a6e0f06 | 16676 | { |
e4b2b4a8 JK |
16677 | struct cgroup_subsys_state *css = |
16678 | - container_of(work, struct cgroup_subsys_state, destroy_work); | |
16679 | + container_of(sev, struct cgroup_subsys_state, destroy_swork); | |
16680 | struct cgroup_subsys *ss = css->ss; | |
16681 | struct cgroup *cgrp = css->cgroup; | |
1a6e0f06 | 16682 | |
5dd41b01 | 16683 | @@ -4569,8 +4569,8 @@ static void css_release(struct percpu_ref *ref) |
e4b2b4a8 JK |
16684 | struct cgroup_subsys_state *css = |
16685 | container_of(ref, struct cgroup_subsys_state, refcnt); | |
1a6e0f06 | 16686 | |
e4b2b4a8 JK |
16687 | - INIT_WORK(&css->destroy_work, css_release_work_fn); |
16688 | - queue_work(cgroup_destroy_wq, &css->destroy_work); | |
16689 | + INIT_SWORK(&css->destroy_swork, css_release_work_fn); | |
16690 | + swork_queue(&css->destroy_swork); | |
1a6e0f06 JK |
16691 | } |
16692 | ||
e4b2b4a8 | 16693 | static void init_and_link_css(struct cgroup_subsys_state *css, |
5dd41b01 | 16694 | @@ -5276,6 +5276,7 @@ static int __init cgroup_wq_init(void) |
e4b2b4a8 JK |
16695 | */ |
16696 | cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); | |
16697 | BUG_ON(!cgroup_destroy_wq); | |
16698 | + BUG_ON(swork_get()); | |
16699 | return 0; | |
16700 | } | |
16701 | core_initcall(cgroup_wq_init); | |
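Editor's note: css_release() fires from a percpu_ref/RCU callback, a context where the regular workqueue enqueue is not safe on RT, so the release work is handed to the 'swork' (simple work) helper thread and cgroup_wq_init() pins that thread with swork_get(). The swork API in isolation, sketched - it exists only in the RT tree, not mainline:

	#include <linux/swork.h>

	static struct swork_event release_event;

	static void release_fn(struct swork_event *sev)
	{
		/* runs in the sworker kthread, fully preemptible */
	}

	static int __init release_init(void)
	{
		if (swork_get())	/* bring up (or ref) the helper thread */
			return -ENOMEM;
		INIT_SWORK(&release_event, release_fn);
		swork_queue(&release_event);
		return 0;
	}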
b3bbd485 JK |
16702 | diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c |
16703 | index 4657e2924ecb..bda2af78277a 100644 | |
16704 | --- a/kernel/cgroup/cpuset.c | |
16705 | +++ b/kernel/cgroup/cpuset.c | |
16706 | @@ -288,7 +288,7 @@ static struct cpuset top_cpuset = { | |
1a6e0f06 | 16707 | */ |
1a6e0f06 | 16708 | |
e4b2b4a8 JK |
16709 | static DEFINE_MUTEX(cpuset_mutex); |
16710 | -static DEFINE_SPINLOCK(callback_lock); | |
16711 | +static DEFINE_RAW_SPINLOCK(callback_lock); | |
1a6e0f06 | 16712 | |
e4b2b4a8 | 16713 | static struct workqueue_struct *cpuset_migrate_mm_wq; |
1a6e0f06 | 16714 | |
b3bbd485 | 16715 | @@ -926,9 +926,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) |
e4b2b4a8 JK |
16716 | continue; |
16717 | rcu_read_unlock(); | |
1a6e0f06 | 16718 | |
e4b2b4a8 JK |
16719 | - spin_lock_irq(&callback_lock); |
16720 | + raw_spin_lock_irq(&callback_lock); | |
16721 | cpumask_copy(cp->effective_cpus, new_cpus); | |
16722 | - spin_unlock_irq(&callback_lock); | |
16723 | + raw_spin_unlock_irq(&callback_lock); | |
1a6e0f06 | 16724 | |
e4b2b4a8 JK |
16725 | WARN_ON(!is_in_v2_mode() && |
16726 | !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); | |
b3bbd485 | 16727 | @@ -993,9 +993,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, |
e4b2b4a8 JK |
16728 | if (retval < 0) |
16729 | return retval; | |
1a6e0f06 | 16730 | |
e4b2b4a8 JK |
16731 | - spin_lock_irq(&callback_lock); |
16732 | + raw_spin_lock_irq(&callback_lock); | |
16733 | cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); | |
16734 | - spin_unlock_irq(&callback_lock); | |
16735 | + raw_spin_unlock_irq(&callback_lock); | |
16736 | ||
16737 | /* use trialcs->cpus_allowed as a temp variable */ | |
16738 | update_cpumasks_hier(cs, trialcs->cpus_allowed); | |
b3bbd485 | 16739 | @@ -1179,9 +1179,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) |
e4b2b4a8 JK |
16740 | continue; |
16741 | rcu_read_unlock(); | |
16742 | ||
16743 | - spin_lock_irq(&callback_lock); | |
16744 | + raw_spin_lock_irq(&callback_lock); | |
16745 | cp->effective_mems = *new_mems; | |
16746 | - spin_unlock_irq(&callback_lock); | |
16747 | + raw_spin_unlock_irq(&callback_lock); | |
16748 | ||
16749 | WARN_ON(!is_in_v2_mode() && | |
16750 | !nodes_equal(cp->mems_allowed, cp->effective_mems)); | |
b3bbd485 | 16751 | @@ -1249,9 +1249,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, |
e4b2b4a8 JK |
16752 | if (retval < 0) |
16753 | goto done; | |
16754 | ||
16755 | - spin_lock_irq(&callback_lock); | |
16756 | + raw_spin_lock_irq(&callback_lock); | |
16757 | cs->mems_allowed = trialcs->mems_allowed; | |
16758 | - spin_unlock_irq(&callback_lock); | |
16759 | + raw_spin_unlock_irq(&callback_lock); | |
16760 | ||
16761 | /* use trialcs->mems_allowed as a temp variable */ | |
16762 | update_nodemasks_hier(cs, &trialcs->mems_allowed); | |
b3bbd485 | 16763 | @@ -1342,9 +1342,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, |
e4b2b4a8 JK |
16764 | spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) |
16765 | || (is_spread_page(cs) != is_spread_page(trialcs))); | |
16766 | ||
16767 | - spin_lock_irq(&callback_lock); | |
16768 | + raw_spin_lock_irq(&callback_lock); | |
16769 | cs->flags = trialcs->flags; | |
16770 | - spin_unlock_irq(&callback_lock); | |
16771 | + raw_spin_unlock_irq(&callback_lock); | |
16772 | ||
16773 | if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) | |
16774 | rebuild_sched_domains_locked(); | |
b3bbd485 | 16775 | @@ -1759,7 +1759,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) |
e4b2b4a8 JK |
16776 | cpuset_filetype_t type = seq_cft(sf)->private; |
16777 | int ret = 0; | |
1a6e0f06 | 16778 | |
e4b2b4a8 JK |
16779 | - spin_lock_irq(&callback_lock); |
16780 | + raw_spin_lock_irq(&callback_lock); | |
1a6e0f06 | 16781 | |
e4b2b4a8 JK |
16782 | switch (type) { |
16783 | case FILE_CPULIST: | |
b3bbd485 | 16784 | @@ -1778,7 +1778,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) |
e4b2b4a8 JK |
16785 | ret = -EINVAL; |
16786 | } | |
1a6e0f06 | 16787 | |
e4b2b4a8 JK |
16788 | - spin_unlock_irq(&callback_lock); |
16789 | + raw_spin_unlock_irq(&callback_lock); | |
16790 | return ret; | |
1a6e0f06 JK |
16791 | } |
16792 | ||
b3bbd485 | 16793 | @@ -1993,12 +1993,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) |
1a6e0f06 | 16794 | |
e4b2b4a8 | 16795 | cpuset_inc(); |
1a6e0f06 | 16796 | |
e4b2b4a8 JK |
16797 | - spin_lock_irq(&callback_lock); |
16798 | + raw_spin_lock_irq(&callback_lock); | |
16799 | if (is_in_v2_mode()) { | |
16800 | cpumask_copy(cs->effective_cpus, parent->effective_cpus); | |
16801 | cs->effective_mems = parent->effective_mems; | |
16802 | } | |
16803 | - spin_unlock_irq(&callback_lock); | |
16804 | + raw_spin_unlock_irq(&callback_lock); | |
16805 | ||
16806 | if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) | |
16807 | goto out_unlock; | |
b3bbd485 | 16808 | @@ -2025,12 +2025,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) |
e4b2b4a8 JK |
16809 | } |
16810 | rcu_read_unlock(); | |
16811 | ||
16812 | - spin_lock_irq(&callback_lock); | |
16813 | + raw_spin_lock_irq(&callback_lock); | |
16814 | cs->mems_allowed = parent->mems_allowed; | |
16815 | cs->effective_mems = parent->mems_allowed; | |
16816 | cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); | |
16817 | cpumask_copy(cs->effective_cpus, parent->cpus_allowed); | |
16818 | - spin_unlock_irq(&callback_lock); | |
16819 | + raw_spin_unlock_irq(&callback_lock); | |
16820 | out_unlock: | |
16821 | mutex_unlock(&cpuset_mutex); | |
16822 | return 0; | |
b3bbd485 | 16823 | @@ -2069,7 +2069,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) |
e4b2b4a8 | 16824 | static void cpuset_bind(struct cgroup_subsys_state *root_css) |
1a6e0f06 | 16825 | { |
e4b2b4a8 JK |
16826 | mutex_lock(&cpuset_mutex); |
16827 | - spin_lock_irq(&callback_lock); | |
16828 | + raw_spin_lock_irq(&callback_lock); | |
16829 | ||
16830 | if (is_in_v2_mode()) { | |
16831 | cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); | |
b3bbd485 | 16832 | @@ -2080,7 +2080,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) |
e4b2b4a8 JK |
16833 | top_cpuset.mems_allowed = top_cpuset.effective_mems; |
16834 | } | |
16835 | ||
16836 | - spin_unlock_irq(&callback_lock); | |
16837 | + raw_spin_unlock_irq(&callback_lock); | |
16838 | mutex_unlock(&cpuset_mutex); | |
1a6e0f06 JK |
16839 | } |
16840 | ||
b3bbd485 | 16841 | @@ -2094,7 +2094,7 @@ static void cpuset_fork(struct task_struct *task) |
e4b2b4a8 JK |
16842 | if (task_css_is_root(task, cpuset_cgrp_id)) |
16843 | return; | |
16844 | ||
16845 | - set_cpus_allowed_ptr(task, ¤t->cpus_allowed); | |
16846 | + set_cpus_allowed_ptr(task, current->cpus_ptr); | |
16847 | task->mems_allowed = current->mems_allowed; | |
1a6e0f06 JK |
16848 | } |
16849 | ||
b3bbd485 | 16850 | @@ -2178,12 +2178,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, |
1a6e0f06 | 16851 | { |
e4b2b4a8 | 16852 | bool is_empty; |
1a6e0f06 | 16853 | |
e4b2b4a8 JK |
16854 | - spin_lock_irq(&callback_lock); |
16855 | + raw_spin_lock_irq(&callback_lock); | |
16856 | cpumask_copy(cs->cpus_allowed, new_cpus); | |
16857 | cpumask_copy(cs->effective_cpus, new_cpus); | |
16858 | cs->mems_allowed = *new_mems; | |
16859 | cs->effective_mems = *new_mems; | |
16860 | - spin_unlock_irq(&callback_lock); | |
16861 | + raw_spin_unlock_irq(&callback_lock); | |
1a6e0f06 | 16862 | |
e4b2b4a8 JK |
16863 | /* |
16864 | * Don't call update_tasks_cpumask() if the cpuset becomes empty, | |
b3bbd485 | 16865 | @@ -2220,10 +2220,10 @@ hotplug_update_tasks(struct cpuset *cs, |
e4b2b4a8 JK |
16866 | if (nodes_empty(*new_mems)) |
16867 | *new_mems = parent_cs(cs)->effective_mems; | |
1a6e0f06 | 16868 | |
e4b2b4a8 JK |
16869 | - spin_lock_irq(&callback_lock); |
16870 | + raw_spin_lock_irq(&callback_lock); | |
16871 | cpumask_copy(cs->effective_cpus, new_cpus); | |
16872 | cs->effective_mems = *new_mems; | |
16873 | - spin_unlock_irq(&callback_lock); | |
16874 | + raw_spin_unlock_irq(&callback_lock); | |
1a6e0f06 | 16875 | |
e4b2b4a8 JK |
16876 | if (cpus_updated) |
16877 | update_tasks_cpumask(cs); | |
b3bbd485 | 16878 | @@ -2316,21 +2316,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work) |
1a6e0f06 | 16879 | |
e4b2b4a8 JK |
16880 | /* synchronize cpus_allowed to cpu_active_mask */ |
16881 | if (cpus_updated) { | |
16882 | - spin_lock_irq(&callback_lock); | |
16883 | + raw_spin_lock_irq(&callback_lock); | |
16884 | if (!on_dfl) | |
16885 | cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); | |
16886 | cpumask_copy(top_cpuset.effective_cpus, &new_cpus); | |
16887 | - spin_unlock_irq(&callback_lock); | |
16888 | + raw_spin_unlock_irq(&callback_lock); | |
16889 | /* we don't mess with cpumasks of tasks in top_cpuset */ | |
16890 | } | |
1a6e0f06 | 16891 | |
e4b2b4a8 JK |
16892 | /* synchronize mems_allowed to N_MEMORY */ |
16893 | if (mems_updated) { | |
16894 | - spin_lock_irq(&callback_lock); | |
16895 | + raw_spin_lock_irq(&callback_lock); | |
16896 | if (!on_dfl) | |
16897 | top_cpuset.mems_allowed = new_mems; | |
16898 | top_cpuset.effective_mems = new_mems; | |
16899 | - spin_unlock_irq(&callback_lock); | |
16900 | + raw_spin_unlock_irq(&callback_lock); | |
16901 | update_tasks_nodemask(&top_cpuset); | |
16902 | } | |
1a6e0f06 | 16903 | |
b3bbd485 | 16904 | @@ -2429,11 +2429,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) |
e4b2b4a8 JK |
16905 | { |
16906 | unsigned long flags; | |
1a6e0f06 | 16907 | |
e4b2b4a8 JK |
16908 | - spin_lock_irqsave(&callback_lock, flags); |
16909 | + raw_spin_lock_irqsave(&callback_lock, flags); | |
16910 | rcu_read_lock(); | |
16911 | guarantee_online_cpus(task_cs(tsk), pmask); | |
16912 | rcu_read_unlock(); | |
16913 | - spin_unlock_irqrestore(&callback_lock, flags); | |
16914 | + raw_spin_unlock_irqrestore(&callback_lock, flags); | |
16915 | } | |
1a6e0f06 | 16916 | |
e4b2b4a8 | 16917 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
b3bbd485 | 16918 | @@ -2481,11 +2481,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
e4b2b4a8 JK |
16919 | nodemask_t mask; |
16920 | unsigned long flags; | |
1a6e0f06 | 16921 | |
e4b2b4a8 JK |
16922 | - spin_lock_irqsave(&callback_lock, flags); |
16923 | + raw_spin_lock_irqsave(&callback_lock, flags); | |
16924 | rcu_read_lock(); | |
16925 | guarantee_online_mems(task_cs(tsk), &mask); | |
16926 | rcu_read_unlock(); | |
16927 | - spin_unlock_irqrestore(&callback_lock, flags); | |
16928 | + raw_spin_unlock_irqrestore(&callback_lock, flags); | |
1a6e0f06 | 16929 | |
e4b2b4a8 | 16930 | return mask; |
1a6e0f06 | 16931 | } |
b3bbd485 | 16932 | @@ -2577,14 +2577,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) |
e4b2b4a8 JK |
16933 | return true; |
16934 | ||
16935 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | |
16936 | - spin_lock_irqsave(&callback_lock, flags); | |
16937 | + raw_spin_lock_irqsave(&callback_lock, flags); | |
16938 | ||
16939 | rcu_read_lock(); | |
16940 | cs = nearest_hardwall_ancestor(task_cs(current)); | |
16941 | allowed = node_isset(node, cs->mems_allowed); | |
16942 | rcu_read_unlock(); | |
1a6e0f06 | 16943 | |
e4b2b4a8 JK |
16944 | - spin_unlock_irqrestore(&callback_lock, flags); |
16945 | + raw_spin_unlock_irqrestore(&callback_lock, flags); | |
16946 | return allowed; | |
1a6e0f06 JK |
16947 | } |
16948 | ||
b3bbd485 JK |
16949 | diff --git a/kernel/cpu.c b/kernel/cpu.c |
16950 | index f3f389e33343..7d777b62e4eb 100644 | |
16951 | --- a/kernel/cpu.c | |
16952 | +++ b/kernel/cpu.c | |
16953 | @@ -74,6 +74,11 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { | |
e4b2b4a8 JK |
16954 | .fail = CPUHP_INVALID, |
16955 | }; | |
16956 | ||
16957 | +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL) | |
16958 | +static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \ | |
16959 | + __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock); | |
1a6e0f06 JK |
16960 | +#endif |
16961 | + | |
e4b2b4a8 JK |
16962 | #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) |
16963 | static struct lockdep_map cpuhp_state_up_map = | |
16964 | STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); | |
b3bbd485 | 16965 | @@ -287,6 +292,55 @@ static int cpu_hotplug_disabled; |
e4b2b4a8 JK |
16966 | |
16967 | #ifdef CONFIG_HOTPLUG_CPU | |
16968 | ||
16969 | +/** | |
16970 | + * pin_current_cpu - Prevent the current cpu from being unplugged | |
1a6e0f06 | 16971 | + */ |
e4b2b4a8 | 16972 | +void pin_current_cpu(void) |
1a6e0f06 | 16973 | +{ |
e4b2b4a8 JK |
16974 | +#ifdef CONFIG_PREEMPT_RT_FULL |
16975 | + struct rt_rw_lock *cpuhp_pin; | |
16976 | + unsigned int cpu; | |
16977 | + int ret; | |
1a6e0f06 | 16978 | + |
e4b2b4a8 JK |
16979 | +again: |
16980 | + cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); | |
16981 | + ret = __read_rt_trylock(cpuhp_pin); | |
16982 | + if (ret) { | |
16983 | + current->pinned_on_cpu = smp_processor_id(); | |
16984 | + return; | |
16985 | + } | |
16986 | + cpu = smp_processor_id(); | |
16987 | + preempt_lazy_enable(); | |
16988 | + preempt_enable(); | |
1a6e0f06 | 16989 | + |
e4b2b4a8 | 16990 | + __read_rt_lock(cpuhp_pin); |
1a6e0f06 | 16991 | + |
e4b2b4a8 JK |
16992 | + preempt_disable(); |
16993 | + preempt_lazy_disable(); | |
16994 | + if (cpu != smp_processor_id()) { | |
16995 | + __read_rt_unlock(cpuhp_pin); | |
16996 | + goto again; | |
16997 | + } | |
16998 | + current->pinned_on_cpu = cpu; | |
16999 | +#endif | |
17000 | +} | |
1a6e0f06 | 17001 | + |
e4b2b4a8 JK |
17002 | +/** |
17003 | + * unpin_current_cpu - Allow unplug of current cpu | |
17004 | + */ | |
17005 | +void unpin_current_cpu(void) | |
17006 | +{ | |
17007 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
17008 | + struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); | |
1a6e0f06 | 17009 | + |
e4b2b4a8 JK |
17010 | + if (WARN_ON(current->pinned_on_cpu != smp_processor_id())) |
17011 | + cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu); | |
1a6e0f06 | 17012 | + |
e4b2b4a8 JK |
17013 | + current->pinned_on_cpu = -1; |
17014 | + __read_rt_unlock(cpuhp_pin); | |
17015 | +#endif | |
17016 | +} | |
1a6e0f06 | 17017 | + |
e4b2b4a8 JK |
17018 | DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); |
17019 | ||
17020 | void cpus_read_lock(void) | |
b3bbd485 | 17021 | @@ -843,6 +897,9 @@ static int take_cpu_down(void *_param) |
e4b2b4a8 JK |
17022 | |
17023 | static int takedown_cpu(unsigned int cpu) | |
17024 | { | |
17025 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
17026 | + struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu); | |
17027 | +#endif | |
17028 | struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); | |
17029 | int err; | |
17030 | ||
b3bbd485 | 17031 | @@ -855,11 +912,18 @@ static int takedown_cpu(unsigned int cpu) |
e4b2b4a8 JK |
17032 | */ |
17033 | irq_lock_sparse(); | |
17034 | ||
17035 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
17036 | + __write_rt_lock(cpuhp_pin); | |
1a6e0f06 JK |
17037 | +#endif |
17038 | + | |
e4b2b4a8 JK |
17039 | /* |
17040 | * So now all preempt/rcu users must observe !cpu_active(). | |
17041 | */ | |
17042 | err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); | |
17043 | if (err) { | |
17044 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
17045 | + __write_rt_unlock(cpuhp_pin); | |
1a6e0f06 | 17046 | +#endif |
e4b2b4a8 JK |
17047 | /* CPU refused to die */ |
17048 | irq_unlock_sparse(); | |
17049 | /* Unpark the hotplug thread so we can rollback there */ | |
b3bbd485 | 17050 | @@ -878,6 +942,9 @@ static int takedown_cpu(unsigned int cpu) |
e4b2b4a8 JK |
17051 | wait_for_ap_thread(st, false); |
17052 | BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); | |
17053 | ||
17054 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
17055 | + __write_rt_unlock(cpuhp_pin); | |
1a6e0f06 | 17056 | +#endif |
e4b2b4a8 JK |
17057 | /* Interrupts are moved away from the dying cpu, reenable alloc/free */ |
17058 | irq_unlock_sparse(); | |
1a6e0f06 | 17059 | |
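Editor's note: pin_current_cpu()/unpin_current_cpu() give RT's migrate_disable() a way to hold off CPU unplug - each CPU carries a reader/writer rtlock, pinned tasks hold the read side, and takedown_cpu() takes the write side so it waits until every pinned task has left; the retry loop handles a task that was migrated while blocking on the lock. The reader-side contract, sketched with call-site conventions simplified:

	/* Hedged sketch: a section that must not see its CPU unplugged. */
	static void cpu_bound_section(void)
	{
		pin_current_cpu();	/* read-locks this CPU's hotplug rtlock;
					 * a concurrent takedown_cpu() blocks in
					 * __write_rt_lock() until we unpin */

		/* ... work relying on smp_processor_id() staying valid ... */

		unpin_current_cpu();
	}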
b3bbd485 JK |
17060 | diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c |
17061 | index ed5d34925ad0..c0d4c24fc241 100644 | |
17062 | --- a/kernel/debug/kdb/kdb_io.c | |
17063 | +++ b/kernel/debug/kdb/kdb_io.c | |
17064 | @@ -854,9 +854,11 @@ int kdb_printf(const char *fmt, ...) | |
e4b2b4a8 JK |
17065 | va_list ap; |
17066 | int r; | |
17067 | ||
17068 | + kdb_trap_printk++; | |
17069 | va_start(ap, fmt); | |
17070 | r = vkdb_printf(KDB_MSGSRC_INTERNAL, fmt, ap); | |
17071 | va_end(ap); | |
17072 | + kdb_trap_printk--; | |
17073 | ||
17074 | return r; | |
17075 | } | |
b3bbd485 | 17076 | diff --git a/kernel/events/core.c b/kernel/events/core.c |
5dd41b01 | 17077 | index 4dbce29a9313..de3d23bae9bf 100644 |
b3bbd485 JK |
17078 | --- a/kernel/events/core.c |
17079 | +++ b/kernel/events/core.c | |
17080 | @@ -1065,7 +1065,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) | |
e4b2b4a8 JK |
17081 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); |
17082 | ||
17083 | raw_spin_lock_init(&cpuctx->hrtimer_lock); | |
17084 | - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); | |
17085 | + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); | |
17086 | timer->function = perf_mux_hrtimer_handler; | |
17087 | } | |
17088 | ||
5dd41b01 | 17089 | @@ -8760,7 +8760,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) |
e4b2b4a8 JK |
17090 | if (!is_sampling_event(event)) |
17091 | return; | |
17092 | ||
17093 | - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
17094 | + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); | |
17095 | hwc->hrtimer.function = perf_swevent_hrtimer; | |
17096 | ||
17097 | /* | |
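Editor's note: perf's timers are marked with the new _HARD modes so they keep firing from hard interrupt context even when RT moves ordinary hrtimers into softirq. A sketch of requesting hard-irq expiry (the _HARD modes come from this series):

	#include <linux/hrtimer.h>

	static struct hrtimer sample_timer;

	static enum hrtimer_restart sample_fn(struct hrtimer *t)
	{
		/* runs in hard interrupt context, even with PREEMPT_RT_FULL */
		hrtimer_forward_now(t, ms_to_ktime(10));
		return HRTIMER_RESTART;
	}

	static void sample_start(void)
	{
		hrtimer_init(&sample_timer, CLOCK_MONOTONIC,
			     HRTIMER_MODE_REL_HARD);
		sample_timer.function = sample_fn;
		hrtimer_start(&sample_timer, ms_to_ktime(10),
			      HRTIMER_MODE_REL_HARD);
	}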
b3bbd485 JK |
17098 | diff --git a/kernel/exit.c b/kernel/exit.c |
17099 | index e3a08761eb40..26f3b352b37a 100644 | |
17100 | --- a/kernel/exit.c | |
17101 | +++ b/kernel/exit.c | |
17102 | @@ -159,7 +159,7 @@ static void __exit_signal(struct task_struct *tsk) | |
e4b2b4a8 JK |
17103 | * Do this under ->siglock, we can race with another thread |
17104 | * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. | |
17105 | */ | |
17106 | - flush_sigqueue(&tsk->pending); | |
17107 | + flush_task_sigqueue(tsk); | |
17108 | tsk->sighand = NULL; | |
17109 | spin_unlock(&sighand->siglock); | |
17110 | ||
b3bbd485 JK |
17111 | diff --git a/kernel/fork.c b/kernel/fork.c |
17112 | index 6a219fea4926..bc849ac60aa6 100644 | |
17113 | --- a/kernel/fork.c | |
17114 | +++ b/kernel/fork.c | |
e4b2b4a8 JK |
17115 | @@ -40,6 +40,7 @@ |
17116 | #include <linux/hmm.h> | |
17117 | #include <linux/fs.h> | |
17118 | #include <linux/mm.h> | |
17119 | +#include <linux/kprobes.h> | |
17120 | #include <linux/vmacache.h> | |
17121 | #include <linux/nsproxy.h> | |
17122 | #include <linux/capability.h> | |
b3bbd485 | 17123 | @@ -407,13 +408,24 @@ static inline void put_signal_struct(struct signal_struct *sig) |
e4b2b4a8 JK |
17124 | if (atomic_dec_and_test(&sig->sigcnt)) |
17125 | free_signal_struct(sig); | |
17126 | } | |
17127 | - | |
17128 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17129 | +static | |
1a6e0f06 | 17130 | +#endif |
e4b2b4a8 JK |
17131 | void __put_task_struct(struct task_struct *tsk) |
17132 | { | |
17133 | WARN_ON(!tsk->exit_state); | |
17134 | WARN_ON(atomic_read(&tsk->usage)); | |
17135 | WARN_ON(tsk == current); | |
17136 | ||
17137 | + /* | |
17138 | + * Remove function-return probe instances associated with this | |
17139 | + * task and put them back on the free list. | |
17140 | + */ | |
17141 | + kprobe_flush_task(tsk); | |
1a6e0f06 | 17142 | + |
e4b2b4a8 JK |
17143 | + /* Task is done with its stack. */ |
17144 | + put_task_stack(tsk); | |
17145 | + | |
17146 | cgroup_free(tsk); | |
17147 | task_numa_free(tsk); | |
17148 | security_task_free(tsk); | |
b3bbd485 | 17149 | @@ -424,7 +436,18 @@ void __put_task_struct(struct task_struct *tsk) |
e4b2b4a8 JK |
17150 | if (!profile_handoff_task(tsk)) |
17151 | free_task(tsk); | |
17152 | } | |
17153 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
17154 | EXPORT_SYMBOL_GPL(__put_task_struct); | |
1a6e0f06 | 17155 | +#else |
e4b2b4a8 JK |
17156 | +void __put_task_struct_cb(struct rcu_head *rhp) |
17157 | +{ | |
17158 | + struct task_struct *tsk = container_of(rhp, struct task_struct, put_rcu); | |
1a6e0f06 | 17159 | + |
e4b2b4a8 JK |
17160 | + __put_task_struct(tsk); |
17161 | + | |
17162 | +} | |
17163 | +EXPORT_SYMBOL_GPL(__put_task_struct_cb); | |
17164 | +#endif | |
17165 | ||
17166 | void __init __weak arch_task_cache_init(void) { } | |
17167 | ||
b3bbd485 | 17168 | @@ -563,7 +586,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
e4b2b4a8 JK |
17169 | #ifdef CONFIG_CC_STACKPROTECTOR |
17170 | tsk->stack_canary = get_random_canary(); | |
1a6e0f06 | 17171 | #endif |
e4b2b4a8 JK |
17172 | - |
17173 | + if (orig->cpus_ptr == &orig->cpus_mask) | |
17174 | + tsk->cpus_ptr = &tsk->cpus_mask; | |
17175 | /* | |
17176 | * One for us, one for whoever does the "release_task()" (usually | |
17177 | * parent) | |
b3bbd485 | 17178 | @@ -575,6 +599,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) |
e4b2b4a8 JK |
17179 | tsk->splice_pipe = NULL; |
17180 | tsk->task_frag.page = NULL; | |
17181 | tsk->wake_q.next = NULL; | |
17182 | + tsk->wake_q_sleeper.next = NULL; | |
1a6e0f06 | 17183 | |
e4b2b4a8 JK |
17184 | account_kernel_stack(tsk, 1); |
17185 | ||
b3bbd485 | 17186 | @@ -915,6 +940,19 @@ void __mmdrop(struct mm_struct *mm) |
e4b2b4a8 JK |
17187 | } |
17188 | EXPORT_SYMBOL_GPL(__mmdrop); | |
17189 | ||
17190 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17191 | +/* | |
17192 | + * RCU callback for delayed mm drop. Not strictly rcu, but we don't | |
17193 | + * want another facility to make this work. | |
17194 | + */ | |
17195 | +void __mmdrop_delayed(struct rcu_head *rhp) | |
17196 | +{ | |
17197 | + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); | |
17198 | + | |
17199 | + __mmdrop(mm); | |
17200 | +} | |
17201 | +#endif | |
17202 | + | |
17203 | static inline void __mmput(struct mm_struct *mm) | |
17204 | { | |
17205 | VM_BUG_ON(atomic_read(&mm->mm_users)); | |
b3bbd485 | 17206 | @@ -1496,6 +1534,9 @@ static void rt_mutex_init_task(struct task_struct *p) |
e4b2b4a8 JK |
17207 | */ |
17208 | static void posix_cpu_timers_init(struct task_struct *tsk) | |
17209 | { | |
17210 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17211 | + tsk->posix_timer_list = NULL; | |
17212 | +#endif | |
17213 | tsk->cputime_expires.prof_exp = 0; | |
17214 | tsk->cputime_expires.virt_exp = 0; | |
17215 | tsk->cputime_expires.sched_exp = 0; | |
b3bbd485 | 17216 | @@ -1648,6 +1689,7 @@ static __latent_entropy struct task_struct *copy_process( |
e4b2b4a8 JK |
17217 | spin_lock_init(&p->alloc_lock); |
17218 | ||
17219 | init_sigpending(&p->pending); | |
17220 | + p->sigqueue_cache = NULL; | |
17221 | ||
17222 | p->utime = p->stime = p->gtime = 0; | |
17223 | #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME | |
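Editor's note: on RT __put_task_struct() becomes static and is reached only through __put_task_struct_cb(), an RCU callback, because the final put can happen from atomic context where freeing (which takes sleeping locks on RT) is not allowed. The caller side this implies, sketched - the put_rcu rcu_head is added to task_struct elsewhere in the series:

	/* Hedged sketch of the RT put path: defer the free to RCU. */
	static inline void put_task_struct_rt(struct task_struct *t)
	{
		if (atomic_dec_and_test(&t->usage))
			call_rcu(&t->put_rcu, __put_task_struct_cb);
	}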
b3bbd485 JK |
17224 | diff --git a/kernel/futex.c b/kernel/futex.c |
17225 | index 046cd780d057..2ba7fb04a107 100644 | |
17226 | --- a/kernel/futex.c | |
17227 | +++ b/kernel/futex.c | |
17228 | @@ -936,7 +936,9 @@ void exit_pi_state_list(struct task_struct *curr) | |
e4b2b4a8 JK |
17229 | if (head->next != next) { |
17230 | /* retain curr->pi_lock for the loop invariant */ | |
17231 | raw_spin_unlock(&pi_state->pi_mutex.wait_lock); | |
17232 | + raw_spin_unlock_irq(&curr->pi_lock); | |
17233 | spin_unlock(&hb->lock); | |
17234 | + raw_spin_lock_irq(&curr->pi_lock); | |
17235 | put_pi_state(pi_state); | |
17236 | continue; | |
17237 | } | |
b3bbd485 | 17238 | @@ -1430,6 +1432,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ |
e4b2b4a8 JK |
17239 | struct task_struct *new_owner; |
17240 | bool postunlock = false; | |
17241 | DEFINE_WAKE_Q(wake_q); | |
17242 | + DEFINE_WAKE_Q(wake_sleeper_q); | |
17243 | int ret = 0; | |
17244 | ||
17245 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | |
b3bbd485 | 17246 | @@ -1491,13 +1494,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ |
e4b2b4a8 JK |
17247 | pi_state->owner = new_owner; |
17248 | raw_spin_unlock(&new_owner->pi_lock); | |
17249 | ||
17250 | - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); | |
1a6e0f06 | 17251 | - |
e4b2b4a8 JK |
17252 | + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, |
17253 | + &wake_sleeper_q); | |
17254 | out_unlock: | |
17255 | raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); | |
17256 | ||
17257 | if (postunlock) | |
17258 | - rt_mutex_postunlock(&wake_q); | |
17259 | + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); | |
17260 | ||
17261 | return ret; | |
17262 | } | |
b3bbd485 | 17263 | @@ -2104,6 +2107,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, |
e4b2b4a8 JK |
17264 | requeue_pi_wake_futex(this, &key2, hb2); |
17265 | drop_count++; | |
17266 | continue; | |
17267 | + } else if (ret == -EAGAIN) { | |
17268 | + /* | |
17269 | + * Waiter was woken by timeout or | |
17270 | + * signal and has set pi_blocked_on to | |
17271 | + * PI_WAKEUP_INPROGRESS before we | |
17272 | + * tried to enqueue it on the rtmutex. | |
17273 | + */ | |
17274 | + this->pi_state = NULL; | |
17275 | + put_pi_state(pi_state); | |
17276 | + continue; | |
17277 | } else if (ret) { | |
17278 | /* | |
17279 | * rt_mutex_start_proxy_lock() detected a | |
b3bbd485 | 17280 | @@ -2642,10 +2655,9 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
e4b2b4a8 JK |
17281 | if (abs_time) { |
17282 | to = &timeout; | |
17283 | ||
17284 | - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? | |
17285 | - CLOCK_REALTIME : CLOCK_MONOTONIC, | |
17286 | - HRTIMER_MODE_ABS); | |
17287 | - hrtimer_init_sleeper(to, current); | |
17288 | + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ? | |
17289 | + CLOCK_REALTIME : CLOCK_MONOTONIC, | |
17290 | + HRTIMER_MODE_ABS, current); | |
17291 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | |
17292 | current->timer_slack_ns); | |
17293 | } | |
b3bbd485 | 17294 | @@ -2744,9 +2756,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17295 | |
17296 | if (time) { | |
17297 | to = &timeout; | |
17298 | - hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, | |
17299 | - HRTIMER_MODE_ABS); | |
17300 | - hrtimer_init_sleeper(to, current); | |
17301 | + hrtimer_init_sleeper_on_stack(to, CLOCK_REALTIME, | |
17302 | + HRTIMER_MODE_ABS, current); | |
17303 | hrtimer_set_expires(&to->timer, *time); | |
17304 | } | |
17305 | ||
b3bbd485 | 17306 | @@ -2801,7 +2812,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17307 | goto no_block; |
17308 | } | |
17309 | ||
17310 | - rt_mutex_init_waiter(&rt_waiter); | |
17311 | + rt_mutex_init_waiter(&rt_waiter, false); | |
17312 | ||
17313 | /* | |
17314 | * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not | |
b3bbd485 | 17315 | @@ -2816,9 +2827,18 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17316 | * lock handoff sequence. |
17317 | */ | |
17318 | raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); | |
17319 | + /* | |
17320 | + * the migrate_disable() here disables migration in the in_atomic() fast | |
17321 | + * path which is enabled again in the following spin_unlock(). We have | |
17322 | + * one migrate_disable() pending in the slow-path which is reversed | |
17323 | + * after the raw_spin_unlock_irq() where we leave the atomic context. | |
17324 | + */ | |
17325 | + migrate_disable(); | |
17326 | + | |
17327 | spin_unlock(q.lock_ptr); | |
17328 | ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); | |
17329 | raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); | |
17330 | + migrate_enable(); | |
17331 | ||
17332 | if (ret) { | |
17333 | if (ret == 1) | |
b3bbd485 | 17334 | @@ -2965,11 +2985,21 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) |
e4b2b4a8 JK |
17335 | * observed. |
17336 | */ | |
17337 | raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); | |
17338 | + /* | |
17339 | + * Magic trickery for now to make the RT migrate disable | |
17340 | + * logic happy. The following spin_unlock() happens with | |
17341 | + * interrupts disabled so the internal migrate_enable() | |
17342 | + * won't undo the migrate_disable() which was issued when | |
17343 | + * locking hb->lock. | |
17344 | + */ | |
17345 | + migrate_disable(); | |
17346 | spin_unlock(&hb->lock); | |
17347 | ||
17348 | /* drops pi_state->pi_mutex.wait_lock */ | |
17349 | ret = wake_futex_pi(uaddr, uval, pi_state); | |
17350 | ||
17351 | + migrate_enable(); | |
17352 | + | |
17353 | put_pi_state(pi_state); | |
17354 | ||
17355 | /* | |
b3bbd485 | 17356 | @@ -3127,7 +3157,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17357 | struct hrtimer_sleeper timeout, *to = NULL; |
17358 | struct futex_pi_state *pi_state = NULL; | |
17359 | struct rt_mutex_waiter rt_waiter; | |
17360 | - struct futex_hash_bucket *hb; | |
17361 | + struct futex_hash_bucket *hb, *hb2; | |
17362 | union futex_key key2 = FUTEX_KEY_INIT; | |
17363 | struct futex_q q = futex_q_init; | |
17364 | int res, ret; | |
b3bbd485 | 17365 | @@ -3143,10 +3173,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17366 | |
17367 | if (abs_time) { | |
17368 | to = &timeout; | |
17369 | - hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? | |
17370 | - CLOCK_REALTIME : CLOCK_MONOTONIC, | |
17371 | - HRTIMER_MODE_ABS); | |
17372 | - hrtimer_init_sleeper(to, current); | |
17373 | + hrtimer_init_sleeper_on_stack(to, (flags & FLAGS_CLOCKRT) ? | |
17374 | + CLOCK_REALTIME : CLOCK_MONOTONIC, | |
17375 | + HRTIMER_MODE_ABS, current); | |
17376 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | |
17377 | current->timer_slack_ns); | |
17378 | } | |
b3bbd485 | 17379 | @@ -3155,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17380 | * The waiter is allocated on our stack, manipulated by the requeue |
17381 | * code while we sleep on uaddr. | |
17382 | */ | |
17383 | - rt_mutex_init_waiter(&rt_waiter); | |
17384 | + rt_mutex_init_waiter(&rt_waiter, false); | |
1a6e0f06 | 17385 | |
e4b2b4a8 JK |
17386 | ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); |
17387 | if (unlikely(ret != 0)) | |
b3bbd485 | 17388 | @@ -3186,20 +3215,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17389 | /* Queue the futex_q, drop the hb lock, wait for wakeup. */ |
17390 | futex_wait_queue_me(hb, &q, to); | |
1a6e0f06 | 17391 | |
e4b2b4a8 JK |
17392 | - spin_lock(&hb->lock); |
17393 | - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); | |
17394 | - spin_unlock(&hb->lock); | |
17395 | - if (ret) | |
17396 | - goto out_put_keys; | |
17397 | + /* | |
17398 | + * On RT we must avoid races with requeue and trying to block | |
17399 | + * on two mutexes (hb->lock and uaddr2's rtmutex) by | |
17400 | + * serializing access to pi_blocked_on with pi_lock. | |
17401 | + */ | |
17402 | + raw_spin_lock_irq(¤t->pi_lock); | |
17403 | + if (current->pi_blocked_on) { | |
17404 | + /* | |
17405 | + * We have been requeued or are in the process of | |
17406 | + * being requeued. | |
17407 | + */ | |
17408 | + raw_spin_unlock_irq(¤t->pi_lock); | |
17409 | + } else { | |
17410 | + /* | |
17411 | + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS | |
17412 | + * prevents a concurrent requeue from moving us to the | |
17413 | + * uaddr2 rtmutex. After that we can safely acquire | |
17414 | + * (and possibly block on) hb->lock. | |
17415 | + */ | |
17416 | + current->pi_blocked_on = PI_WAKEUP_INPROGRESS; | |
17417 | + raw_spin_unlock_irq(¤t->pi_lock); | |
1a6e0f06 | 17418 | + |
e4b2b4a8 | 17419 | + spin_lock(&hb->lock); |
1a6e0f06 | 17420 | + |
e4b2b4a8 JK |
17421 | + /* |
17422 | + * Clean up pi_blocked_on. We might leak it otherwise | |
17423 | + * when we succeeded with the hb->lock in the fast | |
17424 | + * path. | |
17425 | + */ | |
17426 | + raw_spin_lock_irq(¤t->pi_lock); | |
17427 | + current->pi_blocked_on = NULL; | |
17428 | + raw_spin_unlock_irq(¤t->pi_lock); | |
1a6e0f06 | 17429 | + |
e4b2b4a8 JK |
17430 | + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); |
17431 | + spin_unlock(&hb->lock); | |
17432 | + if (ret) | |
17433 | + goto out_put_keys; | |
1a6e0f06 | 17434 | + } |
c7c16703 | 17435 | |
e4b2b4a8 JK |
17436 | /* |
17437 | - * In order for us to be here, we know our q.key == key2, and since | |
17438 | - * we took the hb->lock above, we also know that futex_requeue() has | |
17439 | - * completed and we no longer have to concern ourselves with a wakeup | |
17440 | - * race with the atomic proxy lock acquisition by the requeue code. The | |
17441 | - * futex_requeue dropped our key1 reference and incremented our key2 | |
17442 | - * reference count. | |
17443 | + * In order to be here, we have either been requeued, are in | |
17444 | + * the process of being requeued, or requeue successfully | |
17445 | + * acquired uaddr2 on our behalf. If pi_blocked_on was | |
17446 | + * non-null above, we may be racing with a requeue. Do not | |
17447 | + * rely on q->lock_ptr to be hb2->lock until after blocking on | |
17448 | + * hb->lock or hb2->lock. The futex_requeue dropped our key1 | |
17449 | + * reference and incremented our key2 reference count. | |
17450 | */ | |
17451 | + hb2 = hash_futex(&key2); | |
17452 | ||
17453 | /* Check if the requeue code acquired the second futex for us. */ | |
17454 | if (!q.rt_waiter) { | |
b3bbd485 | 17455 | @@ -3208,7 +3272,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17456 | * did a lock-steal - fix up the PI-state in that case. |
17457 | */ | |
17458 | if (q.pi_state && (q.pi_state->owner != current)) { | |
17459 | - spin_lock(q.lock_ptr); | |
17460 | + spin_lock(&hb2->lock); | |
17461 | + BUG_ON(&hb2->lock != q.lock_ptr); | |
17462 | ret = fixup_pi_state_owner(uaddr2, &q, current); | |
17463 | if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { | |
17464 | pi_state = q.pi_state; | |
b3bbd485 | 17465 | @@ -3219,7 +3284,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17466 | * the requeue_pi() code acquired for us. |
17467 | */ | |
17468 | put_pi_state(q.pi_state); | |
17469 | - spin_unlock(q.lock_ptr); | |
17470 | + spin_unlock(&hb2->lock); | |
17471 | } | |
17472 | } else { | |
17473 | struct rt_mutex *pi_mutex; | |
b3bbd485 | 17474 | @@ -3233,7 +3298,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, |
e4b2b4a8 JK |
17475 | pi_mutex = &q.pi_state->pi_mutex; |
17476 | ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); | |
17477 | ||
17478 | - spin_lock(q.lock_ptr); | |
17479 | + spin_lock(&hb2->lock); | |
17480 | + BUG_ON(&hb2->lock != q.lock_ptr); | |
17481 | if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) | |
17482 | ret = 0; | |
17483 | ||
b3bbd485 JK |
17484 | diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c |
17485 | index 79f987b942b8..d1dbacc29941 100644 | |
17486 | --- a/kernel/irq/handle.c | |
17487 | +++ b/kernel/irq/handle.c | |
17488 | @@ -183,10 +183,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) | |
e4b2b4a8 JK |
17489 | { |
17490 | irqreturn_t retval; | |
17491 | unsigned int flags = 0; | |
17492 | + struct pt_regs *regs = get_irq_regs(); | |
17493 | + u64 ip = regs ? instruction_pointer(regs) : 0; | |
17494 | ||
17495 | retval = __handle_irq_event_percpu(desc, &flags); | |
17496 | ||
17497 | - add_interrupt_randomness(desc->irq_data.irq, flags); | |
c7c16703 | 17498 | +#ifdef CONFIG_PREEMPT_RT_FULL |
e4b2b4a8 | 17499 | + desc->random_ip = ip; |
c7c16703 | 17500 | +#else |
e4b2b4a8 | 17501 | + add_interrupt_randomness(desc->irq_data.irq, flags, ip); |
c7c16703 JK |
17502 | +#endif |
17503 | ||
e4b2b4a8 JK |
17504 | if (!noirqdebug) |
17505 | note_interrupt(desc, retval); | |
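Editor's note: on RT the hard-irq half only records the instruction pointer, because feeding the entropy pool takes locks that must not be acquired from hard interrupt context; the actual add_interrupt_randomness() call moves into the irq thread (visible in the kernel/irq/manage.c hunk that follows). The two halves side by side, sketched:

	/* hard irq (handle_irq_event_percpu): stash only */
	desc->random_ip = ip;

	/* irq thread (irq_thread), with migration disabled: feed the pool */
	migrate_disable();
	add_interrupt_randomness(action->irq, 0,
				 desc->random_ip ^ (unsigned long) action);
	migrate_enable();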
b3bbd485 JK |
17506 | diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c |
17507 | index 069311541577..f82dcca81712 100644 | |
17508 | --- a/kernel/irq/manage.c | |
17509 | +++ b/kernel/irq/manage.c | |
e4b2b4a8 JK |
17510 | @@ -24,6 +24,7 @@ |
17511 | #include "internals.h" | |
1a6e0f06 | 17512 | |
e4b2b4a8 JK |
17513 | #ifdef CONFIG_IRQ_FORCED_THREADING |
17514 | +# ifndef CONFIG_PREEMPT_RT_BASE | |
17515 | __read_mostly bool force_irqthreads; | |
17516 | ||
17517 | static int __init setup_forced_irqthreads(char *arg) | |
b3bbd485 | 17518 | @@ -32,6 +33,7 @@ static int __init setup_forced_irqthreads(char *arg) |
e4b2b4a8 JK |
17519 | return 0; |
17520 | } | |
17521 | early_param("threadirqs", setup_forced_irqthreads); | |
17522 | +# endif | |
17523 | #endif | |
17524 | ||
17525 | static void __synchronize_hardirq(struct irq_desc *desc) | |
b3bbd485 | 17526 | @@ -224,7 +226,12 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, |
e4b2b4a8 JK |
17527 | |
17528 | if (desc->affinity_notify) { | |
17529 | kref_get(&desc->affinity_notify->kref); | |
1a6e0f06 | 17530 | + |
e4b2b4a8 JK |
17531 | +#ifdef CONFIG_PREEMPT_RT_BASE |
17532 | + swork_queue(&desc->affinity_notify->swork); | |
1a6e0f06 | 17533 | +#else |
e4b2b4a8 JK |
17534 | schedule_work(&desc->affinity_notify->work); |
17535 | +#endif | |
17536 | } | |
17537 | irqd_set(data, IRQD_AFFINITY_SET); | |
1a6e0f06 | 17538 | |
b3bbd485 | 17539 | @@ -262,10 +269,8 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) |
e4b2b4a8 JK |
17540 | } |
17541 | EXPORT_SYMBOL_GPL(irq_set_affinity_hint); | |
17542 | ||
17543 | -static void irq_affinity_notify(struct work_struct *work) | |
17544 | +static void _irq_affinity_notify(struct irq_affinity_notify *notify) | |
1a6e0f06 | 17545 | { |
e4b2b4a8 JK |
17546 | - struct irq_affinity_notify *notify = |
17547 | - container_of(work, struct irq_affinity_notify, work); | |
17548 | struct irq_desc *desc = irq_to_desc(notify->irq); | |
17549 | cpumask_var_t cpumask; | |
17550 | unsigned long flags; | |
b3bbd485 | 17551 | @@ -287,6 +292,35 @@ static void irq_affinity_notify(struct work_struct *work) |
e4b2b4a8 | 17552 | kref_put(¬ify->kref, notify->release); |
1a6e0f06 JK |
17553 | } |
17554 | ||
e4b2b4a8 JK |
17555 | +#ifdef CONFIG_PREEMPT_RT_BASE |
17556 | +static void init_helper_thread(void) | |
1a6e0f06 | 17557 | +{ |
e4b2b4a8 JK |
17558 | + static int init_sworker_once; |
17559 | + | |
17560 | + if (init_sworker_once) | |
17561 | + return; | |
17562 | + if (WARN_ON(swork_get())) | |
17563 | + return; | |
17564 | + init_sworker_once = 1; | |
1a6e0f06 JK |
17565 | +} |
17566 | + | |
e4b2b4a8 | 17567 | +static void irq_affinity_notify(struct swork_event *swork) |
1a6e0f06 | 17568 | +{ |
e4b2b4a8 JK |
17569 | + struct irq_affinity_notify *notify = |
17570 | + container_of(swork, struct irq_affinity_notify, swork); | |
17571 | + _irq_affinity_notify(notify); | |
1a6e0f06 JK |
17572 | +} |
17573 | + | |
e4b2b4a8 JK |
17574 | +#else |
17575 | + | |
17576 | +static void irq_affinity_notify(struct work_struct *work) | |
1a6e0f06 | 17577 | +{ |
e4b2b4a8 JK |
17578 | + struct irq_affinity_notify *notify = |
17579 | + container_of(work, struct irq_affinity_notify, work); | |
17580 | + _irq_affinity_notify(notify); | |
1a6e0f06 JK |
17581 | +} |
17582 | +#endif | |
17583 | + | |
e4b2b4a8 JK |
17584 | /** |
17585 | * irq_set_affinity_notifier - control notification of IRQ affinity changes | |
17586 | * @irq: Interrupt for which to enable/disable notification | |
b3bbd485 | 17587 | @@ -315,7 +349,12 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) |
e4b2b4a8 JK |
17588 | if (notify) { |
17589 | notify->irq = irq; | |
17590 | kref_init(¬ify->kref); | |
17591 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17592 | + INIT_SWORK(¬ify->swork, irq_affinity_notify); | |
17593 | + init_helper_thread(); | |
17594 | +#else | |
17595 | INIT_WORK(¬ify->work, irq_affinity_notify); | |
17596 | +#endif | |
17597 | } | |
1a6e0f06 | 17598 | |
e4b2b4a8 | 17599 | raw_spin_lock_irqsave(&desc->lock, flags); |
b3bbd485 | 17600 | @@ -883,7 +922,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) |
e4b2b4a8 JK |
17601 | local_bh_disable(); |
17602 | ret = action->thread_fn(action->irq, action->dev_id); | |
17603 | irq_finalize_oneshot(desc, action); | |
17604 | - local_bh_enable(); | |
17605 | + /* | |
17606 | + * Interrupts which have real time requirements can be set up | |
17607 | + * to avoid softirq processing in the thread handler. This is | |
17608 | + * safe as these interrupts do not raise soft interrupts. | |
17609 | + */ | |
17610 | + if (irq_settings_no_softirq_call(desc)) | |
17611 | + _local_bh_enable(); | |
17612 | + else | |
17613 | + local_bh_enable(); | |
17614 | return ret; | |
17615 | } | |
1a6e0f06 | 17616 | |
b3bbd485 | 17617 | @@ -980,6 +1027,12 @@ static int irq_thread(void *data) |
e4b2b4a8 JK |
17618 | if (action_ret == IRQ_WAKE_THREAD) |
17619 | irq_wake_secondary(desc, action); | |
1a6e0f06 | 17620 | |
e4b2b4a8 JK |
17621 | +#ifdef CONFIG_PREEMPT_RT_FULL |
17622 | + migrate_disable(); | |
17623 | + add_interrupt_randomness(action->irq, 0, | |
17624 | + desc->random_ip ^ (unsigned long) action); | |
17625 | + migrate_enable(); | |
17626 | +#endif | |
17627 | wake_threads_waitq(desc); | |
17628 | } | |
1a6e0f06 | 17629 | |
b3bbd485 | 17630 | @@ -1378,6 +1431,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) |
e4b2b4a8 JK |
17631 | irqd_set(&desc->irq_data, IRQD_NO_BALANCING); |
17632 | } | |
1a6e0f06 | 17633 | |
e4b2b4a8 JK |
17634 | + if (new->flags & IRQF_NO_SOFTIRQ_CALL) |
17635 | + irq_settings_set_no_softirq_call(desc); | |
1a6e0f06 | 17636 | + |
e4b2b4a8 JK |
17637 | if (irq_settings_can_autoenable(desc)) { |
17638 | irq_startup(desc, IRQ_RESEND, IRQ_START_COND); | |
17639 | } else { | |
b3bbd485 | 17640 | @@ -2159,7 +2215,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); |
e4b2b4a8 JK |
17641 | * This call sets the internal irqchip state of an interrupt, |
17642 | * depending on the value of @which. | |
1a6e0f06 | 17643 | * |
e4b2b4a8 JK |
17644 | - * This function should be called with preemption disabled if the |
17645 | + * This function should be called with migration disabled if the | |
17646 | * interrupt controller has per-cpu registers. | |
17647 | */ | |
17648 | int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | |
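
The notify rework above is a classic container_of split: one body, _irq_affinity_notify(), plus a thin wrapper per queueing mechanism, each recovering the containing object from a different embedded member. A self-contained user-space sketch of that shape; the structs are reduced to stubs, and only the container_of macro matches the kernel's definition:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };            /* stub, not the kernel type */
struct swork_event { int pending; };            /* stub, not the kernel type */

struct irq_affinity_notify {
        int irq;
        struct work_struct work;                /* queued on !RT */
        struct swork_event swork;               /* queued on RT */
};

/* the shared body both wrappers funnel into */
static void _irq_affinity_notify(struct irq_affinity_notify *notify)
{
        printf("affinity changed for irq %d\n", notify->irq);
}

static void notify_from_work(struct work_struct *work)
{
        _irq_affinity_notify(container_of(work,
                             struct irq_affinity_notify, work));
}

static void notify_from_swork(struct swork_event *swork)
{
        _irq_affinity_notify(container_of(swork,
                             struct irq_affinity_notify, swork));
}

int main(void)
{
        struct irq_affinity_notify n = { .irq = 42 };

        notify_from_work(&n.work);      /* !RT path */
        notify_from_swork(&n.swork);    /* RT path */
        return 0;
}
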
b3bbd485 JK |
17649 | diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h |
17650 | index e43795cd2ccf..47e2f9e23586 100644 | |
17651 | --- a/kernel/irq/settings.h | |
17652 | +++ b/kernel/irq/settings.h | |
17653 | @@ -17,6 +17,7 @@ enum { | |
e4b2b4a8 JK |
17654 | _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, |
17655 | _IRQ_IS_POLLED = IRQ_IS_POLLED, | |
17656 | _IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY, | |
17657 | + _IRQ_NO_SOFTIRQ_CALL = IRQ_NO_SOFTIRQ_CALL, | |
17658 | _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, | |
17659 | }; | |
1a6e0f06 | 17660 | |
b3bbd485 | 17661 | @@ -31,6 +32,7 @@ enum { |
e4b2b4a8 JK |
17662 | #define IRQ_PER_CPU_DEVID GOT_YOU_MORON |
17663 | #define IRQ_IS_POLLED GOT_YOU_MORON | |
17664 | #define IRQ_DISABLE_UNLAZY GOT_YOU_MORON | |
17665 | +#define IRQ_NO_SOFTIRQ_CALL GOT_YOU_MORON | |
17666 | #undef IRQF_MODIFY_MASK | |
17667 | #define IRQF_MODIFY_MASK GOT_YOU_MORON | |
1a6e0f06 | 17668 | |
b3bbd485 | 17669 | @@ -41,6 +43,16 @@ irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) |
e4b2b4a8 JK |
17670 | desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); |
17671 | } | |
1a6e0f06 | 17672 | |
e4b2b4a8 JK |
17673 | +static inline bool irq_settings_no_softirq_call(struct irq_desc *desc) |
17674 | +{ | |
17675 | + return desc->status_use_accessors & _IRQ_NO_SOFTIRQ_CALL; | |
17676 | +} | |
1a6e0f06 | 17677 | + |
e4b2b4a8 JK |
17678 | +static inline void irq_settings_set_no_softirq_call(struct irq_desc *desc) |
17679 | +{ | |
17680 | + desc->status_use_accessors |= _IRQ_NO_SOFTIRQ_CALL; | |
17681 | +} | |
1a6e0f06 | 17682 | + |
e4b2b4a8 JK |
17683 | static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) |
17684 | { | |
17685 | return desc->status_use_accessors & _IRQ_PER_CPU; | |
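
The settings.h additions follow the file's existing pattern exactly: one bit in desc->status_use_accessors, a setter, a predicate, and the raw IRQ_NO_SOFTIRQ_CALL name poisoned to GOT_YOU_MORON so nothing outside the helpers can touch it. A tiny stand-alone sketch of the accessor half, with the flag and struct names invented:

#include <stdbool.h>
#include <stdio.h>

/* illustrative flag bit, mirroring _IRQ_NO_SOFTIRQ_CALL's role */
enum { SIM_NO_SOFTIRQ_CALL = 1u << 0 };

struct desc_sim { unsigned int status; };  /* status_use_accessors stand-in */

static inline void sim_set_no_softirq_call(struct desc_sim *d)
{
        d->status |= SIM_NO_SOFTIRQ_CALL;
}

static inline bool sim_no_softirq_call(const struct desc_sim *d)
{
        return d->status & SIM_NO_SOFTIRQ_CALL;
}

int main(void)
{
        struct desc_sim d = { 0 };

        sim_set_no_softirq_call(&d);       /* IRQF_NO_SOFTIRQ_CALL seen */
        printf("%s\n", sim_no_softirq_call(&d)
               ? "thread handler will use _local_bh_enable()"
               : "thread handler will use local_bh_enable()");
        return 0;
}
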
b3bbd485 JK |
17686 | diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c |
17687 | index 987d7bca4864..75347fb1dfea 100644 | |
17688 | --- a/kernel/irq/spurious.c | |
17689 | +++ b/kernel/irq/spurious.c | |
17690 | @@ -445,6 +445,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); | |
1a6e0f06 | 17691 | |
e4b2b4a8 JK |
17692 | static int __init irqfixup_setup(char *str) |
17693 | { | |
17694 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17695 | + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); | |
17696 | + return 1; | |
17697 | +#endif | |
17698 | irqfixup = 1; | |
17699 | printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); | |
17700 | printk(KERN_WARNING "This may impact system performance.\n"); | |
b3bbd485 | 17701 | @@ -457,6 +461,10 @@ module_param(irqfixup, int, 0644); |
1a6e0f06 | 17702 | |
e4b2b4a8 JK |
17703 | static int __init irqpoll_setup(char *str) |
17704 | { | |
17705 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
17706 | + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT_BASE\n"); | |
17707 | + return 1; | |
17708 | +#endif | |
17709 | irqfixup = 2; | |
17710 | printk(KERN_WARNING "Misrouted IRQ fixup and polling support " | |
17711 | "enabled\n"); | |
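
Both setup hooks take the same shape under CONFIG_PREEMPT_RT_BASE: warn and return 1, which consumes the boot option before any state is touched, since IRQ polling from hard-irq context does not mix with forced-threaded interrupts. A compilable miniature of that guard, assuming a -DCONFIG_PREEMPT_RT_BASE build switch stands in for the real Kconfig symbol:

/* Build with:  cc demo.c                            (option accepted)
 *         or:  cc -DCONFIG_PREEMPT_RT_BASE demo.c   (option refused) */
#include <stdio.h>

static int irqfixup;

static int irqfixup_setup(const char *arg)
{
        (void)arg;
#ifdef CONFIG_PREEMPT_RT_BASE
        fprintf(stderr, "irqfixup not supported with PREEMPT_RT\n");
        return 1;               /* consume the option, change nothing */
#endif
        irqfixup = 1;
        printf("misrouted IRQ fixup enabled\n");
        return 1;
}

int main(void)
{
        irqfixup_setup("irqfixup");
        printf("irqfixup = %d\n", irqfixup);
        return 0;
}
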
b3bbd485 JK |
17712 | diff --git a/kernel/irq_work.c b/kernel/irq_work.c |
17713 | index bcf107ce0854..2899ba0d23d1 100644 | |
17714 | --- a/kernel/irq_work.c | |
17715 | +++ b/kernel/irq_work.c | |
e4b2b4a8 JK |
17716 | @@ -17,6 +17,7 @@ |
17717 | #include <linux/cpu.h> | |
17718 | #include <linux/notifier.h> | |
17719 | #include <linux/smp.h> | |
17720 | +#include <linux/interrupt.h> | |
17721 | #include <asm/processor.h> | |
1a6e0f06 | 17722 | |
1a6e0f06 | 17723 | |
b3bbd485 | 17724 | @@ -65,6 +66,8 @@ void __weak arch_irq_work_raise(void) |
1a6e0f06 | 17725 | */ |
e4b2b4a8 JK |
17726 | bool irq_work_queue_on(struct irq_work *work, int cpu) |
17727 | { | |
17728 | + struct llist_head *list; | |
1a6e0f06 | 17729 | + |
e4b2b4a8 JK |
17730 | /* All work should have been flushed before going offline */ |
17731 | WARN_ON_ONCE(cpu_is_offline(cpu)); | |
1a6e0f06 | 17732 | |
b3bbd485 | 17733 | @@ -75,7 +78,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) |
e4b2b4a8 JK |
17734 | if (!irq_work_claim(work)) |
17735 | return false; | |
17736 | ||
17737 | - if (llist_add(&work->llnode, &per_cpu(raised_list, cpu))) | |
17738 | + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) && !(work->flags & IRQ_WORK_HARD_IRQ)) | |
17739 | + list = &per_cpu(lazy_list, cpu); | |
17740 | + else | |
17741 | + list = &per_cpu(raised_list, cpu); | |
17742 | + | |
17743 | + if (llist_add(&work->llnode, list)) | |
17744 | arch_send_call_function_single_ipi(cpu); | |
c7c16703 | 17745 | |
e4b2b4a8 | 17746 | return true; |
b3bbd485 | 17747 | @@ -86,6 +94,9 @@ EXPORT_SYMBOL_GPL(irq_work_queue_on); |
e4b2b4a8 JK |
17748 | /* Enqueue the irq work @work on the current CPU */ |
17749 | bool irq_work_queue(struct irq_work *work) | |
17750 | { | |
17751 | + struct llist_head *list; | |
17752 | + bool lazy_work, realtime = IS_ENABLED(CONFIG_PREEMPT_RT_FULL); | |
17753 | + | |
17754 | /* Only queue if not already pending */ | |
17755 | if (!irq_work_claim(work)) | |
17756 | return false; | |
b3bbd485 | 17757 | @@ -93,13 +104,15 @@ bool irq_work_queue(struct irq_work *work) |
e4b2b4a8 JK |
17758 | /* Queue the entry and raise the IPI if needed. */ |
17759 | preempt_disable(); | |
c7c16703 | 17760 | |
e4b2b4a8 JK |
17761 | - /* If the work is "lazy", handle it from next tick if any */ |
17762 | - if (work->flags & IRQ_WORK_LAZY) { | |
17763 | - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && | |
17764 | - tick_nohz_tick_stopped()) | |
17765 | - arch_irq_work_raise(); | |
17766 | - } else { | |
17767 | - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) | |
17768 | + lazy_work = work->flags & IRQ_WORK_LAZY; | |
17769 | + | |
17770 | + if (lazy_work || (realtime && !(work->flags & IRQ_WORK_HARD_IRQ))) | |
17771 | + list = this_cpu_ptr(&lazy_list); | |
17772 | + else | |
17773 | + list = this_cpu_ptr(&raised_list); | |
17774 | + | |
17775 | + if (llist_add(&work->llnode, list)) { | |
17776 | + if (!lazy_work || tick_nohz_tick_stopped()) | |
17777 | arch_irq_work_raise(); | |
17778 | } | |
c7c16703 | 17779 | |
b3bbd485 | 17780 | @@ -116,9 +129,8 @@ bool irq_work_needs_cpu(void) |
e4b2b4a8 JK |
17781 | raised = this_cpu_ptr(&raised_list); |
17782 | lazy = this_cpu_ptr(&lazy_list); | |
c7c16703 | 17783 | |
e4b2b4a8 JK |
17784 | - if (llist_empty(raised) || arch_irq_work_has_interrupt()) |
17785 | - if (llist_empty(lazy)) | |
17786 | - return false; | |
17787 | + if (llist_empty(raised) && llist_empty(lazy)) | |
17788 | + return false; | |
c7c16703 | 17789 | |
e4b2b4a8 JK |
17790 | /* All work should have been flushed before going offline */ |
17791 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | |
b3bbd485 | 17792 | @@ -132,7 +144,7 @@ static void irq_work_run_list(struct llist_head *list) |
e4b2b4a8 JK |
17793 | struct irq_work *work; |
17794 | struct llist_node *llnode; | |
c7c16703 | 17795 | |
e4b2b4a8 JK |
17796 | - BUG_ON(!irqs_disabled()); |
17797 | + BUG_ON_NONRT(!irqs_disabled()); | |
c7c16703 | 17798 | |
e4b2b4a8 JK |
17799 | if (llist_empty(list)) |
17800 | return; | |
b3bbd485 | 17801 | @@ -169,7 +181,16 @@ static void irq_work_run_list(struct llist_head *list) |
e4b2b4a8 | 17802 | void irq_work_run(void) |
c7c16703 | 17803 | { |
e4b2b4a8 JK |
17804 | irq_work_run_list(this_cpu_ptr(&raised_list)); |
17805 | - irq_work_run_list(this_cpu_ptr(&lazy_list)); | |
17806 | + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) { | |
17807 | + /* | |
17808 | + * NOTE: we raise softirq via IPI for safety, | |
17809 | + * and execute in irq_work_tick() to move the | |
17810 | + * overhead from hard to soft irq context. | |
17811 | + */ | |
17812 | + if (!llist_empty(this_cpu_ptr(&lazy_list))) | |
17813 | + raise_softirq(TIMER_SOFTIRQ); | |
17814 | + } else | |
17815 | + irq_work_run_list(this_cpu_ptr(&lazy_list)); | |
c7c16703 | 17816 | } |
e4b2b4a8 | 17817 | EXPORT_SYMBOL_GPL(irq_work_run); |
c7c16703 | 17818 | |
b3bbd485 | 17819 | @@ -179,8 +200,17 @@ void irq_work_tick(void) |
1a6e0f06 | 17820 | |
e4b2b4a8 JK |
17821 | if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) |
17822 | irq_work_run_list(raised); | |
1a6e0f06 | 17823 | + |
e4b2b4a8 JK |
17824 | + if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) |
17825 | + irq_work_run_list(this_cpu_ptr(&lazy_list)); | |
17826 | +} | |
1a6e0f06 | 17827 | + |
e4b2b4a8 JK |
17828 | +#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_PREEMPT_RT_FULL) |
17829 | +void irq_work_tick_soft(void) | |
17830 | +{ | |
17831 | irq_work_run_list(this_cpu_ptr(&lazy_list)); | |
17832 | } | |
1a6e0f06 | 17833 | +#endif |
1a6e0f06 | 17834 | |
e4b2b4a8 JK |
17835 | /* |
17836 | * Synchronize against the irq_work @entry, ensures the entry is not | |
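
The queueing rework makes the routing rule explicit: lazy work, and on RT any work not flagged IRQ_WORK_HARD_IRQ, lands on the lazy list and is run from softirq context (via the TIMER_SOFTIRQ raise above) instead of hard-irq context. A single-threaded user-space sketch of just that routing decision, with plain linked lists standing in for llist and all names invented:

#include <stdbool.h>
#include <stdio.h>

#define SIM_WORK_LAZY     (1u << 0)
#define SIM_WORK_HARD_IRQ (1u << 1)

struct sim_work {
        unsigned int flags;
        const char *name;
        struct sim_work *next;
};

/* per-CPU llists in the kernel; plain globals are enough here */
static struct sim_work *raised_list, *lazy_list;
static const bool rt_full = true;  /* IS_ENABLED(CONFIG_PREEMPT_RT_FULL) */

static void sim_queue(struct sim_work *w)
{
        struct sim_work **list;

        /* Lazy work, and on RT anything not marked hard-irq safe,
         * goes to the lazy list and runs from softirq context. */
        if ((w->flags & SIM_WORK_LAZY) ||
            (rt_full && !(w->flags & SIM_WORK_HARD_IRQ)))
                list = &lazy_list;
        else
                list = &raised_list;

        w->next = *list;
        *list = w;
}

static void sim_run(const char *ctx, struct sim_work *list)
{
        for (; list; list = list->next)
                printf("%s runs '%s'\n", ctx, list->name);
}

int main(void)
{
        struct sim_work hard = { SIM_WORK_HARD_IRQ, "hard", NULL };
        struct sim_work dflt = { 0, "default", NULL };

        sim_queue(&hard);
        sim_queue(&dflt);
        sim_run("hardirq", raised_list);   /* only 'hard' stays here on RT */
        sim_run("softirq", lazy_list);     /* 'default' was deferred */
        return 0;
}

The design choice is visible in the output: on RT the hard-irq pass touches only work that explicitly declared itself hard-irq safe, so the raw-interrupt path stays bounded.
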
b3bbd485 JK |
17837 | diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c |
17838 | index 46ba853656f6..9a23632b6294 100644 | |
17839 | --- a/kernel/ksysfs.c | |
17840 | +++ b/kernel/ksysfs.c | |
17841 | @@ -140,6 +140,15 @@ KERNEL_ATTR_RO(vmcoreinfo); | |
1a6e0f06 | 17842 | |
e4b2b4a8 | 17843 | #endif /* CONFIG_CRASH_CORE */ |
1a6e0f06 | 17844 | |
e4b2b4a8 JK |
17845 | +#if defined(CONFIG_PREEMPT_RT_FULL) |
17846 | +static ssize_t realtime_show(struct kobject *kobj, | |
17847 | + struct kobj_attribute *attr, char *buf) | |
17848 | +{ | |
17849 | + return sprintf(buf, "%d\n", 1); | |
17850 | +} | |
17851 | +KERNEL_ATTR_RO(realtime); | |
17852 | +#endif | |
17853 | + | |
17854 | /* whether file capabilities are enabled */ | |
17855 | static ssize_t fscaps_show(struct kobject *kobj, | |
17856 | struct kobj_attribute *attr, char *buf) | |
b3bbd485 JK |
17857 | @@ -230,6 +239,9 @@ static struct attribute * kernel_attrs[] = { |
17858 | #ifndef CONFIG_TINY_RCU | |
e4b2b4a8 JK |
17859 | &rcu_expedited_attr.attr, |
17860 | &rcu_normal_attr.attr, | |
b3bbd485 | 17861 | +#endif |
e4b2b4a8 JK |
17862 | +#ifdef CONFIG_PREEMPT_RT_FULL |
17863 | + &realtime_attr.attr, | |
b3bbd485 | 17864 | #endif |
e4b2b4a8 JK |
17865 | NULL |
17866 | }; | |
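
The new attribute gives user space a stable probe: on a CONFIG_PREEMPT_RT_FULL kernel carrying this patch, /sys/kernel/realtime reads as 1, and the file does not exist otherwise. A small consumer, assuming only that an absent file means a non-RT kernel:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/kernel/realtime", "r");
        int rt = 0;

        if (f) {
                if (fscanf(f, "%d", &rt) != 1)
                        rt = 0;
                fclose(f);
        }
        printf("PREEMPT_RT_FULL kernel: %s\n", rt ? "yes" : "no");
        return 0;
}
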
b3bbd485 JK |
17867 | diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile |
17868 | index 392c7f23af76..c0bf04b6b965 100644 | |
17869 | --- a/kernel/locking/Makefile | |
17870 | +++ b/kernel/locking/Makefile | |
e4b2b4a8 JK |
17871 | @@ -3,7 +3,7 @@ |
17872 | # and is generally not a function of system call inputs. | |
17873 | KCOV_INSTRUMENT := n | |
1a6e0f06 | 17874 | |
e4b2b4a8 JK |
17875 | -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o |
17876 | +obj-y += semaphore.o percpu-rwsem.o | |
1a6e0f06 | 17877 | |
e4b2b4a8 JK |
17878 | ifdef CONFIG_FUNCTION_TRACER |
17879 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) | |
b3bbd485 | 17880 | @@ -12,7 +12,11 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) |
e4b2b4a8 JK |
17881 | CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) |
17882 | endif | |
17883 | ||
17884 | +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) | |
17885 | +obj-y += mutex.o | |
17886 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | |
17887 | +endif | |
17888 | +obj-y += rwsem.o | |
17889 | obj-$(CONFIG_LOCKDEP) += lockdep.o | |
17890 | ifeq ($(CONFIG_PROC_FS),y) | |
17891 | obj-$(CONFIG_LOCKDEP) += lockdep_proc.o | |
b3bbd485 | 17892 | @@ -25,8 +29,11 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o |
e4b2b4a8 JK |
17893 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o |
17894 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | |
17895 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o | |
17896 | +ifneq ($(CONFIG_PREEMPT_RT_FULL),y) | |
17897 | obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o | |
17898 | obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o | |
17899 | +endif | |
17900 | +obj-$(CONFIG_PREEMPT_RT_FULL) += mutex-rt.o rwsem-rt.o rwlock-rt.o | |
17901 | obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o | |
17902 | obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o | |
17903 | obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o | |
b3bbd485 JK |
17904 | diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c |
17905 | index d7c155048ea9..def51a27f20f 100644 | |
17906 | --- a/kernel/locking/lockdep.c | |
17907 | +++ b/kernel/locking/lockdep.c | |
17908 | @@ -3914,6 +3914,7 @@ static void check_flags(unsigned long flags) | |
17909 | } | |
17910 | } | |
17911 | ||
17912 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
17913 | /* | |
17914 | * We dont accurately track softirq state in e.g. | |
17915 | * hardirq contexts (such as on 4KSTACKS), so only | |
17916 | @@ -3928,6 +3929,7 @@ static void check_flags(unsigned long flags) | |
17917 | DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); | |
17918 | } | |
17919 | } | |
17920 | +#endif | |
17921 | ||
17922 | if (!debug_locks) | |
17923 | print_irqtrace_events(current); | |
17924 | diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c | |
17925 | index 6dca260eeccf..5d01ac590d4c 100644 | |
17926 | --- a/kernel/locking/locktorture.c | |
17927 | +++ b/kernel/locking/locktorture.c | |
17928 | @@ -26,7 +26,6 @@ | |
17929 | #include <linux/kthread.h> | |
17930 | #include <linux/sched/rt.h> | |
17931 | #include <linux/spinlock.h> | |
17932 | -#include <linux/rwlock.h> | |
17933 | #include <linux/mutex.h> | |
17934 | #include <linux/rwsem.h> | |
17935 | #include <linux/smp.h> | |
17936 | diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c | |
17937 | new file mode 100644 | |
17938 | index 000000000000..4f81595c0f52 | |
17939 | --- /dev/null | |
17940 | +++ b/kernel/locking/mutex-rt.c | |
e4b2b4a8 JK |
17941 | @@ -0,0 +1,223 @@ |
17942 | +/* | |
17943 | + * kernel/rt.c | |
17944 | + * | |
17945 | + * Real-Time Preemption Support | |
17946 | + * | |
17947 | + * started by Ingo Molnar: | |
17948 | + * | |
17949 | + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | |
17950 | + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | |
17951 | + * | |
17952 | + * historic credit for proving that Linux spinlocks can be implemented via | |
17953 | + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow | |
17954 | + * and others) who prototyped it on 2.4 and did lots of comparative | |
17955 | + * research and analysis; TimeSys, for proving that you can implement a | |
17956 | + * fully preemptible kernel via the use of IRQ threading and mutexes; | |
17957 | + * Bill Huey for persuasively arguing on lkml that the mutex model is the | |
17958 | + * right one; and to MontaVista, who ported pmutexes to 2.6. | |
17959 | + * | |
17960 | + * This code is a from-scratch implementation and is not based on pmutexes, | |
17961 | + * but the idea of converting spinlocks to mutexes is used here too. | |
17962 | + * | |
17963 | + * lock debugging, locking tree, deadlock detection: | |
17964 | + * | |
17965 | + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey | |
17966 | + * Released under the General Public License (GPL). | |
17967 | + * | |
17968 | + * Includes portions of the generic R/W semaphore implementation from: | |
17969 | + * | |
17970 | + * Copyright (c) 2001 David Howells (dhowells@redhat.com). | |
17971 | + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> | |
17972 | + * - Derived also from comments by Linus | |
17973 | + * | |
17974 | + * Pending ownership of locks and ownership stealing: | |
17975 | + * | |
17976 | + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt | |
17977 | + * | |
17978 | + * (also by Steven Rostedt) | |
17979 | + * - Converted single pi_lock to individual task locks. | |
17980 | + * | |
17981 | + * By Esben Nielsen: | |
17982 | + * Doing priority inheritance with the help of the scheduler. |
17983 | + * | |
17984 | + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | |
17985 | + * - major rework based on Esben Nielsen's initial patch |
17986 | + * - replaced thread_info references by task_struct refs | |
17987 | + * - removed task->pending_owner dependency | |
17988 | + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks | |
17989 | + * in the scheduler return path as discussed with Steven Rostedt | |
17990 | + * | |
17991 | + * Copyright (C) 2006, Kihon Technologies Inc. | |
17992 | + * Steven Rostedt <rostedt@goodmis.org> | |
17993 | + * - debugged and patched Thomas Gleixner's rework. | |
17994 | + * - added back the cmpxchg to the rework. | |
17995 | + * - turned atomic require back on for SMP. | |
17996 | + */ | |
17997 | + | |
17998 | +#include <linux/spinlock.h> | |
17999 | +#include <linux/rtmutex.h> | |
18000 | +#include <linux/sched.h> | |
18001 | +#include <linux/delay.h> | |
18002 | +#include <linux/module.h> | |
18003 | +#include <linux/kallsyms.h> | |
18004 | +#include <linux/syscalls.h> | |
18005 | +#include <linux/interrupt.h> | |
18006 | +#include <linux/plist.h> | |
18007 | +#include <linux/fs.h> | |
18008 | +#include <linux/futex.h> | |
18009 | +#include <linux/hrtimer.h> | |
18010 | + | |
18011 | +#include "rtmutex_common.h" | |
18012 | + | |
18013 | +/* | |
18014 | + * struct mutex functions | |
18015 | + */ | |
18016 | +void __mutex_do_init(struct mutex *mutex, const char *name, | |
18017 | + struct lock_class_key *key) | |
18018 | +{ | |
18019 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
18020 | + /* | |
18021 | + * Make sure we are not reinitializing a held lock: | |
18022 | + */ | |
18023 | + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); | |
18024 | + lockdep_init_map(&mutex->dep_map, name, key, 0); | |
1a6e0f06 | 18025 | +#endif |
e4b2b4a8 JK |
18026 | + mutex->lock.save_state = 0; |
18027 | +} | |
18028 | +EXPORT_SYMBOL(__mutex_do_init); | |
18029 | + | |
18030 | +void __lockfunc _mutex_lock(struct mutex *lock) | |
18031 | +{ | |
18032 | + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); | |
18033 | + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); | |
18034 | +} | |
18035 | +EXPORT_SYMBOL(_mutex_lock); | |
18036 | + | |
18037 | +void __lockfunc _mutex_lock_io(struct mutex *lock) | |
18038 | +{ | |
18039 | + int token; | |
18040 | + | |
18041 | + token = io_schedule_prepare(); | |
18042 | + _mutex_lock(lock); | |
18043 | + io_schedule_finish(token); | |
18044 | +} | |
18045 | +EXPORT_SYMBOL_GPL(_mutex_lock_io); | |
18046 | + | |
18047 | +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) | |
18048 | +{ | |
18049 | + int ret; | |
18050 | + | |
18051 | + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); | |
18052 | + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE); | |
18053 | + if (ret) | |
18054 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
18055 | + return ret; | |
18056 | +} | |
18057 | +EXPORT_SYMBOL(_mutex_lock_interruptible); | |
18058 | + | |
18059 | +int __lockfunc _mutex_lock_killable(struct mutex *lock) | |
18060 | +{ | |
18061 | + int ret; | |
18062 | + | |
18063 | + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); | |
18064 | + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE); | |
18065 | + if (ret) | |
18066 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
18067 | + return ret; | |
18068 | +} | |
18069 | +EXPORT_SYMBOL(_mutex_lock_killable); | |
18070 | + | |
18071 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
18072 | +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) | |
18073 | +{ | |
18074 | + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); | |
18075 | + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); | |
18076 | +} | |
18077 | +EXPORT_SYMBOL(_mutex_lock_nested); | |
1a6e0f06 | 18078 | + |
e4b2b4a8 JK |
18079 | +void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) |
18080 | +{ | |
18081 | + int token; | |
1f39f580 | 18082 | + |
e4b2b4a8 | 18083 | + token = io_schedule_prepare(); |
1a6e0f06 | 18084 | + |
e4b2b4a8 JK |
18085 | + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); |
18086 | + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); | |
1a6e0f06 | 18087 | + |
e4b2b4a8 JK |
18088 | + io_schedule_finish(token); |
18089 | +} | |
18090 | +EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); | |
1a6e0f06 | 18091 | + |
e4b2b4a8 JK |
18092 | +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) |
18093 | +{ | |
18094 | + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); | |
18095 | + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); | |
18096 | +} | |
18097 | +EXPORT_SYMBOL(_mutex_lock_nest_lock); | |
1a6e0f06 | 18098 | + |
e4b2b4a8 | 18099 | +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) |
1a6e0f06 | 18100 | +{ |
e4b2b4a8 JK |
18101 | + int ret; |
18102 | + | |
18103 | + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); | |
18104 | + ret = __rt_mutex_lock_state(&lock->lock, TASK_INTERRUPTIBLE); | |
18105 | + if (ret) | |
18106 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
18107 | + return ret; | |
1a6e0f06 | 18108 | +} |
e4b2b4a8 JK |
18109 | +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); |
18110 | + | |
18111 | +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) | |
18112 | +{ | |
18113 | + int ret; | |
18114 | + | |
18115 | + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | |
18116 | + ret = __rt_mutex_lock_state(&lock->lock, TASK_KILLABLE); | |
18117 | + if (ret) | |
18118 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
18119 | + return ret; | |
18120 | +} | |
18121 | +EXPORT_SYMBOL(_mutex_lock_killable_nested); | |
1a6e0f06 | 18122 | +#endif |
e4b2b4a8 JK |
18123 | + |
18124 | +int __lockfunc _mutex_trylock(struct mutex *lock) | |
18125 | +{ | |
18126 | + int ret = __rt_mutex_trylock(&lock->lock); | |
18127 | + | |
18128 | + if (ret) | |
18129 | + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | |
18130 | + | |
18131 | + return ret; | |
18132 | +} | |
18133 | +EXPORT_SYMBOL(_mutex_trylock); | |
18134 | + | |
18135 | +void __lockfunc _mutex_unlock(struct mutex *lock) | |
18136 | +{ | |
18137 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
18138 | + __rt_mutex_unlock(&lock->lock); | |
18139 | +} | |
18140 | +EXPORT_SYMBOL(_mutex_unlock); | |
18141 | + | |
18142 | +/** | |
18143 | + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 | |
18144 | + * @cnt: the atomic which we are to dec | |
18145 | + * @lock: the mutex to return holding if we dec to 0 | |
18146 | + * | |
18147 | + * return true and hold lock if we dec to 0, return false otherwise | |
18148 | + */ | |
18149 | +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) | |
18150 | +{ | |
18151 | + /* dec if we can't possibly hit 0 */ | |
18152 | + if (atomic_add_unless(cnt, -1, 1)) | |
18153 | + return 0; | |
18154 | + /* we might hit 0, so take the lock */ | |
18155 | + mutex_lock(lock); | |
18156 | + if (!atomic_dec_and_test(cnt)) { | |
18157 | + /* when we actually did the dec, we didn't hit 0 */ | |
18158 | + mutex_unlock(lock); | |
18159 | + return 0; | |
18160 | + } | |
18161 | + /* we hit 0, and we hold the lock */ | |
18162 | + return 1; | |
18163 | +} | |
18164 | +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); | |
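
atomic_dec_and_mutex_lock() above is the classic dec-and-lock idiom: decrement lock-free while the counter cannot possibly reach zero, and take the mutex only for the final reference. A user-space rendering with C11 atomics and pthreads; dec_unless_one() is a hand-rolled stand-in for the kernel's atomic_add_unless(), not a library call:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int cnt = 3;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Decrement unless the value is 1; report whether we decremented. */
static int dec_unless_one(atomic_int *v)
{
        int old = atomic_load(v);

        while (old != 1)
                if (atomic_compare_exchange_weak(v, &old, old - 1))
                        return 1;
        return 0;
}

/* Returns 1 holding the lock iff the count dropped to zero. */
static int dec_and_lock(atomic_int *v, pthread_mutex_t *m)
{
        if (dec_unless_one(v))          /* fast path: cannot hit zero */
                return 0;
        pthread_mutex_lock(m);          /* slow path: might hit zero */
        if (atomic_fetch_sub(v, 1) != 1) {
                pthread_mutex_unlock(m);
                return 0;
        }
        return 1;                       /* we hit zero and hold the lock */
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                if (dec_and_lock(&cnt, &lock)) {
                        printf("final reference dropped under the lock\n");
                        pthread_mutex_unlock(&lock);
                }
        }
        return 0;
}

Only the transition to zero pays for the mutex; every other decrement stays on the lock-free fast path.
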
b3bbd485 JK |
18165 | diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c |
18166 | index 4ad35718f123..08e233b7dc21 100644 | |
18167 | --- a/kernel/locking/rtmutex.c | |
18168 | +++ b/kernel/locking/rtmutex.c | |
e4b2b4a8 JK |
18169 | @@ -7,6 +7,11 @@ |
18170 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | |
18171 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | |
18172 | * Copyright (C) 2006 Esben Nielsen | |
18173 | + * Adaptive Spinlocks: | |
18174 | + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, | |
18175 | + * and Peter Morreale, | |
18176 | + * Adaptive Spinlocks simplification: | |
18177 | + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> | |
1a6e0f06 | 18178 | * |
e4b2b4a8 JK |
18179 | * See Documentation/locking/rt-mutex-design.txt for details. |
18180 | */ | |
18181 | @@ -18,6 +23,8 @@ | |
18182 | #include <linux/sched/wake_q.h> | |
18183 | #include <linux/sched/debug.h> | |
18184 | #include <linux/timer.h> | |
18185 | +#include <linux/ww_mutex.h> | |
18186 | +#include <linux/blkdev.h> | |
18187 | ||
18188 | #include "rtmutex_common.h" | |
18189 | ||
b3bbd485 | 18190 | @@ -135,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) |
e4b2b4a8 | 18191 | WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); |
1a6e0f06 JK |
18192 | } |
18193 | ||
e4b2b4a8 JK |
18194 | +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) |
18195 | +{ | |
18196 | + return waiter && waiter != PI_WAKEUP_INPROGRESS && | |
18197 | + waiter != PI_REQUEUE_INPROGRESS; | |
18198 | +} | |
18199 | + | |
1a6e0f06 | 18200 | /* |
e4b2b4a8 JK |
18201 | * We can speed up the acquire/release, if there's no debugging state to be |
18202 | * set up. | |
b3bbd485 | 18203 | @@ -228,7 +241,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, |
e4b2b4a8 | 18204 | * Only use with rt_mutex_waiter_{less,equal}() |
1a6e0f06 | 18205 | */ |
e4b2b4a8 JK |
18206 | #define task_to_waiter(p) \ |
18207 | - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } | |
18208 | + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) } | |
1a6e0f06 | 18209 | |
e4b2b4a8 JK |
18210 | static inline int |
18211 | rt_mutex_waiter_less(struct rt_mutex_waiter *left, | |
b3bbd485 | 18212 | @@ -268,6 +281,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, |
e4b2b4a8 | 18213 | return 1; |
1a6e0f06 JK |
18214 | } |
18215 | ||
e4b2b4a8 JK |
18216 | +#define STEAL_NORMAL 0 |
18217 | +#define STEAL_LATERAL 1 | |
18218 | + | |
18219 | +static inline int | |
18220 | +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode) | |
18221 | +{ | |
18222 | + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); | |
18223 | + | |
18224 | + if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter)) | |
18225 | + return 1; | |
18226 | + | |
18227 | + /* | |
18228 | + * Note that RT tasks are excluded from lateral-steals | |
18229 | + * to prevent the introduction of an unbounded latency. | |
18230 | + */ | |
18231 | + if (mode == STEAL_NORMAL || rt_task(waiter->task)) | |
18232 | + return 0; | |
18233 | + | |
18234 | + return rt_mutex_waiter_equal(waiter, top_waiter); | |
18235 | +} | |
18236 | + | |
18237 | static void | |
18238 | rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) | |
18239 | { | |
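
rt_mutex_steal() above folds both acquisition checks into one predicate: a strictly higher-priority waiter may always take the lock, an equal-priority peer only when lateral steals are allowed, and RT tasks never steal laterally, so the top waiter cannot suffer unbounded latency. A simplified stand-alone model with priorities as bare integers (lower value means more urgent) and the waiter == top_waiter identity case left out:

#include <stdbool.h>
#include <stdio.h>

#define STEAL_NORMAL  0
#define STEAL_LATERAL 1

struct waiter { int prio; bool rt_task; };

/* May 'w' take the lock ahead of the current top waiter? */
static bool may_steal(const struct waiter *w, const struct waiter *top,
                      int mode)
{
        if (w->prio < top->prio)        /* strictly more urgent: always */
                return true;
        /* Lateral steal: equal priority is good enough, but RT tasks
         * are excluded so they cannot starve the top waiter. */
        if (mode == STEAL_NORMAL || w->rt_task)
                return false;
        return w->prio == top->prio;
}

int main(void)
{
        struct waiter top = { .prio = 120, .rt_task = false };
        struct waiter eq  = { .prio = 120, .rt_task = false };
        struct waiter rt  = { .prio = 120, .rt_task = true  };

        printf("equal, normal : %d\n", may_steal(&eq, &top, STEAL_NORMAL));  /* 0 */
        printf("equal, lateral: %d\n", may_steal(&eq, &top, STEAL_LATERAL)); /* 1 */
        printf("rt,    lateral: %d\n", may_steal(&rt, &top, STEAL_LATERAL)); /* 0 */
        return 0;
}
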
b3bbd485 | 18240 | @@ -372,6 +406,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, |
e4b2b4a8 | 18241 | return debug_rt_mutex_detect_deadlock(waiter, chwalk); |
1a6e0f06 JK |
18242 | } |
18243 | ||
e4b2b4a8 JK |
18244 | +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) |
18245 | +{ | |
18246 | + if (waiter->savestate) | |
18247 | + wake_up_lock_sleeper(waiter->task); | |
18248 | + else | |
18249 | + wake_up_process(waiter->task); | |
18250 | +} | |
18251 | + | |
18252 | /* | |
18253 | * Max number of times we'll walk the boosting chain: | |
18254 | */ | |
b3bbd485 | 18255 | @@ -379,7 +421,8 @@ int max_lock_depth = 1024; |
1a6e0f06 | 18256 | |
e4b2b4a8 JK |
18257 | static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) |
18258 | { | |
18259 | - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; | |
18260 | + return rt_mutex_real_waiter(p->pi_blocked_on) ? | |
18261 | + p->pi_blocked_on->lock : NULL; | |
18262 | } | |
1a6e0f06 | 18263 | |
e4b2b4a8 | 18264 | /* |
b3bbd485 | 18265 | @@ -515,7 +558,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
e4b2b4a8 JK |
18266 | * reached or the state of the chain has changed while we |
18267 | * dropped the locks. | |
18268 | */ | |
18269 | - if (!waiter) | |
18270 | + if (!rt_mutex_real_waiter(waiter)) | |
18271 | goto out_unlock_pi; | |
1a6e0f06 | 18272 | |
e4b2b4a8 | 18273 | /* |
b3bbd485 | 18274 | @@ -696,13 +739,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
e4b2b4a8 JK |
18275 | * follow here. This is the end of the chain we are walking. |
18276 | */ | |
18277 | if (!rt_mutex_owner(lock)) { | |
18278 | + struct rt_mutex_waiter *lock_top_waiter; | |
1a6e0f06 | 18279 | + |
e4b2b4a8 JK |
18280 | /* |
18281 | * If the requeue [7] above changed the top waiter, | |
18282 | * then we need to wake the new top waiter up to try | |
18283 | * to get the lock. | |
18284 | */ | |
18285 | - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) | |
18286 | - wake_up_process(rt_mutex_top_waiter(lock)->task); | |
18287 | + lock_top_waiter = rt_mutex_top_waiter(lock); | |
18288 | + if (prerequeue_top_waiter != lock_top_waiter) | |
18289 | + rt_mutex_wake_waiter(lock_top_waiter); | |
18290 | raw_spin_unlock_irq(&lock->wait_lock); | |
18291 | return 0; | |
18292 | } | |
b3bbd485 | 18293 | @@ -804,9 +850,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, |
e4b2b4a8 JK |
18294 | * @task: The task which wants to acquire the lock |
18295 | * @waiter: The waiter that is queued to the lock's wait tree if the | |
18296 | * callsite called task_blocked_on_lock(), otherwise NULL | |
18297 | + * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL) | |
18298 | */ | |
18299 | -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |
18300 | - struct rt_mutex_waiter *waiter) | |
18301 | +static int __try_to_take_rt_mutex(struct rt_mutex *lock, | |
18302 | + struct task_struct *task, | |
18303 | + struct rt_mutex_waiter *waiter, int mode) | |
18304 | { | |
18305 | lockdep_assert_held(&lock->wait_lock); | |
1a6e0f06 | 18306 | |
b3bbd485 | 18307 | @@ -842,12 +890,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
e4b2b4a8 JK |
18308 | */ |
18309 | if (waiter) { | |
18310 | /* | |
18311 | - * If waiter is not the highest priority waiter of | |
18312 | - * @lock, give up. | |
18313 | + * If waiter is not the highest priority waiter of @lock, | |
18314 | + * or its peer when lateral steal is allowed, give up. | |
18315 | */ | |
18316 | - if (waiter != rt_mutex_top_waiter(lock)) | |
18317 | + if (!rt_mutex_steal(lock, waiter, mode)) | |
18318 | return 0; | |
18319 | - | |
18320 | /* | |
18321 | * We can acquire the lock. Remove the waiter from the | |
18322 | * lock waiters tree. | |
b3bbd485 | 18323 | @@ -865,14 +912,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
e4b2b4a8 JK |
18324 | */ |
18325 | if (rt_mutex_has_waiters(lock)) { | |
18326 | /* | |
18327 | - * If @task->prio is greater than or equal to | |
18328 | - * the top waiter priority (kernel view), | |
18329 | - * @task lost. | |
18330 | + * If @task->prio is greater than the top waiter | |
18331 | + * priority (kernel view), or equal to it when a | |
18332 | + * lateral steal is forbidden, @task lost. | |
18333 | */ | |
18334 | - if (!rt_mutex_waiter_less(task_to_waiter(task), | |
18335 | - rt_mutex_top_waiter(lock))) | |
18336 | + if (!rt_mutex_steal(lock, task_to_waiter(task), mode)) | |
18337 | return 0; | |
18338 | - | |
18339 | /* | |
18340 | * The current top waiter stays enqueued. We | |
18341 | * don't have to change anything in the lock | |
b3bbd485 | 18342 | @@ -919,6 +964,351 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, |
e4b2b4a8 JK |
18343 | return 1; |
18344 | } | |
1a6e0f06 | 18345 | |
e4b2b4a8 JK |
18346 | +#ifdef CONFIG_PREEMPT_RT_FULL |
18347 | +/* | |
18348 | + * preemptible spin_lock functions: | |
18349 | + */ | |
18350 | +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, | |
18351 | + void (*slowfn)(struct rt_mutex *lock)) | |
18352 | +{ | |
18353 | + might_sleep_no_state_check(); | |
18354 | + | |
18355 | + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) | |
18356 | + return; | |
18357 | + else | |
18358 | + slowfn(lock); | |
18359 | +} | |
18360 | + | |
18361 | +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, | |
18362 | + void (*slowfn)(struct rt_mutex *lock)) | |
18363 | +{ | |
18364 | + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) | |
18365 | + return; | |
18366 | + else | |
18367 | + slowfn(lock); | |
18368 | +} | |
18369 | +#ifdef CONFIG_SMP | |
18370 | +/* | |
18371 | + * Note that owner is a speculative pointer and dereferencing relies | |
18372 | + * on rcu_read_lock() and the check against the lock owner. | |
18373 | + */ | |
18374 | +static int adaptive_wait(struct rt_mutex *lock, | |
18375 | + struct task_struct *owner) | |
18376 | +{ | |
18377 | + int res = 0; | |
18378 | + | |
18379 | + rcu_read_lock(); | |
18380 | + for (;;) { | |
18381 | + if (owner != rt_mutex_owner(lock)) | |
18382 | + break; | |
18383 | + /* | |
18384 | + * Ensure that owner->on_cpu is dereferenced _after_ | |
18385 | + * checking the above to be valid. | |
18386 | + */ | |
18387 | + barrier(); | |
18388 | + if (!owner->on_cpu) { | |
18389 | + res = 1; | |
18390 | + break; | |
18391 | + } | |
18392 | + cpu_relax(); | |
18393 | + } | |
18394 | + rcu_read_unlock(); | |
18395 | + return res; | |
18396 | +} | |
18397 | +#else | |
18398 | +static int adaptive_wait(struct rt_mutex *lock, | |
18399 | + struct task_struct *orig_owner) | |
18400 | +{ | |
18401 | + return 1; | |
18402 | +} | |
1a6e0f06 JK |
18403 | +#endif |
18404 | + | |
e4b2b4a8 JK |
18405 | +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
18406 | + struct rt_mutex_waiter *waiter, | |
18407 | + struct task_struct *task, | |
18408 | + enum rtmutex_chainwalk chwalk); | |
18409 | +/* | |
18410 | + * Slow path lock function spin_lock style: this variant is very | |
18411 | + * careful not to miss any non-lock wakeups. | |
18412 | + * | |
18413 | + * We store the current state under p->pi_lock in p->saved_state and | |
18414 | + * the try_to_wake_up() code handles this accordingly. | |
18415 | + */ | |
18416 | +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, | |
18417 | + struct rt_mutex_waiter *waiter, | |
18418 | + unsigned long flags) | |
18419 | +{ | |
18420 | + struct task_struct *lock_owner, *self = current; | |
18421 | + struct rt_mutex_waiter *top_waiter; | |
18422 | + int ret; | |
1a6e0f06 | 18423 | + |
e4b2b4a8 JK |
18424 | + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) |
18425 | + return; | |
1a6e0f06 | 18426 | + |
e4b2b4a8 | 18427 | + BUG_ON(rt_mutex_owner(lock) == self); |
1a6e0f06 | 18428 | + |
e4b2b4a8 JK |
18429 | + /* |
18430 | + * We save whatever state the task is in and we'll restore it | |
18431 | + * after acquiring the lock taking real wakeups into account | |
18432 | + * as well. We are serialized via pi_lock against wakeups. See | |
18433 | + * try_to_wake_up(). | |
18434 | + */ | |
18435 | + raw_spin_lock(&self->pi_lock); | |
18436 | + self->saved_state = self->state; | |
18437 | + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); | |
18438 | + raw_spin_unlock(&self->pi_lock); | |
1a6e0f06 | 18439 | + |
e4b2b4a8 JK |
18440 | + ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK); |
18441 | + BUG_ON(ret); | |
1a6e0f06 | 18442 | + |
e4b2b4a8 JK |
18443 | + for (;;) { |
18444 | + /* Try to acquire the lock again. */ | |
18445 | + if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL)) | |
18446 | + break; | |
1a6e0f06 | 18447 | + |
e4b2b4a8 JK |
18448 | + top_waiter = rt_mutex_top_waiter(lock); |
18449 | + lock_owner = rt_mutex_owner(lock); | |
1a6e0f06 | 18450 | + |
e4b2b4a8 | 18451 | + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); |
1a6e0f06 | 18452 | + |
e4b2b4a8 | 18453 | + debug_rt_mutex_print_deadlock(waiter); |
1a6e0f06 | 18454 | + |
e4b2b4a8 JK |
18455 | + if (top_waiter != waiter || adaptive_wait(lock, lock_owner)) |
18456 | + schedule(); | |
1a6e0f06 | 18457 | + |
e4b2b4a8 | 18458 | + raw_spin_lock_irqsave(&lock->wait_lock, flags); |
1a6e0f06 | 18459 | + |
e4b2b4a8 JK |
18460 | + raw_spin_lock(&self->pi_lock); |
18461 | + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); | |
18462 | + raw_spin_unlock(&self->pi_lock); | |
18463 | + } | |
1a6e0f06 | 18464 | + |
e4b2b4a8 JK |
18465 | + /* |
18466 | + * Restore the task state to current->saved_state. We set it | |
18467 | + * to the original state above and the try_to_wake_up() code | |
18468 | + * has possibly updated it when a real (non-rtmutex) wakeup | |
18469 | + * happened while we were blocked. Clear saved_state so | |
18470 | + * try_to_wake_up() does not get confused. |
18471 | + */ | |
18472 | + raw_spin_lock(&self->pi_lock); | |
18473 | + __set_current_state_no_track(self->saved_state); | |
18474 | + self->saved_state = TASK_RUNNING; | |
18475 | + raw_spin_unlock(&self->pi_lock); | |
1a6e0f06 | 18476 | + |
e4b2b4a8 JK |
18477 | + /* |
18478 | + * try_to_take_rt_mutex() sets the waiter bit | |
18479 | + * unconditionally. We might have to fix that up: | |
18480 | + */ | |
18481 | + fixup_rt_mutex_waiters(lock); | |
1a6e0f06 | 18482 | + |
e4b2b4a8 JK |
18483 | + BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock)); |
18484 | + BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry)); | |
18485 | +} | |
1a6e0f06 | 18486 | + |
e4b2b4a8 JK |
18487 | +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) |
18488 | +{ | |
18489 | + struct rt_mutex_waiter waiter; | |
18490 | + unsigned long flags; | |
1a6e0f06 | 18491 | + |
e4b2b4a8 | 18492 | + rt_mutex_init_waiter(&waiter, true); |
1a6e0f06 | 18493 | + |
e4b2b4a8 JK |
18494 | + raw_spin_lock_irqsave(&lock->wait_lock, flags); |
18495 | + rt_spin_lock_slowlock_locked(lock, &waiter, flags); | |
18496 | + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
18497 | + debug_rt_mutex_free_waiter(&waiter); | |
18498 | +} | |
1a6e0f06 | 18499 | + |
e4b2b4a8 JK |
18500 | +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, |
18501 | + struct wake_q_head *wake_q, | |
18502 | + struct wake_q_head *wq_sleeper); | |
18503 | +/* | |
18504 | + * Slow path to release a rt_mutex spin_lock style | |
18505 | + */ | |
18506 | +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) | |
18507 | +{ | |
18508 | + unsigned long flags; | |
18509 | + DEFINE_WAKE_Q(wake_q); | |
18510 | + DEFINE_WAKE_Q(wake_sleeper_q); | |
18511 | + bool postunlock; | |
1a6e0f06 | 18512 | + |
e4b2b4a8 JK |
18513 | + raw_spin_lock_irqsave(&lock->wait_lock, flags); |
18514 | + postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q); | |
18515 | + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
1a6e0f06 | 18516 | + |
e4b2b4a8 JK |
18517 | + if (postunlock) |
18518 | + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); | |
18519 | +} | |
1a6e0f06 | 18520 | + |
e4b2b4a8 JK |
18521 | +void __lockfunc rt_spin_lock(spinlock_t *lock) |
18522 | +{ | |
18523 | + sleeping_lock_inc(); | |
18524 | + migrate_disable(); | |
18525 | + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); | |
18526 | + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); | |
18527 | +} | |
18528 | +EXPORT_SYMBOL(rt_spin_lock); | |
1a6e0f06 | 18529 | + |
e4b2b4a8 JK |
18530 | +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) |
18531 | +{ | |
18532 | + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); | |
18533 | +} | |
1a6e0f06 | 18534 | + |
e4b2b4a8 JK |
18535 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
18536 | +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) | |
18537 | +{ | |
18538 | + sleeping_lock_inc(); | |
18539 | + migrate_disable(); | |
18540 | + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | |
18541 | + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); | |
18542 | +} | |
18543 | +EXPORT_SYMBOL(rt_spin_lock_nested); | |
18544 | +#endif | |
1a6e0f06 | 18545 | + |
e4b2b4a8 JK |
18546 | +void __lockfunc rt_spin_unlock(spinlock_t *lock) |
18547 | +{ | |
18548 | + /* NOTE: we always pass in '1' for nested, for simplicity */ | |
18549 | + spin_release(&lock->dep_map, 1, _RET_IP_); | |
18550 | + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); | |
18551 | + migrate_enable(); | |
18552 | + sleeping_lock_dec(); | |
18553 | +} | |
18554 | +EXPORT_SYMBOL(rt_spin_unlock); | |
1a6e0f06 | 18555 | + |
e4b2b4a8 JK |
18556 | +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) |
18557 | +{ | |
18558 | + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); | |
18559 | +} | |
18560 | +EXPORT_SYMBOL(__rt_spin_unlock); | |
18561 | + | |
18562 | +/* | |
18563 | + * Wait for the lock to get unlocked: instead of polling for an unlock | |
18564 | + * (like raw spinlocks do), we lock and unlock, to force the kernel to | |
18565 | + * schedule if there's contention: | |
18566 | + */ | |
18567 | +void __lockfunc rt_spin_unlock_wait(spinlock_t *lock) | |
18568 | +{ | |
18569 | + spin_lock(lock); | |
18570 | + spin_unlock(lock); | |
18571 | +} | |
18572 | +EXPORT_SYMBOL(rt_spin_unlock_wait); | |
18573 | + | |
18574 | +int __lockfunc rt_spin_trylock(spinlock_t *lock) | |
18575 | +{ | |
18576 | + int ret; | |
18577 | + | |
18578 | + sleeping_lock_inc(); | |
18579 | + migrate_disable(); | |
18580 | + ret = __rt_mutex_trylock(&lock->lock); | |
18581 | + if (ret) { | |
18582 | + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | |
18583 | + } else { | |
18584 | + migrate_enable(); | |
18585 | + sleeping_lock_dec(); | |
18586 | + } | |
18587 | + return ret; | |
18588 | +} | |
18589 | +EXPORT_SYMBOL(rt_spin_trylock); | |
18590 | + | |
18591 | +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) | |
18592 | +{ | |
18593 | + int ret; | |
18594 | + | |
18595 | + local_bh_disable(); | |
18596 | + ret = __rt_mutex_trylock(&lock->lock); | |
18597 | + if (ret) { | |
18598 | + sleeping_lock_inc(); | |
18599 | + migrate_disable(); | |
18600 | + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | |
18601 | + } else | |
18602 | + local_bh_enable(); | |
18603 | + return ret; | |
18604 | +} | |
18605 | +EXPORT_SYMBOL(rt_spin_trylock_bh); | |
18606 | + | |
18607 | +int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags) | |
18608 | +{ | |
18609 | + int ret; | |
1a6e0f06 | 18610 | + |
e4b2b4a8 JK |
18611 | + *flags = 0; |
18612 | + ret = __rt_mutex_trylock(&lock->lock); | |
18613 | + if (ret) { | |
18614 | + sleeping_lock_inc(); | |
18615 | + migrate_disable(); | |
18616 | + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); | |
18617 | + } | |
18618 | + return ret; | |
18619 | +} | |
18620 | +EXPORT_SYMBOL(rt_spin_trylock_irqsave); | |
1a6e0f06 | 18621 | + |
e4b2b4a8 JK |
18622 | +int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock) |
18623 | +{ | |
18624 | + /* Subtract 1 from counter unless that drops it to 0 (ie. it was 1) */ | |
18625 | + if (atomic_add_unless(atomic, -1, 1)) | |
18626 | + return 0; | |
18627 | + rt_spin_lock(lock); | |
18628 | + if (atomic_dec_and_test(atomic)) | |
18629 | + return 1; | |
18630 | + rt_spin_unlock(lock); | |
18631 | + return 0; | |
18632 | +} | |
18633 | +EXPORT_SYMBOL(atomic_dec_and_spin_lock); | |
1a6e0f06 | 18634 | + |
e4b2b4a8 JK |
18635 | +void |
18636 | +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key) | |
18637 | +{ | |
1a6e0f06 | 18638 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
e4b2b4a8 JK |
18639 | + /* |
18640 | + * Make sure we are not reinitializing a held lock: | |
18641 | + */ | |
18642 | + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | |
18643 | + lockdep_init_map(&lock->dep_map, name, key, 0); | |
1a6e0f06 | 18644 | +#endif |
e4b2b4a8 JK |
18645 | +} |
18646 | +EXPORT_SYMBOL(__rt_spin_lock_init); | |
1a6e0f06 | 18647 | + |
e4b2b4a8 | 18648 | +#endif /* PREEMPT_RT_FULL */ |
1a6e0f06 | 18649 | + |
e4b2b4a8 JK |
18650 | +#ifdef CONFIG_PREEMPT_RT_FULL |
18651 | + static inline int __sched | |
18652 | +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) | |
18653 | +{ | |
18654 | + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); | |
18655 | + struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); | |
1a6e0f06 | 18656 | + |
e4b2b4a8 JK |
18657 | + if (!hold_ctx) |
18658 | + return 0; | |
1a6e0f06 | 18659 | + |
e4b2b4a8 JK |
18660 | + if (unlikely(ctx == hold_ctx)) |
18661 | + return -EALREADY; | |
1a6e0f06 | 18662 | + |
e4b2b4a8 JK |
18663 | + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && |
18664 | + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { | |
18665 | +#ifdef CONFIG_DEBUG_MUTEXES | |
18666 | + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); | |
18667 | + ctx->contending_lock = ww; | |
18668 | +#endif | |
18669 | + return -EDEADLK; | |
18670 | + } | |
1a6e0f06 | 18671 | + |
e4b2b4a8 JK |
18672 | + return 0; |
18673 | +} | |
18674 | +#else | |
18675 | + static inline int __sched | |
18676 | +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) | |
18677 | +{ | |
18678 | + BUG(); | |
18679 | + return 0; | |
18680 | +} | |
1a6e0f06 | 18681 | + |
1a6e0f06 JK |
18682 | +#endif |
18683 | + | |
e4b2b4a8 JK |
18684 | +static inline int |
18685 | +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, | |
18686 | + struct rt_mutex_waiter *waiter) | |
18687 | +{ | |
18688 | + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); | |
18689 | +} | |
1a6e0f06 | 18690 | + |
e4b2b4a8 JK |
18691 | /* |
18692 | * Task blocks on lock. | |
18693 | * | |
b3bbd485 | 18694 | @@ -951,6 +1341,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
e4b2b4a8 JK |
18695 | return -EDEADLK; |
18696 | ||
18697 | raw_spin_lock(&task->pi_lock); | |
18698 | + /* | |
18699 | + * In the case of futex requeue PI, this will be a proxy | |
18700 | + * lock. The task will wake unaware that it is enqueued on |
18701 | + * this lock. Avoid blocking on two locks and corrupting | |
18702 | + * pi_blocked_on via the PI_WAKEUP_INPROGRESS | |
18703 | + * flag. futex_wait_requeue_pi() sets this when it wakes up | |
18704 | + * before requeue (due to a signal or timeout). Do not enqueue | |
18705 | + * the task if PI_WAKEUP_INPROGRESS is set. | |
18706 | + */ | |
18707 | + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { | |
18708 | + raw_spin_unlock(&task->pi_lock); | |
18709 | + return -EAGAIN; | |
18710 | + } | |
1a6e0f06 | 18711 | + |
e4b2b4a8 | 18712 | + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); |
1a6e0f06 | 18713 | + |
e4b2b4a8 JK |
18714 | waiter->task = task; |
18715 | waiter->lock = lock; | |
18716 | waiter->prio = task->prio; | |
b3bbd485 | 18717 | @@ -974,7 +1380,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
e4b2b4a8 JK |
18718 | rt_mutex_enqueue_pi(owner, waiter); |
18719 | ||
18720 | rt_mutex_adjust_prio(owner); | |
18721 | - if (owner->pi_blocked_on) | |
18722 | + if (rt_mutex_real_waiter(owner->pi_blocked_on)) | |
18723 | chain_walk = 1; | |
18724 | } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { | |
18725 | chain_walk = 1; | |
b3bbd485 | 18726 | @@ -1016,6 +1422,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, |
e4b2b4a8 JK |
18727 | * Called with lock->wait_lock held and interrupts disabled. |
18728 | */ | |
18729 | static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, | |
18730 | + struct wake_q_head *wake_sleeper_q, | |
18731 | struct rt_mutex *lock) | |
18732 | { | |
18733 | struct rt_mutex_waiter *waiter; | |
b3bbd485 | 18734 | @@ -1055,7 +1462,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, |
e4b2b4a8 JK |
18735 | * Pairs with preempt_enable() in rt_mutex_postunlock(); |
18736 | */ | |
18737 | preempt_disable(); | |
18738 | - wake_q_add(wake_q, waiter->task); | |
18739 | + if (waiter->savestate) | |
18740 | + wake_q_add_sleeper(wake_sleeper_q, waiter->task); | |
18741 | + else | |
18742 | + wake_q_add(wake_q, waiter->task); | |
18743 | raw_spin_unlock(¤t->pi_lock); | |
18744 | } | |
18745 | ||
b3bbd485 | 18746 | @@ -1070,7 +1480,7 @@ static void remove_waiter(struct rt_mutex *lock, |
e4b2b4a8 JK |
18747 | { |
18748 | bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); | |
18749 | struct task_struct *owner = rt_mutex_owner(lock); | |
18750 | - struct rt_mutex *next_lock; | |
18751 | + struct rt_mutex *next_lock = NULL; | |
18752 | ||
18753 | lockdep_assert_held(&lock->wait_lock); | |
18754 | ||
b3bbd485 | 18755 | @@ -1096,7 +1506,8 @@ static void remove_waiter(struct rt_mutex *lock, |
e4b2b4a8 JK |
18756 | rt_mutex_adjust_prio(owner); |
18757 | ||
18758 | /* Store the lock on which owner is blocked or NULL */ | |
18759 | - next_lock = task_blocked_on_lock(owner); | |
18760 | + if (rt_mutex_real_waiter(owner->pi_blocked_on)) | |
18761 | + next_lock = task_blocked_on_lock(owner); | |
18762 | ||
18763 | raw_spin_unlock(&owner->pi_lock); | |
18764 | ||
b3bbd485 | 18765 | @@ -1132,26 +1543,28 @@ void rt_mutex_adjust_pi(struct task_struct *task) |
e4b2b4a8 JK |
18766 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
18767 | ||
18768 | waiter = task->pi_blocked_on; | |
18769 | - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { | |
18770 | + if (!rt_mutex_real_waiter(waiter) || | |
18771 | + rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { | |
18772 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | |
18773 | return; | |
18774 | } | |
18775 | next_lock = waiter->lock; | |
18776 | - raw_spin_unlock_irqrestore(&task->pi_lock, flags); | |
18777 | ||
18778 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | |
18779 | get_task_struct(task); | |
18780 | ||
18781 | + raw_spin_unlock_irqrestore(&task->pi_lock, flags); | |
18782 | rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, | |
18783 | next_lock, NULL, task); | |
18784 | } | |
18785 | ||
18786 | -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | |
18787 | +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) | |
18788 | { | |
18789 | debug_rt_mutex_init_waiter(waiter); | |
18790 | RB_CLEAR_NODE(&waiter->pi_tree_entry); | |
18791 | RB_CLEAR_NODE(&waiter->tree_entry); | |
18792 | waiter->task = NULL; | |
18793 | + waiter->savestate = savestate; | |
18794 | } | |
18795 | ||
18796 | /** | |
b3bbd485 | 18797 | @@ -1167,7 +1580,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) |
e4b2b4a8 JK |
18798 | static int __sched |
18799 | __rt_mutex_slowlock(struct rt_mutex *lock, int state, | |
18800 | struct hrtimer_sleeper *timeout, | |
18801 | - struct rt_mutex_waiter *waiter) | |
18802 | + struct rt_mutex_waiter *waiter, | |
18803 | + struct ww_acquire_ctx *ww_ctx) | |
18804 | { | |
18805 | int ret = 0; | |
18806 | ||
b3bbd485 | 18807 | @@ -1176,16 +1590,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, |
e4b2b4a8 JK |
18808 | if (try_to_take_rt_mutex(lock, current, waiter)) |
18809 | break; | |
18810 | ||
18811 | - /* | |
18812 | - * TASK_INTERRUPTIBLE checks for signals and | |
18813 | - * timeout. Ignored otherwise. | |
18814 | - */ | |
18815 | - if (likely(state == TASK_INTERRUPTIBLE)) { | |
18816 | - /* Signal pending? */ | |
18817 | - if (signal_pending(current)) | |
18818 | - ret = -EINTR; | |
18819 | - if (timeout && !timeout->task) | |
18820 | - ret = -ETIMEDOUT; | |
18821 | + if (timeout && !timeout->task) { | |
18822 | + ret = -ETIMEDOUT; | |
18823 | + break; | |
18824 | + } | |
18825 | + if (signal_pending_state(state, current)) { | |
18826 | + ret = -EINTR; | |
18827 | + break; | |
18828 | + } | |
1a6e0f06 | 18829 | + |
e4b2b4a8 JK |
18830 | + if (ww_ctx && ww_ctx->acquired > 0) { |
18831 | + ret = __mutex_lock_check_stamp(lock, ww_ctx); | |
18832 | if (ret) | |
18833 | break; | |
18834 | } | |
b3bbd485 | 18835 | @@ -1224,33 +1639,104 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, |
e4b2b4a8 JK |
18836 | } |
18837 | } | |
18838 | ||
18839 | -/* | |
18840 | - * Slow path lock function: | |
18841 | - */ | |
18842 | -static int __sched | |
18843 | -rt_mutex_slowlock(struct rt_mutex *lock, int state, | |
18844 | - struct hrtimer_sleeper *timeout, | |
18845 | - enum rtmutex_chainwalk chwalk) | |
18846 | +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, | |
18847 | + struct ww_acquire_ctx *ww_ctx) | |
18848 | { | |
18849 | - struct rt_mutex_waiter waiter; | |
18850 | - unsigned long flags; | |
18851 | - int ret = 0; | |
18852 | +#ifdef CONFIG_DEBUG_MUTEXES | |
18853 | + /* | |
18854 | + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, | |
18855 | + * but released with a normal mutex_unlock in this call. | |
18856 | + * | |
18857 | + * This should never happen, always use ww_mutex_unlock. | |
18858 | + */ | |
18859 | + DEBUG_LOCKS_WARN_ON(ww->ctx); | |
18860 | ||
18861 | - rt_mutex_init_waiter(&waiter); | |
18862 | + /* | |
18863 | + * Not quite done after calling ww_acquire_done()? | |
18864 | + */ | |
18865 | + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); | |
1a6e0f06 | 18866 | + |
e4b2b4a8 JK |
18867 | + if (ww_ctx->contending_lock) { |
18868 | + /* | |
18869 | + * After -EDEADLK you tried to | |
18870 | + * acquire a different ww_mutex? Bad! | |
18871 | + */ | |
18872 | + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); | |
1a6e0f06 | 18873 | + |
e4b2b4a8 JK |
18874 | + /* |
18875 | + * You called ww_mutex_lock after receiving -EDEADLK, | |
18876 | + * but 'forgot' to unlock everything else first? | |
18877 | + */ | |
18878 | + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); | |
18879 | + ww_ctx->contending_lock = NULL; | |
18880 | + } | |
18881 | ||
18882 | /* | |
18883 | - * Technically we could use raw_spin_[un]lock_irq() here, but this can | |
18884 | - * be called in early boot if the cmpxchg() fast path is disabled | |
18885 | - * (debug, no architecture support). In this case we will acquire the | |
18886 | - * rtmutex with lock->wait_lock held. But we cannot unconditionally | |
18887 | - * enable interrupts in that early boot case. So we need to use the | |
18888 | - * irqsave/restore variants. | |
18889 | + * Naughty, using a different class will lead to undefined behavior! | |
18890 | */ | |
18891 | - raw_spin_lock_irqsave(&lock->wait_lock, flags); | |
18892 | + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); | |
18893 | +#endif | |
18894 | + ww_ctx->acquired++; | |
18895 | +} | |
1a6e0f06 | 18896 | + |
e4b2b4a8 JK |
18897 | +#ifdef CONFIG_PREEMPT_RT_FULL |
18898 | +static void ww_mutex_account_lock(struct rt_mutex *lock, | |
18899 | + struct ww_acquire_ctx *ww_ctx) | |
18900 | +{ | |
18901 | + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); | |
18902 | + struct rt_mutex_waiter *waiter, *n; | |
1a6e0f06 | 18903 | + |
e4b2b4a8 JK |
18904 | + /* |
18905 | + * This branch gets optimized out for the common case, | |
18906 | + * and is only important for ww_mutex_lock. | |
18907 | + */ | |
18908 | + ww_mutex_lock_acquired(ww, ww_ctx); | |
18909 | + ww->ctx = ww_ctx; | |
1a6e0f06 | 18910 | + |
e4b2b4a8 JK |
18911 | + /* |
18912 | + * Give any possible sleeping processes the chance to wake up, | |
18913 | + * so they can recheck if they have to back off. | |
18914 | + */ | |
18915 | + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root, | |
18916 | + tree_entry) { | |
18917 | + /* XXX debug rt mutex waiter wakeup */ | |
1a6e0f06 | 18918 | + |
e4b2b4a8 JK |
18919 | + BUG_ON(waiter->lock != lock); |
18920 | + rt_mutex_wake_waiter(waiter); | |
18921 | + } | |
1a6e0f06 JK |
18922 | +} |
18923 | + | |
e4b2b4a8 | 18924 | +#else |
1a6e0f06 | 18925 | + |
e4b2b4a8 JK |
18926 | +static void ww_mutex_account_lock(struct rt_mutex *lock, |
18927 | + struct ww_acquire_ctx *ww_ctx) | |
1a6e0f06 | 18928 | +{ |
e4b2b4a8 | 18929 | + BUG(); |
1a6e0f06 | 18930 | +} |
e4b2b4a8 | 18931 | +#endif |
1a6e0f06 | 18932 | + |
e4b2b4a8 JK |
18933 | +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, |
18934 | + struct hrtimer_sleeper *timeout, | |
18935 | + enum rtmutex_chainwalk chwalk, | |
18936 | + struct ww_acquire_ctx *ww_ctx, | |
18937 | + struct rt_mutex_waiter *waiter) | |
1a6e0f06 | 18938 | +{ |
e4b2b4a8 | 18939 | + int ret; |
1a6e0f06 | 18940 | + |
e4b2b4a8 JK |
18941 | +#ifdef CONFIG_PREEMPT_RT_FULL |
18942 | + if (ww_ctx) { | |
18943 | + struct ww_mutex *ww; | |
1a6e0f06 | 18944 | + |
e4b2b4a8 JK |
18945 | + ww = container_of(lock, struct ww_mutex, base.lock); |
18946 | + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) | |
18947 | + return -EALREADY; | |
18948 | + } | |
18949 | +#endif | |
18950 | ||
18951 | /* Try to acquire the lock again: */ | |
18952 | if (try_to_take_rt_mutex(lock, current, NULL)) { | |
18953 | - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
18954 | + if (ww_ctx) | |
18955 | + ww_mutex_account_lock(lock, ww_ctx); | |
18956 | return 0; | |
18957 | } | |
18958 | ||
b3bbd485 | 18959 | @@ -1260,17 +1746,27 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, |
e4b2b4a8 JK |
18960 | if (unlikely(timeout)) |
18961 | hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); | |
18962 | ||
18963 | - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); | |
18964 | + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); | |
18965 | ||
18966 | - if (likely(!ret)) | |
18967 | + if (likely(!ret)) { | |
18968 | /* sleep on the mutex */ | |
18969 | - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); | |
18970 | + ret = __rt_mutex_slowlock(lock, state, timeout, waiter, | |
18971 | + ww_ctx); | |
18972 | + } else if (ww_ctx) { | |
18973 | + /* ww_mutex received EDEADLK, let it become EALREADY */ | |
18974 | + ret = __mutex_lock_check_stamp(lock, ww_ctx); | |
18975 | + BUG_ON(!ret); | |
18976 | + } | |
18977 | ||
18978 | if (unlikely(ret)) { | |
18979 | __set_current_state(TASK_RUNNING); | |
18980 | if (rt_mutex_has_waiters(lock)) | |
18981 | - remove_waiter(lock, &waiter); | |
18982 | - rt_mutex_handle_deadlock(ret, chwalk, &waiter); | |
18983 | + remove_waiter(lock, waiter); | |
18984 | + /* ww_mutex wants to report EDEADLK/EALREADY, let it */ | |
18985 | + if (!ww_ctx) | |
18986 | + rt_mutex_handle_deadlock(ret, chwalk, waiter); | |
18987 | + } else if (ww_ctx) { | |
18988 | + ww_mutex_account_lock(lock, ww_ctx); | |
18989 | } | |
18990 | ||
18991 | /* | |
b3bbd485 | 18992 | @@ -1278,6 +1774,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, |
e4b2b4a8 JK |
18993 | * unconditionally. We might have to fix that up. |
18994 | */ | |
18995 | fixup_rt_mutex_waiters(lock); | |
18996 | + return ret; | |
1a6e0f06 JK |
18997 | +} |
18998 | + | |
e4b2b4a8 JK |
18999 | +/* |
19000 | + * Slow path lock function: | |
19001 | + */ | |
19002 | +static int __sched | |
19003 | +rt_mutex_slowlock(struct rt_mutex *lock, int state, | |
19004 | + struct hrtimer_sleeper *timeout, | |
19005 | + enum rtmutex_chainwalk chwalk, | |
19006 | + struct ww_acquire_ctx *ww_ctx) | |
1a6e0f06 | 19007 | +{ |
e4b2b4a8 JK |
19008 | + struct rt_mutex_waiter waiter; |
19009 | + unsigned long flags; | |
19010 | + int ret = 0; | |
1a6e0f06 | 19011 | + |
e4b2b4a8 | 19012 | + rt_mutex_init_waiter(&waiter, false); |
1a6e0f06 | 19013 | + |
e4b2b4a8 JK |
19014 | + /* |
19015 | + * Technically we could use raw_spin_[un]lock_irq() here, but this can | |
19016 | + * be called in early boot if the cmpxchg() fast path is disabled | |
19017 | + * (debug, no architecture support). In this case we will acquire the | |
19018 | + * rtmutex with lock->wait_lock held. But we cannot unconditionally | |
19019 | + * enable interrupts in that early boot case. So we need to use the | |
19020 | + * irqsave/restore variants. | |
19021 | + */ | |
19022 | + raw_spin_lock_irqsave(&lock->wait_lock, flags); | |
1a6e0f06 | 19023 | + |
e4b2b4a8 JK |
19024 | + ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx, |
19025 | + &waiter); | |
19026 | ||
19027 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
19028 | ||
b3bbd485 | 19029 | @@ -1338,7 +1864,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) |
e4b2b4a8 JK |
19030 | * Return whether the current task needs to call rt_mutex_postunlock(). |
19031 | */ | |
19032 | static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, | |
19033 | - struct wake_q_head *wake_q) | |
19034 | + struct wake_q_head *wake_q, | |
19035 | + struct wake_q_head *wake_sleeper_q) | |
19036 | { | |
19037 | unsigned long flags; | |
19038 | ||
b3bbd485 | 19039 | @@ -1392,7 +1919,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19040 | * |
19041 | * Queue the next waiter for wakeup once we release the wait_lock. | |
19042 | */ | |
19043 | - mark_wakeup_next_waiter(wake_q, lock); | |
19044 | + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock); | |
19045 | raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
19046 | ||
19047 | return true; /* call rt_mutex_postunlock() */ | |
b3bbd485 | 19048 | @@ -1406,29 +1933,45 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19049 | */ |
19050 | static inline int | |
19051 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | |
19052 | + struct ww_acquire_ctx *ww_ctx, | |
19053 | int (*slowfn)(struct rt_mutex *lock, int state, | |
19054 | struct hrtimer_sleeper *timeout, | |
19055 | - enum rtmutex_chainwalk chwalk)) | |
19056 | + enum rtmutex_chainwalk chwalk, | |
19057 | + struct ww_acquire_ctx *ww_ctx)) | |
19058 | { | |
19059 | if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) | |
19060 | return 0; | |
19061 | ||
19062 | - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); | |
19063 | + /* | |
19064 | + * If rt_mutex blocks, the function sched_submit_work will not call | |
19065 | + * blk_schedule_flush_plug (because tsk_is_pi_blocked would be true). | |
19066 | + * We must call blk_schedule_flush_plug here; if we don't call it, | |
19067 | + * a deadlock in device mapper may happen. | |
19068 | + */ | |
19069 | + if (unlikely(blk_needs_flush_plug(current))) | |
19070 | + blk_schedule_flush_plug(current); | |
1a6e0f06 | 19071 | + |
e4b2b4a8 JK |
19072 | + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx); |
19073 | } | |
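
A condensed view of the split both fastlock helpers implement, as a
sketch: a single acquire-cmpxchg of the owner field from NULL to
current takes the lock on the uncontended path; only on failure is the
block plug flushed (per the comment above) before the slow function
runs. This reuses rt_mutex_cmpxchg_acquire() and the new five-argument
rt_mutex_slowlock() from this file:

    static inline int lock_sketch(struct rt_mutex *lock, int state,
                                  struct ww_acquire_ctx *ww_ctx)
    {
            /* Uncontended: owner goes NULL -> current in one cmpxchg. */
            if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                    return 0;
            /* Contended: flush the I/O plug, then walk the slow path. */
            if (unlikely(blk_needs_flush_plug(current)))
                    blk_schedule_flush_plug(current);
            return rt_mutex_slowlock(lock, state, NULL,
                                     RT_MUTEX_MIN_CHAINWALK, ww_ctx);
    }
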
19074 | ||
19075 | static inline int | |
19076 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | |
19077 | struct hrtimer_sleeper *timeout, | |
19078 | enum rtmutex_chainwalk chwalk, | |
19079 | + struct ww_acquire_ctx *ww_ctx, | |
19080 | int (*slowfn)(struct rt_mutex *lock, int state, | |
19081 | struct hrtimer_sleeper *timeout, | |
19082 | - enum rtmutex_chainwalk chwalk)) | |
19083 | + enum rtmutex_chainwalk chwalk, | |
19084 | + struct ww_acquire_ctx *ww_ctx)) | |
19085 | { | |
19086 | if (chwalk == RT_MUTEX_MIN_CHAINWALK && | |
19087 | likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) | |
19088 | return 0; | |
19089 | ||
19090 | - return slowfn(lock, state, timeout, chwalk); | |
19091 | + if (unlikely(blk_needs_flush_plug(current))) | |
19092 | + blk_schedule_flush_plug(current); | |
1a6e0f06 | 19093 | + |
e4b2b4a8 JK |
19094 | + return slowfn(lock, state, timeout, chwalk, ww_ctx); |
19095 | } | |
19096 | ||
19097 | static inline int | |
b3bbd485 | 19098 | @@ -1444,9 +1987,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19099 | /* |
19100 | * Performs the wakeup of the top-waiter and re-enables preemption. |
19101 | */ | |
19102 | -void rt_mutex_postunlock(struct wake_q_head *wake_q) | |
19103 | +void rt_mutex_postunlock(struct wake_q_head *wake_q, | |
19104 | + struct wake_q_head *wake_sleeper_q) | |
19105 | { | |
19106 | wake_up_q(wake_q); | |
19107 | + wake_up_q_sleeper(wake_sleeper_q); | |
19108 | ||
19109 | /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ | |
19110 | preempt_enable(); | |
b3bbd485 | 19111 | @@ -1455,23 +2000,40 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q) |
e4b2b4a8 JK |
19112 | static inline void |
19113 | rt_mutex_fastunlock(struct rt_mutex *lock, | |
19114 | bool (*slowfn)(struct rt_mutex *lock, | |
19115 | - struct wake_q_head *wqh)) | |
19116 | + struct wake_q_head *wqh, | |
19117 | + struct wake_q_head *wq_sleeper)) | |
19118 | { | |
19119 | DEFINE_WAKE_Q(wake_q); | |
19120 | + DEFINE_WAKE_Q(wake_sleeper_q); | |
19121 | ||
19122 | if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) | |
19123 | return; | |
19124 | ||
19125 | - if (slowfn(lock, &wake_q)) | |
19126 | - rt_mutex_postunlock(&wake_q); | |
19127 | + if (slowfn(lock, &wake_q, &wake_sleeper_q)) | |
19128 | + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); | |
b3bbd485 JK |
19129 | } |
19130 | ||
19131 | -static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) | |
e4b2b4a8 | 19132 | +int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) |
b3bbd485 JK |
19133 | { |
19134 | might_sleep(); | |
e4b2b4a8 | 19135 | + return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock); |
1a6e0f06 JK |
19136 | +} |
19137 | + | |
e4b2b4a8 JK |
19138 | +/** |
19139 | + * rt_mutex_lock_state - lock a rt_mutex with a given state | |
19140 | + * | |
19141 | + * @lock: The rt_mutex to be locked | |
19142 | + * @state: The state to set when blocking on the rt_mutex | |
19143 | + */ | |
b3bbd485 | 19144 | +static int __sched rt_mutex_lock_state(struct rt_mutex *lock, int state, unsigned int subclass) |
1a6e0f06 | 19145 | +{ |
e4b2b4a8 | 19146 | + int ret; |
b3bbd485 JK |
19147 | |
19148 | mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); | |
19149 | - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); | |
e4b2b4a8 JK |
19150 | + ret = __rt_mutex_lock_state(lock, state); |
19151 | + if (ret) | |
19152 | + mutex_release(&lock->dep_map, 1, _RET_IP_); | |
19153 | + return ret; | |
19154 | } | |
19155 | ||
b3bbd485 JK |
19156 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
19157 | @@ -1483,7 +2045,7 @@ static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) | |
19158 | */ | |
19159 | void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) | |
19160 | { | |
19161 | - __rt_mutex_lock(lock, subclass); | |
19162 | + rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, subclass); | |
19163 | } | |
19164 | EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); | |
19165 | #endif | |
19166 | @@ -1496,7 +2058,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); | |
e4b2b4a8 JK |
19167 | */ |
19168 | void __sched rt_mutex_lock(struct rt_mutex *lock) | |
19169 | { | |
b3bbd485 JK |
19170 | - __rt_mutex_lock(lock, 0); |
19171 | + rt_mutex_lock_state(lock, TASK_UNINTERRUPTIBLE, 0); | |
e4b2b4a8 JK |
19172 | } |
19173 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | |
b3bbd485 JK |
19174 | #endif |
19175 | @@ -1512,16 +2074,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); | |
e4b2b4a8 JK |
19176 | */ |
19177 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) | |
19178 | { | |
19179 | - int ret; | |
19180 | - | |
19181 | - might_sleep(); | |
19182 | - | |
19183 | - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); | |
19184 | - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); | |
19185 | - if (ret) | |
19186 | - mutex_release(&lock->dep_map, 1, _RET_IP_); | |
19187 | - | |
19188 | - return ret; | |
b3bbd485 | 19189 | + return rt_mutex_lock_state(lock, TASK_INTERRUPTIBLE, 0); |
e4b2b4a8 JK |
19190 | } |
19191 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | |
19192 | ||
b3bbd485 JK |
19193 | @@ -1538,6 +2091,22 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) |
19194 | return __rt_mutex_slowtrylock(lock); | |
e4b2b4a8 JK |
19195 | } |
19196 | ||
b3bbd485 | 19197 | +/** |
e4b2b4a8 JK |
19198 | + * rt_mutex_lock_killable - lock a rt_mutex killable |
19199 | + * | |
19200 | + * @lock: the rt_mutex to be locked | |
19202 | + * | |
19203 | + * Returns: | |
19204 | + * 0 on success | |
19205 | + * -EINTR when interrupted by a signal | |
19206 | + */ | |
19207 | +int __sched rt_mutex_lock_killable(struct rt_mutex *lock) | |
1a6e0f06 | 19208 | +{ |
b3bbd485 | 19209 | + return rt_mutex_lock_state(lock, TASK_KILLABLE, 0); |
1a6e0f06 | 19210 | +} |
e4b2b4a8 | 19211 | +EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); |
1a6e0f06 | 19212 | + |
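
A hedged usage sketch for the killable variant: 'my_lock' and do_work()
below are illustrative names, not part of the patch. The point is that,
unlike rt_mutex_lock(), the return value must be checked, since a fatal
signal aborts the acquisition with -EINTR and the lock is not held:

    static int work_under_lock(struct rt_mutex *my_lock)
    {
            int ret = rt_mutex_lock_killable(my_lock);

            if (ret)                /* -EINTR: fatal signal, lock not held */
                    return ret;
            ret = do_work();        /* hypothetical critical section */
            rt_mutex_unlock(my_lock);
            return ret;
    }
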
b3bbd485 | 19213 | /** |
e4b2b4a8 JK |
19214 | * rt_mutex_timed_lock - lock a rt_mutex interruptible |
19215 | * the timeout structure is provided | |
b3bbd485 | 19216 | @@ -1561,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) |
e4b2b4a8 JK |
19217 | mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); |
19218 | ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | |
19219 | RT_MUTEX_MIN_CHAINWALK, | |
19220 | + NULL, | |
19221 | rt_mutex_slowlock); | |
19222 | if (ret) | |
19223 | mutex_release(&lock->dep_map, 1, _RET_IP_); | |
b3bbd485 | 19224 | @@ -1569,6 +2139,18 @@ rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) |
e4b2b4a8 JK |
19225 | } |
19226 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | |
19227 | ||
19228 | +int __sched __rt_mutex_trylock(struct rt_mutex *lock) | |
1a6e0f06 | 19229 | +{ |
e4b2b4a8 JK |
19230 | +#ifdef CONFIG_PREEMPT_RT_FULL |
19231 | + if (WARN_ON_ONCE(in_irq() || in_nmi())) | |
1a6e0f06 | 19232 | +#else |
e4b2b4a8 JK |
19233 | + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) |
19234 | +#endif | |
19235 | + return 0; | |
1a6e0f06 | 19236 | + |
e4b2b4a8 | 19237 | + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); |
1a6e0f06 | 19238 | +} |
e4b2b4a8 JK |
19239 | + |
19240 | /** | |
19241 | * rt_mutex_trylock - try to lock a rt_mutex | |
19242 | * | |
b3bbd485 | 19243 | @@ -1584,10 +2166,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) |
e4b2b4a8 JK |
19244 | { |
19245 | int ret; | |
1a6e0f06 | 19246 | |
e4b2b4a8 JK |
19247 | - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) |
19248 | - return 0; | |
19249 | - | |
19250 | - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | |
19251 | + ret = __rt_mutex_trylock(lock); | |
19252 | if (ret) | |
19253 | mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); | |
1a6e0f06 | 19254 | |
b3bbd485 | 19255 | @@ -1595,6 +2174,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) |
e4b2b4a8 JK |
19256 | } |
19257 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | |
1a6e0f06 | 19258 | |
e4b2b4a8 JK |
19259 | +void __sched __rt_mutex_unlock(struct rt_mutex *lock) |
19260 | +{ | |
19261 | + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | |
19262 | +} | |
1a6e0f06 | 19263 | + |
e4b2b4a8 JK |
19264 | /** |
19265 | * rt_mutex_unlock - unlock a rt_mutex | |
19266 | * | |
b3bbd485 | 19267 | @@ -1603,16 +2187,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); |
e4b2b4a8 JK |
19268 | void __sched rt_mutex_unlock(struct rt_mutex *lock) |
19269 | { | |
19270 | mutex_release(&lock->dep_map, 1, _RET_IP_); | |
19271 | - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | |
19272 | + __rt_mutex_unlock(lock); | |
19273 | } | |
19274 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | |
19275 | ||
19276 | -/** | |
19277 | - * Futex variant, that since futex variants do not use the fast-path, can be | |
19278 | - * simple and will not need to retry. | |
19279 | - */ | |
19280 | -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, | |
19281 | - struct wake_q_head *wake_q) | |
19282 | +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, | |
19283 | + struct wake_q_head *wake_q, | |
19284 | + struct wake_q_head *wq_sleeper) | |
19285 | { | |
19286 | lockdep_assert_held(&lock->wait_lock); | |
19287 | ||
b3bbd485 | 19288 | @@ -1629,22 +2210,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19289 | * avoid inversion prior to the wakeup. preempt_disable() |
19290 | * therein pairs with rt_mutex_postunlock(). | |
19291 | */ | |
19292 | - mark_wakeup_next_waiter(wake_q, lock); | |
19293 | + mark_wakeup_next_waiter(wake_q, wq_sleeper, lock); | |
1a6e0f06 | 19294 | |
e4b2b4a8 JK |
19295 | return true; /* call postunlock() */ |
19296 | } | |
1a6e0f06 | 19297 | |
e4b2b4a8 JK |
19298 | +/** |
19299 | + * Futex variant: since futex variants do not use the fast-path, this can |
19300 | + * be simple and will not need to retry. |
19301 | + */ | |
19302 | +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, | |
19303 | + struct wake_q_head *wake_q, | |
19304 | + struct wake_q_head *wq_sleeper) | |
1a6e0f06 | 19305 | +{ |
e4b2b4a8 | 19306 | + return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper); |
1a6e0f06 JK |
19307 | +} |
19308 | + | |
e4b2b4a8 JK |
19309 | void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) |
19310 | { | |
19311 | DEFINE_WAKE_Q(wake_q); | |
19312 | + DEFINE_WAKE_Q(wake_sleeper_q); | |
19313 | + unsigned long flags; | |
19314 | bool postunlock; | |
1a6e0f06 | 19315 | |
e4b2b4a8 JK |
19316 | - raw_spin_lock_irq(&lock->wait_lock); |
19317 | - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); | |
19318 | - raw_spin_unlock_irq(&lock->wait_lock); | |
19319 | + raw_spin_lock_irqsave(&lock->wait_lock, flags); | |
19320 | + postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q); | |
19321 | + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); | |
1a6e0f06 | 19322 | |
e4b2b4a8 JK |
19323 | if (postunlock) |
19324 | - rt_mutex_postunlock(&wake_q); | |
19325 | + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); | |
19326 | } | |
1a6e0f06 | 19327 | |
e4b2b4a8 | 19328 | /** |
b3bbd485 | 19329 | @@ -1683,7 +2277,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, |
e4b2b4a8 JK |
19330 | if (name && key) |
19331 | debug_rt_mutex_init(lock, name, key); | |
19332 | } | |
19333 | -EXPORT_SYMBOL_GPL(__rt_mutex_init); | |
19334 | +EXPORT_SYMBOL(__rt_mutex_init); | |
1a6e0f06 | 19335 | |
e4b2b4a8 JK |
19336 | /** |
19337 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | |
b3bbd485 | 19338 | @@ -1703,6 +2297,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, |
e4b2b4a8 JK |
19339 | struct task_struct *proxy_owner) |
19340 | { | |
19341 | __rt_mutex_init(lock, NULL, NULL); | |
19342 | +#ifdef CONFIG_DEBUG_SPINLOCK | |
19343 | + /* | |
19344 | + * Get another key class for the wait_lock. LOCK_PI and UNLOCK_PI |
19345 | + * hold the ->wait_lock of the proxy_lock while unlocking a sleeping |
19346 | + * lock. | |
19347 | + */ | |
19348 | + raw_spin_lock_init(&lock->wait_lock); | |
1a6e0f06 | 19349 | +#endif |
e4b2b4a8 JK |
19350 | debug_rt_mutex_proxy_lock(lock, proxy_owner); |
19351 | rt_mutex_set_owner(lock, proxy_owner); | |
19352 | } | |
b3bbd485 | 19353 | @@ -1735,6 +2337,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19354 | if (try_to_take_rt_mutex(lock, task, NULL)) |
19355 | return 1; | |
1a6e0f06 | 19356 | |
1a6e0f06 | 19357 | +#ifdef CONFIG_PREEMPT_RT_FULL |
e4b2b4a8 JK |
19358 | + /* |
19359 | + * In PREEMPT_RT there's an added race. | |
19360 | + * If the task that we are about to requeue times out, |
19361 | + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue | |
19362 | + * to skip this task. But right after the task sets | |
19363 | + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then | |
19364 | + * block on the spin_lock(&hb->lock), which in RT is an rtmutex. | |
19365 | + * This will replace the PI_WAKEUP_INPROGRESS with the actual | |
19366 | + * lock that it blocks on. We *must not* place this task | |
19367 | + * on this proxy lock in that case. | |
19368 | + * | |
19369 | + * To prevent this race, we first take the task's pi_lock | |
19370 | + * and check if it has updated its pi_blocked_on. If it has, | |
19371 | + * we assume that it woke up and we return -EAGAIN. | |
19372 | + * Otherwise, we set the task's pi_blocked_on to | |
19373 | + * PI_REQUEUE_INPROGRESS, so that if the task is waking up | |
19374 | + * it will know that we are in the process of requeuing it. | |
19375 | + */ | |
19376 | + raw_spin_lock(&task->pi_lock); | |
19377 | + if (task->pi_blocked_on) { | |
19378 | + raw_spin_unlock(&task->pi_lock); | |
19379 | + return -EAGAIN; | |
19380 | + } | |
19381 | + task->pi_blocked_on = PI_REQUEUE_INPROGRESS; | |
19382 | + raw_spin_unlock(&task->pi_lock); | |
1a6e0f06 | 19383 | +#endif |
1a6e0f06 | 19384 | + |
e4b2b4a8 JK |
19385 | /* We enforce deadlock detection for futexes */ |
19386 | ret = task_blocks_on_rt_mutex(lock, waiter, task, | |
19387 | RT_MUTEX_FULL_CHAINWALK); | |
b3bbd485 | 19388 | @@ -1749,7 +2379,7 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19389 | ret = 0; |
19390 | } | |
1a6e0f06 | 19391 | |
e4b2b4a8 JK |
19392 | - if (unlikely(ret)) |
19393 | + if (ret && rt_mutex_has_waiters(lock)) | |
19394 | remove_waiter(lock, waiter); | |
1a6e0f06 | 19395 | |
e4b2b4a8 | 19396 | debug_rt_mutex_print_deadlock(waiter); |
b3bbd485 | 19397 | @@ -1824,17 +2454,36 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, |
e4b2b4a8 JK |
19398 | struct hrtimer_sleeper *to, |
19399 | struct rt_mutex_waiter *waiter) | |
19400 | { | |
19401 | + struct task_struct *tsk = current; | |
19402 | int ret; | |
19403 | ||
19404 | raw_spin_lock_irq(&lock->wait_lock); | |
19405 | /* sleep on the mutex */ | |
19406 | set_current_state(TASK_INTERRUPTIBLE); | |
19407 | - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); | |
19408 | + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); | |
19409 | /* | |
19410 | * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might | |
19411 | * have to fix that up. | |
19412 | */ | |
19413 | fixup_rt_mutex_waiters(lock); | |
19414 | + /* | |
19415 | + * RT has a problem here when the wait got interrupted by a timeout | |
19416 | + * or a signal. task->pi_blocked_on is still set. The task must | |
19417 | + * acquire the hash bucket lock when returning from this function. | |
19418 | + * | |
19419 | + * If the hash bucket lock is contended then the | |
19420 | + * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in | |
19421 | + * task_blocks_on_rt_mutex() will trigger. This can be avoided by | |
19422 | + * clearing task->pi_blocked_on which removes the task from the | |
19423 | + * boosting chain of the rtmutex. That's correct because the task | |
19424 | + * is no longer blocked on it. |
19425 | + */ | |
19426 | + if (ret) { | |
19427 | + raw_spin_lock(&tsk->pi_lock); | |
19428 | + tsk->pi_blocked_on = NULL; | |
19429 | + raw_spin_unlock(&tsk->pi_lock); | |
19430 | + } | |
1a6e0f06 | 19431 | + |
e4b2b4a8 | 19432 | raw_spin_unlock_irq(&lock->wait_lock); |
1a6e0f06 | 19433 | |
e4b2b4a8 | 19434 | return ret; |
b3bbd485 | 19435 | @@ -1895,3 +2544,99 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, |
1a6e0f06 | 19436 | |
e4b2b4a8 | 19437 | return cleanup; |
1a6e0f06 | 19438 | } |
e4b2b4a8 JK |
19439 | + |
19440 | +static inline int | |
19441 | +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |
1a6e0f06 | 19442 | +{ |
e4b2b4a8 JK |
19443 | +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH |
19444 | + unsigned tmp; | |
19445 | + | |
19446 | + if (ctx->deadlock_inject_countdown-- == 0) { | |
19447 | + tmp = ctx->deadlock_inject_interval; | |
19448 | + if (tmp > UINT_MAX/4) | |
19449 | + tmp = UINT_MAX; | |
19450 | + else | |
19451 | + tmp = tmp*2 + tmp + tmp/2; | |
19452 | + | |
19453 | + ctx->deadlock_inject_interval = tmp; | |
19454 | + ctx->deadlock_inject_countdown = tmp; | |
19455 | + ctx->contending_lock = lock; | |
19456 | + | |
19457 | + ww_mutex_unlock(lock); | |
19458 | + | |
19459 | + return -EDEADLK; | |
19460 | + } | |
1a6e0f06 JK |
19461 | +#endif |
19462 | + | |
e4b2b4a8 | 19463 | + return 0; |
1a6e0f06 JK |
19464 | +} |
19465 | + | |
e4b2b4a8 JK |
19466 | +#ifdef CONFIG_PREEMPT_RT_FULL |
19467 | +int __sched | |
19468 | +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |
1a6e0f06 | 19469 | +{ |
e4b2b4a8 | 19470 | + int ret; |
1a6e0f06 | 19471 | + |
e4b2b4a8 | 19472 | + might_sleep(); |
1a6e0f06 | 19473 | + |
e4b2b4a8 JK |
19474 | + mutex_acquire_nest(&lock->base.dep_map, 0, 0, |
19475 | + ctx ? &ctx->dep_map : NULL, _RET_IP_); | |
19476 | + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, | |
19477 | + ctx); | |
19478 | + if (ret) | |
19479 | + mutex_release(&lock->base.dep_map, 1, _RET_IP_); | |
19480 | + else if (!ret && ctx && ctx->acquired > 1) | |
19481 | + return ww_mutex_deadlock_injection(lock, ctx); | |
1a6e0f06 | 19482 | + |
e4b2b4a8 | 19483 | + return ret; |
1a6e0f06 | 19484 | +} |
e4b2b4a8 | 19485 | +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); |
1a6e0f06 | 19486 | + |
e4b2b4a8 JK |
19487 | +int __sched |
19488 | +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) | |
1a6e0f06 | 19489 | +{ |
e4b2b4a8 | 19490 | + int ret; |
1a6e0f06 | 19491 | + |
e4b2b4a8 | 19492 | + might_sleep(); |
1a6e0f06 | 19493 | + |
e4b2b4a8 JK |
19494 | + mutex_acquire_nest(&lock->base.dep_map, 0, 0, |
19495 | + ctx ? &ctx->dep_map : NULL, _RET_IP_); | |
19496 | + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, | |
19497 | + ctx); | |
19498 | + if (ret) | |
19499 | + mutex_release(&lock->base.dep_map, 1, _RET_IP_); | |
19500 | + else if (!ret && ctx && ctx->acquired > 1) | |
19501 | + return ww_mutex_deadlock_injection(lock, ctx); | |
19502 | + | |
19503 | + return ret; | |
1a6e0f06 | 19504 | +} |
e4b2b4a8 | 19505 | +EXPORT_SYMBOL_GPL(ww_mutex_lock); |
1a6e0f06 | 19506 | + |
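
These RT entry points keep the standard ww_mutex contract: when
ww_mutex_lock() returns -EDEADLK, the caller must drop every lock held
under the same ww_acquire_ctx and retry with the contended lock taken
first. A hedged usage sketch ('a' and 'b' are illustrative locks, not
names from the patch):

    static int lock_pair(struct ww_mutex *a, struct ww_mutex *b,
                         struct ww_acquire_ctx *ctx)
    {
            int ret;

    retry:
            ret = ww_mutex_lock(a, ctx);
            if (ret)        /* nothing held yet, just propagate */
                    return ret;
            ret = ww_mutex_lock(b, ctx);
            if (ret == -EDEADLK) {
                    /* Back off: release everything, take 'b' first. */
                    ww_mutex_unlock(a);
                    swap(a, b);
                    goto retry;
            }
            if (ret)
                    ww_mutex_unlock(a);
            return ret;
    }
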
e4b2b4a8 | 19507 | +void __sched ww_mutex_unlock(struct ww_mutex *lock) |
1a6e0f06 | 19508 | +{ |
e4b2b4a8 | 19509 | + int nest = !!lock->ctx; |
1a6e0f06 | 19510 | + |
e4b2b4a8 JK |
19511 | + /* |
19512 | + * The unlocking fastpath is the 0->1 transition from 'locked' | |
19513 | + * into 'unlocked' state: | |
19514 | + */ | |
19515 | + if (nest) { | |
19516 | +#ifdef CONFIG_DEBUG_MUTEXES | |
19517 | + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); | |
1a6e0f06 | 19518 | +#endif |
e4b2b4a8 JK |
19519 | + if (lock->ctx->acquired > 0) |
19520 | + lock->ctx->acquired--; | |
19521 | + lock->ctx = NULL; | |
19522 | + } | |
19523 | + | |
19524 | + mutex_release(&lock->base.dep_map, nest, _RET_IP_); | |
19525 | + __rt_mutex_unlock(&lock->base.lock); | |
1a6e0f06 | 19526 | +} |
e4b2b4a8 | 19527 | +EXPORT_SYMBOL(ww_mutex_unlock); |
1a6e0f06 | 19528 | + |
e4b2b4a8 | 19529 | +int __rt_mutex_owner_current(struct rt_mutex *lock) |
1a6e0f06 | 19530 | +{ |
e4b2b4a8 | 19531 | + return rt_mutex_owner(lock) == current; |
1a6e0f06 | 19532 | +} |
e4b2b4a8 | 19533 | +EXPORT_SYMBOL(__rt_mutex_owner_current); |
1a6e0f06 | 19534 | +#endif |
b3bbd485 JK |
19535 | diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h |
19536 | index 68686b3ec3c1..2a157c78e18c 100644 | |
19537 | --- a/kernel/locking/rtmutex_common.h | |
19538 | +++ b/kernel/locking/rtmutex_common.h | |
e4b2b4a8 | 19539 | @@ -15,6 +15,7 @@ |
1a6e0f06 | 19540 | |
e4b2b4a8 JK |
19541 | #include <linux/rtmutex.h> |
19542 | #include <linux/sched/wake_q.h> | |
19543 | +#include <linux/sched/debug.h> | |
1a6e0f06 | 19544 | |
e4b2b4a8 JK |
19545 | /* |
19546 | * This is the control structure for tasks blocked on a rt_mutex, | |
b3bbd485 | 19547 | @@ -29,6 +30,7 @@ struct rt_mutex_waiter { |
e4b2b4a8 JK |
19548 | struct rb_node pi_tree_entry; |
19549 | struct task_struct *task; | |
19550 | struct rt_mutex *lock; | |
19551 | + bool savestate; | |
19552 | #ifdef CONFIG_DEBUG_RT_MUTEXES | |
19553 | unsigned long ip; | |
19554 | struct pid *deadlock_task_pid; | |
b3bbd485 | 19555 | @@ -129,12 +131,15 @@ enum rtmutex_chainwalk { |
e4b2b4a8 JK |
19556 | /* |
19557 | * PI-futex support (proxy locking functions, etc.): | |
19558 | */ | |
19559 | +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) | |
19560 | +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) | |
19561 | + | |
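
The two markers are deliberately small non-pointer values stored in
task->pi_blocked_on, so "is this a real waiter?" becomes a three-way
compare. A sketch of the helper this patch relies on for that test
(rt_mutex_real_waiter() is used in the rtmutex.c hunks above; its exact
body here is an assumption of this sketch):

    static inline bool rt_mutex_real_waiter(struct rt_mutex_waiter *waiter)
    {
            return waiter && waiter != PI_WAKEUP_INPROGRESS &&
                   waiter != PI_REQUEUE_INPROGRESS;
    }
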
19562 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | |
19563 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | |
19564 | struct task_struct *proxy_owner); | |
19565 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | |
19566 | struct task_struct *proxy_owner); | |
19567 | -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | |
19568 | +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate); |
19569 | extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, | |
19570 | struct rt_mutex_waiter *waiter, | |
19571 | struct task_struct *task); | |
b3bbd485 | 19572 | @@ -152,9 +157,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); |
e4b2b4a8 JK |
19573 | |
19574 | extern void rt_mutex_futex_unlock(struct rt_mutex *lock); | |
19575 | extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, | |
19576 | - struct wake_q_head *wqh); | |
b3bbd485 JK |
19577 | - |
19578 | -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); | |
e4b2b4a8 JK |
19579 | + struct wake_q_head *wqh, |
19580 | + struct wake_q_head *wq_sleeper); | |
b3bbd485 | 19581 | + |
e4b2b4a8 JK |
19582 | +extern void rt_mutex_postunlock(struct wake_q_head *wake_q, |
19583 | + struct wake_q_head *wake_sleeper_q); | |
19584 | + | |
19585 | +/* RW semaphore special interface */ | |
19586 | +struct ww_acquire_ctx; | |
19587 | + | |
19588 | +extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); | |
19589 | +extern int __rt_mutex_trylock(struct rt_mutex *lock); | |
19590 | +extern void __rt_mutex_unlock(struct rt_mutex *lock); | |
19591 | +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, | |
19592 | + struct hrtimer_sleeper *timeout, | |
19593 | + enum rtmutex_chainwalk chwalk, | |
19594 | + struct ww_acquire_ctx *ww_ctx, | |
19595 | + struct rt_mutex_waiter *waiter); | |
19596 | +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, | |
19597 | + struct rt_mutex_waiter *waiter, | |
19598 | + unsigned long flags); | |
19599 | +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock); | |
1a6e0f06 | 19600 | |
e4b2b4a8 JK |
19601 | #ifdef CONFIG_DEBUG_RT_MUTEXES |
19602 | # include "rtmutex-debug.h" | |
b3bbd485 JK |
19603 | diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c |
19604 | new file mode 100644 | |
19605 | index 000000000000..f2e155b2c4a8 | |
19606 | --- /dev/null | |
19607 | +++ b/kernel/locking/rwlock-rt.c | |
e4b2b4a8 JK |
19608 | @@ -0,0 +1,378 @@ |
19609 | +/* | |
19610 | + */ | |
19611 | +#include <linux/sched/debug.h> | |
19612 | +#include <linux/export.h> | |
19613 | + | |
19614 | +#include "rtmutex_common.h" | |
19615 | +#include <linux/rwlock_types_rt.h> | |
19616 | + | |
19617 | +/* | |
19618 | + * RT-specific reader/writer locks | |
19619 | + * | |
19620 | + * write_lock() | |
19621 | + * 1) Lock lock->rtmutex | |
19622 | + * 2) Remove the reader BIAS to force readers into the slow path | |
19623 | + * 3) Wait until all readers have left the critical region | |
19624 | + * 4) Mark it write locked | |
19625 | + * | |
19626 | + * write_unlock() | |
19627 | + * 1) Remove the write locked marker | |
19628 | + * 2) Set the reader BIAS so readers can use the fast path again | |
19629 | + * 3) Unlock lock->rtmutex to release blocked readers | |
19630 | + * | |
19631 | + * read_lock() | |
19632 | + * 1) Try fast path acquisition (reader BIAS is set) | |
19633 | + * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag | |
19634 | + * 3) If !writelocked, acquire it for read | |
19635 | + * 4) If writelocked, block on lock->rtmutex | |
19636 | + * 5) unlock lock->rtmutex, goto 1) | |
19637 | + * | |
19638 | + * read_unlock() | |
19639 | + * 1) Try fast path release (reader count != 1) | |
19640 | + * 2) Wake the writer waiting in write_lock()#3 | |
19641 | + * | |
19642 | + * read_lock()#3 has the consequence that rw locks on RT are not writer |
19643 | + * fair, but writers, which should be avoided in RT tasks (think tasklist | |
19644 | + * lock), are subject to the rtmutex priority/DL inheritance mechanism. | |
19645 | + * | |
19646 | + * It's possible to make the rw locks writer fair by keeping a list of | |
19647 | + * active readers. A blocked writer would force all newly incoming readers | |
19648 | + * to block on the rtmutex, but the rtmutex would have to be proxy locked | |
19649 | + * for one reader after the other. We can't use multi-reader inheritance | |
19650 | + * because there is no way to support that with | |
19651 | + * SCHED_DEADLINE. Implementing the one by one reader boosting/handover | |
19652 | + * mechanism is a major surgery for a very dubious value. | |
19653 | + * | |
19654 | + * The risk of writer starvation is there, but the pathological use cases | |
19655 | + * which trigger it are not necessarily the typical RT workloads. | |
19656 | + */ | |
19657 | + | |
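
Step 1) of read_lock() above is a conditional increment: a reader may
enter only while the counter is still negative, i.e. while the reader
BIAS is in place. A self-contained C11 model of that fast path,
mirroring the __read_rt_trylock() added below (the bias value is an
assumption for illustration; the kernel's definition lives in the
rwlock headers):

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    #define MODEL_READER_BIAS  INT_MIN   /* sign bit set: "no writer" */

    /* Increment the reader count only while it is negative; a failed
     * CAS reloads 'r' into the expected value and the loop retries. */
    static bool model_read_trylock(atomic_int *readers)
    {
            int r = atomic_load(readers);

            while (r < 0) {
                    if (atomic_compare_exchange_weak(readers, &r, r + 1))
                            return true;
            }
            return false;
    }

An unlocked lock would be modeled as 'atomic_int readers =
MODEL_READER_BIAS;', mirroring the atomic_set(&lock->readers,
READER_BIAS) in __rwlock_biased_rt_init() just below.
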
19658 | +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, | |
19659 | + struct lock_class_key *key) | |
1a6e0f06 | 19660 | +{ |
e4b2b4a8 JK |
19661 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC |
19662 | + /* | |
19663 | + * Make sure we are not reinitializing a held semaphore: | |
19664 | + */ | |
19665 | + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); | |
19666 | + lockdep_init_map(&lock->dep_map, name, key, 0); | |
1a6e0f06 | 19667 | +#endif |
e4b2b4a8 JK |
19668 | + atomic_set(&lock->readers, READER_BIAS); |
19669 | + rt_mutex_init(&lock->rtmutex); | |
19670 | + lock->rtmutex.save_state = 1; | |
1a6e0f06 JK |
19671 | +} |
19672 | + | |
e4b2b4a8 | 19673 | +int __read_rt_trylock(struct rt_rw_lock *lock) |
1a6e0f06 | 19674 | +{ |
e4b2b4a8 | 19675 | + int r, old; |
1a6e0f06 | 19676 | + |
e4b2b4a8 JK |
19677 | + /* |
19678 | + * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is | |
19679 | + * set. | |
19680 | + */ | |
19681 | + for (r = atomic_read(&lock->readers); r < 0;) { | |
19682 | + old = atomic_cmpxchg(&lock->readers, r, r + 1); | |
19683 | + if (likely(old == r)) | |
19684 | + return 1; | |
19685 | + r = old; | |
19686 | + } | |
19687 | + return 0; | |
1a6e0f06 JK |
19688 | +} |
19689 | + | |
e4b2b4a8 | 19690 | +void __sched __read_rt_lock(struct rt_rw_lock *lock) |
1a6e0f06 | 19691 | +{ |
e4b2b4a8 JK |
19692 | + struct rt_mutex *m = &lock->rtmutex; |
19693 | + struct rt_mutex_waiter waiter; | |
19694 | + unsigned long flags; | |
1a6e0f06 | 19695 | + |
e4b2b4a8 JK |
19696 | + if (__read_rt_trylock(lock)) |
19697 | + return; | |
19698 | + | |
19699 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
19700 | + /* | |
19701 | + * Allow readers as long as the writer has not completely | |
19702 | + * acquired the semaphore for write. | |
19703 | + */ | |
19704 | + if (atomic_read(&lock->readers) != WRITER_BIAS) { | |
19705 | + atomic_inc(&lock->readers); | |
19706 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19707 | + return; | |
19708 | + } | |
19709 | + | |
19710 | + /* | |
19711 | + * Call into the slow lock path with the rtmutex->wait_lock | |
19712 | + * held, so this can't result in the following race: | |
19713 | + * | |
19714 | + * Reader1 Reader2 Writer | |
19715 | + * read_lock() | |
19716 | + * write_lock() | |
19717 | + * rtmutex_lock(m) | |
19718 | + * swait() | |
19719 | + * read_lock() | |
19720 | + * unlock(m->wait_lock) | |
19721 | + * read_unlock() | |
19722 | + * swake() | |
19723 | + * lock(m->wait_lock) | |
19724 | + * lock->writelocked=true | |
19725 | + * unlock(m->wait_lock) | |
19726 | + * | |
19727 | + * write_unlock() | |
19728 | + * lock->writelocked=false | |
19729 | + * rtmutex_unlock(m) | |
19730 | + * read_lock() | |
19731 | + * write_lock() | |
19732 | + * rtmutex_lock(m) | |
19733 | + * swait() | |
19734 | + * rtmutex_lock(m) | |
19735 | + * | |
19736 | + * That would put Reader1 behind the writer waiting on | |
19737 | + * Reader2 to call read_unlock() which might be unbound. | |
19738 | + */ | |
19739 | + rt_mutex_init_waiter(&waiter, false); | |
19740 | + rt_spin_lock_slowlock_locked(m, &waiter, flags); | |
19741 | + /* | |
19742 | + * The slowlock() above is guaranteed to return with the rtmutex |
19743 | + * now held, so there can't be a writer active. Increment the reader | |
19744 | + * count and immediately drop the rtmutex again. | |
19745 | + */ | |
19746 | + atomic_inc(&lock->readers); | |
19747 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19748 | + rt_spin_lock_slowunlock(m); | |
19749 | + | |
19750 | + debug_rt_mutex_free_waiter(&waiter); | |
1a6e0f06 JK |
19751 | +} |
19752 | + | |
e4b2b4a8 | 19753 | +void __read_rt_unlock(struct rt_rw_lock *lock) |
1a6e0f06 | 19754 | +{ |
e4b2b4a8 JK |
19755 | + struct rt_mutex *m = &lock->rtmutex; |
19756 | + struct task_struct *tsk; | |
19757 | + | |
19758 | + /* | |
19759 | + * lock->readers can only hit 0 when a writer is waiting for the |
19760 | + * active readers to leave the critical region. | |
19761 | + */ | |
19762 | + if (!atomic_dec_and_test(&lock->readers)) | |
19763 | + return; | |
19764 | + | |
19765 | + raw_spin_lock_irq(&m->wait_lock); | |
19766 | + /* | |
19767 | + * Wake the writer, i.e. the rtmutex owner. It might release the | |
19768 | + * rtmutex concurrently in the fast path, but to clean up the rw | |
19769 | + * lock it needs to acquire m->wait_lock. The worst case which can | |
19770 | + * happen is a spurious wakeup. | |
19771 | + */ | |
19772 | + tsk = rt_mutex_owner(m); | |
19773 | + if (tsk) | |
19774 | + wake_up_process(tsk); | |
19775 | + | |
19776 | + raw_spin_unlock_irq(&m->wait_lock); | |
1a6e0f06 JK |
19777 | +} |
19778 | + | |
e4b2b4a8 JK |
19779 | +static void __write_unlock_common(struct rt_rw_lock *lock, int bias, |
19780 | + unsigned long flags) | |
1a6e0f06 | 19781 | +{ |
e4b2b4a8 | 19782 | + struct rt_mutex *m = &lock->rtmutex; |
1a6e0f06 | 19783 | + |
e4b2b4a8 JK |
19784 | + atomic_add(READER_BIAS - bias, &lock->readers); |
19785 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19786 | + rt_spin_lock_slowunlock(m); | |
1a6e0f06 JK |
19787 | +} |
19788 | + | |
e4b2b4a8 | 19789 | +void __sched __write_rt_lock(struct rt_rw_lock *lock) |
1a6e0f06 | 19790 | +{ |
e4b2b4a8 JK |
19791 | + struct rt_mutex *m = &lock->rtmutex; |
19792 | + struct task_struct *self = current; | |
19793 | + unsigned long flags; | |
19794 | + | |
19795 | + /* Take the rtmutex as a first step */ | |
19796 | + __rt_spin_lock(m); | |
19797 | + | |
19798 | + /* Force readers into slow path */ | |
19799 | + atomic_sub(READER_BIAS, &lock->readers); | |
19800 | + | |
19801 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
19802 | + | |
19803 | + raw_spin_lock(&self->pi_lock); | |
19804 | + self->saved_state = self->state; | |
19805 | + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); | |
19806 | + raw_spin_unlock(&self->pi_lock); | |
19807 | + | |
19808 | + for (;;) { | |
19809 | + /* Have all readers left the critical region? */ | |
19810 | + if (!atomic_read(&lock->readers)) { | |
19811 | + atomic_set(&lock->readers, WRITER_BIAS); | |
19812 | + raw_spin_lock(&self->pi_lock); | |
19813 | + __set_current_state_no_track(self->saved_state); | |
19814 | + self->saved_state = TASK_RUNNING; | |
19815 | + raw_spin_unlock(&self->pi_lock); | |
19816 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19817 | + return; | |
19818 | + } | |
19819 | + | |
19820 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19821 | + | |
19822 | + if (atomic_read(&lock->readers) != 0) | |
19823 | + schedule(); | |
19824 | + | |
19825 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
19826 | + | |
19827 | + raw_spin_lock(&self->pi_lock); | |
19828 | + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); | |
19829 | + raw_spin_unlock(&self->pi_lock); | |
19830 | + } | |
19831 | +} | |
19832 | + | |
19833 | +int __write_rt_trylock(struct rt_rw_lock *lock) | |
1a6e0f06 | 19834 | +{ |
e4b2b4a8 JK |
19835 | + struct rt_mutex *m = &lock->rtmutex; |
19836 | + unsigned long flags; | |
19837 | + | |
19838 | + if (!__rt_mutex_trylock(m)) | |
19839 | + return 0; | |
19840 | + | |
19841 | + atomic_sub(READER_BIAS, &lock->readers); | |
19842 | + | |
19843 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
19844 | + if (!atomic_read(&lock->readers)) { | |
19845 | + atomic_set(&lock->readers, WRITER_BIAS); | |
19846 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
19847 | + return 1; | |
19848 | + } | |
19849 | + __write_unlock_common(lock, 0, flags); | |
19850 | + return 0; | |
1a6e0f06 JK |
19851 | +} |
19852 | + | |
e4b2b4a8 JK |
19853 | +void __write_rt_unlock(struct rt_rw_lock *lock) |
19854 | +{ | |
19855 | + struct rt_mutex *m = &lock->rtmutex; | |
19856 | + unsigned long flags; | |
1a6e0f06 | 19857 | + |
e4b2b4a8 JK |
19858 | + raw_spin_lock_irqsave(&m->wait_lock, flags); |
19859 | + __write_unlock_common(lock, WRITER_BIAS, flags); | |
19860 | +} | |
1a6e0f06 | 19861 | + |
e4b2b4a8 JK |
19862 | +/* Map the reader biased implementation */ |
19863 | +static inline int do_read_rt_trylock(rwlock_t *rwlock) | |
1a6e0f06 | 19864 | +{ |
e4b2b4a8 | 19865 | + return __read_rt_trylock(rwlock); |
1a6e0f06 JK |
19866 | +} |
19867 | + | |
e4b2b4a8 JK |
19868 | +static inline int do_write_rt_trylock(rwlock_t *rwlock) |
19869 | +{ | |
19870 | + return __write_rt_trylock(rwlock); | |
19871 | +} | |
1a6e0f06 | 19872 | + |
e4b2b4a8 JK |
19873 | +static inline void do_read_rt_lock(rwlock_t *rwlock) |
19874 | +{ | |
19875 | + __read_rt_lock(rwlock); | |
19876 | +} | |
1a6e0f06 | 19877 | + |
e4b2b4a8 JK |
19878 | +static inline void do_write_rt_lock(rwlock_t *rwlock) |
19879 | +{ | |
19880 | + __write_rt_lock(rwlock); | |
19881 | +} | |
1a6e0f06 | 19882 | + |
e4b2b4a8 JK |
19883 | +static inline void do_read_rt_unlock(rwlock_t *rwlock) |
19884 | +{ | |
19885 | + __read_rt_unlock(rwlock); | |
19886 | +} | |
1a6e0f06 | 19887 | + |
e4b2b4a8 JK |
19888 | +static inline void do_write_rt_unlock(rwlock_t *rwlock) |
19889 | +{ | |
19890 | + __write_rt_unlock(rwlock); | |
19891 | +} | |
1a6e0f06 | 19892 | + |
e4b2b4a8 JK |
19893 | +static inline void do_rwlock_rt_init(rwlock_t *rwlock, const char *name, |
19894 | + struct lock_class_key *key) | |
19895 | +{ | |
19896 | + __rwlock_biased_rt_init(rwlock, name, key); | |
19897 | +} | |
1a6e0f06 | 19898 | + |
e4b2b4a8 JK |
19899 | +int __lockfunc rt_read_can_lock(rwlock_t *rwlock) |
19900 | +{ | |
19901 | + return atomic_read(&rwlock->readers) < 0; | |
19902 | +} | |
1a6e0f06 | 19903 | + |
e4b2b4a8 JK |
19904 | +int __lockfunc rt_write_can_lock(rwlock_t *rwlock) |
19905 | +{ | |
19906 | + return atomic_read(&rwlock->readers) == READER_BIAS; | |
19907 | +} | |
1a6e0f06 JK |
19908 | + |
19909 | +/* | |
e4b2b4a8 | 19910 | + * The common functions which get wrapped into the rwlock API. |
1a6e0f06 | 19911 | + */ |
e4b2b4a8 JK |
19912 | +int __lockfunc rt_read_trylock(rwlock_t *rwlock) |
19913 | +{ | |
19914 | + int ret; | |
1a6e0f06 | 19915 | + |
e4b2b4a8 JK |
19916 | + sleeping_lock_inc(); |
19917 | + migrate_disable(); | |
19918 | + ret = do_read_rt_trylock(rwlock); | |
19919 | + if (ret) { | |
19920 | + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); | |
19921 | + } else { | |
19922 | + migrate_enable(); | |
19923 | + sleeping_lock_dec(); | |
19924 | + } | |
19925 | + return ret; | |
19926 | +} | |
19927 | +EXPORT_SYMBOL(rt_read_trylock); | |
1a6e0f06 | 19928 | + |
e4b2b4a8 JK |
19929 | +int __lockfunc rt_write_trylock(rwlock_t *rwlock) |
19930 | +{ | |
19931 | + int ret; | |
1a6e0f06 | 19932 | + |
e4b2b4a8 JK |
19933 | + sleeping_lock_inc(); |
19934 | + migrate_disable(); | |
19935 | + ret = do_write_rt_trylock(rwlock); | |
19936 | + if (ret) { | |
19937 | + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); | |
19938 | + } else { | |
19939 | + migrate_enable(); | |
19940 | + sleeping_lock_dec(); | |
19941 | + } | |
19942 | + return ret; | |
19943 | +} | |
19944 | +EXPORT_SYMBOL(rt_write_trylock); | |
1a6e0f06 | 19945 | + |
e4b2b4a8 JK |
19946 | +void __lockfunc rt_read_lock(rwlock_t *rwlock) |
19947 | +{ | |
19948 | + sleeping_lock_inc(); | |
19949 | + migrate_disable(); | |
19950 | + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); | |
19951 | + do_read_rt_lock(rwlock); | |
19952 | +} | |
19953 | +EXPORT_SYMBOL(rt_read_lock); | |
1a6e0f06 | 19954 | + |
e4b2b4a8 JK |
19955 | +void __lockfunc rt_write_lock(rwlock_t *rwlock) |
19956 | +{ | |
19957 | + sleeping_lock_inc(); | |
19958 | + migrate_disable(); | |
19959 | + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); | |
19960 | + do_write_rt_lock(rwlock); | |
19961 | +} | |
19962 | +EXPORT_SYMBOL(rt_write_lock); | |
1a6e0f06 | 19963 | + |
e4b2b4a8 JK |
19964 | +void __lockfunc rt_read_unlock(rwlock_t *rwlock) |
19965 | +{ | |
19966 | + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); | |
19967 | + do_read_rt_unlock(rwlock); | |
19968 | + migrate_enable(); | |
19969 | + sleeping_lock_dec(); | |
19970 | +} | |
19971 | +EXPORT_SYMBOL(rt_read_unlock); | |
1a6e0f06 | 19972 | + |
e4b2b4a8 JK |
19973 | +void __lockfunc rt_write_unlock(rwlock_t *rwlock) |
19974 | +{ | |
19975 | + rwlock_release(&rwlock->dep_map, 1, _RET_IP_); | |
19976 | + do_write_rt_unlock(rwlock); | |
19977 | + migrate_enable(); | |
19978 | + sleeping_lock_dec(); | |
19979 | +} | |
19980 | +EXPORT_SYMBOL(rt_write_unlock); | |
1a6e0f06 | 19981 | + |
e4b2b4a8 JK |
19982 | +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) |
19983 | +{ | |
19984 | + do_rwlock_rt_init(rwlock, name, key); | |
19985 | +} | |
19986 | +EXPORT_SYMBOL(__rt_rwlock_init); | |
b3bbd485 JK |
19987 | diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c |
19988 | new file mode 100644 | |
19989 | index 000000000000..26991ddb6c5a | |
19990 | --- /dev/null | |
19991 | +++ b/kernel/locking/rwsem-rt.c | |
e4b2b4a8 JK |
19992 | @@ -0,0 +1,269 @@ |
19993 | +/* | |
19994 | + */ | |
19995 | +#include <linux/rwsem.h> | |
19996 | +#include <linux/sched/debug.h> | |
19997 | +#include <linux/sched/signal.h> | |
19998 | +#include <linux/export.h> | |
1a6e0f06 | 19999 | + |
e4b2b4a8 JK |
20000 | +#include "rtmutex_common.h" |
20001 | + | |
20002 | +/* | |
20003 | + * RT-specific reader/writer semaphores | |
20004 | + * | |
20005 | + * down_write() | |
20006 | + * 1) Lock sem->rtmutex | |
20007 | + * 2) Remove the reader BIAS to force readers into the slow path | |
20008 | + * 3) Wait until all readers have left the critical region | |
20009 | + * 4) Mark it write locked | |
20010 | + * | |
20011 | + * up_write() | |
20012 | + * 1) Remove the write locked marker | |
20013 | + * 2) Set the reader BIAS so readers can use the fast path again | |
20014 | + * 3) Unlock sem->rtmutex to release blocked readers | |
20015 | + * | |
20016 | + * down_read() | |
20017 | + * 1) Try fast path acquisition (reader BIAS is set) | |
20018 | + * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag | |
20019 | + * 3) If !writelocked, acquire it for read | |
20020 | + * 4) If writelocked, block on sem->rtmutex | |
20021 | + * 5) unlock sem->rtmutex, goto 1) | |
20022 | + * | |
20023 | + * up_read() | |
20024 | + * 1) Try fast path release (reader count != 1) | |
20025 | + * 2) Wake the writer waiting in down_write()#3 | |
20026 | + * | |
20027 | + * down_read()#3 has the consequence that rw semaphores on RT are not writer |
20028 | + * fair, but writers, which should be avoided in RT tasks (think mmap_sem), | |
20029 | + * are subject to the rtmutex priority/DL inheritance mechanism. | |
20030 | + * | |
20031 | + * It's possible to make the rw semaphores writer fair by keeping a list of | |
20032 | + * active readers. A blocked writer would force all newly incoming readers to | |
20033 | + * block on the rtmutex, but the rtmutex would have to be proxy locked for one | |
20034 | + * reader after the other. We can't use multi-reader inheritance because there | |
20035 | + * is no way to support that with SCHED_DEADLINE. Implementing the one by one | |
20036 | + * reader boosting/handover mechanism is a major surgery for a very dubious | |
20037 | + * value. | |
20038 | + * | |
20039 | + * The risk of writer starvation is there, but the pathological use cases | |
20040 | + * which trigger it are not necessarily the typical RT workloads. | |
20041 | + */ | |
20042 | + | |
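
The writer recipe in steps 1)-4) of down_write() can be modeled in the
same C11 style as the reader sketch in the rwlock-rt.c section: a
pthread mutex stands in for sem->rtmutex and a yield loop stands in for
the kernel's schedule()-based wait (both are assumptions of this model,
as are the bias constants):

    #include <limits.h>
    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    #define MODEL_READER_BIAS  INT_MIN     /* as in the reader model */
    #define MODEL_WRITER_BIAS  0x40000000  /* illustrative value     */

    static pthread_mutex_t model_rtmutex = PTHREAD_MUTEX_INITIALIZER;

    static void model_write_lock(atomic_int *readers)
    {
            pthread_mutex_lock(&model_rtmutex);           /* 1) rtmutex   */
            atomic_fetch_sub(readers, MODEL_READER_BIAS); /* 2) drop BIAS */
            while (atomic_load(readers) != 0)             /* 3) drain     */
                    sched_yield();
            atomic_store(readers, MODEL_WRITER_BIAS);     /* 4) writelock */
    }

After step 2) the counter holds the plain count of active readers, new
readers fail the negative-value fast path, and step 4) is safe because
the count can only decrease once the BIAS is gone.
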
20043 | +void __rwsem_init(struct rw_semaphore *sem, const char *name, | |
20044 | + struct lock_class_key *key) | |
20045 | +{ | |
20046 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
20047 | + /* | |
20048 | + * Make sure we are not reinitializing a held semaphore: | |
20049 | + */ | |
20050 | + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | |
20051 | + lockdep_init_map(&sem->dep_map, name, key, 0); | |
1a6e0f06 | 20052 | +#endif |
e4b2b4a8 JK |
20053 | + atomic_set(&sem->readers, READER_BIAS); |
20054 | +} | |
20055 | +EXPORT_SYMBOL(__rwsem_init); | |
20056 | + | |
20057 | +int __down_read_trylock(struct rw_semaphore *sem) | |
20058 | +{ | |
20059 | + int r, old; | |
20060 | + | |
20061 | + /* | |
20062 | + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is | |
20063 | + * set. | |
20064 | + */ | |
20065 | + for (r = atomic_read(&sem->readers); r < 0;) { | |
20066 | + old = atomic_cmpxchg(&sem->readers, r, r + 1); | |
20067 | + if (likely(old == r)) | |
20068 | + return 1; | |
20069 | + r = old; | |
20070 | + } | |
20071 | + return 0; | |
20072 | +} | |
20073 | + | |
20074 | +void __sched __down_read(struct rw_semaphore *sem) | |
20075 | +{ | |
20076 | + struct rt_mutex *m = &sem->rtmutex; | |
20077 | + struct rt_mutex_waiter waiter; | |
20078 | + | |
20079 | + if (__down_read_trylock(sem)) | |
20080 | + return; | |
20081 | + | |
20082 | + might_sleep(); | |
20083 | + raw_spin_lock_irq(&m->wait_lock); | |
20084 | + /* | |
20085 | + * Allow readers as long as the writer has not completely | |
20086 | + * acquired the semaphore for write. | |
20087 | + */ | |
20088 | + if (atomic_read(&sem->readers) != WRITER_BIAS) { | |
20089 | + atomic_inc(&sem->readers); | |
20090 | + raw_spin_unlock_irq(&m->wait_lock); | |
20091 | + return; | |
20092 | + } | |
1a6e0f06 | 20093 | + |
e4b2b4a8 JK |
20094 | + /* |
20095 | + * Call into the slow lock path with the rtmutex->wait_lock | |
20096 | + * held, so this can't result in the following race: | |
20097 | + * | |
20098 | + * Reader1 Reader2 Writer | |
20099 | + * down_read() | |
20100 | + * down_write() | |
20101 | + * rtmutex_lock(m) | |
20102 | + * swait() | |
20103 | + * down_read() | |
20104 | + * unlock(m->wait_lock) | |
20105 | + * up_read() | |
20106 | + * swake() | |
20107 | + * lock(m->wait_lock) | |
20108 | + * sem->writelocked=true | |
20109 | + * unlock(m->wait_lock) | |
20110 | + * | |
20111 | + * up_write() | |
20112 | + * sem->writelocked=false | |
20113 | + * rtmutex_unlock(m) | |
20114 | + * down_read() | |
20115 | + * down_write() | |
20116 | + * rtmutex_lock(m) | |
20117 | + * swait() | |
20118 | + * rtmutex_lock(m) | |
20119 | + * | |
20120 | + * That would put Reader1 behind the writer waiting on | |
20121 | + * Reader2 to call up_read(), which might be unbounded. |
20122 | + */ | |
20123 | + rt_mutex_init_waiter(&waiter, false); | |
20124 | + rt_mutex_slowlock_locked(m, TASK_UNINTERRUPTIBLE, NULL, | |
20125 | + RT_MUTEX_MIN_CHAINWALK, NULL, | |
20126 | + &waiter); | |
20127 | + /* | |
20128 | + * The slowlock() above is guaranteed to return with the rtmutex |
20129 | + * now held, so there can't be a writer active. Increment the reader | |
20130 | + * count and immediately drop the rtmutex again. | |
20131 | + */ | |
20132 | + atomic_inc(&sem->readers); | |
20133 | + raw_spin_unlock_irq(&m->wait_lock); | |
20134 | + __rt_mutex_unlock(m); | |
1a6e0f06 | 20135 | + |
e4b2b4a8 JK |
20136 | + debug_rt_mutex_free_waiter(&waiter); |
20137 | +} | |
20138 | + | |
20139 | +void __up_read(struct rw_semaphore *sem) | |
1a6e0f06 | 20140 | +{ |
e4b2b4a8 JK |
20141 | + struct rt_mutex *m = &sem->rtmutex; |
20142 | + struct task_struct *tsk; | |
20143 | + | |
20144 | + /* | |
20145 | + * sem->readers can only hit 0 when a writer is waiting for the | |
20146 | + * active readers to leave the critical region. | |
20147 | + */ | |
20148 | + if (!atomic_dec_and_test(&sem->readers)) | |
20149 | + return; | |
20150 | + | |
20151 | + might_sleep(); | |
20152 | + raw_spin_lock_irq(&m->wait_lock); | |
20153 | + /* | |
20154 | + * Wake the writer, i.e. the rtmutex owner. It might release the | |
20155 | + * rtmutex concurrently in the fast path (due to a signal), but to | |
20156 | + * clean up the rwsem it needs to acquire m->wait_lock. The worst | |
20157 | + * case which can happen is a spurious wakeup. | |
20158 | + */ | |
20159 | + tsk = rt_mutex_owner(m); | |
20160 | + if (tsk) | |
20161 | + wake_up_process(tsk); | |
20162 | + | |
20163 | + raw_spin_unlock_irq(&m->wait_lock); | |
1a6e0f06 JK |
20164 | +} |
20165 | + | |
e4b2b4a8 JK |
20166 | +static void __up_write_unlock(struct rw_semaphore *sem, int bias, |
20167 | + unsigned long flags) | |
20168 | +{ | |
20169 | + struct rt_mutex *m = &sem->rtmutex; | |
1a6e0f06 | 20170 | + |
e4b2b4a8 JK |
20171 | + atomic_add(READER_BIAS - bias, &sem->readers); |
20172 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
20173 | + __rt_mutex_unlock(m); | |
20174 | +} | |
1a6e0f06 | 20175 | + |
e4b2b4a8 JK |
20176 | +static int __sched __down_write_common(struct rw_semaphore *sem, int state) |
20177 | +{ | |
20178 | + struct rt_mutex *m = &sem->rtmutex; | |
20179 | + unsigned long flags; | |
1a6e0f06 | 20180 | + |
e4b2b4a8 JK |
20181 | + /* Take the rtmutex as a first step */ |
20182 | + if (__rt_mutex_lock_state(m, state)) | |
20183 | + return -EINTR; | |
1a6e0f06 | 20184 | + |
e4b2b4a8 JK |
20185 | + /* Force readers into slow path */ |
20186 | + atomic_sub(READER_BIAS, &sem->readers); | |
20187 | + might_sleep(); | |
1a6e0f06 | 20188 | + |
e4b2b4a8 JK |
20189 | + set_current_state(state); |
20190 | + for (;;) { | |
20191 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
20192 | + /* Have all readers left the critical region? */ | |
20193 | + if (!atomic_read(&sem->readers)) { | |
20194 | + atomic_set(&sem->readers, WRITER_BIAS); | |
20195 | + __set_current_state(TASK_RUNNING); | |
20196 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
20197 | + return 0; | |
20198 | + } | |
1a6e0f06 | 20199 | + |
e4b2b4a8 JK |
20200 | + if (signal_pending_state(state, current)) { |
20201 | + __set_current_state(TASK_RUNNING); | |
20202 | + __up_write_unlock(sem, 0, flags); | |
20203 | + return -EINTR; | |
20204 | + } | |
20205 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
1a6e0f06 | 20206 | + |
e4b2b4a8 JK |
20207 | + if (atomic_read(&sem->readers) != 0) { |
20208 | + schedule(); | |
20209 | + set_current_state(state); | |
20210 | + } | |
20211 | + } | |
20212 | +} | |
1a6e0f06 | 20213 | + |
e4b2b4a8 JK |
20214 | +void __sched __down_write(struct rw_semaphore *sem) |
20215 | +{ | |
20216 | + __down_write_common(sem, TASK_UNINTERRUPTIBLE); | |
20217 | +} | |
1a6e0f06 | 20218 | + |
e4b2b4a8 | 20219 | +int __sched __down_write_killable(struct rw_semaphore *sem) |
1a6e0f06 | 20220 | +{ |
e4b2b4a8 | 20221 | + return __down_write_common(sem, TASK_KILLABLE); |
1a6e0f06 JK |
20222 | +} |
20223 | + | |
e4b2b4a8 | 20224 | +int __down_write_trylock(struct rw_semaphore *sem) |
1a6e0f06 | 20225 | +{ |
e4b2b4a8 JK |
20226 | + struct rt_mutex *m = &sem->rtmutex; |
20227 | + unsigned long flags; | |
20228 | + | |
20229 | + if (!__rt_mutex_trylock(m)) | |
20230 | + return 0; | |
20231 | + | |
20232 | + atomic_sub(READER_BIAS, &sem->readers); | |
20233 | + | |
20234 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
20235 | + if (!atomic_read(&sem->readers)) { | |
20236 | + atomic_set(&sem->readers, WRITER_BIAS); | |
20237 | + raw_spin_unlock_irqrestore(&m->wait_lock, flags); | |
20238 | + return 1; | |
20239 | + } | |
20240 | + __up_write_unlock(sem, 0, flags); | |
20241 | + return 0; | |
1a6e0f06 JK |
20242 | +} |
20243 | + | |
e4b2b4a8 | 20244 | +void __up_write(struct rw_semaphore *sem) |
1a6e0f06 | 20245 | +{ |
e4b2b4a8 JK |
20246 | + struct rt_mutex *m = &sem->rtmutex; |
20247 | + unsigned long flags; | |
20248 | + | |
20249 | + raw_spin_lock_irqsave(&m->wait_lock, flags); | |
20250 | + __up_write_unlock(sem, WRITER_BIAS, flags); | |
1a6e0f06 JK |
20251 | +} |
20252 | + | |
e4b2b4a8 JK |
20253 | +void __downgrade_write(struct rw_semaphore *sem) |
20254 | +{ | |
20255 | + struct rt_mutex *m = &sem->rtmutex; | |
20256 | + unsigned long flags; | |
1a6e0f06 | 20257 | + |
e4b2b4a8 JK |
20258 | + raw_spin_lock_irqsave(&m->wait_lock, flags); |
20259 | + /* Release it and account current as reader */ | |
20260 | + __up_write_unlock(sem, WRITER_BIAS - 1, flags); | |
20261 | +} | |
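
The reader-count arithmetic used by this rwsem replacement is easier to see in isolation: the counter starts at READER_BIAS, a writer (already serialized by the rtmutex) subtracts the bias so the remainder equals the number of active readers, and zero means the writer may take over. Below is a minimal user-space sketch of that scheme using C11 atomics; the constants and helper names are illustrative, not the kernel's, and the kernel sleeps on m->wait_lock where this sketch spins.

    #include <stdatomic.h>

    #define READER_BIAS (1 << 20)   /* illustrative; must dwarf any reader count */
    #define WRITER_BIAS 0           /* illustrative marker for "writer holds it" */

    static atomic_int readers = READER_BIAS;

    static void wake_writer(void)
    {
        /* kernel: wake_up_process(rt_mutex_owner(m)) under m->wait_lock */
    }

    static int try_read_lock(void)          /* reader fast path */
    {
        if (atomic_fetch_add(&readers, 1) >= READER_BIAS)
            return 1;                       /* bias intact: no writer active */
        atomic_fetch_sub(&readers, 1);      /* writer active: back out, go slow */
        return 0;
    }

    static void read_unlock(void)
    {
        /* Reaching zero is only possible once a writer removed the bias. */
        if (atomic_fetch_sub(&readers, 1) == 1)
            wake_writer();
    }

    static void write_lock(void)            /* writers serialized by the rtmutex */
    {
        atomic_fetch_sub(&readers, READER_BIAS);
        while (atomic_load(&readers) != 0)
            ;                               /* kernel sleeps; sketch spins */
        atomic_store(&readers, WRITER_BIAS);
    }

The key property matches __up_read() above: the decrement can only reach zero after a writer has removed the bias, so readers pay one atomic operation and no wakeups in the common case.
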
b3bbd485 JK |
20262 | diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c |
20263 | index 6e40fdfba326..401bda23f786 100644 | |
20264 | --- a/kernel/locking/spinlock.c | |
20265 | +++ b/kernel/locking/spinlock.c | |
20266 | @@ -125,8 +125,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ | |
e4b2b4a8 | 20267 | * __[spin|read|write]_lock_bh() |
1a6e0f06 | 20268 | */ |
e4b2b4a8 | 20269 | BUILD_LOCK_OPS(spin, raw_spinlock); |
1a6e0f06 JK |
20270 | + |
20271 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
e4b2b4a8 JK |
20272 | BUILD_LOCK_OPS(read, rwlock); |
20273 | BUILD_LOCK_OPS(write, rwlock); | |
20274 | +#endif | |
20275 | ||
1a6e0f06 JK |
20276 | #endif |
20277 | ||
b3bbd485 | 20278 | @@ -210,6 +213,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) |
e4b2b4a8 JK |
20279 | EXPORT_SYMBOL(_raw_spin_unlock_bh); |
20280 | #endif | |
20281 | ||
20282 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
1a6e0f06 | 20283 | + |
e4b2b4a8 JK |
20284 | #ifndef CONFIG_INLINE_READ_TRYLOCK |
20285 | int __lockfunc _raw_read_trylock(rwlock_t *lock) | |
20286 | { | |
b3bbd485 | 20287 | @@ -354,6 +359,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) |
e4b2b4a8 JK |
20288 | EXPORT_SYMBOL(_raw_write_unlock_bh); |
20289 | #endif | |
20290 | ||
20291 | +#endif /* !PREEMPT_RT_FULL */ | |
1a6e0f06 | 20292 | + |
e4b2b4a8 JK |
20293 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
20294 | ||
20295 | void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) | |
b3bbd485 JK |
20296 | diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c |
20297 | index 9aa0fccd5d43..76d0b40d9193 100644 | |
20298 | --- a/kernel/locking/spinlock_debug.c | |
20299 | +++ b/kernel/locking/spinlock_debug.c | |
20300 | @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, | |
e4b2b4a8 JK |
20301 | |
20302 | EXPORT_SYMBOL(__raw_spin_lock_init); | |
20303 | ||
20304 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
20305 | void __rwlock_init(rwlock_t *lock, const char *name, | |
20306 | struct lock_class_key *key) | |
20307 | { | |
b3bbd485 | 20308 | @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, |
e4b2b4a8 JK |
20309 | } |
20310 | ||
20311 | EXPORT_SYMBOL(__rwlock_init); | |
1a6e0f06 | 20312 | +#endif |
e4b2b4a8 JK |
20313 | |
20314 | static void spin_dump(raw_spinlock_t *lock, const char *msg) | |
20315 | { | |
b3bbd485 | 20316 | @@ -135,6 +137,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) |
e4b2b4a8 JK |
20317 | arch_spin_unlock(&lock->raw_lock); |
20318 | } | |
20319 | ||
20320 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
20321 | static void rwlock_bug(rwlock_t *lock, const char *msg) | |
20322 | { | |
20323 | if (!debug_locks_off()) | |
b3bbd485 | 20324 | @@ -224,3 +227,5 @@ void do_raw_write_unlock(rwlock_t *lock) |
e4b2b4a8 JK |
20325 | debug_write_unlock(lock); |
20326 | arch_write_unlock(&lock->raw_lock); | |
20327 | } | |
1a6e0f06 JK |
20328 | + |
20329 | +#endif | |
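
Both guards exist because the RT series reimplements rwlock_t on top of an rtmutex elsewhere, so the generic fast paths and their debug checks must not be built. For callers the API is unchanged but the context rules tighten, roughly as in this illustrative fragment (not part of the patch):

    static DEFINE_RWLOCK(stats_lock);       /* on RT: backed by an rtmutex */

    static void update_stats(void)
    {
        /*
         * Fine in process context. On RT, write_lock()/read_lock() may
         * sleep, so this must not be called from hard interrupt context.
         */
        write_lock(&stats_lock);
        /* ... modify shared statistics ... */
        write_unlock(&stats_lock);
    }
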
b3bbd485 JK |
20330 | diff --git a/kernel/panic.c b/kernel/panic.c |
20331 | index bdd18afa19a4..5da649633795 100644 | |
20332 | --- a/kernel/panic.c | |
20333 | +++ b/kernel/panic.c | |
20334 | @@ -482,9 +482,11 @@ static u64 oops_id; | |
e4b2b4a8 JK |
20335 | |
20336 | static int init_oops_id(void) | |
20337 | { | |
20338 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
20339 | if (!oops_id) | |
20340 | get_random_bytes(&oops_id, sizeof(oops_id)); | |
20341 | else | |
1a6e0f06 | 20342 | +#endif |
e4b2b4a8 JK |
20343 | oops_id++; |
20344 | ||
20345 | return 0; | |
b3bbd485 JK |
20346 | diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c |
20347 | index a5c36e9c56a6..a4b83cb0c6e5 100644 | |
20348 | --- a/kernel/power/hibernate.c | |
20349 | +++ b/kernel/power/hibernate.c | |
20350 | @@ -287,6 +287,8 @@ static int create_image(int platform_mode) | |
e4b2b4a8 JK |
20351 | |
20352 | local_irq_disable(); | |
20353 | ||
20354 | + system_state = SYSTEM_SUSPEND; | |
1a6e0f06 | 20355 | + |
e4b2b4a8 JK |
20356 | error = syscore_suspend(); |
20357 | if (error) { | |
20358 | pr_err("Some system devices failed to power down, aborting hibernation\n"); | |
b3bbd485 | 20359 | @@ -317,6 +319,7 @@ static int create_image(int platform_mode) |
e4b2b4a8 JK |
20360 | syscore_resume(); |
20361 | ||
20362 | Enable_irqs: | |
20363 | + system_state = SYSTEM_RUNNING; | |
20364 | local_irq_enable(); | |
20365 | ||
20366 | Enable_cpus: | |
b3bbd485 | 20367 | @@ -445,6 +448,7 @@ static int resume_target_kernel(bool platform_mode) |
e4b2b4a8 JK |
20368 | goto Enable_cpus; |
20369 | ||
20370 | local_irq_disable(); | |
20371 | + system_state = SYSTEM_SUSPEND; | |
20372 | ||
20373 | error = syscore_suspend(); | |
20374 | if (error) | |
b3bbd485 | 20375 | @@ -478,6 +482,7 @@ static int resume_target_kernel(bool platform_mode) |
e4b2b4a8 JK |
20376 | syscore_resume(); |
20377 | ||
20378 | Enable_irqs: | |
20379 | + system_state = SYSTEM_RUNNING; | |
20380 | local_irq_enable(); | |
20381 | ||
20382 | Enable_cpus: | |
b3bbd485 | 20383 | @@ -563,6 +568,7 @@ int hibernation_platform_enter(void) |
e4b2b4a8 JK |
20384 | goto Enable_cpus; |
20385 | ||
20386 | local_irq_disable(); | |
20387 | + system_state = SYSTEM_SUSPEND; | |
20388 | syscore_suspend(); | |
20389 | if (pm_wakeup_pending()) { | |
20390 | error = -EAGAIN; | |
b3bbd485 | 20391 | @@ -575,6 +581,7 @@ int hibernation_platform_enter(void) |
e4b2b4a8 JK |
20392 | |
20393 | Power_up: | |
20394 | syscore_resume(); | |
20395 | + system_state = SYSTEM_RUNNING; | |
20396 | local_irq_enable(); | |
20397 | ||
20398 | Enable_cpus: | |
b3bbd485 | 20399 | @@ -672,6 +679,10 @@ static int load_image_and_restore(void) |
e4b2b4a8 JK |
20400 | return error; |
20401 | } | |
20402 | ||
20403 | +#ifndef CONFIG_SUSPEND | |
20404 | +bool pm_in_action; | |
1a6e0f06 | 20405 | +#endif |
1a6e0f06 | 20406 | + |
e4b2b4a8 JK |
20407 | /** |
20408 | * hibernate - Carry out system hibernation, including saving the image. | |
20409 | */ | |
b3bbd485 | 20410 | @@ -685,6 +696,8 @@ int hibernate(void) |
e4b2b4a8 JK |
20411 | return -EPERM; |
20412 | } | |
20413 | ||
20414 | + pm_in_action = true; | |
1a6e0f06 | 20415 | + |
e4b2b4a8 JK |
20416 | lock_system_sleep(); |
20417 | /* The snapshot device should not be opened while we're running */ | |
20418 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { | |
b3bbd485 | 20419 | @@ -763,6 +776,7 @@ int hibernate(void) |
e4b2b4a8 JK |
20420 | atomic_inc(&snapshot_device_available); |
20421 | Unlock: | |
20422 | unlock_system_sleep(); | |
20423 | + pm_in_action = false; | |
20424 | pr_info("hibernation exit\n"); | |
20425 | ||
20426 | return error; | |
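
Marking the IRQs-off window around syscore_suspend() with SYSTEM_SUSPEND gives suspend-path code a way to tell "atomic by design" apart from a real context bug; the RT series consults this state in its sleeping-lock diagnostics. A hedged sketch of such a consumer (the check itself is illustrative, not a specific in-tree function):

    /* Sketch: suppress an atomicity diagnostic during the suspend window. */
    static void check_sleeping_lock_allowed(void)
    {
        if (system_state == SYSTEM_SUSPEND)
            return;                         /* IRQs are off legitimately here */
        WARN_ON_ONCE(irqs_disabled());      /* elsewhere this would be a bug */
    }
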
b3bbd485 JK |
20427 | diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c |
20428 | index c0bc2c89697a..b89605fe0e88 100644 | |
20429 | --- a/kernel/power/suspend.c | |
20430 | +++ b/kernel/power/suspend.c | |
20431 | @@ -27,6 +27,7 @@ | |
20432 | #include <linux/export.h> | |
20433 | #include <linux/suspend.h> | |
20434 | #include <linux/syscore_ops.h> | |
20435 | +#include <linux/swait.h> | |
20436 | #include <linux/ftrace.h> | |
20437 | #include <trace/events/power.h> | |
20438 | #include <linux/compiler.h> | |
20439 | @@ -57,7 +58,7 @@ EXPORT_SYMBOL_GPL(pm_suspend_global_flags); | |
20440 | ||
20441 | static const struct platform_suspend_ops *suspend_ops; | |
20442 | static const struct platform_s2idle_ops *s2idle_ops; | |
20443 | -static DECLARE_WAIT_QUEUE_HEAD(s2idle_wait_head); | |
20444 | +static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); | |
20445 | ||
20446 | enum s2idle_states __read_mostly s2idle_state; | |
20447 | static DEFINE_RAW_SPINLOCK(s2idle_lock); | |
20448 | @@ -91,8 +92,8 @@ static void s2idle_enter(void) | |
20449 | /* Push all the CPUs into the idle loop. */ | |
20450 | wake_up_all_idle_cpus(); | |
20451 | /* Make the current CPU wait so it can enter the idle loop too. */ | |
20452 | - wait_event(s2idle_wait_head, | |
20453 | - s2idle_state == S2IDLE_STATE_WAKE); | |
20454 | + swait_event(s2idle_wait_head, | |
20455 | + s2idle_state == S2IDLE_STATE_WAKE); | |
20456 | ||
20457 | cpuidle_pause(); | |
20458 | put_online_cpus(); | |
20459 | @@ -159,7 +160,7 @@ void s2idle_wake(void) | |
20460 | raw_spin_lock_irqsave(&s2idle_lock, flags); | |
20461 | if (s2idle_state > S2IDLE_STATE_NONE) { | |
20462 | s2idle_state = S2IDLE_STATE_WAKE; | |
20463 | - wake_up(&s2idle_wait_head); | |
20464 | + swake_up(&s2idle_wait_head); | |
20465 | } | |
20466 | raw_spin_unlock_irqrestore(&s2idle_lock, flags); | |
20467 | } | |
20468 | @@ -428,6 +429,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) | |
e4b2b4a8 JK |
20469 | arch_suspend_disable_irqs(); |
20470 | BUG_ON(!irqs_disabled()); | |
20471 | ||
20472 | + system_state = SYSTEM_SUSPEND; | |
1a6e0f06 | 20473 | + |
e4b2b4a8 JK |
20474 | error = syscore_suspend(); |
20475 | if (!error) { | |
20476 | *wakeup = pm_wakeup_pending(); | |
b3bbd485 | 20477 | @@ -443,6 +446,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) |
e4b2b4a8 JK |
20478 | syscore_resume(); |
20479 | } | |
20480 | ||
20481 | + system_state = SYSTEM_RUNNING; | |
1a6e0f06 | 20482 | + |
e4b2b4a8 JK |
20483 | arch_suspend_enable_irqs(); |
20484 | BUG_ON(irqs_disabled()); | |
20485 | ||
b3bbd485 | 20486 | @@ -589,6 +594,8 @@ static int enter_state(suspend_state_t state) |
e4b2b4a8 JK |
20487 | return error; |
20488 | } | |
20489 | ||
20490 | +bool pm_in_action; | |
1a6e0f06 | 20491 | + |
e4b2b4a8 JK |
20492 | /** |
20493 | * pm_suspend - Externally visible function for suspending the system. | |
20494 | * @state: System sleep state to enter. | |
b3bbd485 | 20495 | @@ -603,6 +610,7 @@ int pm_suspend(suspend_state_t state) |
e4b2b4a8 JK |
20496 | if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) |
20497 | return -EINVAL; | |
20498 | ||
20499 | + pm_in_action = true; | |
20500 | pr_info("suspend entry (%s)\n", mem_sleep_labels[state]); | |
20501 | error = enter_state(state); | |
20502 | if (error) { | |
b3bbd485 | 20503 | @@ -612,6 +620,7 @@ int pm_suspend(suspend_state_t state) |
e4b2b4a8 JK |
20504 | suspend_stats.success++; |
20505 | } | |
20506 | pr_info("suspend exit\n"); | |
20507 | + pm_in_action = false; | |
20508 | return error; | |
20509 | } | |
20510 | EXPORT_SYMBOL(pm_suspend); | |
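
The waitqueue-to-swait conversion matters on RT because a simple wait queue's wake side is short and safe to call under a raw spinlock. A minimal sketch of the swait API as used in this hunk; everything except the swait calls themselves is hypothetical:

    #include <linux/swait.h>

    static DECLARE_SWAIT_QUEUE_HEAD(demo_wait);
    static bool demo_done;

    static void demo_waiter(void)
    {
        /* Sleeps (TASK_UNINTERRUPTIBLE) until the condition is true. */
        swait_event(demo_wait, READ_ONCE(demo_done));
    }

    static void demo_waker(void)
    {
        WRITE_ONCE(demo_done, true);
        swake_up(&demo_wait);               /* wakes one waiter; raw-lock safe */
    }
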
b3bbd485 JK |
20511 | diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c |
20512 | index f0223a7d9ed1..13fd0bcf2367 100644 | |
20513 | --- a/kernel/printk/printk.c | |
20514 | +++ b/kernel/printk/printk.c | |
20515 | @@ -1348,6 +1348,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) | |
e4b2b4a8 JK |
20516 | { |
20517 | char *text; | |
20518 | int len = 0; | |
20519 | + int attempts = 0; | |
20520 | + int num_msg; | |
20521 | ||
20522 | text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); | |
20523 | if (!text) | |
b3bbd485 | 20524 | @@ -1359,6 +1361,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) |
e4b2b4a8 JK |
20525 | u64 seq; |
20526 | u32 idx; | |
20527 | ||
20528 | +try_again: | |
20529 | + attempts++; | |
20530 | + if (attempts > 10) { | |
20531 | + len = -EBUSY; | |
20532 | + goto out; | |
20533 | + } | |
20534 | + num_msg = 0; | |
20535 | + | |
20536 | /* | |
20537 | * Find first record that fits, including all following records, | |
20538 | * into the user-provided buffer for this dump. | |
b3bbd485 | 20539 | @@ -1371,6 +1381,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) |
e4b2b4a8 JK |
20540 | len += msg_print_text(msg, true, NULL, 0); |
20541 | idx = log_next(idx); | |
20542 | seq++; | |
20543 | + num_msg++; | |
20544 | + if (num_msg > 5) { | |
20545 | + num_msg = 0; | |
20546 | + logbuf_unlock_irq(); | |
20547 | + logbuf_lock_irq(); | |
20548 | + if (clear_seq < log_first_seq) | |
20549 | + goto try_again; | |
20550 | + } | |
20551 | } | |
20552 | ||
20553 | /* move first record forward until length fits into the buffer */ | |
b3bbd485 | 20554 | @@ -1382,6 +1400,14 @@ static int syslog_print_all(char __user *buf, int size, bool clear) |
e4b2b4a8 JK |
20555 | len -= msg_print_text(msg, true, NULL, 0); |
20556 | idx = log_next(idx); | |
20557 | seq++; | |
20558 | + num_msg++; | |
20559 | + if (num_msg > 5) { | |
20560 | + num_msg = 0; | |
20561 | + logbuf_unlock_irq(); | |
20562 | + logbuf_lock_irq(); | |
20563 | + if (clear_seq < log_first_seq) | |
20564 | + goto try_again; | |
20565 | + } | |
20566 | } | |
20567 | ||
20568 | /* last message fitting into this dump */ | |
b3bbd485 | 20569 | @@ -1420,6 +1446,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) |
e4b2b4a8 JK |
20570 | clear_seq = log_next_seq; |
20571 | clear_idx = log_next_idx; | |
20572 | } | |
20573 | +out: | |
20574 | logbuf_unlock_irq(); | |
20575 | ||
20576 | kfree(text); | |
b3bbd485 | 20577 | @@ -1558,6 +1585,12 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, |
e4b2b4a8 JK |
20578 | if (!console_drivers) |
20579 | return; | |
20580 | ||
20581 | + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) { | |
20582 | + if (in_irq() || in_nmi()) | |
20583 | + return; | |
1a6e0f06 | 20584 | + } |
1a6e0f06 | 20585 | + |
e4b2b4a8 JK |
20586 | + migrate_disable(); |
20587 | for_each_console(con) { | |
20588 | if (exclusive_console && con != exclusive_console) | |
20589 | continue; | |
b3bbd485 | 20590 | @@ -1573,6 +1606,7 @@ static void call_console_drivers(const char *ext_text, size_t ext_len, |
e4b2b4a8 JK |
20591 | else |
20592 | con->write(con, text, len); | |
20593 | } | |
20594 | + migrate_enable(); | |
20595 | } | |
20596 | ||
20597 | int printk_delay_msec __read_mostly; | |
b3bbd485 | 20598 | @@ -1757,12 +1791,22 @@ asmlinkage int vprintk_emit(int facility, int level, |
e4b2b4a8 JK |
20599 | |
20600 | /* If called from the scheduler, we can not call up(). */ | |
20601 | if (!in_sched) { | |
20602 | + int may_trylock = 1; | |
1a6e0f06 | 20603 | + |
e4b2b4a8 JK |
20604 | +#ifdef CONFIG_PREEMPT_RT_FULL |
20605 | + /* | |
20606 | + * We can't take a sleeping lock with IRQs or preemption disabled, | |
20607 | + * so we can't print in these contexts. | |
20608 | + */ | |
20609 | + if (!(preempt_count() == 0 && !irqs_disabled())) | |
20610 | + may_trylock = 0; | |
1a6e0f06 | 20611 | +#endif |
e4b2b4a8 JK |
20612 | /* |
20613 | * Try to acquire and then immediately release the console | |
20614 | * semaphore. The release will print out buffers and wake up | |
20615 | * /dev/kmsg and syslog() users. | |
20616 | */ | |
20617 | - if (console_trylock()) | |
20618 | + if (may_trylock && console_trylock()) | |
20619 | console_unlock(); | |
20620 | } | |
1a6e0f06 | 20621 | |
b3bbd485 | 20622 | @@ -1872,26 +1916,6 @@ static bool suppress_message_printing(int level) { return false; } |
1a6e0f06 | 20623 | |
e4b2b4a8 | 20624 | #endif /* CONFIG_PRINTK */ |
1a6e0f06 | 20625 | |
e4b2b4a8 JK |
20626 | -#ifdef CONFIG_EARLY_PRINTK |
20627 | -struct console *early_console; | |
20628 | - | |
20629 | -asmlinkage __visible void early_printk(const char *fmt, ...) | |
20630 | -{ | |
20631 | - va_list ap; | |
20632 | - char buf[512]; | |
20633 | - int n; | |
20634 | - | |
20635 | - if (!early_console) | |
20636 | - return; | |
20637 | - | |
20638 | - va_start(ap, fmt); | |
20639 | - n = vscnprintf(buf, sizeof(buf), fmt, ap); | |
20640 | - va_end(ap); | |
20641 | - | |
20642 | - early_console->write(early_console, buf, n); | |
20643 | -} | |
20644 | -#endif | |
20645 | - | |
20646 | static int __add_preferred_console(char *name, int idx, char *options, | |
20647 | char *brl_options) | |
20648 | { | |
b3bbd485 | 20649 | @@ -2238,10 +2262,15 @@ void console_unlock(void) |
e4b2b4a8 JK |
20650 | console_seq++; |
20651 | raw_spin_unlock(&logbuf_lock); | |
1a6e0f06 | 20652 | |
e4b2b4a8 JK |
20653 | +#ifdef CONFIG_PREEMPT_RT_FULL |
20654 | + printk_safe_exit_irqrestore(flags); | |
20655 | + call_console_drivers(ext_text, ext_len, text, len); | |
1a6e0f06 | 20656 | +#else |
e4b2b4a8 JK |
20657 | stop_critical_timings(); /* don't trace print latency */ |
20658 | call_console_drivers(ext_text, ext_len, text, len); | |
20659 | start_critical_timings(); | |
20660 | printk_safe_exit_irqrestore(flags); | |
1a6e0f06 | 20661 | +#endif |
1a6e0f06 | 20662 | |
e4b2b4a8 JK |
20663 | if (do_cond_resched) |
20664 | cond_resched(); | |
b3bbd485 | 20665 | @@ -2295,6 +2324,11 @@ void console_unblank(void) |
1a6e0f06 | 20666 | { |
e4b2b4a8 | 20667 | struct console *c; |
1a6e0f06 | 20668 | |
e4b2b4a8 JK |
20669 | + if (IS_ENABLED(CONFIG_PREEMPT_RT_BASE)) { |
20670 | + if (in_irq() || in_nmi()) | |
20671 | + return; | |
20672 | + } | |
1a6e0f06 | 20673 | + |
e4b2b4a8 JK |
20674 | /* |
20675 | * console_unblank can no longer be called in interrupt context unless | |
20676 | * oops_in_progress is set to 1.. | |
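
Three RT rules appear in this printk hunk: console drivers are skipped from IRQ/NMI context, the console loop runs under migrate_disable() rather than with interrupts off, and vprintk_emit() only attempts the console semaphore from fully preemptible context. The last condition is just the #ifdef block above restated as a predicate:

    /* May this context take the (sleeping, on RT) console semaphore? */
    static inline bool printk_may_trylock(void)
    {
        return preempt_count() == 0 && !irqs_disabled();
    }
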
b3bbd485 JK |
20677 | diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c |
20678 | index 64f8046586b6..a24e16bef51c 100644 | |
20679 | --- a/kernel/printk/printk_safe.c | |
20680 | +++ b/kernel/printk/printk_safe.c | |
20681 | @@ -22,6 +22,7 @@ | |
20682 | #include <linux/cpumask.h> | |
20683 | #include <linux/irq_work.h> | |
20684 | #include <linux/printk.h> | |
20685 | +#include <linux/console.h> | |
20686 | ||
20687 | #include "internal.h" | |
20688 | ||
20689 | @@ -373,8 +374,74 @@ void __printk_safe_exit(void) | |
20690 | this_cpu_dec(printk_context); | |
20691 | } | |
20692 | ||
20693 | +#ifdef CONFIG_EARLY_PRINTK | |
20694 | +struct console *early_console; | |
20695 | + | |
20696 | +static void early_vprintk(const char *fmt, va_list ap) | |
20697 | +{ | |
20698 | + if (early_console) { | |
20699 | + char buf[512]; | |
20700 | + int n = vscnprintf(buf, sizeof(buf), fmt, ap); | |
20701 | + | |
20702 | + early_console->write(early_console, buf, n); | |
20703 | + } | |
20704 | +} | |
20705 | + | |
20706 | +asmlinkage void early_printk(const char *fmt, ...) | |
20707 | +{ | |
20708 | + va_list ap; | |
20709 | + | |
20710 | + va_start(ap, fmt); | |
20711 | + early_vprintk(fmt, ap); | |
20712 | + va_end(ap); | |
20713 | +} | |
20714 | + | |
20715 | +/* | |
20716 | + * This is independent of any log levels - a global | |
20717 | + * kill switch that turns off all of printk. | |
20718 | + * | |
20719 | + * Used by the NMI watchdog if early-printk is enabled. | |
20720 | + */ | |
20721 | +static bool __read_mostly printk_killswitch; | |
20722 | + | |
20723 | +static int __init force_early_printk_setup(char *str) | |
20724 | +{ | |
20725 | + printk_killswitch = true; | |
20726 | + return 0; | |
20727 | +} | |
20728 | +early_param("force_early_printk", force_early_printk_setup); | |
20729 | + | |
20730 | +void printk_kill(void) | |
20731 | +{ | |
20732 | + printk_killswitch = true; | |
20733 | +} | |
20734 | + | |
20735 | +#ifdef CONFIG_PRINTK | |
20736 | +static int forced_early_printk(const char *fmt, va_list ap) | |
20737 | +{ | |
20738 | + if (!printk_killswitch) | |
20739 | + return 0; | |
20740 | + early_vprintk(fmt, ap); | |
20741 | + return 1; | |
20742 | +} | |
20743 | +#endif | |
20744 | + | |
20745 | +#else | |
20746 | +static inline int forced_early_printk(const char *fmt, va_list ap) | |
20747 | +{ | |
20748 | + return 0; | |
20749 | +} | |
20750 | +#endif | |
20751 | + | |
20752 | __printf(1, 0) int vprintk_func(const char *fmt, va_list args) | |
20753 | { | |
20754 | + /* | |
20755 | + * Fall back to early_printk if a debugging subsystem has | |
20756 | + * killed printk output. | |
20757 | + */ | |
20758 | + if (unlikely(forced_early_printk(fmt, args))) | |
20759 | + return 1; | |
20760 | + | |
20761 | /* | |
20762 | * Try to use the main logbuf even in NMI. But avoid calling console | |
20763 | * drivers that might have their own locks. | |
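
Once the killswitch fires, every later printk() is redirected through the boot console via early_vprintk(). A hedged usage sketch with a hypothetical caller; printk_kill() and early_printk() are the functions added in this hunk:

    /* Sketch: a lockup detector abandons normal printk before dumping state. */
    static void report_hard_lockup(int cpu)
    {
        printk_kill();      /* all later printk()s route via early_console */
        early_printk("watchdog: hard lockup on CPU %d\n", cpu);
    }
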
20764 | diff --git a/kernel/ptrace.c b/kernel/ptrace.c | |
20765 | index 84b1367935e4..b32a86f63522 100644 | |
20766 | --- a/kernel/ptrace.c | |
20767 | +++ b/kernel/ptrace.c | |
20768 | @@ -175,7 +175,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) | |
e4b2b4a8 JK |
20769 | |
20770 | spin_lock_irq(&task->sighand->siglock); | |
20771 | if (task_is_traced(task) && !__fatal_signal_pending(task)) { | |
20772 | - task->state = __TASK_TRACED; | |
20773 | + unsigned long flags; | |
1a6e0f06 | 20774 | + |
e4b2b4a8 JK |
20775 | + raw_spin_lock_irqsave(&task->pi_lock, flags); |
20776 | + if (task->state & __TASK_TRACED) | |
20777 | + task->state = __TASK_TRACED; | |
20778 | + else | |
20779 | + task->saved_state = __TASK_TRACED; | |
20780 | + raw_spin_unlock_irqrestore(&task->pi_lock, flags); | |
20781 | ret = true; | |
20782 | } | |
20783 | spin_unlock_irq(&task->sighand->siglock); | |
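
On RT, a traced task may currently be blocked on a sleeping spinlock, in which case the lock code owns the scheduler-visible ->state and the task's real state is parked in ->saved_state (an RT-only field); the hunk updates whichever of the two currently holds __TASK_TRACED. A hypothetical helper, shown only to illustrate the pattern, reads the effective state the same way:

    static long task_effective_state(struct task_struct *p)
    {
        unsigned long flags;
        long state;

        raw_spin_lock_irqsave(&p->pi_lock, flags);
        /* saved_state is only meaningful while a sleeping lock is held */
        state = p->saved_state ? p->saved_state : p->state;
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
        return state;
    }
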
b3bbd485 JK |
20784 | diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig |
20785 | index 9210379c0353..0be2c96fb640 100644 | |
20786 | --- a/kernel/rcu/Kconfig | |
20787 | +++ b/kernel/rcu/Kconfig | |
20788 | @@ -36,7 +36,7 @@ config TINY_RCU | |
1a6e0f06 | 20789 | |
e4b2b4a8 JK |
20790 | config RCU_EXPERT |
20791 | bool "Make expert-level adjustments to RCU configuration" | |
20792 | - default n | |
20793 | + default y if PREEMPT_RT_FULL | |
20794 | help | |
20795 | This option needs to be enabled if you wish to make | |
20796 | expert-level adjustments to RCU configuration. By default, | |
b3bbd485 | 20797 | @@ -172,7 +172,7 @@ config RCU_FANOUT_LEAF |
e4b2b4a8 JK |
20798 | |
20799 | config RCU_FAST_NO_HZ | |
20800 | bool "Accelerate last non-dyntick-idle CPU's grace periods" | |
20801 | - depends on NO_HZ_COMMON && SMP && RCU_EXPERT | |
20802 | + depends on NO_HZ_COMMON && SMP && RCU_EXPERT && !PREEMPT_RT_FULL | |
20803 | default n | |
20804 | help | |
20805 | This option permits CPUs to enter dynticks-idle state even if | |
b3bbd485 | 20806 | @@ -191,7 +191,7 @@ config RCU_FAST_NO_HZ |
e4b2b4a8 JK |
20807 | config RCU_BOOST |
20808 | bool "Enable RCU priority boosting" | |
20809 | depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT | |
20810 | - default n | |
20811 | + default y if PREEMPT_RT_FULL | |
20812 | help | |
20813 | This option boosts the priority of preempted RCU readers that | |
20814 | block the current preemptible RCU grace period for too long. | |
b3bbd485 JK |
20815 | diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h |
20816 | index e4b43fef89f5..0b056c30e9b1 100644 | |
20817 | --- a/kernel/rcu/rcu.h | |
20818 | +++ b/kernel/rcu/rcu.h | |
20819 | @@ -462,18 +462,26 @@ static inline void show_rcu_gp_kthreads(void) { } | |
e4b2b4a8 JK |
20820 | extern unsigned long rcutorture_testseq; |
20821 | extern unsigned long rcutorture_vernum; | |
20822 | unsigned long rcu_batches_started(void); | |
20823 | -unsigned long rcu_batches_started_bh(void); | |
20824 | unsigned long rcu_batches_started_sched(void); | |
20825 | unsigned long rcu_batches_completed(void); | |
20826 | -unsigned long rcu_batches_completed_bh(void); | |
20827 | unsigned long rcu_batches_completed_sched(void); | |
20828 | unsigned long rcu_exp_batches_completed(void); | |
20829 | unsigned long rcu_exp_batches_completed_sched(void); | |
20830 | unsigned long srcu_batches_completed(struct srcu_struct *sp); | |
20831 | void show_rcu_gp_kthreads(void); | |
20832 | void rcu_force_quiescent_state(void); | |
20833 | -void rcu_bh_force_quiescent_state(void); | |
20834 | void rcu_sched_force_quiescent_state(void); | |
1a6e0f06 | 20835 | + |
e4b2b4a8 JK |
20836 | +#ifndef CONFIG_PREEMPT_RT_FULL |
20837 | +void rcu_bh_force_quiescent_state(void); | |
20838 | +unsigned long rcu_batches_started_bh(void); | |
20839 | +unsigned long rcu_batches_completed_bh(void); | |
1a6e0f06 | 20840 | +#else |
e4b2b4a8 JK |
20841 | +# define rcu_bh_force_quiescent_state rcu_force_quiescent_state |
20842 | +# define rcu_batches_completed_bh rcu_batches_completed | |
20843 | +# define rcu_batches_started_bh rcu_batches_started | |
1a6e0f06 | 20844 | +#endif |
e4b2b4a8 JK |
20845 | + |
20846 | #endif /* #else #ifdef CONFIG_TINY_RCU */ | |
1a6e0f06 | 20847 | |
e4b2b4a8 | 20848 | #ifdef CONFIG_RCU_NOCB_CPU |
b3bbd485 JK |
20849 | diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c |
20850 | index 7649fcd2c4c7..88cba7c2956c 100644 | |
20851 | --- a/kernel/rcu/rcu_segcblist.c | |
20852 | +++ b/kernel/rcu/rcu_segcblist.c | |
e4b2b4a8 JK |
20853 | @@ -23,6 +23,7 @@ |
20854 | #include <linux/types.h> | |
20855 | #include <linux/kernel.h> | |
20856 | #include <linux/interrupt.h> | |
20857 | +#include <linux/rcupdate.h> | |
1a6e0f06 | 20858 | |
e4b2b4a8 | 20859 | #include "rcu_segcblist.h" |
1a6e0f06 | 20860 | |
b3bbd485 JK |
20861 | diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c |
20862 | index 45f2ffbc1e78..2e9dbb734d5a 100644 | |
20863 | --- a/kernel/rcu/rcutorture.c | |
20864 | +++ b/kernel/rcu/rcutorture.c | |
20865 | @@ -417,6 +417,7 @@ static struct rcu_torture_ops rcu_ops = { | |
e4b2b4a8 | 20866 | .name = "rcu" |
1a6e0f06 JK |
20867 | }; |
20868 | ||
e4b2b4a8 JK |
20869 | +#ifndef CONFIG_PREEMPT_RT_FULL |
20870 | /* | |
20871 | * Definitions for rcu_bh torture testing. | |
1a6e0f06 | 20872 | */ |
b3bbd485 | 20873 | @@ -456,6 +457,12 @@ static struct rcu_torture_ops rcu_bh_ops = { |
e4b2b4a8 JK |
20874 | .name = "rcu_bh" |
20875 | }; | |
1a6e0f06 | 20876 | |
e4b2b4a8 JK |
20877 | +#else |
20878 | +static struct rcu_torture_ops rcu_bh_ops = { | |
20879 | + .ttype = INVALID_RCU_FLAVOR, | |
20880 | +}; | |
20881 | +#endif | |
20882 | + | |
1a6e0f06 | 20883 | /* |
e4b2b4a8 JK |
20884 | * Don't even think about trying any of these in real life!!! |
20885 | * The names include "busted", and they really mean it! | |
b3bbd485 JK |
20886 | diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c |
20887 | index 6d5880089ff6..0e3b2bd3f2ac 100644 | |
20888 | --- a/kernel/rcu/srcutree.c | |
20889 | +++ b/kernel/rcu/srcutree.c | |
e4b2b4a8 JK |
20890 | @@ -36,6 +36,8 @@ |
20891 | #include <linux/delay.h> | |
20892 | #include <linux/module.h> | |
20893 | #include <linux/srcu.h> | |
20894 | +#include <linux/cpu.h> | |
20895 | +#include <linux/locallock.h> | |
1a6e0f06 | 20896 | |
e4b2b4a8 JK |
20897 | #include "rcu.h" |
20898 | #include "rcu_segcblist.h" | |
b3bbd485 | 20899 | @@ -53,6 +55,33 @@ static void srcu_invoke_callbacks(struct work_struct *work); |
e4b2b4a8 JK |
20900 | static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); |
20901 | static void process_srcu(struct work_struct *work); | |
20902 | ||
20903 | +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ | |
20904 | +#define spin_lock_rcu_node(p) \ | |
20905 | +do { \ | |
20906 | + spin_lock(&ACCESS_PRIVATE(p, lock)); \ | |
20907 | + smp_mb__after_unlock_lock(); \ | |
20908 | +} while (0) | |
20909 | + | |
20910 | +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) | |
20911 | + | |
20912 | +#define spin_lock_irq_rcu_node(p) \ | |
20913 | +do { \ | |
20914 | + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ | |
20915 | + smp_mb__after_unlock_lock(); \ | |
20916 | +} while (0) | |
20917 | + | |
20918 | +#define spin_unlock_irq_rcu_node(p) \ | |
20919 | + spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) | |
20920 | + | |
20921 | +#define spin_lock_irqsave_rcu_node(p, flags) \ | |
20922 | +do { \ | |
20923 | + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ | |
20924 | + smp_mb__after_unlock_lock(); \ | |
20925 | +} while (0) | |
20926 | + | |
20927 | +#define spin_unlock_irqrestore_rcu_node(p, flags) \ | |
20928 | + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) | |
20929 | + | |
20930 | /* | |
20931 | * Initialize SRCU combining tree. Note that statically allocated | |
20932 | * srcu_struct structures might already have srcu_read_lock() and | |
b3bbd485 | 20933 | @@ -77,7 +106,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) |
e4b2b4a8 JK |
20934 | |
20935 | /* Each pass through this loop initializes one srcu_node structure. */ | |
20936 | rcu_for_each_node_breadth_first(sp, snp) { | |
20937 | - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); | |
20938 | + spin_lock_init(&ACCESS_PRIVATE(snp, lock)); | |
20939 | WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != | |
20940 | ARRAY_SIZE(snp->srcu_data_have_cbs)); | |
20941 | for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { | |
b3bbd485 | 20942 | @@ -111,7 +140,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) |
e4b2b4a8 JK |
20943 | snp_first = sp->level[level]; |
20944 | for_each_possible_cpu(cpu) { | |
20945 | sdp = per_cpu_ptr(sp->sda, cpu); | |
20946 | - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); | |
20947 | + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); | |
20948 | rcu_segcblist_init(&sdp->srcu_cblist); | |
20949 | sdp->srcu_cblist_invoking = false; | |
20950 | sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; | |
b3bbd485 | 20951 | @@ -170,7 +199,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, |
e4b2b4a8 JK |
20952 | /* Don't re-initialize a lock while it is held. */ |
20953 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | |
20954 | lockdep_init_map(&sp->dep_map, name, key, 0); | |
20955 | - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); | |
20956 | + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); | |
20957 | return init_srcu_struct_fields(sp, false); | |
20958 | } | |
20959 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | |
b3bbd485 | 20960 | @@ -187,7 +216,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); |
1a6e0f06 | 20961 | */ |
e4b2b4a8 JK |
20962 | int init_srcu_struct(struct srcu_struct *sp) |
20963 | { | |
20964 | - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); | |
20965 | + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); | |
20966 | return init_srcu_struct_fields(sp, false); | |
20967 | } | |
20968 | EXPORT_SYMBOL_GPL(init_srcu_struct); | |
b3bbd485 | 20969 | @@ -210,13 +239,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) |
e4b2b4a8 JK |
20970 | /* The smp_load_acquire() pairs with the smp_store_release(). */ |
20971 | if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ | |
20972 | return; /* Already initialized. */ | |
20973 | - raw_spin_lock_irqsave_rcu_node(sp, flags); | |
20974 | + spin_lock_irqsave_rcu_node(sp, flags); | |
20975 | if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { | |
20976 | - raw_spin_unlock_irqrestore_rcu_node(sp, flags); | |
20977 | + spin_unlock_irqrestore_rcu_node(sp, flags); | |
20978 | return; | |
20979 | } | |
20980 | init_srcu_struct_fields(sp, true); | |
20981 | - raw_spin_unlock_irqrestore_rcu_node(sp, flags); | |
20982 | + spin_unlock_irqrestore_rcu_node(sp, flags); | |
1a6e0f06 JK |
20983 | } |
20984 | ||
e4b2b4a8 | 20985 | /* |
b3bbd485 JK |
20986 | @@ -424,21 +453,6 @@ static void srcu_gp_start(struct srcu_struct *sp) |
20987 | WARN_ON_ONCE(state != SRCU_STATE_SCAN1); | |
e4b2b4a8 | 20988 | } |
1a6e0f06 | 20989 | |
b3bbd485 | 20990 | -/* |
e4b2b4a8 JK |
20991 | - * Track online CPUs to guide callback workqueue placement. |
20992 | - */ | |
20993 | -DEFINE_PER_CPU(bool, srcu_online); | |
20994 | - | |
20995 | -void srcu_online_cpu(unsigned int cpu) | |
20996 | -{ | |
20997 | - WRITE_ONCE(per_cpu(srcu_online, cpu), true); | |
20998 | -} | |
20999 | - | |
21000 | -void srcu_offline_cpu(unsigned int cpu) | |
21001 | -{ | |
21002 | - WRITE_ONCE(per_cpu(srcu_online, cpu), false); | |
21003 | -} | |
21004 | - | |
b3bbd485 | 21005 | /* |
e4b2b4a8 JK |
21006 | * Place the workqueue handler on the specified CPU if online, otherwise |
21007 | * just run it wherever. This is useful for placing workqueue handlers | |
b3bbd485 | 21008 | @@ -450,12 +464,12 @@ static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
1a6e0f06 | 21009 | { |
e4b2b4a8 JK |
21010 | bool ret; |
21011 | ||
21012 | - preempt_disable(); | |
21013 | - if (READ_ONCE(per_cpu(srcu_online, cpu))) | |
21014 | + cpus_read_lock(); | |
21015 | + if (cpu_online(cpu)) | |
21016 | ret = queue_delayed_work_on(cpu, wq, dwork, delay); | |
21017 | else | |
21018 | ret = queue_delayed_work(wq, dwork, delay); | |
21019 | - preempt_enable(); | |
21020 | + cpus_read_unlock(); | |
21021 | return ret; | |
1a6e0f06 JK |
21022 | } |
21023 | ||
b3bbd485 | 21024 | @@ -513,7 +527,7 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21025 | mutex_lock(&sp->srcu_cb_mutex); |
21026 | ||
21027 | /* End the current grace period. */ | |
21028 | - raw_spin_lock_irq_rcu_node(sp); | |
21029 | + spin_lock_irq_rcu_node(sp); | |
21030 | idx = rcu_seq_state(sp->srcu_gp_seq); | |
21031 | WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); | |
21032 | cbdelay = srcu_get_delay(sp); | |
b3bbd485 | 21033 | @@ -522,7 +536,7 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21034 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); |
21035 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) | |
21036 | sp->srcu_gp_seq_needed_exp = gpseq; | |
21037 | - raw_spin_unlock_irq_rcu_node(sp); | |
21038 | + spin_unlock_irq_rcu_node(sp); | |
21039 | mutex_unlock(&sp->srcu_gp_mutex); | |
21040 | /* A new grace period can start at this point. But only one. */ | |
21041 | ||
b3bbd485 | 21042 | @@ -530,7 +544,7 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21043 | idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); |
21044 | idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); | |
21045 | rcu_for_each_node_breadth_first(sp, snp) { | |
21046 | - raw_spin_lock_irq_rcu_node(snp); | |
21047 | + spin_lock_irq_rcu_node(snp); | |
21048 | cbs = false; | |
21049 | if (snp >= sp->level[rcu_num_lvls - 1]) | |
21050 | cbs = snp->srcu_have_cbs[idx] == gpseq; | |
b3bbd485 | 21051 | @@ -540,7 +554,7 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21052 | snp->srcu_gp_seq_needed_exp = gpseq; |
21053 | mask = snp->srcu_data_have_cbs[idx]; | |
21054 | snp->srcu_data_have_cbs[idx] = 0; | |
21055 | - raw_spin_unlock_irq_rcu_node(snp); | |
21056 | + spin_unlock_irq_rcu_node(snp); | |
21057 | if (cbs) | |
21058 | srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); | |
21059 | ||
b3bbd485 | 21060 | @@ -548,11 +562,11 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21061 | if (!(gpseq & counter_wrap_check)) |
21062 | for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { | |
21063 | sdp = per_cpu_ptr(sp->sda, cpu); | |
21064 | - raw_spin_lock_irqsave_rcu_node(sdp, flags); | |
21065 | + spin_lock_irqsave_rcu_node(sdp, flags); | |
21066 | if (ULONG_CMP_GE(gpseq, | |
21067 | sdp->srcu_gp_seq_needed + 100)) | |
21068 | sdp->srcu_gp_seq_needed = gpseq; | |
21069 | - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); | |
21070 | + spin_unlock_irqrestore_rcu_node(sdp, flags); | |
21071 | } | |
21072 | } | |
1a6e0f06 | 21073 | |
b3bbd485 | 21074 | @@ -560,17 +574,17 @@ static void srcu_gp_end(struct srcu_struct *sp) |
e4b2b4a8 JK |
21075 | mutex_unlock(&sp->srcu_cb_mutex); |
21076 | ||
21077 | /* Start a new grace period if needed. */ | |
21078 | - raw_spin_lock_irq_rcu_node(sp); | |
21079 | + spin_lock_irq_rcu_node(sp); | |
21080 | gpseq = rcu_seq_current(&sp->srcu_gp_seq); | |
21081 | if (!rcu_seq_state(gpseq) && | |
21082 | ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { | |
21083 | srcu_gp_start(sp); | |
21084 | - raw_spin_unlock_irq_rcu_node(sp); | |
21085 | + spin_unlock_irq_rcu_node(sp); | |
21086 | /* Throttle expedited grace periods: Should be rare! */ | |
21087 | srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff | |
21088 | ? 0 : SRCU_INTERVAL); | |
21089 | } else { | |
21090 | - raw_spin_unlock_irq_rcu_node(sp); | |
21091 | + spin_unlock_irq_rcu_node(sp); | |
21092 | } | |
21093 | } | |
1a6e0f06 | 21094 | |
b3bbd485 | 21095 | @@ -590,18 +604,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, |
e4b2b4a8 JK |
21096 | if (rcu_seq_done(&sp->srcu_gp_seq, s) || |
21097 | ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) | |
21098 | return; | |
21099 | - raw_spin_lock_irqsave_rcu_node(snp, flags); | |
21100 | + spin_lock_irqsave_rcu_node(snp, flags); | |
21101 | if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { | |
21102 | - raw_spin_unlock_irqrestore_rcu_node(snp, flags); | |
21103 | + spin_unlock_irqrestore_rcu_node(snp, flags); | |
21104 | return; | |
21105 | } | |
21106 | WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); | |
21107 | - raw_spin_unlock_irqrestore_rcu_node(snp, flags); | |
21108 | + spin_unlock_irqrestore_rcu_node(snp, flags); | |
21109 | } | |
21110 | - raw_spin_lock_irqsave_rcu_node(sp, flags); | |
21111 | + spin_lock_irqsave_rcu_node(sp, flags); | |
21112 | if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) | |
21113 | sp->srcu_gp_seq_needed_exp = s; | |
21114 | - raw_spin_unlock_irqrestore_rcu_node(sp, flags); | |
21115 | + spin_unlock_irqrestore_rcu_node(sp, flags); | |
21116 | } | |
1a6e0f06 | 21117 | |
e4b2b4a8 | 21118 | /* |
b3bbd485 | 21119 | @@ -623,12 +637,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, |
e4b2b4a8 JK |
21120 | for (; snp != NULL; snp = snp->srcu_parent) { |
21121 | if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) | |
21122 | return; /* GP already done and CBs recorded. */ | |
21123 | - raw_spin_lock_irqsave_rcu_node(snp, flags); | |
21124 | + spin_lock_irqsave_rcu_node(snp, flags); | |
21125 | if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { | |
21126 | snp_seq = snp->srcu_have_cbs[idx]; | |
21127 | if (snp == sdp->mynode && snp_seq == s) | |
21128 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; | |
21129 | - raw_spin_unlock_irqrestore_rcu_node(snp, flags); | |
21130 | + spin_unlock_irqrestore_rcu_node(snp, flags); | |
21131 | if (snp == sdp->mynode && snp_seq != s) { | |
21132 | srcu_schedule_cbs_sdp(sdp, do_norm | |
21133 | ? SRCU_INTERVAL | |
b3bbd485 | 21134 | @@ -644,11 +658,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, |
e4b2b4a8 JK |
21135 | snp->srcu_data_have_cbs[idx] |= sdp->grpmask; |
21136 | if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) | |
21137 | snp->srcu_gp_seq_needed_exp = s; | |
21138 | - raw_spin_unlock_irqrestore_rcu_node(snp, flags); | |
21139 | + spin_unlock_irqrestore_rcu_node(snp, flags); | |
21140 | } | |
21141 | ||
21142 | /* Top of tree, must ensure the grace period will be started. */ | |
21143 | - raw_spin_lock_irqsave_rcu_node(sp, flags); | |
21144 | + spin_lock_irqsave_rcu_node(sp, flags); | |
21145 | if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { | |
21146 | /* | |
21147 | * Record need for grace period s. Pair with load | |
b3bbd485 | 21148 | @@ -667,7 +681,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, |
e4b2b4a8 JK |
21149 | queue_delayed_work(system_power_efficient_wq, &sp->work, |
21150 | srcu_get_delay(sp)); | |
21151 | } | |
21152 | - raw_spin_unlock_irqrestore_rcu_node(sp, flags); | |
21153 | + spin_unlock_irqrestore_rcu_node(sp, flags); | |
1a6e0f06 | 21154 | } |
1a6e0f06 | 21155 | |
e4b2b4a8 | 21156 | /* |
b3bbd485 | 21157 | @@ -736,6 +750,8 @@ static void srcu_flip(struct srcu_struct *sp) |
e4b2b4a8 JK |
21158 | * negligible when amortized over that time period, and the extra latency | |
21159 | * of a needlessly non-expedited grace period is similarly negligible. | |
21160 | */ | |
21161 | +static DEFINE_LOCAL_IRQ_LOCK(sp_llock); | |
21162 | + | |
21163 | static bool srcu_might_be_idle(struct srcu_struct *sp) | |
1a6e0f06 | 21164 | { |
e4b2b4a8 | 21165 | unsigned long curseq; |
b3bbd485 | 21166 | @@ -744,13 +760,13 @@ static bool srcu_might_be_idle(struct srcu_struct *sp) |
e4b2b4a8 | 21167 | unsigned long t; |
1a6e0f06 | 21168 | |
e4b2b4a8 JK |
21169 | /* If the local srcu_data structure has callbacks, not idle. */ |
21170 | - local_irq_save(flags); | |
21171 | + local_lock_irqsave(sp_llock, flags); | |
21172 | sdp = this_cpu_ptr(sp->sda); | |
21173 | if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { | |
21174 | - local_irq_restore(flags); | |
21175 | + local_unlock_irqrestore(sp_llock, flags); | |
21176 | return false; /* Callbacks already present, so not idle. */ | |
21177 | } | |
21178 | - local_irq_restore(flags); | |
21179 | + local_unlock_irqrestore(sp_llock, flags); | |
1a6e0f06 | 21180 | |
e4b2b4a8 JK |
21181 | /* |
21182 | * No local callbacks, so probabilistically probe global state. | |
b3bbd485 | 21183 | @@ -828,9 +844,9 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, |
e4b2b4a8 JK |
21184 | return; |
21185 | } | |
21186 | rhp->func = func; | |
21187 | - local_irq_save(flags); | |
21188 | + local_lock_irqsave(sp_llock, flags); | |
21189 | sdp = this_cpu_ptr(sp->sda); | |
21190 | - raw_spin_lock_rcu_node(sdp); | |
21191 | + spin_lock_rcu_node(sdp); | |
21192 | rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); | |
21193 | rcu_segcblist_advance(&sdp->srcu_cblist, | |
21194 | rcu_seq_current(&sp->srcu_gp_seq)); | |
b3bbd485 | 21195 | @@ -844,7 +860,8 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, |
e4b2b4a8 JK |
21196 | sdp->srcu_gp_seq_needed_exp = s; |
21197 | needexp = true; | |
21198 | } | |
21199 | - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); | |
21200 | + spin_unlock_rcu_node(sdp); | |
21201 | + local_unlock_irqrestore(sp_llock, flags); | |
21202 | if (needgp) | |
21203 | srcu_funnel_gp_start(sp, sdp, s, do_norm); | |
21204 | else if (needexp) | |
b3bbd485 | 21205 | @@ -900,7 +917,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) |
e4b2b4a8 JK |
21206 | |
21207 | /* | |
21208 | * Make sure that later code is ordered after the SRCU grace | |
21209 | - * period. This pairs with the raw_spin_lock_irq_rcu_node() | |
21210 | + * period. This pairs with the spin_lock_irq_rcu_node() | |
21211 | * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed | |
21212 | * because the current CPU might have been totally uninvolved with | |
21213 | * (and thus unordered against) that grace period. | |
b3bbd485 | 21214 | @@ -1024,7 +1041,7 @@ void srcu_barrier(struct srcu_struct *sp) |
e4b2b4a8 JK |
21215 | */ |
21216 | for_each_possible_cpu(cpu) { | |
21217 | sdp = per_cpu_ptr(sp->sda, cpu); | |
21218 | - raw_spin_lock_irq_rcu_node(sdp); | |
21219 | + spin_lock_irq_rcu_node(sdp); | |
21220 | atomic_inc(&sp->srcu_barrier_cpu_cnt); | |
21221 | sdp->srcu_barrier_head.func = srcu_barrier_cb; | |
21222 | debug_rcu_head_queue(&sdp->srcu_barrier_head); | |
b3bbd485 | 21223 | @@ -1033,7 +1050,7 @@ void srcu_barrier(struct srcu_struct *sp) |
e4b2b4a8 JK |
21224 | debug_rcu_head_unqueue(&sdp->srcu_barrier_head); |
21225 | atomic_dec(&sp->srcu_barrier_cpu_cnt); | |
21226 | } | |
21227 | - raw_spin_unlock_irq_rcu_node(sdp); | |
21228 | + spin_unlock_irq_rcu_node(sdp); | |
21229 | } | |
21230 | ||
21231 | /* Remove the initial count, at which point reaching zero can happen. */ | |
b3bbd485 | 21232 | @@ -1082,17 +1099,17 @@ static void srcu_advance_state(struct srcu_struct *sp) |
e4b2b4a8 JK |
21233 | */ |
21234 | idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ | |
21235 | if (idx == SRCU_STATE_IDLE) { | |
21236 | - raw_spin_lock_irq_rcu_node(sp); | |
21237 | + spin_lock_irq_rcu_node(sp); | |
21238 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | |
21239 | WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); | |
21240 | - raw_spin_unlock_irq_rcu_node(sp); | |
21241 | + spin_unlock_irq_rcu_node(sp); | |
21242 | mutex_unlock(&sp->srcu_gp_mutex); | |
21243 | return; | |
21244 | } | |
21245 | idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); | |
21246 | if (idx == SRCU_STATE_IDLE) | |
21247 | srcu_gp_start(sp); | |
21248 | - raw_spin_unlock_irq_rcu_node(sp); | |
21249 | + spin_unlock_irq_rcu_node(sp); | |
21250 | if (idx != SRCU_STATE_IDLE) { | |
21251 | mutex_unlock(&sp->srcu_gp_mutex); | |
21252 | return; /* Someone else started the grace period. */ | |
b3bbd485 | 21253 | @@ -1141,19 +1158,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) |
e4b2b4a8 JK |
21254 | sdp = container_of(work, struct srcu_data, work.work); |
21255 | sp = sdp->sp; | |
21256 | rcu_cblist_init(&ready_cbs); | |
21257 | - raw_spin_lock_irq_rcu_node(sdp); | |
21258 | + spin_lock_irq_rcu_node(sdp); | |
21259 | rcu_segcblist_advance(&sdp->srcu_cblist, | |
21260 | rcu_seq_current(&sp->srcu_gp_seq)); | |
21261 | if (sdp->srcu_cblist_invoking || | |
21262 | !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { | |
21263 | - raw_spin_unlock_irq_rcu_node(sdp); | |
21264 | + spin_unlock_irq_rcu_node(sdp); | |
21265 | return; /* Someone else on the job or nothing to do. */ | |
21266 | } | |
21267 | ||
21268 | /* We are on the job! Extract and invoke ready callbacks. */ | |
21269 | sdp->srcu_cblist_invoking = true; | |
21270 | rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); | |
21271 | - raw_spin_unlock_irq_rcu_node(sdp); | |
21272 | + spin_unlock_irq_rcu_node(sdp); | |
21273 | rhp = rcu_cblist_dequeue(&ready_cbs); | |
21274 | for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { | |
21275 | debug_rcu_head_unqueue(rhp); | |
b3bbd485 | 21276 | @@ -1166,13 +1183,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) |
e4b2b4a8 JK |
21277 | * Update counts, accelerate new callbacks, and if needed, |
21278 | * schedule another round of callback invocation. | |
21279 | */ | |
21280 | - raw_spin_lock_irq_rcu_node(sdp); | |
21281 | + spin_lock_irq_rcu_node(sdp); | |
21282 | rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); | |
21283 | (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, | |
21284 | rcu_seq_snap(&sp->srcu_gp_seq)); | |
21285 | sdp->srcu_cblist_invoking = false; | |
21286 | more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); | |
21287 | - raw_spin_unlock_irq_rcu_node(sdp); | |
21288 | + spin_unlock_irq_rcu_node(sdp); | |
21289 | if (more) | |
21290 | srcu_schedule_cbs_sdp(sdp, 0); | |
21291 | } | |
b3bbd485 | 21292 | @@ -1185,7 +1202,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) |
e4b2b4a8 JK |
21293 | { |
21294 | bool pushgp = true; | |
21295 | ||
21296 | - raw_spin_lock_irq_rcu_node(sp); | |
21297 | + spin_lock_irq_rcu_node(sp); | |
21298 | if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { | |
21299 | if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { | |
21300 | /* All requests fulfilled, time to go idle. */ | |
b3bbd485 | 21301 | @@ -1195,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) |
e4b2b4a8 JK |
21302 | /* Outstanding request and no GP. Start one. */ |
21303 | srcu_gp_start(sp); | |
21304 | } | |
21305 | - raw_spin_unlock_irq_rcu_node(sp); | |
21306 | + spin_unlock_irq_rcu_node(sp); | |
21307 | ||
21308 | if (pushgp) | |
21309 | queue_delayed_work(system_power_efficient_wq, &sp->work, delay); | |
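
The sp_llock conversion is the standard RT local-lock pattern: on non-RT configurations local_lock_irqsave() compiles down to local_irq_save(), while on RT it takes a per-CPU sleeping lock, keeping the section preemptible so that the now-sleeping spin_lock_rcu_node() may legally be taken inside it. A hedged sketch of the pattern; locallock.h is an RT-series header and the per-CPU datum is hypothetical:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_LOCAL_IRQ_LOCK(demo_llock);
    static DEFINE_PER_CPU(int, demo_count);

    static void demo_update(void)
    {
        unsigned long flags;

        /* !RT: local_irq_save(flags); RT: per-CPU rtmutex, preemptible */
        local_lock_irqsave(demo_llock, flags);
        __this_cpu_inc(demo_count);
        local_unlock_irqrestore(demo_llock, flags);
    }
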
b3bbd485 JK |
21310 | diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c |
21311 | index 3e3650e94ae6..0a722b56d90b 100644 | |
21312 | --- a/kernel/rcu/tree.c | |
21313 | +++ b/kernel/rcu/tree.c | |
e4b2b4a8 JK |
21314 | @@ -58,6 +58,11 @@ |
21315 | #include <linux/trace_events.h> | |
21316 | #include <linux/suspend.h> | |
21317 | #include <linux/ftrace.h> | |
21318 | +#include <linux/delay.h> | |
21319 | +#include <linux/gfp.h> | |
21320 | +#include <linux/oom.h> | |
21321 | +#include <linux/smpboot.h> | |
21322 | +#include "../time/tick-internal.h" | |
21323 | ||
21324 | #include "tree.h" | |
21325 | #include "rcu.h" | |
b3bbd485 | 21326 | @@ -243,6 +248,19 @@ void rcu_sched_qs(void) |
e4b2b4a8 JK |
21327 | this_cpu_ptr(&rcu_sched_data), true); |
21328 | } | |
21329 | ||
21330 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
21331 | +static void rcu_preempt_qs(void); | |
1a6e0f06 | 21332 | + |
e4b2b4a8 JK |
21333 | +void rcu_bh_qs(void) |
21334 | +{ | |
21335 | + unsigned long flags; | |
1a6e0f06 | 21336 | + |
e4b2b4a8 JK |
21337 | + /* Callers to this function, rcu_preempt_qs(), must disable irqs. */ |
21338 | + local_irq_save(flags); | |
21339 | + rcu_preempt_qs(); | |
21340 | + local_irq_restore(flags); | |
21341 | +} | |
1a6e0f06 | 21342 | +#else |
e4b2b4a8 JK |
21343 | void rcu_bh_qs(void) |
21344 | { | |
21345 | RCU_LOCKDEP_WARN(preemptible(), "rcu_bh_qs() invoked with preemption enabled!!!"); | |
b3bbd485 | 21346 | @@ -253,6 +271,7 @@ void rcu_bh_qs(void) |
e4b2b4a8 JK |
21347 | __this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false); |
21348 | } | |
21349 | } | |
1a6e0f06 | 21350 | +#endif |
1a6e0f06 | 21351 | |
e4b2b4a8 JK |
21352 | /* |
21353 | * Steal a bit from the bottom of ->dynticks for idle entry/exit | |
b3bbd485 | 21354 | @@ -564,11 +583,13 @@ EXPORT_SYMBOL_GPL(rcu_batches_started_sched); |
e4b2b4a8 JK |
21355 | /* |
21356 | * Return the number of RCU BH batches started thus far for debug & stats. | |
21357 | */ | |
21358 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
21359 | unsigned long rcu_batches_started_bh(void) | |
21360 | { | |
21361 | return rcu_bh_state.gpnum; | |
21362 | } | |
21363 | EXPORT_SYMBOL_GPL(rcu_batches_started_bh); | |
21364 | +#endif | |
1a6e0f06 | 21365 | |
e4b2b4a8 JK |
21366 | /* |
21367 | * Return the number of RCU batches completed thus far for debug & stats. | |
b3bbd485 | 21368 | @@ -588,6 +609,7 @@ unsigned long rcu_batches_completed_sched(void) |
e4b2b4a8 JK |
21369 | } |
21370 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); | |
1a6e0f06 | 21371 | |
e4b2b4a8 JK |
21372 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21373 | /* | |
21374 | * Return the number of RCU BH batches completed thus far for debug & stats. | |
21375 | */ | |
b3bbd485 | 21376 | @@ -596,6 +618,7 @@ unsigned long rcu_batches_completed_bh(void) |
e4b2b4a8 JK |
21377 | return rcu_bh_state.completed; |
21378 | } | |
21379 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | |
1a6e0f06 | 21380 | +#endif |
e4b2b4a8 JK |
21381 | |
21382 | /* | |
21383 | * Return the number of RCU expedited batches completed thus far for | |
b3bbd485 | 21384 | @@ -619,6 +642,7 @@ unsigned long rcu_exp_batches_completed_sched(void) |
1a6e0f06 | 21385 | } |
e4b2b4a8 | 21386 | EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); |
1a6e0f06 | 21387 | |
e4b2b4a8 JK |
21388 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21389 | /* | |
21390 | * Force a quiescent state. | |
21391 | */ | |
b3bbd485 | 21392 | @@ -637,6 +661,13 @@ void rcu_bh_force_quiescent_state(void) |
1a6e0f06 | 21393 | } |
e4b2b4a8 | 21394 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
1a6e0f06 | 21395 | |
1a6e0f06 | 21396 | +#else |
e4b2b4a8 JK |
21397 | +void rcu_force_quiescent_state(void) |
21398 | +{ | |
21399 | +} | |
21400 | +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | |
1a6e0f06 | 21401 | +#endif |
e4b2b4a8 JK |
21402 | + |
21403 | /* | |
21404 | * Force a quiescent state for RCU-sched. | |
21405 | */ | |
b3bbd485 | 21406 | @@ -687,9 +718,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, |
e4b2b4a8 JK |
21407 | case RCU_FLAVOR: |
21408 | rsp = rcu_state_p; | |
21409 | break; | |
21410 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
21411 | case RCU_BH_FLAVOR: | |
21412 | rsp = &rcu_bh_state; | |
21413 | break; | |
21414 | +#endif | |
21415 | case RCU_SCHED_FLAVOR: | |
21416 | rsp = &rcu_sched_state; | |
21417 | break; | |
b3bbd485 | 21418 | @@ -2918,18 +2951,17 @@ __rcu_process_callbacks(struct rcu_state *rsp) |
e4b2b4a8 JK |
21419 | /* |
21420 | * Do RCU core processing for the current CPU. | |
21421 | */ | |
21422 | -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) | |
21423 | +static __latent_entropy void rcu_process_callbacks(void) | |
21424 | { | |
21425 | struct rcu_state *rsp; | |
1a6e0f06 | 21426 | |
e4b2b4a8 JK |
21427 | if (cpu_is_offline(smp_processor_id())) |
21428 | return; | |
21429 | - trace_rcu_utilization(TPS("Start RCU core")); | |
21430 | for_each_rcu_flavor(rsp) | |
21431 | __rcu_process_callbacks(rsp); | |
21432 | - trace_rcu_utilization(TPS("End RCU core")); | |
1a6e0f06 JK |
21433 | } |
21434 | ||
e4b2b4a8 JK |
21435 | +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); |
21436 | /* | |
21437 | * Schedule RCU callback invocation. If the specified type of RCU | |
21438 | * does not support RCU priority boosting, just do a direct call, | |
b3bbd485 | 21439 | @@ -2941,19 +2973,106 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) |
1a6e0f06 | 21440 | { |
e4b2b4a8 JK |
21441 | if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) |
21442 | return; | |
21443 | - if (likely(!rsp->boost)) { | |
21444 | - rcu_do_batch(rsp, rdp); | |
b3bbd485 JK |
21445 | - return; |
21446 | - } | |
21447 | - invoke_rcu_callbacks_kthread(); | |
e4b2b4a8 | 21448 | + rcu_do_batch(rsp, rdp); |
b3bbd485 JK |
21449 | } |
21450 | ||
e4b2b4a8 JK |
21451 | +static void rcu_wake_cond(struct task_struct *t, int status) |
21452 | +{ | |
21453 | + /* | |
21454 | + * If the thread is yielding, only wake it when this | |
21455 | + * is invoked from idle. | |
21456 | + */ | |
21457 | + if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) | |
21458 | + wake_up_process(t); | |
21459 | +} | |
1a6e0f06 | 21460 | + |
e4b2b4a8 JK |
21461 | +/* |
21462 | + * Wake up this CPU's rcuc kthread to do RCU core processing. | |
21463 | + */ | |
b3bbd485 JK |
21464 | static void invoke_rcu_core(void) |
21465 | { | |
21466 | - if (cpu_online(smp_processor_id())) | |
21467 | - raise_softirq(RCU_SOFTIRQ); | |
e4b2b4a8 JK |
21468 | + unsigned long flags; |
21469 | + struct task_struct *t; | |
1a6e0f06 | 21470 | + |
e4b2b4a8 | 21471 | + if (!cpu_online(smp_processor_id())) |
b3bbd485 | 21472 | + return; |
e4b2b4a8 JK |
21473 | + local_irq_save(flags); |
21474 | + __this_cpu_write(rcu_cpu_has_work, 1); | |
21475 | + t = __this_cpu_read(rcu_cpu_kthread_task); | |
21476 | + if (t != NULL && current != t) | |
21477 | + rcu_wake_cond(t, __this_cpu_read(rcu_cpu_kthread_status)); | |
21478 | + local_irq_restore(flags); | |
21479 | +} | |
1a6e0f06 | 21480 | + |
e4b2b4a8 JK |
21481 | +static void rcu_cpu_kthread_park(unsigned int cpu) |
21482 | +{ | |
21483 | + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | |
21484 | +} | |
1a6e0f06 | 21485 | + |
e4b2b4a8 | 21486 | +static int rcu_cpu_kthread_should_run(unsigned int cpu) |
1a6e0f06 | 21487 | +{ |
e4b2b4a8 | 21488 | + return __this_cpu_read(rcu_cpu_has_work); |
b3bbd485 JK |
21489 | } |
21490 | ||
e4b2b4a8 JK |
21491 | +/* |
21492 | + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | |
21493 | + * RCU softirq used in flavors and configurations of RCU that do not | |
21494 | + * support RCU priority boosting. | |
21495 | + */ | |
21496 | +static void rcu_cpu_kthread(unsigned int cpu) | |
21497 | +{ | |
21498 | + unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); | |
21499 | + char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); | |
21500 | + int spincnt; | |
21501 | + | |
21502 | + for (spincnt = 0; spincnt < 10; spincnt++) { | |
21503 | + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | |
21504 | + local_bh_disable(); | |
21505 | + *statusp = RCU_KTHREAD_RUNNING; | |
21506 | + this_cpu_inc(rcu_cpu_kthread_loops); | |
21507 | + local_irq_disable(); | |
21508 | + work = *workp; | |
21509 | + *workp = 0; | |
21510 | + local_irq_enable(); | |
21511 | + if (work) | |
21512 | + rcu_process_callbacks(); | |
21513 | + local_bh_enable(); | |
21514 | + if (*workp == 0) { | |
21515 | + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | |
21516 | + *statusp = RCU_KTHREAD_WAITING; | |
21517 | + return; | |
21518 | + } | |
b3bbd485 | 21519 | + } |
e4b2b4a8 JK |
21520 | + *statusp = RCU_KTHREAD_YIELDING; |
21521 | + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | |
21522 | + schedule_timeout_interruptible(2); | |
21523 | + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | |
21524 | + *statusp = RCU_KTHREAD_WAITING; | |
b3bbd485 JK |
21525 | +} |
21526 | + | |
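Annotation: the loop structure of rcu_cpu_kthread() above is worth calling out. It makes at most ten passes over the per-CPU work flag; if new work keeps arriving, it steps back to RCU_KTHREAD_YIELDING and sleeps for two jiffies so other tasks can run. A compact standalone rendering of that shape (C11 sketch; nanosleep approximates the two-jiffy yield and the names are illustrative):

    #include <stdatomic.h>
    #include <time.h>

    #define MAX_PASSES 10

    static atomic_int pending;           /* analogue of rcu_cpu_has_work */

    static void process_callbacks(void) { /* drain the actual work here */ }

    static void service_once(void)
    {
            for (int pass = 0; pass < MAX_PASSES; pass++) {
                    int work = atomic_exchange(&pending, 0);

                    if (work)
                            process_callbacks();
                    if (!atomic_load(&pending))
                            return;      /* caught up: back to waiting */
            }
            /* Still busy after MAX_PASSES: yield briefly instead of hogging. */
            nanosleep(&(struct timespec){ .tv_nsec = 2 * 1000 * 1000 }, NULL);
    }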
e4b2b4a8 JK |
21527 | +static struct smp_hotplug_thread rcu_cpu_thread_spec = { |
21528 | + .store = &rcu_cpu_kthread_task, | |
21529 | + .thread_should_run = rcu_cpu_kthread_should_run, | |
21530 | + .thread_fn = rcu_cpu_kthread, | |
21531 | + .thread_comm = "rcuc/%u", | |
21532 | + .setup = rcu_cpu_kthread_setup, | |
21533 | + .park = rcu_cpu_kthread_park, | |
21534 | +}; | |
21535 | + | |
21536 | +/* | |
21537 | + * Spawn per-CPU RCU core processing kthreads. | |
21538 | + */ | |
21539 | +static int __init rcu_spawn_core_kthreads(void) | |
b3bbd485 | 21540 | +{ |
e4b2b4a8 JK |
21541 | + int cpu; |
21542 | + | |
21543 | + for_each_possible_cpu(cpu) | |
21544 | + per_cpu(rcu_cpu_has_work, cpu) = 0; | |
21545 | + BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | |
21546 | + return 0; | |
b3bbd485 | 21547 | +} |
e4b2b4a8 | 21548 | +early_initcall(rcu_spawn_core_kthreads); |
b3bbd485 | 21549 | + |
e4b2b4a8 JK |
21550 | /* |
21551 | * Handle any core-RCU processing required by a call_rcu() invocation. | |
b3bbd485 JK |
21552 | */ |
21553 | @@ -3113,6 +3232,7 @@ void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) | |
1a6e0f06 | 21554 | } |
e4b2b4a8 | 21555 | EXPORT_SYMBOL_GPL(call_rcu_sched); |
1a6e0f06 | 21556 | |
e4b2b4a8 | 21557 | +#ifndef CONFIG_PREEMPT_RT_FULL |
1a6e0f06 | 21558 | /** |
e4b2b4a8 JK |
21559 | * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period. |
21560 | * @head: structure to be used for queueing the RCU updates. | |
b3bbd485 | 21561 | @@ -3140,6 +3260,7 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) |
e4b2b4a8 | 21562 | __call_rcu(head, func, &rcu_bh_state, -1, 0); |
1a6e0f06 | 21563 | } |
e4b2b4a8 JK |
21564 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
21565 | +#endif | |
1a6e0f06 | 21566 | |
e4b2b4a8 JK |
21567 | /* |
21568 | * Queue an RCU callback for lazy invocation after a grace period. | |
b3bbd485 | 21569 | @@ -3225,6 +3346,7 @@ void synchronize_sched(void) |
e4b2b4a8 JK |
21570 | } |
21571 | EXPORT_SYMBOL_GPL(synchronize_sched); | |
1a6e0f06 | 21572 | |
e4b2b4a8 JK |
21573 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21574 | /** | |
21575 | * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. | |
21576 | * | |
b3bbd485 | 21577 | @@ -3251,6 +3373,7 @@ void synchronize_rcu_bh(void) |
e4b2b4a8 JK |
21578 | wait_rcu_gp(call_rcu_bh); |
21579 | } | |
21580 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | |
21581 | +#endif | |
1a6e0f06 | 21582 | |
e4b2b4a8 JK |
21583 | /** |
21584 | * get_state_synchronize_rcu - Snapshot current RCU state | |
b3bbd485 | 21585 | @@ -3601,6 +3724,7 @@ static void _rcu_barrier(struct rcu_state *rsp) |
e4b2b4a8 JK |
21586 | mutex_unlock(&rsp->barrier_mutex); |
21587 | } | |
1a6e0f06 | 21588 | |
e4b2b4a8 JK |
21589 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21590 | /** | |
21591 | * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. | |
21592 | */ | |
b3bbd485 | 21593 | @@ -3609,6 +3733,7 @@ void rcu_barrier_bh(void) |
e4b2b4a8 JK |
21594 | _rcu_barrier(&rcu_bh_state); |
21595 | } | |
21596 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | |
21597 | +#endif | |
1a6e0f06 | 21598 | |
e4b2b4a8 JK |
21599 | /** |
21600 | * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. | |
b3bbd485 | 21601 | @@ -3741,8 +3866,6 @@ int rcutree_online_cpu(unsigned int cpu) |
e4b2b4a8 JK |
21602 | { |
21603 | sync_sched_exp_online_cleanup(cpu); | |
21604 | rcutree_affinity_setting(cpu, -1); | |
21605 | - if (IS_ENABLED(CONFIG_TREE_SRCU)) | |
21606 | - srcu_online_cpu(cpu); | |
21607 | return 0; | |
21608 | } | |
1a6e0f06 | 21609 | |
b3bbd485 | 21610 | @@ -3753,8 +3876,6 @@ int rcutree_online_cpu(unsigned int cpu) |
e4b2b4a8 JK |
21611 | int rcutree_offline_cpu(unsigned int cpu) |
21612 | { | |
21613 | rcutree_affinity_setting(cpu, cpu); | |
21614 | - if (IS_ENABLED(CONFIG_TREE_SRCU)) | |
21615 | - srcu_offline_cpu(cpu); | |
21616 | return 0; | |
21617 | } | |
1a6e0f06 | 21618 | |
b3bbd485 | 21619 | @@ -4184,12 +4305,13 @@ void __init rcu_init(void) |
1a6e0f06 | 21620 | |
e4b2b4a8 JK |
21621 | rcu_bootup_announce(); |
21622 | rcu_init_geometry(); | |
21623 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
21624 | rcu_init_one(&rcu_bh_state); | |
21625 | +#endif | |
21626 | rcu_init_one(&rcu_sched_state); | |
21627 | if (dump_tree) | |
21628 | rcu_dump_rcu_node_tree(&rcu_sched_state); | |
21629 | __rcu_init_preempt(); | |
21630 | - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | |
1a6e0f06 | 21631 | |
e4b2b4a8 JK |
21632 | /* |
21633 | * We don't need protection against CPU-hotplug here because | |
b3bbd485 | 21634 | @@ -4200,8 +4322,6 @@ void __init rcu_init(void) |
e4b2b4a8 JK |
21635 | for_each_online_cpu(cpu) { |
21636 | rcutree_prepare_cpu(cpu); | |
21637 | rcu_cpu_starting(cpu); | |
21638 | - if (IS_ENABLED(CONFIG_TREE_SRCU)) | |
21639 | - srcu_online_cpu(cpu); | |
21640 | } | |
1a6e0f06 JK |
21641 | } |
21642 | ||
b3bbd485 JK |
21643 | diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h |
21644 | index 8e1f285f0a70..7acc23da94e2 100644 | |
21645 | --- a/kernel/rcu/tree.h | |
21646 | +++ b/kernel/rcu/tree.h | |
21647 | @@ -427,7 +427,9 @@ extern struct list_head rcu_struct_flavors; | |
e4b2b4a8 JK |
21648 | */ |
21649 | extern struct rcu_state rcu_sched_state; | |
1a6e0f06 | 21650 | |
e4b2b4a8 JK |
21651 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21652 | extern struct rcu_state rcu_bh_state; | |
21653 | +#endif | |
1a6e0f06 | 21654 | |
e4b2b4a8 JK |
21655 | #ifdef CONFIG_PREEMPT_RCU |
21656 | extern struct rcu_state rcu_preempt_state; | |
b3bbd485 | 21657 | @@ -436,12 +438,10 @@ extern struct rcu_state rcu_preempt_state; |
e4b2b4a8 JK |
21658 | int rcu_dynticks_snap(struct rcu_dynticks *rdtp); |
21659 | bool rcu_eqs_special_set(int cpu); | |
1a6e0f06 | 21660 | |
e4b2b4a8 JK |
21661 | -#ifdef CONFIG_RCU_BOOST |
21662 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | |
21663 | DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); | |
21664 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | |
21665 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | |
21666 | -#endif /* #ifdef CONFIG_RCU_BOOST */ | |
1a6e0f06 | 21667 | |
e4b2b4a8 JK |
21668 | #ifndef RCU_TREE_NONCORE |
21669 | ||
b3bbd485 | 21670 | @@ -461,10 +461,9 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); |
e4b2b4a8 JK |
21671 | static void __init __rcu_init_preempt(void); |
21672 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | |
21673 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | |
21674 | -static void invoke_rcu_callbacks_kthread(void); | |
21675 | static bool rcu_is_callbacks_kthread(void); | |
21676 | +static void rcu_cpu_kthread_setup(unsigned int cpu); | |
21677 | #ifdef CONFIG_RCU_BOOST | |
21678 | -static void rcu_preempt_do_callbacks(void); | |
21679 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |
21680 | struct rcu_node *rnp); | |
21681 | #endif /* #ifdef CONFIG_RCU_BOOST */ | |
b3bbd485 JK |
21682 | diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h |
21683 | index 8b3102d22823..17ee8d1f38c4 100644 | |
21684 | --- a/kernel/rcu/tree_plugin.h | |
21685 | +++ b/kernel/rcu/tree_plugin.h | |
e4b2b4a8 JK |
21686 | @@ -24,39 +24,16 @@ |
21687 | * Paul E. McKenney <paulmck@linux.vnet.ibm.com> | |
21688 | */ | |
21689 | ||
21690 | -#include <linux/delay.h> | |
21691 | -#include <linux/gfp.h> | |
21692 | -#include <linux/oom.h> | |
21693 | -#include <linux/sched/debug.h> | |
21694 | -#include <linux/smpboot.h> | |
21695 | -#include <uapi/linux/sched/types.h> | |
21696 | -#include "../time/tick-internal.h" | |
21697 | - | |
21698 | -#ifdef CONFIG_RCU_BOOST | |
21699 | - | |
21700 | #include "../locking/rtmutex_common.h" | |
21701 | ||
21702 | /* | |
21703 | * Control variables for per-CPU and per-rcu_node kthreads. These | |
21704 | * handle all flavors of RCU. | |
21705 | */ | |
21706 | -static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | |
21707 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | |
21708 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | |
21709 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | |
21710 | ||
21711 | -#else /* #ifdef CONFIG_RCU_BOOST */ | |
21712 | - | |
21713 | -/* | |
21714 | - * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, | |
21715 | - * all uses are in dead code. Provide a definition to keep the compiler | |
21716 | - * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. | |
21717 | - * This probably needs to be excluded from -rt builds. | |
21718 | - */ | |
21719 | -#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) | |
21720 | - | |
21721 | -#endif /* #else #ifdef CONFIG_RCU_BOOST */ | |
21722 | - | |
21723 | #ifdef CONFIG_RCU_NOCB_CPU | |
21724 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | |
21725 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | |
b3bbd485 | 21726 | @@ -324,9 +301,13 @@ static void rcu_preempt_note_context_switch(bool preempt) |
e4b2b4a8 JK |
21727 | struct task_struct *t = current; |
21728 | struct rcu_data *rdp; | |
21729 | struct rcu_node *rnp; | |
21730 | + int sleeping_l = 0; | |
21731 | ||
21732 | RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n"); | |
21733 | - WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0); | |
21734 | +#if defined(CONFIG_PREEMPT_RT_FULL) | |
21735 | + sleeping_l = t->sleeping_lock; | |
21736 | +#endif | |
21737 | + WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l); | |
21738 | if (t->rcu_read_lock_nesting > 0 && | |
21739 | !t->rcu_read_unlock_special.b.blocked) { | |
21740 | ||
b3bbd485 | 21741 | @@ -463,7 +444,7 @@ void rcu_read_unlock_special(struct task_struct *t) |
e4b2b4a8 JK |
21742 | } |
21743 | ||
21744 | /* Hardware IRQ handlers cannot block, complain if they get here. */ | |
21745 | - if (in_irq() || in_serving_softirq()) { | |
21746 | + if (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET)) { | |
21747 | lockdep_rcu_suspicious(__FILE__, __LINE__, | |
21748 | "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); | |
21749 | pr_alert("->rcu_read_unlock_special: %#x (b: %d, enq: %d nq: %d)\n", | |
b3bbd485 | 21750 | @@ -530,7 +511,7 @@ void rcu_read_unlock_special(struct task_struct *t) |
e4b2b4a8 JK |
21751 | |
21752 | /* Unboost if we were boosted. */ | |
21753 | if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex) | |
21754 | - rt_mutex_unlock(&rnp->boost_mtx); | |
21755 | + rt_mutex_futex_unlock(&rnp->boost_mtx); | |
21756 | ||
21757 | /* | |
21758 | * If this was the last task on the expedited lists, | |
b3bbd485 | 21759 | @@ -684,15 +665,6 @@ static void rcu_preempt_check_callbacks(void) |
e4b2b4a8 JK |
21760 | t->rcu_read_unlock_special.b.need_qs = true; |
21761 | } | |
21762 | ||
21763 | -#ifdef CONFIG_RCU_BOOST | |
21764 | - | |
21765 | -static void rcu_preempt_do_callbacks(void) | |
21766 | -{ | |
21767 | - rcu_do_batch(rcu_state_p, this_cpu_ptr(rcu_data_p)); | |
21768 | -} | |
21769 | - | |
21770 | -#endif /* #ifdef CONFIG_RCU_BOOST */ | |
21771 | - | |
21772 | /** | |
21773 | * call_rcu() - Queue an RCU callback for invocation after a grace period. | |
21774 | * @head: structure to be used for queueing the RCU updates. | |
b3bbd485 | 21775 | @@ -915,20 +887,23 @@ void exit_rcu(void) |
e4b2b4a8 JK |
21776 | |
21777 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | |
1a6e0f06 | 21778 | |
1a6e0f06 | 21779 | +/* |
e4b2b4a8 | 21780 | + * If boosting, set rcuc kthreads to realtime priority. |
1a6e0f06 | 21781 | + */ |
e4b2b4a8 | 21782 | +static void rcu_cpu_kthread_setup(unsigned int cpu) |
1a6e0f06 | 21783 | +{ |
b3bbd485 | 21784 | +#ifdef CONFIG_RCU_BOOST |
e4b2b4a8 | 21785 | + struct sched_param sp; |
b3bbd485 JK |
21786 | + |
21787 | + sp.sched_priority = kthread_prio; | |
21788 | + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | |
21789 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
21790 | +} | |
21791 | + | |
21792 | #ifdef CONFIG_RCU_BOOST | |
21793 | ||
21794 | #include "../locking/rtmutex_common.h" | |
e4b2b4a8 | 21795 | |
e4b2b4a8 JK |
21796 | -static void rcu_wake_cond(struct task_struct *t, int status) |
21797 | -{ | |
21798 | - /* | |
21799 | - * If the thread is yielding, only wake it when this | |
21800 | - * is invoked from idle | |
21801 | - */ | |
21802 | - if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | |
21803 | - wake_up_process(t); | |
b3bbd485 JK |
21804 | -} |
21805 | - | |
e4b2b4a8 JK |
21806 | /* |
21807 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | |
21808 | * or ->boost_tasks, advancing the pointer to the next task in the | |
b3bbd485 JK |
21809 | @@ -1070,23 +1045,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
21810 | } | |
e4b2b4a8 | 21811 | } |
1a6e0f06 | 21812 | |
b3bbd485 | 21813 | -/* |
e4b2b4a8 JK |
21814 | - * Wake up the per-CPU kthread to invoke RCU callbacks. |
21815 | - */ | |
21816 | -static void invoke_rcu_callbacks_kthread(void) | |
21817 | -{ | |
21818 | - unsigned long flags; | |
21819 | - | |
21820 | - local_irq_save(flags); | |
21821 | - __this_cpu_write(rcu_cpu_has_work, 1); | |
21822 | - if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && | |
21823 | - current != __this_cpu_read(rcu_cpu_kthread_task)) { | |
21824 | - rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), | |
21825 | - __this_cpu_read(rcu_cpu_kthread_status)); | |
21826 | - } | |
21827 | - local_irq_restore(flags); | |
21828 | -} | |
21829 | - | |
b3bbd485 | 21830 | /* |
e4b2b4a8 JK |
21831 | * Is the current CPU running the RCU-callbacks kthread? |
21832 | * Caller must have preemption disabled. | |
b3bbd485 | 21833 | @@ -1141,67 +1099,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
e4b2b4a8 JK |
21834 | return 0; |
21835 | } | |
1a6e0f06 | 21836 | |
e4b2b4a8 JK |
21837 | -static void rcu_kthread_do_work(void) |
21838 | -{ | |
21839 | - rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); | |
21840 | - rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); | |
21841 | - rcu_preempt_do_callbacks(); | |
21842 | -} | |
21843 | - | |
21844 | -static void rcu_cpu_kthread_setup(unsigned int cpu) | |
21845 | -{ | |
21846 | - struct sched_param sp; | |
21847 | - | |
21848 | - sp.sched_priority = kthread_prio; | |
21849 | - sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | |
21850 | -} | |
21851 | - | |
21852 | -static void rcu_cpu_kthread_park(unsigned int cpu) | |
21853 | -{ | |
21854 | - per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | |
21855 | -} | |
21856 | - | |
21857 | -static int rcu_cpu_kthread_should_run(unsigned int cpu) | |
21858 | -{ | |
21859 | - return __this_cpu_read(rcu_cpu_has_work); | |
21860 | -} | |
21861 | - | |
21862 | -/* | |
21863 | - * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | |
21864 | - * RCU softirq used in flavors and configurations of RCU that do not | |
21865 | - * support RCU priority boosting. | |
21866 | - */ | |
21867 | -static void rcu_cpu_kthread(unsigned int cpu) | |
21868 | -{ | |
21869 | - unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); | |
21870 | - char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); | |
21871 | - int spincnt; | |
21872 | - | |
21873 | - for (spincnt = 0; spincnt < 10; spincnt++) { | |
21874 | - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | |
21875 | - local_bh_disable(); | |
21876 | - *statusp = RCU_KTHREAD_RUNNING; | |
21877 | - this_cpu_inc(rcu_cpu_kthread_loops); | |
21878 | - local_irq_disable(); | |
21879 | - work = *workp; | |
21880 | - *workp = 0; | |
21881 | - local_irq_enable(); | |
21882 | - if (work) | |
21883 | - rcu_kthread_do_work(); | |
21884 | - local_bh_enable(); | |
21885 | - if (*workp == 0) { | |
21886 | - trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | |
21887 | - *statusp = RCU_KTHREAD_WAITING; | |
21888 | - return; | |
21889 | - } | |
21890 | - } | |
21891 | - *statusp = RCU_KTHREAD_YIELDING; | |
21892 | - trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | |
21893 | - schedule_timeout_interruptible(2); | |
21894 | - trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | |
21895 | - *statusp = RCU_KTHREAD_WAITING; | |
21896 | -} | |
21897 | - | |
21898 | /* | |
21899 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | |
21900 | * served by the rcu_node in question. The CPU hotplug lock is still | |
b3bbd485 | 21901 | @@ -1232,26 +1129,12 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
e4b2b4a8 JK |
21902 | free_cpumask_var(cm); |
21903 | } | |
1a6e0f06 | 21904 | |
e4b2b4a8 JK |
21905 | -static struct smp_hotplug_thread rcu_cpu_thread_spec = { |
21906 | - .store = &rcu_cpu_kthread_task, | |
21907 | - .thread_should_run = rcu_cpu_kthread_should_run, | |
21908 | - .thread_fn = rcu_cpu_kthread, | |
21909 | - .thread_comm = "rcuc/%u", | |
21910 | - .setup = rcu_cpu_kthread_setup, | |
21911 | - .park = rcu_cpu_kthread_park, | |
21912 | -}; | |
21913 | - | |
21914 | /* | |
21915 | * Spawn boost kthreads -- called as soon as the scheduler is running. | |
21916 | */ | |
21917 | static void __init rcu_spawn_boost_kthreads(void) | |
21918 | { | |
21919 | struct rcu_node *rnp; | |
21920 | - int cpu; | |
21921 | - | |
21922 | - for_each_possible_cpu(cpu) | |
21923 | - per_cpu(rcu_cpu_has_work, cpu) = 0; | |
21924 | - BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | |
21925 | rcu_for_each_leaf_node(rcu_state_p, rnp) | |
21926 | (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); | |
21927 | } | |
b3bbd485 | 21928 | @@ -1274,11 +1157,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
e4b2b4a8 JK |
21929 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
21930 | } | |
1a6e0f06 | 21931 | |
e4b2b4a8 JK |
21932 | -static void invoke_rcu_callbacks_kthread(void) |
21933 | -{ | |
21934 | - WARN_ON_ONCE(1); | |
21935 | -} | |
21936 | - | |
21937 | static bool rcu_is_callbacks_kthread(void) | |
21938 | { | |
21939 | return false; | |
b3bbd485 | 21940 | @@ -1302,7 +1180,7 @@ static void rcu_prepare_kthreads(int cpu) |
1a6e0f06 | 21941 | |
e4b2b4a8 | 21942 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ |
1a6e0f06 | 21943 | |
e4b2b4a8 JK |
21944 | -#if !defined(CONFIG_RCU_FAST_NO_HZ) |
21945 | +#if !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) | |
1a6e0f06 | 21946 | |
e4b2b4a8 JK |
21947 | /* |
21948 | * Check to see if any future RCU-related work will need to be done | |
b3bbd485 | 21949 | @@ -1318,7 +1196,9 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) |
e4b2b4a8 JK |
21950 | *nextevt = KTIME_MAX; |
21951 | return rcu_cpu_has_callbacks(NULL); | |
1a6e0f06 | 21952 | } |
e4b2b4a8 | 21953 | +#endif /* !defined(CONFIG_RCU_FAST_NO_HZ) || defined(CONFIG_PREEMPT_RT_FULL) */ |
1a6e0f06 | 21954 | |
e4b2b4a8 JK |
21955 | +#if !defined(CONFIG_RCU_FAST_NO_HZ) |
21956 | /* | |
21957 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | |
21958 | * after it. | |
b3bbd485 | 21959 | @@ -1414,6 +1294,8 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) |
e4b2b4a8 JK |
21960 | return cbs_ready; |
21961 | } | |
1f39f580 | 21962 | |
e4b2b4a8 JK |
21963 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21964 | + | |
21965 | /* | |
21966 | * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready | |
21967 | * to invoke. If the CPU has callbacks, try to advance them. Tell the | |
b3bbd485 | 21968 | @@ -1456,6 +1338,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) |
e4b2b4a8 JK |
21969 | *nextevt = basemono + dj * TICK_NSEC; |
21970 | return 0; | |
21971 | } | |
21972 | +#endif /* #ifndef CONFIG_PREEMPT_RT_FULL */ | |
1f39f580 | 21973 | |
e4b2b4a8 JK |
21974 | /* |
21975 | * Prepare a CPU for idle from an RCU perspective. The first major task | |
b3bbd485 JK |
21976 | diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c |
21977 | index 7a577bd989a4..2006a09680aa 100644 | |
21978 | --- a/kernel/rcu/update.c | |
21979 | +++ b/kernel/rcu/update.c | |
21980 | @@ -66,7 +66,7 @@ extern int rcu_expedited; /* from sysctl */ | |
e4b2b4a8 JK |
21981 | module_param(rcu_expedited, int, 0); |
21982 | extern int rcu_normal; /* from sysctl */ | |
21983 | module_param(rcu_normal, int, 0); | |
21984 | -static int rcu_normal_after_boot; | |
21985 | +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL); | |
21986 | module_param(rcu_normal_after_boot, int, 0); | |
21987 | #endif /* #ifndef CONFIG_TINY_RCU */ | |
1f39f580 | 21988 | |
b3bbd485 | 21989 | @@ -333,6 +333,7 @@ int rcu_read_lock_held(void) |
e4b2b4a8 JK |
21990 | } |
21991 | EXPORT_SYMBOL_GPL(rcu_read_lock_held); | |
1f39f580 | 21992 | |
e4b2b4a8 JK |
21993 | +#ifndef CONFIG_PREEMPT_RT_FULL |
21994 | /** | |
21995 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? | |
21996 | * | |
b3bbd485 | 21997 | @@ -359,6 +360,7 @@ int rcu_read_lock_bh_held(void) |
e4b2b4a8 JK |
21998 | return in_softirq() || irqs_disabled(); |
21999 | } | |
22000 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | |
22001 | +#endif | |
1f39f580 | 22002 | |
e4b2b4a8 | 22003 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
1f39f580 | 22004 | |
b3bbd485 JK |
22005 | diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile |
22006 | index a9ee16bbc693..9943019095e9 100644 | |
22007 | --- a/kernel/sched/Makefile | |
22008 | +++ b/kernel/sched/Makefile | |
22009 | @@ -18,7 +18,7 @@ endif | |
22010 | ||
22011 | obj-y += core.o loadavg.o clock.o cputime.o | |
22012 | obj-y += idle_task.o fair.o rt.o deadline.o | |
22013 | -obj-y += wait.o wait_bit.o swait.o completion.o idle.o | |
22014 | +obj-y += wait.o wait_bit.o swait.o swork.o completion.o idle.o | |
22015 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | |
22016 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | |
22017 | obj-$(CONFIG_SCHEDSTATS) += stats.o | |
22018 | diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c | |
22019 | index 2ddaec40956f..0fe2982e46a0 100644 | |
22020 | --- a/kernel/sched/completion.c | |
22021 | +++ b/kernel/sched/completion.c | |
22022 | @@ -32,7 +32,7 @@ void complete(struct completion *x) | |
e4b2b4a8 JK |
22023 | { |
22024 | unsigned long flags; | |
1f39f580 | 22025 | |
e4b2b4a8 JK |
22026 | - spin_lock_irqsave(&x->wait.lock, flags); |
22027 | + raw_spin_lock_irqsave(&x->wait.lock, flags); | |
1f39f580 | 22028 | |
e4b2b4a8 JK |
22029 | /* |
22030 | * Perform commit of crossrelease here. | |
b3bbd485 | 22031 | @@ -41,8 +41,8 @@ void complete(struct completion *x) |
1f39f580 | 22032 | |
e4b2b4a8 JK |
22033 | if (x->done != UINT_MAX) |
22034 | x->done++; | |
22035 | - __wake_up_locked(&x->wait, TASK_NORMAL, 1); | |
22036 | - spin_unlock_irqrestore(&x->wait.lock, flags); | |
22037 | + swake_up_locked(&x->wait); | |
22038 | + raw_spin_unlock_irqrestore(&x->wait.lock, flags); | |
22039 | } | |
22040 | EXPORT_SYMBOL(complete); | |
1f39f580 | 22041 | |
b3bbd485 | 22042 | @@ -66,10 +66,10 @@ void complete_all(struct completion *x) |
e4b2b4a8 JK |
22043 | { |
22044 | unsigned long flags; | |
1f39f580 | 22045 | |
e4b2b4a8 JK |
22046 | - spin_lock_irqsave(&x->wait.lock, flags); |
22047 | + raw_spin_lock_irqsave(&x->wait.lock, flags); | |
22048 | x->done = UINT_MAX; | |
22049 | - __wake_up_locked(&x->wait, TASK_NORMAL, 0); | |
22050 | - spin_unlock_irqrestore(&x->wait.lock, flags); | |
22051 | + swake_up_all_locked(&x->wait); | |
22052 | + raw_spin_unlock_irqrestore(&x->wait.lock, flags); | |
22053 | } | |
22054 | EXPORT_SYMBOL(complete_all); | |
1f39f580 | 22055 | |
b3bbd485 | 22056 | @@ -78,20 +78,20 @@ do_wait_for_common(struct completion *x, |
e4b2b4a8 JK |
22057 | long (*action)(long), long timeout, int state) |
22058 | { | |
22059 | if (!x->done) { | |
22060 | - DECLARE_WAITQUEUE(wait, current); | |
22061 | + DECLARE_SWAITQUEUE(wait); | |
1f39f580 | 22062 | |
e4b2b4a8 JK |
22063 | - __add_wait_queue_entry_tail_exclusive(&x->wait, &wait); |
22064 | + __prepare_to_swait(&x->wait, &wait); | |
22065 | do { | |
22066 | if (signal_pending_state(state, current)) { | |
22067 | timeout = -ERESTARTSYS; | |
22068 | break; | |
22069 | } | |
22070 | __set_current_state(state); | |
22071 | - spin_unlock_irq(&x->wait.lock); | |
22072 | + raw_spin_unlock_irq(&x->wait.lock); | |
22073 | timeout = action(timeout); | |
22074 | - spin_lock_irq(&x->wait.lock); | |
22075 | + raw_spin_lock_irq(&x->wait.lock); | |
22076 | } while (!x->done && timeout); | |
22077 | - __remove_wait_queue(&x->wait, &wait); | |
22078 | + __finish_swait(&x->wait, &wait); | |
22079 | if (!x->done) | |
22080 | return timeout; | |
22081 | } | |
b3bbd485 | 22082 | @@ -108,9 +108,9 @@ __wait_for_common(struct completion *x, |
1f39f580 | 22083 | |
e4b2b4a8 | 22084 | complete_acquire(x); |
1f39f580 | 22085 | |
e4b2b4a8 JK |
22086 | - spin_lock_irq(&x->wait.lock); |
22087 | + raw_spin_lock_irq(&x->wait.lock); | |
22088 | timeout = do_wait_for_common(x, action, timeout, state); | |
22089 | - spin_unlock_irq(&x->wait.lock); | |
22090 | + raw_spin_unlock_irq(&x->wait.lock); | |
1f39f580 | 22091 | |
e4b2b4a8 JK |
22092 | complete_release(x); |
22093 | ||
b3bbd485 | 22094 | @@ -299,12 +299,12 @@ bool try_wait_for_completion(struct completion *x) |
e4b2b4a8 JK |
22095 | if (!READ_ONCE(x->done)) |
22096 | return 0; | |
22097 | ||
22098 | - spin_lock_irqsave(&x->wait.lock, flags); | |
22099 | + raw_spin_lock_irqsave(&x->wait.lock, flags); | |
22100 | if (!x->done) | |
22101 | ret = 0; | |
22102 | else if (x->done != UINT_MAX) | |
22103 | x->done--; | |
22104 | - spin_unlock_irqrestore(&x->wait.lock, flags); | |
22105 | + raw_spin_unlock_irqrestore(&x->wait.lock, flags); | |
1f39f580 JK |
22106 | return ret; |
22107 | } | |
e4b2b4a8 | 22108 | EXPORT_SYMBOL(try_wait_for_completion); |
b3bbd485 | 22109 | @@ -330,8 +330,8 @@ bool completion_done(struct completion *x) |
e4b2b4a8 JK |
22110 | * otherwise we can end up freeing the completion before complete() |
22111 | * is done referencing it. | |
22112 | */ | |
22113 | - spin_lock_irqsave(&x->wait.lock, flags); | |
22114 | - spin_unlock_irqrestore(&x->wait.lock, flags); | |
22115 | + raw_spin_lock_irqsave(&x->wait.lock, flags); | |
22116 | + raw_spin_unlock_irqrestore(&x->wait.lock, flags); | |
22117 | return true; | |
22118 | } | |
22119 | EXPORT_SYMBOL(completion_done); | |
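Annotation: the completion.c hunks above all make the same move. The wait_queue_head and its sleeping spinlock are replaced by a simple wait queue (swait) guarded by a raw spinlock, so that complete() stays callable from contexts that must not block on RT. The semantics being preserved are just a counter plus a wait, which a user-space model shows concisely (a sketch with pthread mutex/condvar as stand-ins; the kernel's raw-spinlock discipline has no direct user-space equivalent):

    #include <pthread.h>
    #include <limits.h>

    struct toy_completion {
            pthread_mutex_t lock;
            pthread_cond_t  wait;
            unsigned int    done;
    };

    #define TOY_COMPLETION_INITIALIZER \
            { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 }

    static void toy_complete(struct toy_completion *x)
    {
            pthread_mutex_lock(&x->lock);
            if (x->done != UINT_MAX)
                    x->done++;
            pthread_cond_signal(&x->wait);   /* cf. swake_up_locked(): wake one */
            pthread_mutex_unlock(&x->lock);
    }

    static void toy_wait_for_completion(struct toy_completion *x)
    {
            pthread_mutex_lock(&x->lock);
            while (!x->done)
                    pthread_cond_wait(&x->wait, &x->lock);
            if (x->done != UINT_MAX)         /* UINT_MAX == complete_all() */
                    x->done--;
            pthread_mutex_unlock(&x->lock);
    }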
b3bbd485 JK |
22120 | diff --git a/kernel/sched/core.c b/kernel/sched/core.c |
22121 | index 4e89ed8a0fb2..6e6bd5262f23 100644 | |
22122 | --- a/kernel/sched/core.c | |
22123 | +++ b/kernel/sched/core.c | |
22124 | @@ -59,7 +59,11 @@ const_debug unsigned int sysctl_sched_features = | |
e4b2b4a8 JK |
22125 | * Number of tasks to iterate in a single balance run. |
22126 | * Limited because this is done with IRQs disabled. | |
22127 | */ | |
22128 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
22129 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | |
22130 | +#else | |
22131 | +const_debug unsigned int sysctl_sched_nr_migrate = 8; | |
22132 | +#endif | |
1f39f580 | 22133 | |
e4b2b4a8 JK |
22134 | /* |
22135 | * period over which we average the RT time consumption, measured | |
b3bbd485 | 22136 | @@ -341,7 +345,7 @@ static void init_rq_hrtick(struct rq *rq) |
e4b2b4a8 JK |
22137 | rq->hrtick_csd.info = rq; |
22138 | #endif | |
1f39f580 | 22139 | |
e4b2b4a8 JK |
22140 | - hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
22141 | + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); | |
22142 | rq->hrtick_timer.function = hrtick; | |
22143 | } | |
22144 | #else /* CONFIG_SCHED_HRTICK */ | |
b3bbd485 | 22145 | @@ -423,9 +427,15 @@ static bool set_nr_if_polling(struct task_struct *p) |
e4b2b4a8 JK |
22146 | #endif |
22147 | #endif | |
1f39f580 | 22148 | |
e4b2b4a8 JK |
22149 | -void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
22150 | +void __wake_q_add(struct wake_q_head *head, struct task_struct *task, | |
22151 | + bool sleeper) | |
22152 | { | |
22153 | - struct wake_q_node *node = &task->wake_q; | |
22154 | + struct wake_q_node *node; | |
22155 | + | |
22156 | + if (sleeper) | |
22157 | + node = &task->wake_q_sleeper; | |
22158 | + else | |
22159 | + node = &task->wake_q; | |
1f39f580 | 22160 | |
e4b2b4a8 JK |
22161 | /* |
22162 | * Atomically grab the task, if ->wake_q is !nil already it means | |
b3bbd485 | 22163 | @@ -447,24 +457,32 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) |
e4b2b4a8 JK |
22164 | head->lastp = &node->next; |
22165 | } | |
22166 | ||
22167 | -void wake_up_q(struct wake_q_head *head) | |
22168 | +void __wake_up_q(struct wake_q_head *head, bool sleeper) | |
22169 | { | |
22170 | struct wake_q_node *node = head->first; | |
22171 | ||
22172 | while (node != WAKE_Q_TAIL) { | |
22173 | struct task_struct *task; | |
22174 | ||
22175 | - task = container_of(node, struct task_struct, wake_q); | |
22176 | + if (sleeper) | |
22177 | + task = container_of(node, struct task_struct, wake_q_sleeper); | |
22178 | + else | |
22179 | + task = container_of(node, struct task_struct, wake_q); | |
22180 | BUG_ON(!task); | |
22181 | /* Task can safely be re-inserted now: */ | |
22182 | node = node->next; | |
22183 | - task->wake_q.next = NULL; | |
22184 | - | |
22185 | + if (sleeper) | |
22186 | + task->wake_q_sleeper.next = NULL; | |
22187 | + else | |
22188 | + task->wake_q.next = NULL; | |
22189 | /* | |
22190 | * wake_up_process() implies a wmb() to pair with the queueing | |
22191 | * in wake_q_add() so as not to miss wakeups. | |
22192 | */ | |
22193 | - wake_up_process(task); | |
22194 | + if (sleeper) | |
22195 | + wake_up_lock_sleeper(task); | |
22196 | + else | |
22197 | + wake_up_process(task); | |
22198 | put_task_struct(task); | |
1f39f580 | 22199 | } |
e4b2b4a8 | 22200 | } |
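Annotation: a detail the diff context only alludes to ("Atomically grab the task, if ->wake_q is !nil already..."): a task may sit on at most one wake queue at a time, so the add path first claims the task's queue node with a cmpxchg and simply drops the request if the node is busy. The sleeper variant added here just selects a second, independent node, so lock-sleeper wakeups and ordinary wakeups cannot collide. A minimal C11 sketch of the claim-and-append step (illustrative names, not the kernel API):

    #include <stdatomic.h>
    #include <stddef.h>

    #define WAKE_Q_TAIL ((struct wake_q_node *)0x1)

    struct wake_q_node {
            _Atomic(struct wake_q_node *) next;
    };

    struct wake_q_head {
            struct wake_q_node *first;
            _Atomic(struct wake_q_node *) *lastp;
    };

    static int wake_q_claim_and_add(struct wake_q_head *head,
                                    struct wake_q_node *node)
    {
            struct wake_q_node *expected = NULL;

            /* Claim the node; losing the race means the task is queued. */
            if (!atomic_compare_exchange_strong(&node->next, &expected,
                                                WAKE_Q_TAIL))
                    return 0;

            /* We own the node: append it to the singly linked list. */
            atomic_store(head->lastp, node);
            head->lastp = &node->next;
            return 1;
    }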
b3bbd485 | 22201 | @@ -500,6 +518,48 @@ void resched_curr(struct rq *rq) |
e4b2b4a8 JK |
22202 | trace_sched_wake_idle_without_ipi(cpu); |
22203 | } | |
1f39f580 | 22204 | |
e4b2b4a8 JK |
22205 | +#ifdef CONFIG_PREEMPT_LAZY |
22206 | + | |
22207 | +static int tsk_is_polling(struct task_struct *p) | |
22208 | +{ | |
22209 | +#ifdef TIF_POLLING_NRFLAG | |
22210 | + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); | |
22211 | +#else | |
22212 | + return 0; | |
22213 | +#endif | |
22214 | +} | |
22215 | + | |
22216 | +void resched_curr_lazy(struct rq *rq) | |
22217 | +{ | |
22218 | + struct task_struct *curr = rq->curr; | |
22219 | + int cpu; | |
22220 | + | |
22221 | + if (!sched_feat(PREEMPT_LAZY)) { | |
22222 | + resched_curr(rq); | |
22223 | + return; | |
22224 | + } | |
22225 | + | |
22226 | + lockdep_assert_held(&rq->lock); | |
22227 | + | |
22228 | + if (test_tsk_need_resched(curr)) | |
22229 | + return; | |
22230 | + | |
22231 | + if (test_tsk_need_resched_lazy(curr)) | |
22232 | + return; | |
22233 | + | |
22234 | + set_tsk_need_resched_lazy(curr); | |
22235 | + | |
22236 | + cpu = cpu_of(rq); | |
22237 | + if (cpu == smp_processor_id()) | |
22238 | + return; | |
22239 | + | |
22240 | + /* NEED_RESCHED_LAZY must be visible before we test polling */ | |
22241 | + smp_mb(); | |
22242 | + if (!tsk_is_polling(curr)) | |
22243 | + smp_send_reschedule(cpu); | |
22244 | +} | |
22245 | +#endif | |
22246 | + | |
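Annotation: resched_curr_lazy() above encodes the PREEMPT_LAZY policy: a pending hard or lazy resched already guarantees a switch, the local CPU will notice the flag on its own, and only a remote, non-polling CPU needs a reschedule IPI. The same decision ladder as standalone C (a sketch; boolean fields stand in for the thread-info flag bits):

    #include <stdbool.h>
    #include <stdio.h>

    struct cpu_task {
            bool need_resched;       /* TIF_NEED_RESCHED */
            bool need_resched_lazy;  /* TIF_NEED_RESCHED_LAZY */
            bool polling;            /* TIF_POLLING_NRFLAG */
            int  cpu;
    };

    static void send_reschedule_ipi(int cpu) { printf("IPI -> cpu %d\n", cpu); }

    static void resched_lazy(struct cpu_task *curr, int this_cpu)
    {
            /* A pending hard or lazy resched already guarantees a switch. */
            if (curr->need_resched || curr->need_resched_lazy)
                    return;

            curr->need_resched_lazy = true;

            if (curr->cpu == this_cpu)
                    return;          /* local CPU notices the flag on its own */

            /*
             * A polling idle CPU watches the flag word; only non-polling
             * CPUs need a wakeup interrupt.
             */
            if (!curr->polling)
                    send_reschedule_ipi(curr->cpu);
    }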
22247 | void resched_cpu(int cpu) | |
1f39f580 | 22248 | { |
e4b2b4a8 | 22249 | struct rq *rq = cpu_rq(cpu); |
b3bbd485 | 22250 | @@ -523,11 +583,14 @@ void resched_cpu(int cpu) |
e4b2b4a8 JK |
22251 | */ |
22252 | int get_nohz_timer_target(void) | |
22253 | { | |
22254 | - int i, cpu = smp_processor_id(); | |
22255 | + int i, cpu; | |
22256 | struct sched_domain *sd; | |
1f39f580 | 22257 | |
e4b2b4a8 JK |
22258 | + preempt_disable_rt(); |
22259 | + cpu = smp_processor_id(); | |
22260 | + | |
22261 | if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) | |
22262 | - return cpu; | |
22263 | + goto preempt_en_rt; | |
1f39f580 | 22264 | |
e4b2b4a8 JK |
22265 | rcu_read_lock(); |
22266 | for_each_domain(cpu, sd) { | |
b3bbd485 | 22267 | @@ -546,6 +609,8 @@ int get_nohz_timer_target(void) |
e4b2b4a8 JK |
22268 | cpu = housekeeping_any_cpu(); |
22269 | unlock: | |
22270 | rcu_read_unlock(); | |
22271 | +preempt_en_rt: | |
22272 | + preempt_enable_rt(); | |
22273 | return cpu; | |
1f39f580 JK |
22274 | } |
22275 | ||
b3bbd485 | 22276 | @@ -912,10 +977,10 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) |
e4b2b4a8 JK |
22277 | */ |
22278 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) | |
1f39f580 | 22279 | { |
e4b2b4a8 JK |
22280 | - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) |
22281 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) | |
22282 | return false; | |
1f39f580 | 22283 | |
b3bbd485 JK |
22284 | - if (is_per_cpu_kthread(p)) |
22285 | + if (is_per_cpu_kthread(p) || __migrate_disabled(p)) | |
22286 | return cpu_online(cpu); | |
22287 | ||
22288 | return cpu_active(cpu); | |
22289 | @@ -1007,7 +1072,7 @@ static int migration_cpu_stop(void *data) | |
e4b2b4a8 | 22290 | local_irq_disable(); |
1f39f580 | 22291 | /* |
e4b2b4a8 JK |
22292 | * We need to explicitly wake pending tasks before running |
22293 | - * __migrate_task() such that we will not miss enforcing cpus_allowed | |
22294 | + * __migrate_task() such that we will not miss enforcing cpus_ptr | |
22295 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | |
22296 | */ | |
22297 | sched_ttwu_pending(); | |
b3bbd485 | 22298 | @@ -1038,11 +1103,19 @@ static int migration_cpu_stop(void *data) |
e4b2b4a8 JK |
22299 | */ |
22300 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) | |
1f39f580 | 22301 | { |
e4b2b4a8 JK |
22302 | - cpumask_copy(&p->cpus_allowed, new_mask); |
22303 | + cpumask_copy(&p->cpus_mask, new_mask); | |
22304 | p->nr_cpus_allowed = cpumask_weight(new_mask); | |
22305 | } | |
1f39f580 | 22306 | |
e4b2b4a8 | 22307 | -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
b3bbd485 | 22308 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
22309 | +int __migrate_disabled(struct task_struct *p) |
22310 | +{ | |
22311 | + return p->migrate_disable; | |
22312 | +} | |
22313 | +#endif | |
22314 | + | |
22315 | +static void __do_set_cpus_allowed_tail(struct task_struct *p, | |
22316 | + const struct cpumask *new_mask) | |
22317 | { | |
22318 | struct rq *rq = task_rq(p); | |
22319 | bool queued, running; | |
b3bbd485 | 22320 | @@ -1071,6 +1144,20 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
e4b2b4a8 | 22321 | set_curr_task(rq, p); |
1f39f580 JK |
22322 | } |
22323 | ||
e4b2b4a8 JK |
22324 | +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) |
22325 | +{ | |
b3bbd485 | 22326 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
22327 | + if (__migrate_disabled(p)) { |
22328 | + lockdep_assert_held(&p->pi_lock); | |
22329 | + | |
22330 | + cpumask_copy(&p->cpus_mask, new_mask); | |
22331 | + p->migrate_disable_update = 1; | |
22332 | + return; | |
22333 | + } | |
22334 | +#endif | |
22335 | + __do_set_cpus_allowed_tail(p, new_mask); | |
22336 | +} | |
22337 | + | |
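Annotation: the wrapper above introduces the RT rule for affinity changes. While a task has migration disabled, do_set_cpus_allowed() may update cpus_mask but must not move the task; the move is recorded in migrate_disable_update and replayed when the last migrate_disable() level is dropped. A toy model of that defer-and-replay protocol (the field names follow the patch; everything else is illustrative):

    #include <stdbool.h>

    typedef unsigned long cpumask_t;      /* one bit per CPU, toy version */

    struct rt_task {
            cpumask_t cpus_mask;
            int       migrate_disable;
            bool      migrate_disable_update;
    };

    static void apply_mask(struct rt_task *p) { /* requeue on an allowed CPU */ }

    static void toy_set_cpus_allowed(struct rt_task *p, cpumask_t new_mask)
    {
            p->cpus_mask = new_mask;
            if (p->migrate_disable) {
                    p->migrate_disable_update = true;  /* defer the move */
                    return;
            }
            apply_mask(p);                             /* apply immediately */
    }

    static void toy_migrate_enable(struct rt_task *p)
    {
            if (--p->migrate_disable)
                    return;
            if (p->migrate_disable_update) {
                    p->migrate_disable_update = false;
                    apply_mask(p);                     /* replay deferred update */
            }
    }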
22338 | /* | |
22339 | * Change a given task's CPU affinity. Migrate the thread to a | |
22340 | * proper CPU and schedule it away if the CPU it's executing on | |
b3bbd485 | 22341 | @@ -1108,7 +1195,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, |
e4b2b4a8 JK |
22342 | goto out; |
22343 | } | |
1f39f580 | 22344 | |
e4b2b4a8 JK |
22345 | - if (cpumask_equal(&p->cpus_allowed, new_mask)) |
22346 | + if (cpumask_equal(p->cpus_ptr, new_mask)) | |
22347 | goto out; | |
1f39f580 | 22348 | |
e4b2b4a8 | 22349 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { |
b3bbd485 | 22350 | @@ -1129,9 +1216,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, |
e4b2b4a8 | 22351 | } |
1f39f580 | 22352 | |
e4b2b4a8 JK |
22353 | /* Can the task run on the task's current CPU? If so, we're done */ |
22354 | - if (cpumask_test_cpu(task_cpu(p), new_mask)) | |
22355 | + if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) | |
22356 | goto out; | |
1f39f580 | 22357 | |
b3bbd485 | 22358 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
22359 | + if (__migrate_disabled(p)) { |
22360 | + p->migrate_disable_update = 1; | |
22361 | + goto out; | |
22362 | + } | |
22363 | +#endif | |
22364 | + | |
22365 | dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); | |
22366 | if (task_running(rq, p) || p->state == TASK_WAKING) { | |
22367 | struct migration_arg arg = { p, dest_cpu }; | |
b3bbd485 | 22368 | @@ -1269,10 +1363,10 @@ static int migrate_swap_stop(void *data) |
e4b2b4a8 JK |
22369 | if (task_cpu(arg->src_task) != arg->src_cpu) |
22370 | goto unlock; | |
1f39f580 | 22371 | |
e4b2b4a8 JK |
22372 | - if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) |
22373 | + if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) | |
22374 | goto unlock; | |
1f39f580 | 22375 | |
e4b2b4a8 JK |
22376 | - if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) |
22377 | + if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) | |
22378 | goto unlock; | |
1a6e0f06 | 22379 | |
e4b2b4a8 | 22380 | __migrate_swap_task(arg->src_task, arg->dst_cpu); |
b3bbd485 | 22381 | @@ -1313,10 +1407,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) |
e4b2b4a8 JK |
22382 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) |
22383 | goto out; | |
1a6e0f06 | 22384 | |
e4b2b4a8 JK |
22385 | - if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) |
22386 | + if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) | |
22387 | goto out; | |
1a6e0f06 | 22388 | |
e4b2b4a8 JK |
22389 | - if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) |
22390 | + if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) | |
22391 | goto out; | |
1a6e0f06 | 22392 | |
e4b2b4a8 | 22393 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); |
b3bbd485 | 22394 | @@ -1326,6 +1420,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p) |
e4b2b4a8 | 22395 | return ret; |
1a6e0f06 JK |
22396 | } |
22397 | ||
e4b2b4a8 JK |
22398 | +static bool check_task_state(struct task_struct *p, long match_state) |
22399 | +{ | |
22400 | + bool match = false; | |
22401 | + | |
22402 | + raw_spin_lock_irq(&p->pi_lock); | |
22403 | + if (p->state == match_state || p->saved_state == match_state) | |
22404 | + match = true; | |
22405 | + raw_spin_unlock_irq(&p->pi_lock); | |
22406 | + | |
22407 | + return match; | |
22408 | +} | |
22409 | + | |
22410 | /* | |
22411 | * wait_task_inactive - wait for a thread to unschedule. | |
22412 | * | |
b3bbd485 | 22413 | @@ -1370,7 +1476,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
e4b2b4a8 JK |
22414 | * is actually now running somewhere else! |
22415 | */ | |
22416 | while (task_running(rq, p)) { | |
22417 | - if (match_state && unlikely(p->state != match_state)) | |
22418 | + if (match_state && !check_task_state(p, match_state)) | |
22419 | return 0; | |
22420 | cpu_relax(); | |
22421 | } | |
b3bbd485 | 22422 | @@ -1385,7 +1491,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) |
e4b2b4a8 JK |
22423 | running = task_running(rq, p); |
22424 | queued = task_on_rq_queued(p); | |
22425 | ncsw = 0; | |
22426 | - if (!match_state || p->state == match_state) | |
22427 | + if (!match_state || p->state == match_state || | |
22428 | + p->saved_state == match_state) | |
22429 | ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ | |
22430 | task_rq_unlock(rq, p, &rf); | |
1a6e0f06 | 22431 | |
b3bbd485 | 22432 | @@ -1460,7 +1567,7 @@ void kick_process(struct task_struct *p) |
e4b2b4a8 | 22433 | EXPORT_SYMBOL_GPL(kick_process); |
1a6e0f06 | 22434 | |
e4b2b4a8 JK |
22435 | /* |
22436 | - * ->cpus_allowed is protected by both rq->lock and p->pi_lock | |
22437 | + * ->cpus_ptr is protected by both rq->lock and p->pi_lock | |
22438 | * | |
22439 | * A few notes on cpu_active vs cpu_online: | |
22440 | * | |
b3bbd485 | 22441 | @@ -1500,14 +1607,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) |
e4b2b4a8 JK |
22442 | for_each_cpu(dest_cpu, nodemask) { |
22443 | if (!cpu_active(dest_cpu)) | |
22444 | continue; | |
22445 | - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | |
22446 | + if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) | |
22447 | return dest_cpu; | |
22448 | } | |
22449 | } | |
c7c16703 | 22450 | |
e4b2b4a8 JK |
22451 | for (;;) { |
22452 | /* Any allowed, online CPU? */ | |
22453 | - for_each_cpu(dest_cpu, &p->cpus_allowed) { | |
22454 | + for_each_cpu(dest_cpu, p->cpus_ptr) { | |
22455 | if (!is_cpu_allowed(p, dest_cpu)) | |
22456 | continue; | |
22457 | ||
b3bbd485 | 22458 | @@ -1551,7 +1658,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) |
e4b2b4a8 JK |
22459 | } |
22460 | ||
22461 | /* | |
22462 | - * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | |
22463 | + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. | |
22464 | */ | |
22465 | static inline | |
22466 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |
b3bbd485 | 22467 | @@ -1561,11 +1668,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
e4b2b4a8 JK |
22468 | if (p->nr_cpus_allowed > 1) |
22469 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | |
22470 | else | |
22471 | - cpu = cpumask_any(&p->cpus_allowed); | |
22472 | + cpu = cpumask_any(p->cpus_ptr); | |
22473 | ||
22474 | /* | |
22475 | * In order not to call set_task_cpu() on a blocking task we need | |
22476 | - * to rely on ttwu() to place the task on a valid ->cpus_allowed | |
22477 | + * to rely on ttwu() to place the task on a valid ->cpus_ptr | |
22478 | * CPU. | |
22479 | * | |
22480 | * Since this is common to all placement strategies, this lives here. | |
b3bbd485 | 22481 | @@ -1668,10 +1775,6 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl |
1a6e0f06 | 22482 | { |
e4b2b4a8 JK |
22483 | activate_task(rq, p, en_flags); |
22484 | p->on_rq = TASK_ON_RQ_QUEUED; | |
22485 | - | |
22486 | - /* If a worker is waking up, notify the workqueue: */ | |
22487 | - if (p->flags & PF_WQ_WORKER) | |
22488 | - wq_worker_waking_up(p, cpu_of(rq)); | |
22489 | } | |
c7c16703 | 22490 | |
e4b2b4a8 | 22491 | /* |
b3bbd485 | 22492 | @@ -1995,8 +2098,27 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
e4b2b4a8 JK |
22493 | */ |
22494 | raw_spin_lock_irqsave(&p->pi_lock, flags); | |
22495 | smp_mb__after_spinlock(); | |
22496 | - if (!(p->state & state)) | |
22497 | + if (!(p->state & state)) { | |
22498 | + /* | |
22499 | + * The task might be running due to a spinlock sleeper | |
22500 | + * wakeup. Check the saved state and set it to running | |
22501 | + * if the wakeup condition is true. | |
22502 | + */ | |
22503 | + if (!(wake_flags & WF_LOCK_SLEEPER)) { | |
22504 | + if (p->saved_state & state) { | |
22505 | + p->saved_state = TASK_RUNNING; | |
22506 | + success = 1; | |
22507 | + } | |
22508 | + } | |
22509 | goto out; | |
22510 | + } | |
22511 | + | |
c7c16703 | 22512 | + /* |
e4b2b4a8 JK |
22513 | + * If this is a regular wakeup, then we can unconditionally |
22514 | + * clear the saved state of a "lock sleeper". | |
c7c16703 | 22515 | + */ |
e4b2b4a8 JK |
22516 | + if (!(wake_flags & WF_LOCK_SLEEPER)) |
22517 | + p->saved_state = TASK_RUNNING; | |
1a6e0f06 | 22518 | |
e4b2b4a8 | 22519 | trace_sched_waking(p); |
1a6e0f06 | 22520 | |
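Annotation: the hunk above is the heart of the RT "sleeping spinlock" wakeup scheme. When a task blocks on an rtmutex-based spinlock, its original sleep state is parked in p->saved_state; a WF_LOCK_SLEEPER wakeup only ends the lock wait, while a regular wakeup is also allowed to satisfy, and must clear, the saved state. A toy model of just that decision (constants and names are stand-ins for the real flags, and the return value merely signals "task became runnable"):

    #define TASK_RUNNING         0x0
    #define TASK_INTERRUPTIBLE   0x1
    #define TASK_UNINTERRUPTIBLE 0x2
    #define WF_LOCK_SLEEPER      0x10    /* illustrative value */

    struct toy_task {
            unsigned int state;
            unsigned int saved_state;    /* state parked during a lock sleep */
    };

    static int toy_try_to_wake_up(struct toy_task *p, unsigned int state,
                                  int wake_flags)
    {
            if (!(p->state & state)) {
                    /*
                     * The task sleeps on an RT spinlock: a regular wakeup
                     * may still satisfy the state it parked before the
                     * lock sleep.
                     */
                    if (!(wake_flags & WF_LOCK_SLEEPER) &&
                        (p->saved_state & state)) {
                            p->saved_state = TASK_RUNNING;
                            return 1;
                    }
                    return 0;
            }
            /* A regular wakeup also clears any parked state. */
            if (!(wake_flags & WF_LOCK_SLEEPER))
                    p->saved_state = TASK_RUNNING;
            p->state = TASK_RUNNING;
            return 1;
    }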
b3bbd485 JK |
22521 | @@ -2092,56 +2214,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
22522 | return success; | |
1a6e0f06 | 22523 | } |
1a6e0f06 | 22524 | |
b3bbd485 | 22525 | -/** |
e4b2b4a8 JK |
22526 | - * try_to_wake_up_local - try to wake up a local task with rq lock held |
22527 | - * @p: the thread to be awakened | |
22528 | - * @rf: request-queue flags for pinning | |
22529 | - * | |
22530 | - * Put @p on the run-queue if it's not already there. The caller must | |
22531 | - * ensure that this_rq() is locked, @p is bound to this_rq() and not | |
22532 | - * the current task. | |
22533 | - */ | |
22534 | -static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) | |
22535 | -{ | |
22536 | - struct rq *rq = task_rq(p); | |
22537 | - | |
22538 | - if (WARN_ON_ONCE(rq != this_rq()) || | |
22539 | - WARN_ON_ONCE(p == current)) | |
22540 | - return; | |
22541 | - | |
22542 | - lockdep_assert_held(&rq->lock); | |
22543 | - | |
22544 | - if (!raw_spin_trylock(&p->pi_lock)) { | |
22545 | - /* | |
22546 | - * This is OK, because current is on_cpu, which avoids it being | |
22547 | - * picked for load-balance and preemption/IRQs are still | |
22548 | - * disabled avoiding further scheduler activity on it and we've | |
22549 | - * not yet picked a replacement task. | |
22550 | - */ | |
22551 | - rq_unlock(rq, rf); | |
22552 | - raw_spin_lock(&p->pi_lock); | |
22553 | - rq_relock(rq, rf); | |
22554 | - } | |
22555 | - | |
22556 | - if (!(p->state & TASK_NORMAL)) | |
22557 | - goto out; | |
22558 | - | |
22559 | - trace_sched_waking(p); | |
22560 | - | |
22561 | - if (!task_on_rq_queued(p)) { | |
22562 | - if (p->in_iowait) { | |
22563 | - delayacct_blkio_end(p); | |
22564 | - atomic_dec(&rq->nr_iowait); | |
22565 | - } | |
22566 | - ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); | |
22567 | - } | |
22568 | - | |
22569 | - ttwu_do_wakeup(rq, p, 0, rf); | |
22570 | - ttwu_stat(p, smp_processor_id(), 0); | |
22571 | -out: | |
22572 | - raw_spin_unlock(&p->pi_lock); | |
22573 | -} | |
22574 | - | |
b3bbd485 | 22575 | /** |
e4b2b4a8 JK |
22576 | * wake_up_process - Wake up a specific process |
22577 | * @p: The process to be woken up. | |
b3bbd485 | 22578 | @@ -2160,6 +2232,18 @@ int wake_up_process(struct task_struct *p) |
e4b2b4a8 JK |
22579 | } |
22580 | EXPORT_SYMBOL(wake_up_process); | |
22581 | ||
22582 | +/** | |
22583 | + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" | |
22584 | + * @p: The process to be woken up. | |
22585 | + * | |
22586 | + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate | |
22587 | + * the nature of the wakeup. | |
1a6e0f06 | 22588 | + */ |
e4b2b4a8 | 22589 | +int wake_up_lock_sleeper(struct task_struct *p) |
1a6e0f06 | 22590 | +{ |
e4b2b4a8 | 22591 | + return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); |
1a6e0f06 | 22592 | +} |
1a6e0f06 | 22593 | + |
e4b2b4a8 | 22594 | int wake_up_state(struct task_struct *p, unsigned int state) |
1a6e0f06 | 22595 | { |
e4b2b4a8 | 22596 | return try_to_wake_up(p, state, 0); |
b3bbd485 | 22597 | @@ -2420,6 +2504,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) |
e4b2b4a8 JK |
22598 | p->on_cpu = 0; |
22599 | #endif | |
22600 | init_task_preempt_count(p); | |
22601 | +#ifdef CONFIG_HAVE_PREEMPT_LAZY | |
22602 | + task_thread_info(p)->preempt_lazy_count = 0; | |
1a6e0f06 | 22603 | +#endif |
e4b2b4a8 JK |
22604 | #ifdef CONFIG_SMP |
22605 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | |
22606 | RB_CLEAR_NODE(&p->pushable_dl_tasks); | |
b3bbd485 | 22607 | @@ -2462,7 +2549,7 @@ void wake_up_new_task(struct task_struct *p) |
e4b2b4a8 JK |
22608 | #ifdef CONFIG_SMP |
22609 | /* | |
22610 | * Fork balancing, do it here and not earlier because: | |
22611 | - * - cpus_allowed can change in the fork path | |
22612 | + * - cpus_ptr can change in the fork path | |
22613 | * - any previously selected CPU might disappear through hotplug | |
22614 | * | |
22615 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | |
b3bbd485 | 22616 | @@ -2675,21 +2762,16 @@ static struct rq *finish_task_switch(struct task_struct *prev) |
e4b2b4a8 | 22617 | finish_arch_post_lock_switch(); |
1a6e0f06 | 22618 | |
e4b2b4a8 JK |
22619 | fire_sched_in_preempt_notifiers(current); |
22620 | + /* | |
22621 | + * We use mmdrop_delayed() here so we don't have to do the | |
22622 | + * full __mmdrop() when we are the last user. | |
22623 | + */ | |
22624 | if (mm) | |
22625 | - mmdrop(mm); | |
22626 | + mmdrop_delayed(mm); | |
22627 | if (unlikely(prev_state == TASK_DEAD)) { | |
22628 | if (prev->sched_class->task_dead) | |
22629 | prev->sched_class->task_dead(prev); | |
1a6e0f06 | 22630 | |
e4b2b4a8 JK |
22631 | - /* |
22632 | - * Remove function-return probe instances associated with this | |
22633 | - * task and put them back on the free list. | |
22634 | - */ | |
22635 | - kprobe_flush_task(prev); | |
22636 | - | |
22637 | - /* Task is done with its stack. */ | |
22638 | - put_task_stack(prev); | |
22639 | - | |
22640 | put_task_struct(prev); | |
22641 | } | |
1a6e0f06 | 22642 | |
b3bbd485 | 22643 | @@ -3336,25 +3418,13 @@ static void __sched notrace __schedule(bool preempt) |
e4b2b4a8 JK |
22644 | atomic_inc(&rq->nr_iowait); |
22645 | delayacct_blkio_start(); | |
22646 | } | |
22647 | - | |
22648 | - /* | |
22649 | - * If a worker went to sleep, notify and ask workqueue | |
22650 | - * whether it wants to wake up a task to maintain | |
22651 | - * concurrency. | |
22652 | - */ | |
22653 | - if (prev->flags & PF_WQ_WORKER) { | |
22654 | - struct task_struct *to_wakeup; | |
22655 | - | |
22656 | - to_wakeup = wq_worker_sleeping(prev); | |
22657 | - if (to_wakeup) | |
22658 | - try_to_wake_up_local(to_wakeup, &rf); | |
22659 | - } | |
22660 | } | |
22661 | switch_count = &prev->nvcsw; | |
22662 | } | |
1a6e0f06 | 22663 | |
e4b2b4a8 JK |
22664 | next = pick_next_task(rq, prev, &rf); |
22665 | clear_tsk_need_resched(prev); | |
22666 | + clear_tsk_need_resched_lazy(prev); | |
22667 | clear_preempt_need_resched(); | |
1a6e0f06 | 22668 | |
e4b2b4a8 | 22669 | if (likely(prev != next)) { |
b3bbd485 | 22670 | @@ -3407,8 +3477,24 @@ void __noreturn do_task_dead(void) |
1a6e0f06 | 22671 | |
e4b2b4a8 JK |
22672 | static inline void sched_submit_work(struct task_struct *tsk) |
22673 | { | |
22674 | - if (!tsk->state || tsk_is_pi_blocked(tsk)) | |
22675 | + if (!tsk->state) | |
b3bbd485 | 22676 | return; |
1a6e0f06 | 22677 | + /* |
e4b2b4a8 JK |
22678 | + * If a worker went to sleep, notify and ask workqueue whether |
22679 | + * it wants to wake up a task to maintain concurrency. | |
b3bbd485 JK |
22680 | + * As this function is called inside the schedule() context, |
22681 | + * we disable preemption to avoid it calling schedule() again | |
22682 | + * in the possible wakeup of a kworker. | |
1a6e0f06 | 22683 | + */ |
b3bbd485 JK |
22684 | + if (tsk->flags & PF_WQ_WORKER) { |
22685 | + preempt_disable(); | |
e4b2b4a8 | 22686 | + wq_worker_sleeping(tsk); |
b3bbd485 JK |
22687 | + preempt_enable_no_resched(); |
22688 | + } | |
1a6e0f06 | 22689 | + |
e4b2b4a8 | 22690 | + if (tsk_is_pi_blocked(tsk)) |
b3bbd485 | 22691 | + return; |
1a6e0f06 | 22692 | + |
1a6e0f06 | 22693 | /* |
e4b2b4a8 JK |
22694 | * If we are going to sleep and we have plugged IO queued, |
22695 | * make sure to submit it to avoid deadlocks. | |
b3bbd485 | 22696 | @@ -3417,6 +3503,12 @@ static inline void sched_submit_work(struct task_struct *tsk) |
e4b2b4a8 JK |
22697 | blk_schedule_flush_plug(tsk); |
22698 | } | |
1a6e0f06 | 22699 | |
e4b2b4a8 JK |
22700 | +static void sched_update_worker(struct task_struct *tsk) |
22701 | +{ | |
22702 | + if (tsk->flags & PF_WQ_WORKER) | |
22703 | + wq_worker_running(tsk); | |
22704 | +} | |
22705 | + | |
22706 | asmlinkage __visible void __sched schedule(void) | |
1a6e0f06 | 22707 | { |
e4b2b4a8 | 22708 | struct task_struct *tsk = current; |
b3bbd485 | 22709 | @@ -3427,6 +3519,7 @@ asmlinkage __visible void __sched schedule(void) |
e4b2b4a8 JK |
22710 | __schedule(false); |
22711 | sched_preempt_enable_no_resched(); | |
22712 | } while (need_resched()); | |
22713 | + sched_update_worker(tsk); | |
22714 | } | |
22715 | EXPORT_SYMBOL(schedule); | |
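Annotation: taken together, sched_submit_work() and the new sched_update_worker() move the workqueue notification out of __schedule() and wrap it around schedule() instead. The pool is told before the worker blocks (with preemption disabled so the hook cannot recurse into schedule()) and again once the worker is back on a CPU. Stripped of the scheduler internals, the pairing looks like this (stub hooks; only the shape is real):

    #include <stdio.h>

    #define PF_WQ_WORKER 0x20            /* illustrative flag value */

    struct toy_task { unsigned int flags; };

    static void wq_worker_sleeping(struct toy_task *t) { (void)t; puts("pool: worker blocking"); }
    static void wq_worker_running(struct toy_task *t)  { (void)t; puts("pool: worker back"); }
    static void context_switch_away(void)              { puts("switched out/in"); }

    static void toy_schedule(struct toy_task *tsk)
    {
            /* Before blocking: let the pool spawn a replacement worker. */
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_sleeping(tsk);

            context_switch_away();

            /* After getting the CPU back: re-arm this worker in the pool. */
            if (tsk->flags & PF_WQ_WORKER)
                    wq_worker_running(tsk);
    }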
1a6e0f06 | 22716 | |
b3bbd485 | 22717 | @@ -3515,6 +3608,30 @@ static void __sched notrace preempt_schedule_common(void) |
e4b2b4a8 JK |
22718 | } while (need_resched()); |
22719 | } | |
1a6e0f06 | 22720 | |
e4b2b4a8 JK |
22721 | +#ifdef CONFIG_PREEMPT_LAZY |
22722 | +/* | |
22723 | + * If TIF_NEED_RESCHED is set then we allow being scheduled away, since it is |
22724 | + * set by an RT task. Otherwise we try to avoid being scheduled out as long as |
22725 | + * the preempt_lazy_count counter is > 0. |
22726 | + */ | |
22727 | +static __always_inline int preemptible_lazy(void) | |
22728 | +{ | |
22729 | + if (test_thread_flag(TIF_NEED_RESCHED)) | |
22730 | + return 1; | |
22731 | + if (current_thread_info()->preempt_lazy_count) | |
22732 | + return 0; | |
22733 | + return 1; | |
22734 | +} | |
22735 | + | |
1a6e0f06 | 22736 | +#else |
e4b2b4a8 JK |
22737 | + |
22738 | +static inline int preemptible_lazy(void) | |
22739 | +{ | |
22740 | + return 1; | |
22741 | +} | |
22742 | + | |
1a6e0f06 | 22743 | +#endif |
e4b2b4a8 JK |
22744 | + |
22745 | #ifdef CONFIG_PREEMPT | |
22746 | /* | |
22747 | * this is the entry point to schedule() from in-kernel preemption | |
b3bbd485 | 22748 | @@ -3529,7 +3646,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) |
e4b2b4a8 JK |
22749 | */ |
22750 | if (likely(!preemptible())) | |
22751 | return; | |
22752 | - | |
22753 | + if (!preemptible_lazy()) | |
22754 | + return; | |
22755 | preempt_schedule_common(); | |
22756 | } | |
22757 | NOKPROBE_SYMBOL(preempt_schedule); | |
b3bbd485 | 22758 | @@ -3556,6 +3674,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) |
e4b2b4a8 JK |
22759 | if (likely(!preemptible())) |
22760 | return; | |
1a6e0f06 | 22761 | |
e4b2b4a8 JK |
22762 | + if (!preemptible_lazy()) |
22763 | + return; | |
22764 | + | |
22765 | do { | |
22766 | /* | |
22767 | * Because the function tracer can trace preempt_count_sub() | |
b3bbd485 | 22768 | @@ -3578,7 +3699,16 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) |
e4b2b4a8 JK |
22769 | * an infinite recursion. |
22770 | */ | |
22771 | prev_ctx = exception_enter(); | |
22772 | + /* | |
22773 | + * The add/subtract must not be traced by the function | |
22774 | + * tracer. But we still want to account for the | |
22775 | + * preempt-off latency tracer. Since the _notrace versions |
22776 | + * of add/subtract skip the accounting for the latency tracer, |
22777 | + * we must force it manually. |
22778 | + */ | |
22779 | + start_critical_timings(); | |
22780 | __schedule(true); | |
22781 | + stop_critical_timings(); | |
22782 | exception_exit(prev_ctx); | |
1a6e0f06 | 22783 | |
e4b2b4a8 | 22784 | preempt_latency_stop(1); |
b3bbd485 | 22785 | @@ -4164,7 +4294,7 @@ static int __sched_setscheduler(struct task_struct *p, |
e4b2b4a8 JK |
22786 | * the entire root_domain to become SCHED_DEADLINE. We |
22787 | * will also fail if there's no bandwidth available. | |
22788 | */ | |
22789 | - if (!cpumask_subset(span, &p->cpus_allowed) || | |
22790 | + if (!cpumask_subset(span, p->cpus_ptr) || | |
22791 | rq->rd->dl_bw.bw == 0) { | |
22792 | task_rq_unlock(rq, p, &rf); | |
22793 | return -EPERM; | |
b3bbd485 | 22794 | @@ -4758,7 +4888,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) |
e4b2b4a8 | 22795 | goto out_unlock; |
1a6e0f06 | 22796 | |
e4b2b4a8 JK |
22797 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
22798 | - cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); | |
22799 | + cpumask_and(mask, &p->cpus_mask, cpu_active_mask); | |
22800 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | |
22801 | ||
22802 | out_unlock: | |
b3bbd485 | 22803 | @@ -4877,6 +5007,7 @@ int __cond_resched_lock(spinlock_t *lock) |
e4b2b4a8 JK |
22804 | } |
22805 | EXPORT_SYMBOL(__cond_resched_lock); | |
22806 | ||
22807 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
22808 | int __sched __cond_resched_softirq(void) | |
22809 | { | |
22810 | BUG_ON(!in_softirq()); | |
b3bbd485 | 22811 | @@ -4890,6 +5021,7 @@ int __sched __cond_resched_softirq(void) |
1a6e0f06 JK |
22812 | return 0; |
22813 | } | |
e4b2b4a8 JK |
22814 | EXPORT_SYMBOL(__cond_resched_softirq); |
22815 | +#endif | |
1a6e0f06 | 22816 | |
e4b2b4a8 JK |
22817 | /** |
22818 | * yield - yield the current processor to other threads. | |
b3bbd485 | 22819 | @@ -5284,7 +5416,9 @@ void init_idle(struct task_struct *idle, int cpu) |
1a6e0f06 | 22820 | |
e4b2b4a8 JK |
22821 | /* Set the preempt count _outside_ the spinlocks! */ |
22822 | init_idle_preempt_count(idle, cpu); | |
22823 | - | |
22824 | +#ifdef CONFIG_HAVE_PREEMPT_LAZY | |
22825 | + task_thread_info(idle)->preempt_lazy_count = 0; | |
1a6e0f06 | 22826 | +#endif |
e4b2b4a8 JK |
22827 | /* |
22828 | * The idle tasks have their own, simple scheduling class: | |
22829 | */ | |
b3bbd485 | 22830 | @@ -5323,7 +5457,7 @@ int task_can_attach(struct task_struct *p, |
e4b2b4a8 JK |
22831 | * allowed nodes is unnecessary. Thus, cpusets are not |
22832 | * applicable for such threads. This prevents checking for | |
22833 | * success of set_cpus_allowed_ptr() on all attached tasks | |
22834 | - * before cpus_allowed may be changed. | |
22835 | + * before cpus_mask may be changed. | |
22836 | */ | |
22837 | if (p->flags & PF_NO_SETAFFINITY) { | |
22838 | ret = -EINVAL; | |
b3bbd485 | 22839 | @@ -5350,7 +5484,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) |
e4b2b4a8 JK |
22840 | if (curr_cpu == target_cpu) |
22841 | return 0; | |
22842 | ||
22843 | - if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) | |
22844 | + if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) | |
22845 | return -EINVAL; | |
22846 | ||
22847 | /* TODO: This is not properly updating schedstats */ | |
b3bbd485 | 22848 | @@ -5389,6 +5523,8 @@ void sched_setnuma(struct task_struct *p, int nid) |
e4b2b4a8 | 22849 | #endif /* CONFIG_NUMA_BALANCING */ |
1a6e0f06 | 22850 | |
e4b2b4a8 JK |
22851 | #ifdef CONFIG_HOTPLUG_CPU |
22852 | +static DEFINE_PER_CPU(struct mm_struct *, idle_last_mm); | |
22853 | + | |
22854 | /* | |
22855 | * Ensure that the idle task is using init_mm right before its CPU goes | |
22856 | * offline. | |
b3bbd485 | 22857 | @@ -5403,7 +5539,12 @@ void idle_task_exit(void) |
e4b2b4a8 JK |
22858 | switch_mm(mm, &init_mm, current); |
22859 | finish_arch_post_lock_switch(); | |
22860 | } | |
22861 | - mmdrop(mm); | |
22862 | + /* | |
22863 | + * Defer the cleanup to a CPU that is still alive. On RT we can neither
22864 | + * call mmdrop() nor mmdrop_delayed() from here. | |
22865 | + */ | |
22866 | + per_cpu(idle_last_mm, smp_processor_id()) = mm; | |
22867 | + | |
1a6e0f06 | 22868 | } |
1a6e0f06 | 22869 | |
e4b2b4a8 | 22870 | /* |
b3bbd485 | 22871 | @@ -5487,7 +5628,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) |
e4b2b4a8 JK |
22872 | put_prev_task(rq, next); |
22873 | ||
22874 | /* | |
22875 | - * Rules for changing task_struct::cpus_allowed are holding | |
22876 | + * Rules for changing task_struct::cpus_mask are holding | |
22877 | * both pi_lock and rq->lock, such that holding either | |
22878 | * stabilizes the mask. | |
22879 | * | |
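The &p->cpus_allowed to p->cpus_ptr conversions that dominate the rest of this patch all serve the migrate_disable() machinery further below: the task keeps its real affinity in cpus_mask, while cpus_ptr is what every scheduler-side reader consults and normally just points at cpus_mask. Reduced to its essentials (an illustrative subset of the fields this series gives task_struct):

	const struct cpumask	*cpus_ptr;	/* consulted by all readers */
	cpumask_t		cpus_mask;	/* the user-visible affinity */

	/* Invariant outside migrate-disabled sections:
	 *	p->cpus_ptr == &p->cpus_mask
	 * While migrate-disabled (see migrate_disable() below):
	 *	p->cpus_ptr == cpumask_of(smp_processor_id())
	 */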
b3bbd485 | 22880 | @@ -5718,6 +5859,10 @@ int sched_cpu_dying(unsigned int cpu) |
e4b2b4a8 JK |
22881 | update_max_interval(); |
22882 | nohz_balance_exit_idle(cpu); | |
22883 | hrtick_clear(rq); | |
22884 | + if (per_cpu(idle_last_mm, cpu)) { | |
22885 | + mmdrop_delayed(per_cpu(idle_last_mm, cpu)); | |
22886 | + per_cpu(idle_last_mm, cpu) = NULL; | |
22887 | + } | |
22888 | return 0; | |
1a6e0f06 | 22889 | } |
e4b2b4a8 | 22890 | #endif |
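Read together, the idle_task_exit() and sched_cpu_dying() hunks form a hand-off: the outgoing CPU cannot release the mm itself on RT, so it parks the reference in a per-cpu slot and a live CPU drops it later. The protocol, condensed (mmdrop_delayed() is the RT tree's sleeping-safe variant):

	/* On the CPU going offline (idle_task_exit): no mmdrop() on RT */
	per_cpu(idle_last_mm, smp_processor_id()) = mm;

	/* On any live CPU, later (sched_cpu_dying): */
	if (per_cpu(idle_last_mm, cpu)) {
		mmdrop_delayed(per_cpu(idle_last_mm, cpu));
		per_cpu(idle_last_mm, cpu) = NULL;
	}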
b3bbd485 | 22891 | @@ -5964,7 +6109,7 @@ void __init sched_init(void) |
e4b2b4a8 JK |
22892 | #ifdef CONFIG_DEBUG_ATOMIC_SLEEP |
22893 | static inline int preempt_count_equals(int preempt_offset) | |
22894 | { | |
22895 | - int nested = preempt_count() + rcu_preempt_depth(); | |
22896 | + int nested = preempt_count() + sched_rcu_preempt_depth(); | |
1a6e0f06 | 22897 | |
e4b2b4a8 JK |
22898 | return (nested == preempt_offset); |
22899 | } | |
b3bbd485 | 22900 | @@ -6756,3 +6901,196 @@ const u32 sched_prio_to_wmult[40] = { |
e4b2b4a8 JK |
22901 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
22902 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | |
22903 | }; | |
1a6e0f06 | 22904 | + |
b3bbd485 | 22905 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
1a6e0f06 | 22906 | + |
e4b2b4a8 JK |
22907 | +static inline void |
22908 | +update_nr_migratory(struct task_struct *p, long delta) | |
1a6e0f06 | 22909 | +{ |
e4b2b4a8 JK |
22910 | + if (unlikely((p->sched_class == &rt_sched_class || |
22911 | + p->sched_class == &dl_sched_class) && | |
22912 | + p->nr_cpus_allowed > 1)) { | |
22913 | + if (p->sched_class == &rt_sched_class) | |
22914 | + task_rq(p)->rt.rt_nr_migratory += delta; | |
22915 | + else | |
22916 | + task_rq(p)->dl.dl_nr_migratory += delta; | |
22917 | + } | |
1a6e0f06 JK |
22918 | +} |
22919 | + | |
e4b2b4a8 JK |
22920 | +static inline void |
22921 | +migrate_disable_update_cpus_allowed(struct task_struct *p) | |
22922 | +{ | |
22923 | + struct rq *rq; | |
22924 | + struct rq_flags rf; | |
1a6e0f06 | 22925 | + |
e4b2b4a8 JK |
22926 | + p->cpus_ptr = cpumask_of(smp_processor_id()); |
22927 | + | |
22928 | + rq = task_rq_lock(p, &rf); | |
22929 | + update_nr_migratory(p, -1); | |
22930 | + p->nr_cpus_allowed = 1; | |
22931 | + task_rq_unlock(rq, p, &rf); | |
22932 | +} | |
22933 | + | |
22934 | +static inline void | |
22935 | +migrate_enable_update_cpus_allowed(struct task_struct *p) | |
1a6e0f06 | 22936 | +{ |
e4b2b4a8 JK |
22937 | + struct rq *rq; |
22938 | + struct rq_flags rf; | |
22939 | + | |
22940 | + p->cpus_ptr = &p->cpus_mask; | |
22941 | + | |
22942 | + rq = task_rq_lock(p, &rf); | |
22943 | + p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask); | |
22944 | + update_nr_migratory(p, 1); | |
22945 | + task_rq_unlock(rq, p, &rf); | |
1a6e0f06 | 22946 | +} |
1a6e0f06 | 22947 | + |
e4b2b4a8 JK |
22948 | +void migrate_disable(void) |
22949 | +{ | |
22950 | + struct task_struct *p = current; | |
22951 | + | |
22952 | + if (in_atomic() || irqs_disabled()) { | |
22953 | +#ifdef CONFIG_SCHED_DEBUG | |
22954 | + p->migrate_disable_atomic++; | |
1a6e0f06 | 22955 | +#endif |
e4b2b4a8 JK |
22956 | + return; |
22957 | + } | |
22958 | +#ifdef CONFIG_SCHED_DEBUG | |
22959 | + if (unlikely(p->migrate_disable_atomic)) { | |
22960 | + tracing_off(); | |
22961 | + WARN_ON_ONCE(1); | |
22962 | + } | |
1a6e0f06 | 22963 | +#endif |
1a6e0f06 | 22964 | + |
e4b2b4a8 JK |
22965 | + if (p->migrate_disable) { |
22966 | + p->migrate_disable++; | |
22967 | + return; | |
22968 | + } | |
22969 | + | |
22970 | + preempt_disable(); | |
22971 | + preempt_lazy_disable(); | |
22972 | + pin_current_cpu(); | |
22973 | + | |
22974 | + migrate_disable_update_cpus_allowed(p); | |
22975 | + p->migrate_disable = 1; | |
22976 | + | |
22977 | + preempt_enable(); | |
1a6e0f06 | 22978 | +} |
e4b2b4a8 | 22979 | +EXPORT_SYMBOL(migrate_disable); |
1a6e0f06 | 22980 | + |
e4b2b4a8 | 22981 | +void migrate_enable(void) |
1a6e0f06 | 22982 | +{ |
e4b2b4a8 JK |
22983 | + struct task_struct *p = current; |
22984 | + | |
22985 | + if (in_atomic() || irqs_disabled()) { | |
22986 | +#ifdef CONFIG_SCHED_DEBUG | |
22987 | + p->migrate_disable_atomic--; | |
22988 | +#endif | |
22989 | + return; | |
22990 | + } | |
22991 | + | |
22992 | +#ifdef CONFIG_SCHED_DEBUG | |
22993 | + if (unlikely(p->migrate_disable_atomic)) { | |
22994 | + tracing_off(); | |
22995 | + WARN_ON_ONCE(1); | |
22996 | + } | |
22997 | +#endif | |
22998 | + | |
22999 | + WARN_ON_ONCE(p->migrate_disable <= 0); | |
23000 | + if (p->migrate_disable > 1) { | |
23001 | + p->migrate_disable--; | |
23002 | + return; | |
23003 | + } | |
23004 | + | |
23005 | + preempt_disable(); | |
23006 | + | |
23007 | + p->migrate_disable = 0; | |
23008 | + migrate_enable_update_cpus_allowed(p); | |
23009 | + | |
23010 | + if (p->migrate_disable_update) { | |
23011 | + struct rq *rq; | |
23012 | + struct rq_flags rf; | |
23013 | + | |
23014 | + rq = task_rq_lock(p, &rf); | |
23015 | + update_rq_clock(rq); | |
23016 | + | |
23017 | + __do_set_cpus_allowed_tail(p, &p->cpus_mask); | |
23018 | + task_rq_unlock(rq, p, &rf); | |
23019 | + | |
23020 | + p->migrate_disable_update = 0; | |
23021 | + | |
23022 | + WARN_ON(smp_processor_id() != task_cpu(p)); | |
23023 | + if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { | |
23024 | + const struct cpumask *cpu_valid_mask = cpu_active_mask; | |
23025 | + struct migration_arg arg; | |
23026 | + unsigned int dest_cpu; | |
23027 | + | |
23028 | + if (p->flags & PF_KTHREAD) { | |
23029 | + /* | |
23030 | + * Kernel threads are allowed on online && !active CPUs | |
23031 | + */ | |
23032 | + cpu_valid_mask = cpu_online_mask; | |
23033 | + } | |
23034 | + dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask); | |
23035 | + arg.task = p; | |
23036 | + arg.dest_cpu = dest_cpu; | |
23037 | + | |
23038 | + unpin_current_cpu(); | |
23039 | + preempt_lazy_enable(); | |
23040 | + preempt_enable(); | |
23041 | + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); | |
23042 | + tlb_migrate_finish(p->mm); | |
23043 | + | |
23044 | + return; | |
23045 | + } | |
23046 | + } | |
23047 | + unpin_current_cpu(); | |
23048 | + preempt_lazy_enable(); | |
23049 | + preempt_enable(); | |
1a6e0f06 | 23050 | +} |
e4b2b4a8 | 23051 | +EXPORT_SYMBOL(migrate_enable); |
1a6e0f06 | 23052 | + |
e4b2b4a8 JK |
23053 | +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
23054 | +void migrate_disable(void) | |
23055 | +{ | |
b3bbd485 | 23056 | +#ifdef CONFIG_SCHED_DEBUG |
e4b2b4a8 JK |
23057 | + struct task_struct *p = current; |
23058 | + | |
23059 | + if (in_atomic() || irqs_disabled()) { | |
e4b2b4a8 | 23060 | + p->migrate_disable_atomic++; |
e4b2b4a8 JK |
23061 | + return; |
23062 | + } | |
b3bbd485 | 23063 | + |
e4b2b4a8 JK |
23064 | + if (unlikely(p->migrate_disable_atomic)) { |
23065 | + tracing_off(); | |
23066 | + WARN_ON_ONCE(1); | |
23067 | + } | |
e4b2b4a8 JK |
23068 | + |
23069 | + p->migrate_disable++; | |
b3bbd485 JK |
23070 | +#endif |
23071 | + barrier(); | |
e4b2b4a8 JK |
23072 | +} |
23073 | +EXPORT_SYMBOL(migrate_disable); | |
23074 | + | |
23075 | +void migrate_enable(void) | |
23076 | +{ | |
b3bbd485 | 23077 | +#ifdef CONFIG_SCHED_DEBUG |
e4b2b4a8 JK |
23078 | + struct task_struct *p = current; |
23079 | + | |
23080 | + if (in_atomic() || irqs_disabled()) { | |
e4b2b4a8 | 23081 | + p->migrate_disable_atomic--; |
e4b2b4a8 JK |
23082 | + return; |
23083 | + } | |
23084 | + | |
e4b2b4a8 JK |
23085 | + if (unlikely(p->migrate_disable_atomic)) { |
23086 | + tracing_off(); | |
23087 | + WARN_ON_ONCE(1); | |
23088 | + } | |
1a6e0f06 | 23089 | + |
e4b2b4a8 JK |
23090 | + WARN_ON_ONCE(p->migrate_disable <= 0); |
23091 | + p->migrate_disable--; | |
b3bbd485 JK |
23092 | +#endif |
23093 | + barrier(); | |
e4b2b4a8 JK |
23094 | +} |
23095 | +EXPORT_SYMBOL(migrate_enable); | |
23096 | +#endif | |
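For completeness, the caller's view of the pair exported above: a migrate-disabled section pins the task to its current CPU without disabling preemption, which is the point on RT — per-CPU state stays stable while sleeping locks remain usable. A hedged sketch; some_counter is a hypothetical per-CPU variable:

	#include <linux/percpu.h>
	#include <linux/preempt.h>

	static DEFINE_PER_CPU(unsigned long, some_counter);

	static void touch_per_cpu_state(void)
	{
		migrate_disable();	/* pinned, but still preemptible */
		/* smp_processor_id() is now stable even across sleeping locks */
		this_cpu_inc(some_counter);
		migrate_enable();
	}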
b3bbd485 JK |
23097 | diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c |
23098 | index 8d9562d890d3..91a0702fe3df 100644 | |
23099 | --- a/kernel/sched/cpudeadline.c | |
23100 | +++ b/kernel/sched/cpudeadline.c | |
23101 | @@ -127,13 +127,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |
e4b2b4a8 | 23102 | const struct sched_dl_entity *dl_se = &p->dl; |
1a6e0f06 | 23103 | |
e4b2b4a8 JK |
23104 | if (later_mask && |
23105 | - cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { | |
23106 | + cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { | |
23107 | return 1; | |
23108 | } else { | |
23109 | int best_cpu = cpudl_maximum(cp); | |
23110 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | |
23111 | ||
23112 | - if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | |
23113 | + if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && | |
23114 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | |
23115 | if (later_mask) | |
23116 | cpumask_set_cpu(best_cpu, later_mask); | |
b3bbd485 JK |
23117 | diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c |
23118 | index 2511aba36b89..7b9bc1de0e6c 100644 | |
23119 | --- a/kernel/sched/cpupri.c | |
23120 | +++ b/kernel/sched/cpupri.c | |
23121 | @@ -103,11 +103,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |
e4b2b4a8 JK |
23122 | if (skip) |
23123 | continue; | |
1a6e0f06 | 23124 | |
e4b2b4a8 JK |
23125 | - if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) |
23126 | + if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) | |
23127 | continue; | |
1a6e0f06 | 23128 | |
e4b2b4a8 JK |
23129 | if (lowest_mask) { |
23130 | - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); | |
23131 | + cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); | |
23132 | ||
23133 | /* | |
23134 | * We have to ensure that we have at least one bit | |
b3bbd485 JK |
23135 | diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c |
23136 | index b2589c7e9439..28a75a9526ac 100644 | |
23137 | --- a/kernel/sched/deadline.c | |
23138 | +++ b/kernel/sched/deadline.c | |
23139 | @@ -504,7 +504,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |
e4b2b4a8 JK |
23140 | * If we cannot preempt any rq, fall back to pick any |
23141 | * online cpu. | |
23142 | */ | |
23143 | - cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | |
23144 | + cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); | |
23145 | if (cpu >= nr_cpu_ids) { | |
23146 | /* | |
23147 | * Fail to find any suitable cpu. | |
b3bbd485 | 23148 | @@ -1020,7 +1020,7 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se) |
1a6e0f06 | 23149 | { |
e4b2b4a8 | 23150 | struct hrtimer *timer = &dl_se->dl_timer; |
1a6e0f06 | 23151 | |
e4b2b4a8 JK |
23152 | - hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
23153 | + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); | |
23154 | timer->function = dl_task_timer; | |
23155 | } | |
23156 | ||
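This is the first of several hrtimer conversions in the patch (the rt_period_timer and the cfs bandwidth timers below get the same treatment): timers whose handlers must not be pushed into the softirq-based timer thread on RT are initialized with the _HARD mode variants so they keep expiring in hard interrupt context. The shape of the change, assuming the HRTIMER_MODE_*_HARD modes this series introduces:

	/* before: handler may be deferred to the hrtimer softirq on RT */
	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);

	/* after: handler stays in hard irq context */
	hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);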
b3bbd485 | 23157 | @@ -1753,7 +1753,7 @@ static void set_curr_task_dl(struct rq *rq) |
e4b2b4a8 JK |
23158 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
23159 | { | |
23160 | if (!task_running(rq, p) && | |
23161 | - cpumask_test_cpu(cpu, &p->cpus_allowed)) | |
23162 | + cpumask_test_cpu(cpu, p->cpus_ptr)) | |
23163 | return 1; | |
23164 | return 0; | |
23165 | } | |
b3bbd485 | 23166 | @@ -1903,7 +1903,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) |
e4b2b4a8 JK |
23167 | /* Retry if something changed. */ |
23168 | if (double_lock_balance(rq, later_rq)) { | |
23169 | if (unlikely(task_rq(task) != rq || | |
23170 | - !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || | |
23171 | + !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || | |
23172 | task_running(rq, task) || | |
23173 | !dl_task(task) || | |
23174 | !task_on_rq_queued(task))) { | |
b3bbd485 JK |
23175 | diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c |
23176 | index 2f93e4a2d9f6..b5b43861c2b6 100644 | |
23177 | --- a/kernel/sched/debug.c | |
23178 | +++ b/kernel/sched/debug.c | |
23179 | @@ -1017,6 +1017,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, | |
e4b2b4a8 JK |
23180 | P(dl.runtime); |
23181 | P(dl.deadline); | |
1a6e0f06 | 23182 | } |
b3bbd485 | 23183 | +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) |
e4b2b4a8 JK |
23184 | + P(migrate_disable); |
23185 | +#endif | |
23186 | + P(nr_cpus_allowed); | |
23187 | #undef PN_SCHEDSTAT | |
23188 | #undef PN | |
23189 | #undef __PN | |
b3bbd485 | 23190 | diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c |
5dd41b01 | 23191 | index b2d699f28304..20e7d867af7a 100644 |
b3bbd485 JK |
23192 | --- a/kernel/sched/fair.c |
23193 | +++ b/kernel/sched/fair.c | |
5dd41b01 | 23194 | @@ -1598,7 +1598,7 @@ static void task_numa_compare(struct task_numa_env *env, |
e4b2b4a8 JK |
23195 | */ |
23196 | if (cur) { | |
23197 | /* Skip this swap candidate if cannot move to the source cpu */ | |
23198 | - if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | |
23199 | + if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) | |
23200 | goto unlock; | |
1a6e0f06 | 23201 | |
e4b2b4a8 | 23202 | /* |
5dd41b01 | 23203 | @@ -1708,7 +1708,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, |
1a6e0f06 | 23204 | |
e4b2b4a8 JK |
23205 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { |
23206 | /* Skip this CPU if the source task cannot migrate */ | |
23207 | - if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) | |
23208 | + if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) | |
23209 | continue; | |
1a6e0f06 | 23210 | |
e4b2b4a8 | 23211 | env->dst_cpu = cpu; |
5dd41b01 | 23212 | @@ -3842,7 +3842,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
e4b2b4a8 JK |
23213 | ideal_runtime = sched_slice(cfs_rq, curr); |
23214 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | |
23215 | if (delta_exec > ideal_runtime) { | |
23216 | - resched_curr(rq_of(cfs_rq)); | |
23217 | + resched_curr_lazy(rq_of(cfs_rq)); | |
23218 | /* | |
23219 | * The current task ran long enough, ensure it doesn't get | |
23220 | * re-elected due to buddy favours. | |
5dd41b01 | 23221 | @@ -3866,7 +3866,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
e4b2b4a8 | 23222 | return; |
1a6e0f06 | 23223 | |
e4b2b4a8 JK |
23224 | if (delta > ideal_runtime) |
23225 | - resched_curr(rq_of(cfs_rq)); | |
23226 | + resched_curr_lazy(rq_of(cfs_rq)); | |
23227 | } | |
1a6e0f06 | 23228 | |
e4b2b4a8 | 23229 | static void |
5dd41b01 | 23230 | @@ -4008,7 +4008,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) |
e4b2b4a8 JK |
23231 | * validating it and just reschedule. |
23232 | */ | |
23233 | if (queued) { | |
23234 | - resched_curr(rq_of(cfs_rq)); | |
23235 | + resched_curr_lazy(rq_of(cfs_rq)); | |
1a6e0f06 | 23236 | return; |
e4b2b4a8 JK |
23237 | } |
23238 | /* | |
5dd41b01 | 23239 | @@ -4190,7 +4190,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) |
e4b2b4a8 JK |
23240 | * hierarchy can be throttled |
23241 | */ | |
23242 | if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) | |
23243 | - resched_curr(rq_of(cfs_rq)); | |
23244 | + resched_curr_lazy(rq_of(cfs_rq)); | |
1a6e0f06 | 23245 | } |
1a6e0f06 | 23246 | |
e4b2b4a8 | 23247 | static __always_inline |
5dd41b01 | 23248 | @@ -4686,9 +4686,9 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
b3bbd485 JK |
23249 | cfs_b->period = ns_to_ktime(default_cfs_period()); |
23250 | ||
23251 | INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); | |
23252 | - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); | |
23253 | + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD); | |
23254 | cfs_b->period_timer.function = sched_cfs_period_timer; | |
23255 | - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
23256 | + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); | |
23257 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | |
23258 | } | |
23259 | ||
5dd41b01 | 23260 | @@ -4839,7 +4839,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) |
1a6e0f06 | 23261 | |
e4b2b4a8 JK |
23262 | if (delta < 0) { |
23263 | if (rq->curr == p) | |
23264 | - resched_curr(rq); | |
23265 | + resched_curr_lazy(rq); | |
23266 | return; | |
23267 | } | |
23268 | hrtick_start(rq, delta); | |
5dd41b01 | 23269 | @@ -5477,7 +5477,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, |
1a6e0f06 | 23270 | |
e4b2b4a8 JK |
23271 | /* Skip over this group if it has no CPUs allowed */ |
23272 | if (!cpumask_intersects(sched_group_span(group), | |
23273 | - &p->cpus_allowed)) | |
23274 | + p->cpus_ptr)) | |
23275 | continue; | |
1a6e0f06 | 23276 | |
e4b2b4a8 | 23277 | local_group = cpumask_test_cpu(this_cpu, |
5dd41b01 | 23278 | @@ -5597,7 +5597,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
e4b2b4a8 JK |
23279 | return cpumask_first(sched_group_span(group)); |
23280 | ||
23281 | /* Traverse only the allowed CPUs */ | |
23282 | - for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { | |
23283 | + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { | |
23284 | if (idle_cpu(i)) { | |
23285 | struct rq *rq = cpu_rq(i); | |
23286 | struct cpuidle_state *idle = idle_get_state(rq); | |
5dd41b01 | 23287 | @@ -5700,7 +5700,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int |
e4b2b4a8 JK |
23288 | if (!test_idle_cores(target, false)) |
23289 | return -1; | |
23290 | ||
23291 | - cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); | |
23292 | + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); | |
23293 | ||
23294 | for_each_cpu_wrap(core, cpus, target) { | |
23295 | bool idle = true; | |
5dd41b01 | 23296 | @@ -5734,7 +5734,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t |
e4b2b4a8 JK |
23297 | return -1; |
23298 | ||
23299 | for_each_cpu(cpu, cpu_smt_mask(target)) { | |
23300 | - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | |
23301 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) | |
23302 | continue; | |
23303 | if (idle_cpu(cpu)) | |
23304 | return cpu; | |
5dd41b01 | 23305 | @@ -5797,7 +5797,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t |
e4b2b4a8 JK |
23306 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { |
23307 | if (!--nr) | |
23308 | return -1; | |
23309 | - if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | |
23310 | + if (!cpumask_test_cpu(cpu, p->cpus_ptr)) | |
23311 | continue; | |
23312 | if (idle_cpu(cpu)) | |
23313 | break; | |
5dd41b01 | 23314 | @@ -5952,7 +5952,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f |
e4b2b4a8 JK |
23315 | if (sd_flag & SD_BALANCE_WAKE) { |
23316 | record_wakee(p); | |
23317 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) | |
23318 | - && cpumask_test_cpu(cpu, &p->cpus_allowed); | |
23319 | + && cpumask_test_cpu(cpu, p->cpus_ptr); | |
23320 | } | |
1a6e0f06 | 23321 | |
e4b2b4a8 | 23322 | rcu_read_lock(); |
5dd41b01 | 23323 | @@ -6233,7 +6233,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ |
e4b2b4a8 | 23324 | return; |
1a6e0f06 | 23325 | |
e4b2b4a8 JK |
23326 | preempt: |
23327 | - resched_curr(rq); | |
23328 | + resched_curr_lazy(rq); | |
23329 | /* | |
23330 | * Only set the backward buddy when the current task is still | |
23331 | * on the rq. This can happen when a wakeup gets interleaved | |
5dd41b01 | 23332 | @@ -6701,14 +6701,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) |
e4b2b4a8 JK |
23333 | /* |
23334 | * We do not migrate tasks that are: | |
23335 | * 1) throttled_lb_pair, or | |
23336 | - * 2) cannot be migrated to this CPU due to cpus_allowed, or | |
23337 | + * 2) cannot be migrated to this CPU due to cpus_ptr, or | |
23338 | * 3) running (obviously), or | |
23339 | * 4) are cache-hot on their current CPU. | |
23340 | */ | |
23341 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | |
23342 | return 0; | |
1a6e0f06 | 23343 | |
e4b2b4a8 JK |
23344 | - if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { |
23345 | + if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { | |
23346 | int cpu; | |
1a6e0f06 | 23347 | |
e4b2b4a8 | 23348 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
5dd41b01 | 23349 | @@ -6728,7 +6728,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) |
1a6e0f06 | 23350 | |
e4b2b4a8 JK |
23351 | /* Prevent to re-select dst_cpu via env's cpus */ |
23352 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | |
23353 | - if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | |
23354 | + if (cpumask_test_cpu(cpu, p->cpus_ptr)) { | |
23355 | env->flags |= LBF_DST_PINNED; | |
23356 | env->new_dst_cpu = cpu; | |
23357 | break; | |
5dd41b01 | 23358 | @@ -7297,7 +7297,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) |
1a6e0f06 | 23359 | |
e4b2b4a8 JK |
23360 | /* |
23361 | * Group imbalance indicates (and tries to solve) the problem where balancing | |
23362 | - * groups is inadequate due to ->cpus_allowed constraints. | |
23363 | + * groups is inadequate due to ->cpus_ptr constraints. | |
23364 | * | |
23365 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | |
23366 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | |
5dd41b01 | 23367 | @@ -7873,7 +7873,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) |
1a6e0f06 | 23368 | /* |
e4b2b4a8 JK |
23369 | * If the busiest group is imbalanced the below checks don't |
23370 | * work because they assume all things are equal, which typically | |
23371 | - * isn't true due to cpus_allowed constraints and the like. | |
23372 | + * isn't true due to cpus_ptr constraints and the like. | |
23373 | */ | |
23374 | if (busiest->group_type == group_imbalanced) | |
23375 | goto force_balance; | |
5dd41b01 | 23376 | @@ -8265,7 +8265,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, |
e4b2b4a8 JK |
23377 | * if the curr task on busiest cpu can't be |
23378 | * moved to this_cpu | |
23379 | */ | |
23380 | - if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | |
23381 | + if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { | |
23382 | raw_spin_unlock_irqrestore(&busiest->lock, | |
23383 | flags); | |
23384 | env.flags |= LBF_ALL_PINNED; | |
5dd41b01 | 23385 | @@ -9087,7 +9087,7 @@ static void task_fork_fair(struct task_struct *p) |
e4b2b4a8 JK |
23386 | * 'current' within the tree based on its new key value. |
23387 | */ | |
23388 | swap(curr->vruntime, se->vruntime); | |
23389 | - resched_curr(rq); | |
23390 | + resched_curr_lazy(rq); | |
1a6e0f06 | 23391 | } |
e4b2b4a8 JK |
23392 | |
23393 | se->vruntime -= cfs_rq->min_vruntime; | |
5dd41b01 | 23394 | @@ -9111,7 +9111,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) |
e4b2b4a8 JK |
23395 | */ |
23396 | if (rq->curr == p) { | |
23397 | if (p->prio > oldprio) | |
23398 | - resched_curr(rq); | |
23399 | + resched_curr_lazy(rq); | |
23400 | } else | |
23401 | check_preempt_curr(rq, p, 0); | |
23402 | } | |
b3bbd485 JK |
23403 | diff --git a/kernel/sched/features.h b/kernel/sched/features.h |
23404 | index 9552fd5854bf..fb069998b518 100644 | |
23405 | --- a/kernel/sched/features.h | |
23406 | +++ b/kernel/sched/features.h | |
23407 | @@ -46,11 +46,19 @@ SCHED_FEAT(LB_BIAS, true) | |
e4b2b4a8 JK |
23408 | */ |
23409 | SCHED_FEAT(NONTASK_CAPACITY, true) | |
23410 | ||
23411 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
23412 | +SCHED_FEAT(TTWU_QUEUE, false) | |
23413 | +# ifdef CONFIG_PREEMPT_LAZY | |
23414 | +SCHED_FEAT(PREEMPT_LAZY, true) | |
23415 | +# endif | |
23416 | +#else | |
23417 | + | |
23418 | /* | |
23419 | * Queue remote wakeups on the target CPU and process them | |
23420 | * using the scheduler IPI. Reduces rq->lock contention/bounces. | |
23421 | */ | |
23422 | SCHED_FEAT(TTWU_QUEUE, true) | |
1a6e0f06 JK |
23423 | +#endif |
23424 | ||
e4b2b4a8 JK |
23425 | /* |
23426 | * When doing wakeups, attempt to limit superfluous scans of the LLC domain. | |
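Each SCHED_FEAT entry compiles into a bit that scheduler code tests with sched_feat(), so defaulting TTWU_QUEUE to off here makes RT wakeups take the remote rq->lock directly instead of bouncing through the scheduler IPI. The consumer side, roughly as it appears in ttwu_queue():

	if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
		ttwu_queue_remote(p, cpu, wake_flags);	/* IPI path, off on RT */
		return;
	}
	/* otherwise: lock the remote rq and enqueue directly */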
b3bbd485 JK |
23427 | diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c |
23428 | index cb9a5b8532fa..6c72332dab3f 100644 | |
23429 | --- a/kernel/sched/rt.c | |
23430 | +++ b/kernel/sched/rt.c | |
23431 | @@ -47,8 +47,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) | |
c7c16703 | 23432 | |
e4b2b4a8 | 23433 | raw_spin_lock_init(&rt_b->rt_runtime_lock); |
c7c16703 | 23434 | |
e4b2b4a8 JK |
23435 | - hrtimer_init(&rt_b->rt_period_timer, |
23436 | - CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
23437 | + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, | |
23438 | + HRTIMER_MODE_REL_HARD); | |
23439 | rt_b->rt_period_timer.function = sched_rt_period_timer; | |
23440 | } | |
c7c16703 | 23441 | |
b3bbd485 | 23442 | @@ -1596,7 +1596,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
e4b2b4a8 JK |
23443 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
23444 | { | |
23445 | if (!task_running(rq, p) && | |
23446 | - cpumask_test_cpu(cpu, &p->cpus_allowed)) | |
23447 | + cpumask_test_cpu(cpu, p->cpus_ptr)) | |
23448 | return 1; | |
23449 | return 0; | |
c7c16703 | 23450 | } |
b3bbd485 | 23451 | @@ -1731,7 +1731,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) |
e4b2b4a8 JK |
23452 | * Also make sure that it wasn't scheduled on its rq. |
23453 | */ | |
23454 | if (unlikely(task_rq(task) != rq || | |
23455 | - !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || | |
23456 | + !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || | |
23457 | task_running(rq, task) || | |
23458 | !rt_task(task) || | |
23459 | !task_on_rq_queued(task))) { | |
b3bbd485 JK |
23460 | diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h |
23461 | index b29376169f3f..96481980c8c7 100644 | |
23462 | --- a/kernel/sched/sched.h | |
23463 | +++ b/kernel/sched/sched.h | |
23464 | @@ -1354,6 +1354,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |
e4b2b4a8 JK |
23465 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ |
23466 | #define WF_FORK 0x02 /* child wakeup after fork */ | |
23467 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | |
23468 | +#define WF_LOCK_SLEEPER 0x08 /* wakeup spinlock "sleeper" */ | |
c7c16703 | 23469 | |
e4b2b4a8 JK |
23470 | /* |
23471 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | |
b3bbd485 | 23472 | @@ -1545,6 +1546,15 @@ extern void init_sched_fair_class(void); |
e4b2b4a8 JK |
23473 | extern void resched_curr(struct rq *rq); |
23474 | extern void resched_cpu(int cpu); | |
23475 | ||
23476 | +#ifdef CONFIG_PREEMPT_LAZY | |
23477 | +extern void resched_curr_lazy(struct rq *rq); | |
23478 | +#else | |
23479 | +static inline void resched_curr_lazy(struct rq *rq) | |
1a6e0f06 | 23480 | +{ |
e4b2b4a8 | 23481 | + resched_curr(rq); |
1a6e0f06 | 23482 | +} |
1a6e0f06 JK |
23483 | +#endif |
23484 | + | |
e4b2b4a8 JK |
23485 | extern struct rt_bandwidth def_rt_bandwidth; |
23486 | extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); | |
23487 | ||
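resched_curr_lazy() is the lazy sibling of resched_curr(): rather than setting TIF_NEED_RESCHED (an immediate request that even the lazy gate in preempt_schedule() honors), it raises the weaker TIF_NEED_RESCHED_LAZY flag, so a SCHED_OTHER task that merely exhausted its slice is not preempted until it reaches a natural scheduling point. A simplified sketch of the CONFIG_PREEMPT_LAZY body (the real version in the RT tree's kernel/sched/core.c additionally handles remote CPUs):

	#ifdef CONFIG_PREEMPT_LAZY
	void resched_curr_lazy(struct rq *rq)
	{
		struct task_struct *curr = rq->curr;

		lockdep_assert_held(&rq->lock);

		if (!sched_feat(PREEMPT_LAZY)) {
			resched_curr(rq);	/* feature off: eager behavior */
			return;
		}
		if (test_tsk_need_resched(curr))
			return;			/* hard request already pending */
		set_tsk_thread_flag(curr, TIF_NEED_RESCHED_LAZY);
	}
	#endif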
b3bbd485 JK |
23488 | diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c |
23489 | index 9ff1555341ed..b14638a05ec9 100644 | |
23490 | --- a/kernel/sched/swait.c | |
23491 | +++ b/kernel/sched/swait.c | |
e4b2b4a8 JK |
23492 | @@ -1,6 +1,7 @@ |
23493 | // SPDX-License-Identifier: GPL-2.0 | |
23494 | #include <linux/sched/signal.h> | |
23495 | #include <linux/swait.h> | |
23496 | +#include <linux/suspend.h> | |
23497 | ||
23498 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | |
23499 | struct lock_class_key *key) | |
b3bbd485 | 23500 | @@ -30,6 +31,25 @@ void swake_up_locked(struct swait_queue_head *q) |
e4b2b4a8 JK |
23501 | } |
23502 | EXPORT_SYMBOL(swake_up_locked); | |
23503 | ||
23504 | +void swake_up_all_locked(struct swait_queue_head *q) | |
1a6e0f06 | 23505 | +{ |
e4b2b4a8 JK |
23506 | + struct swait_queue *curr; |
23507 | + int wakes = 0; | |
1a6e0f06 | 23508 | + |
e4b2b4a8 | 23509 | + while (!list_empty(&q->task_list)) { |
1a6e0f06 | 23510 | + |
e4b2b4a8 JK |
23511 | + curr = list_first_entry(&q->task_list, typeof(*curr), |
23512 | + task_list); | |
23513 | + wake_up_process(curr->task); | |
23514 | + list_del_init(&curr->task_list); | |
23515 | + wakes++; | |
23516 | + } | |
23517 | + if (pm_in_action) | |
23518 | + return; | |
23519 | + WARN(wakes > 2, "complete_all() with %d waiters\n", wakes); | |
1a6e0f06 | 23520 | +} |
e4b2b4a8 | 23521 | +EXPORT_SYMBOL(swake_up_all_locked); |
1a6e0f06 | 23522 | + |
e4b2b4a8 JK |
23523 | void swake_up(struct swait_queue_head *q) |
23524 | { | |
23525 | unsigned long flags; | |
b3bbd485 | 23526 | @@ -49,6 +69,7 @@ void swake_up_all(struct swait_queue_head *q) |
e4b2b4a8 JK |
23527 | struct swait_queue *curr; |
23528 | LIST_HEAD(tmp); | |
23529 | ||
23530 | + WARN_ON(irqs_disabled()); | |
23531 | raw_spin_lock_irq(&q->lock); | |
23532 | list_splice_init(&q->task_list, &tmp); | |
23533 | while (!list_empty(&tmp)) { | |
b3bbd485 JK |
23534 | diff --git a/kernel/sched/swork.c b/kernel/sched/swork.c |
23535 | new file mode 100644 | |
23536 | index 000000000000..1950f40ca725 | |
23537 | --- /dev/null | |
23538 | +++ b/kernel/sched/swork.c | |
e4b2b4a8 | 23539 | @@ -0,0 +1,173 @@ |
1a6e0f06 | 23540 | +/* |
e4b2b4a8 JK |
23541 | + * Copyright (C) 2014 BMW Car IT GmbH, Daniel Wagner daniel.wagner@bmw-carit.de |
23542 | + * | |
23543 | + * Provides a framework for enqueuing callbacks from irq context in a
23544 | + * PREEMPT_RT_FULL-safe way. The callbacks are executed in kthread context.
1a6e0f06 | 23545 | + */ |
1a6e0f06 | 23546 | + |
e4b2b4a8 JK |
23547 | +#include <linux/swait.h> |
23548 | +#include <linux/swork.h> | |
23549 | +#include <linux/kthread.h> | |
23550 | +#include <linux/slab.h> | |
23551 | +#include <linux/spinlock.h> | |
23552 | +#include <linux/export.h> | |
23553 | + | |
23554 | +#define SWORK_EVENT_PENDING (1 << 0) | |
23555 | + | |
23556 | +static DEFINE_MUTEX(worker_mutex); | |
23557 | +static struct sworker *glob_worker; | |
23558 | + | |
23559 | +struct sworker { | |
23560 | + struct list_head events; | |
23561 | + struct swait_queue_head wq; | |
1a6e0f06 | 23562 | + |
e4b2b4a8 JK |
23563 | + raw_spinlock_t lock; |
23564 | + | |
23565 | + struct task_struct *task; | |
23566 | + int refs; | |
23567 | +}; | |
1a6e0f06 | 23568 | + |
e4b2b4a8 | 23569 | +static bool swork_readable(struct sworker *worker) |
1a6e0f06 | 23570 | +{ |
e4b2b4a8 | 23571 | + bool r; |
1a6e0f06 | 23572 | + |
e4b2b4a8 JK |
23573 | + if (kthread_should_stop()) |
23574 | + return true; | |
23575 | + | |
23576 | + raw_spin_lock_irq(&worker->lock); | |
23577 | + r = !list_empty(&worker->events); | |
23578 | + raw_spin_unlock_irq(&worker->lock); | |
23579 | + | |
23580 | + return r; | |
1a6e0f06 | 23581 | +} |
1a6e0f06 | 23582 | + |
e4b2b4a8 | 23583 | +static int swork_kthread(void *arg) |
1a6e0f06 | 23584 | +{ |
e4b2b4a8 | 23585 | + struct sworker *worker = arg; |
1a6e0f06 | 23586 | + |
e4b2b4a8 JK |
23587 | + for (;;) { |
23588 | + swait_event_interruptible(worker->wq, | |
23589 | + swork_readable(worker)); | |
23590 | + if (kthread_should_stop()) | |
23591 | + break; | |
1a6e0f06 | 23592 | + |
e4b2b4a8 JK |
23593 | + raw_spin_lock_irq(&worker->lock); |
23594 | + while (!list_empty(&worker->events)) { | |
23595 | + struct swork_event *sev; | |
1a6e0f06 | 23596 | + |
e4b2b4a8 JK |
23597 | + sev = list_first_entry(&worker->events, |
23598 | + struct swork_event, item); | |
23599 | + list_del(&sev->item); | |
23600 | + raw_spin_unlock_irq(&worker->lock); | |
1a6e0f06 | 23601 | + |
e4b2b4a8 JK |
23602 | + WARN_ON_ONCE(!test_and_clear_bit(SWORK_EVENT_PENDING, |
23603 | + &sev->flags)); | |
23604 | + sev->func(sev); | |
23605 | + raw_spin_lock_irq(&worker->lock); | |
23606 | + } | |
23607 | + raw_spin_unlock_irq(&worker->lock); | |
23608 | + } | |
23609 | + return 0; | |
1a6e0f06 | 23610 | +} |
1a6e0f06 | 23611 | + |
e4b2b4a8 | 23612 | +static struct sworker *swork_create(void) |
1a6e0f06 | 23613 | +{ |
e4b2b4a8 | 23614 | + struct sworker *worker; |
1a6e0f06 | 23615 | + |
e4b2b4a8 JK |
23616 | + worker = kzalloc(sizeof(*worker), GFP_KERNEL); |
23617 | + if (!worker) | |
23618 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 23619 | + |
e4b2b4a8 JK |
23620 | + INIT_LIST_HEAD(&worker->events); |
23621 | + raw_spin_lock_init(&worker->lock); | |
23622 | + init_swait_queue_head(&worker->wq); | |
1a6e0f06 | 23623 | + |
e4b2b4a8 JK |
23624 | + worker->task = kthread_run(swork_kthread, worker, "kswork"); |
23625 | + if (IS_ERR(worker->task)) { | |
23626 | + kfree(worker); | |
23627 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 23628 | + } |
1a6e0f06 | 23629 | + |
e4b2b4a8 | 23630 | + return worker; |
1a6e0f06 | 23631 | +} |
1a6e0f06 | 23632 | + |
e4b2b4a8 | 23633 | +static void swork_destroy(struct sworker *worker) |
1a6e0f06 | 23634 | +{ |
e4b2b4a8 JK |
23635 | + kthread_stop(worker->task); |
23636 | + | |
23637 | + WARN_ON(!list_empty(&worker->events)); | |
23638 | + kfree(worker); | |
1a6e0f06 | 23639 | +} |
1a6e0f06 | 23640 | + |
e4b2b4a8 JK |
23641 | +/** |
23642 | + * swork_queue - queue swork | |
23643 | + * | |
23644 | + * Returns %false if @sev was already on a queue, %true otherwise.
23645 | + * | |
23646 | + * The work is queued and processed on a random CPU | |
23647 | + */ | |
23648 | +bool swork_queue(struct swork_event *sev) | |
1a6e0f06 | 23649 | +{ |
e4b2b4a8 | 23650 | + unsigned long flags; |
1a6e0f06 | 23651 | + |
e4b2b4a8 JK |
23652 | + if (test_and_set_bit(SWORK_EVENT_PENDING, &sev->flags)) |
23653 | + return false; | |
1a6e0f06 | 23654 | + |
e4b2b4a8 JK |
23655 | + raw_spin_lock_irqsave(&glob_worker->lock, flags); |
23656 | + list_add_tail(&sev->item, &glob_worker->events); | |
23657 | + raw_spin_unlock_irqrestore(&glob_worker->lock, flags); | |
1a6e0f06 | 23658 | + |
e4b2b4a8 JK |
23659 | + swake_up(&glob_worker->wq); |
23660 | + return true; | |
1a6e0f06 | 23661 | +} |
e4b2b4a8 | 23662 | +EXPORT_SYMBOL_GPL(swork_queue); |
1a6e0f06 | 23663 | + |
e4b2b4a8 JK |
23664 | +/** |
23665 | + * swork_get - get an instance of the sworker | |
23666 | + * | |
23667 | + * Returns a negative error code if the initialization of the worker
23668 | + * failed, %0 otherwise.
23669 | + * | |
23670 | + */ | |
23671 | +int swork_get(void) | |
1a6e0f06 | 23672 | +{ |
e4b2b4a8 | 23673 | + struct sworker *worker; |
1a6e0f06 | 23674 | + |
e4b2b4a8 JK |
23675 | + mutex_lock(&worker_mutex); |
23676 | + if (!glob_worker) { | |
23677 | + worker = swork_create(); | |
23678 | + if (IS_ERR(worker)) { | |
23679 | + mutex_unlock(&worker_mutex); | |
23680 | + return -ENOMEM; | |
23681 | + } | |
1a6e0f06 | 23682 | + |
e4b2b4a8 JK |
23683 | + glob_worker = worker; |
23684 | + } | |
1a6e0f06 | 23685 | + |
e4b2b4a8 JK |
23686 | + glob_worker->refs++; |
23687 | + mutex_unlock(&worker_mutex); | |
1a6e0f06 | 23688 | + |
e4b2b4a8 | 23689 | + return 0; |
1a6e0f06 | 23690 | +} |
e4b2b4a8 | 23691 | +EXPORT_SYMBOL_GPL(swork_get); |
1a6e0f06 | 23692 | + |
e4b2b4a8 JK |
23693 | +/** |
23694 | + * swork_put - puts an instance of the sworker | |
23695 | + * | |
23696 | + * Will destroy the sworker thread. This function must not be called until all | |
23697 | + * queued events have been completed. | |
1a6e0f06 | 23698 | + */ |
e4b2b4a8 | 23699 | +void swork_put(void) |
1a6e0f06 | 23700 | +{ |
e4b2b4a8 | 23701 | + mutex_lock(&worker_mutex); |
1a6e0f06 | 23702 | + |
e4b2b4a8 JK |
23703 | + glob_worker->refs--; |
23704 | + if (glob_worker->refs > 0) | |
23705 | + goto out; | |
1a6e0f06 | 23706 | + |
e4b2b4a8 JK |
23707 | + swork_destroy(glob_worker); |
23708 | + glob_worker = NULL; | |
23709 | +out: | |
23710 | + mutex_unlock(&worker_mutex); | |
1a6e0f06 | 23711 | +} |
e4b2b4a8 | 23712 | +EXPORT_SYMBOL_GPL(swork_put); |
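A consumer's view of this API: one global worker is reference-counted via swork_get()/swork_put(), and events are queued from atomic context with swork_queue(). A hedged sketch of a typical user, assuming the INIT_SWORK() initializer from the accompanying linux/swork.h:

	#include <linux/interrupt.h>
	#include <linux/swork.h>

	static void my_deferred_fn(struct swork_event *sev)
	{
		/* runs in the "kswork" kthread: preemptible, may sleep */
	}

	static struct swork_event my_ev;

	static int my_init(void)
	{
		int err = swork_get();	/* creates the worker on first use */
		if (err)
			return err;
		INIT_SWORK(&my_ev, my_deferred_fn);
		return 0;
	}

	static irqreturn_t my_irq_handler(int irq, void *dev)
	{
		swork_queue(&my_ev);	/* safe from hard irq context */
		return IRQ_HANDLED;
	}

	static void my_exit(void)
	{
		swork_put();	/* only after all queued events completed */
	}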
b3bbd485 JK |
23713 | diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c |
23714 | index 659e075ef70b..bb22e3620a90 100644 | |
23715 | --- a/kernel/sched/topology.c | |
23716 | +++ b/kernel/sched/topology.c | |
23717 | @@ -286,6 +286,7 @@ static int init_rootdomain(struct root_domain *rd) | |
e4b2b4a8 JK |
23718 | rd->rto_cpu = -1; |
23719 | raw_spin_lock_init(&rd->rto_lock); | |
23720 | init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); | |
23721 | + rd->rto_push_work.flags |= IRQ_WORK_HARD_IRQ; | |
23722 | #endif | |
23723 | ||
23724 | init_dl_bw(&rd->dl_bw); | |
b3bbd485 JK |
23725 | diff --git a/kernel/signal.c b/kernel/signal.c |
23726 | index 4439ba9dc5d9..d8f75a030292 100644 | |
23727 | --- a/kernel/signal.c | |
23728 | +++ b/kernel/signal.c | |
e4b2b4a8 JK |
23729 | @@ -19,6 +19,7 @@ |
23730 | #include <linux/sched/task.h> | |
23731 | #include <linux/sched/task_stack.h> | |
23732 | #include <linux/sched/cputime.h> | |
23733 | +#include <linux/sched/rt.h> | |
23734 | #include <linux/fs.h> | |
23735 | #include <linux/tty.h> | |
23736 | #include <linux/binfmts.h> | |
b3bbd485 | 23737 | @@ -360,13 +361,30 @@ static bool task_participate_group_stop(struct task_struct *task) |
e4b2b4a8 JK |
23738 | return false; |
23739 | } | |
23740 | ||
23741 | +static inline struct sigqueue *get_task_cache(struct task_struct *t) | |
1a6e0f06 | 23742 | +{ |
e4b2b4a8 | 23743 | + struct sigqueue *q = t->sigqueue_cache; |
1a6e0f06 | 23744 | + |
e4b2b4a8 JK |
23745 | + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) |
23746 | + return NULL; | |
23747 | + return q; | |
1a6e0f06 | 23748 | +} |
1a6e0f06 | 23749 | + |
e4b2b4a8 | 23750 | +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) |
1a6e0f06 | 23751 | +{ |
e4b2b4a8 JK |
23752 | + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) |
23753 | + return 0; | |
23754 | + return 1; | |
1a6e0f06 | 23755 | +} |
1a6e0f06 | 23756 | + |
e4b2b4a8 JK |
23757 | /* |
23758 | * allocate a new signal queue record | |
23759 | * - this may be called without locks if and only if t == current, otherwise an | |
23760 | * appropriate lock must be held to stop the target task from exiting | |
23761 | */ | |
23762 | static struct sigqueue * | |
23763 | -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) | |
23764 | +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, | |
23765 | + int override_rlimit, int fromslab) | |
23766 | { | |
23767 | struct sigqueue *q = NULL; | |
23768 | struct user_struct *user; | |
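get_task_cache()/put_task_cache() above form a lock-free one-element cache: each cmpxchg() succeeds only if the slot still holds the expected value, so a concurrent user harmlessly misses the cache instead of racing. The same idiom on a generic pointer slot, as a sketch:

	static void *slot_get(void **slot)
	{
		void *obj = READ_ONCE(*slot);

		/* claim the object only if nobody else took it meanwhile */
		if (obj && cmpxchg(slot, obj, NULL) == obj)
			return obj;
		return NULL;
	}

	static bool slot_put(void **slot, void *obj)
	{
		/* install only into an empty slot; caller frees on failure */
		return cmpxchg(slot, NULL, obj) == NULL;
	}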
b3bbd485 | 23769 | @@ -383,7 +401,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi |
e4b2b4a8 JK |
23770 | if (override_rlimit || |
23771 | atomic_read(&user->sigpending) <= | |
23772 | task_rlimit(t, RLIMIT_SIGPENDING)) { | |
23773 | - q = kmem_cache_alloc(sigqueue_cachep, flags); | |
23774 | + if (!fromslab) | |
23775 | + q = get_task_cache(t); | |
23776 | + if (!q) | |
23777 | + q = kmem_cache_alloc(sigqueue_cachep, flags); | |
23778 | } else { | |
23779 | print_dropped_signal(sig); | |
23780 | } | |
b3bbd485 | 23781 | @@ -400,6 +421,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi |
e4b2b4a8 JK |
23782 | return q; |
23783 | } | |
23784 | ||
23785 | +static struct sigqueue * | |
23786 | +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, | |
23787 | + int override_rlimit) | |
1a6e0f06 | 23788 | +{ |
e4b2b4a8 | 23789 | + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); |
1a6e0f06 | 23790 | +} |
1a6e0f06 | 23791 | + |
e4b2b4a8 JK |
23792 | static void __sigqueue_free(struct sigqueue *q) |
23793 | { | |
23794 | if (q->flags & SIGQUEUE_PREALLOC) | |
b3bbd485 | 23795 | @@ -409,6 +437,21 @@ static void __sigqueue_free(struct sigqueue *q) |
e4b2b4a8 JK |
23796 | kmem_cache_free(sigqueue_cachep, q); |
23797 | } | |
23798 | ||
23799 | +static void sigqueue_free_current(struct sigqueue *q) | |
1a6e0f06 | 23800 | +{ |
e4b2b4a8 JK |
23801 | + struct user_struct *up; |
23802 | + | |
23803 | + if (q->flags & SIGQUEUE_PREALLOC) | |
23804 | + return; | |
23805 | + | |
23806 | + up = q->user; | |
23807 | + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { | |
23808 | + atomic_dec(&up->sigpending); | |
23809 | + free_uid(up); | |
23810 | + } else | |
23811 | + __sigqueue_free(q); | |
1a6e0f06 | 23812 | +} |
1a6e0f06 | 23813 | + |
e4b2b4a8 JK |
23814 | void flush_sigqueue(struct sigpending *queue) |
23815 | { | |
23816 | struct sigqueue *q; | |
b3bbd485 JK |
23817 | @@ -421,6 +464,21 @@ void flush_sigqueue(struct sigpending *queue) |
23818 | } | |
e4b2b4a8 JK |
23819 | } |
23820 | ||
b3bbd485 | 23821 | +/* |
e4b2b4a8 JK |
23822 | + * Called from __exit_signal. Flush tsk->pending and |
23823 | + * tsk->sigqueue_cache | |
23824 | + */ | |
23825 | +void flush_task_sigqueue(struct task_struct *tsk) | |
1a6e0f06 | 23826 | +{ |
e4b2b4a8 | 23827 | + struct sigqueue *q; |
1a6e0f06 | 23828 | + |
e4b2b4a8 | 23829 | + flush_sigqueue(&tsk->pending); |
1a6e0f06 | 23830 | + |
e4b2b4a8 JK |
23831 | + q = get_task_cache(tsk); |
23832 | + if (q) | |
23833 | + kmem_cache_free(sigqueue_cachep, q); | |
1a6e0f06 JK |
23834 | +} |
23835 | + | |
b3bbd485 | 23836 | /* |
e4b2b4a8 JK |
23837 | * Flush all pending signals for this kthread. |
23838 | */ | |
b3bbd485 | 23839 | @@ -542,7 +600,7 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info, |
e4b2b4a8 JK |
23840 | (info->si_code == SI_TIMER) && |
23841 | (info->si_sys_private); | |
23842 | ||
23843 | - __sigqueue_free(first); | |
23844 | + sigqueue_free_current(first); | |
23845 | } else { | |
23846 | /* | |
23847 | * Ok, it wasn't in the queue. This must be | |
b3bbd485 | 23848 | @@ -578,6 +636,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
e4b2b4a8 JK |
23849 | bool resched_timer = false; |
23850 | int signr; | |
23851 | ||
23852 | + WARN_ON_ONCE(tsk != current); | |
23853 | + | |
23854 | /* We only dequeue private signals from ourselves, we don't let | |
23855 | * signalfd steal them | |
23856 | */ | |
b3bbd485 | 23857 | @@ -1177,8 +1237,8 @@ int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, |
e4b2b4a8 JK |
23858 | * We don't want to have recursive SIGSEGV's etc, for example, |
23859 | * that is why we also clear SIGNAL_UNKILLABLE. | |
23860 | */ | |
23861 | -int | |
23862 | -force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |
23863 | +static int | |
23864 | +do_force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |
23865 | { | |
23866 | unsigned long int flags; | |
23867 | int ret, blocked, ignored; | |
b3bbd485 | 23868 | @@ -1207,6 +1267,39 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) |
e4b2b4a8 JK |
23869 | return ret; |
23870 | } | |
23871 | ||
23872 | +int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) | |
1a6e0f06 | 23873 | +{ |
e4b2b4a8 JK |
23874 | +/* |
23875 | + * On some archs, PREEMPT_RT has to delay sending a signal from a trap | |
23876 | + * since it cannot enable preemption, and the signal code's spin_locks
23877 | + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will | |
23878 | + * send the signal on exit of the trap. | |
23879 | + */ | |
23880 | +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND | |
23881 | + if (in_atomic()) { | |
23882 | + if (WARN_ON_ONCE(t != current)) | |
23883 | + return 0; | |
23884 | + if (WARN_ON_ONCE(t->forced_info.si_signo)) | |
23885 | + return 0; | |
1a6e0f06 | 23886 | + |
e4b2b4a8 JK |
23887 | + if (is_si_special(info)) { |
23888 | + WARN_ON_ONCE(info != SEND_SIG_PRIV); | |
23889 | + t->forced_info.si_signo = sig; | |
23890 | + t->forced_info.si_errno = 0; | |
23891 | + t->forced_info.si_code = SI_KERNEL; | |
23892 | + t->forced_info.si_pid = 0; | |
23893 | + t->forced_info.si_uid = 0; | |
23894 | + } else { | |
23895 | + t->forced_info = *info; | |
23896 | + } | |
1a6e0f06 | 23897 | + |
e4b2b4a8 JK |
23898 | + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); |
23899 | + return 0; | |
23900 | + } | |
23901 | +#endif | |
23902 | + return do_force_sig_info(sig, info, t); | |
1a6e0f06 | 23903 | +} |
1a6e0f06 | 23904 | + |
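The stashed forced_info is consumed on the way back to user space: an architecture that defines ARCH_RT_DELAYS_SIGNAL_SEND re-sends the signal from its TIF_NOTIFY_RESUME path, where taking sleeping locks is legal again. A hedged sketch of the arch-side counterpart (the RT series carries a variant of this in the x86 do_notify_resume() code):

	#ifdef ARCH_RT_DELAYS_SIGNAL_SEND
		if (unlikely(current->forced_info.si_signo)) {
			force_sig_info(current->forced_info.si_signo,
				       &current->forced_info, current);
			current->forced_info.si_signo = 0;	/* one-shot */
		}
	#endif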
e4b2b4a8 JK |
23905 | /* |
23906 | * Nuke all other threads in the group. | |
23907 | */ | |
b3bbd485 | 23908 | @@ -1241,12 +1334,12 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
e4b2b4a8 JK |
23909 | * Disable interrupts early to avoid deadlocks. |
23910 | * See rcu_read_unlock() comment header for details. | |
23911 | */ | |
23912 | - local_irq_save(*flags); | |
23913 | + local_irq_save_nort(*flags); | |
23914 | rcu_read_lock(); | |
23915 | sighand = rcu_dereference(tsk->sighand); | |
23916 | if (unlikely(sighand == NULL)) { | |
23917 | rcu_read_unlock(); | |
23918 | - local_irq_restore(*flags); | |
23919 | + local_irq_restore_nort(*flags); | |
23920 | break; | |
23921 | } | |
23922 | /* | |
b3bbd485 | 23923 | @@ -1267,7 +1360,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk, |
e4b2b4a8 JK |
23924 | } |
23925 | spin_unlock(&sighand->siglock); | |
23926 | rcu_read_unlock(); | |
23927 | - local_irq_restore(*flags); | |
23928 | + local_irq_restore_nort(*flags); | |
23929 | } | |
23930 | ||
23931 | return sighand; | |
b3bbd485 | 23932 | @@ -1514,7 +1607,8 @@ EXPORT_SYMBOL(kill_pid); |
e4b2b4a8 JK |
23933 | */ |
23934 | struct sigqueue *sigqueue_alloc(void) | |
23935 | { | |
23936 | - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); | |
23937 | + /* Preallocated sigqueue objects always from the slabcache ! */ | |
23938 | + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); | |
23939 | ||
23940 | if (q) | |
23941 | q->flags |= SIGQUEUE_PREALLOC; | |
b3bbd485 | 23942 | @@ -1888,15 +1982,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) |
e4b2b4a8 JK |
23943 | if (gstop_done && ptrace_reparented(current)) |
23944 | do_notify_parent_cldstop(current, false, why); | |
23945 | ||
23946 | - /* | |
23947 | - * Don't want to allow preemption here, because | |
23948 | - * sys_ptrace() needs this task to be inactive. | |
23949 | - * | |
23950 | - * XXX: implement read_unlock_no_resched(). | |
23951 | - */ | |
23952 | - preempt_disable(); | |
23953 | read_unlock(&tasklist_lock); | |
23954 | - preempt_enable_no_resched(); | |
23955 | freezable_schedule(); | |
23956 | } else { | |
23957 | /* | |
b3bbd485 | 23958 | diff --git a/kernel/softirq.c b/kernel/softirq.c |
5dd41b01 | 23959 | index a4c87cf27f9d..583c9ecf04e3 100644 |
b3bbd485 JK |
23960 | --- a/kernel/softirq.c |
23961 | +++ b/kernel/softirq.c | |
e4b2b4a8 JK |
23962 | @@ -21,11 +21,14 @@ |
23963 | #include <linux/freezer.h> | |
23964 | #include <linux/kthread.h> | |
23965 | #include <linux/rcupdate.h> | |
23966 | +#include <linux/delay.h> | |
23967 | #include <linux/ftrace.h> | |
23968 | #include <linux/smp.h> | |
23969 | #include <linux/smpboot.h> | |
23970 | #include <linux/tick.h> | |
23971 | +#include <linux/locallock.h> | |
23972 | #include <linux/irq.h> | |
23973 | +#include <linux/sched/types.h> | |
23974 | ||
23975 | #define CREATE_TRACE_POINTS | |
23976 | #include <trace/events/irq.h> | |
b3bbd485 | 23977 | @@ -56,12 +59,108 @@ EXPORT_SYMBOL(irq_stat); |
e4b2b4a8 JK |
23978 | static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; |
23979 | ||
23980 | DEFINE_PER_CPU(struct task_struct *, ksoftirqd); | |
23981 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
23982 | +#define TIMER_SOFTIRQS ((1 << TIMER_SOFTIRQ) | (1 << HRTIMER_SOFTIRQ)) | |
23983 | +DEFINE_PER_CPU(struct task_struct *, ktimer_softirqd); | |
23984 | +#endif | |
23985 | ||
23986 | const char * const softirq_to_name[NR_SOFTIRQS] = { | |
23987 | "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL", | |
23988 | "TASKLET", "SCHED", "HRTIMER", "RCU" | |
23989 | }; | |
23990 | ||
23991 | +#ifdef CONFIG_NO_HZ_COMMON | |
23992 | +# ifdef CONFIG_PREEMPT_RT_FULL | |
1a6e0f06 | 23993 | + |
e4b2b4a8 JK |
23994 | +struct softirq_runner { |
23995 | + struct task_struct *runner[NR_SOFTIRQS]; | |
23996 | +}; | |
1a6e0f06 | 23997 | + |
e4b2b4a8 | 23998 | +static DEFINE_PER_CPU(struct softirq_runner, softirq_runners); |
1a6e0f06 | 23999 | + |
e4b2b4a8 | 24000 | +static inline void softirq_set_runner(unsigned int sirq) |
1a6e0f06 | 24001 | +{ |
e4b2b4a8 | 24002 | + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners); |
1a6e0f06 | 24003 | + |
e4b2b4a8 | 24004 | + sr->runner[sirq] = current; |
1a6e0f06 | 24005 | +} |
1a6e0f06 | 24006 | + |
e4b2b4a8 | 24007 | +static inline void softirq_clr_runner(unsigned int sirq) |
1a6e0f06 | 24008 | +{ |
e4b2b4a8 JK |
24009 | + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners); |
24010 | + | |
24011 | + sr->runner[sirq] = NULL; | |
1a6e0f06 | 24012 | +} |
1a6e0f06 | 24013 | + |
e4b2b4a8 JK |
24014 | +/* |
24015 | + * On preempt-rt a softirq running context might be blocked on a | |
24016 | + * lock. There might be no other runnable task on this CPU because the | |
24017 | + * lock owner runs on some other CPU. So we have to go into idle with | |
24018 | + * the pending bit set. Therefore we need to check this, otherwise we
24019 | + * warn about false positives, which confuses users and defeats the
24020 | + * whole purpose of this test.
1a6e0f06 | 24021 | + * |
e4b2b4a8 | 24022 | + * This code is called with interrupts disabled. |
1a6e0f06 | 24023 | + */ |
e4b2b4a8 | 24024 | +void softirq_check_pending_idle(void) |
1a6e0f06 | 24025 | +{ |
e4b2b4a8 JK |
24026 | + static int rate_limit; |
24027 | + struct softirq_runner *sr = this_cpu_ptr(&softirq_runners); | |
24028 | + u32 warnpending; | |
24029 | + int i; | |
24030 | + | |
24031 | + if (rate_limit >= 10) | |
24032 | + return; | |
24033 | + | |
24034 | + warnpending = local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK; | |
24035 | + for (i = 0; i < NR_SOFTIRQS; i++) { | |
24036 | + struct task_struct *tsk = sr->runner[i]; | |
24037 | + | |
24038 | + /* | |
24039 | + * The wakeup code in rtmutex.c wakes up the task | |
24040 | + * _before_ it sets pi_blocked_on to NULL under | |
24041 | + * tsk->pi_lock. So we need to check for both: state | |
24042 | + * and pi_blocked_on. | |
24043 | + */ | |
24044 | + if (tsk) { | |
24045 | + raw_spin_lock(&tsk->pi_lock); | |
24046 | + if (tsk->pi_blocked_on || tsk->state == TASK_RUNNING) { | |
24047 | + /* Clear all bits pending in that task */ | |
24048 | + warnpending &= ~(tsk->softirqs_raised); | |
24049 | + warnpending &= ~(1 << i); | |
24050 | + } | |
24051 | + raw_spin_unlock(&tsk->pi_lock); | |
24052 | + } | |
1a6e0f06 | 24053 | + } |
e4b2b4a8 JK |
24054 | + |
24055 | + if (warnpending) { | |
24056 | + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | |
24057 | + warnpending); | |
24058 | + rate_limit++; | |
24059 | + } | |
24060 | +} | |
24061 | +# else | |
24062 | +/* | |
24063 | + * On !PREEMPT_RT we just printk rate limited: | |
24064 | + */ | |
24065 | +void softirq_check_pending_idle(void) | |
1a6e0f06 | 24066 | +{ |
e4b2b4a8 JK |
24067 | + static int rate_limit; |
24068 | + | |
5dd41b01 | 24069 | + if (rate_limit < 10 && !in_softirq() && |
e4b2b4a8 JK |
24070 | + (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { |
24071 | + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | |
24072 | + local_softirq_pending()); | |
24073 | + rate_limit++; | |
24074 | + } | |
1a6e0f06 | 24075 | +} |
e4b2b4a8 JK |
24076 | +# endif |
24077 | + | |
24078 | +#else /* !CONFIG_NO_HZ_COMMON */ | |
24079 | +static inline void softirq_set_runner(unsigned int sirq) { } | |
24080 | +static inline void softirq_clr_runner(unsigned int sirq) { } | |
24081 | +#endif | |
1a6e0f06 JK |
24082 | + |
24083 | /* | |
e4b2b4a8 JK |
24084 | * we cannot loop indefinitely here to avoid userspace starvation, |
24085 | * but we also don't want to introduce a worst case 1/HZ latency | |
b3bbd485 | 24086 | @@ -77,6 +176,38 @@ static void wakeup_softirqd(void) |
e4b2b4a8 | 24087 | wake_up_process(tsk); |
1a6e0f06 JK |
24088 | } |
24089 | ||
e4b2b4a8 JK |
24090 | +#ifdef CONFIG_PREEMPT_RT_FULL |
24091 | +static void wakeup_timer_softirqd(void) | |
1a6e0f06 | 24092 | +{ |
e4b2b4a8 JK |
24093 | + /* Interrupts are disabled: no need to stop preemption */ |
24094 | + struct task_struct *tsk = __this_cpu_read(ktimer_softirqd); | |
24095 | + | |
24096 | + if (tsk && tsk->state != TASK_RUNNING) | |
24097 | + wake_up_process(tsk); | |
1a6e0f06 | 24098 | +} |
e4b2b4a8 | 24099 | +#endif |
1a6e0f06 | 24100 | + |
e4b2b4a8 JK |
24101 | +static void handle_softirq(unsigned int vec_nr) |
24102 | +{ | |
24103 | + struct softirq_action *h = softirq_vec + vec_nr; | |
24104 | + int prev_count; | |
1a6e0f06 | 24105 | + |
e4b2b4a8 | 24106 | + prev_count = preempt_count(); |
1a6e0f06 | 24107 | + |
e4b2b4a8 | 24108 | + kstat_incr_softirqs_this_cpu(vec_nr); |
1a6e0f06 | 24109 | + |
e4b2b4a8 JK |
24110 | + trace_softirq_entry(vec_nr); |
24111 | + h->action(h); | |
24112 | + trace_softirq_exit(vec_nr); | |
24113 | + if (unlikely(prev_count != preempt_count())) { | |
24114 | + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", | |
24115 | + vec_nr, softirq_to_name[vec_nr], h->action, | |
24116 | + prev_count, preempt_count()); | |
24117 | + preempt_count_set(prev_count); | |
24118 | + } | |
1a6e0f06 JK |
24119 | +} |
24120 | + | |
e4b2b4a8 | 24121 | +#ifndef CONFIG_PREEMPT_RT_FULL |
1a6e0f06 | 24122 | /* |
e4b2b4a8 JK |
24123 | * If ksoftirqd is scheduled, we do not want to process pending softirqs |
24124 | * right now. Let ksoftirqd handle this at its own rate, to get fairness, | |
b3bbd485 | 24125 | @@ -92,6 +223,47 @@ static bool ksoftirqd_running(unsigned long pending) |
e4b2b4a8 | 24126 | return tsk && (tsk->state == TASK_RUNNING); |
1a6e0f06 JK |
24127 | } |
24128 | ||
e4b2b4a8 | 24129 | +static inline int ksoftirqd_softirq_pending(void) |
1a6e0f06 | 24130 | +{ |
e4b2b4a8 | 24131 | + return local_softirq_pending(); |
1a6e0f06 JK |
24132 | +} |
24133 | + | |
e4b2b4a8 | 24134 | +static void handle_pending_softirqs(u32 pending) |
1a6e0f06 | 24135 | +{ |
e4b2b4a8 JK |
24136 | + struct softirq_action *h = softirq_vec; |
24137 | + int softirq_bit; | |
1a6e0f06 | 24138 | + |
e4b2b4a8 JK |
24139 | + local_irq_enable(); |
24140 | + | |
24141 | + h = softirq_vec; | |
24142 | + | |
24143 | + while ((softirq_bit = ffs(pending))) { | |
24144 | + unsigned int vec_nr; | |
24145 | + | |
24146 | + h += softirq_bit - 1; | |
24147 | + vec_nr = h - softirq_vec; | |
24148 | + handle_softirq(vec_nr); | |
24149 | + | |
24150 | + h++; | |
24151 | + pending >>= softirq_bit; | |
1a6e0f06 | 24152 | + } |
e4b2b4a8 JK |
24153 | + |
24154 | + rcu_bh_qs(); | |
24155 | + local_irq_disable(); | |
1a6e0f06 | 24156 | +} |
e4b2b4a8 JK |
24157 | + |
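The factored-out loop walks the pending bitmask with ffs(): each pass handles the lowest set bit, then shifts it away. A minimal userspace sketch of the same walk (vector numbers and output are illustrative, not kernel API):

    #include <stdio.h>
    #include <strings.h>        /* ffs() */

    int main(void)
    {
            unsigned int pending = 0x2a;    /* vectors 1, 3 and 5 raised */
            unsigned int base = 0;          /* mirrors the h pointer offset */
            int bit;

            while ((bit = ffs(pending))) {
                    unsigned int vec_nr = base + bit - 1;

                    printf("handle softirq vector %u\n", vec_nr);
                    base += bit;            /* h += softirq_bit - 1; ... h++ */
                    pending >>= bit;        /* consume the handled bit */
            }
            return 0;
    }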
24158 | +static void run_ksoftirqd(unsigned int cpu) | |
1a6e0f06 | 24159 | +{ |
e4b2b4a8 JK |
24160 | + local_irq_disable(); |
24161 | + if (ksoftirqd_softirq_pending()) { | |
24162 | + __do_softirq(); | |
24163 | + local_irq_enable(); | |
24164 | + cond_resched_rcu_qs(); | |
24165 | + return; | |
24166 | + } | |
24167 | + local_irq_enable(); | |
1a6e0f06 | 24168 | +} |
1a6e0f06 | 24169 | + |
e4b2b4a8 JK |
24170 | /* |
24171 | * preempt_count and SOFTIRQ_OFFSET usage: | |
24172 | * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving | |
b3bbd485 | 24173 | @@ -247,10 +419,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) |
e4b2b4a8 JK |
24174 | unsigned long end = jiffies + MAX_SOFTIRQ_TIME; |
24175 | unsigned long old_flags = current->flags; | |
24176 | int max_restart = MAX_SOFTIRQ_RESTART; | |
24177 | - struct softirq_action *h; | |
24178 | bool in_hardirq; | |
24179 | __u32 pending; | |
24180 | - int softirq_bit; | |
24181 | ||
24182 | /* | |
24183 | * Mask out PF_MEMALLOC as current task context is borrowed for the | 
b3bbd485 | 24184 | @@ -269,36 +439,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) |
e4b2b4a8 JK |
24185 | /* Reset the pending bitmask before enabling irqs */ |
24186 | set_softirq_pending(0); | |
24187 | ||
24188 | - local_irq_enable(); | |
24189 | - | |
24190 | - h = softirq_vec; | |
24191 | - | |
24192 | - while ((softirq_bit = ffs(pending))) { | |
24193 | - unsigned int vec_nr; | |
24194 | - int prev_count; | |
24195 | - | |
24196 | - h += softirq_bit - 1; | |
24197 | - | |
24198 | - vec_nr = h - softirq_vec; | |
24199 | - prev_count = preempt_count(); | |
24200 | - | |
24201 | - kstat_incr_softirqs_this_cpu(vec_nr); | |
24202 | - | |
24203 | - trace_softirq_entry(vec_nr); | |
24204 | - h->action(h); | |
24205 | - trace_softirq_exit(vec_nr); | |
24206 | - if (unlikely(prev_count != preempt_count())) { | |
24207 | - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", | |
24208 | - vec_nr, softirq_to_name[vec_nr], h->action, | |
24209 | - prev_count, preempt_count()); | |
24210 | - preempt_count_set(prev_count); | |
24211 | - } | |
24212 | - h++; | |
24213 | - pending >>= softirq_bit; | |
24214 | - } | |
24215 | - | |
24216 | - rcu_bh_qs(); | |
24217 | - local_irq_disable(); | |
24218 | + handle_pending_softirqs(pending); | |
24219 | ||
24220 | pending = local_softirq_pending(); | |
24221 | if (pending) { | |
b3bbd485 JK |
24222 | @@ -334,6 +475,309 @@ asmlinkage __visible void do_softirq(void) |
24223 | local_irq_restore(flags); | |
e4b2b4a8 JK |
24224 | } |
24225 | ||
b3bbd485 | 24226 | +/* |
e4b2b4a8 | 24227 | + * This function must run with irqs disabled! |
1a6e0f06 | 24228 | + */ |
e4b2b4a8 | 24229 | +void raise_softirq_irqoff(unsigned int nr) |
1a6e0f06 | 24230 | +{ |
e4b2b4a8 | 24231 | + __raise_softirq_irqoff(nr); |
1a6e0f06 JK |
24232 | + |
24233 | + /* | |
e4b2b4a8 JK |
24234 | + * If we're in an interrupt or softirq, we're done |
24235 | + * (this also catches softirq-disabled code). We will | |
24236 | + * actually run the softirq once we return from | |
24237 | + * the irq or softirq. | |
24238 | + * | |
24239 | + * Otherwise we wake up ksoftirqd to make sure we | |
24240 | + * schedule the softirq soon. | |
1a6e0f06 | 24241 | + */ |
e4b2b4a8 JK |
24242 | + if (!in_interrupt()) |
24243 | + wakeup_softirqd(); | |
24244 | +} | |
1a6e0f06 | 24245 | + |
e4b2b4a8 JK |
24246 | +void __raise_softirq_irqoff(unsigned int nr) |
24247 | +{ | |
24248 | + trace_softirq_raise(nr); | |
24249 | + or_softirq_pending(1UL << nr); | |
24250 | +} | |
1a6e0f06 | 24251 | + |
e4b2b4a8 JK |
24252 | +static inline void local_bh_disable_nort(void) { local_bh_disable(); } |
24253 | +static inline void _local_bh_enable_nort(void) { _local_bh_enable(); } | |
24254 | +static void ksoftirqd_set_sched_params(unsigned int cpu) { } | |
1a6e0f06 | 24255 | + |
e4b2b4a8 | 24256 | +#else /* !PREEMPT_RT_FULL */ |
1a6e0f06 | 24257 | + |
e4b2b4a8 JK |
24258 | +/* |
24259 | + * On RT we serialize softirq execution with a cpu local lock per softirq | |
24260 | + */ | |
24261 | +static DEFINE_PER_CPU(struct local_irq_lock [NR_SOFTIRQS], local_softirq_locks); | |
1a6e0f06 | 24262 | + |
e4b2b4a8 JK |
24263 | +void __init softirq_early_init(void) |
24264 | +{ | |
24265 | + int i; | |
1a6e0f06 | 24266 | + |
e4b2b4a8 JK |
24267 | + for (i = 0; i < NR_SOFTIRQS; i++) |
24268 | + local_irq_lock_init(local_softirq_locks[i]); | |
24269 | +} | |
1a6e0f06 | 24270 | + |
e4b2b4a8 JK |
24271 | +static void lock_softirq(int which) |
24272 | +{ | |
24273 | + local_lock(local_softirq_locks[which]); | |
24274 | +} | |
1a6e0f06 | 24275 | + |
e4b2b4a8 JK |
24276 | +static void unlock_softirq(int which) |
24277 | +{ | |
24278 | + local_unlock(local_softirq_locks[which]); | |
24279 | +} | |
1a6e0f06 | 24280 | + |
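Each softirq class gets its own per-CPU lock here, so two classes never serialize against each other. A rough userspace analogue, assuming one plain mutex per class (the kernel's local_lock is per-CPU and PI-aware, which a pthread mutex is not):

    #include <pthread.h>

    #define NR_SOFTIRQS 10

    static pthread_mutex_t softirq_locks[NR_SOFTIRQS];

    static void softirq_locks_init(void)
    {
            int i;

            for (i = 0; i < NR_SOFTIRQS; i++)
                    pthread_mutex_init(&softirq_locks[i], NULL);
    }

    static void lock_softirq(int which)
    {
            pthread_mutex_lock(&softirq_locks[which]);
    }

    static void unlock_softirq(int which)
    {
            pthread_mutex_unlock(&softirq_locks[which]);
    }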
e4b2b4a8 JK |
24281 | +static void do_single_softirq(int which) |
24282 | +{ | |
24283 | + unsigned long old_flags = current->flags; | |
1a6e0f06 | 24284 | + |
e4b2b4a8 JK |
24285 | + current->flags &= ~PF_MEMALLOC; |
24286 | + vtime_account_irq_enter(current); | |
24287 | + current->flags |= PF_IN_SOFTIRQ; | |
24288 | + lockdep_softirq_enter(); | |
24289 | + local_irq_enable(); | |
24290 | + handle_softirq(which); | |
24291 | + local_irq_disable(); | |
24292 | + lockdep_softirq_exit(); | |
24293 | + current->flags &= ~PF_IN_SOFTIRQ; | |
24294 | + vtime_account_irq_enter(current); | |
24295 | + current_restore_flags(old_flags, PF_MEMALLOC); | |
1a6e0f06 JK |
24296 | +} |
24297 | + | |
1a6e0f06 | 24298 | +/* |
e4b2b4a8 JK |
24299 | + * Called with interrupts disabled. Process softirqs which were raised |
24300 | + * in current context (or on behalf of ksoftirqd). | |
1a6e0f06 | 24301 | + */ |
e4b2b4a8 | 24302 | +static void do_current_softirqs(void) |
1a6e0f06 | 24303 | +{ |
e4b2b4a8 JK |
24304 | + while (current->softirqs_raised) { |
24305 | + int i = __ffs(current->softirqs_raised); | |
24306 | + unsigned int pending, mask = (1U << i); | |
1a6e0f06 | 24307 | + |
e4b2b4a8 JK |
24308 | + current->softirqs_raised &= ~mask; |
24309 | + local_irq_enable(); | |
1a6e0f06 | 24310 | + |
e4b2b4a8 JK |
24311 | + /* |
24312 | + * If the lock is contended, we boost the owner to | |
24313 | + * process the softirq or leave the critical section | |
24314 | + * now. | |
24315 | + */ | |
24316 | + lock_softirq(i); | |
24317 | + local_irq_disable(); | |
24318 | + softirq_set_runner(i); | |
24319 | + /* | |
24321 | + * Check the local_softirq_pending() bits to see | 
24322 | + * whether we still need to process this, or if someone | 
24323 | + * else took care of it. | 
24323 | + */ | |
24324 | + pending = local_softirq_pending(); | |
24325 | + if (pending & mask) { | |
24326 | + set_softirq_pending(pending & ~mask); | |
24327 | + do_single_softirq(i); | |
24328 | + } | |
24329 | + softirq_clr_runner(i); | |
24330 | + WARN_ON(current->softirq_nestcnt != 1); | |
24331 | + local_irq_enable(); | |
24332 | + unlock_softirq(i); | |
24333 | + local_irq_disable(); | |
1a6e0f06 | 24334 | + } |
1a6e0f06 JK |
24335 | +} |
24336 | + | |
e4b2b4a8 | 24337 | +void __local_bh_disable(void) |
1a6e0f06 | 24338 | +{ |
e4b2b4a8 JK |
24339 | + if (++current->softirq_nestcnt == 1) |
24340 | + migrate_disable(); | |
24341 | +} | |
24342 | +EXPORT_SYMBOL(__local_bh_disable); | |
1a6e0f06 | 24343 | + |
e4b2b4a8 JK |
24344 | +void __local_bh_enable(void) |
24345 | +{ | |
24346 | + if (WARN_ON(current->softirq_nestcnt == 0)) | |
24347 | + return; | |
1a6e0f06 | 24348 | + |
e4b2b4a8 JK |
24349 | + local_irq_disable(); |
24350 | + if (current->softirq_nestcnt == 1 && current->softirqs_raised) | |
24351 | + do_current_softirqs(); | |
24352 | + local_irq_enable(); | |
1a6e0f06 | 24353 | + |
e4b2b4a8 JK |
24354 | + if (--current->softirq_nestcnt == 0) |
24355 | + migrate_enable(); | |
1a6e0f06 | 24356 | +} |
e4b2b4a8 | 24357 | +EXPORT_SYMBOL(__local_bh_enable); |
1a6e0f06 | 24358 | + |
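The nesting counter means only the outermost enable actually runs pending softirqs and re-enables migration; inner pairs just adjust the count. A small sketch of that rule (names and the raised mask are illustrative, not the kernel API):

    #include <stdio.h>

    static int nestcnt;
    static unsigned int raised;

    static void bh_disable(void) { nestcnt++; }

    static void bh_enable(void)
    {
            /* only the transition through nest level 1 runs work */
            if (nestcnt == 1 && raised) {
                    printf("run pending softirqs (mask %#x)\n", raised);
                    raised = 0;
            }
            nestcnt--;
    }

    int main(void)
    {
            bh_disable();
            bh_disable();
            raised |= 1u << 3;      /* softirq raised inside the region */
            bh_enable();            /* inner enable: nothing runs */
            bh_enable();            /* outermost enable: work runs */
            return 0;
    }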
e4b2b4a8 | 24359 | +void _local_bh_enable(void) |
1a6e0f06 | 24360 | +{ |
e4b2b4a8 JK |
24361 | + if (WARN_ON(current->softirq_nestcnt == 0)) |
24362 | + return; | |
24363 | + if (--current->softirq_nestcnt == 0) | |
24364 | + migrate_enable(); | |
1a6e0f06 | 24365 | +} |
e4b2b4a8 | 24366 | +EXPORT_SYMBOL(_local_bh_enable); |
1a6e0f06 | 24367 | + |
e4b2b4a8 | 24368 | +int in_serving_softirq(void) |
1a6e0f06 | 24369 | +{ |
e4b2b4a8 | 24370 | + return current->flags & PF_IN_SOFTIRQ; |
1a6e0f06 | 24371 | +} |
e4b2b4a8 | 24372 | +EXPORT_SYMBOL(in_serving_softirq); |
1a6e0f06 | 24373 | + |
e4b2b4a8 JK |
24374 | +/* Called with preemption disabled */ |
24375 | +static void run_ksoftirqd(unsigned int cpu) | |
1a6e0f06 | 24376 | +{ |
e4b2b4a8 JK |
24377 | + local_irq_disable(); |
24378 | + current->softirq_nestcnt++; | |
24379 | + | |
24380 | + do_current_softirqs(); | |
24381 | + current->softirq_nestcnt--; | |
24382 | + local_irq_enable(); | |
24383 | + cond_resched_rcu_qs(); | |
1a6e0f06 | 24384 | +} |
1a6e0f06 | 24385 | + |
e4b2b4a8 JK |
24386 | +/* |
24387 | + * Called from netif_rx_ni(). Preemption enabled, but migration | |
24388 | + * disabled. So the cpu can't go away under us. | |
24389 | + */ | |
24390 | +void thread_do_softirq(void) | |
1a6e0f06 | 24391 | +{ |
e4b2b4a8 JK |
24392 | + if (!in_serving_softirq() && current->softirqs_raised) { |
24393 | + current->softirq_nestcnt++; | |
24394 | + do_current_softirqs(); | |
24395 | + current->softirq_nestcnt--; | |
24396 | + } | |
1a6e0f06 | 24397 | +} |
1a6e0f06 | 24398 | + |
e4b2b4a8 | 24399 | +static void do_raise_softirq_irqoff(unsigned int nr) |
1a6e0f06 | 24400 | +{ |
e4b2b4a8 JK |
24401 | + unsigned int mask; |
24402 | + | |
24403 | + mask = 1UL << nr; | |
24404 | + | |
24405 | + trace_softirq_raise(nr); | |
24406 | + or_softirq_pending(mask); | |
24407 | + | |
24408 | + /* | |
24409 | + * If we are not in a hard interrupt and inside a bh disabled | |
24410 | + * region, we simply raise the flag on current. local_bh_enable() | |
24411 | + * will make sure that the softirq is executed. Otherwise we | |
24412 | + * delegate it to ksoftirqd. | |
24413 | + */ | |
24414 | + if (!in_irq() && current->softirq_nestcnt) | |
24415 | + current->softirqs_raised |= mask; | |
24416 | + else if (!__this_cpu_read(ksoftirqd) || !__this_cpu_read(ktimer_softirqd)) | |
24417 | + return; | |
24418 | + | |
24419 | + if (mask & TIMER_SOFTIRQS) | |
24420 | + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask; | |
24421 | + else | |
24422 | + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask; | |
1a6e0f06 | 24423 | +} |
1a6e0f06 | 24424 | + |
e4b2b4a8 | 24425 | +static void wakeup_proper_softirq(unsigned int nr) |
1a6e0f06 | 24426 | +{ |
e4b2b4a8 JK |
24427 | + if ((1UL << nr) & TIMER_SOFTIRQS) |
24428 | + wakeup_timer_softirqd(); | |
24429 | + else | |
24430 | + wakeup_softirqd(); | |
1a6e0f06 | 24431 | +} |
1a6e0f06 | 24432 | + |
e4b2b4a8 | 24433 | +void __raise_softirq_irqoff(unsigned int nr) |
1a6e0f06 | 24434 | +{ |
e4b2b4a8 JK |
24435 | + do_raise_softirq_irqoff(nr); |
24436 | + if (!in_irq() && !current->softirq_nestcnt) | |
24437 | + wakeup_proper_softirq(nr); | |
1a6e0f06 | 24438 | +} |
1a6e0f06 | 24439 | + |
e4b2b4a8 JK |
24440 | +/* |
24441 | + * Same as __raise_softirq_irqoff() but will process them in ksoftirqd | |
24442 | + */ | |
24443 | +void __raise_softirq_irqoff_ksoft(unsigned int nr) | |
1a6e0f06 | 24444 | +{ |
e4b2b4a8 | 24445 | + unsigned int mask; |
1a6e0f06 | 24446 | + |
e4b2b4a8 JK |
24447 | + if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd) || |
24448 | + !__this_cpu_read(ktimer_softirqd))) | |
24449 | + return; | |
24450 | + mask = 1UL << nr; | |
1a6e0f06 | 24451 | + |
e4b2b4a8 JK |
24452 | + trace_softirq_raise(nr); |
24453 | + or_softirq_pending(mask); | |
24454 | + if (mask & TIMER_SOFTIRQS) | |
24455 | + __this_cpu_read(ktimer_softirqd)->softirqs_raised |= mask; | |
24456 | + else | |
24457 | + __this_cpu_read(ksoftirqd)->softirqs_raised |= mask; | |
24458 | + wakeup_proper_softirq(nr); | |
1a6e0f06 | 24459 | +} |
1a6e0f06 JK |
24460 | + |
24461 | +/* | |
e4b2b4a8 | 24462 | + * This function must run with irqs disabled! |
1a6e0f06 | 24463 | + */ |
e4b2b4a8 | 24464 | +void raise_softirq_irqoff(unsigned int nr) |
1a6e0f06 | 24465 | +{ |
e4b2b4a8 JK |
24466 | + do_raise_softirq_irqoff(nr); |
24467 | + | |
24468 | + /* | |
24469 | + * If we're in a hard interrupt we let the irq return code deal | 
24470 | + * with the wakeup of ksoftirqd. | |
24471 | + */ | |
24472 | + if (in_irq()) | |
24473 | + return; | |
24474 | + /* | |
24475 | + * If we are in thread context but outside of a bh disabled | |
24476 | + * region, we need to wake ksoftirqd as well. | |
24477 | + * | |
24478 | + * CHECKME: Some of the places which do that could be wrapped | |
24479 | + * into local_bh_disable/enable pairs. Though it's unclear | |
24480 | + * whether this is worth the effort. To find those places just | |
24481 | + * raise a WARN() if the condition is met. | |
24482 | + */ | |
24483 | + if (!current->softirq_nestcnt) | |
24484 | + wakeup_proper_softirq(nr); | |
1a6e0f06 | 24485 | +} |
1a6e0f06 | 24486 | + |
e4b2b4a8 | 24487 | +static inline int ksoftirqd_softirq_pending(void) |
1a6e0f06 | 24488 | +{ |
e4b2b4a8 JK |
24489 | + return current->softirqs_raised; |
24490 | +} | |
1a6e0f06 | 24491 | + |
e4b2b4a8 JK |
24492 | +static inline void local_bh_disable_nort(void) { } |
24493 | +static inline void _local_bh_enable_nort(void) { } | |
24494 | + | |
24495 | +static inline void ksoftirqd_set_sched_params(unsigned int cpu) | |
24496 | +{ | |
24497 | + /* Take over all but timer pending softirqs when starting */ | |
24498 | + local_irq_disable(); | |
24499 | + current->softirqs_raised = local_softirq_pending() & ~TIMER_SOFTIRQS; | |
24500 | + local_irq_enable(); | |
1a6e0f06 | 24501 | +} |
1a6e0f06 | 24502 | + |
e4b2b4a8 | 24503 | +static inline void ktimer_softirqd_set_sched_params(unsigned int cpu) |
1a6e0f06 | 24504 | +{ |
e4b2b4a8 | 24505 | + struct sched_param param = { .sched_priority = 1 }; |
1a6e0f06 | 24506 | + |
e4b2b4a8 JK |
24507 | + sched_setscheduler(current, SCHED_FIFO, ¶m); |
24508 | + | |
24509 | + /* Take over timer pending softirqs when starting */ | |
24510 | + local_irq_disable(); | |
24511 | + current->softirqs_raised = local_softirq_pending() & TIMER_SOFTIRQS; | |
24512 | + local_irq_enable(); | |
1a6e0f06 | 24513 | +} |
1a6e0f06 | 24514 | + |
e4b2b4a8 JK |
24515 | +static inline void ktimer_softirqd_clr_sched_params(unsigned int cpu, |
24516 | + bool online) | |
1a6e0f06 | 24517 | +{ |
e4b2b4a8 | 24518 | + struct sched_param param = { .sched_priority = 0 }; |
1a6e0f06 | 24519 | + |
e4b2b4a8 | 24520 | + sched_setscheduler(current, SCHED_NORMAL, ¶m); |
1a6e0f06 | 24521 | +} |
1a6e0f06 | 24522 | + |
e4b2b4a8 | 24523 | +static int ktimer_softirqd_should_run(unsigned int cpu) |
1a6e0f06 | 24524 | +{ |
e4b2b4a8 | 24525 | + return current->softirqs_raised; |
1a6e0f06 | 24526 | +} |
1a6e0f06 | 24527 | + |
e4b2b4a8 | 24528 | +#endif /* PREEMPT_RT_FULL */ |
b3bbd485 | 24529 | /* |
e4b2b4a8 JK |
24530 | * Enter an interrupt context. |
24531 | */ | |
b3bbd485 | 24532 | @@ -345,9 +789,9 @@ void irq_enter(void) |
e4b2b4a8 JK |
24533 | * Prevent raise_softirq from needlessly waking up ksoftirqd |
24534 | * here, as softirq will be serviced on return from interrupt. | |
24535 | */ | |
24536 | - local_bh_disable(); | |
24537 | + local_bh_disable_nort(); | |
24538 | tick_irq_enter(); | |
24539 | - _local_bh_enable(); | |
24540 | + _local_bh_enable_nort(); | |
24541 | } | |
24542 | ||
24543 | __irq_enter(); | |
b3bbd485 | 24544 | @@ -355,6 +799,7 @@ void irq_enter(void) |
e4b2b4a8 JK |
24545 | |
24546 | static inline void invoke_softirq(void) | |
24547 | { | |
24548 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
24549 | if (ksoftirqd_running(local_softirq_pending())) | |
24550 | return; | |
24551 | ||
b3bbd485 | 24552 | @@ -377,6 +822,18 @@ static inline void invoke_softirq(void) |
e4b2b4a8 JK |
24553 | } else { |
24554 | wakeup_softirqd(); | |
24555 | } | |
24556 | +#else /* PREEMPT_RT_FULL */ | |
24557 | + unsigned long flags; | |
24558 | + | |
24559 | + local_irq_save(flags); | |
24560 | + if (__this_cpu_read(ksoftirqd) && | |
24561 | + __this_cpu_read(ksoftirqd)->softirqs_raised) | |
24562 | + wakeup_softirqd(); | |
24563 | + if (__this_cpu_read(ktimer_softirqd) && | |
24564 | + __this_cpu_read(ktimer_softirqd)->softirqs_raised) | |
24565 | + wakeup_timer_softirqd(); | |
24566 | + local_irq_restore(flags); | |
24567 | +#endif | |
24568 | } | |
24569 | ||
24570 | static inline void tick_irq_exit(void) | |
b3bbd485 | 24571 | @@ -385,7 +842,8 @@ static inline void tick_irq_exit(void) |
e4b2b4a8 JK |
24572 | int cpu = smp_processor_id(); |
24573 | ||
24574 | /* Make sure that timer wheel updates are propagated */ | |
24575 | - if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { | |
e4b2b4a8 | 24576 | + if ((idle_cpu(cpu) || tick_nohz_full_cpu(cpu)) && |
b3bbd485 | 24577 | + !need_resched() && !local_softirq_pending()) { |
e4b2b4a8 JK |
24578 | if (!in_irq()) |
24579 | tick_nohz_irq_exit(); | |
24580 | } | |
b3bbd485 | 24581 | @@ -413,26 +871,6 @@ void irq_exit(void) |
e4b2b4a8 JK |
24582 | trace_hardirq_exit(); /* must be last! */ |
24583 | } | |
24584 | ||
24585 | -/* | |
24586 | - * This function must run with irqs disabled! | |
24587 | - */ | |
24588 | -inline void raise_softirq_irqoff(unsigned int nr) | |
24589 | -{ | |
24590 | - __raise_softirq_irqoff(nr); | |
24591 | - | |
24592 | - /* | |
24593 | - * If we're in an interrupt or softirq, we're done | |
24594 | - * (this also catches softirq-disabled code). We will | |
24595 | - * actually run the softirq once we return from | |
24596 | - * the irq or softirq. | |
24597 | - * | |
24598 | - * Otherwise we wake up ksoftirqd to make sure we | |
24599 | - * schedule the softirq soon. | |
24600 | - */ | |
24601 | - if (!in_interrupt()) | |
24602 | - wakeup_softirqd(); | |
24603 | -} | |
24604 | - | |
24605 | void raise_softirq(unsigned int nr) | |
24606 | { | |
24607 | unsigned long flags; | |
b3bbd485 | 24608 | @@ -442,12 +880,6 @@ void raise_softirq(unsigned int nr) |
e4b2b4a8 JK |
24609 | local_irq_restore(flags); |
24610 | } | |
24611 | ||
24612 | -void __raise_softirq_irqoff(unsigned int nr) | |
24613 | -{ | |
24614 | - trace_softirq_raise(nr); | |
24615 | - or_softirq_pending(1UL << nr); | |
24616 | -} | |
24617 | - | |
24618 | void open_softirq(int nr, void (*action)(struct softirq_action *)) | |
24619 | { | |
24620 | softirq_vec[nr].action = action; | |
b3bbd485 | 24621 | @@ -464,15 +896,45 @@ struct tasklet_head { |
e4b2b4a8 JK |
24622 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); |
24623 | static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); | |
24624 | ||
24625 | +static inline void | 
24626 | +__tasklet_common_schedule(struct tasklet_struct *t, struct tasklet_head *head, unsigned int nr) | |
1a6e0f06 | 24627 | +{ |
e4b2b4a8 JK |
24628 | + if (tasklet_trylock(t)) { |
24629 | +again: | |
24630 | + /* We may have been preempted before tasklet_trylock | |
24631 | + * and __tasklet_action may have already run. | |
24632 | + * So double check the sched bit while the tasklet | 
24633 | + * is locked before adding it to the list. | |
24634 | + */ | |
24635 | + if (test_bit(TASKLET_STATE_SCHED, &t->state)) { | |
24636 | + t->next = NULL; | |
24637 | + *head->tail = t; | |
24638 | + head->tail = &(t->next); | |
24639 | + raise_softirq_irqoff(nr); | |
24640 | + tasklet_unlock(t); | |
24641 | + } else { | |
24642 | + /* This is subtle. If we hit the corner case above, | 
24643 | + * it is possible that we get preempted right here, | 
24644 | + * and another task has successfully called | |
24645 | + * tasklet_schedule(), then this function, and | |
24646 | + * failed on the trylock. Thus we must be sure, | 
24647 | + * before releasing the tasklet lock, that the | 
24648 | + * SCHED_BIT is clear. Otherwise the tasklet | 
24649 | + * may get its SCHED_BIT set, but not added to the | 
24650 | + * list. | 
24651 | + */ | |
24652 | + if (!tasklet_tryunlock(t)) | |
24653 | + goto again; | |
24654 | + } | |
24655 | + } | |
1a6e0f06 | 24656 | +} |
1a6e0f06 | 24657 | + |
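The scheme above hinges on tasklet_trylock()/tasklet_tryunlock(), which the hunk does not show. A userspace sketch of the presumed semantics using C11 atomics - the bit layout is illustrative, and in the RT tree tryunlock is a cmpxchg that only permits the RUN -> 0 transition, which is exactly why a concurrent SCHED-bit setter makes it fail:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define STATE_SCHED     (1u << 0)
    #define STATE_RUN       (1u << 1)

    struct tasklet { _Atomic unsigned int state; };

    static bool tasklet_trylock(struct tasklet *t)
    {
            /* set RUN; fail if somebody else is already running it */
            return !(atomic_fetch_or(&t->state, STATE_RUN) & STATE_RUN);
    }

    static bool tasklet_tryunlock(struct tasklet *t)
    {
            /* allow only the RUN -> 0 transition; fails if SCHED
             * (or anything else) got set meanwhile */
            unsigned int expected = STATE_RUN;

            return atomic_compare_exchange_strong(&t->state, &expected, 0);
    }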
e4b2b4a8 JK |
24658 | void __tasklet_schedule(struct tasklet_struct *t) |
24659 | { | |
24660 | unsigned long flags; | |
24661 | ||
24662 | local_irq_save(flags); | |
24663 | - t->next = NULL; | |
24664 | - *__this_cpu_read(tasklet_vec.tail) = t; | |
24665 | - __this_cpu_write(tasklet_vec.tail, &(t->next)); | |
24666 | - raise_softirq_irqoff(TASKLET_SOFTIRQ); | |
24667 | + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); | |
24668 | local_irq_restore(flags); | |
24669 | } | |
24670 | EXPORT_SYMBOL(__tasklet_schedule); | |
b3bbd485 | 24671 | @@ -482,50 +944,108 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) |
e4b2b4a8 JK |
24672 | unsigned long flags; |
24673 | ||
24674 | local_irq_save(flags); | |
24675 | - t->next = NULL; | |
24676 | - *__this_cpu_read(tasklet_hi_vec.tail) = t; | |
24677 | - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); | |
24678 | - raise_softirq_irqoff(HI_SOFTIRQ); | |
24679 | + __tasklet_common_schedule(t, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); | |
24680 | local_irq_restore(flags); | |
24681 | } | |
24682 | EXPORT_SYMBOL(__tasklet_hi_schedule); | |
24683 | ||
24684 | -static __latent_entropy void tasklet_action(struct softirq_action *a) | |
24685 | +void tasklet_enable(struct tasklet_struct *t) | |
24686 | { | |
24687 | - struct tasklet_struct *list; | |
24688 | + if (!atomic_dec_and_test(&t->count)) | |
24689 | + return; | |
24690 | + if (test_and_clear_bit(TASKLET_STATE_PENDING, &t->state)) | |
24691 | + tasklet_schedule(t); | |
1a6e0f06 | 24692 | +} |
e4b2b4a8 JK |
24693 | +EXPORT_SYMBOL(tasklet_enable); |
24694 | ||
24695 | - local_irq_disable(); | |
24696 | - list = __this_cpu_read(tasklet_vec.head); | |
24697 | - __this_cpu_write(tasklet_vec.head, NULL); | |
24698 | - __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); | |
24699 | - local_irq_enable(); | |
24700 | +static void __tasklet_action(struct softirq_action *a, | |
24701 | + struct tasklet_struct *list) | |
24702 | +{ | |
24703 | + int loops = 1000000; | |
24704 | ||
24705 | while (list) { | |
24706 | struct tasklet_struct *t = list; | |
24707 | ||
24708 | list = list->next; | |
24709 | ||
24710 | - if (tasklet_trylock(t)) { | |
24711 | - if (!atomic_read(&t->count)) { | |
24712 | - if (!test_and_clear_bit(TASKLET_STATE_SCHED, | |
24713 | - &t->state)) | |
24714 | - BUG(); | |
24715 | - t->func(t->data); | |
24716 | - tasklet_unlock(t); | |
24717 | - continue; | |
24718 | - } | |
24719 | - tasklet_unlock(t); | |
24720 | + /* | |
24721 | + * Should always succeed - after a tasklet got on the | 
24722 | + * list (after getting the SCHED bit set from 0 to 1), | |
24723 | + * nothing but the tasklet softirq it got queued to can | |
24724 | + * lock it: | |
24725 | + */ | |
24726 | + if (!tasklet_trylock(t)) { | |
24727 | + WARN_ON(1); | |
24728 | + continue; | |
24729 | } | |
24730 | ||
24731 | - local_irq_disable(); | |
24732 | t->next = NULL; | |
24733 | - *__this_cpu_read(tasklet_vec.tail) = t; | |
24734 | - __this_cpu_write(tasklet_vec.tail, &(t->next)); | |
24735 | - __raise_softirq_irqoff(TASKLET_SOFTIRQ); | |
24736 | - local_irq_enable(); | |
1a6e0f06 | 24737 | + |
e4b2b4a8 JK |
24738 | + /* |
24739 | + * If we cannot handle the tasklet because it's disabled, | |
24740 | + * mark it as pending. tasklet_enable() will later | |
24741 | + * re-schedule the tasklet. | |
24742 | + */ | |
24743 | + if (unlikely(atomic_read(&t->count))) { | |
24744 | +out_disabled: | |
24745 | + /* implicit unlock: */ | |
24746 | + wmb(); | |
24747 | + t->state = TASKLET_STATEF_PENDING; | |
24748 | + continue; | |
24749 | + } | |
1a6e0f06 | 24750 | + |
e4b2b4a8 JK |
24751 | + /* |
24752 | + * After this point on the tasklet might be rescheduled | |
24753 | + * on another CPU, but it can only be added to another | |
24754 | + * CPU's tasklet list if we unlock the tasklet (which we | |
24755 | + * don't do yet). | 
24756 | + */ | |
24757 | + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | |
24758 | + WARN_ON(1); | |
1a6e0f06 | 24759 | + |
e4b2b4a8 JK |
24760 | +again: |
24761 | + t->func(t->data); | |
1a6e0f06 | 24762 | + |
e4b2b4a8 JK |
24763 | + /* |
24764 | + * Try to unlock the tasklet. We must use cmpxchg, because | |
24765 | + * another CPU might have scheduled or disabled the tasklet. | |
24766 | + * We only allow the STATE_RUN -> 0 transition here. | |
24767 | + */ | |
24768 | + while (!tasklet_tryunlock(t)) { | |
24769 | + /* | |
24770 | + * If it got disabled meanwhile, bail out: | |
24771 | + */ | |
24772 | + if (atomic_read(&t->count)) | |
24773 | + goto out_disabled; | |
24774 | + /* | |
24775 | + * If it got scheduled meanwhile, re-execute | |
24776 | + * the tasklet function: | |
24777 | + */ | |
24778 | + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) | |
24779 | + goto again; | |
24780 | + if (!--loops) { | |
24781 | + printk("hm, tasklet state: %08lx\n", t->state); | |
24782 | + WARN_ON(1); | |
24783 | + tasklet_unlock(t); | |
24784 | + break; | |
24785 | + } | |
24786 | + } | |
24787 | } | |
24788 | } | |
24789 | ||
24790 | +static __latent_entropy void tasklet_action(struct softirq_action *a) | |
24791 | +{ | |
24792 | + struct tasklet_struct *list; | |
1a6e0f06 | 24793 | + |
e4b2b4a8 JK |
24794 | + local_irq_disable(); |
24795 | + list = __this_cpu_read(tasklet_vec.head); | |
24796 | + __this_cpu_write(tasklet_vec.head, NULL); | |
24797 | + __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head)); | |
24798 | + local_irq_enable(); | |
1a6e0f06 | 24799 | + |
e4b2b4a8 | 24800 | + __tasklet_action(a, list); |
1a6e0f06 | 24801 | +} |
e4b2b4a8 JK |
24802 | + |
24803 | static __latent_entropy void tasklet_hi_action(struct softirq_action *a) | |
24804 | { | |
24805 | struct tasklet_struct *list; | |
b3bbd485 | 24806 | @@ -536,30 +1056,7 @@ static __latent_entropy void tasklet_hi_action(struct softirq_action *a) |
e4b2b4a8 JK |
24807 | __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head)); |
24808 | local_irq_enable(); | |
24809 | ||
24810 | - while (list) { | |
24811 | - struct tasklet_struct *t = list; | |
24812 | - | |
24813 | - list = list->next; | |
24814 | - | |
24815 | - if (tasklet_trylock(t)) { | |
24816 | - if (!atomic_read(&t->count)) { | |
24817 | - if (!test_and_clear_bit(TASKLET_STATE_SCHED, | |
24818 | - &t->state)) | |
24819 | - BUG(); | |
24820 | - t->func(t->data); | |
24821 | - tasklet_unlock(t); | |
24822 | - continue; | |
24823 | - } | |
24824 | - tasklet_unlock(t); | |
24825 | - } | |
24826 | - | |
24827 | - local_irq_disable(); | |
24828 | - t->next = NULL; | |
24829 | - *__this_cpu_read(tasklet_hi_vec.tail) = t; | |
24830 | - __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); | |
24831 | - __raise_softirq_irqoff(HI_SOFTIRQ); | |
24832 | - local_irq_enable(); | |
24833 | - } | |
24834 | + __tasklet_action(a, list); | |
24835 | } | |
24836 | ||
24837 | void tasklet_init(struct tasklet_struct *t, | |
b3bbd485 | 24838 | @@ -580,7 +1077,7 @@ void tasklet_kill(struct tasklet_struct *t) |
e4b2b4a8 JK |
24839 | |
24840 | while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { | |
24841 | do { | |
24842 | - yield(); | |
24843 | + msleep(1); | |
24844 | } while (test_bit(TASKLET_STATE_SCHED, &t->state)); | |
24845 | } | |
24846 | tasklet_unlock_wait(t); | |
b3bbd485 | 24847 | @@ -588,57 +1085,6 @@ void tasklet_kill(struct tasklet_struct *t) |
e4b2b4a8 JK |
24848 | } |
24849 | EXPORT_SYMBOL(tasklet_kill); | |
24850 | ||
24851 | -/* | |
24852 | - * tasklet_hrtimer | |
24853 | - */ | |
24854 | - | |
24855 | -/* | |
24856 | - * The trampoline is called when the hrtimer expires. It schedules a tasklet | |
24857 | - * to run __tasklet_hrtimer_trampoline() which in turn will call the intended | |
24858 | - * hrtimer callback, but from softirq context. | |
24859 | - */ | |
24860 | -static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) | |
24861 | -{ | |
24862 | - struct tasklet_hrtimer *ttimer = | |
24863 | - container_of(timer, struct tasklet_hrtimer, timer); | |
24864 | - | |
24865 | - tasklet_hi_schedule(&ttimer->tasklet); | |
24866 | - return HRTIMER_NORESTART; | |
24867 | -} | |
24868 | - | |
24869 | -/* | |
24870 | - * Helper function which calls the hrtimer callback from | |
24871 | - * tasklet/softirq context | |
24872 | - */ | |
24873 | -static void __tasklet_hrtimer_trampoline(unsigned long data) | |
24874 | -{ | |
24875 | - struct tasklet_hrtimer *ttimer = (void *)data; | |
24876 | - enum hrtimer_restart restart; | |
24877 | - | |
24878 | - restart = ttimer->function(&ttimer->timer); | |
24879 | - if (restart != HRTIMER_NORESTART) | |
24880 | - hrtimer_restart(&ttimer->timer); | |
24881 | -} | |
24882 | - | |
24883 | -/** | |
24884 | - * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks | |
24885 | - * @ttimer: tasklet_hrtimer which is initialized | |
24886 | - * @function: hrtimer callback function which gets called from softirq context | |
24887 | - * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) | |
24888 | - * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) | |
24889 | - */ | |
24890 | -void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, | |
24891 | - enum hrtimer_restart (*function)(struct hrtimer *), | |
24892 | - clockid_t which_clock, enum hrtimer_mode mode) | |
24893 | -{ | |
24894 | - hrtimer_init(&ttimer->timer, which_clock, mode); | |
24895 | - ttimer->timer.function = __hrtimer_tasklet_trampoline; | |
24896 | - tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, | |
24897 | - (unsigned long)ttimer); | |
24898 | - ttimer->function = function; | |
24899 | -} | |
24900 | -EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); | |
24901 | - | |
24902 | void __init softirq_init(void) | |
24903 | { | |
24904 | int cpu; | |
b3bbd485 | 24905 | @@ -654,25 +1100,26 @@ void __init softirq_init(void) |
e4b2b4a8 JK |
24906 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); |
24907 | } | |
24908 | ||
24909 | -static int ksoftirqd_should_run(unsigned int cpu) | |
24910 | -{ | |
24911 | - return local_softirq_pending(); | |
24912 | -} | |
24913 | - | |
24914 | -static void run_ksoftirqd(unsigned int cpu) | |
24915 | +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL) | |
24916 | +void tasklet_unlock_wait(struct tasklet_struct *t) | |
24917 | { | |
24918 | - local_irq_disable(); | |
24919 | - if (local_softirq_pending()) { | |
24920 | + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { | |
24921 | /* | |
24922 | - * We can safely run softirq on inline stack, as we are not deep | |
24923 | - * in the task stack here. | |
24924 | + * Hack for now to avoid this busy-loop: | |
24925 | */ | |
24926 | - __do_softirq(); | |
24927 | - local_irq_enable(); | |
24928 | - cond_resched_rcu_qs(); | |
24929 | - return; | |
24930 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
24931 | + msleep(1); | |
1a6e0f06 | 24932 | +#else |
e4b2b4a8 JK |
24933 | + barrier(); |
24934 | +#endif | |
24935 | } | |
24936 | - local_irq_enable(); | |
1a6e0f06 | 24937 | +} |
e4b2b4a8 | 24938 | +EXPORT_SYMBOL(tasklet_unlock_wait); |
1a6e0f06 JK |
24939 | +#endif |
24940 | + | |
e4b2b4a8 | 24941 | +static int ksoftirqd_should_run(unsigned int cpu) |
1a6e0f06 | 24942 | +{ |
e4b2b4a8 JK |
24943 | + return ksoftirqd_softirq_pending(); |
24944 | } | |
1a6e0f06 | 24945 | |
e4b2b4a8 | 24946 | #ifdef CONFIG_HOTPLUG_CPU |
b3bbd485 | 24947 | @@ -739,17 +1186,31 @@ static int takeover_tasklets(unsigned int cpu) |
e4b2b4a8 JK |
24948 | |
24949 | static struct smp_hotplug_thread softirq_threads = { | |
24950 | .store = &ksoftirqd, | |
24951 | + .setup = ksoftirqd_set_sched_params, | |
24952 | .thread_should_run = ksoftirqd_should_run, | |
24953 | .thread_fn = run_ksoftirqd, | |
24954 | .thread_comm = "ksoftirqd/%u", | |
24955 | }; | |
24956 | ||
24957 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
24958 | +static struct smp_hotplug_thread softirq_timer_threads = { | |
24959 | + .store = &ktimer_softirqd, | |
24960 | + .setup = ktimer_softirqd_set_sched_params, | |
24961 | + .cleanup = ktimer_softirqd_clr_sched_params, | |
24962 | + .thread_should_run = ktimer_softirqd_should_run, | |
24963 | + .thread_fn = run_ksoftirqd, | |
24964 | + .thread_comm = "ktimersoftd/%u", | |
24965 | +}; | |
24966 | +#endif | |
1a6e0f06 | 24967 | + |
e4b2b4a8 JK |
24968 | static __init int spawn_ksoftirqd(void) |
24969 | { | |
24970 | cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL, | |
24971 | takeover_tasklets); | |
24972 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | |
24973 | - | |
24974 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
24975 | + BUG_ON(smpboot_register_percpu_thread(&softirq_timer_threads)); | |
24976 | +#endif | |
24977 | return 0; | |
24978 | } | |
24979 | early_initcall(spawn_ksoftirqd); | |
b3bbd485 JK |
24980 | diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c |
24981 | index 067cb83f37ea..56f2f2e01229 100644 | |
24982 | --- a/kernel/stop_machine.c | |
24983 | +++ b/kernel/stop_machine.c | |
24984 | @@ -503,6 +503,8 @@ static void cpu_stopper_thread(unsigned int cpu) | |
e4b2b4a8 JK |
24985 | struct cpu_stop_done *done = work->done; |
24986 | int ret; | |
24987 | ||
24988 | + /* XXX */ | |
1a6e0f06 | 24989 | + |
e4b2b4a8 JK |
24990 | /* cpu stop callbacks must not sleep, make in_atomic() == T */ |
24991 | preempt_count_inc(); | |
24992 | ret = fn(arg); | |
b3bbd485 | 24993 | diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c |
5dd41b01 | 24994 | index fa5de5e8de61..6020ee66e517 100644 |
b3bbd485 JK |
24995 | --- a/kernel/time/alarmtimer.c |
24996 | +++ b/kernel/time/alarmtimer.c | |
24997 | @@ -436,7 +436,7 @@ int alarm_cancel(struct alarm *alarm) | |
e4b2b4a8 JK |
24998 | int ret = alarm_try_to_cancel(alarm); |
24999 | if (ret >= 0) | |
25000 | return ret; | |
25001 | - cpu_relax(); | |
25002 | + hrtimer_wait_for_timer(&alarm->timer); | |
25003 | } | |
25004 | } | |
25005 | EXPORT_SYMBOL_GPL(alarm_cancel); | |
b3bbd485 JK |
25006 | diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c |
25007 | index d00e85ac10d6..b59e009087a9 100644 | |
25008 | --- a/kernel/time/hrtimer.c | |
25009 | +++ b/kernel/time/hrtimer.c | |
25010 | @@ -59,6 +59,15 @@ | |
25011 | ||
e4b2b4a8 | 25012 | #include "tick-internal.h" |
1a6e0f06 | 25013 | |
b3bbd485 | 25014 | +/* |
e4b2b4a8 JK |
25015 | + * Masks for selecting the soft and hard context timers from |
25016 | + * cpu_base->active | |
25017 | + */ | |
25018 | +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) | |
25019 | +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) | |
25020 | +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) | |
25021 | +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) | |
25022 | + | |
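For concreteness: assuming the clock base enum places the four hard bases (MONOTONIC, REALTIME, BOOTTIME, TAI) before their soft twins, HRTIMER_BASE_MONOTONIC_SOFT evaluates to 4 and the masks work out to:

    MASK_SHIFT          = 4
    HRTIMER_ACTIVE_HARD = (1 << 4) - 1  = 0x0f   /* bits 0-3, hard bases */
    HRTIMER_ACTIVE_SOFT = 0x0f << 4     = 0xf0   /* bits 4-7, soft bases */
    HRTIMER_ACTIVE_ALL  = 0xf0 | 0x0f   = 0xff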
b3bbd485 | 25023 | /* |
e4b2b4a8 JK |
25024 | * The timer bases: |
25025 | * | |
e4b2b4a8 JK |
25026 | @@ -70,7 +79,6 @@ |
25027 | DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = | |
1a6e0f06 | 25028 | { |
e4b2b4a8 JK |
25029 | .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), |
25030 | - .seq = SEQCNT_ZERO(hrtimer_bases.seq), | |
25031 | .clock_base = | |
25032 | { | |
25033 | { | |
b3bbd485 | 25034 | @@ -93,6 +101,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = |
e4b2b4a8 JK |
25035 | .clockid = CLOCK_TAI, |
25036 | .get_time = &ktime_get_clocktai, | |
25037 | }, | |
25038 | + { | |
25039 | + .index = HRTIMER_BASE_MONOTONIC_SOFT, | |
25040 | + .clockid = CLOCK_MONOTONIC, | |
25041 | + .get_time = &ktime_get, | |
25042 | + }, | |
25043 | + { | |
25044 | + .index = HRTIMER_BASE_REALTIME_SOFT, | |
25045 | + .clockid = CLOCK_REALTIME, | |
25046 | + .get_time = &ktime_get_real, | |
25047 | + }, | |
25048 | + { | |
25049 | + .index = HRTIMER_BASE_BOOTTIME_SOFT, | |
25050 | + .clockid = CLOCK_BOOTTIME, | |
25051 | + .get_time = &ktime_get_boottime, | |
25052 | + }, | |
25053 | + { | |
25054 | + .index = HRTIMER_BASE_TAI_SOFT, | |
25055 | + .clockid = CLOCK_TAI, | |
25056 | + .get_time = &ktime_get_clocktai, | |
25057 | + }, | |
25058 | } | |
25059 | }; | |
1a6e0f06 | 25060 | |
b3bbd485 | 25061 | @@ -118,7 +146,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { |
e4b2b4a8 JK |
25062 | * timer->base->cpu_base |
25063 | */ | |
25064 | static struct hrtimer_cpu_base migration_cpu_base = { | |
25065 | - .seq = SEQCNT_ZERO(migration_cpu_base), | |
25066 | .clock_base = { { .cpu_base = &migration_cpu_base, }, }, | |
25067 | }; | |
1a6e0f06 | 25068 | |
b3bbd485 | 25069 | @@ -156,45 +183,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, |
1a6e0f06 JK |
25070 | } |
25071 | ||
25072 | /* | |
e4b2b4a8 JK |
25073 | - * With HIGHRES=y we do not migrate the timer when it is expiring |
25074 | - * before the next event on the target cpu because we cannot reprogram | |
25075 | - * the target cpu hardware and we would cause it to fire late. | |
25076 | + * We do not migrate the timer when it is expiring before the next | |
25077 | + * event on the target cpu. When high resolution is enabled, we cannot | |
25078 | + * reprogram the target cpu hardware and we would cause it to fire | |
25079 | + * late. To keep it simple, we handle the high resolution enabled and | 
25080 | + * disabled cases the same way. | 
25081 | * | |
25082 | * Called with cpu_base->lock of target cpu held. | |
25083 | */ | |
25084 | static int | |
25085 | hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) | |
1a6e0f06 | 25086 | { |
e4b2b4a8 JK |
25087 | -#ifdef CONFIG_HIGH_RES_TIMERS |
25088 | ktime_t expires; | |
1a6e0f06 | 25089 | |
e4b2b4a8 JK |
25090 | - if (!new_base->cpu_base->hres_active) |
25091 | - return 0; | |
25092 | - | |
25093 | expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); | |
25094 | - return expires <= new_base->cpu_base->expires_next; | |
25095 | -#else | |
25096 | - return 0; | |
25097 | -#endif | |
25098 | + return expires < new_base->cpu_base->expires_next; | |
25099 | } | |
1a6e0f06 | 25100 | |
e4b2b4a8 JK |
25101 | -#ifdef CONFIG_NO_HZ_COMMON |
25102 | -static inline | |
25103 | -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | |
25104 | - int pinned) | |
25105 | -{ | |
25106 | - if (pinned || !base->migration_enabled) | |
25107 | - return base; | |
25108 | - return &per_cpu(hrtimer_bases, get_nohz_timer_target()); | |
25109 | -} | |
25110 | -#else | |
25111 | static inline | |
25112 | struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, | |
25113 | int pinned) | |
25114 | { | |
25115 | +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | |
25116 | + if (static_branch_unlikely(&timers_migration_enabled) && !pinned) | |
25117 | + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); | |
25118 | +#endif | |
25119 | return base; | |
25120 | } | |
25121 | -#endif | |
1a6e0f06 | 25122 | |
e4b2b4a8 JK |
25123 | /* |
25124 | * We switch the timer base to a power-optimized selected CPU target, | |
b3bbd485 | 25125 | @@ -396,7 +411,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) |
e4b2b4a8 JK |
25126 | debug_object_init(timer, &hrtimer_debug_descr); |
25127 | } | |
1a6e0f06 | 25128 | |
e4b2b4a8 JK |
25129 | -static inline void debug_hrtimer_activate(struct hrtimer *timer) |
25130 | +static inline void debug_hrtimer_activate(struct hrtimer *timer, | |
25131 | + enum hrtimer_mode mode) | |
25132 | { | |
25133 | debug_object_activate(timer, &hrtimer_debug_descr); | |
25134 | } | |
b3bbd485 | 25135 | @@ -429,8 +445,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) |
e4b2b4a8 | 25136 | EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); |
1a6e0f06 | 25137 | |
e4b2b4a8 JK |
25138 | #else |
25139 | + | |
25140 | static inline void debug_hrtimer_init(struct hrtimer *timer) { } | |
25141 | -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } | |
25142 | +static inline void debug_hrtimer_activate(struct hrtimer *timer, | |
25143 | + enum hrtimer_mode mode) { } | |
25144 | static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } | |
25145 | #endif | |
1a6e0f06 | 25146 | |
b3bbd485 | 25147 | @@ -442,10 +460,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid, |
e4b2b4a8 | 25148 | trace_hrtimer_init(timer, clockid, mode); |
1a6e0f06 | 25149 | } |
1a6e0f06 | 25150 | |
e4b2b4a8 JK |
25151 | -static inline void debug_activate(struct hrtimer *timer) |
25152 | +static inline void debug_activate(struct hrtimer *timer, | |
25153 | + enum hrtimer_mode mode) | |
25154 | { | |
25155 | - debug_hrtimer_activate(timer); | |
25156 | - trace_hrtimer_start(timer); | |
25157 | + debug_hrtimer_activate(timer, mode); | |
25158 | + trace_hrtimer_start(timer, mode); | |
25159 | } | |
1a6e0f06 | 25160 | |
e4b2b4a8 | 25161 | static inline void debug_deactivate(struct hrtimer *timer) |
b3bbd485 | 25162 | @@ -454,35 +473,43 @@ static inline void debug_deactivate(struct hrtimer *timer) |
e4b2b4a8 | 25163 | trace_hrtimer_cancel(timer); |
1a6e0f06 JK |
25164 | } |
25165 | ||
e4b2b4a8 JK |
25166 | -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) |
25167 | -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, | |
25168 | - struct hrtimer *timer) | |
25169 | +static struct hrtimer_clock_base * | |
25170 | +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) | |
25171 | { | |
25172 | -#ifdef CONFIG_HIGH_RES_TIMERS | |
25173 | - cpu_base->next_timer = timer; | |
25174 | -#endif | |
25175 | + unsigned int idx; | |
1a6e0f06 | 25176 | + |
e4b2b4a8 JK |
25177 | + if (!*active) |
25178 | + return NULL; | |
1a6e0f06 | 25179 | + |
e4b2b4a8 JK |
25180 | + idx = __ffs(*active); |
25181 | + *active &= ~(1U << idx); | |
1a6e0f06 | 25182 | + |
e4b2b4a8 JK |
25183 | + return &cpu_base->clock_base[idx]; |
25184 | } | |
25185 | ||
25186 | -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) | |
25187 | +#define for_each_active_base(base, cpu_base, active) \ | |
25188 | + while ((base = __next_base((cpu_base), &(active)))) | |
1a6e0f06 | 25189 | + |
e4b2b4a8 JK |
25190 | +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, |
25191 | + unsigned int active, | |
25192 | + ktime_t expires_next) | |
25193 | { | |
25194 | - struct hrtimer_clock_base *base = cpu_base->clock_base; | |
25195 | - unsigned int active = cpu_base->active_bases; | |
25196 | - ktime_t expires, expires_next = KTIME_MAX; | |
25197 | + struct hrtimer_clock_base *base; | |
25198 | + ktime_t expires; | |
25199 | ||
25200 | - hrtimer_update_next_timer(cpu_base, NULL); | |
25201 | - for (; active; base++, active >>= 1) { | |
25202 | + for_each_active_base(base, cpu_base, active) { | |
25203 | struct timerqueue_node *next; | |
25204 | struct hrtimer *timer; | |
25205 | ||
25206 | - if (!(active & 0x01)) | |
25207 | - continue; | |
25208 | - | |
25209 | next = timerqueue_getnext(&base->active); | |
25210 | timer = container_of(next, struct hrtimer, node); | |
25211 | expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | |
25212 | if (expires < expires_next) { | |
25213 | expires_next = expires; | |
25214 | - hrtimer_update_next_timer(cpu_base, timer); | |
25215 | + if (timer->is_soft) | |
25216 | + cpu_base->softirq_next_timer = timer; | |
25217 | + else | |
25218 | + cpu_base->next_timer = timer; | |
25219 | } | |
25220 | } | |
25221 | /* | |
b3bbd485 | 25222 | @@ -494,7 +521,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) |
e4b2b4a8 JK |
25223 | expires_next = 0; |
25224 | return expires_next; | |
25225 | } | |
25226 | -#endif | |
1a6e0f06 | 25227 | + |
e4b2b4a8 JK |
25228 | +/* |
25229 | + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but | |
25230 | + * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. | |
25231 | + * | |
25232 | + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases; | 
25233 | + * those timers will get run whenever the softirq gets handled. At the end of | 
25234 | + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. | 
25235 | + * | |
25236 | + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. | |
25237 | + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual | |
25238 | + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. | |
25239 | + * | |
25240 | + * @active_mask must be one of: | |
25241 | + * - HRTIMER_ACTIVE_ALL, | |
25242 | + * - HRTIMER_ACTIVE_SOFT, or | |
25243 | + * - HRTIMER_ACTIVE_HARD. | |
25244 | + */ | |
25245 | +static ktime_t | |
25246 | +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) | |
1a6e0f06 | 25247 | +{ |
e4b2b4a8 JK |
25248 | + unsigned int active; |
25249 | + struct hrtimer *next_timer = NULL; | |
25250 | + ktime_t expires_next = KTIME_MAX; | |
1a6e0f06 | 25251 | + |
e4b2b4a8 JK |
25252 | + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { |
25253 | + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; | |
25254 | + cpu_base->softirq_next_timer = NULL; | |
25255 | + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); | |
25256 | + | |
25257 | + next_timer = cpu_base->softirq_next_timer; | |
1a6e0f06 | 25258 | + } |
1a6e0f06 | 25259 | + |
e4b2b4a8 JK |
25260 | + if (active_mask & HRTIMER_ACTIVE_HARD) { |
25261 | + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; | |
25262 | + cpu_base->next_timer = next_timer; | |
25263 | + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); | |
25264 | + } | |
1a6e0f06 | 25265 | + |
e4b2b4a8 | 25266 | + return expires_next; |
1a6e0f06 | 25267 | +} |
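A userspace sketch of the two-pass minimum this function computes: the soft bases are scanned first (skipped entirely once their softirq is activated), then the hard pass carries the running minimum forward. The expiry values and the 4+4 mask layout are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    #define KTIME_MAX INT64_MAX

    static int64_t min_expiry(const int64_t *exp, unsigned int active,
                              int64_t expires_next)
    {
            while (active) {
                    unsigned int idx = __builtin_ctz(active);   /* __ffs() */

                    active &= ~(1U << idx);
                    if (exp[idx] < expires_next)
                            expires_next = exp[idx];
            }
            return expires_next;
    }

    int main(void)
    {
            int64_t exp[8] = { 900, KTIME_MAX, 300, KTIME_MAX,   /* hard */
                               500, KTIME_MAX, 250, KTIME_MAX }; /* soft */
            unsigned int act_hard = 0x05, act_soft = 0x50;
            int64_t next;

            next = min_expiry(exp, act_soft, KTIME_MAX);    /* soft pass */
            next = min_expiry(exp, act_hard, next);         /* hard pass */
            printf("next expiry: %lld\n", (long long)next); /* 250 */
            return 0;
    }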
e4b2b4a8 JK |
25268 | |
25269 | static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) | |
25270 | { | |
b3bbd485 | 25271 | @@ -502,36 +569,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) |
e4b2b4a8 JK |
25272 | ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; |
25273 | ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; | |
25274 | ||
25275 | - return ktime_get_update_offsets_now(&base->clock_was_set_seq, | |
25276 | + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, | |
25277 | offs_real, offs_boot, offs_tai); | |
25278 | -} | |
25279 | - | |
25280 | -/* High resolution timer related functions */ | |
25281 | -#ifdef CONFIG_HIGH_RES_TIMERS | |
25282 | - | |
25283 | -/* | |
25284 | - * High resolution timer enabled ? | |
25285 | - */ | |
25286 | -static bool hrtimer_hres_enabled __read_mostly = true; | |
25287 | -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; | |
25288 | -EXPORT_SYMBOL_GPL(hrtimer_resolution); | |
25289 | - | |
25290 | -/* | |
25291 | - * Enable / Disable high resolution mode | |
25292 | - */ | |
25293 | -static int __init setup_hrtimer_hres(char *str) | |
25294 | -{ | |
25295 | - return (kstrtobool(str, &hrtimer_hres_enabled) == 0); | |
25296 | -} | |
25297 | ||
25298 | -__setup("highres=", setup_hrtimer_hres); | |
25299 | + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; | |
25300 | + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; | |
25301 | + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; | |
25302 | ||
25303 | -/* | |
25304 | - * hrtimer_high_res_enabled - query, if the highres mode is enabled | |
25305 | - */ | |
25306 | -static inline int hrtimer_is_hres_enabled(void) | |
25307 | -{ | |
25308 | - return hrtimer_hres_enabled; | |
25309 | + return now; | |
25310 | } | |
25311 | ||
1a6e0f06 | 25312 | /* |
b3bbd485 | 25313 | @@ -539,7 +584,8 @@ static inline int hrtimer_is_hres_enabled(void) |
1a6e0f06 | 25314 | */ |
e4b2b4a8 | 25315 | static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) |
1a6e0f06 | 25316 | { |
e4b2b4a8 JK |
25317 | - return cpu_base->hres_active; |
25318 | + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? | |
25319 | + cpu_base->hres_active : 0; | |
25320 | } | |
1a6e0f06 | 25321 | |
e4b2b4a8 | 25322 | static inline int hrtimer_hres_active(void) |
b3bbd485 | 25323 | @@ -557,10 +603,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) |
e4b2b4a8 JK |
25324 | { |
25325 | ktime_t expires_next; | |
1a6e0f06 | 25326 | |
e4b2b4a8 JK |
25327 | - if (!cpu_base->hres_active) |
25328 | - return; | |
25329 | + /* | |
25330 | + * Find the current next expiration time. | |
25331 | + */ | |
25332 | + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); | |
1a6e0f06 | 25333 | |
e4b2b4a8 JK |
25334 | - expires_next = __hrtimer_get_next_event(cpu_base); |
25335 | + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { | |
25336 | + /* | |
25337 | + * When the softirq is activated, hrtimer has to be | |
25338 | + * programmed with the first hard hrtimer because soft | |
25339 | + * timer interrupt could occur too late. | |
25340 | + */ | |
25341 | + if (cpu_base->softirq_activated) | |
25342 | + expires_next = __hrtimer_get_next_event(cpu_base, | |
25343 | + HRTIMER_ACTIVE_HARD); | |
25344 | + else | |
25345 | + cpu_base->softirq_expires_next = expires_next; | |
1a6e0f06 JK |
25346 | + } |
25347 | ||
e4b2b4a8 JK |
25348 | if (skip_equal && expires_next == cpu_base->expires_next) |
25349 | return; | |
b3bbd485 | 25350 | @@ -568,6 +627,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) |
e4b2b4a8 | 25351 | cpu_base->expires_next = expires_next; |
1a6e0f06 JK |
25352 | |
25353 | /* | |
e4b2b4a8 JK |
25354 | + * If hres is not active, hardware does not have to be |
25355 | + * reprogrammed yet. | |
25356 | + * | |
25357 | * If a hang was detected in the last timer interrupt then we | |
25358 | * leave the hang delay active in the hardware. We want the | |
25359 | * system to make progress. That also prevents the following | |
b3bbd485 | 25360 | @@ -581,83 +643,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) |
e4b2b4a8 JK |
25361 | * set. So we'd effectively block all timers until the T2 event | 
25362 | * fires. | |
1a6e0f06 | 25363 | */ |
e4b2b4a8 JK |
25364 | - if (cpu_base->hang_detected) |
25365 | + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) | |
25366 | return; | |
1a6e0f06 | 25367 | |
e4b2b4a8 JK |
25368 | tick_program_event(cpu_base->expires_next, 1); |
25369 | } | |
1a6e0f06 | 25370 | |
e4b2b4a8 JK |
25371 | +/* High resolution timer related functions */ |
25372 | +#ifdef CONFIG_HIGH_RES_TIMERS | |
25373 | + | |
25374 | /* | |
25375 | - * When a timer is enqueued and expires earlier than the already enqueued | |
25376 | - * timers, we have to check, whether it expires earlier than the timer for | |
25377 | - * which the clock event device was armed. | |
25378 | - * | |
25379 | - * Called with interrupts disabled and base->cpu_base.lock held | |
25380 | + * High resolution timer enabled ? | |
1a6e0f06 | 25381 | */ |
e4b2b4a8 JK |
25382 | -static void hrtimer_reprogram(struct hrtimer *timer, |
25383 | - struct hrtimer_clock_base *base) | |
25384 | -{ | |
25385 | - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | |
25386 | - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | |
25387 | - | |
25388 | - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); | |
25389 | - | |
25390 | - /* | |
25391 | - * If the timer is not on the current cpu, we cannot reprogram | |
25392 | - * the other cpus clock event device. | |
25393 | - */ | |
25394 | - if (base->cpu_base != cpu_base) | |
25395 | - return; | |
25396 | - | |
25397 | - /* | |
25398 | - * If the hrtimer interrupt is running, then it will | |
25399 | - * reevaluate the clock bases and reprogram the clock event | |
25400 | - * device. The callbacks are always executed in hard interrupt | |
25401 | - * context so we don't need an extra check for a running | |
25402 | - * callback. | |
25403 | - */ | |
25404 | - if (cpu_base->in_hrtirq) | |
25405 | - return; | |
25406 | - | |
25407 | - /* | |
25408 | - * CLOCK_REALTIME timer might be requested with an absolute | |
25409 | - * expiry time which is less than base->offset. Set it to 0. | |
25410 | - */ | |
25411 | - if (expires < 0) | |
25412 | - expires = 0; | |
25413 | - | |
25414 | - if (expires >= cpu_base->expires_next) | |
25415 | - return; | |
25416 | - | |
25417 | - /* Update the pointer to the next expiring timer */ | |
25418 | - cpu_base->next_timer = timer; | |
25419 | - | |
25420 | - /* | |
25421 | - * If a hang was detected in the last timer interrupt then we | |
25422 | - * do not schedule a timer which is earlier than the expiry | |
25423 | - * which we enforced in the hang detection. We want the system | |
25424 | - * to make progress. | |
25425 | - */ | |
25426 | - if (cpu_base->hang_detected) | |
25427 | - return; | |
25428 | +static bool hrtimer_hres_enabled __read_mostly = true; | |
25429 | +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; | |
25430 | +EXPORT_SYMBOL_GPL(hrtimer_resolution); | |
25431 | ||
25432 | - /* | |
25433 | - * Program the timer hardware. We enforce the expiry for | |
25434 | - * events which are already in the past. | |
25435 | - */ | |
25436 | - cpu_base->expires_next = expires; | |
25437 | - tick_program_event(expires, 1); | |
25438 | +/* | |
25439 | + * Enable / Disable high resolution mode | |
25440 | + */ | |
25441 | +static int __init setup_hrtimer_hres(char *str) | |
25442 | +{ | |
25443 | + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); | |
1a6e0f06 JK |
25444 | } |
25445 | ||
e4b2b4a8 JK |
25446 | +__setup("highres=", setup_hrtimer_hres); |
25447 | + | |
25448 | /* | |
25449 | - * Initialize the high resolution related parts of cpu_base | |
25450 | + * hrtimer_high_res_enabled - query, if the highres mode is enabled | |
25451 | */ | |
25452 | -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |
25453 | +static inline int hrtimer_is_hres_enabled(void) | |
1a6e0f06 | 25454 | { |
e4b2b4a8 JK |
25455 | - base->expires_next = KTIME_MAX; |
25456 | - base->hang_detected = 0; | |
25457 | - base->hres_active = 0; | |
25458 | - base->next_timer = NULL; | |
25459 | + return hrtimer_hres_enabled; | |
1a6e0f06 JK |
25460 | } |
25461 | ||
e4b2b4a8 | 25462 | /* |
b3bbd485 | 25463 | @@ -669,7 +686,7 @@ static void retrigger_next_event(void *arg) |
1a6e0f06 | 25464 | { |
e4b2b4a8 | 25465 | struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); |
1a6e0f06 | 25466 | |
e4b2b4a8 JK |
25467 | - if (!base->hres_active) |
25468 | + if (!__hrtimer_hres_active(base)) | |
25469 | return; | |
1a6e0f06 | 25470 | |
e4b2b4a8 | 25471 | raw_spin_lock(&base->lock); |
b3bbd485 | 25472 | @@ -698,6 +715,29 @@ static void hrtimer_switch_to_hres(void) |
e4b2b4a8 JK |
25473 | retrigger_next_event(NULL); |
25474 | } | |
1a6e0f06 | 25475 | |
e4b2b4a8 JK |
25476 | +#ifdef CONFIG_PREEMPT_RT_FULL |
25477 | + | |
25478 | +static struct swork_event clock_set_delay_work; | |
25479 | + | |
25480 | +static void run_clock_set_delay(struct swork_event *event) | |
25481 | +{ | |
25482 | + clock_was_set(); | |
25483 | +} | |
25484 | + | |
25485 | +void clock_was_set_delayed(void) | |
25486 | +{ | |
25487 | + swork_queue(&clock_set_delay_work); | |
25488 | +} | |
25489 | + | |
25490 | +static __init int create_clock_set_delay_thread(void) | |
25491 | +{ | |
25492 | + WARN_ON(swork_get()); | |
25493 | + INIT_SWORK(&clock_set_delay_work, run_clock_set_delay); | |
25494 | + return 0; | |
25495 | +} | |
25496 | +early_initcall(create_clock_set_delay_thread); | |
25497 | +#else /* PREEMPT_RT_FULL */ | |
25498 | + | |
25499 | static void clock_was_set_work(struct work_struct *work) | |
1a6e0f06 | 25500 | { |
e4b2b4a8 | 25501 | clock_was_set(); |
b3bbd485 | 25502 | @@ -713,25 +753,105 @@ void clock_was_set_delayed(void) |
e4b2b4a8 JK |
25503 | { |
25504 | schedule_work(&hrtimer_work); | |
1a6e0f06 | 25505 | } |
e4b2b4a8 | 25506 | +#endif |
1a6e0f06 | 25507 | |
e4b2b4a8 | 25508 | #else |
1a6e0f06 | 25509 | |
e4b2b4a8 JK |
25510 | -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } |
25511 | -static inline int hrtimer_hres_active(void) { return 0; } | |
25512 | static inline int hrtimer_is_hres_enabled(void) { return 0; } | |
25513 | static inline void hrtimer_switch_to_hres(void) { } | |
25514 | -static inline void | |
25515 | -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } | |
25516 | -static inline int hrtimer_reprogram(struct hrtimer *timer, | |
25517 | - struct hrtimer_clock_base *base) | |
25518 | -{ | |
25519 | - return 0; | |
25520 | -} | |
25521 | -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | |
25522 | static inline void retrigger_next_event(void *arg) { } | |
1a6e0f06 | 25523 | |
e4b2b4a8 | 25524 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
1a6e0f06 | 25525 | |
b3bbd485 | 25526 | +/* |
e4b2b4a8 JK |
25527 | + * When a timer is enqueued and expires earlier than the already enqueued |
25528 | + * timers, we have to check whether it expires earlier than the timer for | |
25529 | + * which the clock event device was armed. | |
25530 | + * | |
25531 | + * Called with interrupts disabled and base->cpu_base.lock held | |
25532 | + */ | |
25533 | +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) | |
25534 | +{ | |
25535 | + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | |
25536 | + struct hrtimer_clock_base *base = timer->base; | |
25537 | + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); | |
25538 | + | |
25539 | + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); | |
25540 | + | |
25541 | + /* | |
25542 | + * CLOCK_REALTIME timer might be requested with an absolute | |
25543 | + * expiry time which is less than base->offset. Set it to 0. | |
25544 | + */ | |
25545 | + if (expires < 0) | |
25546 | + expires = 0; | |
25547 | + | |
25548 | + if (timer->is_soft) { | |
25549 | + /* | |
25550 | + * A soft hrtimer could be started on a remote CPU. In this | |
25551 | + * case softirq_expires_next needs to be updated on the | |
25552 | + * remote CPU. The soft hrtimer will not expire before the | |
25553 | + * first hard hrtimer on the remote CPU - | |
25554 | + * hrtimer_check_target() prevents this case. | |
25555 | + */ | |
25556 | + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; | |
25557 | + | |
25558 | + if (timer_cpu_base->softirq_activated) | |
25559 | + return; | |
25560 | + | |
25561 | + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) | |
25562 | + return; | |
25563 | + | |
25564 | + timer_cpu_base->softirq_next_timer = timer; | |
25565 | + timer_cpu_base->softirq_expires_next = expires; | |
25566 | + | |
25567 | + if (!ktime_before(expires, timer_cpu_base->expires_next) || | |
25568 | + !reprogram) | |
25569 | + return; | |
25570 | + } | |
25571 | + | |
25572 | + /* | |
25573 | + * If the timer is not on the current CPU, we cannot reprogram | |
25574 | + * the other CPU's clock event device. | |
25575 | + */ | |
25576 | + if (base->cpu_base != cpu_base) | |
25577 | + return; | |
25578 | + | |
25579 | + /* | |
25580 | + * If the hrtimer interrupt is running, then it will | |
25581 | + * reevaluate the clock bases and reprogram the clock event | |
25582 | + * device. The callbacks are always executed in hard interrupt | |
25583 | + * context so we don't need an extra check for a running | |
25584 | + * callback. | |
25585 | + */ | |
25586 | + if (cpu_base->in_hrtirq) | |
25587 | + return; | |
25588 | + | |
25589 | + if (expires >= cpu_base->expires_next) | |
25590 | + return; | |
25591 | + | |
25592 | + /* Update the pointer to the next expiring timer */ | |
25593 | + cpu_base->next_timer = timer; | |
25594 | + cpu_base->expires_next = expires; | |
25595 | + | |
25596 | + /* | |
25597 | + * If hres is not active, hardware does not have to be | |
25598 | + * programmed yet. | |
25599 | + * | |
25600 | + * If a hang was detected in the last timer interrupt then we | |
25601 | + * do not schedule a timer which is earlier than the expiry | |
25602 | + * which we enforced in the hang detection. We want the system | |
25603 | + * to make progress. | |
25604 | + */ | |
25605 | + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) | |
25606 | + return; | |
25607 | + | |
25608 | + /* | |
25609 | + * Program the timer hardware. We enforce the expiry for | |
25610 | + * events which are already in the past. | |
25611 | + */ | |
25612 | + tick_program_event(expires, 1); | |
25613 | +} | |
25614 | + | |
b3bbd485 | 25615 | /* |
e4b2b4a8 JK |
25616 | * Clock realtime was set |
25617 | * | |
b3bbd485 | 25618 | @@ -830,6 +950,33 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) |
1a6e0f06 | 25619 | } |
e4b2b4a8 | 25620 | EXPORT_SYMBOL_GPL(hrtimer_forward); |
1a6e0f06 | 25621 | |
e4b2b4a8 JK |
25622 | +#ifdef CONFIG_PREEMPT_RT_BASE |
25623 | +# define wake_up_timer_waiters(b) wake_up(&(b)->wait) | |
25624 | + | |
25625 | +/** | |
25626 | + * hrtimer_wait_for_timer - Wait for a running timer | |
1a6e0f06 | 25627 | + * |
e4b2b4a8 | 25628 | + * @timer: timer to wait for |
1a6e0f06 | 25629 | + * |
e4b2b4a8 JK |
25630 | + * If the timer's callback function is currently executing, the | |
25631 | + * function waits on the waitqueue of the timer base. The | |
25632 | + * waitqueue is woken up after the timer callback function has | |
25633 | + * finished execution. | |
1a6e0f06 | 25634 | + */ |
e4b2b4a8 | 25635 | +void hrtimer_wait_for_timer(const struct hrtimer *timer) |
1a6e0f06 | 25636 | +{ |
e4b2b4a8 | 25637 | + struct hrtimer_clock_base *base = timer->base; |
1a6e0f06 | 25638 | + |
e4b2b4a8 JK |
25639 | + if (base && base->cpu_base && |
25640 | + base->index >= HRTIMER_BASE_MONOTONIC_SOFT) | |
25641 | + wait_event(base->cpu_base->wait, | |
25642 | + !(hrtimer_callback_running(timer))); | |
1a6e0f06 | 25643 | +} |
1a6e0f06 | 25644 | + |
1a6e0f06 | 25645 | +#else |
e4b2b4a8 | 25646 | +# define wake_up_timer_waiters(b) do { } while (0) |
1a6e0f06 | 25647 | +#endif |
e4b2b4a8 JK |
25648 | + |
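A minimal sketch of the cancel-and-wait pattern these helpers enable on RT; my_cancel_sync() is an illustrative name, but the hrtimer_cancel() hunk further below adopts exactly this shape:

    static void my_cancel_sync(struct hrtimer *timer)
    {
            /* hrtimer_try_to_cancel() returns -1 while the callback runs */
            while (hrtimer_try_to_cancel(timer) < 0)
                    hrtimer_wait_for_timer(timer);  /* sleep, don't spin */
    }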
25649 | /* | |
25650 | * enqueue_hrtimer - internal function to (re)start a timer | |
25651 | * | |
b3bbd485 | 25652 | @@ -839,9 +986,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); |
e4b2b4a8 | 25653 | * Returns 1 when the new timer is the leftmost timer in the tree. |
1a6e0f06 | 25654 | */ |
e4b2b4a8 JK |
25655 | static int enqueue_hrtimer(struct hrtimer *timer, |
25656 | - struct hrtimer_clock_base *base) | |
25657 | + struct hrtimer_clock_base *base, | |
25658 | + enum hrtimer_mode mode) | |
1a6e0f06 | 25659 | { |
e4b2b4a8 JK |
25660 | - debug_activate(timer); |
25661 | + debug_activate(timer, mode); | |
1a6e0f06 | 25662 | |
e4b2b4a8 JK |
25663 | base->cpu_base->active_bases |= 1 << base->index; |
25664 | ||
b3bbd485 | 25665 | @@ -874,7 +1022,6 @@ static void __remove_hrtimer(struct hrtimer *timer, |
e4b2b4a8 JK |
25666 | if (!timerqueue_del(&base->active, &timer->node)) |
25667 | cpu_base->active_bases &= ~(1 << base->index); | |
25668 | ||
25669 | -#ifdef CONFIG_HIGH_RES_TIMERS | |
25670 | /* | |
25671 | * Note: If reprogram is false we do not update | |
25672 | * cpu_base->next_timer. This happens when we remove the first | |
b3bbd485 | 25673 | @@ -885,7 +1032,6 @@ static void __remove_hrtimer(struct hrtimer *timer, |
e4b2b4a8 JK |
25674 | */ |
25675 | if (reprogram && timer == cpu_base->next_timer) | |
25676 | hrtimer_force_reprogram(cpu_base, 1); | |
25677 | -#endif | |
25678 | } | |
1a6e0f06 | 25679 | |
e4b2b4a8 | 25680 | /* |
b3bbd485 | 25681 | @@ -934,22 +1080,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, |
e4b2b4a8 | 25682 | return tim; |
1a6e0f06 | 25683 | } |
1a6e0f06 | 25684 | |
e4b2b4a8 JK |
25685 | -/** |
25686 | - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU | |
25687 | - * @timer: the timer to be added | |
25688 | - * @tim: expiry time | |
25689 | - * @delta_ns: "slack" range for the timer | |
25690 | - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or | |
25691 | - * relative (HRTIMER_MODE_REL) | |
25692 | - */ | |
25693 | -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |
25694 | - u64 delta_ns, const enum hrtimer_mode mode) | |
25695 | +static void | |
25696 | +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) | |
1a6e0f06 | 25697 | { |
e4b2b4a8 JK |
25698 | - struct hrtimer_clock_base *base, *new_base; |
25699 | - unsigned long flags; | |
25700 | - int leftmost; | |
25701 | + ktime_t expires; | |
1a6e0f06 | 25702 | |
e4b2b4a8 | 25703 | - base = lock_hrtimer_base(timer, &flags); |
1a6e0f06 | 25704 | + /* |
e4b2b4a8 | 25705 | + * Find the next SOFT expiration. |
1a6e0f06 | 25706 | + */ |
e4b2b4a8 | 25707 | + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); |
1a6e0f06 | 25708 | + |
e4b2b4a8 JK |
25709 | + /* |
25710 | + * Reprogramming needs to be triggered even if the next soft | |
25711 | + * hrtimer expires at the same time as the next hard | |
25712 | + * hrtimer. cpu_base->softirq_expires_next needs to be updated! | |
25713 | + */ | |
25714 | + if (expires == KTIME_MAX) | |
25715 | + return; | |
1a6e0f06 | 25716 | + |
e4b2b4a8 JK |
25717 | + /* |
25718 | + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() | |
25719 | + * cpu_base->*expires_next is only set by hrtimer_reprogram() | |
25720 | + */ | |
25721 | + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); | |
1a6e0f06 | 25722 | +} |
1a6e0f06 | 25723 | + |
e4b2b4a8 JK |
25724 | +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, |
25725 | + u64 delta_ns, const enum hrtimer_mode mode, | |
25726 | + struct hrtimer_clock_base *base) | |
1a6e0f06 | 25727 | +{ |
e4b2b4a8 JK |
25728 | + struct hrtimer_clock_base *new_base; |
25729 | ||
25730 | /* Remove an active timer from the queue: */ | |
25731 | remove_hrtimer(timer, base, true); | |
b3bbd485 | 25732 | @@ -964,21 +1124,37 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, |
e4b2b4a8 JK |
25733 | /* Switch the timer base, if necessary: */ |
25734 | new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); | |
25735 | ||
25736 | - leftmost = enqueue_hrtimer(timer, new_base); | |
25737 | - if (!leftmost) | |
25738 | - goto unlock; | |
25739 | + return enqueue_hrtimer(timer, new_base, mode); | |
1a6e0f06 | 25740 | +} |
1a6e0f06 | 25741 | + |
e4b2b4a8 JK |
25742 | +/** |
25743 | + * hrtimer_start_range_ns - (re)start an hrtimer | |
25744 | + * @timer: the timer to be added | |
25745 | + * @tim: expiry time | |
25746 | + * @delta_ns: "slack" range for the timer | |
25747 | + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or | |
25748 | + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); | |
25749 | + * softirq based mode is considered for debug purpose only! | |
25750 | + */ | |
25751 | +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, | |
25752 | + u64 delta_ns, const enum hrtimer_mode mode) | |
1a6e0f06 | 25753 | +{ |
e4b2b4a8 JK |
25754 | + struct hrtimer_clock_base *base; |
25755 | + unsigned long flags; | |
1a6e0f06 JK |
25756 | + |
25757 | + /* | |
e4b2b4a8 JK |
25758 | + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft |
25759 | + * match. | |
1a6e0f06 | 25760 | + */ |
e4b2b4a8 JK |
25761 | +#ifndef CONFIG_PREEMPT_RT_BASE |
25762 | + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); | |
1a6e0f06 | 25763 | +#endif |
1a6e0f06 | 25764 | + |
e4b2b4a8 JK |
25765 | + base = lock_hrtimer_base(timer, &flags); |
25766 | + | |
25767 | + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) | |
25768 | + hrtimer_reprogram(timer, true); | |
25769 | ||
25770 | - if (!hrtimer_is_hres_active(timer)) { | |
25771 | - /* | |
25772 | - * Kick to reschedule the next tick to handle the new timer | |
25773 | - * on dynticks target. | |
25774 | - */ | |
25775 | - if (new_base->cpu_base->nohz_active) | |
25776 | - wake_up_nohz_cpu(new_base->cpu_base->cpu); | |
25777 | - } else { | |
25778 | - hrtimer_reprogram(timer, new_base); | |
25779 | - } | |
25780 | -unlock: | |
25781 | unlock_hrtimer_base(timer, &flags); | |
25782 | } | |
25783 | EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); | |
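As a usage sketch of the start path above (all names illustrative, not part of the patch): a one-shot, relative-mode timer is initialized and armed as follows; the enqueue and the conditional reprogram run under the base lock inside hrtimer_start_range_ns():

    static struct hrtimer my_timer;

    static enum hrtimer_restart my_fn(struct hrtimer *t)
    {
            return HRTIMER_NORESTART;               /* one-shot */
    }

    static void my_arm(u64 delay_ns)
    {
            hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            my_timer.function = my_fn;
            hrtimer_start(&my_timer, ns_to_ktime(delay_ns), HRTIMER_MODE_REL);
    }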
b3bbd485 | 25784 | @@ -1035,7 +1211,7 @@ int hrtimer_cancel(struct hrtimer *timer) |
e4b2b4a8 JK |
25785 | |
25786 | if (ret >= 0) | |
25787 | return ret; | |
25788 | - cpu_relax(); | |
25789 | + hrtimer_wait_for_timer(timer); | |
25790 | } | |
25791 | } | |
25792 | EXPORT_SYMBOL_GPL(hrtimer_cancel); | |
b3bbd485 | 25793 | @@ -1076,7 +1252,7 @@ u64 hrtimer_get_next_event(void) |
e4b2b4a8 JK |
25794 | raw_spin_lock_irqsave(&cpu_base->lock, flags); |
25795 | ||
25796 | if (!__hrtimer_hres_active(cpu_base)) | |
25797 | - expires = __hrtimer_get_next_event(cpu_base); | |
25798 | + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); | |
25799 | ||
25800 | raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
25801 | ||
b3bbd485 | 25802 | @@ -1099,8 +1275,16 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) |
e4b2b4a8 JK |
25803 | static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
25804 | enum hrtimer_mode mode) | |
25805 | { | |
25806 | - struct hrtimer_cpu_base *cpu_base; | |
25807 | + bool softtimer; | |
25808 | int base; | |
25809 | + struct hrtimer_cpu_base *cpu_base; | |
25810 | + | |
25811 | + softtimer = !!(mode & HRTIMER_MODE_SOFT); | |
25812 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
25813 | + if (!softtimer && !(mode & HRTIMER_MODE_HARD)) | |
25814 | + softtimer = true; | |
1a6e0f06 | 25815 | +#endif |
e4b2b4a8 JK |
25816 | + base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; |
25817 | ||
25818 | memset(timer, 0, sizeof(struct hrtimer)); | |
25819 | ||
b3bbd485 | 25820 | @@ -1114,7 +1298,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
e4b2b4a8 JK |
25821 | if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) |
25822 | clock_id = CLOCK_MONOTONIC; | |
25823 | ||
25824 | - base = hrtimer_clockid_to_base(clock_id); | |
25825 | + base += hrtimer_clockid_to_base(clock_id); | |
25826 | + timer->is_soft = softtimer; | |
25827 | timer->base = &cpu_base->clock_base[base]; | |
25828 | timerqueue_init(&timer->node); | |
25829 | } | |
b3bbd485 | 25830 | @@ -1123,7 +1308,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
e4b2b4a8 JK |
25831 | * hrtimer_init - initialize a timer to the given clock |
25832 | * @timer: the timer to be initialized | |
25833 | * @clock_id: the clock to be used | |
25834 | - * @mode: timer mode abs/rel | |
25835 | + * @mode: The modes which are relevant for initialization: | |
25836 | + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, | |
25837 | + * HRTIMER_MODE_REL_SOFT | |
25838 | + * | |
25839 | + * The PINNED variants of the above can be handed in, | |
25840 | + * but the PINNED bit is ignored as pinning happens | |
25841 | + * when the hrtimer is started | |
1a6e0f06 | 25842 | */ |
e4b2b4a8 JK |
25843 | void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, |
25844 | enum hrtimer_mode mode) | |
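A short sketch of the soft variant (names illustrative): initializing with a _SOFT mode selects the upper half of the clock base array, so the callback later runs from HRTIMER_SOFTIRQ instead of hard interrupt context:

    hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
    my_timer.function = my_soft_fn;     /* expires in softirq context */
    hrtimer_start(&my_timer, ns_to_ktime(delay_ns), HRTIMER_MODE_REL_SOFT);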
b3bbd485 | 25845 | @@ -1142,19 +1333,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init); |
e4b2b4a8 JK |
25846 | */ |
25847 | bool hrtimer_active(const struct hrtimer *timer) | |
25848 | { | |
25849 | - struct hrtimer_cpu_base *cpu_base; | |
25850 | + struct hrtimer_clock_base *base; | |
25851 | unsigned int seq; | |
25852 | ||
25853 | do { | |
25854 | - cpu_base = READ_ONCE(timer->base->cpu_base); | |
25855 | - seq = raw_read_seqcount_begin(&cpu_base->seq); | |
25856 | + base = READ_ONCE(timer->base); | |
25857 | + seq = raw_read_seqcount_begin(&base->seq); | |
25858 | ||
25859 | if (timer->state != HRTIMER_STATE_INACTIVE || | |
25860 | - cpu_base->running == timer) | |
25861 | + base->running == timer) | |
25862 | return true; | |
25863 | ||
25864 | - } while (read_seqcount_retry(&cpu_base->seq, seq) || | |
25865 | - cpu_base != READ_ONCE(timer->base->cpu_base)); | |
25866 | + } while (read_seqcount_retry(&base->seq, seq) || | |
25867 | + base != READ_ONCE(timer->base)); | |
25868 | ||
25869 | return false; | |
25870 | } | |
b3bbd485 | 25871 | @@ -1180,7 +1371,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active); |
e4b2b4a8 JK |
25872 | |
25873 | static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, | |
25874 | struct hrtimer_clock_base *base, | |
25875 | - struct hrtimer *timer, ktime_t *now) | |
25876 | + struct hrtimer *timer, ktime_t *now, | |
25877 | + unsigned long flags) | |
25878 | { | |
25879 | enum hrtimer_restart (*fn)(struct hrtimer *); | |
25880 | int restart; | |
b3bbd485 | 25881 | @@ -1188,16 +1380,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, |
e4b2b4a8 JK |
25882 | lockdep_assert_held(&cpu_base->lock); |
25883 | ||
25884 | debug_deactivate(timer); | |
25885 | - cpu_base->running = timer; | |
25886 | + base->running = timer; | |
25887 | ||
25888 | /* | |
25889 | * Separate the ->running assignment from the ->state assignment. | |
25890 | * | |
25891 | * As with a regular write barrier, this ensures the read side in | |
25892 | - * hrtimer_active() cannot observe cpu_base->running == NULL && | |
25893 | + * hrtimer_active() cannot observe base->running == NULL && | |
25894 | * timer->state == INACTIVE. | |
25895 | */ | |
25896 | - raw_write_seqcount_barrier(&cpu_base->seq); | |
25897 | + raw_write_seqcount_barrier(&base->seq); | |
25898 | ||
25899 | __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); | |
25900 | fn = timer->function; | |
b3bbd485 | 25901 | @@ -1211,15 +1403,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, |
e4b2b4a8 JK |
25902 | timer->is_rel = false; |
25903 | ||
25904 | /* | |
25905 | - * Because we run timers from hardirq context, there is no chance | |
25906 | - * they get migrated to another cpu, therefore its safe to unlock | |
25907 | - * the timer base. | |
25908 | + * The timer is marked as running in the cpu base, so it is | |
25909 | + * protected against migration to a different CPU even if the lock | |
25910 | + * is dropped. | |
25911 | */ | |
25912 | - raw_spin_unlock(&cpu_base->lock); | |
25913 | + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
25914 | trace_hrtimer_expire_entry(timer, now); | |
25915 | restart = fn(timer); | |
25916 | trace_hrtimer_expire_exit(timer); | |
25917 | - raw_spin_lock(&cpu_base->lock); | |
25918 | + raw_spin_lock_irq(&cpu_base->lock); | |
25919 | ||
25920 | /* | |
25921 | * Note: We clear the running state after enqueue_hrtimer and | |
b3bbd485 | 25922 | @@ -1232,33 +1424,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, |
e4b2b4a8 JK |
25923 | */ |
25924 | if (restart != HRTIMER_NORESTART && | |
25925 | !(timer->state & HRTIMER_STATE_ENQUEUED)) | |
25926 | - enqueue_hrtimer(timer, base); | |
25927 | + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); | |
25928 | ||
25929 | /* | |
25930 | * Separate the ->running assignment from the ->state assignment. | |
25931 | * | |
25932 | * As with a regular write barrier, this ensures the read side in | |
25933 | - * hrtimer_active() cannot observe cpu_base->running == NULL && | |
25934 | + * hrtimer_active() cannot observe base->running.timer == NULL && | |
25935 | * timer->state == INACTIVE. | |
25936 | */ | |
25937 | - raw_write_seqcount_barrier(&cpu_base->seq); | |
25938 | + raw_write_seqcount_barrier(&base->seq); | |
1a6e0f06 | 25939 | |
e4b2b4a8 JK |
25940 | - WARN_ON_ONCE(cpu_base->running != timer); |
25941 | - cpu_base->running = NULL; | |
25942 | + WARN_ON_ONCE(base->running != timer); | |
25943 | + base->running = NULL; | |
25944 | } | |
1a6e0f06 | 25945 | |
e4b2b4a8 JK |
25946 | -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) |
25947 | +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, | |
25948 | + unsigned long flags, unsigned int active_mask) | |
25949 | { | |
25950 | - struct hrtimer_clock_base *base = cpu_base->clock_base; | |
25951 | - unsigned int active = cpu_base->active_bases; | |
25952 | + struct hrtimer_clock_base *base; | |
25953 | + unsigned int active = cpu_base->active_bases & active_mask; | |
25954 | ||
25955 | - for (; active; base++, active >>= 1) { | |
25956 | + for_each_active_base(base, cpu_base, active) { | |
25957 | struct timerqueue_node *node; | |
25958 | ktime_t basenow; | |
25959 | ||
25960 | - if (!(active & 0x01)) | |
25961 | - continue; | |
25962 | - | |
25963 | basenow = ktime_add(now, base->offset); | |
25964 | ||
25965 | while ((node = timerqueue_getnext(&base->active))) { | |
b3bbd485 | 25966 | @@ -1281,11 +1471,29 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) |
e4b2b4a8 JK |
25967 | if (basenow < hrtimer_get_softexpires_tv64(timer)) |
25968 | break; | |
25969 | ||
25970 | - __run_hrtimer(cpu_base, base, timer, &basenow); | |
25971 | + __run_hrtimer(cpu_base, base, timer, &basenow, flags); | |
25972 | } | |
25973 | } | |
25974 | } | |
25975 | ||
25976 | +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) | |
1a6e0f06 | 25977 | +{ |
e4b2b4a8 JK |
25978 | + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); |
25979 | + unsigned long flags; | |
25980 | + ktime_t now; | |
1a6e0f06 | 25981 | + |
e4b2b4a8 | 25982 | + raw_spin_lock_irqsave(&cpu_base->lock, flags); |
1a6e0f06 | 25983 | + |
e4b2b4a8 JK |
25984 | + now = hrtimer_update_base(cpu_base); |
25985 | + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); | |
25986 | + | |
25987 | + cpu_base->softirq_activated = 0; | |
25988 | + hrtimer_update_softirq_timer(cpu_base, true); | |
25989 | + | |
25990 | + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
25991 | + wake_up_timer_waiters(cpu_base); | |
25992 | +} | |
25993 | + | |
25994 | #ifdef CONFIG_HIGH_RES_TIMERS | |
1a6e0f06 | 25995 | |
e4b2b4a8 | 25996 | /* |
b3bbd485 | 25997 | @@ -1296,13 +1504,14 @@ void hrtimer_interrupt(struct clock_event_device *dev) |
e4b2b4a8 JK |
25998 | { |
25999 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | |
26000 | ktime_t expires_next, now, entry_time, delta; | |
26001 | + unsigned long flags; | |
26002 | int retries = 0; | |
1a6e0f06 | 26003 | |
e4b2b4a8 JK |
26004 | BUG_ON(!cpu_base->hres_active); |
26005 | cpu_base->nr_events++; | |
26006 | dev->next_event = KTIME_MAX; | |
1a6e0f06 | 26007 | |
e4b2b4a8 JK |
26008 | - raw_spin_lock(&cpu_base->lock); |
26009 | + raw_spin_lock_irqsave(&cpu_base->lock, flags); | |
26010 | entry_time = now = hrtimer_update_base(cpu_base); | |
26011 | retry: | |
26012 | cpu_base->in_hrtirq = 1; | |
b3bbd485 | 26013 | @@ -1315,17 +1524,23 @@ void hrtimer_interrupt(struct clock_event_device *dev) |
e4b2b4a8 JK |
26014 | */ |
26015 | cpu_base->expires_next = KTIME_MAX; | |
1a6e0f06 | 26016 | |
e4b2b4a8 JK |
26017 | - __hrtimer_run_queues(cpu_base, now); |
26018 | + if (!ktime_before(now, cpu_base->softirq_expires_next)) { | |
26019 | + cpu_base->softirq_expires_next = KTIME_MAX; | |
26020 | + cpu_base->softirq_activated = 1; | |
26021 | + raise_softirq_irqoff(HRTIMER_SOFTIRQ); | |
26022 | + } | |
1a6e0f06 | 26023 | + |
e4b2b4a8 | 26024 | + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); |
1a6e0f06 | 26025 | |
e4b2b4a8 JK |
26026 | /* Reevaluate the clock bases for the next expiry */ |
26027 | - expires_next = __hrtimer_get_next_event(cpu_base); | |
26028 | + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); | |
26029 | /* | |
26030 | * Store the new expiry value so the migration code can verify | |
26031 | * against it. | |
26032 | */ | |
26033 | cpu_base->expires_next = expires_next; | |
26034 | cpu_base->in_hrtirq = 0; | |
26035 | - raw_spin_unlock(&cpu_base->lock); | |
26036 | + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
26037 | ||
26038 | /* Reprogramming necessary ? */ | |
26039 | if (!tick_program_event(expires_next, 0)) { | |
b3bbd485 | 26040 | @@ -1346,7 +1561,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) |
e4b2b4a8 JK |
26041 | * Acquire base lock for updating the offsets and retrieving |
26042 | * the current time. | |
26043 | */ | |
26044 | - raw_spin_lock(&cpu_base->lock); | |
26045 | + raw_spin_lock_irqsave(&cpu_base->lock, flags); | |
26046 | now = hrtimer_update_base(cpu_base); | |
26047 | cpu_base->nr_retries++; | |
26048 | if (++retries < 3) | |
b3bbd485 | 26049 | @@ -1359,7 +1574,8 @@ void hrtimer_interrupt(struct clock_event_device *dev) |
e4b2b4a8 JK |
26050 | */ |
26051 | cpu_base->nr_hangs++; | |
26052 | cpu_base->hang_detected = 1; | |
26053 | - raw_spin_unlock(&cpu_base->lock); | |
26054 | + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
26055 | + | |
26056 | delta = ktime_sub(now, entry_time); | |
26057 | if ((unsigned int)delta > cpu_base->max_hang_time) | |
26058 | cpu_base->max_hang_time = (unsigned int) delta; | |
b3bbd485 | 26059 | @@ -1401,6 +1617,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } |
e4b2b4a8 JK |
26060 | void hrtimer_run_queues(void) |
26061 | { | |
26062 | struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); | |
26063 | + unsigned long flags; | |
26064 | ktime_t now; | |
1a6e0f06 | 26065 | |
e4b2b4a8 | 26066 | if (__hrtimer_hres_active(cpu_base)) |
b3bbd485 | 26067 | @@ -1418,10 +1635,17 @@ void hrtimer_run_queues(void) |
e4b2b4a8 JK |
26068 | return; |
26069 | } | |
1a6e0f06 | 26070 | |
e4b2b4a8 JK |
26071 | - raw_spin_lock(&cpu_base->lock); |
26072 | + raw_spin_lock_irqsave(&cpu_base->lock, flags); | |
26073 | now = hrtimer_update_base(cpu_base); | |
26074 | - __hrtimer_run_queues(cpu_base, now); | |
26075 | - raw_spin_unlock(&cpu_base->lock); | |
26076 | + | |
26077 | + if (!ktime_before(now, cpu_base->softirq_expires_next)) { | |
26078 | + cpu_base->softirq_expires_next = KTIME_MAX; | |
26079 | + cpu_base->softirq_activated = 1; | |
26080 | + raise_softirq_irqoff(HRTIMER_SOFTIRQ); | |
26081 | + } | |
26082 | + | |
26083 | + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); | |
26084 | + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); | |
1a6e0f06 JK |
26085 | } |
26086 | ||
e4b2b4a8 | 26087 | /* |
b3bbd485 | 26088 | @@ -1440,13 +1664,65 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) |
e4b2b4a8 | 26089 | return HRTIMER_NORESTART; |
1a6e0f06 JK |
26090 | } |
26091 | ||
e4b2b4a8 JK |
26092 | -void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) |
26093 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
26094 | +static bool task_is_realtime(struct task_struct *tsk) | |
1a6e0f06 | 26095 | { |
e4b2b4a8 | 26096 | + int policy = tsk->policy; |
1a6e0f06 | 26097 | + |
e4b2b4a8 JK |
26098 | + if (policy == SCHED_FIFO || policy == SCHED_RR) |
26099 | + return true; | |
26100 | + if (policy == SCHED_DEADLINE) | |
26101 | + return true; | |
26102 | + return false; | |
26103 | +} | |
1a6e0f06 | 26104 | +#endif |
e4b2b4a8 JK |
26105 | + |
26106 | +static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, | |
26107 | + clockid_t clock_id, | |
26108 | + enum hrtimer_mode mode, | |
26109 | + struct task_struct *task) | |
26110 | +{ | |
26111 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
26112 | + if (!(mode & (HRTIMER_MODE_SOFT | HRTIMER_MODE_HARD))) { | |
26113 | + if (task_is_realtime(current) || system_state != SYSTEM_RUNNING) | |
26114 | + mode |= HRTIMER_MODE_HARD; | |
26115 | + else | |
26116 | + mode |= HRTIMER_MODE_SOFT; | |
26117 | + } | |
1a6e0f06 | 26118 | +#endif |
e4b2b4a8 JK |
26119 | + __hrtimer_init(&sl->timer, clock_id, mode); |
26120 | sl->timer.function = hrtimer_wakeup; | |
26121 | sl->task = task; | |
26122 | } | |
26123 | + | |
26124 | +/** | |
26125 | + * hrtimer_init_sleeper - initialize sleeper to the given clock | |
26126 | + * @sl: sleeper to be initialized | |
26127 | + * @clock_id: the clock to be used | |
26128 | + * @mode: timer mode abs/rel | |
26129 | + * @task: the task to wake up | |
26130 | + */ | |
26131 | +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, | |
26132 | + enum hrtimer_mode mode, struct task_struct *task) | |
26133 | +{ | |
26134 | + debug_init(&sl->timer, clock_id, mode); | |
26135 | + __hrtimer_init_sleeper(sl, clock_id, mode, task); | |
26137 | +} | |
26138 | EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); | |
1a6e0f06 | 26139 | |
e4b2b4a8 JK |
26140 | +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS |
26141 | +void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, | |
26142 | + clockid_t clock_id, | |
26143 | + enum hrtimer_mode mode, | |
26144 | + struct task_struct *task) | |
26145 | +{ | |
26146 | + debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); | |
26147 | + __hrtimer_init_sleeper(sl, clock_id, mode, task); | |
26148 | +} | |
26149 | +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); | |
26150 | +#endif | |
1a6e0f06 | 26151 | + |
e4b2b4a8 JK |
26152 | int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) |
26153 | { | |
26154 | switch(restart->nanosleep.type) { | |
b3bbd485 | 26155 | @@ -1470,8 +1746,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod |
e4b2b4a8 JK |
26156 | { |
26157 | struct restart_block *restart; | |
1a6e0f06 | 26158 | |
e4b2b4a8 JK |
26159 | - hrtimer_init_sleeper(t, current); |
26160 | - | |
26161 | do { | |
26162 | set_current_state(TASK_INTERRUPTIBLE); | |
26163 | hrtimer_start_expires(&t->timer, mode); | |
b3bbd485 | 26164 | @@ -1508,10 +1782,9 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) |
e4b2b4a8 JK |
26165 | struct hrtimer_sleeper t; |
26166 | int ret; | |
1a6e0f06 | 26167 | |
e4b2b4a8 JK |
26168 | - hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, |
26169 | - HRTIMER_MODE_ABS); | |
26170 | + hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, | |
26171 | + HRTIMER_MODE_ABS, current); | |
26172 | hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); | |
26173 | - | |
26174 | ret = do_nanosleep(&t, HRTIMER_MODE_ABS); | |
26175 | destroy_hrtimer_on_stack(&t.timer); | |
26176 | return ret; | |
b3bbd485 | 26177 | @@ -1529,7 +1802,7 @@ long hrtimer_nanosleep(const struct timespec64 *rqtp, |
e4b2b4a8 JK |
26178 | if (dl_task(current) || rt_task(current)) |
26179 | slack = 0; | |
1a6e0f06 | 26180 | |
e4b2b4a8 JK |
26181 | - hrtimer_init_on_stack(&t.timer, clockid, mode); |
26182 | + hrtimer_init_sleeper_on_stack(&t, clockid, mode, current); | |
26183 | hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); | |
26184 | ret = do_nanosleep(&t, mode); | |
26185 | if (ret != -ERESTART_RESTARTBLOCK) | |
b3bbd485 | 26186 | @@ -1585,6 +1858,27 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, |
e4b2b4a8 JK |
26187 | } |
26188 | #endif | |
1a6e0f06 | 26189 | |
e4b2b4a8 JK |
26190 | +#ifdef CONFIG_PREEMPT_RT_FULL |
26191 | +/* | |
26192 | + * Sleep for 1 ms in the hope that whoever holds what we want will let it go. | |
26193 | + */ | |
26194 | +void cpu_chill(void) | |
26195 | +{ | |
26196 | + ktime_t chill_time; | |
26197 | + unsigned int freeze_flag = current->flags & PF_NOFREEZE; | |
26198 | + | |
26199 | + chill_time = ktime_set(0, NSEC_PER_MSEC); | |
26200 | + set_current_state(TASK_UNINTERRUPTIBLE); | |
26201 | + current->flags |= PF_NOFREEZE; | |
26202 | + sleeping_lock_inc(); | |
26203 | + schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); | |
26204 | + sleeping_lock_dec(); | |
26205 | + if (!freeze_flag) | |
26206 | + current->flags &= ~PF_NOFREEZE; | |
26207 | +} | |
26208 | +EXPORT_SYMBOL(cpu_chill); | |
26209 | +#endif | |
26210 | + | |
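A hedged sketch of the intended use (try_grab() is an illustrative name): on RT, retry loops that would otherwise busy-wait call cpu_chill() so a preempted holder of the contended resource can make progress:

    for (;;) {
            if (try_grab())
                    break;
    #ifdef CONFIG_PREEMPT_RT_FULL
            cpu_chill();            /* sleep 1 ms; let the holder run */
    #else
            cpu_relax();
    #endif
    }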
26211 | /* | |
26212 | * Functions related to boot-time initialization: | |
26213 | */ | |
b3bbd485 | 26214 | @@ -1598,9 +1892,17 @@ int hrtimers_prepare_cpu(unsigned int cpu) |
e4b2b4a8 JK |
26215 | timerqueue_init_head(&cpu_base->clock_base[i].active); |
26216 | } | |
1a6e0f06 | 26217 | |
e4b2b4a8 JK |
26218 | - cpu_base->active_bases = 0; |
26219 | cpu_base->cpu = cpu; | |
26220 | - hrtimer_init_hres(cpu_base); | |
26221 | + cpu_base->active_bases = 0; | |
26222 | + cpu_base->hres_active = 0; | |
26223 | + cpu_base->hang_detected = 0; | |
26224 | + cpu_base->next_timer = NULL; | |
26225 | + cpu_base->softirq_next_timer = NULL; | |
26226 | + cpu_base->expires_next = KTIME_MAX; | |
26227 | + cpu_base->softirq_expires_next = KTIME_MAX; | |
26228 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
26229 | + init_waitqueue_head(&cpu_base->wait); | |
26230 | +#endif | |
26231 | return 0; | |
26232 | } | |
1a6e0f06 | 26233 | |
b3bbd485 | 26234 | @@ -1632,7 +1934,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, |
e4b2b4a8 JK |
26235 | * sort out already expired timers and reprogram the |
26236 | * event device. | |
26237 | */ | |
26238 | - enqueue_hrtimer(timer, new_base); | |
26239 | + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); | |
26240 | } | |
26241 | } | |
1a6e0f06 | 26242 | |
b3bbd485 | 26243 | @@ -1644,6 +1946,12 @@ int hrtimers_dead_cpu(unsigned int scpu) |
e4b2b4a8 JK |
26244 | BUG_ON(cpu_online(scpu)); |
26245 | tick_cancel_sched_timer(scpu); | |
26246 | ||
26247 | + /* | |
26248 | + * This BH disable ensures that raise_softirq_irqoff() does | |
26249 | + * not wake up ksoftirqd (and acquire the pi-lock) while | |
26250 | + * holding the cpu_base lock. | |
26251 | + */ | |
26252 | + local_bh_disable(); | |
1a6e0f06 | 26253 | local_irq_disable(); |
e4b2b4a8 JK |
26254 | old_base = &per_cpu(hrtimer_bases, scpu); |
26255 | new_base = this_cpu_ptr(&hrtimer_bases); | |
b3bbd485 | 26256 | @@ -1659,12 +1967,19 @@ int hrtimers_dead_cpu(unsigned int scpu) |
e4b2b4a8 JK |
26257 | &new_base->clock_base[i]); |
26258 | } | |
1a6e0f06 | 26259 | |
e4b2b4a8 JK |
26260 | + /* |
26261 | + * The migration might have changed the first expiring softirq | |
26262 | + * timer on this CPU. Update it. | |
26263 | + */ | |
26264 | + hrtimer_update_softirq_timer(new_base, false); | |
26265 | + | |
26266 | raw_spin_unlock(&old_base->lock); | |
26267 | raw_spin_unlock(&new_base->lock); | |
26268 | ||
26269 | /* Check, if we got expired work to do */ | |
26270 | __hrtimer_peek_ahead_timers(); | |
1a6e0f06 | 26271 | local_irq_enable(); |
e4b2b4a8 JK |
26272 | + local_bh_enable(); |
26273 | return 0; | |
26274 | } | |
1a6e0f06 | 26275 | |
b3bbd485 | 26276 | @@ -1673,18 +1988,19 @@ int hrtimers_dead_cpu(unsigned int scpu) |
e4b2b4a8 JK |
26277 | void __init hrtimers_init(void) |
26278 | { | |
26279 | hrtimers_prepare_cpu(smp_processor_id()); | |
26280 | + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); | |
1a6e0f06 JK |
26281 | } |
26282 | ||
1a6e0f06 | 26283 | /** |
e4b2b4a8 JK |
26284 | * schedule_hrtimeout_range_clock - sleep until timeout |
26285 | * @expires: timeout value (ktime_t) | |
26286 | * @delta: slack in expires timeout (ktime_t) | |
26287 | - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL | |
26288 | - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME | |
26289 | + * @mode: timer mode | |
26290 | + * @clock_id: timer clock to be used | |
1a6e0f06 | 26291 | */ |
e4b2b4a8 JK |
26292 | int __sched |
26293 | schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, | |
26294 | - const enum hrtimer_mode mode, int clock) | |
26295 | + const enum hrtimer_mode mode, clockid_t clock_id) | |
26296 | { | |
26297 | struct hrtimer_sleeper t; | |
26298 | ||
b3bbd485 | 26299 | @@ -1705,11 +2021,9 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, |
e4b2b4a8 | 26300 | return -EINTR; |
1a6e0f06 JK |
26301 | } |
26302 | ||
e4b2b4a8 JK |
26303 | - hrtimer_init_on_stack(&t.timer, clock, mode); |
26304 | + hrtimer_init_sleeper_on_stack(&t, clock_id, mode, current); | |
26305 | hrtimer_set_expires_range_ns(&t.timer, *expires, delta); | |
26306 | ||
26307 | - hrtimer_init_sleeper(&t, current); | |
26308 | - | |
26309 | hrtimer_start_expires(&t.timer, mode); | |
26310 | ||
26311 | if (likely(t.task)) | |
b3bbd485 | 26312 | @@ -1727,7 +2041,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, |
e4b2b4a8 JK |
26313 | * schedule_hrtimeout_range - sleep until timeout |
26314 | * @expires: timeout value (ktime_t) | |
26315 | * @delta: slack in expires timeout (ktime_t) | |
26316 | - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL | |
26317 | + * @mode: timer mode | |
26318 | * | |
26319 | * Make the current task sleep until the given expiry time has | |
26320 | * elapsed. The routine will return immediately unless | |
b3bbd485 | 26321 | @@ -1766,7 +2080,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); |
e4b2b4a8 JK |
26322 | /** |
26323 | * schedule_hrtimeout - sleep until timeout | |
26324 | * @expires: timeout value (ktime_t) | |
26325 | - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL | |
26326 | + * @mode: timer mode | |
26327 | * | |
26328 | * Make the current task sleep until the given expiry time has | |
26329 | * elapsed. The routine will return immediately unless | |
b3bbd485 JK |
26330 | diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c |
26331 | index f26acef5d7b4..760f38528365 100644 | |
26332 | --- a/kernel/time/itimer.c | |
26333 | +++ b/kernel/time/itimer.c | |
26334 | @@ -214,6 +214,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) | |
e4b2b4a8 JK |
26335 | /* We are sharing ->siglock with it_real_fn() */ |
26336 | if (hrtimer_try_to_cancel(timer) < 0) { | |
26337 | spin_unlock_irq(&tsk->sighand->siglock); | |
26338 | + hrtimer_wait_for_timer(&tsk->signal->real_timer); | |
26339 | goto again; | |
26340 | } | |
26341 | expires = timeval_to_ktime(value->it_value); | |
b3bbd485 JK |
26342 | diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c |
26343 | index 497719127bf9..62acb8914c9e 100644 | |
26344 | --- a/kernel/time/jiffies.c | |
26345 | +++ b/kernel/time/jiffies.c | |
26346 | @@ -74,7 +74,8 @@ static struct clocksource clocksource_jiffies = { | |
e4b2b4a8 JK |
26347 | .max_cycles = 10, |
26348 | }; | |
26349 | ||
26350 | -__cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); | |
26351 | +__cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); | |
26352 | +__cacheline_aligned_in_smp seqcount_t jiffies_seq; | |
26353 | ||
26354 | #if (BITS_PER_LONG < 64) | |
26355 | u64 get_jiffies_64(void) | |
b3bbd485 | 26356 | @@ -83,9 +84,9 @@ u64 get_jiffies_64(void) |
e4b2b4a8 JK |
26357 | u64 ret; |
26358 | ||
26359 | do { | |
26360 | - seq = read_seqbegin(&jiffies_lock); | |
26361 | + seq = read_seqcount_begin(&jiffies_seq); | |
26362 | ret = jiffies_64; | |
26363 | - } while (read_seqretry(&jiffies_lock, seq)); | |
26364 | + } while (read_seqcount_retry(&jiffies_seq, seq)); | |
26365 | return ret; | |
1a6e0f06 | 26366 | } |
e4b2b4a8 | 26367 | EXPORT_SYMBOL(get_jiffies_64); |
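The matching write side pairs the raw spinlock with the seqcount, as the tick_periodic() hunk further below shows; a minimal sketch of the pattern:

    raw_spin_lock(&jiffies_lock);           /* serialize writers */
    write_seqcount_begin(&jiffies_seq);     /* make lockless readers retry */
    jiffies_64 += 1;                        /* the protected update */
    write_seqcount_end(&jiffies_seq);
    raw_spin_unlock(&jiffies_lock);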
b3bbd485 | 26368 | diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c |
5dd41b01 | 26369 | index 2da660d53a4b..c7b7d047d12e 100644 |
b3bbd485 JK |
26370 | --- a/kernel/time/posix-cpu-timers.c |
26371 | +++ b/kernel/time/posix-cpu-timers.c | |
e4b2b4a8 JK |
26372 | @@ -3,8 +3,10 @@ |
26373 | * Implement CPU time clocks for the POSIX clock interface. | |
26374 | */ | |
1a6e0f06 | 26375 | |
e4b2b4a8 JK |
26376 | +#include <uapi/linux/sched/types.h> |
26377 | #include <linux/sched/signal.h> | |
26378 | #include <linux/sched/cputime.h> | |
26379 | +#include <linux/sched/rt.h> | |
26380 | #include <linux/posix-timers.h> | |
26381 | #include <linux/errno.h> | |
26382 | #include <linux/math64.h> | |
26383 | @@ -14,6 +16,7 @@ | |
26384 | #include <linux/tick.h> | |
26385 | #include <linux/workqueue.h> | |
26386 | #include <linux/compat.h> | |
26387 | +#include <linux/smpboot.h> | |
1a6e0f06 | 26388 | |
e4b2b4a8 | 26389 | #include "posix-timers.h" |
1a6e0f06 | 26390 | |
b3bbd485 | 26391 | @@ -603,7 +606,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, |
e4b2b4a8 JK |
26392 | /* |
26393 | * Disarm any old timer after extracting its expiry time. | |
26394 | */ | |
26395 | - WARN_ON_ONCE(!irqs_disabled()); | |
26396 | + WARN_ON_ONCE_NONRT(!irqs_disabled()); | |
1a6e0f06 | 26397 | |
e4b2b4a8 JK |
26398 | ret = 0; |
26399 | old_incr = timer->it.cpu.incr; | |
b3bbd485 | 26400 | @@ -1034,7 +1037,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) |
e4b2b4a8 JK |
26401 | /* |
26402 | * Now re-arm for the new expiry time. | |
26403 | */ | |
26404 | - WARN_ON_ONCE(!irqs_disabled()); | |
26405 | + WARN_ON_ONCE_NONRT(!irqs_disabled()); | |
26406 | arm_timer(timer); | |
26407 | unlock: | |
26408 | unlock_task_sighand(p, &flags); | |
b3bbd485 | 26409 | @@ -1119,13 +1122,13 @@ static inline int fastpath_timer_check(struct task_struct *tsk) |
e4b2b4a8 JK |
26410 | * already updated our counts. We need to check if any timers fire now. |
26411 | * Interrupts are disabled. | |
26412 | */ | |
26413 | -void run_posix_cpu_timers(struct task_struct *tsk) | |
26414 | +static void __run_posix_cpu_timers(struct task_struct *tsk) | |
26415 | { | |
26416 | LIST_HEAD(firing); | |
26417 | struct k_itimer *timer, *next; | |
26418 | unsigned long flags; | |
1a6e0f06 | 26419 | |
e4b2b4a8 JK |
26420 | - WARN_ON_ONCE(!irqs_disabled()); |
26421 | + WARN_ON_ONCE_NONRT(!irqs_disabled()); | |
1a6e0f06 | 26422 | |
e4b2b4a8 JK |
26423 | /* |
26424 | * The fast path checks that there are no expired thread or thread | |
b3bbd485 | 26425 | @@ -1179,6 +1182,152 @@ void run_posix_cpu_timers(struct task_struct *tsk) |
1a6e0f06 | 26426 | } |
1a6e0f06 | 26427 | } |
1a6e0f06 | 26428 | |
e4b2b4a8 JK |
26429 | +#ifdef CONFIG_PREEMPT_RT_BASE |
26430 | +#include <linux/kthread.h> | |
26431 | +#include <linux/cpu.h> | |
26432 | +DEFINE_PER_CPU(struct task_struct *, posix_timer_task); | |
26433 | +DEFINE_PER_CPU(struct task_struct *, posix_timer_tasklist); | |
26434 | +DEFINE_PER_CPU(bool, posix_timer_th_active); | |
1a6e0f06 | 26435 | + |
e4b2b4a8 | 26436 | +static void posix_cpu_kthread_fn(unsigned int cpu) |
1a6e0f06 | 26437 | +{ |
e4b2b4a8 JK |
26438 | + struct task_struct *tsk = NULL; |
26439 | + struct task_struct *next = NULL; | |
1a6e0f06 | 26440 | + |
e4b2b4a8 JK |
26441 | + BUG_ON(per_cpu(posix_timer_task, cpu) != current); |
26442 | + | |
26443 | + /* grab task list */ | |
26444 | + raw_local_irq_disable(); | |
26445 | + tsk = per_cpu(posix_timer_tasklist, cpu); | |
26446 | + per_cpu(posix_timer_tasklist, cpu) = NULL; | |
26447 | + raw_local_irq_enable(); | |
26448 | + | |
26449 | + /* it's possible the list is empty, just return */ | |
26450 | + if (!tsk) | |
26451 | + return; | |
26452 | + | |
26453 | + /* Process task list */ | |
26454 | + while (1) { | |
26455 | + /* save next */ | |
26456 | + next = tsk->posix_timer_list; | |
26457 | + | |
26458 | + /* run the task's timers, clear its list pointer and | |
26459 | + * drop the task reference | |
26460 | + */ | |
26461 | + __run_posix_cpu_timers(tsk); | |
26462 | + tsk->posix_timer_list = NULL; | |
26463 | + put_task_struct(tsk); | |
26464 | + | |
26465 | + /* check if this is the last on the list */ | |
26466 | + if (next == tsk) | |
26467 | + break; | |
26468 | + tsk = next; | |
1a6e0f06 JK |
26469 | + } |
26470 | +} | |
26471 | + | |
e4b2b4a8 | 26472 | +static inline int __fastpath_timer_check(struct task_struct *tsk) |
1a6e0f06 | 26473 | +{ |
e4b2b4a8 JK |
26474 | + /* tsk == current, ensure it is safe to use ->signal/sighand */ |
26475 | + if (unlikely(tsk->exit_state)) | |
26476 | + return 0; | |
1a6e0f06 | 26477 | + |
e4b2b4a8 JK |
26478 | + if (!task_cputime_zero(&tsk->cputime_expires)) |
26479 | + return 1; | |
26480 | + | |
26481 | + if (!task_cputime_zero(&tsk->signal->cputime_expires)) | |
26482 | + return 1; | |
26483 | + | |
26484 | + return 0; | |
1a6e0f06 JK |
26485 | +} |
26486 | + | |
e4b2b4a8 JK |
26487 | +void run_posix_cpu_timers(struct task_struct *tsk) |
26488 | +{ | |
26489 | + unsigned int cpu = smp_processor_id(); | |
26490 | + struct task_struct *tasklist; | |
1a6e0f06 | 26491 | + |
e4b2b4a8 JK |
26492 | + BUG_ON(!irqs_disabled()); |
26493 | + | |
26494 | + if (per_cpu(posix_timer_th_active, cpu) != true) | |
26495 | + return; | |
26496 | + | |
26497 | + /* get per-cpu references */ | |
26498 | + tasklist = per_cpu(posix_timer_tasklist, cpu); | |
26499 | + | |
26500 | + /* check to see if we're already queued */ | |
26501 | + if (!tsk->posix_timer_list && __fastpath_timer_check(tsk)) { | |
26502 | + get_task_struct(tsk); | |
26503 | + if (tasklist) { | |
26504 | + tsk->posix_timer_list = tasklist; | |
26505 | + } else { | |
26506 | + /* | |
26507 | + * The list is terminated by a self-pointing | |
26508 | + * task_struct | |
26509 | + */ | |
26510 | + tsk->posix_timer_list = tsk; | |
26511 | + } | |
26512 | + per_cpu(posix_timer_tasklist, cpu) = tsk; | |
26513 | + | |
26514 | + wake_up_process(per_cpu(posix_timer_task, cpu)); | |
26515 | + } | |
26516 | +} | |
26517 | + | |
26518 | +static int posix_cpu_kthread_should_run(unsigned int cpu) | |
1a6e0f06 | 26519 | +{ |
e4b2b4a8 | 26520 | + return __this_cpu_read(posix_timer_tasklist) != NULL; |
1a6e0f06 | 26521 | +} |
1a6e0f06 | 26522 | + |
e4b2b4a8 | 26523 | +static void posix_cpu_kthread_park(unsigned int cpu) |
1a6e0f06 | 26524 | +{ |
e4b2b4a8 | 26525 | + this_cpu_write(posix_timer_th_active, false); |
1a6e0f06 JK |
26526 | +} |
26527 | + | |
e4b2b4a8 | 26528 | +static void posix_cpu_kthread_unpark(unsigned int cpu) |
1a6e0f06 | 26529 | +{ |
e4b2b4a8 | 26530 | + this_cpu_write(posix_timer_th_active, true); |
1a6e0f06 | 26531 | +} |
1a6e0f06 | 26532 | + |
e4b2b4a8 JK |
26533 | +static void posix_cpu_kthread_setup(unsigned int cpu) |
26534 | +{ | |
26535 | + struct sched_param sp; | |
26536 | + | |
26537 | + sp.sched_priority = MAX_RT_PRIO - 1; | |
26538 | + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | |
26539 | + posix_cpu_kthread_unpark(cpu); | |
26540 | +} | |
26541 | + | |
26542 | +static struct smp_hotplug_thread posix_cpu_thread = { | |
26543 | + .store = &posix_timer_task, | |
26544 | + .thread_should_run = posix_cpu_kthread_should_run, | |
26545 | + .thread_fn = posix_cpu_kthread_fn, | |
26546 | + .thread_comm = "posixcputmr/%u", | |
26547 | + .setup = posix_cpu_kthread_setup, | |
26548 | + .park = posix_cpu_kthread_park, | |
26549 | + .unpark = posix_cpu_kthread_unpark, | |
26550 | +}; | |
26551 | + | |
26552 | +static int __init posix_cpu_thread_init(void) | |
1a6e0f06 | 26553 | +{ |
e4b2b4a8 JK |
26554 | + /* Start one for boot CPU. */ |
26555 | + unsigned long cpu; | |
26556 | + int ret; | |
26557 | + | |
26558 | + /* init the per-cpu posix_timer_tasklets */ | |
26559 | + for_each_possible_cpu(cpu) | |
26560 | + per_cpu(posix_timer_tasklist, cpu) = NULL; | |
26561 | + | |
26562 | + ret = smpboot_register_percpu_thread(&posix_cpu_thread); | |
26563 | + WARN_ON(ret); | |
26564 | + | |
1a6e0f06 JK |
26565 | + return 0; |
26566 | +} | |
e4b2b4a8 JK |
26567 | +early_initcall(posix_cpu_thread_init); |
26568 | +#else /* CONFIG_PREEMPT_RT_BASE */ | |
26569 | +void run_posix_cpu_timers(struct task_struct *tsk) | |
26570 | +{ | |
26571 | + __run_posix_cpu_timers(tsk); | |
26572 | +} | |
26573 | +#endif /* CONFIG_PREEMPT_RT_BASE */ | |
26574 | + | |
26575 | /* | |
26576 | * Set one of the process-wide special case CPU timers or RLIMIT_CPU. | |
26577 | * The tsk->sighand->siglock must be held by the caller. | |
b3bbd485 | 26578 | diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c |
5dd41b01 | 26579 | index 55d45fe2cc17..5a59538f3d16 100644 |
b3bbd485 JK |
26580 | --- a/kernel/time/posix-timers.c |
26581 | +++ b/kernel/time/posix-timers.c | |
5dd41b01 | 26582 | @@ -443,6 +443,7 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) |
e4b2b4a8 JK |
26583 | static struct pid *good_sigevent(sigevent_t * event) |
26584 | { | |
26585 | struct task_struct *rtn = current->group_leader; | |
26586 | + int sig = event->sigev_signo; | |
26587 | ||
26588 | switch (event->sigev_notify) { | |
26589 | case SIGEV_SIGNAL | SIGEV_THREAD_ID: | |
5dd41b01 | 26590 | @@ -452,7 +453,8 @@ static struct pid *good_sigevent(sigevent_t * event) |
e4b2b4a8 JK |
26591 | /* FALLTHRU */ |
26592 | case SIGEV_SIGNAL: | |
26593 | case SIGEV_THREAD: | |
26594 | - if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) | |
26595 | + if (sig <= 0 || sig > SIGRTMAX || | |
26596 | + sig_kernel_only(sig) || sig_kernel_coredump(sig)) | |
26597 | return NULL; | |
26598 | /* FALLTHRU */ | |
26599 | case SIGEV_NONE: | |
5dd41b01 | 26600 | @@ -478,7 +480,7 @@ static struct k_itimer * alloc_posix_timer(void) |
e4b2b4a8 JK |
26601 | |
26602 | static void k_itimer_rcu_free(struct rcu_head *head) | |
26603 | { | |
26604 | - struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); | |
26605 | + struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); | |
26606 | ||
26607 | kmem_cache_free(posix_timers_cache, tmr); | |
26608 | } | |
5dd41b01 | 26609 | @@ -495,7 +497,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) |
e4b2b4a8 JK |
26610 | } |
26611 | put_pid(tmr->it_pid); | |
26612 | sigqueue_free(tmr->sigq); | |
26613 | - call_rcu(&tmr->it.rcu, k_itimer_rcu_free); | |
26614 | + call_rcu(&tmr->rcu, k_itimer_rcu_free); | |
26615 | } | |
26616 | ||
26617 | static int common_timer_create(struct k_itimer *new_timer) | |
5dd41b01 | 26618 | @@ -834,6 +836,22 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, |
e4b2b4a8 JK |
26619 | hrtimer_start_expires(timer, HRTIMER_MODE_ABS); |
26620 | } | |
26621 | ||
26622 | +/* | |
26623 | + * Protected by RCU! | |
26624 | + */ | |
26625 | +static void timer_wait_for_callback(const struct k_clock *kc, struct k_itimer *timr) | |
26626 | +{ | |
26627 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
26628 | + if (kc->timer_arm == common_hrtimer_arm) | |
26629 | + hrtimer_wait_for_timer(&timr->it.real.timer); | |
26630 | + else if (kc == &alarm_clock) | |
26631 | + hrtimer_wait_for_timer(&timr->it.alarm.alarmtimer.timer); | |
26632 | + else | |
26633 | + /* FIXME: Whacky hack for posix-cpu-timers */ | |
26634 | + schedule_timeout(1); | |
1a6e0f06 | 26635 | +#endif |
e4b2b4a8 | 26636 | +} |
1a6e0f06 | 26637 | + |
e4b2b4a8 JK |
26638 | static int common_hrtimer_try_to_cancel(struct k_itimer *timr) |
26639 | { | |
26640 | return hrtimer_try_to_cancel(&timr->it.real.timer); | |
5dd41b01 | 26641 | @@ -898,6 +916,7 @@ static int do_timer_settime(timer_t timer_id, int flags, |
e4b2b4a8 JK |
26642 | if (!timr) |
26643 | return -EINVAL; | |
26644 | ||
26645 | + rcu_read_lock(); | |
26646 | kc = timr->kclock; | |
26647 | if (WARN_ON_ONCE(!kc || !kc->timer_set)) | |
26648 | error = -EINVAL; | |
5dd41b01 | 26649 | @@ -906,9 +925,12 @@ static int do_timer_settime(timer_t timer_id, int flags, |
e4b2b4a8 JK |
26650 | |
26651 | unlock_timer(timr, flag); | |
26652 | if (error == TIMER_RETRY) { | |
26653 | + timer_wait_for_callback(kc, timr); | |
26654 | old_spec64 = NULL; // We already got the old time... | |
26655 | + rcu_read_unlock(); | |
26656 | goto retry; | |
26657 | } | |
26658 | + rcu_read_unlock(); | |
26659 | ||
26660 | return error; | |
26661 | } | |
5dd41b01 | 26662 | @@ -990,10 +1012,15 @@ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) |
e4b2b4a8 JK |
26663 | if (!timer) |
26664 | return -EINVAL; | |
26665 | ||
26666 | + rcu_read_lock(); | |
26667 | if (timer_delete_hook(timer) == TIMER_RETRY) { | |
26668 | unlock_timer(timer, flags); | |
26669 | + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), | |
26670 | + timer); | |
26671 | + rcu_read_unlock(); | |
26672 | goto retry_delete; | |
26673 | } | |
26674 | + rcu_read_unlock(); | |
26675 | ||
26676 | spin_lock(¤t->sighand->siglock); | |
26677 | list_del(&timer->list); | |
5dd41b01 | 26678 | @@ -1019,8 +1046,18 @@ static void itimer_delete(struct k_itimer *timer) |
e4b2b4a8 JK |
26679 | retry_delete: |
26680 | spin_lock_irqsave(&timer->it_lock, flags); | |
26681 | ||
26682 | + /* On RT we can race with a deletion */ | |
26683 | + if (!timer->it_signal) { | |
26684 | + unlock_timer(timer, flags); | |
26685 | + return; | |
26686 | + } | |
26687 | + | |
26688 | if (timer_delete_hook(timer) == TIMER_RETRY) { | |
26689 | + rcu_read_lock(); | |
26690 | unlock_timer(timer, flags); | |
26691 | + timer_wait_for_callback(clockid_to_kclock(timer->it_clock), | |
26692 | + timer); | |
26693 | + rcu_read_unlock(); | |
26694 | goto retry_delete; | |
26695 | } | |
26696 | list_del(&timer->list); | |
b3bbd485 JK |
26697 | diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c |
26698 | index 58045eb976c3..f0a34afbc252 100644 | |
26699 | --- a/kernel/time/tick-broadcast-hrtimer.c | |
26700 | +++ b/kernel/time/tick-broadcast-hrtimer.c | |
26701 | @@ -106,7 +106,7 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) | |
e4b2b4a8 JK |
26702 | |
26703 | void tick_setup_hrtimer_broadcast(void) | |
1a6e0f06 | 26704 | { |
e4b2b4a8 JK |
26705 | - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); |
26706 | + hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); | |
26707 | bctimer.function = bc_handler; | |
26708 | clockevents_register_device(&ce_broadcast_hrtimer); | |
26709 | } | |
b3bbd485 JK |
26710 | diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c |
26711 | index 49edc1c4f3e6..7a87a4488a5e 100644 | |
26712 | --- a/kernel/time/tick-common.c | |
26713 | +++ b/kernel/time/tick-common.c | |
26714 | @@ -79,13 +79,15 @@ int tick_is_oneshot_available(void) | |
e4b2b4a8 JK |
26715 | static void tick_periodic(int cpu) |
26716 | { | |
26717 | if (tick_do_timer_cpu == cpu) { | |
26718 | - write_seqlock(&jiffies_lock); | |
26719 | + raw_spin_lock(&jiffies_lock); | |
26720 | + write_seqcount_begin(&jiffies_seq); | |
1a6e0f06 | 26721 | |
e4b2b4a8 JK |
26722 | /* Keep track of the next tick event */ |
26723 | tick_next_period = ktime_add(tick_next_period, tick_period); | |
1a6e0f06 | 26724 | |
e4b2b4a8 JK |
26725 | do_timer(1); |
26726 | - write_sequnlock(&jiffies_lock); | |
26727 | + write_seqcount_end(&jiffies_seq); | |
26728 | + raw_spin_unlock(&jiffies_lock); | |
26729 | update_wall_time(); | |
26730 | } | |
1a6e0f06 | 26731 | |
b3bbd485 | 26732 | @@ -157,9 +159,9 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) |
e4b2b4a8 | 26733 | ktime_t next; |
1a6e0f06 | 26734 | |
e4b2b4a8 JK |
26735 | do { |
26736 | - seq = read_seqbegin(&jiffies_lock); | |
26737 | + seq = read_seqcount_begin(&jiffies_seq); | |
26738 | next = tick_next_period; | |
26739 | - } while (read_seqretry(&jiffies_lock, seq)); | |
26740 | + } while (read_seqcount_retry(&jiffies_seq, seq)); | |
1a6e0f06 | 26741 | |
e4b2b4a8 JK |
26742 | clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); |
26743 | ||
b3bbd485 JK |
26744 | @@ -490,6 +492,7 @@ void tick_freeze(void) |
26745 | if (tick_freeze_depth == num_online_cpus()) { | |
26746 | trace_suspend_resume(TPS("timekeeping_freeze"), | |
26747 | smp_processor_id(), true); | |
26748 | + system_state = SYSTEM_SUSPEND; | |
26749 | timekeeping_suspend(); | |
26750 | } else { | |
26751 | tick_suspend_local(); | |
26752 | @@ -513,6 +516,7 @@ void tick_unfreeze(void) | |
26753 | ||
26754 | if (tick_freeze_depth == num_online_cpus()) { | |
26755 | timekeeping_resume(); | |
26756 | + system_state = SYSTEM_RUNNING; | |
26757 | trace_suspend_resume(TPS("timekeeping_freeze"), | |
26758 | smp_processor_id(), false); | |
26759 | } else { | |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f8e1845aa464..e277284c2831 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { }

 #ifdef CONFIG_NO_HZ_COMMON
 extern unsigned long tick_nohz_active;
-#else
+extern void timers_update_nohz(void);
+# ifdef CONFIG_SMP
+extern struct static_key_false timers_migration_enabled;
+# endif
+#else /* CONFIG_NO_HZ_COMMON */
+static inline void timers_update_nohz(void) { }
 #define tick_nohz_active (0)
 #endif

-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(bool update_nohz);
-#else
-static inline void timers_update_migration(bool update_nohz) { }
-#endif
-
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);

 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a8fa0a896b78..643b36a0b8e1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -66,7 +66,8 @@ static void tick_do_update_jiffies64(ktime_t now)
 		return;

 	/* Reevaluate with jiffies_lock held */
-	write_seqlock(&jiffies_lock);
+	raw_spin_lock(&jiffies_lock);
+	write_seqcount_begin(&jiffies_seq);

 	delta = ktime_sub(now, last_jiffies_update);
 	if (delta >= tick_period) {
@@ -89,10 +90,12 @@ static void tick_do_update_jiffies64(ktime_t now)
 		/* Keep the tick_next_period variable up to date */
 		tick_next_period = ktime_add(last_jiffies_update, tick_period);
 	} else {
-		write_sequnlock(&jiffies_lock);
+		write_seqcount_end(&jiffies_seq);
+		raw_spin_unlock(&jiffies_lock);
 		return;
 	}
-	write_sequnlock(&jiffies_lock);
+	write_seqcount_end(&jiffies_seq);
+	raw_spin_unlock(&jiffies_lock);
 	update_wall_time();
 }

@@ -103,12 +106,14 @@ static ktime_t tick_init_jiffy_update(void)
 {
 	ktime_t period;

-	write_seqlock(&jiffies_lock);
+	raw_spin_lock(&jiffies_lock);
+	write_seqcount_begin(&jiffies_seq);
 	/* Did we start the jiffies update yet ? */
 	if (last_jiffies_update == 0)
 		last_jiffies_update = tick_next_period;
 	period = last_jiffies_update;
-	write_sequnlock(&jiffies_lock);
+	write_seqcount_end(&jiffies_seq);
+	raw_spin_unlock(&jiffies_lock);
 	return period;
 }

@@ -225,6 +230,7 @@ static void nohz_full_kick_func(struct irq_work *work)

 static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
 	.func = nohz_full_kick_func,
+	.flags = IRQ_WORK_HARD_IRQ,
 };

 /*
@@ -689,10 +695,10 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,

 	/* Read jiffies and the time when jiffies were updated last */
 	do {
-		seq = read_seqbegin(&jiffies_lock);
+		seq = read_seqcount_begin(&jiffies_seq);
 		basemono = last_jiffies_update;
 		basejiff = jiffies;
-	} while (read_seqretry(&jiffies_lock, seq));
+	} while (read_seqcount_retry(&jiffies_seq, seq));
 	ts->last_jiffies = basejiff;

 	/*
@@ -906,14 +912,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
 		return false;

 	if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
-		static int ratelimit;
-
-		if (ratelimit < 10 && !in_softirq() &&
-		    (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
-			pr_warn("NOHZ: local_softirq_pending %02x\n",
-				(unsigned int) local_softirq_pending());
-			ratelimit++;
-		}
+		softirq_check_pending_idle();
 		return false;
 	}

@@ -1132,7 +1131,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 	ts->nohz_mode = mode;
 	/* One update is enough */
 	if (!test_and_set_bit(0, &tick_nohz_active))
-		timers_update_migration(true);
+		timers_update_nohz();
 }

 /**
@@ -1250,7 +1249,7 @@ void tick_setup_sched_timer(void)
 	/*
 	 * Emulate tick processing via per-CPU hrtimers:
 	 */
-	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
 	ts->sched_timer.function = tick_sched_timer;

 	/* Get the next period (per-CPU) */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..2720f2c29a6d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -2326,8 +2326,10 @@ EXPORT_SYMBOL(hardpps);
  */
 void xtime_update(unsigned long ticks)
 {
-	write_seqlock(&jiffies_lock);
+	raw_spin_lock(&jiffies_lock);
+	write_seqcount_begin(&jiffies_seq);
 	do_timer(ticks);
-	write_sequnlock(&jiffies_lock);
+	write_seqcount_end(&jiffies_seq);
+	raw_spin_unlock(&jiffies_lock);
 	update_wall_time();
 }
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index c9f9af339914..0c0f52bf1927 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -18,7 +18,8 @@ extern void timekeeping_resume(void);
 extern void do_timer(unsigned long ticks);
 extern void update_wall_time(void);

-extern seqlock_t jiffies_lock;
+extern raw_spinlock_t jiffies_lock;
+extern seqcount_t jiffies_seq;

 #define CS_NAME_LEN 32

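The hunks above split the jiffies_lock seqlock_t into a raw_spinlock_t (writer exclusion) plus a bare seqcount_t (reader retry). On PREEMPT_RT a seqlock's embedded spinlock becomes a sleeping lock, but the tick path that advances jiffies must stay non-preemptible, hence the raw lock plus hand-driven sequence counter. As a rough illustration of the reader/writer protocol, here is a portable C11 sketch (names and memory-ordering details are illustrative, not kernel code):

    #include <stdatomic.h>
    #include <stdint.h>

    static atomic_uint      seq;    /* even: stable, odd: write in progress */
    static _Atomic uint64_t datum;  /* the value the counter protects */

    /* Writer: caller already serializes writers (the raw spinlock in the patch). */
    static void seq_write(uint64_t v)
    {
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);  /* -> odd */
        atomic_thread_fence(memory_order_release);
        atomic_store_explicit(&datum, v, memory_order_relaxed);
        atomic_fetch_add_explicit(&seq, 1, memory_order_release);  /* -> even */
    }

    /* Reader: lockless; retries if a write raced with the read. */
    static uint64_t seq_read(void)
    {
        unsigned int s;
        uint64_t v;

        do {
            while ((s = atomic_load_explicit(&seq, memory_order_acquire)) & 1u)
                ;                                    /* writer active: wait */
            v = atomic_load_explicit(&datum, memory_order_relaxed);
            atomic_thread_fence(memory_order_acquire);
        } while (atomic_load_explicit(&seq, memory_order_relaxed) != s);
        return v;
    }

The writer bumps the counter to odd before touching the datum and back to even afterwards; a reader that observes an odd or changed counter simply retries, which is exactly the read_seqcount_begin()/read_seqcount_retry() loop in the hunks above.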
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index f17c76a1a05f..5fadd754ce20 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -44,6 +44,7 @@
 #include <linux/sched/debug.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/swait.h>

 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -197,11 +198,12 @@ EXPORT_SYMBOL(jiffies_64);
 struct timer_base {
 	raw_spinlock_t		lock;
 	struct timer_list	*running_timer;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct swait_queue_head	wait_for_running_timer;
+#endif
 	unsigned long		clk;
 	unsigned long		next_expiry;
 	unsigned int		cpu;
-	bool			migration_enabled;
-	bool			nohz_active;
 	bool			is_idle;
 	bool			must_forward_clk;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
@@ -210,45 +212,73 @@ struct timer_base {

 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#ifdef CONFIG_NO_HZ_COMMON
+
+static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
+static DEFINE_MUTEX(timer_keys_mutex);
+
+static struct swork_event timer_update_swork;
+
+#ifdef CONFIG_SMP
 unsigned int sysctl_timer_migration = 1;

-void timers_update_migration(bool update_nohz)
+DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
+
+static void timers_update_migration(void)
 {
 	bool on = sysctl_timer_migration && tick_nohz_active;
-	unsigned int cpu;

-	/* Avoid the loop, if nothing to update */
-	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)
-		return;
+	if (on)
+		static_branch_enable(&timers_migration_enabled);
+	else
+		static_branch_disable(&timers_migration_enabled);
+}
+#else
+static inline void timers_update_migration(void) { }
+#endif /* !CONFIG_SMP */

-	for_each_possible_cpu(cpu) {
-		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on;
-		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;
-		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
-		if (!update_nohz)
-			continue;
-		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true;
-		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;
-		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
-	}
+static void timer_update_keys(struct swork_event *event)
+{
+	mutex_lock(&timer_keys_mutex);
+	timers_update_migration();
+	static_branch_enable(&timers_nohz_active);
+	mutex_unlock(&timer_keys_mutex);
+}
+
+void timers_update_nohz(void)
+{
+	swork_queue(&timer_update_swork);
+}
+
+static __init int hrtimer_init_thread(void)
+{
+	WARN_ON(swork_get());
+	INIT_SWORK(&timer_update_swork, timer_update_keys);
+	return 0;
 }
+early_initcall(hrtimer_init_thread);

 int timer_migration_handler(struct ctl_table *table, int write,
 			    void __user *buffer, size_t *lenp,
 			    loff_t *ppos)
 {
-	static DEFINE_MUTEX(mutex);
 	int ret;

-	mutex_lock(&mutex);
+	mutex_lock(&timer_keys_mutex);
 	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (!ret && write)
-		timers_update_migration(false);
-	mutex_unlock(&mutex);
+		timers_update_migration();
+	mutex_unlock(&timer_keys_mutex);
 	return ret;
 }
-#endif
+
+static inline bool is_timers_nohz_active(void)
+{
+	return static_branch_unlikely(&timers_nohz_active);
+}
+#else
+static inline bool is_timers_nohz_active(void) { return false; }
+#endif /* NO_HZ_COMMON */

 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
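The hunk above replaces the per-CPU migration_enabled/nohz_active booleans with static keys, so the hot enqueue path costs a patched NOP rather than a per-CPU load while NOHZ is inactive. A minimal kernel-style sketch of the pattern (hypothetical names, not from the patch):

    #include <linux/jump_label.h>
    #include <linux/types.h>

    static DEFINE_STATIC_KEY_FALSE(feature_enabled);

    /* Slow path, configuration time: may sleep while patching code. */
    void feature_set(bool on)
    {
        if (on)
            static_branch_enable(&feature_enabled);
        else
            static_branch_disable(&feature_enabled);
    }

    /* Hot path, e.g. timer enqueue: compiles to straight-line code
     * plus a NOP that is rewritten into a jump when the key flips.
     */
    bool feature_hot_path(void)
    {
        if (static_branch_unlikely(&feature_enabled))
            return true;    /* rare case, reached via patched jump */
        return false;       /* common case, no branch taken */
    }

Because enabling a static key rewrites kernel text and may sleep, the patch defers the flip to the swork helper (timer_update_keys()) instead of toggling it directly from the timer softirq path.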
@@ -534,7 +564,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer)
 static void
 trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
 {
-	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+	if (!is_timers_nohz_active())
 		return;

 	/*
@@ -840,21 +870,20 @@ static inline struct timer_base *get_timer_base(u32 tflags)
 	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
 }

-#ifdef CONFIG_NO_HZ_COMMON
 static inline struct timer_base *
 get_target_base(struct timer_base *base, unsigned tflags)
 {
-#ifdef CONFIG_SMP
-	if ((tflags & TIMER_PINNED) || !base->migration_enabled)
-		return get_timer_this_cpu_base(tflags);
-	return get_timer_cpu_base(tflags, get_nohz_timer_target());
-#else
-	return get_timer_this_cpu_base(tflags);
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	if (static_branch_unlikely(&timers_migration_enabled) &&
+	    !(tflags & TIMER_PINNED))
+		return get_timer_cpu_base(tflags, get_nohz_timer_target());
 #endif
+	return get_timer_this_cpu_base(tflags);
 }

 static inline void forward_timer_base(struct timer_base *base)
 {
+#ifdef CONFIG_NO_HZ_COMMON
 	unsigned long jnow;

 	/*
@@ -878,16 +907,8 @@ static inline void forward_timer_base(struct timer_base *base)
 		base->clk = jnow;
 	else
 		base->clk = base->next_expiry;
-}
-#else
-static inline struct timer_base *
-get_target_base(struct timer_base *base, unsigned tflags)
-{
-	return get_timer_this_cpu_base(tflags);
-}
-
-static inline void forward_timer_base(struct timer_base *base) { }
 #endif
+}


 /*
@@ -1130,6 +1151,33 @@ void add_timer_on(struct timer_list *timer, int cpu)
 }
 EXPORT_SYMBOL_GPL(add_timer_on);

+#ifdef CONFIG_PREEMPT_RT_FULL
+/*
+ * Wait for a running timer
+ */
+static void wait_for_running_timer(struct timer_list *timer)
+{
+	struct timer_base *base;
+	u32 tf = timer->flags;
+
+	if (tf & TIMER_MIGRATING)
+		return;
+
+	base = get_timer_base(tf);
+	swait_event(base->wait_for_running_timer,
+		    base->running_timer != timer);
+}
+
+# define wakeup_timer_waiters(b)	swake_up_all(&(b)->wait_for_running_timer)
+#else
+static inline void wait_for_running_timer(struct timer_list *timer)
+{
+	cpu_relax();
+}
+
+# define wakeup_timer_waiters(b)	do { } while (0)
+#endif
+
 /**
  * del_timer - deactivate a timer.
  * @timer: the timer to be deactivated
@@ -1185,7 +1233,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
 }
 EXPORT_SYMBOL(try_to_del_timer_sync);

-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT_FULL)
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1245,7 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
-		cpu_relax();
+		wait_for_running_timer(timer);
 	}
 }
 EXPORT_SYMBOL(del_timer_sync);
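On PREEMPT_RT timer callbacks run in preemptible softirq threads, so a deleter spinning in cpu_relax() could livelock against the very callback it is waiting for. The hunks above therefore park the deleter on a per-base simple waitqueue, and __run_timers() wakes it via wakeup_timer_waiters() once running_timer is cleared (see the later hunk in this file). A userspace analogue of that blocking scheme (hypothetical, pthread-based, not kernel code):

    #include <pthread.h>

    static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  run_done  = PTHREAD_COND_INITIALIZER;
    static const void *running_timer;   /* timer whose callback is executing */

    /* Deleter: blocks instead of spinning until the callback finishes. */
    void wait_for_timer(const void *timer)
    {
        pthread_mutex_lock(&base_lock);
        while (running_timer == timer)
            pthread_cond_wait(&run_done, &base_lock);
        pthread_mutex_unlock(&base_lock);
    }

    /* Expiry path: clears the marker and wakes all waiters,
     * mirroring base->running_timer = NULL + swake_up_all().
     */
    void expiry_path_done(void)
    {
        pthread_mutex_lock(&base_lock);
        running_timer = NULL;
        pthread_cond_broadcast(&run_done);
        pthread_mutex_unlock(&base_lock);
    }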
@@ -1309,13 +1357,16 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		fn = timer->function;
 		data = timer->data;

-		if (timer->flags & TIMER_IRQSAFE) {
+		if (!IS_ENABLED(CONFIG_PREEMPT_RT_FULL) &&
+		    timer->flags & TIMER_IRQSAFE) {
 			raw_spin_unlock(&base->lock);
 			call_timer_fn(timer, fn, data);
+			base->running_timer = NULL;
 			raw_spin_lock(&base->lock);
 		} else {
 			raw_spin_unlock_irq(&base->lock);
 			call_timer_fn(timer, fn, data);
+			base->running_timer = NULL;
 			raw_spin_lock_irq(&base->lock);
 		}
 	}
@@ -1586,7 +1637,7 @@ void update_process_times(int user_tick)
 	account_process_tick(p, user_tick);
 	run_local_timers();
 	rcu_check_callbacks(user_tick);
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK)
 	if (in_irq())
 		irq_work_tick();
 #endif
@@ -1633,8 +1684,8 @@ static inline void __run_timers(struct timer_base *base)
 		while (levels--)
 			expire_timers(base, heads + levels);
 	}
-	base->running_timer = NULL;
 	raw_spin_unlock_irq(&base->lock);
+	wakeup_timer_waiters(base);
 }

 /*
@@ -1644,6 +1695,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
 {
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

+	irq_work_tick_soft();
 	__run_timers(base);
 	if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
 		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
@@ -1867,6 +1919,9 @@ static void __init init_timer_cpu(int cpu)
 		base->cpu = cpu;
 		raw_spin_lock_init(&base->lock);
 		base->clk = jiffies;
+#ifdef CONFIG_PREEMPT_RT_FULL
+		init_swait_queue_head(&base->wait_for_running_timer);
+#endif
 	}
 }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4ad6f6ca18c1..55d39a3fbdf7 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -585,7 +585,10 @@ config HIST_TRIGGERS
 	  event activity as an initial guide for further investigation
 	  using more advanced tools.

-	  See Documentation/trace/events.txt.
+	  Inter-event tracing of quantities such as latencies is also
+	  supported using hist triggers under this option.
+
+	  See Documentation/trace/histogram.txt.
 	  If in doubt, say N.

 config MMIOTRACE_TEST
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1d5e0949dcf..e8ca1e01facd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -41,6 +41,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 			 RINGBUF_TYPE_PADDING);
 	trace_seq_printf(s, "\ttime_extend : type == %d\n",
 			 RINGBUF_TYPE_TIME_EXTEND);
+	trace_seq_printf(s, "\ttime_stamp : type == %d\n",
+			 RINGBUF_TYPE_TIME_STAMP);
 	trace_seq_printf(s, "\tdata max type_len == %d\n",
 			 RINGBUF_TYPE_DATA_TYPE_LEN_MAX);

@@ -140,12 +142,15 @@ int ring_buffer_print_entry_header(struct trace_seq *s)

 enum {
 	RB_LEN_TIME_EXTEND = 8,
-	RB_LEN_TIME_STAMP = 16,
+	RB_LEN_TIME_STAMP = 8,
 };

 #define skip_time_extend(event) \
 	((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))

+#define extended_time(event) \
+	(event->type_len >= RINGBUF_TYPE_TIME_EXTEND)
+
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
 	return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -209,7 +214,7 @@ rb_event_ts_length(struct ring_buffer_event *event)
 {
 	unsigned len = 0;

-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+	if (extended_time(event)) {
 		/* time extends include the data event after it */
 		len = RB_LEN_TIME_EXTEND;
 		event = skip_time_extend(event);
@@ -231,7 +236,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	unsigned length;

-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);

 	length = rb_event_length(event);
@@ -248,7 +253,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static __always_inline void *
 rb_event_data(struct ring_buffer_event *event)
 {
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);
 	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
@@ -275,6 +280,27 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 #define TS_MASK		((1ULL << TS_SHIFT) - 1)
 #define TS_DELTA_TEST	(~TS_MASK)

+/**
+ * ring_buffer_event_time_stamp - return the event's extended timestamp
+ * @event: the event to get the timestamp of
+ *
+ * Returns the extended timestamp associated with a data event.
+ * An extended time_stamp is a 64-bit timestamp represented
+ * internally in a special way that makes the best use of space
+ * contained within a ring buffer event. This function decodes
+ * it and maps it to a straight u64 value.
+ */
+u64 ring_buffer_event_time_stamp(struct ring_buffer_event *event)
+{
+	u64 ts;
+
+	ts = event->array[0];
+	ts <<= TS_SHIFT;
+	ts += event->time_delta;
+
+	return ts;
+}
+
 /* Flag when events were overwritten */
 #define RB_MISSED_EVENTS	(1 << 31)
 /* Missed count stored at end */
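ring_buffer_event_time_stamp() above undoes the split encoding that rb_add_time_stamp() produces: the low TS_SHIFT bits of the 64-bit timestamp travel in event->time_delta and the upper bits in event->array[0]. A standalone round-trip demonstration (assuming TS_SHIFT is 27, its value in this file; the struct is a simplification, not the real ring_buffer_event):

    #include <stdint.h>
    #include <stdio.h>

    #define TS_SHIFT 27
    #define TS_MASK  ((1ULL << TS_SHIFT) - 1)

    struct fake_event {
        uint32_t time_delta;   /* low TS_SHIFT bits of the timestamp */
        uint32_t array0;       /* remaining high bits */
    };

    static void encode(struct fake_event *e, uint64_t ts)
    {
        e->time_delta = (uint32_t)(ts & TS_MASK);   /* mirrors rb_add_time_stamp() */
        e->array0     = (uint32_t)(ts >> TS_SHIFT);
    }

    static uint64_t decode(const struct fake_event *e)
    {
        uint64_t ts = e->array0;       /* mirrors ring_buffer_event_time_stamp() */
        ts <<= TS_SHIFT;
        ts += e->time_delta;
        return ts;
    }

    int main(void)
    {
        struct fake_event e;
        encode(&e, 123456789012345ULL);
        printf("%llu\n", (unsigned long long)decode(&e)); /* 123456789012345 */
        return 0;
    }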
@@ -451,6 +477,7 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*reader_page;
 	unsigned long			lost_events;
 	unsigned long			last_overrun;
+	unsigned long			nest;
 	local_t				entries_bytes;
 	local_t				entries;
 	local_t				overrun;
@@ -488,6 +515,7 @@ struct ring_buffer {
 	u64				(*clock)(void);

 	struct rb_irq_work		irq_work;
+	bool				time_stamp_abs;
 };

 struct ring_buffer_iter {
@@ -1387,6 +1415,16 @@ void ring_buffer_set_clock(struct ring_buffer *buffer,
 	buffer->clock = clock;
 }

+void ring_buffer_set_time_stamp_abs(struct ring_buffer *buffer, bool abs)
+{
+	buffer->time_stamp_abs = abs;
+}
+
+bool ring_buffer_time_stamp_abs(struct ring_buffer *buffer)
+{
+	return buffer->time_stamp_abs;
+}
+
 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);

 static inline unsigned long rb_page_entries(struct buffer_page *bpage)
@@ -2219,12 +2257,15 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,

 /* Slow path, do not inline */
 static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
 {
-	event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+	if (abs)
+		event->type_len = RINGBUF_TYPE_TIME_STAMP;
+	else
+		event->type_len = RINGBUF_TYPE_TIME_EXTEND;

-	/* Not the first event on the page? */
-	if (rb_event_index(event)) {
+	/* Not the first event on the page, or not delta? */
+	if (abs || rb_event_index(event)) {
 		event->time_delta = delta & TS_MASK;
 		event->array[0] = delta >> TS_SHIFT;
 	} else {
@@ -2267,7 +2308,9 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * add it to the start of the resevered space.
 	 */
 	if (unlikely(info->add_timestamp)) {
-		event = rb_add_time_stamp(event, delta);
+		bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
+
+		event = rb_add_time_stamp(event, info->delta, abs);
 		length -= RB_LEN_TIME_EXTEND;
 		delta = 0;
 	}
@@ -2455,7 +2498,7 @@ static __always_inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer

 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+	if (extended_time(event))
 		event = skip_time_extend(event);

 	/* array[0] holds the actual length for the discarded event */
@@ -2499,10 +2542,11 @@ rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->write_stamp =
 			cpu_buffer->commit_page->page->time_stamp;
 	else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-		delta = event->array[0];
-		delta <<= TS_SHIFT;
-		delta += event->time_delta;
+		delta = ring_buffer_event_time_stamp(event);
 		cpu_buffer->write_stamp += delta;
+	} else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
+		delta = ring_buffer_event_time_stamp(event);
+		cpu_buffer->write_stamp = delta;
 	} else
 		cpu_buffer->write_stamp += event->time_delta;
 }
@@ -2585,22 +2629,19 @@ static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	unsigned int val = cpu_buffer->current_context;
+	unsigned long pc = preempt_count();
 	int bit;

-	if (in_interrupt()) {
-		if (in_nmi())
-			bit = RB_CTX_NMI;
-		else if (in_irq())
-			bit = RB_CTX_IRQ;
-		else
-			bit = RB_CTX_SOFTIRQ;
-	} else
+	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 		bit = RB_CTX_NORMAL;
+	else
+		bit = pc & NMI_MASK ? RB_CTX_NMI :
+			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;

-	if (unlikely(val & (1 << bit)))
+	if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
 		return 1;

-	val |= (1 << bit);
+	val |= (1 << (bit + cpu_buffer->nest));
 	cpu_buffer->current_context = val;

 	return 0;
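The rewritten trace_recursive_lock() derives the tracing context directly from preempt_count() instead of the in_nmi()/in_irq()/in_softirq() helpers; testing SOFTIRQ_OFFSET rather than the whole softirq mask means that merely having bottom halves disabled does not count as softirq context. A self-contained sketch of the same selection logic (the mask values are illustrative; the real ones come from <linux/preempt.h> and vary by configuration):

    #include <stdio.h>

    enum { RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, RB_CTX_NORMAL };

    #define SOFTIRQ_OFFSET 0x00000100UL   /* serving-softirq bit */
    #define HARDIRQ_MASK   0x000f0000UL
    #define NMI_MASK       0x00100000UL

    static int rb_context(unsigned long pc)
    {
        if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
            return RB_CTX_NORMAL;
        return pc & NMI_MASK ? RB_CTX_NMI :
               pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
    }

    int main(void)
    {
        printf("%d\n", rb_context(0));              /* 3: RB_CTX_NORMAL */
        printf("%d\n", rb_context(SOFTIRQ_OFFSET)); /* 2: RB_CTX_SOFTIRQ */
        printf("%d\n", rb_context(HARDIRQ_MASK));   /* 1: RB_CTX_IRQ */
        return 0;
    }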
@@ -2609,7 +2650,57 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+	cpu_buffer->current_context &=
+		cpu_buffer->current_context - (1 << cpu_buffer->nest);
+}
+
+/* The recursive locking above uses 4 bits */
+#define NESTED_BITS 4
+
+/**
+ * ring_buffer_nest_start - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * The ring buffer has a safty mechanism to prevent recursion.
+ * But there may be a case where a trace needs to be done while
+ * tracing something else. In this case, calling this function
+ * will allow this function to nest within a currently active
+ * ring_buffer_lock_reserve().
+ *
+ * Call this function before calling another ring_buffer_lock_reserve() and
+ * call ring_buffer_nest_end() after the nested ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_start(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* Enabled by ring_buffer_nest_end() */
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+	/* This is the shift value for the above recusive locking */
+	cpu_buffer->nest += NESTED_BITS;
+}
+
+/**
+ * ring_buffer_nest_end - Allow to trace while nested
+ * @buffer: The ring buffer to modify
+ *
+ * Must be called after ring_buffer_nest_start() and after the
+ * ring_buffer_unlock_commit().
+ */
+void ring_buffer_nest_end(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* disabled by ring_buffer_nest_start() */
+	cpu = raw_smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+	/* This is the shift value for the above recusive locking */
+	cpu_buffer->nest -= NESTED_BITS;
+	preempt_enable_notrace();
 }

 /**
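A hypothetical caller of the new nesting API (kernel-style sketch, not part of the patch): code that emits a trace event from within an already-active reserve/commit pair must widen the recursion-check window first, otherwise trace_recursive_lock() would reject the inner reserve as recursion:

    #include <linux/ring_buffer.h>
    #include <linux/types.h>

    void write_nested_event(struct ring_buffer *buffer)
    {
        struct ring_buffer_event *event;

        ring_buffer_nest_start(buffer);     /* shift the recursion bits */
        event = ring_buffer_lock_reserve(buffer, sizeof(u64));
        if (event) {
            *(u64 *)ring_buffer_event_data(event) = 42;
            ring_buffer_unlock_commit(buffer, event);
        }
        ring_buffer_nest_end(buffer);       /* restore bits, re-enable preemption */
    }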
@@ -2685,7 +2776,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If this is the first commit on the page, then it has the same
 	 * timestamp as the page itself.
 	 */
-	if (!tail)
+	if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
 		info->delta = 0;

 	/* See if we shot pass the end of this buffer page */
@@ -2762,8 +2853,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	/* make sure this diff is calculated here */
 	barrier();

-	/* Did the write stamp get updated already? */
-	if (likely(info.ts >= cpu_buffer->write_stamp)) {
+	if (ring_buffer_time_stamp_abs(buffer)) {
+		info.delta = info.ts;
+		rb_handle_timestamp(cpu_buffer, &info);
+	} else /* Did the write stamp get updated already? */
+	if (likely(info.ts >= cpu_buffer->write_stamp)) {
 		info.delta = diff;
 		if (unlikely(test_time_stamp(info.delta)))
 			rb_handle_timestamp(cpu_buffer, &info);
@@ -3461,14 +3555,13 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		return;

 	case RINGBUF_TYPE_TIME_EXTEND:
-		delta = event->array[0];
-		delta <<= TS_SHIFT;
-		delta += event->time_delta;
+		delta = ring_buffer_event_time_stamp(event);
 		cpu_buffer->read_stamp += delta;
 		return;

 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		delta = ring_buffer_event_time_stamp(event);
+		cpu_buffer->read_stamp = delta;
 		return;

 	case RINGBUF_TYPE_DATA:
@@ -3492,14 +3585,13 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 		return;

 	case RINGBUF_TYPE_TIME_EXTEND:
-		delta = event->array[0];
-		delta <<= TS_SHIFT;
-		delta += event->time_delta;
+		delta = ring_buffer_event_time_stamp(event);
 		iter->read_stamp += delta;
 		return;

 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		delta = ring_buffer_event_time_stamp(event);
+		iter->read_stamp = delta;
 		return;

 	case RINGBUF_TYPE_DATA:
@@ -3723,6 +3815,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 	struct buffer_page *reader;
 	int nr_loops = 0;

+	if (ts)
+		*ts = 0;
 again:
 	/*
 	 * We repeat when a time extend is encountered.
@@ -3759,12 +3853,17 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
 		goto again;

 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		if (ts) {
+			*ts = ring_buffer_event_time_stamp(event);
+			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+							 cpu_buffer->cpu, ts);
+		}
+		/* Internal data, OK to advance */
 		rb_advance_reader(cpu_buffer);
 		goto again;

 	case RINGBUF_TYPE_DATA:
-		if (ts) {
+		if (ts && !(*ts)) {
 			*ts = cpu_buffer->read_stamp + event->time_delta;
 			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
 							 cpu_buffer->cpu, ts);
@@ -3789,6 +3888,9 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_event *event;
 	int nr_loops = 0;

+	if (ts)
+		*ts = 0;
+
 	cpu_buffer = iter->cpu_buffer;
 	buffer = cpu_buffer->buffer;

@@ -3841,12 +3943,17 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 		goto again;

 	case RINGBUF_TYPE_TIME_STAMP:
-		/* FIXME: not implemented */
+		if (ts) {
+			*ts = ring_buffer_event_time_stamp(event);
+			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
+							 cpu_buffer->cpu, ts);
+		}
+		/* Internal data, OK to advance */
 		rb_advance_iter(iter);
 		goto again;

 	case RINGBUF_TYPE_DATA:
-		if (ts) {
+		if (ts && !(*ts)) {
 			*ts = iter->read_stamp + event->time_delta;
 			ring_buffer_normalize_time_stamp(buffer,
 							 cpu_buffer->cpu, ts);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e9cbb96cd99e..4fc60e5ec4b9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1170,6 +1170,14 @@ static struct {
 	ARCH_TRACE_CLOCKS
 };

+bool trace_clock_in_ns(struct trace_array *tr)
+{
+	if (trace_clocks[tr->clock_id].in_ns)
+		return true;
+
+	return false;
+}
+
 /*
  * trace_parser_get_init - gets the buffer for trace parser
  */
@@ -2127,6 +2135,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 	struct task_struct *tsk = current;

 	entry->preempt_count = pc & 0xff;
+	entry->preempt_lazy_count = preempt_lazy_count();
 	entry->pid = (tsk) ? tsk->pid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
@@ -2137,8 +2146,11 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 		((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
 		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
 		((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
-		(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
+		(tif_need_resched_now() ? TRACE_FLAG_NEED_RESCHED : 0) |
+		(need_resched_lazy() ? TRACE_FLAG_NEED_RESCHED_LAZY : 0) |
 		(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
+
+	entry->migrate_disable = (tsk) ? __migrate_disabled(tsk) & 0xFF : 0;
 }
 EXPORT_SYMBOL_GPL(tracing_generic_entry_update);

@@ -2275,7 +2287,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,

 	*current_rb = trace_file->tr->trace_buffer.buffer;

-	if ((trace_file->flags &
+	if (!ring_buffer_time_stamp_abs(*current_rb) && (trace_file->flags &
 	     (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED)) &&
 	    (entry = this_cpu_read(trace_buffered_event))) {
 		/* Try to use the per cpu buffer first */
@@ -3342,14 +3354,17 @@ get_total_entries(struct trace_buffer *buf,

 static void print_lat_help_header(struct seq_file *m)
 {
-	seq_puts(m, "# _------=> CPU# \n"
-		    "# / _-----=> irqs-off \n"
-		    "# | / _----=> need-resched \n"
-		    "# || / _---=> hardirq/softirq \n"
-		    "# ||| / _--=> preempt-depth \n"
-		    "# |||| / delay \n"
-		    "# cmd pid ||||| time | caller \n"
-		    "# \\ / ||||| \\ | / \n");
+	seq_puts(m, "# _--------=> CPU# \n"
+		    "# / _-------=> irqs-off \n"
+		    "# | / _------=> need-resched \n"
+		    "# || / _-----=> need-resched_lazy \n"
+		    "# ||| / _----=> hardirq/softirq \n"
+		    "# |||| / _---=> preempt-depth \n"
+		    "# ||||| / _--=> preempt-lazy-depth\n"
+		    "# |||||| / _-=> migrate-disable \n"
+		    "# ||||||| / delay \n"
+		    "# cmd pid |||||||| time | caller \n"
+		    "# \\ / |||||||| \\ | / \n");
 }

 static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
@@ -3385,15 +3400,17 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
 		   tgid ? tgid_space : space);
 	seq_printf(m, "# %s / _----=> need-resched\n",
 		   tgid ? tgid_space : space);
-	seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
+	seq_printf(m, "# %s| / _----=> need-resched_lazy\n",
+		   tgid ? tgid_space : space);
+	seq_printf(m, "# %s|| / _---=> hardirq/softirq\n",
 		   tgid ? tgid_space : space);
-	seq_printf(m, "# %s|| / _--=> preempt-depth\n",
+	seq_printf(m, "# %s||| / _--=> preempt-depth\n",
 		   tgid ? tgid_space : space);
-	seq_printf(m, "# %s||| / delay\n",
+	seq_printf(m, "# %s|||| / delay\n",
 		   tgid ? tgid_space : space);
-	seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n",
+	seq_printf(m, "# TASK-PID %sCPU# ||||| TIMESTAMP FUNCTION\n",
 		   tgid ? " TGID " : space);
-	seq_printf(m, "# | | %s | |||| | |\n",
+	seq_printf(m, "# | | %s | ||||| | |\n",
 		   tgid ? " | " : space);
 }
@@ -4531,6 +4548,9 @@ static const char readme_msg[] =
 #ifdef CONFIG_X86_64
 	" x86-tsc: TSC cycle counter\n"
 #endif
+	"\n timestamp_mode\t-view the mode used to timestamp events\n"
+	" delta: Delta difference against a buffer-wide timestamp\n"
+	" absolute: Absolute (standalone) timestamp\n"
 	"\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
 	"\n trace_marker_raw\t\t- Writes into this file writes binary data into the kernel buffer\n"
 	" tracing_cpumask\t- Limit which CPUs to trace\n"
@@ -4707,8 +4727,9 @@ static const char readme_msg[] =
 	"\t .sym display an address as a symbol\n"
 	"\t .sym-offset display an address as a symbol and offset\n"
 	"\t .execname display a common_pid as a program name\n"
-	"\t .syscall display a syscall id as a syscall name\n\n"
-	"\t .log2 display log2 value rather than raw number\n\n"
+	"\t .syscall display a syscall id as a syscall name\n"
+	"\t .log2 display log2 value rather than raw number\n"
+	"\t .usecs display a common_timestamp in microseconds\n\n"
 	"\t The 'pause' parameter can be used to pause an existing hist\n"
 	"\t trigger or to start a hist trigger but not log any events\n"
 	"\t until told to do so. 'continue' can be used to start or\n"
@@ -6218,7 +6239,7 @@ static int tracing_clock_show(struct seq_file *m, void *v)
 	return 0;
 }

-static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
+int tracing_set_clock(struct trace_array *tr, const char *clockstr)
 {
 	int i;

@@ -6298,6 +6319,71 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 	return ret;
 }

+static int tracing_time_stamp_mode_show(struct seq_file *m, void *v)
+{
+	struct trace_array *tr = m->private;
+
+	mutex_lock(&trace_types_lock);
+
+	if (ring_buffer_time_stamp_abs(tr->trace_buffer.buffer))
+		seq_puts(m, "delta [absolute]\n");
+	else
+		seq_puts(m, "[delta] absolute\n");
+
+	mutex_unlock(&trace_types_lock);
+
+	return 0;
+}
+
+static int tracing_time_stamp_mode_open(struct inode *inode, struct file *file)
+{
+	struct trace_array *tr = inode->i_private;
+	int ret;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	if (trace_array_get(tr))
+		return -ENODEV;
+
+	ret = single_open(file, tracing_time_stamp_mode_show, inode->i_private);
+	if (ret < 0)
+		trace_array_put(tr);
+
+	return ret;
+}
+
+int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs)
+{
+	int ret = 0;
+
+	mutex_lock(&trace_types_lock);
+
+	if (abs && tr->time_stamp_abs_ref++)
+		goto out;
+
+	if (!abs) {
+		if (WARN_ON_ONCE(!tr->time_stamp_abs_ref)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (--tr->time_stamp_abs_ref)
+			goto out;
+	}
+
+	ring_buffer_set_time_stamp_abs(tr->trace_buffer.buffer, abs);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (tr->max_buffer.buffer)
+		ring_buffer_set_time_stamp_abs(tr->max_buffer.buffer, abs);
+#endif
+ out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
 struct ftrace_buffer_info {
 	struct trace_iterator	iter;
 	void			*spare;
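tracing_set_time_stamp_abs() above is reference-counted so that several independent users (for example, multiple hist triggers) can demand absolute timestamps: the buffer switches to absolute mode on the 0->1 transition and only reverts to delta mode when the last reference is dropped. A hypothetical caller pair (sketch, not from the patch):

    /* Each user that needs absolute timestamps takes a reference. */
    int attach_needs_abs(struct trace_array *tr)
    {
        return tracing_set_time_stamp_abs(tr, true);   /* ref++, enable on 0->1 */
    }

    /* Dropping the last reference switches the buffer back to delta mode. */
    void detach_needs_abs(struct trace_array *tr)
    {
        tracing_set_time_stamp_abs(tr, false);         /* ref--, disable on 1->0 */
    }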
@@ -6545,6 +6631,13 @@ static const struct file_operations trace_clock_fops = {
 	.write		= tracing_clock_write,
 };

+static const struct file_operations trace_time_stamp_mode_fops = {
+	.open		= tracing_time_stamp_mode_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_single_release_tr,
+};
+
 #ifdef CONFIG_TRACER_SNAPSHOT
 static const struct file_operations snapshot_fops = {
 	.open		= tracing_snapshot_open,
@@ -7684,6 +7777,7 @@ static int instance_mkdir(const char *name)
 	struct trace_array *tr;
 	int ret;

+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);

 	ret = -EEXIST;
@@ -7716,6 +7810,7 @@ static int instance_mkdir(const char *name)

 	INIT_LIST_HEAD(&tr->systems);
 	INIT_LIST_HEAD(&tr->events);
+	INIT_LIST_HEAD(&tr->hist_vars);

 	if (allocate_trace_buffers(tr, trace_buf_size) < 0)
 		goto out_free_tr;
@@ -7739,6 +7834,7 @@ static int instance_mkdir(const char *name)
 	list_add(&tr->list, &ftrace_trace_arrays);

 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);

 	return 0;

@@ -7750,6 +7846,7 @@ static int instance_mkdir(const char *name)

  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);

 	return ret;

@@ -7762,6 +7859,7 @@ static int instance_rmdir(const char *name)
 	int ret;
 	int i;

+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);

 	ret = -ENODEV;
@@ -7807,6 +7905,7 @@ static int instance_rmdir(const char *name)

  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);

 	return ret;
 }
@@ -7864,6 +7963,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
 	trace_create_file("tracing_on", 0644, d_tracer,
 			  tr, &rb_simple_fops);

+	trace_create_file("timestamp_mode", 0444, d_tracer, tr,
+			  &trace_time_stamp_mode_fops);
+
 	create_trace_options_dir(tr);

 #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
@@ -8275,6 +8377,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);

+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+	char **argv;
+	int argc, ret;
+
+	argc = 0;
+	ret = 0;
+	argv = argv_split(GFP_KERNEL, buf, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = createfn(argc, argv);
+
+	argv_free(argv);
+
+	return ret;
+}
+
+#define WRITE_BUFSIZE 4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos,
+				int (*createfn)(int, char **))
+{
+	char *kbuf, *buf, *tmp;
+	int ret = 0;
+	size_t done = 0;
+	size_t size;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	while (done < count) {
+		size = count - done;
+
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		buf = kbuf;
+		do {
+			tmp = strchr(buf, '\n');
+			if (tmp) {
+				*tmp = '\0';
+				size = tmp - buf + 1;
+			} else {
+				size = strlen(buf);
+				if (done + size < count) {
+					if (buf != kbuf)
+						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+					pr_warn("Line length is too long: Should be less than %d\n",
+						WRITE_BUFSIZE - 2);
+					ret = -EINVAL;
+					goto out;
+				}
+			}
+			done += size;
+
+			/* Remove comments */
+			tmp = strchr(buf, '#');
+
+			if (tmp)
+				*tmp = '\0';
+
+			ret = trace_run_command(buf, createfn);
+			if (ret)
+				goto out;
+			buf += size;
+
+		} while (done < count);
+	}
+	ret = done;
+
+out:
+	kfree(kbuf);
+
+	return ret;
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
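trace_parse_run_command() above chops the user buffer into newline-separated records, strips '#' comments, and hands each non-empty record to the createfn callback with the line already split into argv[] words; a non-zero callback return aborts the remaining input. A hypothetical callback wiring (kernel-style sketch, not part of the patch):

    #include <linux/errno.h>
    #include <linux/printk.h>

    /* Invoked once per non-empty, non-comment line of the write. */
    static int demo_create(int argc, char **argv)
    {
        if (argc < 1)
            return -EINVAL;
        pr_info("cmd %s with %d args\n", argv[0], argc - 1);
        return 0;   /* non-zero would abort parsing of the rest */
    }

    /* Wired into a file_operations .write handler: */
    static ssize_t demo_write(struct file *file, const char __user *buf,
                              size_t count, loff_t *ppos)
    {
        return trace_parse_run_command(file, buf, count, ppos, demo_create);
    }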
@@ -8375,6 +8563,7 @@ __init static int tracer_alloc_buffers(void)

 	INIT_LIST_HEAD(&global_trace.systems);
 	INIT_LIST_HEAD(&global_trace.events);
+	INIT_LIST_HEAD(&global_trace.hist_vars);
 	list_add(&global_trace.list, &ftrace_trace_arrays);

 	apply_trace_boot_options();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 851cd1605085..18bf383f46e8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -127,6 +127,7 @@ struct kretprobe_trace_entry_head {
  *  NEED_RESCHED	- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
+ *  NEED_RESCHED_LAZY	- lazy reschedule is requested
  */
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
@@ -136,6 +137,7 @@ enum trace_flag_type {
 	TRACE_FLAG_SOFTIRQ		= 0x10,
 	TRACE_FLAG_PREEMPT_RESCHED	= 0x20,
 	TRACE_FLAG_NMI			= 0x40,
+	TRACE_FLAG_NEED_RESCHED_LAZY	= 0x80,
 };

 #define TRACE_BUF_SIZE		1024
@@ -273,6 +275,8 @@ struct trace_array {
 	/* function tracing enabled */
 	int			function_enabled;
 #endif
+	int			time_stamp_abs_ref;
+	struct list_head	hist_vars;
 };

 enum {
@@ -286,6 +290,11 @@ extern struct mutex trace_types_lock;
 extern int trace_array_get(struct trace_array *tr);
 extern void trace_array_put(struct trace_array *tr);

+extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs);
+extern int tracing_set_clock(struct trace_array *tr, const char *clockstr);
+
+extern bool trace_clock_in_ns(struct trace_array *tr);
+
 /*
  * The global tracer (top) should be the first trace array added,
  * but we check the flag anyway.
@@ -1293,7 +1302,7 @@ __event_trigger_test_discard(struct trace_event_file *file,
 	unsigned long eflags = file->flags;

 	if (eflags & EVENT_FILE_FL_TRIGGER_COND)
-		*tt = event_triggers_call(file, entry);
+		*tt = event_triggers_call(file, entry, event);

 	if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
 	    (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
@@ -1330,7 +1339,7 @@ event_trigger_unlock_commit(struct trace_event_file *file,
 	trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc);

 	if (tt)
-		event_triggers_post_call(file, tt, entry);
+		event_triggers_post_call(file, tt, entry, event);
 }

 /**
@@ -1363,7 +1372,7 @@ event_trigger_unlock_commit_regs(struct trace_event_file *file,
 				  irq_flags, pc, regs);

 	if (tt)
-		event_triggers_post_call(file, tt, entry);
+		event_triggers_post_call(file, tt, entry, event);
 }

 #define FILTER_PRED_INVALID ((unsigned short)-1)
@@ -1545,6 +1554,8 @@ extern void pause_named_trigger(struct event_trigger_data *data);
 extern void unpause_named_trigger(struct event_trigger_data *data);
 extern void set_named_trigger_data(struct event_trigger_data *data,
 				   struct event_trigger_data *named_data);
+extern struct event_trigger_data *
+get_named_trigger_data(struct event_trigger_data *data);
 extern int register_event_command(struct event_command *cmd);
 extern int unregister_event_command(struct event_command *cmd);
 extern int register_trigger_hist_enable_disable_cmds(void);
@@ -1588,7 +1599,8 @@ extern int register_trigger_hist_enable_disable_cmds(void);
  */
 struct event_trigger_ops {
 	void			(*func)(struct event_trigger_data *data,
-					void *rec);
+					void *rec,
+					struct ring_buffer_event *rbe);
 	int			(*init)(struct event_trigger_ops *ops,
 					struct event_trigger_data *data);
 	void			(*free)(struct event_trigger_ops *ops,
@@ -1755,6 +1767,13 @@ void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);

+#define MAX_EVENT_NAME_LEN	64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos,
+		int (*createfn)(int, char**));
+
 /*
  * Normal trace_printk() and friends allocates special buffers
  * to do the manipulation, as well as saves the print formats
28074 | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c | |
28075 | index d53268a4e167..9ba230a4052f 100644 | |
28076 | --- a/kernel/trace/trace_events.c | |
28077 | +++ b/kernel/trace/trace_events.c | |
28078 | @@ -187,6 +187,8 @@ static int trace_define_common_fields(void) | |
e4b2b4a8 JK |
28079 | __common_field(unsigned char, flags); |
28080 | __common_field(unsigned char, preempt_count); | |
28081 | __common_field(int, pid); | |
28082 | + __common_field(unsigned short, migrate_disable); | |
28083 | + __common_field(unsigned short, padding); | |
28084 | ||
28085 | return ret; | |
1a6e0f06 | 28086 | } |
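The two common fields added above are RT-specific: migrate_disable is recorded in every event header, and the explicit u16 padding keeps the common header a multiple of four bytes so the per-event fields that follow stay naturally aligned. An illustrative layout sketch (not the kernel definition):

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: with the two extra u16 fields the common
     * header stays a multiple of 4 bytes, so the event-specific
     * payload that follows keeps its natural alignment. */
    struct common_header {
            uint16_t type;
            uint8_t  flags;
            uint8_t  preempt_count;
            int32_t  pid;
            uint16_t migrate_disable;
            uint16_t padding;
    };

    _Static_assert(sizeof(struct common_header) % 4 == 0,
                   "header must stay 4-byte aligned");

    int main(void)
    {
            printf("common header is %zu bytes\n",
                   sizeof(struct common_header));
            return 0;
    }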
b3bbd485 | 28087 | @@ -1406,8 +1408,8 @@ static int subsystem_open(struct inode *inode, struct file *filp) |
e4b2b4a8 | 28088 | return -ENODEV; |
1a6e0f06 | 28089 | |
e4b2b4a8 JK |
28090 | /* Make sure the system still exists */ |
28091 | - mutex_lock(&trace_types_lock); | |
28092 | mutex_lock(&event_mutex); | |
28093 | + mutex_lock(&trace_types_lock); | |
28094 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | |
28095 | list_for_each_entry(dir, &tr->systems, list) { | |
28096 | if (dir == inode->i_private) { | |
b3bbd485 | 28097 | @@ -1421,8 +1423,8 @@ static int subsystem_open(struct inode *inode, struct file *filp) |
e4b2b4a8 JK |
28098 | } |
28099 | } | |
28100 | exit_loop: | |
28101 | - mutex_unlock(&event_mutex); | |
28102 | mutex_unlock(&trace_types_lock); | |
28103 | + mutex_unlock(&event_mutex); | |
28104 | ||
28105 | if (!system) | |
28106 | return -ENODEV; | |
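This hunk, and the matching ones below in trace_add_event_call(), trace_remove_event_call(), and trace_module_notify(), invert the nesting so that event_mutex is always acquired before trace_types_lock. A single global acquisition order is what rules out ABBA deadlocks between the two; a minimal pthread sketch of the rule:

    #include <pthread.h>
    #include <stdio.h>

    /* Every path takes the two locks in the same order (event_mutex
     * first, trace_types_lock second), so two threads can never end
     * up waiting on each other's lock. */
    static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t trace_types_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *name)
    {
            pthread_mutex_lock(&event_mutex);       /* always first */
            pthread_mutex_lock(&trace_types_lock);  /* always second */
            printf("%s: both locks held\n", (char *)name);
            pthread_mutex_unlock(&trace_types_lock);
            pthread_mutex_unlock(&event_mutex);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, worker, "A");
            pthread_create(&b, NULL, worker, "B");
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            return 0;
    }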
b3bbd485 | 28107 | @@ -2308,15 +2310,15 @@ static void __add_event_to_tracers(struct trace_event_call *call); |
e4b2b4a8 | 28108 | int trace_add_event_call(struct trace_event_call *call) |
1a6e0f06 | 28109 | { |
e4b2b4a8 JK |
28110 | int ret; |
28111 | - mutex_lock(&trace_types_lock); | |
28112 | mutex_lock(&event_mutex); | |
28113 | + mutex_lock(&trace_types_lock); | |
28114 | ||
28115 | ret = __register_event(call, NULL); | |
28116 | if (ret >= 0) | |
28117 | __add_event_to_tracers(call); | |
28118 | ||
28119 | - mutex_unlock(&event_mutex); | |
28120 | mutex_unlock(&trace_types_lock); | |
28121 | + mutex_unlock(&event_mutex); | |
28122 | return ret; | |
1a6e0f06 JK |
28123 | } |
28124 | ||
b3bbd485 | 28125 | @@ -2370,13 +2372,13 @@ int trace_remove_event_call(struct trace_event_call *call) |
1a6e0f06 | 28126 | { |
e4b2b4a8 | 28127 | int ret; |
1a6e0f06 | 28128 | |
e4b2b4a8 JK |
28129 | - mutex_lock(&trace_types_lock); |
28130 | mutex_lock(&event_mutex); | |
28131 | + mutex_lock(&trace_types_lock); | |
28132 | down_write(&trace_event_sem); | |
28133 | ret = probe_remove_event_call(call); | |
28134 | up_write(&trace_event_sem); | |
28135 | - mutex_unlock(&event_mutex); | |
28136 | mutex_unlock(&trace_types_lock); | |
28137 | + mutex_unlock(&event_mutex); | |
1a6e0f06 | 28138 | |
e4b2b4a8 | 28139 | return ret; |
1a6e0f06 | 28140 | } |
b3bbd485 | 28141 | @@ -2438,8 +2440,8 @@ static int trace_module_notify(struct notifier_block *self, |
e4b2b4a8 JK |
28142 | { |
28143 | struct module *mod = data; | |
1a6e0f06 | 28144 | |
e4b2b4a8 JK |
28145 | - mutex_lock(&trace_types_lock); |
28146 | mutex_lock(&event_mutex); | |
28147 | + mutex_lock(&trace_types_lock); | |
28148 | switch (val) { | |
28149 | case MODULE_STATE_COMING: | |
28150 | trace_module_add_events(mod); | |
b3bbd485 | 28151 | @@ -2448,8 +2450,8 @@ static int trace_module_notify(struct notifier_block *self, |
e4b2b4a8 JK |
28152 | trace_module_remove_events(mod); |
28153 | break; | |
28154 | } | |
28155 | - mutex_unlock(&event_mutex); | |
28156 | mutex_unlock(&trace_types_lock); | |
28157 | + mutex_unlock(&event_mutex); | |
1a6e0f06 | 28158 | |
1a6e0f06 JK |
28159 | return 0; |
28160 | } | |
b3bbd485 | 28161 | @@ -2964,24 +2966,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) |
e4b2b4a8 JK |
28162 | * creates the event hierarchy in the @parent/events directory. | |
28163 | * | |
28164 | * Returns 0 on success. | |
28165 | + * | |
28166 | + * Must be called with event_mutex held. | |
28167 | */ | |
28168 | int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) | |
28169 | { | |
28170 | int ret; | |
1a6e0f06 | 28171 | |
e4b2b4a8 JK |
28172 | - mutex_lock(&event_mutex); |
28173 | + lockdep_assert_held(&event_mutex); | |
c7c16703 | 28174 | |
e4b2b4a8 JK |
28175 | ret = create_event_toplevel_files(parent, tr); |
28176 | if (ret) | |
28177 | - goto out_unlock; | |
28178 | + goto out; | |
c7c16703 | 28179 | |
e4b2b4a8 JK |
28180 | down_write(&trace_event_sem); |
28181 | __trace_add_event_dirs(tr); | |
28182 | up_write(&trace_event_sem); | |
c7c16703 | 28183 | |
e4b2b4a8 JK |
28184 | - out_unlock: |
28185 | - mutex_unlock(&event_mutex); | |
28186 | - | |
28187 | + out: | |
28188 | return ret; | |
1a6e0f06 | 28189 | } |
1a6e0f06 | 28190 | |
b3bbd485 | 28191 | @@ -3010,9 +3012,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) |
e4b2b4a8 | 28192 | return ret; |
1a6e0f06 | 28193 | } |
1a6e0f06 | 28194 | |
e4b2b4a8 JK |
28195 | +/* Must be called with event_mutex held */ |
28196 | int event_trace_del_tracer(struct trace_array *tr) | |
28197 | { | |
28198 | - mutex_lock(&event_mutex); | |
28199 | + lockdep_assert_held(&event_mutex); | |
1a6e0f06 | 28200 | |
e4b2b4a8 JK |
28201 | /* Disable any event triggers and associated soft-disabled events */ |
28202 | clear_event_triggers(tr); | |
b3bbd485 | 28203 | @@ -3033,8 +3036,6 @@ int event_trace_del_tracer(struct trace_array *tr) |
1a6e0f06 | 28204 | |
e4b2b4a8 | 28205 | tr->event_dir = NULL; |
1a6e0f06 | 28206 | |
e4b2b4a8 JK |
28207 | - mutex_unlock(&event_mutex); |
28208 | - | |
28209 | return 0; | |
1a6e0f06 | 28210 | } |
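Here the locking moves out of event_trace_add_tracer() and event_trace_del_tracer() entirely: both now assert via lockdep_assert_held() that the caller already holds event_mutex, which lets callers order it against trace_types_lock themselves. A userspace stand-in for that caller-holds-the-lock contract:

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Sketch of the contract change: instead of taking event_mutex
     * itself, the function asserts that its caller already holds it
     * (a stand-in for lockdep_assert_held()). */
    static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
    static int event_mutex_held;

    static void del_tracer(void)
    {
            assert(event_mutex_held);  /* caller must hold event_mutex */
            printf("tearing down event directory\n");
    }

    int main(void)
    {
            pthread_mutex_lock(&event_mutex);
            event_mutex_held = 1;

            del_tracer();

            event_mutex_held = 0;
            pthread_mutex_unlock(&event_mutex);
            return 0;
    }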
1a6e0f06 | 28211 | |
b3bbd485 JK |
28212 | diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c |
28213 | index 7eb975a2d0e1..24bc0769fdd6 100644 | |
28214 | --- a/kernel/trace/trace_events_hist.c | |
28215 | +++ b/kernel/trace/trace_events_hist.c | |
e4b2b4a8 JK |
28216 | @@ -20,13 +20,39 @@ |
28217 | #include <linux/slab.h> | |
28218 | #include <linux/stacktrace.h> | |
28219 | #include <linux/rculist.h> | |
28220 | +#include <linux/tracefs.h> | |
1a6e0f06 | 28221 | |
e4b2b4a8 JK |
28222 | #include "tracing_map.h" |
28223 | #include "trace.h" | |
1a6e0f06 | 28224 | |
e4b2b4a8 JK |
28225 | +#define SYNTH_SYSTEM "synthetic" |
28226 | +#define SYNTH_FIELDS_MAX 16 | |
28227 | + | |
28228 | +#define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ | |
28229 | + | |
28230 | struct hist_field; | |
1a6e0f06 | 28231 | |
e4b2b4a8 JK |
28232 | -typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event); |
28233 | +typedef u64 (*hist_field_fn_t) (struct hist_field *field, | |
28234 | + struct tracing_map_elt *elt, | |
28235 | + struct ring_buffer_event *rbe, | |
28236 | + void *event); | |
28237 | + | |
28238 | +#define HIST_FIELD_OPERANDS_MAX 2 | |
28239 | +#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) | |
28240 | +#define HIST_ACTIONS_MAX 8 | |
28241 | + | |
28242 | +enum field_op_id { | |
28243 | + FIELD_OP_NONE, | |
28244 | + FIELD_OP_PLUS, | |
28245 | + FIELD_OP_MINUS, | |
28246 | + FIELD_OP_UNARY_MINUS, | |
28247 | +}; | |
28248 | + | |
28249 | +struct hist_var { | |
28250 | + char *name; | |
28251 | + struct hist_trigger_data *hist_data; | |
28252 | + unsigned int idx; | |
28253 | +}; | |
28254 | ||
28255 | struct hist_field { | |
28256 | struct ftrace_event_field *field; | |
b3bbd485 | 28257 | @@ -34,26 +60,50 @@ struct hist_field { |
e4b2b4a8 JK |
28258 | hist_field_fn_t fn; |
28259 | unsigned int size; | |
28260 | unsigned int offset; | |
28261 | + unsigned int is_signed; | |
28262 | + const char *type; | |
28263 | + struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; | |
28264 | + struct hist_trigger_data *hist_data; | |
28265 | + struct hist_var var; | |
28266 | + enum field_op_id operator; | |
28267 | + char *system; | |
28268 | + char *event_name; | |
28269 | + char *name; | |
28270 | + unsigned int var_idx; | |
28271 | + unsigned int var_ref_idx; | |
28272 | + bool read_once; | |
28273 | }; | |
28274 | ||
28275 | -static u64 hist_field_none(struct hist_field *field, void *event) | |
28276 | +static u64 hist_field_none(struct hist_field *field, | |
28277 | + struct tracing_map_elt *elt, | |
28278 | + struct ring_buffer_event *rbe, | |
28279 | + void *event) | |
1a6e0f06 | 28280 | { |
e4b2b4a8 JK |
28281 | return 0; |
28282 | } | |
1a6e0f06 | 28283 | |
e4b2b4a8 JK |
28284 | -static u64 hist_field_counter(struct hist_field *field, void *event) |
28285 | +static u64 hist_field_counter(struct hist_field *field, | |
28286 | + struct tracing_map_elt *elt, | |
28287 | + struct ring_buffer_event *rbe, | |
28288 | + void *event) | |
28289 | { | |
28290 | return 1; | |
1a6e0f06 JK |
28291 | } |
28292 | ||
e4b2b4a8 JK |
28293 | -static u64 hist_field_string(struct hist_field *hist_field, void *event) |
28294 | +static u64 hist_field_string(struct hist_field *hist_field, | |
28295 | + struct tracing_map_elt *elt, | |
28296 | + struct ring_buffer_event *rbe, | |
28297 | + void *event) | |
28298 | { | |
28299 | char *addr = (char *)(event + hist_field->field->offset); | |
1a6e0f06 | 28300 | |
e4b2b4a8 | 28301 | return (u64)(unsigned long)addr; |
1a6e0f06 | 28302 | } |
e4b2b4a8 JK |
28303 | |
28304 | -static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) | |
28305 | +static u64 hist_field_dynstring(struct hist_field *hist_field, | |
28306 | + struct tracing_map_elt *elt, | |
28307 | + struct ring_buffer_event *rbe, | |
28308 | + void *event) | |
28309 | { | |
28310 | u32 str_item = *(u32 *)(event + hist_field->field->offset); | |
28311 | int str_loc = str_item & 0xffff; | |
b3bbd485 | 28312 | @@ -62,22 +112,74 @@ static u64 hist_field_dynstring(struct hist_field *hist_field, void *event) |
e4b2b4a8 | 28313 | return (u64)(unsigned long)addr; |
1a6e0f06 | 28314 | } |
1a6e0f06 | 28315 | |
e4b2b4a8 JK |
28316 | -static u64 hist_field_pstring(struct hist_field *hist_field, void *event) |
28317 | +static u64 hist_field_pstring(struct hist_field *hist_field, | |
28318 | + struct tracing_map_elt *elt, | |
28319 | + struct ring_buffer_event *rbe, | |
28320 | + void *event) | |
28321 | { | |
28322 | char **addr = (char **)(event + hist_field->field->offset); | |
1a6e0f06 | 28323 | |
e4b2b4a8 | 28324 | return (u64)(unsigned long)*addr; |
1a6e0f06 JK |
28325 | } |
28326 | ||
e4b2b4a8 JK |
28327 | -static u64 hist_field_log2(struct hist_field *hist_field, void *event) |
28328 | +static u64 hist_field_log2(struct hist_field *hist_field, | |
28329 | + struct tracing_map_elt *elt, | |
28330 | + struct ring_buffer_event *rbe, | |
28331 | + void *event) | |
1a6e0f06 | 28332 | { |
e4b2b4a8 JK |
28333 | - u64 val = *(u64 *)(event + hist_field->field->offset); |
28334 | + struct hist_field *operand = hist_field->operands[0]; | |
28335 | + | |
28336 | + u64 val = operand->fn(operand, elt, rbe, event); | |
1a6e0f06 | 28337 | |
e4b2b4a8 | 28338 | return (u64) ilog2(roundup_pow_of_two(val)); |
1a6e0f06 JK |
28339 | } |
28340 | ||
e4b2b4a8 JK |
28341 | +static u64 hist_field_plus(struct hist_field *hist_field, |
28342 | + struct tracing_map_elt *elt, | |
28343 | + struct ring_buffer_event *rbe, | |
28344 | + void *event) | |
1a6e0f06 | 28345 | +{ |
e4b2b4a8 JK |
28346 | + struct hist_field *operand1 = hist_field->operands[0]; |
28347 | + struct hist_field *operand2 = hist_field->operands[1]; | |
28348 | + | |
28349 | + u64 val1 = operand1->fn(operand1, elt, rbe, event); | |
28350 | + u64 val2 = operand2->fn(operand2, elt, rbe, event); | |
28351 | + | |
28352 | + return val1 + val2; | |
28353 | +} | |
28354 | + | |
28355 | +static u64 hist_field_minus(struct hist_field *hist_field, | |
28356 | + struct tracing_map_elt *elt, | |
28357 | + struct ring_buffer_event *rbe, | |
28358 | + void *event) | |
28359 | +{ | |
28360 | + struct hist_field *operand1 = hist_field->operands[0]; | |
28361 | + struct hist_field *operand2 = hist_field->operands[1]; | |
28362 | + | |
28363 | + u64 val1 = operand1->fn(operand1, elt, rbe, event); | |
28364 | + u64 val2 = operand2->fn(operand2, elt, rbe, event); | |
28365 | + | |
28366 | + return val1 - val2; | |
28367 | +} | |
28368 | + | |
28369 | +static u64 hist_field_unary_minus(struct hist_field *hist_field, | |
28370 | + struct tracing_map_elt *elt, | |
28371 | + struct ring_buffer_event *rbe, | |
28372 | + void *event) | |
28373 | +{ | |
28374 | + struct hist_field *operand = hist_field->operands[0]; | |
28375 | + | |
28376 | + s64 sval = (s64)operand->fn(operand, elt, rbe, event); | |
28377 | + u64 val = (u64)-sval; | |
28378 | + | |
28379 | + return val; | |
28380 | +} | |
28381 | + | |
28382 | #define DEFINE_HIST_FIELD_FN(type) \ | |
28383 | -static u64 hist_field_##type(struct hist_field *hist_field, void *event)\ | |
28384 | + static u64 hist_field_##type(struct hist_field *hist_field, \ | |
28385 | + struct tracing_map_elt *elt, \ | |
28386 | + struct ring_buffer_event *rbe, \ | |
28387 | + void *event) \ | |
28388 | { \ | |
28389 | type *addr = (type *)(event + hist_field->field->offset); \ | |
28390 | \ | |
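Threading tracing_map_elt and ring_buffer_event through every hist_field_fn_t is what makes the new compound fields possible: hist_field_plus(), hist_field_minus(), and hist_field_unary_minus() simply recurse into their operands, and hist_field_log2() now wraps an operand instead of reading the record directly. A self-contained sketch of that operand-tree evaluation:

    #include <stdint.h>
    #include <stdio.h>

    /* Every node exposes the same function-pointer signature and
     * compound nodes recurse into their operands, mirroring
     * hist_field_plus() and friends. */
    struct field;
    typedef uint64_t (*field_fn)(struct field *f);

    struct field {
            field_fn fn;
            uint64_t value;                 /* used by leaf nodes */
            struct field *operands[2];      /* used by compound nodes */
    };

    static uint64_t field_const(struct field *f) { return f->value; }

    static uint64_t field_minus(struct field *f)
    {
            uint64_t a = f->operands[0]->fn(f->operands[0]);
            uint64_t b = f->operands[1]->fn(f->operands[1]);

            return a - b;
    }

    int main(void)
    {
            struct field ts1 = { .fn = field_const, .value = 2000 };
            struct field ts0 = { .fn = field_const, .value = 1500 };
            struct field lat = { .fn = field_minus,
                                 .operands = { &ts1, &ts0 } };

            /* e.g. a latency computed as a timestamp difference */
            printf("lat = %llu\n", (unsigned long long)lat.fn(&lat));
            return 0;
    }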
b3bbd485 | 28391 | @@ -110,16 +212,29 @@ DEFINE_HIST_FIELD_FN(u8); |
e4b2b4a8 JK |
28392 | #define HIST_KEY_SIZE_MAX (MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE) |
28393 | ||
28394 | enum hist_field_flags { | |
28395 | - HIST_FIELD_FL_HITCOUNT = 1, | |
28396 | - HIST_FIELD_FL_KEY = 2, | |
28397 | - HIST_FIELD_FL_STRING = 4, | |
28398 | - HIST_FIELD_FL_HEX = 8, | |
28399 | - HIST_FIELD_FL_SYM = 16, | |
28400 | - HIST_FIELD_FL_SYM_OFFSET = 32, | |
28401 | - HIST_FIELD_FL_EXECNAME = 64, | |
28402 | - HIST_FIELD_FL_SYSCALL = 128, | |
28403 | - HIST_FIELD_FL_STACKTRACE = 256, | |
28404 | - HIST_FIELD_FL_LOG2 = 512, | |
28405 | + HIST_FIELD_FL_HITCOUNT = 1 << 0, | |
28406 | + HIST_FIELD_FL_KEY = 1 << 1, | |
28407 | + HIST_FIELD_FL_STRING = 1 << 2, | |
28408 | + HIST_FIELD_FL_HEX = 1 << 3, | |
28409 | + HIST_FIELD_FL_SYM = 1 << 4, | |
28410 | + HIST_FIELD_FL_SYM_OFFSET = 1 << 5, | |
28411 | + HIST_FIELD_FL_EXECNAME = 1 << 6, | |
28412 | + HIST_FIELD_FL_SYSCALL = 1 << 7, | |
28413 | + HIST_FIELD_FL_STACKTRACE = 1 << 8, | |
28414 | + HIST_FIELD_FL_LOG2 = 1 << 9, | |
28415 | + HIST_FIELD_FL_TIMESTAMP = 1 << 10, | |
28416 | + HIST_FIELD_FL_TIMESTAMP_USECS = 1 << 11, | |
28417 | + HIST_FIELD_FL_VAR = 1 << 12, | |
28418 | + HIST_FIELD_FL_EXPR = 1 << 13, | |
28419 | + HIST_FIELD_FL_VAR_REF = 1 << 14, | |
28420 | + HIST_FIELD_FL_CPU = 1 << 15, | |
28421 | + HIST_FIELD_FL_ALIAS = 1 << 16, | |
28422 | +}; | |
28423 | + | |
28424 | +struct var_defs { | |
28425 | + unsigned int n_vars; | |
28426 | + char *name[TRACING_MAP_VARS_MAX]; | |
28427 | + char *expr[TRACING_MAP_VARS_MAX]; | |
28428 | }; | |
28429 | ||
28430 | struct hist_trigger_attrs { | |
b3bbd485 | 28431 | @@ -127,298 +242,3585 @@ struct hist_trigger_attrs { |
e4b2b4a8 JK |
28432 | char *vals_str; |
28433 | char *sort_key_str; | |
28434 | char *name; | |
28435 | + char *clock; | |
28436 | bool pause; | |
28437 | bool cont; | |
28438 | bool clear; | |
28439 | + bool ts_in_usecs; | |
28440 | unsigned int map_bits; | |
28441 | + | |
28442 | + char *assignment_str[TRACING_MAP_VARS_MAX]; | |
28443 | + unsigned int n_assignments; | |
28444 | + | |
28445 | + char *action_str[HIST_ACTIONS_MAX]; | |
28446 | + unsigned int n_actions; | |
28447 | + | |
28448 | + struct var_defs var_defs; | |
28449 | +}; | |
28450 | + | |
28451 | +struct field_var { | |
28452 | + struct hist_field *var; | |
28453 | + struct hist_field *val; | |
28454 | +}; | |
1a6e0f06 | 28455 | + |
e4b2b4a8 JK |
28456 | +struct field_var_hist { |
28457 | + struct hist_trigger_data *hist_data; | |
28458 | + char *cmd; | |
28459 | }; | |
28460 | ||
28461 | struct hist_trigger_data { | |
28462 | - struct hist_field *fields[TRACING_MAP_FIELDS_MAX]; | |
28463 | + struct hist_field *fields[HIST_FIELDS_MAX]; | |
28464 | unsigned int n_vals; | |
28465 | unsigned int n_keys; | |
28466 | unsigned int n_fields; | |
28467 | + unsigned int n_vars; | |
28468 | unsigned int key_size; | |
28469 | struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX]; | |
28470 | unsigned int n_sort_keys; | |
28471 | struct trace_event_file *event_file; | |
28472 | struct hist_trigger_attrs *attrs; | |
28473 | struct tracing_map *map; | |
28474 | + bool enable_timestamps; | |
28475 | + bool remove; | |
28476 | + struct hist_field *var_refs[TRACING_MAP_VARS_MAX]; | |
28477 | + unsigned int n_var_refs; | |
28478 | + | |
28479 | + struct action_data *actions[HIST_ACTIONS_MAX]; | |
28480 | + unsigned int n_actions; | |
28481 | + | |
28482 | + struct hist_field *synth_var_refs[SYNTH_FIELDS_MAX]; | |
28483 | + unsigned int n_synth_var_refs; | |
28484 | + struct field_var *field_vars[SYNTH_FIELDS_MAX]; | |
28485 | + unsigned int n_field_vars; | |
28486 | + unsigned int n_field_var_str; | |
28487 | + struct field_var_hist *field_var_hists[SYNTH_FIELDS_MAX]; | |
28488 | + unsigned int n_field_var_hists; | |
28489 | + | |
28490 | + struct field_var *max_vars[SYNTH_FIELDS_MAX]; | |
28491 | + unsigned int n_max_vars; | |
28492 | + unsigned int n_max_var_str; | |
b3bbd485 JK |
28493 | }; |
28494 | ||
28495 | -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) | |
28496 | -{ | |
28497 | - hist_field_fn_t fn = NULL; | |
e4b2b4a8 JK |
28498 | +struct synth_field { |
28499 | + char *type; | |
28500 | + char *name; | |
28501 | + size_t size; | |
28502 | + bool is_signed; | |
28503 | + bool is_string; | |
28504 | +}; | |
b3bbd485 JK |
28505 | |
28506 | - switch (field_size) { | |
28507 | - case 8: | |
28508 | - if (field_is_signed) | |
28509 | - fn = hist_field_s64; | |
28510 | - else | |
28511 | - fn = hist_field_u64; | |
28512 | - break; | |
28513 | - case 4: | |
28514 | - if (field_is_signed) | |
28515 | - fn = hist_field_s32; | |
28516 | - else | |
28517 | - fn = hist_field_u32; | |
28518 | - break; | |
28519 | - case 2: | |
28520 | - if (field_is_signed) | |
28521 | - fn = hist_field_s16; | |
28522 | - else | |
28523 | - fn = hist_field_u16; | |
28524 | - break; | |
28525 | - case 1: | |
28526 | - if (field_is_signed) | |
28527 | - fn = hist_field_s8; | |
28528 | - else | |
28529 | - fn = hist_field_u8; | |
28530 | - break; | |
28531 | - } | |
e4b2b4a8 JK |
28532 | +struct synth_event { |
28533 | + struct list_head list; | |
28534 | + int ref; | |
28535 | + char *name; | |
28536 | + struct synth_field **fields; | |
28537 | + unsigned int n_fields; | |
28538 | + unsigned int n_u64; | |
28539 | + struct trace_event_class class; | |
28540 | + struct trace_event_call call; | |
28541 | + struct tracepoint *tp; | |
28542 | +}; | |
b3bbd485 JK |
28543 | |
28544 | - return fn; | |
e4b2b4a8 | 28545 | +struct action_data; |
1a6e0f06 | 28546 | + |
e4b2b4a8 JK |
28547 | +typedef void (*action_fn_t) (struct hist_trigger_data *hist_data, |
28548 | + struct tracing_map_elt *elt, void *rec, | |
28549 | + struct ring_buffer_event *rbe, | |
28550 | + struct action_data *data, u64 *var_ref_vals); | |
1a6e0f06 | 28551 | + |
e4b2b4a8 JK |
28552 | +struct action_data { |
28553 | + action_fn_t fn; | |
28554 | + unsigned int n_params; | |
28555 | + char *params[SYNTH_FIELDS_MAX]; | |
28556 | + | |
28557 | + union { | |
28558 | + struct { | |
28559 | + unsigned int var_ref_idx; | |
28560 | + char *match_event; | |
28561 | + char *match_event_system; | |
28562 | + char *synth_event_name; | |
28563 | + struct synth_event *synth_event; | |
28564 | + } onmatch; | |
28565 | + | |
28566 | + struct { | |
28567 | + char *var_str; | |
28568 | + char *fn_name; | |
28569 | + unsigned int max_var_ref_idx; | |
28570 | + struct hist_field *max_var; | |
28571 | + struct hist_field *var; | |
28572 | + } onmax; | |
28573 | + }; | |
28574 | +}; | |
28575 | + | |
28576 | + | |
28577 | +static char last_hist_cmd[MAX_FILTER_STR_VAL]; | |
28578 | +static char hist_err_str[MAX_FILTER_STR_VAL]; | |
28579 | + | |
28580 | +static void last_cmd_set(char *str) | |
28581 | +{ | |
28582 | + if (!str) | |
1a6e0f06 JK |
28583 | + return; |
28584 | + | |
e4b2b4a8 | 28585 | + strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); |
b3bbd485 JK |
28586 | } |
28587 | ||
28588 | -static int parse_map_size(char *str) | |
e4b2b4a8 | 28589 | +static void hist_err(char *str, char *var) |
b3bbd485 JK |
28590 | { |
28591 | - unsigned long size, map_bits; | |
28592 | - int ret; | |
e4b2b4a8 | 28593 | + int maxlen = MAX_FILTER_STR_VAL - 1; |
b3bbd485 JK |
28594 | |
28595 | - strsep(&str, "="); | |
28596 | - if (!str) { | |
28597 | - ret = -EINVAL; | |
28598 | - goto out; | |
28599 | - } | |
e4b2b4a8 | 28600 | + if (!str) |
1a6e0f06 | 28601 | + return; |
b3bbd485 JK |
28602 | |
28603 | - ret = kstrtoul(str, 0, &size); | |
28604 | - if (ret) | |
28605 | - goto out; | |
e4b2b4a8 JK |
28606 | + if (strlen(hist_err_str)) |
28607 | + return; | |
b3bbd485 JK |
28608 | |
28609 | - map_bits = ilog2(roundup_pow_of_two(size)); | |
28610 | - if (map_bits < TRACING_MAP_BITS_MIN || | |
28611 | - map_bits > TRACING_MAP_BITS_MAX) | |
28612 | - ret = -EINVAL; | |
28613 | - else | |
28614 | - ret = map_bits; | |
28615 | - out: | |
28616 | - return ret; | |
e4b2b4a8 JK |
28617 | + if (!var) |
28618 | + var = ""; | |
28619 | + | |
28620 | + if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) | |
28621 | + return; | |
1a6e0f06 | 28622 | + |
e4b2b4a8 JK |
28623 | + strcat(hist_err_str, str); |
28624 | + strcat(hist_err_str, var); | |
b3bbd485 JK |
28625 | } |
28626 | ||
28627 | -static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs) | |
e4b2b4a8 | 28628 | +static void hist_err_event(char *str, char *system, char *event, char *var) |
b3bbd485 JK |
28629 | { |
28630 | - if (!attrs) | |
28631 | - return; | |
e4b2b4a8 | 28632 | + char err[MAX_FILTER_STR_VAL]; |
b3bbd485 JK |
28633 | |
28634 | - kfree(attrs->name); | |
28635 | - kfree(attrs->sort_key_str); | |
28636 | - kfree(attrs->keys_str); | |
28637 | - kfree(attrs->vals_str); | |
28638 | - kfree(attrs); | |
e4b2b4a8 JK |
28639 | + if (system && var) |
28640 | + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); | |
28641 | + else if (system) | |
28642 | + snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); | |
28643 | + else | |
28644 | + strncpy(err, var, MAX_FILTER_STR_VAL); | |
28645 | + | |
28646 | + hist_err(str, err); | |
b3bbd485 JK |
28647 | } |
28648 | ||
28649 | -static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) | |
e4b2b4a8 | 28650 | +static void hist_err_clear(void) |
b3bbd485 JK |
28651 | { |
28652 | - struct hist_trigger_attrs *attrs; | |
28653 | - int ret = 0; | |
e4b2b4a8 | 28654 | + hist_err_str[0] = '\0'; |
1a6e0f06 | 28655 | +} |
b3bbd485 JK |
28656 | |
28657 | - attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); | |
28658 | - if (!attrs) | |
28659 | - return ERR_PTR(-ENOMEM); | |
e4b2b4a8 | 28660 | +static bool have_hist_err(void) |
1a6e0f06 | 28661 | +{ |
e4b2b4a8 JK |
28662 | + if (strlen(hist_err_str)) |
28663 | + return true; | |
b3bbd485 JK |
28664 | |
28665 | - while (trigger_str) { | |
28666 | - char *str = strsep(&trigger_str, ":"); | |
e4b2b4a8 JK |
28667 | + return false; |
28668 | +} | |
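hist_err() implements a deliberately simple error channel: the buffer keeps only the first error recorded until hist_err_clear() resets it, and an append that would overflow MAX_FILTER_STR_VAL is silently dropped rather than truncated. A self-contained sketch of the same behavior (the buffer size here is arbitrary):

    #include <stdio.h>
    #include <string.h>

    #define ERR_LEN 256

    /* One-shot error buffer: the first error wins until it is
     * explicitly cleared, and appends are length-guarded. */
    static char err_buf[ERR_LEN];

    static void err_set(const char *msg, const char *detail)
    {
            if (err_buf[0])                 /* first error wins */
                    return;
            if (strlen(msg) + strlen(detail) >= sizeof(err_buf))
                    return;                 /* refuse to truncate */
            strcat(err_buf, msg);
            strcat(err_buf, detail);
    }

    static void err_clear(void) { err_buf[0] = '\0'; }

    int main(void)
    {
            err_set("unknown variable: ", "$lat");
            err_set("second error: ", "ignored");
            printf("%s\n", err_buf);
            err_clear();
            return 0;
    }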
b3bbd485 JK |
28669 | |
28670 | - if ((strncmp(str, "key=", strlen("key=")) == 0) || | |
28671 | - (strncmp(str, "keys=", strlen("keys=")) == 0)) | |
28672 | - attrs->keys_str = kstrdup(str, GFP_KERNEL); | |
28673 | - else if ((strncmp(str, "val=", strlen("val=")) == 0) || | |
28674 | - (strncmp(str, "vals=", strlen("vals=")) == 0) || | |
28675 | - (strncmp(str, "values=", strlen("values=")) == 0)) | |
28676 | - attrs->vals_str = kstrdup(str, GFP_KERNEL); | |
28677 | - else if (strncmp(str, "sort=", strlen("sort=")) == 0) | |
28678 | - attrs->sort_key_str = kstrdup(str, GFP_KERNEL); | |
28679 | - else if (strncmp(str, "name=", strlen("name=")) == 0) | |
28680 | - attrs->name = kstrdup(str, GFP_KERNEL); | |
28681 | - else if (strcmp(str, "pause") == 0) | |
28682 | - attrs->pause = true; | |
28683 | - else if ((strcmp(str, "cont") == 0) || | |
28684 | - (strcmp(str, "continue") == 0)) | |
28685 | - attrs->cont = true; | |
28686 | - else if (strcmp(str, "clear") == 0) | |
28687 | - attrs->clear = true; | |
28688 | - else if (strncmp(str, "size=", strlen("size=")) == 0) { | |
28689 | - int map_bits = parse_map_size(str); | |
e4b2b4a8 JK |
28690 | +static LIST_HEAD(synth_event_list); |
28691 | +static DEFINE_MUTEX(synth_event_mutex); | |
b3bbd485 JK |
28692 | |
28693 | - if (map_bits < 0) { | |
28694 | - ret = map_bits; | |
28695 | - goto free; | |
28696 | - } | |
28697 | - attrs->map_bits = map_bits; | |
e4b2b4a8 JK |
28698 | +struct synth_trace_event { |
28699 | + struct trace_entry ent; | |
28700 | + u64 fields[]; | |
28701 | +}; | |
1a6e0f06 | 28702 | + |
e4b2b4a8 JK |
28703 | +static int synth_event_define_fields(struct trace_event_call *call) |
28704 | +{ | |
28705 | + struct synth_trace_event trace; | |
28706 | + int offset = offsetof(typeof(trace), fields); | |
28707 | + struct synth_event *event = call->data; | |
28708 | + unsigned int i, size, n_u64; | |
28709 | + char *name, *type; | |
28710 | + bool is_signed; | |
28711 | + int ret = 0; | |
28712 | + | |
28713 | + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { | |
28714 | + size = event->fields[i]->size; | |
28715 | + is_signed = event->fields[i]->is_signed; | |
28716 | + type = event->fields[i]->type; | |
28717 | + name = event->fields[i]->name; | |
28718 | + ret = trace_define_field(call, type, name, offset, size, | |
28719 | + is_signed, FILTER_OTHER); | |
28720 | + if (ret) | |
28721 | + break; | |
1a6e0f06 | 28722 | + |
e4b2b4a8 JK |
28723 | + if (event->fields[i]->is_string) { |
28724 | + offset += STR_VAR_LEN_MAX; | |
28725 | + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); | |
b3bbd485 JK |
28726 | } else { |
28727 | - ret = -EINVAL; | |
28728 | - goto free; | |
e4b2b4a8 JK |
28729 | + offset += sizeof(u64); |
28730 | + n_u64++; | |
b3bbd485 JK |
28731 | } |
28732 | } | |
28733 | ||
28734 | - if (!attrs->keys_str) { | |
28735 | - ret = -EINVAL; | |
28736 | - goto free; | |
28737 | - } | |
e4b2b4a8 | 28738 | + event->n_u64 = n_u64; |
b3bbd485 JK |
28739 | |
28740 | - return attrs; | |
28741 | - free: | |
28742 | - destroy_hist_trigger_attrs(attrs); | |
e4b2b4a8 JK |
28743 | + return ret; |
28744 | +} | |
b3bbd485 JK |
28745 | |
28746 | - return ERR_PTR(ret); | |
e4b2b4a8 JK |
28747 | +static bool synth_field_signed(char *type) |
28748 | +{ | |
28749 | + if (strncmp(type, "u", 1) == 0) | |
28750 | + return false; | |
1a6e0f06 | 28751 | + |
e4b2b4a8 | 28752 | + return true; |
b3bbd485 JK |
28753 | } |
28754 | ||
28755 | -static inline void save_comm(char *comm, struct task_struct *task) | |
e4b2b4a8 | 28756 | +static int synth_field_is_string(char *type) |
b3bbd485 JK |
28757 | { |
28758 | - if (!task->pid) { | |
28759 | - strcpy(comm, "<idle>"); | |
28760 | - return; | |
28761 | - } | |
e4b2b4a8 JK |
28762 | + if (strstr(type, "char[") != NULL) |
28763 | + return true; | |
b3bbd485 JK |
28764 | |
28765 | - if (WARN_ON_ONCE(task->pid < 0)) { | |
28766 | - strcpy(comm, "<XXX>"); | |
28767 | - return; | |
28768 | - } | |
e4b2b4a8 | 28769 | + return false; |
1a6e0f06 | 28770 | +} |
b3bbd485 JK |
28771 | |
28772 | - memcpy(comm, task->comm, TASK_COMM_LEN); | |
e4b2b4a8 | 28773 | +static int synth_field_string_size(char *type) |
1a6e0f06 | 28774 | +{ |
e4b2b4a8 JK |
28775 | + char buf[4], *end, *start; |
28776 | + unsigned int len; | |
28777 | + int size, err; | |
1a6e0f06 | 28778 | + |
e4b2b4a8 JK |
28779 | + start = strstr(type, "char["); |
28780 | + if (start == NULL) | |
28781 | + return -EINVAL; | |
28782 | + start += strlen("char["); | |
1a6e0f06 | 28783 | + |
e4b2b4a8 JK |
28784 | + end = strchr(type, ']'); |
28785 | + if (!end || end < start) | |
28786 | + return -EINVAL; | |
28787 | + | |
28788 | + len = end - start; | |
28789 | + if (len > 3) | |
28790 | + return -EINVAL; | |
28791 | + | |
28792 | + strncpy(buf, start, len); | |
28793 | + buf[len] = '\0'; | |
28794 | + | |
28795 | + err = kstrtouint(buf, 0, &size); | |
28796 | + if (err) | |
28797 | + return err; | |
28798 | + | |
28799 | + if (size > STR_VAR_LEN_MAX) | |
28800 | + return -EINVAL; | |
28801 | + | |
28802 | + return size; | |
b3bbd485 JK |
28803 | } |
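synth_field_string_size() extracts N from a "char[N]" type string, bounding both the digit count (the kernel buffer holds at most three digits) and the resulting size against STR_VAR_LEN_MAX. An equivalent userspace sketch:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define STR_LEN_MAX 32

    /* Extract N from "char[N]" with the same bounds checks as the
     * kernel helper above. */
    static int string_size(const char *type)
    {
            const char *start = strstr(type, "char[");
            char buf[4], *end;
            size_t len;
            long size;

            if (!start)
                    return -1;
            start += strlen("char[");

            end = strchr(start, ']');
            if (!end || end < start)
                    return -1;

            len = (size_t)(end - start);
            if (len == 0 || len > 3)        /* at most three digits */
                    return -1;

            memcpy(buf, start, len);
            buf[len] = '\0';

            size = strtol(buf, NULL, 0);
            if (size <= 0 || size > STR_LEN_MAX)
                    return -1;

            return (int)size;
    }

    int main(void)
    {
            printf("%d\n", string_size("char[16]"));  /* 16 */
            printf("%d\n", string_size("char[64]"));  /* -1, over cap */
            return 0;
    }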
28804 | ||
28805 | -static void hist_trigger_elt_comm_free(struct tracing_map_elt *elt) | |
e4b2b4a8 | 28806 | +static int synth_field_size(char *type) |
b3bbd485 JK |
28807 | { |
28808 | - kfree((char *)elt->private_data); | |
e4b2b4a8 JK |
28809 | + int size = 0; |
28810 | + | |
28811 | + if (strcmp(type, "s64") == 0) | |
28812 | + size = sizeof(s64); | |
28813 | + else if (strcmp(type, "u64") == 0) | |
28814 | + size = sizeof(u64); | |
28815 | + else if (strcmp(type, "s32") == 0) | |
28816 | + size = sizeof(s32); | |
28817 | + else if (strcmp(type, "u32") == 0) | |
28818 | + size = sizeof(u32); | |
28819 | + else if (strcmp(type, "s16") == 0) | |
28820 | + size = sizeof(s16); | |
28821 | + else if (strcmp(type, "u16") == 0) | |
28822 | + size = sizeof(u16); | |
28823 | + else if (strcmp(type, "s8") == 0) | |
28824 | + size = sizeof(s8); | |
28825 | + else if (strcmp(type, "u8") == 0) | |
28826 | + size = sizeof(u8); | |
28827 | + else if (strcmp(type, "char") == 0) | |
28828 | + size = sizeof(char); | |
28829 | + else if (strcmp(type, "unsigned char") == 0) | |
28830 | + size = sizeof(unsigned char); | |
28831 | + else if (strcmp(type, "int") == 0) | |
28832 | + size = sizeof(int); | |
28833 | + else if (strcmp(type, "unsigned int") == 0) | |
28834 | + size = sizeof(unsigned int); | |
28835 | + else if (strcmp(type, "long") == 0) | |
28836 | + size = sizeof(long); | |
28837 | + else if (strcmp(type, "unsigned long") == 0) | |
28838 | + size = sizeof(unsigned long); | |
28839 | + else if (strcmp(type, "pid_t") == 0) | |
28840 | + size = sizeof(pid_t); | |
28841 | + else if (synth_field_is_string(type)) | |
28842 | + size = synth_field_string_size(type); | |
1a6e0f06 | 28843 | + |
e4b2b4a8 | 28844 | + return size; |
b3bbd485 JK |
28845 | } |
28846 | ||
28847 | -static int hist_trigger_elt_comm_alloc(struct tracing_map_elt *elt) | |
e4b2b4a8 | 28848 | +static const char *synth_field_fmt(char *type) |
b3bbd485 JK |
28849 | { |
28850 | - struct hist_trigger_data *hist_data = elt->map->private_data; | |
28851 | - struct hist_field *key_field; | |
28852 | - unsigned int i; | |
e4b2b4a8 JK |
28853 | + const char *fmt = "%llu"; |
28854 | + | |
28855 | + if (strcmp(type, "s64") == 0) | |
28856 | + fmt = "%lld"; | |
28857 | + else if (strcmp(type, "u64") == 0) | |
28858 | + fmt = "%llu"; | |
28859 | + else if (strcmp(type, "s32") == 0) | |
28860 | + fmt = "%d"; | |
28861 | + else if (strcmp(type, "u32") == 0) | |
28862 | + fmt = "%u"; | |
28863 | + else if (strcmp(type, "s16") == 0) | |
28864 | + fmt = "%d"; | |
28865 | + else if (strcmp(type, "u16") == 0) | |
28866 | + fmt = "%u"; | |
28867 | + else if (strcmp(type, "s8") == 0) | |
28868 | + fmt = "%d"; | |
28869 | + else if (strcmp(type, "u8") == 0) | |
28870 | + fmt = "%u"; | |
28871 | + else if (strcmp(type, "char") == 0) | |
28872 | + fmt = "%d"; | |
28873 | + else if (strcmp(type, "unsigned char") == 0) | |
28874 | + fmt = "%u"; | |
28875 | + else if (strcmp(type, "int") == 0) | |
28876 | + fmt = "%d"; | |
28877 | + else if (strcmp(type, "unsigned int") == 0) | |
28878 | + fmt = "%u"; | |
28879 | + else if (strcmp(type, "long") == 0) | |
28880 | + fmt = "%ld"; | |
28881 | + else if (strcmp(type, "unsigned long") == 0) | |
28882 | + fmt = "%lu"; | |
28883 | + else if (strcmp(type, "pid_t") == 0) | |
28884 | + fmt = "%d"; | |
28885 | + else if (synth_field_is_string(type)) | |
28886 | + fmt = "%s"; | |
28887 | + | |
28888 | + return fmt; | |
28889 | +} | |
b3bbd485 JK |
28890 | |
28891 | - for_each_hist_key_field(i, hist_data) { | |
28892 | - key_field = hist_data->fields[i]; | |
e4b2b4a8 JK |
28893 | +static enum print_line_t print_synth_event(struct trace_iterator *iter, |
28894 | + int flags, | |
28895 | + struct trace_event *event) | |
28896 | +{ | |
28897 | + struct trace_array *tr = iter->tr; | |
28898 | + struct trace_seq *s = &iter->seq; | |
28899 | + struct synth_trace_event *entry; | |
28900 | + struct synth_event *se; | |
28901 | + unsigned int i, n_u64; | |
28902 | + char print_fmt[32]; | |
28903 | + const char *fmt; | |
b3bbd485 JK |
28904 | |
28905 | - if (key_field->flags & HIST_FIELD_FL_EXECNAME) { | |
28906 | - unsigned int size = TASK_COMM_LEN + 1; | |
e4b2b4a8 JK |
28907 | + entry = (struct synth_trace_event *)iter->ent; |
28908 | + se = container_of(event, struct synth_event, call.event); | |
b3bbd485 JK |
28909 | |
28910 | - elt->private_data = kzalloc(size, GFP_KERNEL); | |
28911 | - if (!elt->private_data) | |
28912 | - return -ENOMEM; | |
28913 | - break; | |
e4b2b4a8 JK |
28914 | + trace_seq_printf(s, "%s: ", se->name); |
28915 | + | |
28916 | + for (i = 0, n_u64 = 0; i < se->n_fields; i++) { | |
28917 | + if (trace_seq_has_overflowed(s)) | |
28918 | + goto end; | |
28919 | + | |
28920 | + fmt = synth_field_fmt(se->fields[i]->type); | |
28921 | + | |
28922 | + /* parameter types */ | |
28923 | + if (tr->trace_flags & TRACE_ITER_VERBOSE) | |
28924 | + trace_seq_printf(s, "%s ", fmt); | |
28925 | + | |
28926 | + snprintf(print_fmt, sizeof(print_fmt), "%%s=%s%%s", fmt); | |
28927 | + | |
28928 | + /* parameter values */ | |
28929 | + if (se->fields[i]->is_string) { | |
28930 | + trace_seq_printf(s, print_fmt, se->fields[i]->name, | |
28931 | + (char *)&entry->fields[n_u64], | |
28932 | + i == se->n_fields - 1 ? "" : " "); | |
28933 | + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); | |
28934 | + } else { | |
28935 | + trace_seq_printf(s, print_fmt, se->fields[i]->name, | |
28936 | + entry->fields[n_u64], | |
28937 | + i == se->n_fields - 1 ? "" : " "); | |
28938 | + n_u64++; | |
b3bbd485 JK |
28939 | } |
28940 | } | |
e4b2b4a8 JK |
28941 | +end: |
28942 | + trace_seq_putc(s, '\n'); | |
b3bbd485 JK |
28943 | |
28944 | - return 0; | |
e4b2b4a8 | 28945 | + return trace_handle_return(s); |
b3bbd485 JK |
28946 | } |
28947 | ||
28948 | -static void hist_trigger_elt_comm_copy(struct tracing_map_elt *to, | |
28949 | - struct tracing_map_elt *from) | |
e4b2b4a8 JK |
28950 | +static struct trace_event_functions synth_event_funcs = { |
28951 | + .trace = print_synth_event | |
28952 | +}; | |
1a6e0f06 | 28953 | + |
e4b2b4a8 JK |
28954 | +static notrace void trace_event_raw_event_synth(void *__data, |
28955 | + u64 *var_ref_vals, | |
28956 | + unsigned int var_ref_idx) | |
b3bbd485 JK |
28957 | { |
28958 | - char *comm_from = from->private_data; | |
28959 | - char *comm_to = to->private_data; | |
e4b2b4a8 JK |
28960 | + struct trace_event_file *trace_file = __data; |
28961 | + struct synth_trace_event *entry; | |
28962 | + struct trace_event_buffer fbuffer; | |
28963 | + struct ring_buffer *buffer; | |
28964 | + struct synth_event *event; | |
28965 | + unsigned int i, n_u64; | |
28966 | + int fields_size = 0; | |
1a6e0f06 | 28967 | + |
e4b2b4a8 JK |
28968 | + event = trace_file->event_call->data; |
28969 | + | |
28970 | + if (trace_trigger_soft_disabled(trace_file)) | |
1a6e0f06 | 28971 | + return; |
1a6e0f06 | 28972 | + |
e4b2b4a8 | 28973 | + fields_size = event->n_u64 * sizeof(u64); |
1a6e0f06 | 28974 | + |
e4b2b4a8 JK |
28975 | + /* |
28976 | + * Avoid ring buffer recursion detection, as this event | |
28977 | + * is being performed within another event. | |
28978 | + */ | |
28979 | + buffer = trace_file->tr->trace_buffer.buffer; | |
28980 | + ring_buffer_nest_start(buffer); | |
28981 | + | |
28982 | + entry = trace_event_buffer_reserve(&fbuffer, trace_file, | |
28983 | + sizeof(*entry) + fields_size); | |
28984 | + if (!entry) | |
28985 | + goto out; | |
28986 | + | |
28987 | + for (i = 0, n_u64 = 0; i < event->n_fields; i++) { | |
28988 | + if (event->fields[i]->is_string) { | |
28989 | + char *str_val = (char *)(long)var_ref_vals[var_ref_idx + i]; | |
28990 | + char *str_field = (char *)&entry->fields[n_u64]; | |
28991 | + | |
28992 | + strscpy(str_field, str_val, STR_VAR_LEN_MAX); | |
28993 | + n_u64 += STR_VAR_LEN_MAX / sizeof(u64); | |
28994 | + } else { | |
28995 | + entry->fields[n_u64] = var_ref_vals[var_ref_idx + i]; | |
28996 | + n_u64++; | |
28997 | + } | |
1a6e0f06 JK |
28998 | + } |
28999 | + | |
e4b2b4a8 JK |
29000 | + trace_event_buffer_commit(&fbuffer); |
29001 | +out: | |
29002 | + ring_buffer_nest_end(buffer); | |
1a6e0f06 | 29003 | +} |
b3bbd485 JK |
29004 | |
29005 | - if (comm_from) | |
29006 | - memcpy(comm_to, comm_from, TASK_COMM_LEN + 1); | |
e4b2b4a8 | 29007 | +static void free_synth_event_print_fmt(struct trace_event_call *call) |
1a6e0f06 | 29008 | +{ |
e4b2b4a8 JK |
29009 | + if (call) { |
29010 | + kfree(call->print_fmt); | |
29011 | + call->print_fmt = NULL; | |
1a6e0f06 | 29012 | + } |
b3bbd485 JK |
29013 | } |
29014 | ||
29015 | -static void hist_trigger_elt_comm_init(struct tracing_map_elt *elt) | |
e4b2b4a8 JK |
29016 | +static int __set_synth_event_print_fmt(struct synth_event *event, |
29017 | + char *buf, int len) | |
b3bbd485 JK |
29018 | { |
29019 | - char *comm = elt->private_data; | |
e4b2b4a8 JK |
29020 | + const char *fmt; |
29021 | + int pos = 0; | |
29022 | + int i; | |
29023 | + | |
29024 | + /* When len=0, we just calculate the needed length */ | |
29025 | +#define LEN_OR_ZERO (len ? len - pos : 0) | |
29026 | + | |
29027 | + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); | |
29028 | + for (i = 0; i < event->n_fields; i++) { | |
29029 | + fmt = synth_field_fmt(event->fields[i]->type); | |
29030 | + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s%s", | |
29031 | + event->fields[i]->name, fmt, | |
29032 | + i == event->n_fields - 1 ? "" : ", "); | |
1a6e0f06 | 29033 | + } |
e4b2b4a8 | 29034 | + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); |
1a6e0f06 | 29035 | + |
e4b2b4a8 JK |
29036 | + for (i = 0; i < event->n_fields; i++) { |
29037 | + pos += snprintf(buf + pos, LEN_OR_ZERO, | |
29038 | + ", REC->%s", event->fields[i]->name); | |
1a6e0f06 JK |
29039 | + } |
29040 | + | |
e4b2b4a8 | 29041 | +#undef LEN_OR_ZERO |
b3bbd485 JK |
29042 | |
29043 | - if (comm) | |
29044 | - save_comm(comm, current); | |
e4b2b4a8 JK |
29045 | + /* return the length of print_fmt */ |
29046 | + return pos; | |
b3bbd485 JK |
29047 | } |
29048 | ||
29049 | -static const struct tracing_map_ops hist_trigger_elt_comm_ops = { | |
29050 | - .elt_alloc = hist_trigger_elt_comm_alloc, | |
29051 | - .elt_copy = hist_trigger_elt_comm_copy, | |
29052 | - .elt_free = hist_trigger_elt_comm_free, | |
29053 | - .elt_init = hist_trigger_elt_comm_init, | |
29054 | -}; | |
e4b2b4a8 JK |
29055 | +static int set_synth_event_print_fmt(struct trace_event_call *call) |
29056 | +{ | |
29057 | + struct synth_event *event = call->data; | |
29058 | + char *print_fmt; | |
29059 | + int len; | |
1a6e0f06 | 29060 | + |
e4b2b4a8 JK |
29061 | + /* First: called with 0 length to calculate the needed length */ |
29062 | + len = __set_synth_event_print_fmt(event, NULL, 0); | |
1a6e0f06 | 29063 | + |
e4b2b4a8 JK |
29064 | + print_fmt = kmalloc(len + 1, GFP_KERNEL); |
29065 | + if (!print_fmt) | |
29066 | + return -ENOMEM; | |
1a6e0f06 | 29067 | + |
e4b2b4a8 JK |
29068 | + /* Second: actually write the @print_fmt */ |
29069 | + __set_synth_event_print_fmt(event, print_fmt, len + 1); | |
29070 | + call->print_fmt = print_fmt; | |
b3bbd485 JK |
29071 | |
29072 | -static void destroy_hist_field(struct hist_field *hist_field) | |
e4b2b4a8 | 29073 | + return 0; |
1a6e0f06 JK |
29074 | +} |
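__set_synth_event_print_fmt() relies on the classic two-pass snprintf() idiom: because snprintf() returns the length it would have produced, calling the formatter once with a zero length yields the exact allocation size. A runnable distillation (format strings simplified):

    #include <stdio.h>
    #include <stdlib.h>

    /* Pass 1 calls the formatter with len == 0 just to measure;
     * pass 2 fills the buffer. */
    static int format_fmt(char *buf, int len, const char *name)
    {
            int pos = 0;

    #define LEN_OR_ZERO (len ? len - pos : 0)
            pos += snprintf(len ? buf + pos : NULL, LEN_OR_ZERO,
                            "\"%s=%%llu\"", name);
            pos += snprintf(len ? buf + pos : NULL, LEN_OR_ZERO,
                            ", REC->%s", name);
    #undef LEN_OR_ZERO

            return pos;
    }

    int main(void)
    {
            int len = format_fmt(NULL, 0, "lat");   /* pass 1: measure */
            char *buf = malloc(len + 1);

            if (!buf)
                    return 1;
            format_fmt(buf, len + 1, "lat");        /* pass 2: fill */
            printf("%s\n", buf);
            free(buf);
            return 0;
    }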
29075 | + | |
e4b2b4a8 | 29076 | +static void free_synth_field(struct synth_field *field) |
b3bbd485 JK |
29077 | { |
29078 | - kfree(hist_field); | |
e4b2b4a8 JK |
29079 | + kfree(field->type); |
29080 | + kfree(field->name); | |
29081 | + kfree(field); | |
b3bbd485 JK |
29082 | } |
29083 | ||
29084 | -static struct hist_field *create_hist_field(struct ftrace_event_field *field, | |
29085 | - unsigned long flags) | |
e4b2b4a8 JK |
29086 | +static struct synth_field *parse_synth_field(char *field_type, |
29087 | + char *field_name) | |
b3bbd485 JK |
29088 | { |
29089 | - struct hist_field *hist_field; | |
e4b2b4a8 JK |
29090 | + struct synth_field *field; |
29091 | + int len, ret = 0; | |
29092 | + char *array; | |
b3bbd485 JK |
29093 | |
29094 | - if (field && is_function_field(field)) | |
29095 | - return NULL; | |
e4b2b4a8 JK |
29096 | + if (field_type[0] == ';') |
29097 | + field_type++; | |
b3bbd485 JK |
29098 | |
29099 | - hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL); | |
29100 | - if (!hist_field) | |
29101 | - return NULL; | |
e4b2b4a8 JK |
29102 | + len = strlen(field_name); |
29103 | + if (field_name[len - 1] == ';') | |
29104 | + field_name[len - 1] = '\0'; | |
b3bbd485 JK |
29105 | |
29106 | - if (flags & HIST_FIELD_FL_HITCOUNT) { | |
29107 | - hist_field->fn = hist_field_counter; | |
29108 | - goto out; | |
e4b2b4a8 JK |
29109 | + field = kzalloc(sizeof(*field), GFP_KERNEL); |
29110 | + if (!field) | |
29111 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 29112 | + |
e4b2b4a8 JK |
29113 | + len = strlen(field_type) + 1; |
29114 | + array = strchr(field_name, '['); | |
29115 | + if (array) | |
29116 | + len += strlen(array); | |
29117 | + field->type = kzalloc(len, GFP_KERNEL); | |
29118 | + if (!field->type) { | |
29119 | + ret = -ENOMEM; | |
29120 | + goto free; | |
29121 | + } | |
29122 | + strcat(field->type, field_type); | |
29123 | + if (array) { | |
29124 | + strcat(field->type, array); | |
29125 | + *array = '\0'; | |
b3bbd485 JK |
29126 | } |
29127 | ||
29128 | - if (flags & HIST_FIELD_FL_STACKTRACE) { | |
29129 | - hist_field->fn = hist_field_none; | |
29130 | - goto out; | |
e4b2b4a8 JK |
29131 | + field->size = synth_field_size(field->type); |
29132 | + if (!field->size) { | |
29133 | + ret = -EINVAL; | |
29134 | + goto free; | |
b3bbd485 JK |
29135 | } |
29136 | ||
29137 | - if (flags & HIST_FIELD_FL_LOG2) { | |
29138 | - hist_field->fn = hist_field_log2; | |
29139 | - goto out; | |
e4b2b4a8 JK |
29140 | + if (synth_field_is_string(field->type)) |
29141 | + field->is_string = true; | |
29142 | + | |
29143 | + field->is_signed = synth_field_signed(field->type); | |
29144 | + | |
29145 | + field->name = kstrdup(field_name, GFP_KERNEL); | |
29146 | + if (!field->name) { | |
29147 | + ret = -ENOMEM; | |
29148 | + goto free; | |
29149 | + } | |
29150 | + out: | |
29151 | + return field; | |
29152 | + free: | |
29153 | + free_synth_field(field); | |
29154 | + field = ERR_PTR(ret); | |
29155 | + goto out; | |
29156 | +} | |
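parse_synth_field() returns errors through the kernel's ERR_PTR()/PTR_ERR() convention, folding a small negative errno into the returned pointer so a single value covers both outcomes. A userspace mimic of the convention:

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mimic of ERR_PTR()/IS_ERR(): errno values live in the top
     * (never-mapped) page of the address space. */
    #define MAX_ERRNO 4095

    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
            return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    static void *parse_field(int ok)
    {
            static int field = 42;

            if (!ok)
                    return ERR_PTR(-EINVAL);
            return &field;
    }

    int main(void)
    {
            void *f = parse_field(0);

            if (IS_ERR(f))
                    printf("parse failed: %ld\n", PTR_ERR(f));
            return 0;
    }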
29157 | + | |
29158 | +static void free_synth_tracepoint(struct tracepoint *tp) | |
1a6e0f06 | 29159 | +{ |
e4b2b4a8 JK |
29160 | + if (!tp) |
29161 | + return; | |
29162 | + | |
29163 | + kfree(tp->name); | |
29164 | + kfree(tp); | |
1a6e0f06 | 29165 | +} |
1a6e0f06 | 29166 | + |
e4b2b4a8 | 29167 | +static struct tracepoint *alloc_synth_tracepoint(char *name) |
1a6e0f06 | 29168 | +{ |
e4b2b4a8 | 29169 | + struct tracepoint *tp; |
1a6e0f06 | 29170 | + |
e4b2b4a8 JK |
29171 | + tp = kzalloc(sizeof(*tp), GFP_KERNEL); |
29172 | + if (!tp) | |
29173 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 29174 | + |
e4b2b4a8 JK |
29175 | + tp->name = kstrdup(name, GFP_KERNEL); |
29176 | + if (!tp->name) { | |
29177 | + kfree(tp); | |
29178 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 29179 | + } |
e4b2b4a8 JK |
29180 | + |
29181 | + return tp; | |
1a6e0f06 | 29182 | +} |
1a6e0f06 | 29183 | + |
e4b2b4a8 JK |
29184 | +typedef void (*synth_probe_func_t) (void *__data, u64 *var_ref_vals, |
29185 | + unsigned int var_ref_idx); | |
1a6e0f06 | 29186 | + |
e4b2b4a8 JK |
29187 | +static inline void trace_synth(struct synth_event *event, u64 *var_ref_vals, |
29188 | + unsigned int var_ref_idx) | |
29189 | +{ | |
29190 | + struct tracepoint *tp = event->tp; | |
29191 | + | |
29192 | + if (unlikely(atomic_read(&tp->key.enabled) > 0)) { | |
29193 | + struct tracepoint_func *probe_func_ptr; | |
29194 | + synth_probe_func_t probe_func; | |
29195 | + void *__data; | |
29196 | + | |
29197 | + if (!(cpu_online(raw_smp_processor_id()))) | |
29198 | + return; | |
29199 | + | |
29200 | + probe_func_ptr = rcu_dereference_sched((tp)->funcs); | |
29201 | + if (probe_func_ptr) { | |
29202 | + do { | |
29203 | + probe_func = probe_func_ptr->func; | |
29204 | + __data = probe_func_ptr->data; | |
29205 | + probe_func(__data, var_ref_vals, var_ref_idx); | |
29206 | + } while ((++probe_func_ptr)->func); | |
29207 | + } | |
29208 | + } | |
29209 | +} | |
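The dispatch loop in trace_synth() walks the tracepoint's NULL-terminated array of registered probes, invoking each with its private data cookie. The shape of that loop in isolation:

    #include <stdio.h>

    /* Walk a NULL-terminated array of function pointers, calling each
     * with its registered data cookie, as trace_synth() does. */
    struct probe {
            void (*func)(void *data);
            void *data;
    };

    static void hello(void *data) { printf("probe: %s\n", (char *)data); }

    int main(void)
    {
            struct probe probes[] = {
                    { hello, "first" },
                    { hello, "second" },
                    { NULL, NULL },         /* terminator */
            };
            struct probe *p = probes;

            if (p->func) {
                    do {
                            p->func(p->data);
                    } while ((++p)->func);
            }
            return 0;
    }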
29210 | + | |
29211 | +static struct synth_event *find_synth_event(const char *name) | |
29212 | +{ | |
29213 | + struct synth_event *event; | |
29214 | + | |
29215 | + list_for_each_entry(event, &synth_event_list, list) { | |
29216 | + if (strcmp(event->name, name) == 0) | |
29217 | + return event; | |
29218 | + } | |
29219 | + | |
29220 | + return NULL; | |
29221 | +} | |
29222 | + | |
29223 | +static int register_synth_event(struct synth_event *event) | |
29224 | +{ | |
29225 | + struct trace_event_call *call = &event->call; | |
29226 | + int ret = 0; | |
29227 | + | |
29228 | + event->call.class = &event->class; | |
29229 | + event->class.system = kstrdup(SYNTH_SYSTEM, GFP_KERNEL); | |
29230 | + if (!event->class.system) { | |
29231 | + ret = -ENOMEM; | |
29232 | + goto out; | |
29233 | + } | |
29234 | + | |
29235 | + event->tp = alloc_synth_tracepoint(event->name); | |
29236 | + if (IS_ERR(event->tp)) { | |
29237 | + ret = PTR_ERR(event->tp); | |
29238 | + event->tp = NULL; | |
29239 | + goto out; | |
29240 | + } | |
29241 | + | |
29242 | + INIT_LIST_HEAD(&call->class->fields); | |
29243 | + call->event.funcs = &synth_event_funcs; | |
29244 | + call->class->define_fields = synth_event_define_fields; | |
29245 | + | |
29246 | + ret = register_trace_event(&call->event); | |
29247 | + if (!ret) { | |
29248 | + ret = -ENODEV; | |
29249 | + goto out; | |
29250 | + } | |
29251 | + call->flags = TRACE_EVENT_FL_TRACEPOINT; | |
29252 | + call->class->reg = trace_event_reg; | |
29253 | + call->class->probe = trace_event_raw_event_synth; | |
29254 | + call->data = event; | |
29255 | + call->tp = event->tp; | |
29256 | + | |
29257 | + ret = trace_add_event_call(call); | |
29258 | + if (ret) { | |
29259 | + pr_warn("Failed to register synthetic event: %s\n", | |
29260 | + trace_event_name(call)); | |
29261 | + goto err; | |
29262 | + } | |
29263 | + | |
29264 | + ret = set_synth_event_print_fmt(call); | |
29265 | + if (ret < 0) { | |
29266 | + trace_remove_event_call(call); | |
29267 | + goto err; | |
29268 | + } | |
29269 | + out: | |
29270 | + return ret; | |
29271 | + err: | |
29272 | + unregister_trace_event(&call->event); | |
29273 | + goto out; | |
29274 | +} | |
29275 | + | |
29276 | +static int unregister_synth_event(struct synth_event *event) | |
29277 | +{ | |
29278 | + struct trace_event_call *call = &event->call; | |
29279 | + int ret; | |
29280 | + | |
29281 | + ret = trace_remove_event_call(call); | |
29282 | + | |
29283 | + return ret; | |
29284 | +} | |
29285 | + | |
29286 | +static void free_synth_event(struct synth_event *event) | |
29287 | +{ | |
29288 | + unsigned int i; | |
29289 | + | |
29290 | + if (!event) | |
29291 | + return; | |
29292 | + | |
29293 | + for (i = 0; i < event->n_fields; i++) | |
29294 | + free_synth_field(event->fields[i]); | |
29295 | + | |
29296 | + kfree(event->fields); | |
29297 | + kfree(event->name); | |
29298 | + kfree(event->class.system); | |
29299 | + free_synth_tracepoint(event->tp); | |
29300 | + free_synth_event_print_fmt(&event->call); | |
29301 | + kfree(event); | |
29302 | +} | |
29303 | + | |
29304 | +static struct synth_event *alloc_synth_event(char *event_name, int n_fields, | |
29305 | + struct synth_field **fields) | |
29306 | +{ | |
29307 | + struct synth_event *event; | |
29308 | + unsigned int i; | |
1a6e0f06 | 29309 | + |
e4b2b4a8 JK |
29310 | + event = kzalloc(sizeof(*event), GFP_KERNEL); |
29311 | + if (!event) { | |
29312 | + event = ERR_PTR(-ENOMEM); | |
29313 | + goto out; | |
29314 | + } | |
1a6e0f06 | 29315 | + |
e4b2b4a8 JK |
29316 | + event->name = kstrdup(event_name, GFP_KERNEL); |
29317 | + if (!event->name) { | |
29318 | + kfree(event); | |
29319 | + event = ERR_PTR(-ENOMEM); | |
29320 | + goto out; | |
29321 | + } | |
1a6e0f06 | 29322 | + |
e4b2b4a8 JK |
29323 | + event->fields = kcalloc(n_fields, sizeof(*event->fields), GFP_KERNEL); |
29324 | + if (!event->fields) { | |
29325 | + free_synth_event(event); | |
29326 | + event = ERR_PTR(-ENOMEM); | |
29327 | + goto out; | |
29328 | + } | |
1a6e0f06 | 29329 | + |
e4b2b4a8 JK |
29330 | + for (i = 0; i < n_fields; i++) |
29331 | + event->fields[i] = fields[i]; | |
1a6e0f06 | 29332 | + |
e4b2b4a8 JK |
29333 | + event->n_fields = n_fields; |
29334 | + out: | |
29335 | + return event; | |
29336 | +} | |
1a6e0f06 | 29337 | + |
e4b2b4a8 JK |
29338 | +static void action_trace(struct hist_trigger_data *hist_data, |
29339 | + struct tracing_map_elt *elt, void *rec, | |
29340 | + struct ring_buffer_event *rbe, | |
29341 | + struct action_data *data, u64 *var_ref_vals) | |
1a6e0f06 | 29342 | +{ |
e4b2b4a8 | 29343 | + struct synth_event *event = data->onmatch.synth_event; |
1a6e0f06 | 29344 | + |
e4b2b4a8 JK |
29345 | + trace_synth(event, var_ref_vals, data->onmatch.var_ref_idx); |
29346 | +} | |
1a6e0f06 | 29347 | + |
e4b2b4a8 JK |
29348 | +struct hist_var_data { |
29349 | + struct list_head list; | |
29350 | + struct hist_trigger_data *hist_data; | |
29351 | +}; | |
1a6e0f06 | 29352 | + |
e4b2b4a8 JK |
29353 | +static void add_or_delete_synth_event(struct synth_event *event, int delete) |
29354 | +{ | |
29355 | + if (delete) | |
29356 | + free_synth_event(event); | |
29357 | + else { | |
29358 | + mutex_lock(&synth_event_mutex); | |
29359 | + if (!find_synth_event(event->name)) | |
29360 | + list_add(&event->list, &synth_event_list); | |
29361 | + else | |
29362 | + free_synth_event(event); | |
29363 | + mutex_unlock(&synth_event_mutex); | |
29364 | + } | |
1a6e0f06 JK |
29365 | +} |
29366 | + | |
e4b2b4a8 | 29367 | +static int create_synth_event(int argc, char **argv) |
1a6e0f06 | 29368 | +{ |
e4b2b4a8 JK |
29369 | + struct synth_field *field, *fields[SYNTH_FIELDS_MAX]; |
29370 | + struct synth_event *event = NULL; | |
29371 | + bool delete_event = false; | |
29372 | + int i, n_fields = 0, ret = 0; | |
29373 | + char *name; | |
1a6e0f06 | 29374 | + |
e4b2b4a8 | 29375 | + mutex_lock(&synth_event_mutex); |
1a6e0f06 | 29376 | + |
e4b2b4a8 JK |
29377 | + /* |
29378 | + * Argument syntax: | |
29379 | + * - Add synthetic event: <event_name> field[;field] ... | |
29380 | + * - Remove synthetic event: !<event_name> field[;field] ... | |
29381 | + * where 'field' = type field_name | |
29382 | + */ | |
29383 | + if (argc < 1) { | |
29384 | + ret = -EINVAL; | |
29385 | + goto out; | |
29386 | + } | |
1a6e0f06 | 29387 | + |
e4b2b4a8 JK |
29388 | + name = argv[0]; |
29389 | + if (name[0] == '!') { | |
29390 | + delete_event = true; | |
29391 | + name++; | |
29392 | + } | |
1a6e0f06 | 29393 | + |
e4b2b4a8 JK |
29394 | + event = find_synth_event(name); |
29395 | + if (event) { | |
29396 | + if (delete_event) { | |
29397 | + if (event->ref) { | |
29398 | + event = NULL; | |
29399 | + ret = -EBUSY; | |
29400 | + goto out; | |
29401 | + } | |
29402 | + list_del(&event->list); | |
29403 | + goto out; | |
1a6e0f06 | 29404 | + } |
e4b2b4a8 JK |
29405 | + event = NULL; |
29406 | + ret = -EEXIST; | |
29407 | + goto out; | |
29408 | + } else if (delete_event) | |
29409 | + goto out; | |
29410 | + | |
29411 | + if (argc < 2) { | |
29412 | + ret = -EINVAL; | |
29413 | + goto out; | |
1a6e0f06 | 29414 | + } |
1a6e0f06 | 29415 | + |
e4b2b4a8 JK |
29416 | + for (i = 1; i < argc - 1; i++) { |
29417 | + if (strcmp(argv[i], ";") == 0) | |
29418 | + continue; | |
29419 | + if (n_fields == SYNTH_FIELDS_MAX) { | |
29420 | + ret = -EINVAL; | |
29421 | + goto err; | |
29422 | + } | |
1a6e0f06 | 29423 | + |
e4b2b4a8 JK |
29424 | + field = parse_synth_field(argv[i], argv[i + 1]); |
29425 | + if (IS_ERR(field)) { | |
29426 | + ret = PTR_ERR(field); | |
29427 | + goto err; | |
29428 | + } | |
29429 | + fields[n_fields] = field; | |
29430 | + i++; n_fields++; | |
29431 | + } | |
1a6e0f06 | 29432 | + |
e4b2b4a8 JK |
29433 | + if (i < argc) { |
29434 | + ret = -EINVAL; | |
29435 | + goto err; | |
29436 | + } | |
1a6e0f06 | 29437 | + |
e4b2b4a8 JK |
29438 | + event = alloc_synth_event(name, n_fields, fields); |
29439 | + if (IS_ERR(event)) { | |
29440 | + ret = PTR_ERR(event); | |
29441 | + event = NULL; | |
29442 | + goto err; | |
1a6e0f06 | 29443 | + } |
e4b2b4a8 JK |
29444 | + out: |
29445 | + mutex_unlock(&synth_event_mutex); | |
1a6e0f06 | 29446 | + |
e4b2b4a8 JK |
29447 | + if (event) { |
29448 | + if (delete_event) { | |
29449 | + ret = unregister_synth_event(event); | |
29450 | + add_or_delete_synth_event(event, !ret); | |
29451 | + } else { | |
29452 | + ret = register_synth_event(event); | |
29453 | + add_or_delete_synth_event(event, ret); | |
29454 | + } | |
29455 | + } | |
29456 | + | |
29457 | + return ret; | |
29458 | + err: | |
29459 | + mutex_unlock(&synth_event_mutex); | |
29460 | + | |
29461 | + for (i = 0; i < n_fields; i++) | |
29462 | + free_synth_field(fields[i]); | |
29463 | + free_synth_event(event); | |
29464 | + | |
29465 | + return ret; | |
1a6e0f06 JK |
29466 | +} |
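create_synth_event() is wired up, via trace_parse_run_command(), as the parser for the synthetic_events control file, using the syntax in the comment above; prefixing the name with '!' removes an event. A usage sketch that writes one command to that file (the tracefs mount point is assumed and may instead be /sys/kernel/debug/tracing):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Create a synthetic event by writing a command in the syntax
     * documented above to the synthetic_events file. */
    int main(void)
    {
            const char *path = "/sys/kernel/tracing/synthetic_events";
            const char *cmd = "wakeup_latency u64 lat; pid_t pid\n";
            int fd = open(path, O_WRONLY | O_APPEND);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, cmd, strlen(cmd)) < 0)
                    perror("write");  /* "!wakeup_latency ..." removes */
            close(fd);
            return 0;
    }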
29467 | + | |
e4b2b4a8 | 29468 | +static int release_all_synth_events(void) |
1a6e0f06 | 29469 | +{ |
e4b2b4a8 JK |
29470 | + struct list_head release_events; |
29471 | + struct synth_event *event, *e; | |
29472 | + int ret = 0; | |
1a6e0f06 | 29473 | + |
e4b2b4a8 JK |
29474 | + INIT_LIST_HEAD(&release_events); |
29475 | + | |
29476 | + mutex_lock(&synth_event_mutex); | |
29477 | + | |
29478 | + list_for_each_entry(event, &synth_event_list, list) { | |
29479 | + if (event->ref) { | |
29480 | + mutex_unlock(&synth_event_mutex); | |
29481 | + return -EBUSY; | |
29482 | + } | |
29483 | + } | |
29484 | + | |
29485 | + list_splice_init(&event->list, &release_events); | |
29486 | + | |
29487 | + mutex_unlock(&synth_event_mutex); | |
29488 | + | |
29489 | + list_for_each_entry_safe(event, e, &release_events, list) { | |
29490 | + list_del(&event->list); | |
29491 | + | |
29492 | + ret = unregister_synth_event(event); | |
29493 | + add_or_delete_synth_event(event, !ret); | |
29494 | + } | |
29495 | + | |
29496 | + return ret; | |
1a6e0f06 JK |
29497 | +} |
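release_all_synth_events() uses the drain pattern: verify and detach everything while synth_event_mutex is held, then do the per-event unregistration with the lock dropped, so the slow teardown work never runs under the mutex. The pattern in miniature:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Detach the whole list under the lock, then tear down each item
     * with the lock dropped. */
    struct node { struct node *next; };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *head;

    static void release_all(void)
    {
            struct node *batch, *n;

            pthread_mutex_lock(&list_lock);
            batch = head;                   /* splice the list out */
            head = NULL;
            pthread_mutex_unlock(&list_lock);

            while ((n = batch)) {           /* teardown outside lock */
                    batch = n->next;
                    free(n);
            }
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++) {
                    struct node *n = malloc(sizeof(*n));

                    n->next = head;
                    head = n;
            }
            release_all();
            puts("released");
            return 0;
    }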
+
+static void *synth_events_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&synth_event_mutex);
+
+	return seq_list_start(&synth_event_list, *pos);
+}
+
+static void *synth_events_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &synth_event_list, pos);
+}
+
+static void synth_events_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&synth_event_mutex);
+}
+
+static int synth_events_seq_show(struct seq_file *m, void *v)
+{
+	struct synth_field *field;
+	struct synth_event *event = v;
+	unsigned int i;
+
+	seq_printf(m, "%s\t", event->name);
+
+	for (i = 0; i < event->n_fields; i++) {
+		field = event->fields[i];
+
+		/* parameter values */
+		seq_printf(m, "%s %s%s", field->type, field->name,
+			   i == event->n_fields - 1 ? "" : "; ");
+	}
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations synth_events_seq_op = {
+	.start = synth_events_seq_start,
+	.next = synth_events_seq_next,
+	.stop = synth_events_seq_stop,
+	.show = synth_events_seq_show
+};
+
+static int synth_events_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+		ret = release_all_synth_events();
+		if (ret < 0)
+			return ret;
+	}
+
+	return seq_open(file, &synth_events_seq_op);
+}
+
+static ssize_t synth_events_write(struct file *file,
+				  const char __user *buffer,
+				  size_t count, loff_t *ppos)
+{
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_synth_event);
+}
+
+static const struct file_operations synth_events_fops = {
+	.open = synth_events_open,
+	.write = synth_events_write,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
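[Editor's note: these file operations back the tracefs 'synthetic_events' control file: writes are fed line by line to create_synth_event(), reads walk synth_event_list under synth_event_mutex via the seq interface, and an open for write with O_TRUNC first calls release_all_synth_events(). An illustrative session (hypothetical event name; assumes the usual tracefs mount point):

	# cd /sys/kernel/debug/tracing
	# echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events
	# cat synthetic_events
	wakeup_latency	u64 lat; pid_t pid
	# echo > synthetic_events	# O_TRUNC open: releases all unreferenced events]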
+
+static u64 hist_field_timestamp(struct hist_field *hist_field,
+				struct tracing_map_elt *elt,
+				struct ring_buffer_event *rbe,
+				void *event)
+{
+	struct hist_trigger_data *hist_data = hist_field->hist_data;
+	struct trace_array *tr = hist_data->event_file->tr;
+
+	u64 ts = ring_buffer_event_time_stamp(rbe);
+
+	if (hist_data->attrs->ts_in_usecs && trace_clock_in_ns(tr))
+		ts = ns2usecs(ts);
+
+	return ts;
+}
+
+static u64 hist_field_cpu(struct hist_field *hist_field,
+			  struct tracing_map_elt *elt,
+			  struct ring_buffer_event *rbe,
+			  void *event)
+{
+	int cpu = smp_processor_id();
+
+	return cpu;
+}
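[Editor's note: hist_field_timestamp() and hist_field_cpu() provide the two fields a hist trigger can use without the event itself defining them: 'common_timestamp', rescaled to microseconds when the '.usecs' modifier was given and the trace clock reports nanoseconds, and 'cpu', the processor the event fired on. A sketch of both in use (event choices are illustrative):

	# echo 'hist:keys=cpu:values=hitcount' >> events/sched/sched_switch/trigger
	# echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_wakeup/trigger]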
+
+static struct hist_field *
+check_field_for_var_ref(struct hist_field *hist_field,
+			struct hist_trigger_data *var_data,
+			unsigned int var_idx)
+{
+	struct hist_field *found = NULL;
+
+	if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR_REF) {
+		if (hist_field->var.idx == var_idx &&
+		    hist_field->var.hist_data == var_data) {
+			found = hist_field;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *
+check_field_for_var_refs(struct hist_trigger_data *hist_data,
+			 struct hist_field *hist_field,
+			 struct hist_trigger_data *var_data,
+			 unsigned int var_idx,
+			 unsigned int level)
+{
+	struct hist_field *found = NULL;
+	unsigned int i;
+
+	if (level > 3)
+		return found;
+
+	if (!hist_field)
+		return found;
+
+	found = check_field_for_var_ref(hist_field, var_data, var_idx);
+	if (found)
+		return found;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+		struct hist_field *operand;
+
+		operand = hist_field->operands[i];
+		found = check_field_for_var_refs(hist_data, operand, var_data,
+						 var_idx, level + 1);
+		if (found)
+			return found;
+	}
+
+	return found;
+}
+
+static struct hist_field *find_var_ref(struct hist_trigger_data *hist_data,
+				       struct hist_trigger_data *var_data,
+				       unsigned int var_idx)
+{
+	struct hist_field *hist_field, *found = NULL;
+	unsigned int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		found = check_field_for_var_refs(hist_data, hist_field,
+						 var_data, var_idx, 0);
+		if (found)
+			return found;
+	}
+
+	for (i = 0; i < hist_data->n_synth_var_refs; i++) {
+		hist_field = hist_data->synth_var_refs[i];
+		found = check_field_for_var_refs(hist_data, hist_field,
+						 var_data, var_idx, 0);
+		if (found)
+			return found;
+	}
+
+	return found;
+}
+
+static struct hist_field *find_any_var_ref(struct hist_trigger_data *hist_data,
+					   unsigned int var_idx)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *found = NULL;
+	struct hist_var_data *var_data;
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		if (var_data->hist_data == hist_data)
+			continue;
+		found = find_var_ref(var_data->hist_data, hist_data, var_idx);
+		if (found)
+			break;
+	}
+
+	return found;
+}
+
+static bool check_var_refs(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *field;
+	bool found = false;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		field = hist_data->fields[i];
+		if (field && field->flags & HIST_FIELD_FL_VAR) {
+			if (find_any_var_ref(hist_data, field->var.idx)) {
+				found = true;
+				break;
+			}
+		}
+	}
+
+	return found;
+}
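[Editor's note: find_any_var_ref() and check_var_refs() let the histogram code detect that some other trigger still reads a variable this trigger defines; remove_hist_vars() below WARNs if the variable list is torn down while that is true. In a sequence like the following (illustrative), the final removal is expected to be refused by the unregister path, which lies outside this hunk, until the referencing trigger is removed first:

	# echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_wakeup/trigger
	# echo 'hist:keys=next_pid:lat=common_timestamp.usecs-$ts0' >> events/sched/sched_switch/trigger
	# echo '!hist:keys=pid:ts0=common_timestamp.usecs' >> events/sched/sched_wakeup/trigger	# refused: $ts0 still in use]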
+
+static struct hist_var_data *find_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data, *found = NULL;
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		if (var_data->hist_data == hist_data) {
+			found = var_data;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static bool field_has_hist_vars(struct hist_field *hist_field,
+				unsigned int level)
+{
+	int i;
+
+	if (level > 3)
+		return false;
+
+	if (!hist_field)
+		return false;
+
+	if (hist_field->flags & HIST_FIELD_FL_VAR ||
+	    hist_field->flags & HIST_FIELD_FL_VAR_REF)
+		return true;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++) {
+		struct hist_field *operand;
+
+		operand = hist_field->operands[i];
+		if (field_has_hist_vars(operand, level + 1))
+			return true;
+	}
+
+	return false;
+}
+
+static bool has_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct hist_field *hist_field;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (field_has_hist_vars(hist_field, 0))
+			return true;
+	}
+
+	return false;
+}
+
+static int save_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data;
+
+	var_data = find_hist_vars(hist_data);
+	if (var_data)
+		return 0;
+
+	if (trace_array_get(tr) < 0)
+		return -ENODEV;
+
+	var_data = kzalloc(sizeof(*var_data), GFP_KERNEL);
+	if (!var_data) {
+		trace_array_put(tr);
+		return -ENOMEM;
+	}
+
+	var_data->hist_data = hist_data;
+	list_add(&var_data->list, &tr->hist_vars);
+
+	return 0;
+}
+
+static void remove_hist_vars(struct hist_trigger_data *hist_data)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_var_data *var_data;
+
+	var_data = find_hist_vars(hist_data);
+	if (!var_data)
+		return;
+
+	if (WARN_ON(check_var_refs(hist_data)))
+		return;
+
+	list_del(&var_data->list);
+
+	kfree(var_data);
+
+	trace_array_put(tr);
+}
+
+static struct hist_field *find_var_field(struct hist_trigger_data *hist_data,
+					 const char *var_name)
+{
+	struct hist_field *hist_field, *found = NULL;
+	int i;
+
+	for_each_hist_field(i, hist_data) {
+		hist_field = hist_data->fields[i];
+		if (hist_field && hist_field->flags & HIST_FIELD_FL_VAR &&
+		    strcmp(hist_field->var.name, var_name) == 0) {
+			found = hist_field;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *find_var(struct hist_trigger_data *hist_data,
+				   struct trace_event_file *file,
+				   const char *var_name)
+{
+	struct hist_trigger_data *test_data;
+	struct event_trigger_data *test;
+	struct hist_field *hist_field;
+
+	hist_field = find_var_field(hist_data, var_name);
+	if (hist_field)
+		return hist_field;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			test_data = test->private_data;
+			hist_field = find_var_field(test_data, var_name);
+			if (hist_field)
+				return hist_field;
+		}
+	}
+
+	return NULL;
+}
+
+static struct trace_event_file *find_var_file(struct trace_array *tr,
+					      char *system,
+					      char *event_name,
+					      char *var_name)
+{
+	struct hist_trigger_data *var_hist_data;
+	struct hist_var_data *var_data;
+	struct trace_event_file *file, *found = NULL;
+
+	if (system)
+		return find_event_file(tr, system, event_name);
+
+	list_for_each_entry(var_data, &tr->hist_vars, list) {
+		var_hist_data = var_data->hist_data;
+		file = var_hist_data->event_file;
+		if (file == found)
+			continue;
+
+		if (find_var_field(var_hist_data, var_name)) {
+			if (found) {
+				hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+				return NULL;
+			}
+
+			found = file;
+		}
+	}
+
+	return found;
+}
+
+static struct hist_field *find_file_var(struct trace_event_file *file,
+					const char *var_name)
+{
+	struct hist_trigger_data *test_data;
+	struct event_trigger_data *test;
+	struct hist_field *hist_field;
+
+	list_for_each_entry_rcu(test, &file->triggers, list) {
+		if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+			test_data = test->private_data;
+			hist_field = find_var_field(test_data, var_name);
+			if (hist_field)
+				return hist_field;
+		}
+	}
+
+	return NULL;
+}
+
+static struct hist_field *
+find_match_var(struct hist_trigger_data *hist_data, char *var_name)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *hist_field, *found = NULL;
+	struct trace_event_file *file;
+	unsigned int i;
+
+	for (i = 0; i < hist_data->n_actions; i++) {
+		struct action_data *data = hist_data->actions[i];
+
+		if (data->fn == action_trace) {
+			char *system = data->onmatch.match_event_system;
+			char *event_name = data->onmatch.match_event;
+
+			file = find_var_file(tr, system, event_name, var_name);
+			if (!file)
+				continue;
+			hist_field = find_file_var(file, var_name);
+			if (hist_field) {
+				if (found) {
+					hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name);
+					return ERR_PTR(-EINVAL);
+				}
+
+				found = hist_field;
+			}
+		}
+	}
+	return found;
+}
+
+static struct hist_field *find_event_var(struct hist_trigger_data *hist_data,
+					 char *system,
+					 char *event_name,
+					 char *var_name)
+{
+	struct trace_array *tr = hist_data->event_file->tr;
+	struct hist_field *hist_field = NULL;
+	struct trace_event_file *file;
+
+	if (!system || !event_name) {
+		hist_field = find_match_var(hist_data, var_name);
+		if (IS_ERR(hist_field))
+			return NULL;
+		if (hist_field)
+			return hist_field;
+	}
+
+	file = find_var_file(tr, system, event_name, var_name);
+	if (!file)
+		return NULL;
+
+	hist_field = find_file_var(file, var_name);
+
+	return hist_field;
+}
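[Editor's note: find_event_var() is the entry point for resolving a '$var' reference: given an explicit system and event it looks only there; otherwise it first tries the events named in this trigger's onmatch() actions (find_match_var()) and then every trigger that defines variables (find_var_file()). If two events define the same variable name, the unqualified form is ambiguous and the hist_err_event() messages above demand the qualified spelling, which parse_atom() further down splits as system.event.$var. For example (illustrative names):

	lat=common_timestamp.usecs-$ts0				# ambiguous if two events define ts0
	lat=common_timestamp.usecs-sched.sched_wakeup.$ts0	# fully qualified, unambiguous]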
+
+struct hist_elt_data {
+	char *comm;
+	u64 *var_ref_vals;
+	char *field_var_str[SYNTH_FIELDS_MAX];
+};
+
+static u64 hist_field_var_ref(struct hist_field *hist_field,
+			      struct tracing_map_elt *elt,
+			      struct ring_buffer_event *rbe,
+			      void *event)
+{
+	struct hist_elt_data *elt_data;
+	u64 var_val = 0;
+
+	elt_data = elt->private_data;
+	var_val = elt_data->var_ref_vals[hist_field->var_ref_idx];
+
+	return var_val;
+}
+
+static bool resolve_var_refs(struct hist_trigger_data *hist_data, void *key,
+			     u64 *var_ref_vals, bool self)
+{
+	struct hist_trigger_data *var_data;
+	struct tracing_map_elt *var_elt;
+	struct hist_field *hist_field;
+	unsigned int i, var_idx;
+	bool resolved = true;
+	u64 var_val = 0;
+
+	for (i = 0; i < hist_data->n_var_refs; i++) {
+		hist_field = hist_data->var_refs[i];
+		var_idx = hist_field->var.idx;
+		var_data = hist_field->var.hist_data;
+
+		if (var_data == NULL) {
+			resolved = false;
+			break;
+		}
+
+		if ((self && var_data != hist_data) ||
+		    (!self && var_data == hist_data))
+			continue;
+
+		var_elt = tracing_map_lookup(var_data->map, key);
+		if (!var_elt) {
+			resolved = false;
+			break;
+		}
+
+		if (!tracing_map_var_set(var_elt, var_idx)) {
+			resolved = false;
+			break;
+		}
+
+		if (self || !hist_field->read_once)
+			var_val = tracing_map_read_var(var_elt, var_idx);
+		else
+			var_val = tracing_map_read_var_once(var_elt, var_idx);
+
+		var_ref_vals[i] = var_val;
+	}
+
+	return resolved;
+}
+
+static const char *hist_field_name(struct hist_field *field,
+				   unsigned int level)
+{
+	const char *field_name = "";
+
+	if (level > 1)
+		return field_name;
+
+	if (field->field)
+		field_name = field->field->name;
+	else if (field->flags & HIST_FIELD_FL_LOG2 ||
+		 field->flags & HIST_FIELD_FL_ALIAS)
+		field_name = hist_field_name(field->operands[0], ++level);
+	else if (field->flags & HIST_FIELD_FL_CPU)
+		field_name = "cpu";
+	else if (field->flags & HIST_FIELD_FL_EXPR ||
+		 field->flags & HIST_FIELD_FL_VAR_REF) {
+		if (field->system) {
+			static char full_name[MAX_FILTER_STR_VAL];
+
+			/* reset the static buffer first; starting with
+			 * strcat() would keep appending across calls */
+			strcpy(full_name, field->system);
+			strcat(full_name, ".");
+			strcat(full_name, field->event_name);
+			strcat(full_name, ".");
+			strcat(full_name, field->name);
+			field_name = full_name;
+		} else
+			field_name = field->name;
+	} else if (field->flags & HIST_FIELD_FL_TIMESTAMP)
+		field_name = "common_timestamp";
+
+	if (field_name == NULL)
+		field_name = "";
+
+	return field_name;
+}
+
+static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
+{
+	hist_field_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (field_is_signed)
+			fn = hist_field_s64;
+		else
+			fn = hist_field_u64;
+		break;
+	case 4:
+		if (field_is_signed)
+			fn = hist_field_s32;
+		else
+			fn = hist_field_u32;
+		break;
+	case 2:
+		if (field_is_signed)
+			fn = hist_field_s16;
+		else
+			fn = hist_field_u16;
+		break;
+	case 1:
+		if (field_is_signed)
+			fn = hist_field_s8;
+		else
+			fn = hist_field_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int parse_map_size(char *str)
+{
+	unsigned long size, map_bits;
+	int ret;
+
+	strsep(&str, "=");
+	if (!str) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = kstrtoul(str, 0, &size);
+	if (ret)
+		goto out;
+
+	map_bits = ilog2(roundup_pow_of_two(size));
+	if (map_bits < TRACING_MAP_BITS_MIN ||
+	    map_bits > TRACING_MAP_BITS_MAX)
+		ret = -EINVAL;
+	else
+		ret = map_bits;
+ out:
+	return ret;
+}
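[Editor's note: parse_map_size() converts the 'size=' value to hash-table bits: the requested entry count is rounded up to a power of two and ilog2() of the result is range-checked. Assuming the usual TRACING_MAP_BITS_MIN/MAX values of 7 and 17 (i.e. 128 to 131072 entries):

	size=2048	-> 11 bits (2048 entries)
	size=3000	-> rounded up to 4096 -> 12 bits
	size=1		-> 0 bits, below the minimum -> -EINVAL]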
+
+static void destroy_hist_trigger_attrs(struct hist_trigger_attrs *attrs)
+{
+	unsigned int i;
+
+	if (!attrs)
+		return;
+
+	for (i = 0; i < attrs->n_assignments; i++)
+		kfree(attrs->assignment_str[i]);
+
+	for (i = 0; i < attrs->n_actions; i++)
+		kfree(attrs->action_str[i]);
+
+	kfree(attrs->name);
+	kfree(attrs->sort_key_str);
+	kfree(attrs->keys_str);
+	kfree(attrs->vals_str);
+	kfree(attrs->clock);
+	kfree(attrs);
+}
+
+static int parse_action(char *str, struct hist_trigger_attrs *attrs)
+{
+	int ret = -EINVAL;
+
+	if (attrs->n_actions >= HIST_ACTIONS_MAX)
+		return ret;
+
+	if ((strncmp(str, "onmatch(", strlen("onmatch(")) == 0) ||
+	    (strncmp(str, "onmax(", strlen("onmax(")) == 0)) {
+		attrs->action_str[attrs->n_actions] = kstrdup(str, GFP_KERNEL);
+		if (!attrs->action_str[attrs->n_actions]) {
+			ret = -ENOMEM;
+			return ret;
+		}
+		attrs->n_actions++;
+		ret = 0;
+	}
+
+	return ret;
+}
+
+static int parse_assignment(char *str, struct hist_trigger_attrs *attrs)
+{
+	int ret = 0;
+
+	if ((strncmp(str, "key=", strlen("key=")) == 0) ||
+	    (strncmp(str, "keys=", strlen("keys=")) == 0)) {
+		attrs->keys_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->keys_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if ((strncmp(str, "val=", strlen("val=")) == 0) ||
+		   (strncmp(str, "vals=", strlen("vals=")) == 0) ||
+		   (strncmp(str, "values=", strlen("values=")) == 0)) {
+		attrs->vals_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->vals_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "sort=", strlen("sort=")) == 0) {
+		attrs->sort_key_str = kstrdup(str, GFP_KERNEL);
+		if (!attrs->sort_key_str) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "name=", strlen("name=")) == 0) {
+		attrs->name = kstrdup(str, GFP_KERNEL);
+		if (!attrs->name) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "clock=", strlen("clock=")) == 0) {
+		strsep(&str, "=");
+		if (!str) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		str = strstrip(str);
+		attrs->clock = kstrdup(str, GFP_KERNEL);
+		if (!attrs->clock) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	} else if (strncmp(str, "size=", strlen("size=")) == 0) {
+		int map_bits = parse_map_size(str);
+
+		if (map_bits < 0) {
+			ret = map_bits;
+			goto out;
+		}
+		attrs->map_bits = map_bits;
+	} else {
+		char *assignment;
+
+		if (attrs->n_assignments == TRACING_MAP_VARS_MAX) {
+			hist_err("Too many variables defined: ", str);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		assignment = kstrdup(str, GFP_KERNEL);
+		if (!assignment) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		attrs->assignment_str[attrs->n_assignments++] = assignment;
+	}
+ out:
+	return ret;
+}
+
+static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str)
+{
+	struct hist_trigger_attrs *attrs;
+	int ret = 0;
+
+	attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		return ERR_PTR(-ENOMEM);
+
+	while (trigger_str) {
+		char *str = strsep(&trigger_str, ":");
+
+		if (strchr(str, '=')) {
+			ret = parse_assignment(str, attrs);
+			if (ret)
+				goto free;
+		} else if (strcmp(str, "pause") == 0)
+			attrs->pause = true;
+		else if ((strcmp(str, "cont") == 0) ||
+			 (strcmp(str, "continue") == 0))
+			attrs->cont = true;
+		else if (strcmp(str, "clear") == 0)
+			attrs->clear = true;
+		else {
+			ret = parse_action(str, attrs);
+			if (ret)
+				goto free;
+		}
+	}
+
+	if (!attrs->keys_str) {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	if (!attrs->clock) {
+		attrs->clock = kstrdup("global", GFP_KERNEL);
+		if (!attrs->clock) {
+			ret = -ENOMEM;
+			goto free;
+		}
+	}
+
+	return attrs;
+ free:
+	destroy_hist_trigger_attrs(attrs);
+
+	return ERR_PTR(ret);
+}
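[Editor's note: parse_hist_trigger_attrs() walks the colon-separated trigger string: tokens containing '=' go to parse_assignment(), the bare keywords pause/cont/continue/clear set flags, and anything else must parse as an onmatch()/onmax() action. keys= is mandatory, and the clock defaults to "global". One command exercising most of the branches (illustrative; the trailing 'if' filter is peeled off by the trigger core before this parser sees the string):

	# echo 'hist:keys=pid:vals=runtime:sort=runtime:size=4096:name=foo:clock=global:pause if cpu==0' \
		>> events/sched/sched_stat_runtime/trigger]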
+
+static inline void save_comm(char *comm, struct task_struct *task)
+{
+	if (!task->pid) {
+		strcpy(comm, "<idle>");
+		return;
+	}
+
+	if (WARN_ON_ONCE(task->pid < 0)) {
+		strcpy(comm, "<XXX>");
+		return;
+	}
+
+	memcpy(comm, task->comm, TASK_COMM_LEN);
+}
+
+static void hist_elt_data_free(struct hist_elt_data *elt_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < SYNTH_FIELDS_MAX; i++)
+		kfree(elt_data->field_var_str[i]);
+
+	kfree(elt_data->comm);
+	kfree(elt_data);
+}
+
+static void hist_trigger_elt_data_free(struct tracing_map_elt *elt)
+{
+	struct hist_elt_data *elt_data = elt->private_data;
+
+	hist_elt_data_free(elt_data);
+}
+
+static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
+{
+	struct hist_trigger_data *hist_data = elt->map->private_data;
+	unsigned int size = TASK_COMM_LEN;
+	struct hist_elt_data *elt_data;
+	struct hist_field *key_field;
+	unsigned int i, n_str;
+
+	elt_data = kzalloc(sizeof(*elt_data), GFP_KERNEL);
+	if (!elt_data)
+		return -ENOMEM;
+
+	for_each_hist_key_field(i, hist_data) {
+		key_field = hist_data->fields[i];
+
+		if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
+			elt_data->comm = kzalloc(size, GFP_KERNEL);
+			if (!elt_data->comm) {
+				kfree(elt_data);
+				return -ENOMEM;
+			}
+			break;
+		}
+	}
+
+	n_str = hist_data->n_field_var_str + hist_data->n_max_var_str;
+
+	size = STR_VAR_LEN_MAX;
+
+	for (i = 0; i < n_str; i++) {
+		elt_data->field_var_str[i] = kzalloc(size, GFP_KERNEL);
+		if (!elt_data->field_var_str[i]) {
+			hist_elt_data_free(elt_data);
+			return -ENOMEM;
+		}
+	}
+
+	elt->private_data = elt_data;
+
+	return 0;
+}
+
+static void hist_trigger_elt_data_init(struct tracing_map_elt *elt)
+{
+	struct hist_elt_data *elt_data = elt->private_data;
+
+	if (elt_data->comm)
+		save_comm(elt_data->comm, current);
+}
+
+static const struct tracing_map_ops hist_trigger_elt_data_ops = {
+	.elt_alloc = hist_trigger_elt_data_alloc,
+	.elt_free = hist_trigger_elt_data_free,
+	.elt_init = hist_trigger_elt_data_init,
+};
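[Editor's note: these element ops tie per-entry allocations to the tracing map: elt_alloc reserves a comm buffer only when some key carries the '.execname' modifier, plus one STR_VAR_LEN_MAX buffer per field/max variable string, and elt_init snapshots current->comm through save_comm() when an entry is first claimed. That is what lets a key such as the following resolve to a process name (illustrative event):

	# echo 'hist:keys=common_pid.execname' >> events/syscalls/sys_enter_read/trigger]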
+
+static const char *get_hist_field_flags(struct hist_field *hist_field)
+{
+	const char *flags_str = NULL;
+
+	if (hist_field->flags & HIST_FIELD_FL_HEX)
+		flags_str = "hex";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM)
+		flags_str = "sym";
+	else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET)
+		flags_str = "sym-offset";
+	else if (hist_field->flags & HIST_FIELD_FL_EXECNAME)
+		flags_str = "execname";
+	else if (hist_field->flags & HIST_FIELD_FL_SYSCALL)
+		flags_str = "syscall";
+	else if (hist_field->flags & HIST_FIELD_FL_LOG2)
+		flags_str = "log2";
+	else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+		flags_str = "usecs";
+
+	return flags_str;
+}
+
+static void expr_field_str(struct hist_field *field, char *expr)
+{
+	if (field->flags & HIST_FIELD_FL_VAR_REF)
+		strcat(expr, "$");
+
+	strcat(expr, hist_field_name(field, 0));
+
+	if (field->flags && !(field->flags & HIST_FIELD_FL_VAR_REF)) {
+		const char *flags_str = get_hist_field_flags(field);
+
+		if (flags_str) {
+			strcat(expr, ".");
+			strcat(expr, flags_str);
+		}
+	}
+}
+
+static char *expr_str(struct hist_field *field, unsigned int level)
+{
+	char *expr;
+
+	if (level > 1)
+		return NULL;
+
+	expr = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL);
+	if (!expr)
+		return NULL;
+
+	if (!field->operands[0]) {
+		expr_field_str(field, expr);
+		return expr;
+	}
+
+	if (field->operator == FIELD_OP_UNARY_MINUS) {
+		char *subexpr;
+
+		strcat(expr, "-(");
+		subexpr = expr_str(field->operands[0], ++level);
+		if (!subexpr) {
+			kfree(expr);
+			return NULL;
+		}
+		strcat(expr, subexpr);
+		strcat(expr, ")");
+
+		kfree(subexpr);
+
+		return expr;
+	}
+
+	expr_field_str(field->operands[0], expr);
+
+	switch (field->operator) {
+	case FIELD_OP_MINUS:
+		strcat(expr, "-");
+		break;
+	case FIELD_OP_PLUS:
+		strcat(expr, "+");
+		break;
+	default:
+		kfree(expr);
+		return NULL;
+	}
+
+	expr_field_str(field->operands[1], expr);
+
+	return expr;
+}
+
+static int contains_operator(char *str)
+{
+	enum field_op_id field_op = FIELD_OP_NONE;
+	char *op;
+
+	op = strpbrk(str, "+-");
+	if (!op)
+		return FIELD_OP_NONE;
+
+	switch (*op) {
+	case '-':
+		if (*str == '-')
+			field_op = FIELD_OP_UNARY_MINUS;
+		else
+			field_op = FIELD_OP_MINUS;
+		break;
+	case '+':
+		field_op = FIELD_OP_PLUS;
+		break;
+	default:
+		break;
+	}
+
+	return field_op;
+}
+
+static void destroy_hist_field(struct hist_field *hist_field,
+			       unsigned int level)
+{
+	unsigned int i;
+
+	if (level > 3)
+		return;
+
+	if (!hist_field)
+		return;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+		destroy_hist_field(hist_field->operands[i], level + 1);
+
+	kfree(hist_field->var.name);
+	kfree(hist_field->name);
+	kfree(hist_field->type);
+
+	kfree(hist_field);
+}
+
+static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
+					    struct ftrace_event_field *field,
+					    unsigned long flags,
+					    char *var_name)
+{
+	struct hist_field *hist_field;
+
+	if (field && is_function_field(field))
+		return NULL;
+
+	hist_field = kzalloc(sizeof(struct hist_field), GFP_KERNEL);
+	if (!hist_field)
+		return NULL;
+
+	hist_field->hist_data = hist_data;
+
+	if (flags & HIST_FIELD_FL_EXPR || flags & HIST_FIELD_FL_ALIAS)
+		goto out; /* caller will populate */
+
+	if (flags & HIST_FIELD_FL_VAR_REF) {
+		hist_field->fn = hist_field_var_ref;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_HITCOUNT) {
+		hist_field->fn = hist_field_counter;
+		hist_field->size = sizeof(u64);
+		hist_field->type = kstrdup("u64", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_STACKTRACE) {
+		hist_field->fn = hist_field_none;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_LOG2) {
+		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
+		hist_field->fn = hist_field_log2;
+		hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL);
+		if (!hist_field->operands[0])
+			goto free; /* don't dereference a failed sub-allocation */
+		hist_field->size = hist_field->operands[0]->size;
+		hist_field->type = kstrdup(hist_field->operands[0]->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_TIMESTAMP) {
+		hist_field->fn = hist_field_timestamp;
+		hist_field->size = sizeof(u64);
+		hist_field->type = kstrdup("u64", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (flags & HIST_FIELD_FL_CPU) {
+		hist_field->fn = hist_field_cpu;
+		hist_field->size = sizeof(int);
+		hist_field->type = kstrdup("unsigned int", GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+		goto out;
+	}
+
+	if (WARN_ON_ONCE(!field))
+		goto out;
+
+	if (is_string_field(field)) {
+		flags |= HIST_FIELD_FL_STRING;
+
+		hist_field->size = MAX_FILTER_STR_VAL;
+		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+
+		if (field->filter_type == FILTER_STATIC_STRING)
+			hist_field->fn = hist_field_string;
+		else if (field->filter_type == FILTER_DYN_STRING)
+			hist_field->fn = hist_field_dynstring;
+		else
+			hist_field->fn = hist_field_pstring;
+	} else {
+		hist_field->size = field->size;
+		hist_field->is_signed = field->is_signed;
+		hist_field->type = kstrdup(field->type, GFP_KERNEL);
+		if (!hist_field->type)
+			goto free;
+
+		hist_field->fn = select_value_fn(field->size,
+						 field->is_signed);
+		if (!hist_field->fn) {
+			destroy_hist_field(hist_field, 0);
+			return NULL;
+		}
+	}
+ out:
+	hist_field->field = field;
+	hist_field->flags = flags;
+
+	if (var_name) {
+		hist_field->var.name = kstrdup(var_name, GFP_KERNEL);
+		if (!hist_field->var.name)
+			goto free;
+	}
+
+	return hist_field;
+ free:
+	destroy_hist_field(hist_field, 0);
+	return NULL;
+}
+
+static void destroy_hist_fields(struct hist_trigger_data *hist_data)
+{
+	unsigned int i;
+
+	for (i = 0; i < HIST_FIELDS_MAX; i++) {
+		if (hist_data->fields[i]) {
+			destroy_hist_field(hist_data->fields[i], 0);
+			hist_data->fields[i] = NULL;
+		}
+	}
+}
+
+static int init_var_ref(struct hist_field *ref_field,
+			struct hist_field *var_field,
+			char *system, char *event_name)
+{
+	int err = 0;
+
+	ref_field->var.idx = var_field->var.idx;
+	ref_field->var.hist_data = var_field->hist_data;
+	ref_field->size = var_field->size;
+	ref_field->is_signed = var_field->is_signed;
+	ref_field->flags |= var_field->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+	if (system) {
+		ref_field->system = kstrdup(system, GFP_KERNEL);
+		if (!ref_field->system)
+			return -ENOMEM;
+	}
+
+	if (event_name) {
+		ref_field->event_name = kstrdup(event_name, GFP_KERNEL);
+		if (!ref_field->event_name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	}
+
+	if (var_field->var.name) {
+		ref_field->name = kstrdup(var_field->var.name, GFP_KERNEL);
+		if (!ref_field->name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	} else if (var_field->name) {
+		ref_field->name = kstrdup(var_field->name, GFP_KERNEL);
+		if (!ref_field->name) {
+			err = -ENOMEM;
+			goto free;
+		}
+	}
+
+	ref_field->type = kstrdup(var_field->type, GFP_KERNEL);
+	if (!ref_field->type) {
+		err = -ENOMEM;
+		goto free;
+	}
+ out:
+	return err;
+ free:
+	kfree(ref_field->system);
+	kfree(ref_field->event_name);
+	kfree(ref_field->name);
+
+	goto out;
+}
+
+static struct hist_field *create_var_ref(struct hist_field *var_field,
+					 char *system, char *event_name)
+{
+	unsigned long flags = HIST_FIELD_FL_VAR_REF;
+	struct hist_field *ref_field;
+
+	ref_field = create_hist_field(var_field->hist_data, NULL, flags, NULL);
+	if (ref_field) {
+		if (init_var_ref(ref_field, var_field, system, event_name)) {
+			destroy_hist_field(ref_field, 0);
+			return NULL;
+		}
+	}
+
+	return ref_field;
+}
+
+static bool is_var_ref(char *var_name)
+{
+	if (!var_name || strlen(var_name) < 2 || var_name[0] != '$')
+		return false;
+
+	return true;
+}
+
+static char *field_name_from_var(struct hist_trigger_data *hist_data,
+				 char *var_name)
+{
+	char *name, *field;
+	unsigned int i;
+
+	for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) {
+		name = hist_data->attrs->var_defs.name[i];
+
+		if (strcmp(var_name, name) == 0) {
+			field = hist_data->attrs->var_defs.expr[i];
+			if (contains_operator(field) || is_var_ref(field))
+				continue;
+			return field;
+		}
+	}
+
+	return NULL;
+}
+
+static char *local_field_var_ref(struct hist_trigger_data *hist_data,
+				 char *system, char *event_name,
+				 char *var_name)
+{
+	struct trace_event_call *call;
+
+	if (system && event_name) {
+		call = hist_data->event_file->event_call;
+
+		if (strcmp(system, call->class->system) != 0)
+			return NULL;
+
+		if (strcmp(event_name, trace_event_name(call)) != 0)
+			return NULL;
+	}
+
+	if (!!system != !!event_name)
+		return NULL;
+
+	if (!is_var_ref(var_name))
+		return NULL;
+
+	var_name++;
+
+	return field_name_from_var(hist_data, var_name);
+}
+
+static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data,
+					char *system, char *event_name,
+					char *var_name)
+{
+	struct hist_field *var_field = NULL, *ref_field = NULL;
+
+	if (!is_var_ref(var_name))
+		return NULL;
+
+	var_name++;
+
+	var_field = find_event_var(hist_data, system, event_name, var_name);
+	if (var_field)
+		ref_field = create_var_ref(var_field, system, event_name);
+
+	if (!ref_field)
+		hist_err_event("Couldn't find variable: $",
+			       system, event_name, var_name);
+
+	return ref_field;
+}
+
+static struct ftrace_event_field *
+parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
+	    char *field_str, unsigned long *flags)
+{
+	struct ftrace_event_field *field = NULL;
+	char *field_name, *modifier, *str;
+
+	modifier = str = kstrdup(field_str, GFP_KERNEL);
+	if (!modifier)
+		return ERR_PTR(-ENOMEM);
+
+	field_name = strsep(&modifier, ".");
+	if (modifier) {
+		if (strcmp(modifier, "hex") == 0)
+			*flags |= HIST_FIELD_FL_HEX;
+		else if (strcmp(modifier, "sym") == 0)
+			*flags |= HIST_FIELD_FL_SYM;
+		else if (strcmp(modifier, "sym-offset") == 0)
+			*flags |= HIST_FIELD_FL_SYM_OFFSET;
+		else if ((strcmp(modifier, "execname") == 0) &&
+			 (strcmp(field_name, "common_pid") == 0))
+			*flags |= HIST_FIELD_FL_EXECNAME;
+		else if (strcmp(modifier, "syscall") == 0)
+			*flags |= HIST_FIELD_FL_SYSCALL;
+		else if (strcmp(modifier, "log2") == 0)
+			*flags |= HIST_FIELD_FL_LOG2;
+		else if (strcmp(modifier, "usecs") == 0)
+			*flags |= HIST_FIELD_FL_TIMESTAMP_USECS;
+		else {
+			hist_err("Invalid field modifier: ", modifier);
+			field = ERR_PTR(-EINVAL);
+			goto out;
+		}
+	}
+
+	if (strcmp(field_name, "common_timestamp") == 0) {
+		*flags |= HIST_FIELD_FL_TIMESTAMP;
+		hist_data->enable_timestamps = true;
+		if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+			hist_data->attrs->ts_in_usecs = true;
+	} else if (strcmp(field_name, "cpu") == 0)
+		*flags |= HIST_FIELD_FL_CPU;
+	else {
+		field = trace_find_event_field(file->event_call, field_name);
+		if (!field || !field->size) {
+			hist_err("Couldn't find field: ", field_name);
+			field = ERR_PTR(-EINVAL);
+			goto out;
+		}
+	}
+ out:
+	kfree(str);
+
+	return field;
+}
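[Editor's note: parse_field() strips at most one '.modifier' suffix and turns it into HIST_FIELD_FL_* flags before the field lookup; 'common_timestamp' and 'cpu' bypass the lookup since no event defines them. Typical uses of the modifiers handled above (field choices are illustrative):

	keys=call_site.sym-offset	# resolve an address to symbol+offset
	keys=id.syscall			# display a syscall id by name
	keys=bytes_req.log2		# bucket a size by power of two
	ts0=common_timestamp.usecs	# timestamp in microseconds]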
+
+static struct hist_field *create_alias(struct hist_trigger_data *hist_data,
+				       struct hist_field *var_ref,
+				       char *var_name)
+{
+	struct hist_field *alias = NULL;
+	unsigned long flags = HIST_FIELD_FL_ALIAS | HIST_FIELD_FL_VAR;
+
+	alias = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!alias)
+		return NULL;
+
+	alias->fn = var_ref->fn;
+	alias->operands[0] = var_ref;
+
+	if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) {
+		destroy_hist_field(alias, 0);
+		return NULL;
+	}
+
+	return alias;
+}
+
+static struct hist_field *parse_atom(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file, char *str,
+				     unsigned long *flags, char *var_name)
+{
+	char *s, *ref_system = NULL, *ref_event = NULL, *ref_var = str;
+	struct ftrace_event_field *field = NULL;
+	struct hist_field *hist_field = NULL;
+	int ret = 0;
+
+	s = strchr(str, '.');
+	if (s) {
+		s = strchr(++s, '.');
+		if (s) {
+			ref_system = strsep(&str, ".");
+			if (!str) {
+				ret = -EINVAL;
+				goto out;
+			}
+			ref_event = strsep(&str, ".");
+			if (!str) {
+				ret = -EINVAL;
+				goto out;
+			}
+			ref_var = str;
+		}
+	}
+
+	s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var);
+	if (!s) {
+		hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var);
+		if (hist_field) {
+			hist_data->var_refs[hist_data->n_var_refs] = hist_field;
+			hist_field->var_ref_idx = hist_data->n_var_refs++;
+			if (var_name) {
+				hist_field = create_alias(hist_data, hist_field, var_name);
+				if (!hist_field) {
+					ret = -ENOMEM;
+					goto out;
+				}
+			}
+			return hist_field;
+		}
+	} else
+		str = s;
+
+	field = parse_field(hist_data, file, str, flags);
+	if (IS_ERR(field)) {
+		ret = PTR_ERR(field);
+		goto out;
+	}
+
+	hist_field = create_hist_field(hist_data, field, *flags, var_name);
+	if (!hist_field) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	return hist_field;
+ out:
+	return ERR_PTR(ret);
+}
+
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file,
+				     char *str, unsigned long flags,
+				     char *var_name, unsigned int level);
+
+static struct hist_field *parse_unary(struct hist_trigger_data *hist_data,
+				      struct trace_event_file *file,
+				      char *str, unsigned long flags,
+				      char *var_name, unsigned int level)
+{
+	struct hist_field *operand1, *expr = NULL;
+	unsigned long operand_flags;
+	int ret = 0;
+	char *s;
+
+	/* we support only -(xxx) i.e. explicit parens required */
+
+	if (level > 3) {
+		hist_err("Too many subexpressions (3 max): ", str);
+		ret = -EINVAL;
+		goto free;
+	}
+
+	str++; /* skip leading '-' */
+
+	s = strchr(str, '(');
+	if (s)
+		str++;
+	else {
+		ret = -EINVAL;
+		goto free;
+	}
+
+	s = strrchr(str, ')');
+	if (s)
+		*s = '\0';
+	else {
+		ret = -EINVAL; /* no closing ')' */
+		goto free;
+	}
+
+	flags |= HIST_FIELD_FL_EXPR;
+	expr = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!expr) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	operand_flags = 0;
+	operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+	if (IS_ERR(operand1)) {
+		ret = PTR_ERR(operand1);
+		goto free;
+	}
+
+	expr->flags |= operand1->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+	expr->fn = hist_field_unary_minus;
+	expr->operands[0] = operand1;
+	expr->operator = FIELD_OP_UNARY_MINUS;
+	expr->name = expr_str(expr, 0);
+	expr->type = kstrdup(operand1->type, GFP_KERNEL);
+	if (!expr->type) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	return expr;
+ free:
+	destroy_hist_field(expr, 0);
+	return ERR_PTR(ret);
+}
+
+static int check_expr_operands(struct hist_field *operand1,
+			       struct hist_field *operand2)
+{
+	unsigned long operand1_flags = operand1->flags;
+	unsigned long operand2_flags = operand2->flags;
+
+	if ((operand1_flags & HIST_FIELD_FL_VAR_REF) ||
+	    (operand1_flags & HIST_FIELD_FL_ALIAS)) {
+		struct hist_field *var;
+
+		var = find_var_field(operand1->var.hist_data, operand1->name);
+		if (!var)
+			return -EINVAL;
+		operand1_flags = var->flags;
+	}
+
+	if ((operand2_flags & HIST_FIELD_FL_VAR_REF) ||
+	    (operand2_flags & HIST_FIELD_FL_ALIAS)) {
+		struct hist_field *var;
+
+		var = find_var_field(operand2->var.hist_data, operand2->name);
+		if (!var)
+			return -EINVAL;
+		operand2_flags = var->flags;
+	}
+
+	if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) !=
+	    (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) {
+		hist_err("Timestamp units in expression don't match", NULL);
+		return -EINVAL;
+	}
+
+	return 0;
+}
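[Editor's note: check_expr_operands() enforces a single rule: both sides of a binary '+'/'-' must agree on timestamp units, after chasing variable references and aliases back to the defining field. So, for example (illustrative):

	ts0=common_timestamp.usecs ... lat=common_timestamp.usecs-$ts0	# OK, both in usecs
	ts0=common_timestamp       ... lat=common_timestamp.usecs-$ts0	# rejected: units don't match]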
+
+static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
+				     struct trace_event_file *file,
+				     char *str, unsigned long flags,
+				     char *var_name, unsigned int level)
+{
+	struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL;
+	unsigned long operand_flags;
+	int field_op, ret = -EINVAL;
+	char *sep, *operand1_str;
+
+	if (level > 3) {
+		hist_err("Too many subexpressions (3 max): ", str);
+		return ERR_PTR(-EINVAL);
+	}
+
+	field_op = contains_operator(str);
+
+	if (field_op == FIELD_OP_NONE)
+		return parse_atom(hist_data, file, str, &flags, var_name);
+
+	if (field_op == FIELD_OP_UNARY_MINUS)
+		return parse_unary(hist_data, file, str, flags, var_name, ++level);
+
+	switch (field_op) {
+	case FIELD_OP_MINUS:
+		sep = "-";
+		break;
+	case FIELD_OP_PLUS:
+		sep = "+";
+		break;
+	default:
+		goto free;
+	}
+
+	operand1_str = strsep(&str, sep);
+	if (!operand1_str || !str)
+		goto free;
+
+	operand_flags = 0;
+	operand1 = parse_atom(hist_data, file, operand1_str,
+			      &operand_flags, NULL);
+	if (IS_ERR(operand1)) {
+		ret = PTR_ERR(operand1);
+		operand1 = NULL;
+		goto free;
+	}
+
+	/* rest of string could be another expression e.g. b+c in a+b+c */
+	operand_flags = 0;
+	operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level);
+	if (IS_ERR(operand2)) {
+		ret = PTR_ERR(operand2);
+		operand2 = NULL;
+		goto free;
+	}
+
+	ret = check_expr_operands(operand1, operand2);
+	if (ret)
+		goto free;
+
+	flags |= HIST_FIELD_FL_EXPR;
+
+	flags |= operand1->flags &
+		(HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS);
+
+	expr = create_hist_field(hist_data, NULL, flags, var_name);
+	if (!expr) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	operand1->read_once = true;
+	operand2->read_once = true;
+
+	expr->operands[0] = operand1;
+	expr->operands[1] = operand2;
+	/*
+	 * expr now owns both operands; clear the locals so the error
+	 * path below can't free them a second time through
+	 * destroy_hist_field(expr).
+	 */
+	operand1 = NULL;
+	operand2 = NULL;
+	expr->operator = field_op;
+	expr->name = expr_str(expr, 0);
+	expr->type = kstrdup(expr->operands[0]->type, GFP_KERNEL);
+	if (!expr->type) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	switch (field_op) {
+	case FIELD_OP_MINUS:
+		expr->fn = hist_field_minus;
+		break;
+	case FIELD_OP_PLUS:
+		expr->fn = hist_field_plus;
+		break;
+	default:
+		ret = -EINVAL;
+		goto free;
+	}
+
+	return expr;
+ free:
+	destroy_hist_field(operand1, 0);
+	destroy_hist_field(operand2, 0);
+	destroy_hist_field(expr, 0);
+
+	return ERR_PTR(ret);
+}
31143 | + | |
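parse_expr() above splits on the first '+' or '-' with strsep() and recurses on the remainder, so 'a+b+c' is handled as a + (b+c), with nesting capped at three subexpressions. An illustrative value expression, a sketch over two kmem event fields (exact field availability depends on the kernel config):

    echo 'hist:keys=common_pid:values=bytes_req+bytes_alloc' >> \
        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger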
31144 | +static char *find_trigger_filter(struct hist_trigger_data *hist_data, | |
31145 | + struct trace_event_file *file) | |
1a6e0f06 | 31146 | +{ |
e4b2b4a8 JK |
31147 | + struct event_trigger_data *test; |
31148 | + | |
31149 | + list_for_each_entry_rcu(test, &file->triggers, list) { | |
31150 | + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
31151 | + if (test->private_data == hist_data) | |
31152 | + return test->filter_str; | |
31153 | + } | |
31154 | + } | |
31155 | + | |
31156 | + return NULL; | |
1a6e0f06 JK |
31157 | +} |
31158 | + | |
e4b2b4a8 JK |
31159 | +static struct event_command trigger_hist_cmd; |
31160 | +static int event_hist_trigger_func(struct event_command *cmd_ops, | |
31161 | + struct trace_event_file *file, | |
31162 | + char *glob, char *cmd, char *param); | |
31163 | + | |
31164 | +static bool compatible_keys(struct hist_trigger_data *target_hist_data, | |
31165 | + struct hist_trigger_data *hist_data, | |
31166 | + unsigned int n_keys) | |
1a6e0f06 | 31167 | +{ |
e4b2b4a8 JK |
31168 | + struct hist_field *target_hist_field, *hist_field; |
31169 | + unsigned int n, i, j; | |
31170 | + | |
31171 | + if (hist_data->n_fields - hist_data->n_vals != n_keys) | |
31172 | + return false; | |
31173 | + | |
31174 | + i = hist_data->n_vals; | |
31175 | + j = target_hist_data->n_vals; | |
31176 | + | |
31177 | + for (n = 0; n < n_keys; n++) { | |
31178 | + hist_field = hist_data->fields[i + n]; | |
31179 | + target_hist_field = target_hist_data->fields[j + n]; | |
31180 | + | |
31181 | + if (strcmp(hist_field->type, target_hist_field->type) != 0) | |
31182 | + return false; | |
31183 | + if (hist_field->size != target_hist_field->size) | |
31184 | + return false; | |
31185 | + if (hist_field->is_signed != target_hist_field->is_signed) | |
31186 | + return false; | |
31187 | + } | |
31188 | + | |
31189 | + return true; | |
1a6e0f06 JK |
31190 | +} |
31191 | + | |
e4b2b4a8 JK |
31192 | +static struct hist_trigger_data * |
31193 | +find_compatible_hist(struct hist_trigger_data *target_hist_data, | |
31194 | + struct trace_event_file *file) | |
1a6e0f06 | 31195 | +{ |
e4b2b4a8 JK |
31196 | + struct hist_trigger_data *hist_data; |
31197 | + struct event_trigger_data *test; | |
31198 | + unsigned int n_keys; | |
31199 | + | |
31200 | + n_keys = target_hist_data->n_fields - target_hist_data->n_vals; | |
31201 | + | |
31202 | + list_for_each_entry_rcu(test, &file->triggers, list) { | |
31203 | + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
31204 | + hist_data = test->private_data; | |
31205 | + | |
31206 | + if (compatible_keys(target_hist_data, hist_data, n_keys)) | |
31207 | + return hist_data; | |
31208 | + } | |
31209 | + } | |
31210 | + | |
31211 | + return NULL; | |
1a6e0f06 | 31212 | +} |
1a6e0f06 | 31213 | + |
e4b2b4a8 JK |
31214 | +static struct trace_event_file *event_file(struct trace_array *tr, |
31215 | + char *system, char *event_name) | |
31216 | +{ | |
31217 | + struct trace_event_file *file; | |
31218 | + | |
31219 | + file = find_event_file(tr, system, event_name); | |
31220 | + if (!file) | |
31221 | + return ERR_PTR(-EINVAL); | |
31222 | + | |
31223 | + return file; | |
31224 | +} | |
31225 | + | |
31226 | +static struct hist_field * | |
31227 | +find_synthetic_field_var(struct hist_trigger_data *target_hist_data, | |
31228 | + char *system, char *event_name, char *field_name) | |
31229 | +{ | |
31230 | + struct hist_field *event_var; | |
31231 | + char *synthetic_name; | |
31232 | + | |
31233 | + synthetic_name = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); | |
31234 | + if (!synthetic_name) | |
31235 | + return ERR_PTR(-ENOMEM); | |
31236 | + | |
31237 | + strcpy(synthetic_name, "synthetic_"); | |
31238 | + strcat(synthetic_name, field_name); | |
31239 | + | |
31240 | + event_var = find_event_var(target_hist_data, system, event_name, synthetic_name); | |
31241 | + | |
31242 | + kfree(synthetic_name); | |
31243 | + | |
31244 | + return event_var; | |
31245 | +} | |
1a6e0f06 JK |
31246 | + |
31247 | +/** | |
e4b2b4a8 JK |
31248 | + * create_field_var_hist - Automatically create a histogram and var for a field |
31249 | + * @target_hist_data: The target hist trigger | |
31250 | + * @subsys_name: Optional subsystem name | |
31251 | + * @event_name: Optional event name | |
31252 | + * @field_name: The name of the field (and the resulting variable) | |
1a6e0f06 | 31253 | + * |
e4b2b4a8 JK |
31254 | + * Hist trigger actions fetch data from variables, not directly from |
31255 | + * events. However, for convenience, users are allowed to directly | |
31256 | + * specify an event field in an action, which will be automatically | |
31257 | + * converted into a variable on their behalf. | |
31258 | + * | |
31259 | + * If a user specifies a field on an event other than the event the | |
31260 | + * histogram is currently being defined on (the target event), the | |
31261 | + * only way that can be accomplished is if a new hist trigger is | |
31262 | + * created and the field variable defined on that trigger. | |
1a6e0f06 | 31263 | + * |
e4b2b4a8 JK |
31264 | + * This function creates a new histogram compatible with the target |
31265 | + * event (meaning a histogram with the same key as the target | |
31266 | + * histogram), and creates a variable for the specified field, but | |
31267 | + * with 'synthetic_' prepended to the variable name in order to avoid | |
31268 | + * collision with normal field variables. | |
31269 | + * | |
31270 | + * Return: The variable created for the field. | |
1a6e0f06 | 31271 | + */ |
e4b2b4a8 JK |
31272 | +static struct hist_field * |
31273 | +create_field_var_hist(struct hist_trigger_data *target_hist_data, | |
31274 | + char *subsys_name, char *event_name, char *field_name) | |
31275 | +{ | |
31276 | + struct trace_array *tr = target_hist_data->event_file->tr; | |
31277 | + struct hist_field *event_var = ERR_PTR(-EINVAL); | |
31278 | + struct hist_trigger_data *hist_data; | |
31279 | + unsigned int i, n; bool first = true; | |
31280 | + struct field_var_hist *var_hist; | |
31281 | + struct trace_event_file *file; | |
31282 | + struct hist_field *key_field; | |
31283 | + char *saved_filter; | |
31284 | + char *cmd; | |
31285 | + int ret; | |
1a6e0f06 | 31286 | + |
e4b2b4a8 JK |
31287 | + if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { |
31288 | + hist_err_event("onmatch: Too many field variables defined: ", | |
31289 | + subsys_name, event_name, field_name); | |
31290 | + return ERR_PTR(-EINVAL); | |
31291 | + } | |
1a6e0f06 | 31292 | + |
e4b2b4a8 | 31293 | + file = event_file(tr, subsys_name, event_name); |
1a6e0f06 | 31294 | + |
e4b2b4a8 JK |
31295 | + if (IS_ERR(file)) { |
31296 | + hist_err_event("onmatch: Event file not found: ", | |
31297 | + subsys_name, event_name, field_name); | |
31298 | + ret = PTR_ERR(file); | |
31299 | + return ERR_PTR(ret); | |
1a6e0f06 JK |
31300 | + } |
31301 | + | |
e4b2b4a8 JK |
31302 | + /* |
31303 | + * Look for a histogram compatible with the target. We'll use the | |
31304 | + * found histogram specification to create a new matching | |
31305 | + * histogram with our variable on it. target_hist_data is not | |
31306 | + * yet a registered histogram so we can't use that. | |
31307 | + */ | |
31308 | + hist_data = find_compatible_hist(target_hist_data, file); | |
31309 | + if (!hist_data) { | |
31310 | + hist_err_event("onmatch: Matching event histogram not found: ", | |
31311 | + subsys_name, event_name, field_name); | |
31312 | + return ERR_PTR(-EINVAL); | |
1a6e0f06 | 31313 | + } |
1a6e0f06 | 31314 | + |
e4b2b4a8 JK |
31315 | + /* See if a synthetic field variable has already been created */ |
31316 | + event_var = find_synthetic_field_var(target_hist_data, subsys_name, | |
31317 | + event_name, field_name); | |
31318 | + if (!IS_ERR_OR_NULL(event_var)) | |
31319 | + return event_var; | |
1a6e0f06 | 31320 | + |
e4b2b4a8 JK |
31321 | + var_hist = kzalloc(sizeof(*var_hist), GFP_KERNEL); |
31322 | + if (!var_hist) | |
31323 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 31324 | + |
e4b2b4a8 JK |
31325 | + cmd = kzalloc(MAX_FILTER_STR_VAL, GFP_KERNEL); |
31326 | + if (!cmd) { | |
31327 | + kfree(var_hist); | |
31328 | + return ERR_PTR(-ENOMEM); | |
31329 | + } | |
31330 | + | |
31331 | + /* Use the same keys as the compatible histogram */ | |
31332 | + strcat(cmd, "keys="); | |
31333 | + | |
31334 | + for_each_hist_key_field(i, hist_data) { | |
31335 | + key_field = hist_data->fields[i]; | |
31336 | + if (!first) | |
31337 | + strcat(cmd, ","); | |
31338 | + strcat(cmd, key_field->field->name); | |
31339 | + first = false; | |
31340 | + } | |
31341 | + | |
31342 | + /* Create the synthetic field variable specification */ | |
31343 | + strcat(cmd, ":synthetic_"); | |
31344 | + strcat(cmd, field_name); | |
31345 | + strcat(cmd, "="); | |
31346 | + strcat(cmd, field_name); | |
31347 | + | |
31348 | + /* Use the same filter as the compatible histogram */ | |
31349 | + saved_filter = find_trigger_filter(hist_data, file); | |
31350 | + if (saved_filter) { | |
31351 | + strcat(cmd, " if "); | |
31352 | + strcat(cmd, saved_filter); | |
31353 | + } | |
31354 | + | |
31355 | + var_hist->cmd = kstrdup(cmd, GFP_KERNEL); | |
31356 | + if (!var_hist->cmd) { | |
31357 | + kfree(cmd); | |
31358 | + kfree(var_hist); | |
31359 | + return ERR_PTR(-ENOMEM); | |
31360 | + } | |
31361 | + | |
31362 | + /* Save the compatible histogram information */ | |
31363 | + var_hist->hist_data = hist_data; | |
31364 | + | |
31365 | + /* Create the new histogram with our variable */ | |
31366 | + ret = event_hist_trigger_func(&trigger_hist_cmd, file, | |
31367 | + "", "hist", cmd); | |
31368 | + if (ret) { | |
31369 | + kfree(cmd); | |
31370 | + kfree(var_hist->cmd); | |
31371 | + kfree(var_hist); | |
31372 | + hist_err_event("onmatch: Couldn't create histogram for field: ", | |
31373 | + subsys_name, event_name, field_name); | |
31374 | + return ERR_PTR(ret); | |
31375 | + } | |
31376 | + | |
31377 | + kfree(cmd); | |
31378 | + | |
31379 | + /* If we can't find the variable, something went wrong */ | |
31380 | + event_var = find_synthetic_field_var(target_hist_data, subsys_name, | |
31381 | + event_name, field_name); | |
31382 | + if (IS_ERR_OR_NULL(event_var)) { | |
31383 | + kfree(var_hist->cmd); | |
31384 | + kfree(var_hist); | |
31385 | + hist_err_event("onmatch: Couldn't find synthetic variable: ", | |
31386 | + subsys_name, event_name, field_name); | |
31387 | + return ERR_PTR(-EINVAL); | |
1a6e0f06 | 31388 | + } |
e4b2b4a8 JK |
31389 | + |
31390 | + n = target_hist_data->n_field_var_hists; | |
31391 | + target_hist_data->field_var_hists[n] = var_hist; | |
31392 | + target_hist_data->n_field_var_hists++; | |
31393 | + | |
31394 | + return event_var; | |
1a6e0f06 JK |
31395 | +} |
31396 | + | |
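For a concrete (hypothetical) case: if the target histogram is keyed on 'pid', the referenced field is 'prio', and the compatible histogram carries the filter 'prio < 100', the command assembled above is

    keys=pid:synthetic_prio=prio if prio < 100

which is then registered through event_hist_trigger_func() exactly as if a user had attached it to the event by hand.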
e4b2b4a8 JK |
31397 | +static struct hist_field * |
31398 | +find_target_event_var(struct hist_trigger_data *hist_data, | |
31399 | + char *subsys_name, char *event_name, char *var_name) | |
1a6e0f06 | 31400 | +{ |
e4b2b4a8 JK |
31401 | + struct trace_event_file *file = hist_data->event_file; |
31402 | + struct hist_field *hist_field = NULL; | |
1a6e0f06 | 31403 | + |
e4b2b4a8 JK |
31404 | + if (subsys_name) { |
31405 | + struct trace_event_call *call; | |
1a6e0f06 | 31406 | + |
e4b2b4a8 JK |
31407 | + if (!event_name) |
31408 | + return NULL; | |
1a6e0f06 | 31409 | + |
e4b2b4a8 | 31410 | + call = file->event_call; |
1a6e0f06 | 31411 | + |
e4b2b4a8 JK |
31412 | + if (strcmp(subsys_name, call->class->system) != 0) |
31413 | + return NULL; | |
1a6e0f06 | 31414 | + |
e4b2b4a8 JK |
31415 | + if (strcmp(event_name, trace_event_name(call)) != 0) |
31416 | + return NULL; | |
31417 | + } | |
31418 | + | |
31419 | + hist_field = find_var_field(hist_data, var_name); | |
31420 | + | |
31421 | + return hist_field; | |
31422 | +} | |
31423 | + | |
31424 | +static inline void __update_field_vars(struct tracing_map_elt *elt, | |
31425 | + struct ring_buffer_event *rbe, | |
31426 | + void *rec, | |
31427 | + struct field_var **field_vars, | |
31428 | + unsigned int n_field_vars, | |
31429 | + unsigned int field_var_str_start) | |
31430 | +{ | |
31431 | + struct hist_elt_data *elt_data = elt->private_data; | |
31432 | + unsigned int i, j, var_idx; | |
31433 | + u64 var_val; | |
1a6e0f06 | 31434 | + |
e4b2b4a8 JK |
31435 | + for (i = 0, j = field_var_str_start; i < n_field_vars; i++) { |
31436 | + struct field_var *field_var = field_vars[i]; | |
31437 | + struct hist_field *var = field_var->var; | |
31438 | + struct hist_field *val = field_var->val; | |
1a6e0f06 | 31439 | + |
e4b2b4a8 JK |
31440 | + var_val = val->fn(val, elt, rbe, rec); |
31441 | + var_idx = var->var.idx; | |
1a6e0f06 | 31442 | + |
e4b2b4a8 JK |
31443 | + if (val->flags & HIST_FIELD_FL_STRING) { |
31444 | + char *str = elt_data->field_var_str[j++]; | |
31445 | + char *val_str = (char *)(uintptr_t)var_val; | |
1a6e0f06 | 31446 | + |
e4b2b4a8 JK |
31447 | + strscpy(str, val_str, STR_VAR_LEN_MAX); |
31448 | + var_val = (u64)(uintptr_t)str; | |
1a6e0f06 | 31449 | + } |
e4b2b4a8 | 31450 | + tracing_map_set_var(elt, var_idx, var_val); |
1a6e0f06 | 31451 | + } |
1a6e0f06 JK |
31452 | +} |
31453 | + | |
e4b2b4a8 JK |
31454 | +static void update_field_vars(struct hist_trigger_data *hist_data, |
31455 | + struct tracing_map_elt *elt, | |
31456 | + struct ring_buffer_event *rbe, | |
31457 | + void *rec) | |
1a6e0f06 | 31458 | +{ |
e4b2b4a8 JK |
31459 | + __update_field_vars(elt, rbe, rec, hist_data->field_vars, |
31460 | + hist_data->n_field_vars, 0); | |
31461 | +} | |
1a6e0f06 | 31462 | + |
e4b2b4a8 JK |
31463 | +static void update_max_vars(struct hist_trigger_data *hist_data, |
31464 | + struct tracing_map_elt *elt, | |
31465 | + struct ring_buffer_event *rbe, | |
31466 | + void *rec) | |
31467 | +{ | |
31468 | + __update_field_vars(elt, rbe, rec, hist_data->max_vars, | |
31469 | + hist_data->n_max_vars, hist_data->n_field_var_str); | |
1a6e0f06 JK |
31470 | +} |
31471 | + | |
e4b2b4a8 JK |
31472 | +static struct hist_field *create_var(struct hist_trigger_data *hist_data, |
31473 | + struct trace_event_file *file, | |
31474 | + char *name, int size, const char *type) | |
31475 | +{ | |
31476 | + struct hist_field *var; | |
31477 | + int idx; | |
1a6e0f06 | 31478 | + |
e4b2b4a8 JK |
31479 | + if (find_var(hist_data, file, name) && !hist_data->remove) { |
31480 | + var = ERR_PTR(-EINVAL); | |
31481 | + goto out; | |
31482 | + } | |
1a6e0f06 | 31483 | + |
e4b2b4a8 JK |
31484 | + var = kzalloc(sizeof(struct hist_field), GFP_KERNEL); |
31485 | + if (!var) { | |
31486 | + var = ERR_PTR(-ENOMEM); | |
31487 | + goto out; | |
31488 | + } | |
1a6e0f06 | 31489 | + |
e4b2b4a8 JK |
31490 | + idx = tracing_map_add_var(hist_data->map); |
31491 | + if (idx < 0) { | |
31492 | + kfree(var); | |
31493 | + var = ERR_PTR(-EINVAL); | |
31494 | + goto out; | |
31495 | + } | |
1a6e0f06 | 31496 | + |
e4b2b4a8 JK |
31497 | + var->flags = HIST_FIELD_FL_VAR; |
31498 | + var->var.idx = idx; | |
31499 | + var->var.hist_data = var->hist_data = hist_data; | |
31500 | + var->size = size; | |
31501 | + var->var.name = kstrdup(name, GFP_KERNEL); | |
31502 | + var->type = kstrdup(type, GFP_KERNEL); | |
31503 | + if (!var->var.name || !var->type) { | |
31504 | + kfree(var->var.name); | |
31505 | + kfree(var->type); | |
31506 | + kfree(var); | |
31507 | + var = ERR_PTR(-ENOMEM); | |
31508 | + } | |
31509 | + out: | |
31510 | + return var; | |
31511 | +} | |
1a6e0f06 | 31512 | + |
e4b2b4a8 JK |
31513 | +static struct field_var *create_field_var(struct hist_trigger_data *hist_data, |
31514 | + struct trace_event_file *file, | |
31515 | + char *field_name) | |
1a6e0f06 | 31516 | +{ |
e4b2b4a8 JK |
31517 | + struct hist_field *val = NULL, *var = NULL; |
31518 | + unsigned long flags = HIST_FIELD_FL_VAR; | |
31519 | + struct field_var *field_var; | |
b3bbd485 JK |
31520 | + int ret = 0; |
31521 | + | |
e4b2b4a8 JK |
31522 | + if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { |
31523 | + hist_err("Too many field variables defined: ", field_name); | |
31524 | + ret = -EINVAL; | |
31525 | + goto err; | |
31526 | + } | |
31527 | + | |
31528 | + val = parse_atom(hist_data, file, field_name, &flags, NULL); | |
31529 | + if (IS_ERR(val)) { | |
31530 | + hist_err("Couldn't parse field variable: ", field_name); | |
31531 | + ret = PTR_ERR(val); | |
31532 | + goto err; | |
31533 | + } | |
31534 | + | |
31535 | + var = create_var(hist_data, file, field_name, val->size, val->type); | |
31536 | + if (IS_ERR(var)) { | |
31537 | + hist_err("Couldn't create or find variable: ", field_name); | |
31538 | + kfree(val); | |
31539 | + ret = PTR_ERR(var); | |
31540 | + goto err; | |
31541 | + } | |
31542 | + | |
31543 | + field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); | |
31544 | + if (!field_var) { | |
31545 | + kfree(val); | |
31546 | + kfree(var); | |
31547 | + ret = -ENOMEM; | |
31548 | + goto err; | |
31549 | + } | |
31550 | + | |
31551 | + field_var->var = var; | |
31552 | + field_var->val = val; | |
31553 | + out: | |
31554 | + return field_var; | |
31555 | + err: | |
31556 | + field_var = ERR_PTR(ret); | |
31557 | + goto out; | |
1a6e0f06 JK |
31558 | +} |
31559 | + | |
e4b2b4a8 JK |
31560 | +/** |
31561 | + * create_target_field_var - Automatically create a variable for a field | |
31562 | + * @target_hist_data: The target hist trigger | |
31563 | + * @subsys_name: Optional subsystem name | |
31564 | + * @event_name: Optional event name | |
31565 | + * @var_name: The name of the field (and the resulting variable) | |
31566 | + * | |
31567 | + * Hist trigger actions fetch data from variables, not directly from | |
31568 | + * events. However, for convenience, users are allowed to directly | |
31569 | + * specify an event field in an action, which will be automatically | |
31570 | + * converted into a variable on their behalf. | |
31571 | + * | |
31572 | + * This function creates a field variable with the name var_name on | |
31573 | + * the hist trigger currently being defined on the target event. If | |
31574 | + * subsys_name and event_name are specified, this function simply | |
31575 | + * verifies that they do in fact match the target event subsystem and | |
31576 | + * event name. | |
31577 | + * | |
31578 | + * Return: The variable created for the field. | |
1a6e0f06 | 31579 | + */ |
e4b2b4a8 JK |
31580 | +static struct field_var * |
31581 | +create_target_field_var(struct hist_trigger_data *target_hist_data, | |
31582 | + char *subsys_name, char *event_name, char *var_name) | |
1a6e0f06 | 31583 | +{ |
e4b2b4a8 | 31584 | + struct trace_event_file *file = target_hist_data->event_file; |
1a6e0f06 | 31585 | + |
e4b2b4a8 JK |
31586 | + if (subsys_name) { |
31587 | + struct trace_event_call *call; | |
1a6e0f06 | 31588 | + |
e4b2b4a8 JK |
31589 | + if (!event_name) |
31590 | + return NULL; | |
1a6e0f06 | 31591 | + |
e4b2b4a8 JK |
31592 | + call = file->event_call; |
31593 | + | |
31594 | + if (strcmp(subsys_name, call->class->system) != 0) | |
31595 | + return NULL; | |
31596 | + | |
31597 | + if (strcmp(event_name, trace_event_name(call)) != 0) | |
31598 | + return NULL; | |
31599 | + } | |
31600 | + | |
31601 | + return create_field_var(target_hist_data, file, var_name); | |
1a6e0f06 JK |
31602 | +} |
31603 | + | |
e4b2b4a8 JK |
31604 | +static void onmax_print(struct seq_file *m, |
31605 | + struct hist_trigger_data *hist_data, | |
31606 | + struct tracing_map_elt *elt, | |
31607 | + struct action_data *data) | |
1a6e0f06 | 31608 | +{ |
e4b2b4a8 | 31609 | + unsigned int i, save_var_idx, max_idx = data->onmax.max_var->var.idx; |
1a6e0f06 | 31610 | + |
e4b2b4a8 | 31611 | + seq_printf(m, "\n\tmax: %10llu", tracing_map_read_var(elt, max_idx)); |
1a6e0f06 | 31612 | + |
e4b2b4a8 JK |
31613 | + for (i = 0; i < hist_data->n_max_vars; i++) { |
31614 | + struct hist_field *save_val = hist_data->max_vars[i]->val; | |
31615 | + struct hist_field *save_var = hist_data->max_vars[i]->var; | |
31616 | + u64 val; | |
1a6e0f06 | 31617 | + |
e4b2b4a8 | 31618 | + save_var_idx = save_var->var.idx; |
1a6e0f06 | 31619 | + |
e4b2b4a8 | 31620 | + val = tracing_map_read_var(elt, save_var_idx); |
1a6e0f06 | 31621 | + |
e4b2b4a8 JK |
31622 | + if (save_val->flags & HIST_FIELD_FL_STRING) { |
31623 | + seq_printf(m, " %s: %-32s", save_var->var.name, | |
31624 | + (char *)(uintptr_t)(val)); | |
31625 | + } else | |
31626 | + seq_printf(m, " %s: %10llu", save_var->var.name, val); | |
31627 | + } | |
1a6e0f06 JK |
31628 | +} |
31629 | + | |
e4b2b4a8 JK |
31630 | +static void onmax_save(struct hist_trigger_data *hist_data, |
31631 | + struct tracing_map_elt *elt, void *rec, | |
31632 | + struct ring_buffer_event *rbe, | |
31633 | + struct action_data *data, u64 *var_ref_vals) | |
1a6e0f06 | 31634 | +{ |
e4b2b4a8 JK |
31635 | + unsigned int max_idx = data->onmax.max_var->var.idx; |
31636 | + unsigned int max_var_ref_idx = data->onmax.max_var_ref_idx; | |
1a6e0f06 | 31637 | + |
e4b2b4a8 | 31638 | + u64 var_val, max_val; |
1a6e0f06 | 31639 | + |
e4b2b4a8 JK |
31640 | + var_val = var_ref_vals[max_var_ref_idx]; |
31641 | + max_val = tracing_map_read_var(elt, max_idx); | |
31642 | + | |
31643 | + if (var_val <= max_val) | |
31644 | + return; | |
31645 | + | |
31646 | + tracing_map_set_var(elt, max_idx, var_val); | |
31647 | + | |
31648 | + update_max_vars(hist_data, elt, rbe, rec); | |
31649 | +} | |
1a6e0f06 | 31650 | + |
e4b2b4a8 | 31651 | +static void onmax_destroy(struct action_data *data) |
1a6e0f06 | 31652 | +{ |
e4b2b4a8 | 31653 | + unsigned int i; |
1a6e0f06 | 31654 | + |
e4b2b4a8 JK |
31655 | + destroy_hist_field(data->onmax.max_var, 0); |
31656 | + destroy_hist_field(data->onmax.var, 0); | |
1a6e0f06 | 31657 | + |
e4b2b4a8 JK |
31658 | + kfree(data->onmax.var_str); |
31659 | + kfree(data->onmax.fn_name); | |
1a6e0f06 | 31660 | + |
e4b2b4a8 JK |
31661 | + for (i = 0; i < data->n_params; i++) |
31662 | + kfree(data->params[i]); | |
1a6e0f06 | 31663 | + |
e4b2b4a8 JK |
31664 | + kfree(data); |
31665 | +} | |
1a6e0f06 | 31666 | + |
e4b2b4a8 JK |
31667 | +static int onmax_create(struct hist_trigger_data *hist_data, |
31668 | + struct action_data *data) | |
31669 | +{ | |
31670 | + struct trace_event_file *file = hist_data->event_file; | |
31671 | + struct hist_field *var_field, *ref_field, *max_var; | |
31672 | + unsigned int var_ref_idx = hist_data->n_var_refs; | |
31673 | + struct field_var *field_var; | |
31674 | + char *onmax_var_str, *param; | |
31675 | + unsigned long flags; | |
31676 | + unsigned int i; | |
31677 | + int ret = 0; | |
1a6e0f06 | 31678 | + |
e4b2b4a8 JK |
31679 | + onmax_var_str = data->onmax.var_str; |
31680 | + if (onmax_var_str[0] != '$') { | |
31681 | + hist_err("onmax: For onmax(x), x must be a variable: ", onmax_var_str); | |
b3bbd485 | 31682 | + return -EINVAL; |
e4b2b4a8 JK |
31683 | + } |
31684 | + onmax_var_str++; | |
b3bbd485 | 31685 | + |
e4b2b4a8 JK |
31686 | + var_field = find_target_event_var(hist_data, NULL, NULL, onmax_var_str); |
31687 | + if (!var_field) { | |
31688 | + hist_err("onmax: Couldn't find onmax variable: ", onmax_var_str); | |
31689 | + return -EINVAL; | |
31690 | + } | |
1a6e0f06 | 31691 | + |
e4b2b4a8 JK |
31692 | + flags = HIST_FIELD_FL_VAR_REF; |
31693 | + ref_field = create_hist_field(hist_data, NULL, flags, NULL); | |
31694 | + if (!ref_field) | |
31695 | + return -ENOMEM; | |
1a6e0f06 | 31696 | + |
e4b2b4a8 JK |
31697 | + if (init_var_ref(ref_field, var_field, NULL, NULL)) { |
31698 | + destroy_hist_field(ref_field, 0); | |
31699 | + ret = -ENOMEM; | |
31700 | + goto out; | |
1a6e0f06 | 31701 | + } |
e4b2b4a8 JK |
31702 | + hist_data->var_refs[hist_data->n_var_refs] = ref_field; |
31703 | + ref_field->var_ref_idx = hist_data->n_var_refs++; | |
31704 | + data->onmax.var = ref_field; | |
31705 | + | |
31706 | + data->fn = onmax_save; | |
31707 | + data->onmax.max_var_ref_idx = var_ref_idx; | |
31708 | + max_var = create_var(hist_data, file, "max", sizeof(u64), "u64"); | |
31709 | + if (IS_ERR(max_var)) { | |
31710 | + hist_err("onmax: Couldn't create onmax variable: ", "max"); | |
31711 | + ret = PTR_ERR(max_var); | |
31712 | + goto out; | |
31713 | + } | |
31714 | + data->onmax.max_var = max_var; | |
1a6e0f06 | 31715 | + |
e4b2b4a8 JK |
31716 | + for (i = 0; i < data->n_params; i++) { |
31717 | + param = kstrdup(data->params[i], GFP_KERNEL); | |
31718 | + if (!param) { | |
31719 | + ret = -ENOMEM; | |
31720 | + goto out; | |
31721 | + } | |
31722 | + | |
31723 | + field_var = create_target_field_var(hist_data, NULL, NULL, param); | |
31724 | + if (IS_ERR(field_var)) { | |
31725 | + hist_err("onmax: Couldn't create field variable: ", param); | |
31726 | + ret = PTR_ERR(field_var); | |
31727 | + kfree(param); | |
31728 | + goto out; | |
31729 | + } | |
31730 | + | |
31731 | + hist_data->max_vars[hist_data->n_max_vars++] = field_var; | |
31732 | + if (field_var->val->flags & HIST_FIELD_FL_STRING) | |
31733 | + hist_data->n_max_var_str++; | |
31734 | + | |
31735 | + kfree(param); | |
1a6e0f06 | 31736 | + } |
e4b2b4a8 JK |
31737 | + out: |
31738 | + return ret; | |
1a6e0f06 JK |
31739 | +} |
31740 | + | |
e4b2b4a8 | 31741 | +static int parse_action_params(char *params, struct action_data *data) |
1a6e0f06 | 31742 | +{ |
e4b2b4a8 JK |
31743 | + char *param, *saved_param; |
31744 | + int ret = 0; | |
1a6e0f06 | 31745 | + |
e4b2b4a8 JK |
31746 | + while (params) { |
31747 | + if (data->n_params >= SYNTH_FIELDS_MAX) | |
31748 | + goto out; | |
1a6e0f06 | 31749 | + |
e4b2b4a8 JK |
31750 | + param = strsep(¶ms, ","); |
31751 | + if (!param) { | |
31752 | + ret = -EINVAL; | |
31753 | + goto out; | |
31754 | + } | |
1a6e0f06 | 31755 | + |
e4b2b4a8 JK |
31756 | + param = strstrip(param); |
31757 | + if (strlen(param) < 2) { | |
31758 | + hist_err("Invalid action param: ", param); | |
b3bbd485 JK |
31759 | + ret = -EINVAL; |
31760 | + goto out; | |
31761 | + } | |
1a6e0f06 | 31762 | + |
e4b2b4a8 JK |
31763 | + saved_param = kstrdup(param, GFP_KERNEL); |
31764 | + if (!saved_param) { | |
31765 | + ret = -ENOMEM; | |
31766 | + goto out; | |
31767 | + } | |
31768 | + | |
31769 | + data->params[data->n_params++] = saved_param; | |
b3bbd485 | 31770 | + } |
e4b2b4a8 JK |
31771 | + out: |
31772 | + return ret; | |
31773 | +} | |
b3bbd485 | 31774 | + |
e4b2b4a8 | 31775 | +static struct action_data *onmax_parse(char *str) |
1a6e0f06 | 31776 | +{ |
e4b2b4a8 JK |
31777 | + char *onmax_fn_name, *onmax_var_str; |
31778 | + struct action_data *data; | |
31779 | + int ret = -EINVAL; | |
1a6e0f06 | 31780 | + |
e4b2b4a8 JK |
31781 | + data = kzalloc(sizeof(*data), GFP_KERNEL); |
31782 | + if (!data) | |
31783 | + return ERR_PTR(-ENOMEM); | |
1a6e0f06 | 31784 | + |
e4b2b4a8 JK |
31785 | + onmax_var_str = strsep(&str, ")"); |
31786 | + if (!onmax_var_str || !str) { | |
b3bbd485 | 31787 | + ret = -EINVAL; |
e4b2b4a8 | 31788 | + goto free; |
b3bbd485 JK |
31789 | + } |
31790 | + | |
e4b2b4a8 JK |
31791 | + data->onmax.var_str = kstrdup(onmax_var_str, GFP_KERNEL); |
31792 | + if (!data->onmax.var_str) { | |
31793 | + ret = -ENOMEM; | |
31794 | + goto free; | |
31795 | + } | |
31796 | + | |
31797 | + strsep(&str, "."); | |
31798 | + if (!str) | |
31799 | + goto free; | |
31800 | + | |
31801 | + onmax_fn_name = strsep(&str, "("); | |
31802 | + if (!onmax_fn_name || !str) | |
31803 | + goto free; | |
31804 | + | |
31805 | + if (strncmp(onmax_fn_name, "save", strlen("save")) == 0) { | |
31806 | + char *params = strsep(&str, ")"); | |
31807 | + | |
31808 | + if (!params) { | |
31809 | + ret = -EINVAL; | |
31810 | + goto free; | |
1a6e0f06 | 31811 | + } |
1a6e0f06 | 31812 | + |
e4b2b4a8 JK |
31813 | + ret = parse_action_params(params, data); |
31814 | + if (ret) | |
31815 | + goto free; | |
31816 | + } else | |
31817 | + goto free; | |
31818 | + | |
31819 | + data->onmax.fn_name = kstrdup(onmax_fn_name, GFP_KERNEL); | |
31820 | + if (!data->onmax.fn_name) { | |
31821 | + ret = -ENOMEM; | |
31822 | + goto free; | |
1a6e0f06 | 31823 | + } |
e4b2b4a8 JK |
31824 | + out: |
31825 | + return data; | |
31826 | + free: | |
31827 | + onmax_destroy(data); | |
31828 | + data = ERR_PTR(ret); | |
31829 | + goto out; | |
1a6e0f06 JK |
31830 | +} |
31831 | + | |
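onmax_parse() above expects the form 'onmax($var).save(field1[,field2,...])'. An illustrative trigger, assuming $wakeup_lat is defined in the same command and $ts0 was saved by an earlier trigger on another event:

    echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,next_prio)' >> \
        /sys/kernel/debug/tracing/events/sched/sched_switch/trigger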
e4b2b4a8 JK |
31832 | +static void onmatch_destroy(struct action_data *data) |
31833 | +{ | |
31834 | + unsigned int i; | |
31835 | + | |
31836 | + mutex_lock(&synth_event_mutex); | |
31837 | + | |
31838 | + kfree(data->onmatch.match_event); | |
31839 | + kfree(data->onmatch.match_event_system); | |
31840 | + kfree(data->onmatch.synth_event_name); | |
31841 | + | |
31842 | + for (i = 0; i < data->n_params; i++) | |
31843 | + kfree(data->params[i]); | |
31844 | + | |
31845 | + if (data->onmatch.synth_event) | |
31846 | + data->onmatch.synth_event->ref--; | |
31847 | + | |
31848 | + kfree(data); | |
31849 | + | |
31850 | + mutex_unlock(&synth_event_mutex); | |
31851 | +} | |
31852 | + | |
31853 | +static void destroy_field_var(struct field_var *field_var) | |
31854 | +{ | |
31855 | + if (!field_var) | |
31856 | + return; | |
31857 | + | |
31858 | + destroy_hist_field(field_var->var, 0); | |
31859 | + destroy_hist_field(field_var->val, 0); | |
31860 | + | |
31861 | + kfree(field_var); | |
1a6e0f06 JK |
31862 | +} |
31863 | + | |
e4b2b4a8 JK |
31864 | +static void destroy_field_vars(struct hist_trigger_data *hist_data) |
31865 | +{ | |
31866 | + unsigned int i; | |
1a6e0f06 | 31867 | + |
e4b2b4a8 JK |
31868 | + for (i = 0; i < hist_data->n_field_vars; i++) |
31869 | + destroy_field_var(hist_data->field_vars[i]); | |
31870 | +} | |
31871 | + | |
31872 | +static void save_field_var(struct hist_trigger_data *hist_data, | |
31873 | + struct field_var *field_var) | |
1a6e0f06 | 31874 | +{ |
e4b2b4a8 | 31875 | + hist_data->field_vars[hist_data->n_field_vars++] = field_var; |
1a6e0f06 | 31876 | + |
e4b2b4a8 JK |
31877 | + if (field_var->val->flags & HIST_FIELD_FL_STRING) |
31878 | + hist_data->n_field_var_str++; | |
31879 | +} | |
1a6e0f06 | 31880 | + |
e4b2b4a8 JK |
31881 | + |
31882 | +static void destroy_synth_var_refs(struct hist_trigger_data *hist_data) | |
31883 | +{ | |
31884 | + unsigned int i; | |
31885 | + | |
31886 | + for (i = 0; i < hist_data->n_synth_var_refs; i++) | |
31887 | + destroy_hist_field(hist_data->synth_var_refs[i], 0); | |
1a6e0f06 | 31888 | +} |
e4b2b4a8 JK |
31889 | + |
31890 | +static void save_synth_var_ref(struct hist_trigger_data *hist_data, | |
31891 | + struct hist_field *var_ref) | |
1a6e0f06 | 31892 | +{ |
e4b2b4a8 JK |
31893 | + hist_data->synth_var_refs[hist_data->n_synth_var_refs++] = var_ref; |
31894 | + | |
31895 | + hist_data->var_refs[hist_data->n_var_refs] = var_ref; | |
31896 | + var_ref->var_ref_idx = hist_data->n_var_refs++; | |
1a6e0f06 | 31897 | +} |
1a6e0f06 | 31898 | + |
e4b2b4a8 JK |
31899 | +static int check_synth_field(struct synth_event *event, |
31900 | + struct hist_field *hist_field, | |
31901 | + unsigned int field_pos) | |
1a6e0f06 | 31902 | +{ |
e4b2b4a8 JK |
31903 | + struct synth_field *field; |
31904 | + | |
31905 | + if (field_pos >= event->n_fields) | |
31906 | + return -EINVAL; | |
31907 | + | |
31908 | + field = event->fields[field_pos]; | |
31909 | + | |
31910 | + if (strcmp(field->type, hist_field->type) != 0) | |
31911 | + return -EINVAL; | |
31912 | + | |
31913 | + return 0; | |
1a6e0f06 JK |
31914 | +} |
31915 | + | |
e4b2b4a8 JK |
31916 | +static struct hist_field * |
31917 | +onmatch_find_var(struct hist_trigger_data *hist_data, struct action_data *data, | |
31918 | + char *system, char *event, char *var) | |
31919 | +{ | |
31920 | + struct hist_field *hist_field; | |
31921 | + | |
31922 | + var++; /* skip '$' */ | |
31923 | + | |
31924 | + hist_field = find_target_event_var(hist_data, system, event, var); | |
31925 | + if (!hist_field) { | |
31926 | + if (!system) { | |
31927 | + system = data->onmatch.match_event_system; | |
31928 | + event = data->onmatch.match_event; | |
31929 | + } | |
31930 | + | |
31931 | + hist_field = find_event_var(hist_data, system, event, var); | |
1a6e0f06 JK |
31932 | + } |
31933 | + | |
e4b2b4a8 JK |
31934 | + if (!hist_field) |
31935 | + hist_err_event("onmatch: Couldn't find onmatch param: $", system, event, var); | |
31936 | + | |
31937 | + return hist_field; | |
31938 | +} | |
31939 | + | |
31940 | +static struct hist_field * | |
31941 | +onmatch_create_field_var(struct hist_trigger_data *hist_data, | |
31942 | + struct action_data *data, char *system, | |
31943 | + char *event, char *var) | |
31944 | +{ | |
31945 | + struct hist_field *hist_field = NULL; | |
31946 | + struct field_var *field_var; | |
31947 | + | |
31948 | + /* | |
31949 | + * First try to create a field var on the target event (the event | |
31950 | + * currently being defined). This will create a variable for | |
31951 | + * unqualified fields on the target event, or if qualified, | |
31952 | + * target fields that have qualified names matching the target. | |
31953 | + */ | |
31954 | + field_var = create_target_field_var(hist_data, system, event, var); | |
31955 | + | |
31956 | + if (field_var && !IS_ERR(field_var)) { | |
31957 | + save_field_var(hist_data, field_var); | |
31958 | + hist_field = field_var->var; | |
31959 | + } else { | |
31960 | + field_var = NULL; | |
31961 | + /* | |
31962 | + * If no explicit system.event is specified, default to | |
31963 | + * looking for fields on the onmatch(system.event.xxx) | |
31964 | + * event. | |
31965 | + */ | |
31966 | + if (!system) { | |
31967 | + system = data->onmatch.match_event_system; | |
31968 | + event = data->onmatch.match_event; | |
31969 | + } | |
31970 | + | |
31971 | + /* | |
31972 | + * At this point, we're looking at a field on another | |
31973 | + * event. Because we can't modify a hist trigger on | |
31974 | + * another event to add a variable for a field, we need | |
31975 | + * to create a new trigger on that event and create the | |
31976 | + * variable at the same time. | |
31977 | + */ | |
31978 | + hist_field = create_field_var_hist(hist_data, system, event, var); | |
31979 | + if (IS_ERR(hist_field)) | |
31980 | + goto free; | |
31981 | + } | |
31982 | + out: | |
31983 | + return hist_field; | |
31984 | + free: | |
31985 | + destroy_field_var(field_var); | |
31986 | + hist_field = NULL; | |
31987 | + goto out; | |
31988 | +} | |
31989 | + | |
31990 | +static int onmatch_create(struct hist_trigger_data *hist_data, | |
31991 | + struct trace_event_file *file, | |
31992 | + struct action_data *data) | |
31993 | +{ | |
31994 | + char *event_name, *param, *system = NULL; | |
31995 | + struct hist_field *hist_field, *var_ref; | |
31996 | + unsigned int i, var_ref_idx; | |
31997 | + unsigned int field_pos = 0; | |
31998 | + struct synth_event *event; | |
31999 | + int ret = 0; | |
32000 | + | |
32001 | + mutex_lock(&synth_event_mutex); | |
32002 | + event = find_synth_event(data->onmatch.synth_event_name); | |
32003 | + if (!event) { | |
32004 | + hist_err("onmatch: Couldn't find synthetic event: ", data->onmatch.synth_event_name); | |
32005 | + mutex_unlock(&synth_event_mutex); | |
32006 | + return -EINVAL; | |
32007 | + } | |
32008 | + event->ref++; | |
32009 | + mutex_unlock(&synth_event_mutex); | |
32010 | + | |
32011 | + var_ref_idx = hist_data->n_var_refs; | |
32012 | + | |
32013 | + for (i = 0; i < data->n_params; i++) { | |
32014 | + char *p; | |
32015 | + | |
32016 | + p = param = kstrdup(data->params[i], GFP_KERNEL); | |
32017 | + if (!param) { | |
32018 | + ret = -ENOMEM; | |
32019 | + goto err; | |
32020 | + } | |
32021 | + | |
32022 | + system = strsep(¶m, "."); | |
32023 | + if (!param) { | |
32024 | + param = (char *)system; | |
32025 | + system = event_name = NULL; | |
32026 | + } else { | |
32027 | + event_name = strsep(¶m, "."); | |
32028 | + if (!param) { | |
32029 | + kfree(p); | |
32030 | + ret = -EINVAL; | |
32031 | + goto err; | |
32032 | + } | |
32033 | + } | |
32034 | + | |
32035 | + if (param[0] == '$') | |
32036 | + hist_field = onmatch_find_var(hist_data, data, system, | |
32037 | + event_name, param); | |
32038 | + else | |
32039 | + hist_field = onmatch_create_field_var(hist_data, data, | |
32040 | + system, | |
32041 | + event_name, | |
32042 | + param); | |
32043 | + | |
32044 | + if (!hist_field) { | |
32045 | + kfree(p); | |
32046 | + ret = -EINVAL; | |
32047 | + goto err; | |
32048 | + } | |
32049 | + | |
32050 | + if (check_synth_field(event, hist_field, field_pos) == 0) { | |
32051 | + var_ref = create_var_ref(hist_field, system, event_name); | |
32052 | + if (!var_ref) { | |
32053 | + kfree(p); | |
32054 | + ret = -ENOMEM; | |
32055 | + goto err; | |
32056 | + } | |
32057 | + | |
32058 | + save_synth_var_ref(hist_data, var_ref); | |
32059 | + field_pos++; | |
32060 | + kfree(p); | |
32061 | + continue; | |
32062 | + } | |
32063 | + | |
32064 | + hist_err_event("onmatch: Param type doesn't match synthetic event field type: ", | |
32065 | + system, event_name, param); | |
32066 | + kfree(p); | |
32067 | + ret = -EINVAL; | |
32068 | + goto err; | |
32069 | + } | |
32070 | + | |
32071 | + if (field_pos != event->n_fields) { | |
32072 | + hist_err("onmatch: Param count doesn't match synthetic event field count: ", event->name); | |
32073 | + ret = -EINVAL; | |
32074 | + goto err; | |
32075 | + } | |
32076 | + | |
32077 | + data->fn = action_trace; | |
32078 | + data->onmatch.synth_event = event; | |
32079 | + data->onmatch.var_ref_idx = var_ref_idx; | |
32080 | + out: | |
32081 | + return ret; | |
32082 | + err: | |
32083 | + mutex_lock(&synth_event_mutex); | |
32084 | + event->ref--; | |
32085 | + mutex_unlock(&synth_event_mutex); | |
32086 | + | |
32087 | + goto out; | |
32088 | +} | |
32089 | + | |
32090 | +static struct action_data *onmatch_parse(struct trace_array *tr, char *str) | |
32091 | +{ | |
32092 | + char *match_event, *match_event_system; | |
32093 | + char *synth_event_name, *params; | |
32094 | + struct action_data *data; | |
32095 | + int ret = -EINVAL; | |
32096 | + | |
32097 | + data = kzalloc(sizeof(*data), GFP_KERNEL); | |
32098 | + if (!data) | |
32099 | + return ERR_PTR(-ENOMEM); | |
32100 | + | |
32101 | + match_event = strsep(&str, ")"); | |
32102 | + if (!match_event || !str) { | |
32103 | + hist_err("onmatch: Missing closing paren: ", match_event); | |
32104 | + goto free; | |
32105 | + } | |
32106 | + | |
32107 | + match_event_system = strsep(&match_event, "."); | |
32108 | + if (!match_event) { | |
32109 | + hist_err("onmatch: Missing subsystem for match event: ", match_event_system); | |
32110 | + goto free; | |
32111 | + } | |
32112 | + | |
32113 | + if (IS_ERR(event_file(tr, match_event_system, match_event))) { | |
32114 | + hist_err_event("onmatch: Invalid subsystem or event name: ", | |
32115 | + match_event_system, match_event, NULL); | |
32116 | + goto free; | |
b3bbd485 JK |
32117 | } |
32118 | ||
32119 | - if (WARN_ON_ONCE(!field)) | |
32120 | - goto out; | |
e4b2b4a8 JK |
32121 | + data->onmatch.match_event = kstrdup(match_event, GFP_KERNEL); |
32122 | + if (!data->onmatch.match_event) { | |
32123 | + ret = -ENOMEM; | |
32124 | + goto free; | |
32125 | + } | |
b3bbd485 JK |
32126 | |
32127 | - if (is_string_field(field)) { | |
32128 | - flags |= HIST_FIELD_FL_STRING; | |
e4b2b4a8 JK |
32129 | + data->onmatch.match_event_system = kstrdup(match_event_system, GFP_KERNEL); |
32130 | + if (!data->onmatch.match_event_system) { | |
32131 | + ret = -ENOMEM; | |
32132 | + goto free; | |
32133 | + } | |
b3bbd485 JK |
32134 | |
32135 | - if (field->filter_type == FILTER_STATIC_STRING) | |
32136 | - hist_field->fn = hist_field_string; | |
32137 | - else if (field->filter_type == FILTER_DYN_STRING) | |
32138 | - hist_field->fn = hist_field_dynstring; | |
32139 | - else | |
32140 | - hist_field->fn = hist_field_pstring; | |
32141 | - } else { | |
32142 | - hist_field->fn = select_value_fn(field->size, | |
32143 | - field->is_signed); | |
32144 | - if (!hist_field->fn) { | |
32145 | - destroy_hist_field(hist_field); | |
32146 | - return NULL; | |
32147 | - } | |
e4b2b4a8 JK |
32148 | + strsep(&str, "."); |
32149 | + if (!str) { | |
32150 | + hist_err("onmatch: Missing . after onmatch(): ", str); | |
32151 | + goto free; | |
b3bbd485 JK |
32152 | } |
32153 | - out: | |
32154 | - hist_field->field = field; | |
32155 | - hist_field->flags = flags; | |
32156 | ||
32157 | - return hist_field; | |
32158 | -} | |
e4b2b4a8 JK |
32159 | + synth_event_name = strsep(&str, "("); |
32160 | + if (!synth_event_name || !str) { | |
32161 | + hist_err("onmatch: Missing opening paramlist paren: ", synth_event_name); | |
32162 | + goto free; | |
32163 | + } | |
b3bbd485 JK |
32164 | |
32165 | -static void destroy_hist_fields(struct hist_trigger_data *hist_data) | |
32166 | -{ | |
32167 | - unsigned int i; | |
e4b2b4a8 JK |
32168 | + data->onmatch.synth_event_name = kstrdup(synth_event_name, GFP_KERNEL); |
32169 | + if (!data->onmatch.synth_event_name) { | |
b3bbd485 | 32170 | + ret = -ENOMEM; |
e4b2b4a8 JK |
32171 | + goto free; |
32172 | + } | |
b3bbd485 JK |
32173 | |
32174 | - for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) { | |
32175 | - if (hist_data->fields[i]) { | |
32176 | - destroy_hist_field(hist_data->fields[i]); | |
32177 | - hist_data->fields[i] = NULL; | |
32178 | - } | |
e4b2b4a8 JK |
32179 | + params = strsep(&str, ")"); |
32180 | + if (!params || !str || (str && strlen(str))) { | |
32181 | + hist_err("onmatch: Missing closing paramlist paren: ", params); | |
32182 | + goto free; | |
b3bbd485 | 32183 | } |
e4b2b4a8 JK |
32184 | + |
32185 | + ret = parse_action_params(params, data); | |
32186 | + if (ret) | |
32187 | + goto free; | |
32188 | + out: | |
32189 | + return data; | |
32190 | + free: | |
32191 | + onmatch_destroy(data); | |
32192 | + data = ERR_PTR(ret); | |
32193 | + goto out; | |
b3bbd485 JK |
32194 | } |
32195 | ||
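onmatch_parse() carves its input into 'onmatch(<subsys>.<event>).<synth_event>(<param list>)'. A sketch, assuming a synthetic event named wakeup_latency has already been created:

    echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm)' >> \
        /sys/kernel/debug/tracing/events/sched/sched_switch/trigger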
32196 | static int create_hitcount_val(struct hist_trigger_data *hist_data) | |
32197 | { | |
32198 | hist_data->fields[HITCOUNT_IDX] = | |
32199 | - create_hist_field(NULL, HIST_FIELD_FL_HITCOUNT); | |
e4b2b4a8 | 32200 | + create_hist_field(hist_data, NULL, HIST_FIELD_FL_HITCOUNT, NULL); |
b3bbd485 JK |
32201 | if (!hist_data->fields[HITCOUNT_IDX]) |
32202 | return -ENOMEM; | |
32203 | ||
32204 | hist_data->n_vals++; | |
e4b2b4a8 | 32205 | + hist_data->n_fields++; |
b3bbd485 JK |
32206 | |
32207 | if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) | |
32208 | return -EINVAL; | |
32209 | @@ -426,54 +3828,71 @@ static int create_hitcount_val(struct hist_trigger_data *hist_data) | |
32210 | return 0; | |
32211 | } | |
32212 | ||
e4b2b4a8 JK |
32213 | +static int __create_val_field(struct hist_trigger_data *hist_data, |
32214 | + unsigned int val_idx, | |
32215 | + struct trace_event_file *file, | |
32216 | + char *var_name, char *field_str, | |
32217 | + unsigned long flags) | |
32218 | +{ | |
32219 | + struct hist_field *hist_field; | |
32220 | + int ret = 0; | |
32221 | + | |
32222 | + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); | |
32223 | + if (IS_ERR(hist_field)) { | |
32224 | + ret = PTR_ERR(hist_field); | |
b3bbd485 JK |
32225 | + goto out; |
32226 | + } | |
32227 | + | |
e4b2b4a8 JK |
32228 | + hist_data->fields[val_idx] = hist_field; |
32229 | + | |
b3bbd485 | 32230 | + ++hist_data->n_vals; |
e4b2b4a8 | 32231 | + ++hist_data->n_fields; |
b3bbd485 | 32232 | + |
e4b2b4a8 | 32233 | + if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) |
b3bbd485 JK |
32234 | + ret = -EINVAL; |
32235 | + out: | |
32236 | + return ret; | |
32237 | +} | |
1a6e0f06 | 32238 | + |
b3bbd485 JK |
32239 | static int create_val_field(struct hist_trigger_data *hist_data, |
32240 | unsigned int val_idx, | |
32241 | struct trace_event_file *file, | |
32242 | char *field_str) | |
32243 | { | |
32244 | - struct ftrace_event_field *field = NULL; | |
32245 | - unsigned long flags = 0; | |
32246 | - char *field_name; | |
32247 | - int ret = 0; | |
32248 | - | |
32249 | if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX)) | |
32250 | return -EINVAL; | |
32251 | ||
32252 | - field_name = strsep(&field_str, "."); | |
32253 | - if (field_str) { | |
32254 | - if (strcmp(field_str, "hex") == 0) | |
32255 | - flags |= HIST_FIELD_FL_HEX; | |
32256 | - else { | |
32257 | - ret = -EINVAL; | |
32258 | - goto out; | |
32259 | - } | |
32260 | - } | |
e4b2b4a8 | 32261 | + return __create_val_field(hist_data, val_idx, file, NULL, field_str, 0); |
1a6e0f06 | 32262 | +} |
b3bbd485 JK |
32263 | |
32264 | - field = trace_find_event_field(file->event_call, field_name); | |
32265 | - if (!field || !field->size) { | |
32266 | - ret = -EINVAL; | |
32267 | - goto out; | |
32268 | - } | |
e4b2b4a8 JK |
32269 | +static int create_var_field(struct hist_trigger_data *hist_data, |
32270 | + unsigned int val_idx, | |
32271 | + struct trace_event_file *file, | |
32272 | + char *var_name, char *expr_str) | |
1a6e0f06 | 32273 | +{ |
e4b2b4a8 | 32274 | + unsigned long flags = 0; |
b3bbd485 JK |
32275 | |
32276 | - hist_data->fields[val_idx] = create_hist_field(field, flags); | |
32277 | - if (!hist_data->fields[val_idx]) { | |
32278 | - ret = -ENOMEM; | |
32279 | - goto out; | |
e4b2b4a8 JK |
32280 | + if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) |
32281 | + return -EINVAL; | |
1a6e0f06 | 32282 | + |
e4b2b4a8 JK |
32283 | + if (find_var(hist_data, file, var_name) && !hist_data->remove) { |
32284 | + hist_err("Variable already defined: ", var_name); | |
32285 | + return -EINVAL; | |
b3bbd485 JK |
32286 | } |
32287 | ||
32288 | - ++hist_data->n_vals; | |
e4b2b4a8 JK |
32289 | + flags |= HIST_FIELD_FL_VAR; |
32290 | + hist_data->n_vars++; | |
32291 | + if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX)) | |
32292 | + return -EINVAL; | |
b3bbd485 JK |
32293 | |
32294 | - if (WARN_ON(hist_data->n_vals > TRACING_MAP_VALS_MAX)) | |
32295 | - ret = -EINVAL; | |
32296 | - out: | |
32297 | - return ret; | |
e4b2b4a8 | 32298 | + return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags); |
b3bbd485 JK |
32299 | } |
32300 | ||
e4b2b4a8 JK |
32301 | static int create_val_fields(struct hist_trigger_data *hist_data, |
32302 | struct trace_event_file *file) | |
32303 | { | |
32304 | char *fields_str, *field_str; | |
32305 | - unsigned int i, j; | |
32306 | + unsigned int i, j = 1; | |
32307 | int ret; | |
1a6e0f06 | 32308 | |
e4b2b4a8 | 32309 | ret = create_hitcount_val(hist_data); |
b3bbd485 | 32310 | @@ -493,12 +3912,15 @@ static int create_val_fields(struct hist_trigger_data *hist_data, |
e4b2b4a8 JK |
32311 | field_str = strsep(&fields_str, ","); |
32312 | if (!field_str) | |
32313 | break; | |
32314 | + | |
32315 | if (strcmp(field_str, "hitcount") == 0) | |
32316 | continue; | |
32317 | + | |
32318 | ret = create_val_field(hist_data, j++, file, field_str); | |
32319 | if (ret) | |
32320 | goto out; | |
1a6e0f06 | 32321 | } |
e4b2b4a8 JK |
32322 | + |
32323 | if (fields_str && (strcmp(fields_str, "hitcount") != 0)) | |
32324 | ret = -EINVAL; | |
32325 | out: | |
b3bbd485 | 32326 | @@ -511,12 +3933,13 @@ static int create_key_field(struct hist_trigger_data *hist_data, |
e4b2b4a8 JK |
32327 | struct trace_event_file *file, |
32328 | char *field_str) | |
32329 | { | |
32330 | - struct ftrace_event_field *field = NULL; | |
32331 | + struct hist_field *hist_field = NULL; | |
32332 | + | |
32333 | unsigned long flags = 0; | |
32334 | unsigned int key_size; | |
32335 | int ret = 0; | |
1a6e0f06 | 32336 | |
e4b2b4a8 JK |
32337 | - if (WARN_ON(key_idx >= TRACING_MAP_FIELDS_MAX)) |
32338 | + if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) | |
32339 | return -EINVAL; | |
1a6e0f06 | 32340 | |
e4b2b4a8 | 32341 | flags |= HIST_FIELD_FL_KEY; |
b3bbd485 | 32342 | @@ -524,57 +3947,40 @@ static int create_key_field(struct hist_trigger_data *hist_data, |
e4b2b4a8 JK |
32343 | if (strcmp(field_str, "stacktrace") == 0) { |
32344 | flags |= HIST_FIELD_FL_STACKTRACE; | |
32345 | key_size = sizeof(unsigned long) * HIST_STACKTRACE_DEPTH; | |
32346 | + hist_field = create_hist_field(hist_data, NULL, flags, NULL); | |
32347 | } else { | |
32348 | - char *field_name = strsep(&field_str, "."); | |
32349 | - | |
32350 | - if (field_str) { | |
32351 | - if (strcmp(field_str, "hex") == 0) | |
32352 | - flags |= HIST_FIELD_FL_HEX; | |
32353 | - else if (strcmp(field_str, "sym") == 0) | |
32354 | - flags |= HIST_FIELD_FL_SYM; | |
32355 | - else if (strcmp(field_str, "sym-offset") == 0) | |
32356 | - flags |= HIST_FIELD_FL_SYM_OFFSET; | |
32357 | - else if ((strcmp(field_str, "execname") == 0) && | |
32358 | - (strcmp(field_name, "common_pid") == 0)) | |
32359 | - flags |= HIST_FIELD_FL_EXECNAME; | |
32360 | - else if (strcmp(field_str, "syscall") == 0) | |
32361 | - flags |= HIST_FIELD_FL_SYSCALL; | |
32362 | - else if (strcmp(field_str, "log2") == 0) | |
32363 | - flags |= HIST_FIELD_FL_LOG2; | |
32364 | - else { | |
32365 | - ret = -EINVAL; | |
32366 | - goto out; | |
32367 | - } | |
32368 | + hist_field = parse_expr(hist_data, file, field_str, flags, | |
32369 | + NULL, 0); | |
32370 | + if (IS_ERR(hist_field)) { | |
32371 | + ret = PTR_ERR(hist_field); | |
32372 | + goto out; | |
32373 | } | |
1a6e0f06 | 32374 | |
e4b2b4a8 JK |
32375 | - field = trace_find_event_field(file->event_call, field_name); |
32376 | - if (!field || !field->size) { | |
32377 | + if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { | |
32378 | + hist_err("Using variable references as keys not supported: ", field_str); | |
32379 | + destroy_hist_field(hist_field, 0); | |
32380 | ret = -EINVAL; | |
32381 | goto out; | |
1a6e0f06 | 32382 | } |
e4b2b4a8 JK |
32383 | |
32384 | - if (is_string_field(field)) | |
32385 | - key_size = MAX_FILTER_STR_VAL; | |
32386 | - else | |
32387 | - key_size = field->size; | |
32388 | + key_size = hist_field->size; | |
1a6e0f06 | 32389 | } |
1a6e0f06 | 32390 | |
e4b2b4a8 JK |
32391 | - hist_data->fields[key_idx] = create_hist_field(field, flags); |
32392 | - if (!hist_data->fields[key_idx]) { | |
32393 | - ret = -ENOMEM; | |
32394 | - goto out; | |
32395 | - } | |
32396 | + hist_data->fields[key_idx] = hist_field; | |
32397 | ||
32398 | key_size = ALIGN(key_size, sizeof(u64)); | |
32399 | hist_data->fields[key_idx]->size = key_size; | |
32400 | hist_data->fields[key_idx]->offset = key_offset; | |
32401 | + | |
32402 | hist_data->key_size += key_size; | |
32403 | + | |
32404 | if (hist_data->key_size > HIST_KEY_SIZE_MAX) { | |
32405 | ret = -EINVAL; | |
32406 | goto out; | |
1a6e0f06 | 32407 | } |
1a6e0f06 | 32408 | |
e4b2b4a8 JK |
32409 | hist_data->n_keys++; |
32410 | + hist_data->n_fields++; | |
1a6e0f06 | 32411 | |
e4b2b4a8 JK |
32412 | if (WARN_ON(hist_data->n_keys > TRACING_MAP_KEYS_MAX)) |
32413 | return -EINVAL; | |
b3bbd485 | 32414 | @@ -618,21 +4024,113 @@ static int create_key_fields(struct hist_trigger_data *hist_data, |
e4b2b4a8 | 32415 | return ret; |
1a6e0f06 JK |
32416 | } |
32417 | ||
e4b2b4a8 JK |
32418 | +static int create_var_fields(struct hist_trigger_data *hist_data, |
32419 | + struct trace_event_file *file) | |
32420 | +{ | |
32421 | + unsigned int i, j = hist_data->n_vals; | |
32422 | + int ret = 0; | |
32423 | + | |
32424 | + unsigned int n_vars = hist_data->attrs->var_defs.n_vars; | |
32425 | + | |
32426 | + for (i = 0; i < n_vars; i++) { | |
32427 | + char *var_name = hist_data->attrs->var_defs.name[i]; | |
32428 | + char *expr = hist_data->attrs->var_defs.expr[i]; | |
32429 | + | |
32430 | + ret = create_var_field(hist_data, j++, file, var_name, expr); | |
32431 | + if (ret) | |
32432 | + goto out; | |
32433 | + } | |
32434 | + out: | |
32435 | + return ret; | |
32436 | +} | |
32437 | + | |
32438 | +static void free_var_defs(struct hist_trigger_data *hist_data) | |
32439 | +{ | |
32440 | + unsigned int i; | |
32441 | + | |
32442 | + for (i = 0; i < hist_data->attrs->var_defs.n_vars; i++) { | |
32443 | + kfree(hist_data->attrs->var_defs.name[i]); | |
32444 | + kfree(hist_data->attrs->var_defs.expr[i]); | |
32445 | + } | |
32446 | + | |
32447 | + hist_data->attrs->var_defs.n_vars = 0; | |
32448 | +} | |
32449 | + | |
32450 | +static int parse_var_defs(struct hist_trigger_data *hist_data) | |
32451 | +{ | |
32452 | + char *s, *str, *var_name, *field_str; | |
32453 | + unsigned int i, j, n_vars = 0; | |
32454 | + int ret = 0; | |
32455 | + | |
32456 | + for (i = 0; i < hist_data->attrs->n_assignments; i++) { | |
32457 | + str = hist_data->attrs->assignment_str[i]; | |
32458 | + for (j = 0; j < TRACING_MAP_VARS_MAX; j++) { | |
32459 | + field_str = strsep(&str, ","); | |
32460 | + if (!field_str) | |
32461 | + break; | |
32462 | + | |
32463 | + var_name = strsep(&field_str, "="); | |
32464 | + if (!var_name || !field_str) { | |
32465 | + hist_err("Malformed assignment: ", var_name); | |
32466 | + ret = -EINVAL; | |
32467 | + goto free; | |
32468 | + } | |
32469 | + | |
32470 | + if (n_vars == TRACING_MAP_VARS_MAX) { | |
32471 | + hist_err("Too many variables defined: ", var_name); | |
32472 | + ret = -EINVAL; | |
32473 | + goto free; | |
32474 | + } | |
32475 | + | |
32476 | + s = kstrdup(var_name, GFP_KERNEL); | |
32477 | + if (!s) { | |
32478 | + ret = -ENOMEM; | |
32479 | + goto free; | |
32480 | + } | |
32481 | + hist_data->attrs->var_defs.name[n_vars] = s; | |
32482 | + | |
32483 | + s = kstrdup(field_str, GFP_KERNEL); | |
32484 | + if (!s) { | |
32485 | + kfree(hist_data->attrs->var_defs.name[n_vars]); | |
32486 | + ret = -ENOMEM; | |
32487 | + goto free; | |
32488 | + } | |
32489 | + hist_data->attrs->var_defs.expr[n_vars++] = s; | |
32490 | + | |
32491 | + hist_data->attrs->var_defs.n_vars = n_vars; | |
32492 | + } | |
32493 | + } | |
32494 | + | |
32495 | + return ret; | |
32496 | + free: | |
32497 | + free_var_defs(hist_data); | |
32498 | + | |
32499 | + return ret; | |
32500 | +} | |
32501 | + | |
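parse_var_defs() above accepts comma-separated '<var>=<expr>' assignments inside a single attribute string. A minimal sketch of the syntax it parses:

    echo 'hist:keys=pid:ts0=common_timestamp.usecs' >> \
        /sys/kernel/debug/tracing/events/sched/sched_wakeup/trigger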
32502 | static int create_hist_fields(struct hist_trigger_data *hist_data, | |
32503 | struct trace_event_file *file) | |
32504 | { | |
32505 | int ret; | |
1a6e0f06 | 32506 | |
e4b2b4a8 JK |
32507 | + ret = parse_var_defs(hist_data); |
32508 | + if (ret) | |
32509 | + goto out; | |
32510 | + | |
32511 | ret = create_val_fields(hist_data, file); | |
32512 | if (ret) | |
32513 | goto out; | |
1a6e0f06 | 32514 | |
e4b2b4a8 JK |
32515 | - ret = create_key_fields(hist_data, file); |
32516 | + ret = create_var_fields(hist_data, file); | |
32517 | if (ret) | |
32518 | goto out; | |
1a6e0f06 | 32519 | |
e4b2b4a8 JK |
32520 | - hist_data->n_fields = hist_data->n_vals + hist_data->n_keys; |
32521 | + ret = create_key_fields(hist_data, file); | |
32522 | + if (ret) | |
32523 | + goto out; | |
32524 | out: | |
32525 | + free_var_defs(hist_data); | |
32526 | + | |
32527 | return ret; | |
1a6e0f06 JK |
32528 | } |
32529 | ||
b3bbd485 | 32530 | @@ -653,10 +4151,9 @@ static int is_descending(const char *str) |
e4b2b4a8 | 32531 | static int create_sort_keys(struct hist_trigger_data *hist_data) |
1a6e0f06 | 32532 | { |
e4b2b4a8 JK |
32533 | char *fields_str = hist_data->attrs->sort_key_str; |
32534 | - struct ftrace_event_field *field = NULL; | |
32535 | struct tracing_map_sort_key *sort_key; | |
32536 | int descending, ret = 0; | |
32537 | - unsigned int i, j; | |
32538 | + unsigned int i, j, k; | |
1a6e0f06 | 32539 | |
e4b2b4a8 | 32540 | hist_data->n_sort_keys = 1; /* we always have at least one, hitcount */ |
1a6e0f06 | 32541 | |
b3bbd485 | 32542 | @@ -670,7 +4167,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) |
e4b2b4a8 | 32543 | } |
1a6e0f06 | 32544 | |
e4b2b4a8 JK |
32545 | for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) { |
32546 | + struct hist_field *hist_field; | |
32547 | char *field_str, *field_name; | |
32548 | + const char *test_name; | |
1a6e0f06 | 32549 | |
e4b2b4a8 | 32550 | sort_key = &hist_data->sort_keys[i]; |
1a6e0f06 | 32551 | |
b3bbd485 | 32552 | @@ -702,10 +4201,19 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) |
e4b2b4a8 JK |
32553 | continue; |
32554 | } | |
1a6e0f06 | 32555 | |
e4b2b4a8 JK |
32556 | - for (j = 1; j < hist_data->n_fields; j++) { |
32557 | - field = hist_data->fields[j]->field; | |
32558 | - if (field && (strcmp(field_name, field->name) == 0)) { | |
32559 | - sort_key->field_idx = j; | |
32560 | + for (j = 1, k = 1; j < hist_data->n_fields; j++) { | |
32561 | + unsigned int idx; | |
1a6e0f06 | 32562 | + |
e4b2b4a8 JK |
32563 | + hist_field = hist_data->fields[j]; |
32564 | + if (hist_field->flags & HIST_FIELD_FL_VAR) | |
32565 | + continue; | |
1a6e0f06 | 32566 | + |
e4b2b4a8 | 32567 | + idx = k++; |
1a6e0f06 | 32568 | + |
e4b2b4a8 | 32569 | + test_name = hist_field_name(hist_field, 0); |
1a6e0f06 | 32570 | + |
e4b2b4a8 JK |
32571 | + if (strcmp(field_name, test_name) == 0) { |
32572 | + sort_key->field_idx = idx; | |
32573 | descending = is_descending(field_str); | |
32574 | if (descending < 0) { | |
32575 | ret = descending; | |
b3bbd485 | 32576 | @@ -720,16 +4228,230 @@ static int create_sort_keys(struct hist_trigger_data *hist_data) |
e4b2b4a8 JK |
32577 | break; |
32578 | } | |
32579 | } | |
b3bbd485 JK |
32580 | - hist_data->n_sort_keys = i; |
32581 | - out: | |
32582 | - return ret; | |
32583 | + | |
32584 | + hist_data->n_sort_keys = i; | |
32585 | + out: | |
32586 | + return ret; | |
32587 | +} | |
e4b2b4a8 | 32588 | + |
e4b2b4a8 JK |
32589 | +static void destroy_actions(struct hist_trigger_data *hist_data) |
32590 | +{ | |
32591 | + unsigned int i; | |
1a6e0f06 | 32592 | + |
e4b2b4a8 JK |
32593 | + for (i = 0; i < hist_data->n_actions; i++) { |
32594 | + struct action_data *data = hist_data->actions[i]; | |
1a6e0f06 | 32595 | + |
e4b2b4a8 JK |
32596 | + if (data->fn == action_trace) |
32597 | + onmatch_destroy(data); | |
32598 | + else if (data->fn == onmax_save) | |
32599 | + onmax_destroy(data); | |
32600 | + else | |
32601 | + kfree(data); | |
32602 | + } | |
32603 | +} | |
1a6e0f06 | 32604 | + |
e4b2b4a8 JK |
32605 | +static int parse_actions(struct hist_trigger_data *hist_data) |
32606 | +{ | |
32607 | + struct trace_array *tr = hist_data->event_file->tr; | |
32608 | + struct action_data *data; | |
32609 | + unsigned int i; | |
32610 | + int ret = 0; | |
32611 | + char *str; | |
1a6e0f06 | 32612 | + |
e4b2b4a8 JK |
32613 | + for (i = 0; i < hist_data->attrs->n_actions; i++) { |
32614 | + str = hist_data->attrs->action_str[i]; | |
1a6e0f06 | 32615 | + |
e4b2b4a8 JK |
32616 | + if (strncmp(str, "onmatch(", strlen("onmatch(")) == 0) { |
32617 | + char *action_str = str + strlen("onmatch("); | |
1a6e0f06 | 32618 | + |
e4b2b4a8 JK |
32619 | + data = onmatch_parse(tr, action_str); |
32620 | + if (IS_ERR(data)) { | |
32621 | + ret = PTR_ERR(data); | |
32622 | + break; | |
32623 | + } | |
32624 | + data->fn = action_trace; | |
32625 | + } else if (strncmp(str, "onmax(", strlen("onmax(")) == 0) { | |
32626 | + char *action_str = str + strlen("onmax("); | |
1a6e0f06 | 32627 | + |
e4b2b4a8 JK |
32628 | + data = onmax_parse(action_str); |
32629 | + if (IS_ERR(data)) { | |
32630 | + ret = PTR_ERR(data); | |
32631 | + break; | |
32632 | + } | |
32633 | + data->fn = onmax_save; | |
32634 | + } else { | |
32635 | + ret = -EINVAL; | |
32636 | + break; | |
32637 | + } | |
1a6e0f06 | 32638 | + |
e4b2b4a8 JK |
32639 | + hist_data->actions[hist_data->n_actions++] = data; |
32640 | + } | |
1a6e0f06 | 32641 | + |
e4b2b4a8 JK |
32642 | + return ret; |
32643 | +} | |
1a6e0f06 | 32644 | + |
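parse_actions() above dispatches on the literal "onmatch(" and "onmax(" prefixes and hands the rest of the string to the matching parser. The same prefix dispatch as a standalone sketch; the handler names are printed stand-ins for onmatch_parse()/onmax_parse():

    #include <stdio.h>
    #include <string.h>

    static const struct {
            const char *prefix;
            const char *handler;    /* stand-in for the parse function */
    } handlers[] = {
            { "onmatch(", "onmatch_parse" },
            { "onmax(",   "onmax_parse"   },
    };

    /* Match an action prefix and pass along what follows it. */
    static int parse_action(const char *str)
    {
            size_t i, len;

            for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
                    len = strlen(handlers[i].prefix);
                    if (strncmp(str, handlers[i].prefix, len) == 0) {
                            printf("%s(\"%s\")\n", handlers[i].handler,
                                   str + len);
                            return 0;
                    }
            }
            return -1;    /* the kernel returns -EINVAL here */
    }

    int main(void)
    {
            return parse_action("onmax($wakeup_lat).save(next_comm,next_pid)");
    }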
e4b2b4a8 JK |
32645 | +static int create_actions(struct hist_trigger_data *hist_data, |
32646 | + struct trace_event_file *file) | |
32647 | +{ | |
32648 | + struct action_data *data; | |
32649 | + unsigned int i; | |
32650 | + int ret = 0; | |
1a6e0f06 | 32651 | + |
e4b2b4a8 JK |
32652 | + for (i = 0; i < hist_data->attrs->n_actions; i++) { |
32653 | + data = hist_data->actions[i]; | |
1a6e0f06 | 32654 | + |
e4b2b4a8 JK |
32655 | + if (data->fn == action_trace) { |
32656 | + ret = onmatch_create(hist_data, file, data); | |
32657 | + if (ret) | |
32658 | + return ret; | |
32659 | + } else if (data->fn == onmax_save) { | |
32660 | + ret = onmax_create(hist_data, data); | |
32661 | + if (ret) | |
32662 | + return ret; | |
32663 | + } | |
32664 | + } | |
1a6e0f06 | 32665 | + |
e4b2b4a8 JK |
32666 | + return ret; |
32667 | +} | |
1a6e0f06 | 32668 | + |
e4b2b4a8 JK |
32669 | +static void print_actions(struct seq_file *m, |
32670 | + struct hist_trigger_data *hist_data, | |
32671 | + struct tracing_map_elt *elt) | |
32672 | +{ | |
32673 | + unsigned int i; | |
1a6e0f06 | 32674 | + |
e4b2b4a8 JK |
32675 | + for (i = 0; i < hist_data->n_actions; i++) { |
32676 | + struct action_data *data = hist_data->actions[i]; | |
1a6e0f06 | 32677 | + |
e4b2b4a8 JK |
32678 | + if (data->fn == onmax_save) |
32679 | + onmax_print(m, hist_data, elt, data); | |
32680 | + } | |
32681 | +} | |
1a6e0f06 | 32682 | + |
e4b2b4a8 JK |
32683 | +static void print_onmax_spec(struct seq_file *m, |
32684 | + struct hist_trigger_data *hist_data, | |
32685 | + struct action_data *data) | |
32686 | +{ | |
32687 | + unsigned int i; | |
1a6e0f06 | 32688 | + |
e4b2b4a8 JK |
32689 | + seq_puts(m, ":onmax("); |
32690 | + seq_printf(m, "%s", data->onmax.var_str); | |
32691 | + seq_printf(m, ").%s(", data->onmax.fn_name); | |
1a6e0f06 | 32692 | + |
e4b2b4a8 JK |
32693 | + for (i = 0; i < hist_data->n_max_vars; i++) { |
32694 | + seq_printf(m, "%s", hist_data->max_vars[i]->var->var.name); | |
32695 | + if (i < hist_data->n_max_vars - 1) | |
32696 | + seq_puts(m, ","); | |
32697 | + } | |
32698 | + seq_puts(m, ")"); | |
32699 | +} | |
1a6e0f06 | 32700 | + |
e4b2b4a8 JK |
32701 | +static void print_onmatch_spec(struct seq_file *m, |
32702 | + struct hist_trigger_data *hist_data, | |
32703 | + struct action_data *data) | |
32704 | +{ | |
32705 | + unsigned int i; | |
1a6e0f06 | 32706 | + |
e4b2b4a8 JK |
32707 | + seq_printf(m, ":onmatch(%s.%s).", data->onmatch.match_event_system, |
32708 | + data->onmatch.match_event); | |
1a6e0f06 | 32709 | + |
e4b2b4a8 | 32710 | + seq_printf(m, "%s(", data->onmatch.synth_event->name); |
1a6e0f06 | 32711 | + |
e4b2b4a8 JK |
32712 | + for (i = 0; i < data->n_params; i++) { |
32713 | + if (i) | |
32714 | + seq_puts(m, ","); | |
32715 | + seq_printf(m, "%s", data->params[i]); | |
32716 | + } | |
1a6e0f06 | 32717 | + |
e4b2b4a8 JK |
32718 | + seq_puts(m, ")"); |
32719 | +} | |
1a6e0f06 | 32720 | + |
e4b2b4a8 JK |
32721 | +static bool actions_match(struct hist_trigger_data *hist_data, |
32722 | + struct hist_trigger_data *hist_data_test) | |
1a6e0f06 | 32723 | +{ |
e4b2b4a8 | 32724 | + unsigned int i, j; |
1a6e0f06 | 32725 | + |
e4b2b4a8 JK |
32726 | + if (hist_data->n_actions != hist_data_test->n_actions) |
32727 | + return false; | |
1a6e0f06 | 32728 | + |
e4b2b4a8 JK |
32729 | + for (i = 0; i < hist_data->n_actions; i++) { |
32730 | + struct action_data *data = hist_data->actions[i]; | |
32731 | + struct action_data *data_test = hist_data_test->actions[i]; | |
1a6e0f06 | 32732 | + |
e4b2b4a8 JK |
32733 | + if (data->fn != data_test->fn) |
32734 | + return false; | |
1a6e0f06 | 32735 | + |
e4b2b4a8 JK |
32736 | + if (data->n_params != data_test->n_params) |
32737 | + return false; | |
1a6e0f06 | 32738 | + |
e4b2b4a8 JK |
32739 | + for (j = 0; j < data->n_params; j++) { |
32740 | + if (strcmp(data->params[j], data_test->params[j]) != 0) | |
32741 | + return false; | |
32742 | + } | |
1a6e0f06 | 32743 | + |
e4b2b4a8 JK |
32744 | + if (data->fn == action_trace) { |
32745 | + if (strcmp(data->onmatch.synth_event_name, | |
32746 | + data_test->onmatch.synth_event_name) != 0) | |
32747 | + return false; | |
32748 | + if (strcmp(data->onmatch.match_event_system, | |
32749 | + data_test->onmatch.match_event_system) != 0) | |
32750 | + return false; | |
32751 | + if (strcmp(data->onmatch.match_event, | |
32752 | + data_test->onmatch.match_event) != 0) | |
32753 | + return false; | |
32754 | + } else if (data->fn == onmax_save) { | |
32755 | + if (strcmp(data->onmax.var_str, | |
32756 | + data_test->onmax.var_str) != 0) | |
32757 | + return false; | |
32758 | + if (strcmp(data->onmax.fn_name, | |
32759 | + data_test->onmax.fn_name) != 0) | |
32760 | + return false; | |
1a6e0f06 | 32761 | + } |
1a6e0f06 | 32762 | + } |
1a6e0f06 | 32763 | + |
e4b2b4a8 JK |
32764 | + return true; |
32765 | +} | |
1a6e0f06 | 32766 | + |
1a6e0f06 | 32767 | + |
e4b2b4a8 JK |
32768 | +static void print_actions_spec(struct seq_file *m, |
32769 | + struct hist_trigger_data *hist_data) | |
32770 | +{ | |
32771 | + unsigned int i; | |
1a6e0f06 | 32772 | + |
e4b2b4a8 JK |
32773 | + for (i = 0; i < hist_data->n_actions; i++) { |
32774 | + struct action_data *data = hist_data->actions[i]; | |
1a6e0f06 | 32775 | + |
e4b2b4a8 JK |
32776 | + if (data->fn == action_trace) |
32777 | + print_onmatch_spec(m, hist_data, data); | |
32778 | + else if (data->fn == onmax_save) | |
32779 | + print_onmax_spec(m, hist_data, data); | |
1a6e0f06 | 32780 | + } |
1a6e0f06 JK |
32781 | +} |
32782 | + | |
e4b2b4a8 | 32783 | +static void destroy_field_var_hists(struct hist_trigger_data *hist_data) |
1a6e0f06 | 32784 | +{ |
e4b2b4a8 | 32785 | + unsigned int i; |
1a6e0f06 | 32786 | + |
e4b2b4a8 JK |
32787 | + for (i = 0; i < hist_data->n_field_var_hists; i++) { |
32788 | + kfree(hist_data->field_var_hists[i]->cmd); | |
32789 | + kfree(hist_data->field_var_hists[i]); | |
1a6e0f06 | 32790 | + } |
b3bbd485 JK |
32791 | } |
32792 | ||
e4b2b4a8 JK |
32793 | static void destroy_hist_data(struct hist_trigger_data *hist_data) |
32794 | { | |
32795 | + if (!hist_data) | |
32796 | + return; | |
1a6e0f06 | 32797 | + |
e4b2b4a8 JK |
32798 | destroy_hist_trigger_attrs(hist_data->attrs); |
32799 | destroy_hist_fields(hist_data); | |
32800 | tracing_map_destroy(hist_data->map); | |
1a6e0f06 | 32801 | + |
e4b2b4a8 JK |
32802 | + destroy_actions(hist_data); |
32803 | + destroy_field_vars(hist_data); | |
32804 | + destroy_field_var_hists(hist_data); | |
32805 | + destroy_synth_var_refs(hist_data); | |
32806 | + | |
32807 | kfree(hist_data); | |
32808 | } | |
32809 | ||
b3bbd485 | 32810 | @@ -738,7 +4460,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) |
e4b2b4a8 JK |
32811 | struct tracing_map *map = hist_data->map; |
32812 | struct ftrace_event_field *field; | |
32813 | struct hist_field *hist_field; | |
32814 | - int i, idx; | |
32815 | + int i, idx = 0; | |
32816 | ||
32817 | for_each_hist_field(i, hist_data) { | |
32818 | hist_field = hist_data->fields[i]; | |
b3bbd485 | 32819 | @@ -749,6 +4471,9 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) |
e4b2b4a8 JK |
32820 | |
32821 | if (hist_field->flags & HIST_FIELD_FL_STACKTRACE) | |
32822 | cmp_fn = tracing_map_cmp_none; | |
32823 | + else if (!field) | |
32824 | + cmp_fn = tracing_map_cmp_num(hist_field->size, | |
32825 | + hist_field->is_signed); | |
32826 | else if (is_string_field(field)) | |
32827 | cmp_fn = tracing_map_cmp_string; | |
32828 | else | |
b3bbd485 | 32829 | @@ -757,36 +4482,29 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data) |
e4b2b4a8 JK |
32830 | idx = tracing_map_add_key_field(map, |
32831 | hist_field->offset, | |
32832 | cmp_fn); | |
32833 | - | |
32834 | - } else | |
32835 | + } else if (!(hist_field->flags & HIST_FIELD_FL_VAR)) | |
32836 | idx = tracing_map_add_sum_field(map); | |
32837 | ||
32838 | if (idx < 0) | |
32839 | return idx; | |
32840 | - } | |
32841 | - | |
32842 | - return 0; | |
32843 | -} | |
32844 | - | |
32845 | -static bool need_tracing_map_ops(struct hist_trigger_data *hist_data) | |
32846 | -{ | |
32847 | - struct hist_field *key_field; | |
32848 | - unsigned int i; | |
32849 | - | |
32850 | - for_each_hist_key_field(i, hist_data) { | |
32851 | - key_field = hist_data->fields[i]; | |
32852 | ||
32853 | - if (key_field->flags & HIST_FIELD_FL_EXECNAME) | |
32854 | - return true; | |
32855 | + if (hist_field->flags & HIST_FIELD_FL_VAR) { | |
32856 | + idx = tracing_map_add_var(map); | |
32857 | + if (idx < 0) | |
32858 | + return idx; | |
32859 | + hist_field->var.idx = idx; | |
32860 | + hist_field->var.hist_data = hist_data; | |
32861 | + } | |
32862 | } | |
32863 | ||
32864 | - return false; | |
1a6e0f06 | 32865 | + return 0; |
e4b2b4a8 JK |
32866 | } |
32867 | ||
32868 | static struct hist_trigger_data * | |
32869 | create_hist_data(unsigned int map_bits, | |
32870 | struct hist_trigger_attrs *attrs, | |
32871 | - struct trace_event_file *file) | |
32872 | + struct trace_event_file *file, | |
32873 | + bool remove) | |
32874 | { | |
32875 | const struct tracing_map_ops *map_ops = NULL; | |
32876 | struct hist_trigger_data *hist_data; | |
b3bbd485 | 32877 | @@ -797,6 +4515,12 @@ create_hist_data(unsigned int map_bits, |
e4b2b4a8 JK |
32878 | return ERR_PTR(-ENOMEM); |
32879 | ||
32880 | hist_data->attrs = attrs; | |
32881 | + hist_data->remove = remove; | |
32882 | + hist_data->event_file = file; | |
1a6e0f06 | 32883 | + |
e4b2b4a8 JK |
32884 | + ret = parse_actions(hist_data); |
32885 | + if (ret) | |
32886 | + goto free; | |
32887 | ||
32888 | ret = create_hist_fields(hist_data, file); | |
32889 | if (ret) | |
b3bbd485 | 32890 | @@ -806,8 +4530,7 @@ create_hist_data(unsigned int map_bits, |
e4b2b4a8 JK |
32891 | if (ret) |
32892 | goto free; | |
32893 | ||
32894 | - if (need_tracing_map_ops(hist_data)) | |
32895 | - map_ops = &hist_trigger_elt_comm_ops; | |
32896 | + map_ops = &hist_trigger_elt_data_ops; | |
32897 | ||
32898 | hist_data->map = tracing_map_create(map_bits, hist_data->key_size, | |
32899 | map_ops, hist_data); | |
b3bbd485 | 32900 | @@ -820,12 +4543,6 @@ create_hist_data(unsigned int map_bits, |
e4b2b4a8 JK |
32901 | ret = create_tracing_map_fields(hist_data); |
32902 | if (ret) | |
32903 | goto free; | |
32904 | - | |
32905 | - ret = tracing_map_init(hist_data->map); | |
32906 | - if (ret) | |
32907 | - goto free; | |
32908 | - | |
32909 | - hist_data->event_file = file; | |
32910 | out: | |
32911 | return hist_data; | |
32912 | free: | |
b3bbd485 | 32913 | @@ -839,18 +4556,39 @@ create_hist_data(unsigned int map_bits, |
e4b2b4a8 JK |
32914 | } |
32915 | ||
32916 | static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, | |
32917 | - struct tracing_map_elt *elt, | |
32918 | - void *rec) | |
32919 | + struct tracing_map_elt *elt, void *rec, | |
32920 | + struct ring_buffer_event *rbe, | |
32921 | + u64 *var_ref_vals) | |
32922 | { | |
32923 | + struct hist_elt_data *elt_data; | |
32924 | struct hist_field *hist_field; | |
32925 | - unsigned int i; | |
32926 | + unsigned int i, var_idx; | |
32927 | u64 hist_val; | |
32928 | ||
32929 | + elt_data = elt->private_data; | |
32930 | + elt_data->var_ref_vals = var_ref_vals; | |
32931 | + | |
32932 | for_each_hist_val_field(i, hist_data) { | |
32933 | hist_field = hist_data->fields[i]; | |
32934 | - hist_val = hist_field->fn(hist_field, rec); | |
32935 | + hist_val = hist_field->fn(hist_field, elt, rbe, rec); | |
32936 | + if (hist_field->flags & HIST_FIELD_FL_VAR) { | |
32937 | + var_idx = hist_field->var.idx; | |
32938 | + tracing_map_set_var(elt, var_idx, hist_val); | |
32939 | + continue; | |
32940 | + } | |
32941 | tracing_map_update_sum(elt, i, hist_val); | |
32942 | } | |
32943 | + | |
32944 | + for_each_hist_key_field(i, hist_data) { | |
32945 | + hist_field = hist_data->fields[i]; | |
32946 | + if (hist_field->flags & HIST_FIELD_FL_VAR) { | |
32947 | + hist_val = hist_field->fn(hist_field, elt, rbe, rec); | |
32948 | + var_idx = hist_field->var.idx; | |
32949 | + tracing_map_set_var(elt, var_idx, hist_val); | |
32950 | + } | |
32951 | + } | |
1a6e0f06 | 32952 | + |
e4b2b4a8 JK |
32953 | + update_field_vars(hist_data, elt, rbe, rec); |
32954 | } | |
32955 | ||
32956 | static inline void add_to_key(char *compound_key, void *key, | |
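hist_trigger_elt_update() above now routes values by flag: plain value fields still accumulate through tracing_map_update_sum(), while fields marked HIST_FIELD_FL_VAR are written into per-element variable slots with tracing_map_set_var(), where the last writer wins. A toy model of that split, with illustrative types and no locking:

    #include <stdint.h>
    #include <stdio.h>

    #define SLOTS_MAX 8

    struct elt {
            uint64_t sums[SLOTS_MAX];    /* tracing_map_update_sum() */
            uint64_t vars[SLOTS_MAX];    /* tracing_map_set_var() */
    };

    static void update_sum(struct elt *e, int idx, uint64_t v)
    {
            e->sums[idx] += v;           /* accumulates across events */
    }

    static void set_var(struct elt *e, int idx, uint64_t v)
    {
            e->vars[idx] = v;            /* overwritten on each event */
    }

    int main(void)
    {
            struct elt e = { { 0 } };

            update_sum(&e, 0, 10);
            update_sum(&e, 0, 32);       /* sum is now 42 */
            set_var(&e, 0, 10);
            set_var(&e, 0, 32);          /* var is now 32 */
            printf("sum=%llu var=%llu\n",
                   (unsigned long long)e.sums[0],
                   (unsigned long long)e.vars[0]);
            return 0;
    }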
b3bbd485 | 32957 | @@ -877,15 +4615,31 @@ static inline void add_to_key(char *compound_key, void *key, |
e4b2b4a8 JK |
32958 | memcpy(compound_key + key_field->offset, key, size); |
32959 | } | |
32960 | ||
32961 | -static void event_hist_trigger(struct event_trigger_data *data, void *rec) | |
32962 | +static void | |
32963 | +hist_trigger_actions(struct hist_trigger_data *hist_data, | |
32964 | + struct tracing_map_elt *elt, void *rec, | |
32965 | + struct ring_buffer_event *rbe, u64 *var_ref_vals) | |
1a6e0f06 | 32966 | +{ |
e4b2b4a8 JK |
32967 | + struct action_data *data; |
32968 | + unsigned int i; | |
1a6e0f06 | 32969 | + |
e4b2b4a8 JK |
32970 | + for (i = 0; i < hist_data->n_actions; i++) { |
32971 | + data = hist_data->actions[i]; | |
32972 | + data->fn(hist_data, elt, rec, rbe, data, var_ref_vals); | |
1a6e0f06 | 32973 | + } |
1a6e0f06 JK |
32974 | +} |
32975 | + | |
e4b2b4a8 JK |
32976 | +static void event_hist_trigger(struct event_trigger_data *data, void *rec, |
32977 | + struct ring_buffer_event *rbe) | |
32978 | { | |
32979 | struct hist_trigger_data *hist_data = data->private_data; | |
32980 | bool use_compound_key = (hist_data->n_keys > 1); | |
32981 | unsigned long entries[HIST_STACKTRACE_DEPTH]; | |
32982 | + u64 var_ref_vals[TRACING_MAP_VARS_MAX]; | |
32983 | char compound_key[HIST_KEY_SIZE_MAX]; | |
32984 | + struct tracing_map_elt *elt = NULL; | |
32985 | struct stack_trace stacktrace; | |
32986 | struct hist_field *key_field; | |
32987 | - struct tracing_map_elt *elt; | |
32988 | u64 field_contents; | |
32989 | void *key = NULL; | |
32990 | unsigned int i; | |
b3bbd485 | 32991 | @@ -906,7 +4660,7 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
32992 | |
32993 | key = entries; | |
32994 | } else { | |
32995 | - field_contents = key_field->fn(key_field, rec); | |
32996 | + field_contents = key_field->fn(key_field, elt, rbe, rec); | |
32997 | if (key_field->flags & HIST_FIELD_FL_STRING) { | |
32998 | key = (void *)(unsigned long)field_contents; | |
32999 | use_compound_key = true; | |
b3bbd485 | 33000 | @@ -921,9 +4675,18 @@ static void event_hist_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33001 | if (use_compound_key) |
33002 | key = compound_key; | |
33003 | ||
33004 | + if (hist_data->n_var_refs && | |
33005 | + !resolve_var_refs(hist_data, key, var_ref_vals, false)) | |
33006 | + return; | |
1a6e0f06 | 33007 | + |
e4b2b4a8 JK |
33008 | elt = tracing_map_insert(hist_data->map, key); |
33009 | - if (elt) | |
33010 | - hist_trigger_elt_update(hist_data, elt, rec); | |
33011 | + if (!elt) | |
33012 | + return; | |
1a6e0f06 | 33013 | + |
e4b2b4a8 JK |
33014 | + hist_trigger_elt_update(hist_data, elt, rec, rbe, var_ref_vals); |
33015 | + | |
33016 | + if (resolve_var_refs(hist_data, key, var_ref_vals, true)) | |
33017 | + hist_trigger_actions(hist_data, elt, rec, rbe, var_ref_vals); | |
33018 | } | |
33019 | ||
33020 | static void hist_trigger_stacktrace_print(struct seq_file *m, | |
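The insertion path above is now two-phase: variable references are resolved once, without committing, before tracing_map_insert(), so an event whose referenced variables aren't set yet is dropped early, and only a successful resolution after the element update lets the attached actions run. A self-contained sketch of that flow; resolve_refs() and run_actions() are stand-ins, not the kernel's functions:

    #include <stdint.h>
    #include <stdio.h>

    #define N_VARS 4

    static uint64_t var_store[N_VARS];
    static int var_set[N_VARS];

    /* Succeeds only if every referenced variable has been set. */
    static int resolve_refs(const int *refs, int n, uint64_t *out)
    {
            int i;

            for (i = 0; i < n; i++) {
                    if (!var_set[refs[i]])
                            return 0;
                    out[i] = var_store[refs[i]];
            }
            return 1;
    }

    static void run_actions(const uint64_t *vals, int n)
    {
            int i;

            for (i = 0; i < n; i++)
                    printf("action sees $var%d = %llu\n", i,
                           (unsigned long long)vals[i]);
    }

    int main(void)
    {
            int refs[] = { 0 };
            uint64_t vals[1];

            /* First event: $var0 is unset, so the event is dropped. */
            if (!resolve_refs(refs, 1, vals))
                    puts("refs unresolved: skip event");

            /* Another trigger sets the variable; the next event runs. */
            var_store[0] = 42;
            var_set[0] = 1;
            if (resolve_refs(refs, 1, vals))
                    run_actions(vals, 1);
            return 0;
    }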
b3bbd485 | 33021 | @@ -952,6 +4715,7 @@ hist_trigger_entry_print(struct seq_file *m, |
e4b2b4a8 JK |
33022 | struct hist_field *key_field; |
33023 | char str[KSYM_SYMBOL_LEN]; | |
33024 | bool multiline = false; | |
33025 | + const char *field_name; | |
33026 | unsigned int i; | |
33027 | u64 uval; | |
33028 | ||
b3bbd485 | 33029 | @@ -963,26 +4727,33 @@ hist_trigger_entry_print(struct seq_file *m, |
e4b2b4a8 JK |
33030 | if (i > hist_data->n_vals) |
33031 | seq_puts(m, ", "); | |
33032 | ||
33033 | + field_name = hist_field_name(key_field, 0); | |
33034 | + | |
33035 | if (key_field->flags & HIST_FIELD_FL_HEX) { | |
33036 | uval = *(u64 *)(key + key_field->offset); | |
33037 | - seq_printf(m, "%s: %llx", | |
33038 | - key_field->field->name, uval); | |
33039 | + seq_printf(m, "%s: %llx", field_name, uval); | |
33040 | } else if (key_field->flags & HIST_FIELD_FL_SYM) { | |
33041 | uval = *(u64 *)(key + key_field->offset); | |
33042 | sprint_symbol_no_offset(str, uval); | |
33043 | - seq_printf(m, "%s: [%llx] %-45s", | |
33044 | - key_field->field->name, uval, str); | |
33045 | + seq_printf(m, "%s: [%llx] %-45s", field_name, | |
33046 | + uval, str); | |
33047 | } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { | |
33048 | uval = *(u64 *)(key + key_field->offset); | |
33049 | sprint_symbol(str, uval); | |
33050 | - seq_printf(m, "%s: [%llx] %-55s", | |
33051 | - key_field->field->name, uval, str); | |
33052 | + seq_printf(m, "%s: [%llx] %-55s", field_name, | |
33053 | + uval, str); | |
33054 | } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { | |
33055 | - char *comm = elt->private_data; | |
33056 | + struct hist_elt_data *elt_data = elt->private_data; | |
33057 | + char *comm; | |
33058 | + | |
33059 | + if (WARN_ON_ONCE(!elt_data)) | |
33060 | + return; | |
1a6e0f06 | 33061 | + |
e4b2b4a8 JK |
33062 | + comm = elt_data->comm; |
33063 | ||
33064 | uval = *(u64 *)(key + key_field->offset); | |
33065 | - seq_printf(m, "%s: %-16s[%10llu]", | |
33066 | - key_field->field->name, comm, uval); | |
33067 | + seq_printf(m, "%s: %-16s[%10llu]", field_name, | |
33068 | + comm, uval); | |
33069 | } else if (key_field->flags & HIST_FIELD_FL_SYSCALL) { | |
33070 | const char *syscall_name; | |
33071 | ||
b3bbd485 | 33072 | @@ -991,8 +4762,8 @@ hist_trigger_entry_print(struct seq_file *m, |
e4b2b4a8 JK |
33073 | if (!syscall_name) |
33074 | syscall_name = "unknown_syscall"; | |
33075 | ||
33076 | - seq_printf(m, "%s: %-30s[%3llu]", | |
33077 | - key_field->field->name, syscall_name, uval); | |
33078 | + seq_printf(m, "%s: %-30s[%3llu]", field_name, | |
33079 | + syscall_name, uval); | |
33080 | } else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) { | |
33081 | seq_puts(m, "stacktrace:\n"); | |
33082 | hist_trigger_stacktrace_print(m, | |
b3bbd485 | 33083 | @@ -1000,15 +4771,14 @@ hist_trigger_entry_print(struct seq_file *m, |
e4b2b4a8 JK |
33084 | HIST_STACKTRACE_DEPTH); |
33085 | multiline = true; | |
33086 | } else if (key_field->flags & HIST_FIELD_FL_LOG2) { | |
33087 | - seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name, | |
33088 | + seq_printf(m, "%s: ~ 2^%-2llu", field_name, | |
33089 | *(u64 *)(key + key_field->offset)); | |
33090 | } else if (key_field->flags & HIST_FIELD_FL_STRING) { | |
33091 | - seq_printf(m, "%s: %-50s", key_field->field->name, | |
33092 | + seq_printf(m, "%s: %-50s", field_name, | |
33093 | (char *)(key + key_field->offset)); | |
33094 | } else { | |
33095 | uval = *(u64 *)(key + key_field->offset); | |
33096 | - seq_printf(m, "%s: %10llu", key_field->field->name, | |
33097 | - uval); | |
33098 | + seq_printf(m, "%s: %10llu", field_name, uval); | |
33099 | } | |
33100 | } | |
33101 | ||
b3bbd485 | 33102 | @@ -1021,17 +4791,23 @@ hist_trigger_entry_print(struct seq_file *m, |
e4b2b4a8 JK |
33103 | tracing_map_read_sum(elt, HITCOUNT_IDX)); |
33104 | ||
33105 | for (i = 1; i < hist_data->n_vals; i++) { | |
33106 | + field_name = hist_field_name(hist_data->fields[i], 0); | |
1a6e0f06 | 33107 | + |
e4b2b4a8 JK |
33108 | + if (hist_data->fields[i]->flags & HIST_FIELD_FL_VAR || |
33109 | + hist_data->fields[i]->flags & HIST_FIELD_FL_EXPR) | |
33110 | + continue; | |
1a6e0f06 | 33111 | + |
e4b2b4a8 JK |
33112 | if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) { |
33113 | - seq_printf(m, " %s: %10llx", | |
33114 | - hist_data->fields[i]->field->name, | |
33115 | + seq_printf(m, " %s: %10llx", field_name, | |
33116 | tracing_map_read_sum(elt, i)); | |
33117 | } else { | |
33118 | - seq_printf(m, " %s: %10llu", | |
33119 | - hist_data->fields[i]->field->name, | |
33120 | + seq_printf(m, " %s: %10llu", field_name, | |
33121 | tracing_map_read_sum(elt, i)); | |
33122 | } | |
33123 | } | |
33124 | ||
33125 | + print_actions(m, hist_data, elt); | |
1a6e0f06 | 33126 | + |
e4b2b4a8 JK |
33127 | seq_puts(m, "\n"); |
33128 | } | |
33129 | ||
b3bbd485 | 33130 | @@ -1102,6 +4878,11 @@ static int hist_show(struct seq_file *m, void *v) |
e4b2b4a8 JK |
33131 | hist_trigger_show(m, data, n++); |
33132 | } | |
33133 | ||
33134 | + if (have_hist_err()) { | |
33135 | + seq_printf(m, "\nERROR: %s\n", hist_err_str); | |
33136 | + seq_printf(m, " Last command: %s\n", last_hist_cmd); | |
33137 | + } | |
1a6e0f06 | 33138 | + |
e4b2b4a8 JK |
33139 | out_unlock: |
33140 | mutex_unlock(&event_mutex); | |
33141 | ||
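hist_show() above appends the most recent error and the command that caused it to the hist file output. A toy model of the hist_err()/hist_err_clear() buffers it reads; the buffer sizes, the sample message and the keep-first-error policy are assumptions, not lifted from the patch:

    #include <stdio.h>
    #include <string.h>

    static char hist_err_str[256];
    static char last_hist_cmd[256];

    static void hist_err(const char *str, const char *var)
    {
            if (hist_err_str[0])    /* keep the first recorded error */
                    return;
            snprintf(hist_err_str, sizeof(hist_err_str), "%s%s",
                     str, var ? var : "");
    }

    static void hist_err_clear(void)
    {
            hist_err_str[0] = '\0';
    }

    int main(void)
    {
            snprintf(last_hist_cmd, sizeof(last_hist_cmd),
                     "hist:keys=pid:bogus=1");
            hist_err("Couldn't parse field or variable: ", "bogus");
            printf("\nERROR: %s\n  Last command: %s\n",
                   hist_err_str, last_hist_cmd);
            hist_err_clear();
            return 0;
    }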
b3bbd485 | 33142 | @@ -1120,34 +4901,31 @@ const struct file_operations event_hist_fops = { |
e4b2b4a8 JK |
33143 | .release = single_release, |
33144 | }; | |
33145 | ||
33146 | -static const char *get_hist_field_flags(struct hist_field *hist_field) | |
33147 | +static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) | |
33148 | { | |
33149 | - const char *flags_str = NULL; | |
33150 | + const char *field_name = hist_field_name(hist_field, 0); | |
33151 | ||
33152 | - if (hist_field->flags & HIST_FIELD_FL_HEX) | |
33153 | - flags_str = "hex"; | |
33154 | - else if (hist_field->flags & HIST_FIELD_FL_SYM) | |
33155 | - flags_str = "sym"; | |
33156 | - else if (hist_field->flags & HIST_FIELD_FL_SYM_OFFSET) | |
33157 | - flags_str = "sym-offset"; | |
33158 | - else if (hist_field->flags & HIST_FIELD_FL_EXECNAME) | |
33159 | - flags_str = "execname"; | |
33160 | - else if (hist_field->flags & HIST_FIELD_FL_SYSCALL) | |
33161 | - flags_str = "syscall"; | |
33162 | - else if (hist_field->flags & HIST_FIELD_FL_LOG2) | |
33163 | - flags_str = "log2"; | |
33164 | + if (hist_field->var.name) | |
33165 | + seq_printf(m, "%s=", hist_field->var.name); | |
33166 | ||
33167 | - return flags_str; | |
33168 | -} | |
33169 | + if (hist_field->flags & HIST_FIELD_FL_CPU) | |
33170 | + seq_puts(m, "cpu"); | |
33171 | + else if (field_name) { | |
33172 | + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || | |
33173 | + hist_field->flags & HIST_FIELD_FL_ALIAS) | |
33174 | + seq_putc(m, '$'); | |
33175 | + seq_printf(m, "%s", field_name); | |
33176 | + } else if (hist_field->flags & HIST_FIELD_FL_TIMESTAMP) | |
33177 | + seq_puts(m, "common_timestamp"); | |
33178 | ||
33179 | -static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) | |
33180 | -{ | |
33181 | - seq_printf(m, "%s", hist_field->field->name); | |
33182 | if (hist_field->flags) { | |
33183 | - const char *flags_str = get_hist_field_flags(hist_field); | |
33184 | + if (!(hist_field->flags & HIST_FIELD_FL_VAR_REF) && | |
33185 | + !(hist_field->flags & HIST_FIELD_FL_EXPR)) { | |
33186 | + const char *flags = get_hist_field_flags(hist_field); | |
33187 | ||
33188 | - if (flags_str) | |
33189 | - seq_printf(m, ".%s", flags_str); | |
33190 | + if (flags) | |
33191 | + seq_printf(m, ".%s", flags); | |
33192 | + } | |
33193 | } | |
33194 | } | |
33195 | ||
b3bbd485 | 33196 | @@ -1156,7 +4934,8 @@ static int event_hist_trigger_print(struct seq_file *m, |
e4b2b4a8 JK |
33197 | struct event_trigger_data *data) |
33198 | { | |
33199 | struct hist_trigger_data *hist_data = data->private_data; | |
33200 | - struct hist_field *key_field; | |
33201 | + struct hist_field *field; | |
33202 | + bool have_var = false; | |
33203 | unsigned int i; | |
33204 | ||
33205 | seq_puts(m, "hist:"); | |
b3bbd485 | 33206 | @@ -1167,25 +4946,47 @@ static int event_hist_trigger_print(struct seq_file *m, |
e4b2b4a8 JK |
33207 | seq_puts(m, "keys="); |
33208 | ||
33209 | for_each_hist_key_field(i, hist_data) { | |
33210 | - key_field = hist_data->fields[i]; | |
33211 | + field = hist_data->fields[i]; | |
33212 | ||
33213 | if (i > hist_data->n_vals) | |
33214 | seq_puts(m, ","); | |
33215 | ||
33216 | - if (key_field->flags & HIST_FIELD_FL_STACKTRACE) | |
33217 | + if (field->flags & HIST_FIELD_FL_STACKTRACE) | |
33218 | seq_puts(m, "stacktrace"); | |
33219 | else | |
33220 | - hist_field_print(m, key_field); | |
33221 | + hist_field_print(m, field); | |
33222 | } | |
33223 | ||
33224 | seq_puts(m, ":vals="); | |
33225 | ||
33226 | for_each_hist_val_field(i, hist_data) { | |
33227 | + field = hist_data->fields[i]; | |
33228 | + if (field->flags & HIST_FIELD_FL_VAR) { | |
33229 | + have_var = true; | |
33230 | + continue; | |
1a6e0f06 JK |
33231 | + } |
33232 | + | |
e4b2b4a8 JK |
33233 | if (i == HITCOUNT_IDX) |
33234 | seq_puts(m, "hitcount"); | |
33235 | else { | |
33236 | seq_puts(m, ","); | |
33237 | - hist_field_print(m, hist_data->fields[i]); | |
33238 | + hist_field_print(m, field); | |
33239 | + } | |
1a6e0f06 JK |
33240 | + } |
33241 | + | |
e4b2b4a8 JK |
33242 | + if (have_var) { |
33243 | + unsigned int n = 0; | |
33244 | + | |
33245 | + seq_puts(m, ":"); | |
33246 | + | |
33247 | + for_each_hist_val_field(i, hist_data) { | |
33248 | + field = hist_data->fields[i]; | |
33249 | + | |
33250 | + if (field->flags & HIST_FIELD_FL_VAR) { | |
33251 | + if (n++) | |
33252 | + seq_puts(m, ","); | |
33253 | + hist_field_print(m, field); | |
33254 | + } | |
33255 | } | |
33256 | } | |
33257 | ||
b3bbd485 | 33258 | @@ -1193,28 +4994,36 @@ static int event_hist_trigger_print(struct seq_file *m, |
e4b2b4a8 JK |
33259 | |
33260 | for (i = 0; i < hist_data->n_sort_keys; i++) { | |
33261 | struct tracing_map_sort_key *sort_key; | |
33262 | + unsigned int idx, first_key_idx; | |
33263 | + | |
33264 | + /* skip VAR vals */ | |
33265 | + first_key_idx = hist_data->n_vals - hist_data->n_vars; | |
33266 | ||
33267 | sort_key = &hist_data->sort_keys[i]; | |
33268 | + idx = sort_key->field_idx; | |
33269 | + | |
33270 | + if (WARN_ON(idx >= HIST_FIELDS_MAX)) | |
33271 | + return -EINVAL; | |
33272 | ||
33273 | if (i > 0) | |
33274 | seq_puts(m, ","); | |
33275 | ||
33276 | - if (sort_key->field_idx == HITCOUNT_IDX) | |
33277 | + if (idx == HITCOUNT_IDX) | |
33278 | seq_puts(m, "hitcount"); | |
33279 | else { | |
33280 | - unsigned int idx = sort_key->field_idx; | |
33281 | - | |
33282 | - if (WARN_ON(idx >= TRACING_MAP_FIELDS_MAX)) | |
33283 | - return -EINVAL; | |
33284 | - | |
33285 | + if (idx >= first_key_idx) | |
33286 | + idx += hist_data->n_vars; | |
33287 | hist_field_print(m, hist_data->fields[idx]); | |
33288 | } | |
33289 | ||
33290 | if (sort_key->descending) | |
33291 | seq_puts(m, ".descending"); | |
33292 | } | |
33293 | - | |
33294 | seq_printf(m, ":size=%u", (1 << hist_data->map->map_bits)); | |
33295 | + if (hist_data->enable_timestamps) | |
33296 | + seq_printf(m, ":clock=%s", hist_data->attrs->clock); | |
1a6e0f06 | 33297 | + |
e4b2b4a8 JK |
33298 | + print_actions_spec(m, hist_data); |
33299 | ||
33300 | if (data->filter_str) | |
33301 | seq_printf(m, " if %s", data->filter_str); | |
b3bbd485 | 33302 | @@ -1242,6 +5051,21 @@ static int event_hist_trigger_init(struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33303 | return 0; |
33304 | } | |
33305 | ||
33306 | +static void unregister_field_var_hists(struct hist_trigger_data *hist_data) | |
1a6e0f06 | 33307 | +{ |
e4b2b4a8 JK |
33308 | + struct trace_event_file *file; |
33309 | + unsigned int i; | |
33310 | + char *cmd; | |
33311 | + int ret; | |
1a6e0f06 | 33312 | + |
e4b2b4a8 JK |
33313 | + for (i = 0; i < hist_data->n_field_var_hists; i++) { |
33314 | + file = hist_data->field_var_hists[i]->hist_data->event_file; | |
33315 | + cmd = hist_data->field_var_hists[i]->cmd; | |
33316 | + ret = event_hist_trigger_func(&trigger_hist_cmd, file, | |
33317 | + "!hist", "hist", cmd); | |
33318 | + } | |
1a6e0f06 JK |
33319 | +} |
33320 | + | |
e4b2b4a8 JK |
33321 | static void event_hist_trigger_free(struct event_trigger_ops *ops, |
33322 | struct event_trigger_data *data) | |
33323 | { | |
b3bbd485 | 33324 | @@ -1254,7 +5078,13 @@ static void event_hist_trigger_free(struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33325 | if (!data->ref) { |
33326 | if (data->name) | |
33327 | del_named_trigger(data); | |
1a6e0f06 | 33328 | + |
e4b2b4a8 | 33329 | trigger_data_free(data); |
1a6e0f06 | 33330 | + |
e4b2b4a8 | 33331 | + remove_hist_vars(hist_data); |
1a6e0f06 | 33332 | + |
e4b2b4a8 | 33333 | + unregister_field_var_hists(hist_data); |
1a6e0f06 | 33334 | + |
e4b2b4a8 JK |
33335 | destroy_hist_data(hist_data); |
33336 | } | |
33337 | } | |
b3bbd485 | 33338 | @@ -1381,6 +5211,15 @@ static bool hist_trigger_match(struct event_trigger_data *data, |
e4b2b4a8 JK |
33339 | return false; |
33340 | if (key_field->offset != key_field_test->offset) | |
33341 | return false; | |
33342 | + if (key_field->size != key_field_test->size) | |
33343 | + return false; | |
33344 | + if (key_field->is_signed != key_field_test->is_signed) | |
33345 | + return false; | |
33346 | + if (!!key_field->var.name != !!key_field_test->var.name) | |
33347 | + return false; | |
33348 | + if (key_field->var.name && | |
33349 | + strcmp(key_field->var.name, key_field_test->var.name) != 0) | |
33350 | + return false; | |
33351 | } | |
33352 | ||
33353 | for (i = 0; i < hist_data->n_sort_keys; i++) { | |
b3bbd485 | 33354 | @@ -1396,6 +5235,9 @@ static bool hist_trigger_match(struct event_trigger_data *data, |
e4b2b4a8 JK |
33355 | (strcmp(data->filter_str, data_test->filter_str) != 0)) |
33356 | return false; | |
33357 | ||
33358 | + if (!actions_match(hist_data, hist_data_test)) | |
33359 | + return false; | |
1a6e0f06 | 33360 | + |
e4b2b4a8 JK |
33361 | return true; |
33362 | } | |
33363 | ||
b3bbd485 | 33364 | @@ -1412,6 +5254,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33365 | if (named_data) { |
33366 | if (!hist_trigger_match(data, named_data, named_data, | |
33367 | true)) { | |
33368 | + hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); | |
33369 | ret = -EINVAL; | |
33370 | goto out; | |
33371 | } | |
b3bbd485 | 33372 | @@ -1431,13 +5274,16 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33373 | test->paused = false; |
33374 | else if (hist_data->attrs->clear) | |
33375 | hist_clear(test); | |
33376 | - else | |
33377 | + else { | |
33378 | + hist_err("Hist trigger already exists", NULL); | |
33379 | ret = -EEXIST; | |
33380 | + } | |
33381 | goto out; | |
33382 | } | |
33383 | } | |
33384 | new: | |
33385 | if (hist_data->attrs->cont || hist_data->attrs->clear) { | |
33386 | + hist_err("Can't clear or continue a nonexistent hist trigger", NULL); | |
33387 | ret = -ENOENT; | |
33388 | goto out; | |
33389 | } | |
b3bbd485 | 33390 | @@ -1446,7 +5292,6 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33391 | data->paused = true; |
33392 | ||
33393 | if (named_data) { | |
33394 | - destroy_hist_data(data->private_data); | |
33395 | data->private_data = named_data->private_data; | |
33396 | set_named_trigger_data(data, named_data); | |
33397 | data->ops = &event_hist_trigger_named_ops; | |
b3bbd485 | 33398 | @@ -1458,8 +5303,32 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33399 | goto out; |
33400 | } | |
33401 | ||
33402 | - list_add_rcu(&data->list, &file->triggers); | |
33403 | + if (hist_data->enable_timestamps) { | |
33404 | + char *clock = hist_data->attrs->clock; | |
1a6e0f06 | 33405 | + |
e4b2b4a8 JK |
33406 | + ret = tracing_set_clock(file->tr, hist_data->attrs->clock); |
33407 | + if (ret) { | |
33408 | + hist_err("Couldn't set trace_clock: ", clock); | |
33409 | + goto out; | |
33410 | + } | |
1a6e0f06 | 33411 | + |
e4b2b4a8 | 33412 | + tracing_set_time_stamp_abs(file->tr, true); |
1a6e0f06 JK |
33413 | + } |
33414 | + | |
e4b2b4a8 JK |
33415 | + if (named_data) |
33416 | + destroy_hist_data(hist_data); | |
1a6e0f06 | 33417 | + |
e4b2b4a8 JK |
33418 | ret++; |
33419 | + out: | |
33420 | + return ret; | |
1a6e0f06 | 33421 | +} |
1a6e0f06 | 33422 | + |
e4b2b4a8 JK |
33423 | +static int hist_trigger_enable(struct event_trigger_data *data, |
33424 | + struct trace_event_file *file) | |
1a6e0f06 | 33425 | +{ |
e4b2b4a8 | 33426 | + int ret = 0; |
1a6e0f06 | 33427 | + |
e4b2b4a8 JK |
33428 | + list_add_tail_rcu(&data->list, &file->triggers); |
33429 | ||
33430 | update_cond_flag(file); | |
33431 | ||
b3bbd485 | 33432 | @@ -1468,10 +5337,55 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33433 | update_cond_flag(file); |
33434 | ret--; | |
33435 | } | |
33436 | - out: | |
1a6e0f06 | 33437 | + |
e4b2b4a8 JK |
33438 | return ret; |
33439 | } | |
33440 | ||
33441 | +static bool have_hist_trigger_match(struct event_trigger_data *data, | |
33442 | + struct trace_event_file *file) | |
1a6e0f06 | 33443 | +{ |
e4b2b4a8 JK |
33444 | + struct hist_trigger_data *hist_data = data->private_data; |
33445 | + struct event_trigger_data *test, *named_data = NULL; | |
33446 | + bool match = false; | |
1a6e0f06 | 33447 | + |
e4b2b4a8 JK |
33448 | + if (hist_data->attrs->name) |
33449 | + named_data = find_named_trigger(hist_data->attrs->name); | |
1a6e0f06 | 33450 | + |
e4b2b4a8 JK |
33451 | + list_for_each_entry_rcu(test, &file->triggers, list) { |
33452 | + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
33453 | + if (hist_trigger_match(data, test, named_data, false)) { | |
33454 | + match = true; | |
33455 | + break; | |
1a6e0f06 | 33456 | + } |
1a6e0f06 JK |
33457 | + } |
33458 | + } | |
1a6e0f06 | 33459 | + |
e4b2b4a8 JK |
33460 | + return match; |
33461 | +} | |
1a6e0f06 | 33462 | + |
e4b2b4a8 JK |
33463 | +static bool hist_trigger_check_refs(struct event_trigger_data *data, |
33464 | + struct trace_event_file *file) | |
1a6e0f06 | 33465 | +{ |
e4b2b4a8 JK |
33466 | + struct hist_trigger_data *hist_data = data->private_data; |
33467 | + struct event_trigger_data *test, *named_data = NULL; | |
1a6e0f06 | 33468 | + |
e4b2b4a8 JK |
33469 | + if (hist_data->attrs->name) |
33470 | + named_data = find_named_trigger(hist_data->attrs->name); | |
1a6e0f06 | 33471 | + |
e4b2b4a8 JK |
33472 | + list_for_each_entry_rcu(test, &file->triggers, list) { |
33473 | + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
33474 | + if (!hist_trigger_match(data, test, named_data, false)) | |
33475 | + continue; | |
33476 | + hist_data = test->private_data; | |
33477 | + if (check_var_refs(hist_data)) | |
33478 | + return true; | |
33479 | + break; | |
1a6e0f06 | 33480 | + } |
e4b2b4a8 | 33481 | + } |
1a6e0f06 | 33482 | + |
e4b2b4a8 JK |
33483 | + return false; |
33484 | +} | |
1a6e0f06 | 33485 | + |
e4b2b4a8 JK |
33486 | static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, |
33487 | struct event_trigger_data *data, | |
33488 | struct trace_event_file *file) | |
b3bbd485 | 33489 | @@ -1497,17 +5411,55 @@ static void hist_unregister_trigger(char *glob, struct event_trigger_ops *ops, |
e4b2b4a8 JK |
33490 | |
33491 | if (unregistered && test->ops->free) | |
33492 | test->ops->free(test->ops, test); | |
1a6e0f06 | 33493 | + |
e4b2b4a8 JK |
33494 | + if (hist_data->enable_timestamps) { |
33495 | + if (!hist_data->remove || unregistered) | |
33496 | + tracing_set_time_stamp_abs(file->tr, false); | |
33497 | + } | |
33498 | +} | |
1a6e0f06 | 33499 | + |
e4b2b4a8 JK |
33500 | +static bool hist_file_check_refs(struct trace_event_file *file) |
33501 | +{ | |
33502 | + struct hist_trigger_data *hist_data; | |
33503 | + struct event_trigger_data *test; | |
1a6e0f06 | 33504 | + |
e4b2b4a8 JK |
33505 | + list_for_each_entry_rcu(test, &file->triggers, list) { |
33506 | + if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
33507 | + hist_data = test->private_data; | |
33508 | + if (check_var_refs(hist_data)) | |
33509 | + return true; | |
1a6e0f06 | 33510 | + } |
e4b2b4a8 | 33511 | + } |
1a6e0f06 | 33512 | + |
e4b2b4a8 JK |
33513 | + return false; |
33514 | } | |
33515 | ||
33516 | static void hist_unreg_all(struct trace_event_file *file) | |
33517 | { | |
33518 | struct event_trigger_data *test, *n; | |
33519 | + struct hist_trigger_data *hist_data; | |
33520 | + struct synth_event *se; | |
33521 | + const char *se_name; | |
1a6e0f06 | 33522 | + |
e4b2b4a8 JK |
33523 | + if (hist_file_check_refs(file)) |
33524 | + return; | |
33525 | ||
33526 | list_for_each_entry_safe(test, n, &file->triggers, list) { | |
33527 | if (test->cmd_ops->trigger_type == ETT_EVENT_HIST) { | |
33528 | + hist_data = test->private_data; | |
33529 | list_del_rcu(&test->list); | |
33530 | trace_event_trigger_enable_disable(file, 0); | |
33531 | + | |
33532 | + mutex_lock(&synth_event_mutex); | |
33533 | + se_name = trace_event_name(file->event_call); | |
33534 | + se = find_synth_event(se_name); | |
33535 | + if (se) | |
33536 | + se->ref--; | |
33537 | + mutex_unlock(&synth_event_mutex); | |
33538 | + | |
33539 | update_cond_flag(file); | |
33540 | + if (hist_data->enable_timestamps) | |
33541 | + tracing_set_time_stamp_abs(file->tr, false); | |
33542 | if (test->ops->free) | |
33543 | test->ops->free(test->ops, test); | |
33544 | } | |
b3bbd485 | 33545 | @@ -1523,16 +5475,54 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, |
e4b2b4a8 JK |
33546 | struct hist_trigger_attrs *attrs; |
33547 | struct event_trigger_ops *trigger_ops; | |
33548 | struct hist_trigger_data *hist_data; | |
33549 | - char *trigger; | |
33550 | + struct synth_event *se; | |
33551 | + const char *se_name; | |
33552 | + bool remove = false; | |
33553 | + char *trigger, *p; | |
33554 | int ret = 0; | |
33555 | ||
33556 | + if (glob && strlen(glob)) { | |
33557 | + last_cmd_set(param); | |
33558 | + hist_err_clear(); | |
33559 | + } | |
1a6e0f06 | 33560 | + |
e4b2b4a8 JK |
33561 | if (!param) |
33562 | return -EINVAL; | |
33563 | ||
33564 | - /* separate the trigger from the filter (k:v [if filter]) */ | |
33565 | - trigger = strsep(¶m, " \t"); | |
33566 | - if (!trigger) | |
33567 | - return -EINVAL; | |
33568 | + if (glob[0] == '!') | |
33569 | + remove = true; | |
1a6e0f06 | 33570 | + |
e4b2b4a8 JK |
33571 | + /* |
33572 | + * separate the trigger from the filter (k:v [if filter]) | |
33573 | + * allowing for whitespace in the trigger | |
33574 | + */ | |
33575 | + p = trigger = param; | |
33576 | + do { | |
33577 | + p = strstr(p, "if"); | |
33578 | + if (!p) | |
33579 | + break; | |
33580 | + if (p == param) | |
33581 | + return -EINVAL; | |
33582 | + if (*(p - 1) != ' ' && *(p - 1) != '\t') { | |
33583 | + p++; | |
33584 | + continue; | |
1a6e0f06 | 33585 | + } |
e4b2b4a8 JK |
33586 | + if (p >= param + strlen(param) - strlen("if") - 1) |
33587 | + return -EINVAL; | |
33588 | + if (*(p + strlen("if")) != ' ' && *(p + strlen("if")) != '\t') { | |
33589 | + p++; | |
33590 | + continue; | |
33591 | + } | |
33592 | + break; | |
33593 | + } while (p); | |
33594 | + | |
33595 | + if (!p) | |
33596 | + param = NULL; | |
33597 | + else { | |
33598 | + *(p - 1) = '\0'; | |
33599 | + param = strstrip(p); | |
33600 | + trigger = strstrip(trigger); | |
1a6e0f06 | 33601 | + } |
e4b2b4a8 JK |
33602 | |
33603 | attrs = parse_hist_trigger_attrs(trigger); | |
33604 | if (IS_ERR(attrs)) | |
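The old code split on the first whitespace, which broke triggers that legitimately contain spaces (for example inside onmatch() parameter lists). The loop above instead scans for a standalone "if" token delimited by whitespace on both sides. The same word-boundary scan in self-contained form; the kernel checks only ' ' and '\t', so isspace() here is a simplification:

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    /* Return a pointer to the "if" that introduces the filter, if any. */
    static char *find_filter(char *param)
    {
            char *p = param;

            while ((p = strstr(p, "if")) != NULL) {
                    if (p == param)
                            return NULL;    /* trigger part can't be empty */
                    if (!isspace((unsigned char)p[-1]) ||
                        !isspace((unsigned char)p[2])) {
                            p++;            /* "if" inside a token */
                            continue;
                    }
                    return p;
            }
            return NULL;
    }

    int main(void)
    {
            char buf[] = "keys=pid:onmax($lat).save(comm, pid) if pid > 0";
            char *p = find_filter(buf);

            if (p) {
                    p[-1] = '\0';
                    for (p += 2; isspace((unsigned char)*p); p++)
                            ;
                    printf("trigger: '%s'\nfilter:  '%s'\n", buf, p);
            }
            return 0;
    }

Unlike the kernel loop, a trailing bare "if" is treated here as no filter rather than an error; the point is the delimiter check, which keeps "if" usable inside field names and parameter lists while still supporting an optional trailing filter.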
b3bbd485 | 33605 | @@ -1541,7 +5531,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, |
e4b2b4a8 JK |
33606 | if (attrs->map_bits) |
33607 | hist_trigger_bits = attrs->map_bits; | |
33608 | ||
33609 | - hist_data = create_hist_data(hist_trigger_bits, attrs, file); | |
33610 | + hist_data = create_hist_data(hist_trigger_bits, attrs, file, remove); | |
33611 | if (IS_ERR(hist_data)) { | |
33612 | destroy_hist_trigger_attrs(attrs); | |
33613 | return PTR_ERR(hist_data); | |
b3bbd485 | 33614 | @@ -1549,10 +5539,11 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, |
e4b2b4a8 JK |
33615 | |
33616 | trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger); | |
33617 | ||
33618 | - ret = -ENOMEM; | |
33619 | trigger_data = kzalloc(sizeof(*trigger_data), GFP_KERNEL); | |
33620 | - if (!trigger_data) | |
33621 | + if (!trigger_data) { | |
33622 | + ret = -ENOMEM; | |
33623 | goto out_free; | |
33624 | + } | |
33625 | ||
33626 | trigger_data->count = -1; | |
33627 | trigger_data->ops = trigger_ops; | |
b3bbd485 | 33628 | @@ -1570,8 +5561,24 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, |
e4b2b4a8 JK |
33629 | goto out_free; |
33630 | } | |
33631 | ||
33632 | - if (glob[0] == '!') { | |
33633 | + if (remove) { | |
33634 | + if (!have_hist_trigger_match(trigger_data, file)) | |
33635 | + goto out_free; | |
1a6e0f06 | 33636 | + |
e4b2b4a8 JK |
33637 | + if (hist_trigger_check_refs(trigger_data, file)) { |
33638 | + ret = -EBUSY; | |
33639 | + goto out_free; | |
1a6e0f06 JK |
33640 | + } |
33641 | + | |
e4b2b4a8 | 33642 | cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); |
1a6e0f06 | 33643 | + |
e4b2b4a8 JK |
33644 | + mutex_lock(&synth_event_mutex); |
33645 | + se_name = trace_event_name(file->event_call); | |
33646 | + se = find_synth_event(se_name); | |
33647 | + if (se) | |
33648 | + se->ref--; | |
33649 | + mutex_unlock(&synth_event_mutex); | |
1a6e0f06 | 33650 | + |
e4b2b4a8 JK |
33651 | ret = 0; |
33652 | goto out_free; | |
33653 | } | |
b3bbd485 | 33654 | @@ -1588,14 +5595,47 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, |
e4b2b4a8 JK |
33655 | goto out_free; |
33656 | } else if (ret < 0) | |
33657 | goto out_free; | |
1a6e0f06 | 33658 | + |
e4b2b4a8 JK |
33659 | + if (get_named_trigger_data(trigger_data)) |
33660 | + goto enable; | |
1a6e0f06 | 33661 | + |
e4b2b4a8 JK |
33662 | + if (has_hist_vars(hist_data)) |
33663 | + save_hist_vars(hist_data); | |
1a6e0f06 | 33664 | + |
e4b2b4a8 JK |
33665 | + ret = create_actions(hist_data, file); |
33666 | + if (ret) | |
33667 | + goto out_unreg; | |
1a6e0f06 | 33668 | + |
e4b2b4a8 JK |
33669 | + ret = tracing_map_init(hist_data->map); |
33670 | + if (ret) | |
33671 | + goto out_unreg; | |
33672 | +enable: | |
33673 | + ret = hist_trigger_enable(trigger_data, file); | |
33674 | + if (ret) | |
33675 | + goto out_unreg; | |
1a6e0f06 | 33676 | + |
e4b2b4a8 JK |
33677 | + mutex_lock(&synth_event_mutex); |
33678 | + se_name = trace_event_name(file->event_call); | |
33679 | + se = find_synth_event(se_name); | |
33680 | + if (se) | |
33681 | + se->ref++; | |
33682 | + mutex_unlock(&synth_event_mutex); | |
1a6e0f06 | 33683 | + |
e4b2b4a8 JK |
33684 | /* Just return zero, not the number of registered triggers */ |
33685 | ret = 0; | |
33686 | out: | |
33687 | + if (ret == 0) | |
33688 | + hist_err_clear(); | |
1a6e0f06 | 33689 | + |
e4b2b4a8 JK |
33690 | return ret; |
33691 | + out_unreg: | |
33692 | + cmd_ops->unreg(glob+1, trigger_ops, trigger_data, file); | |
33693 | out_free: | |
33694 | if (cmd_ops->set_filter) | |
33695 | cmd_ops->set_filter(NULL, trigger_data, NULL); | |
33696 | ||
33697 | + remove_hist_vars(hist_data); | |
1a6e0f06 | 33698 | + |
e4b2b4a8 JK |
33699 | kfree(trigger_data); |
33700 | ||
33701 | destroy_hist_data(hist_data); | |
b3bbd485 | 33702 | @@ -1625,7 +5665,8 @@ __init int register_trigger_hist_cmd(void) |
e4b2b4a8 JK |
33703 | } |
33704 | ||
33705 | static void | |
33706 | -hist_enable_trigger(struct event_trigger_data *data, void *rec) | |
33707 | +hist_enable_trigger(struct event_trigger_data *data, void *rec, | |
33708 | + struct ring_buffer_event *event) | |
33709 | { | |
33710 | struct enable_trigger_data *enable_data = data->private_data; | |
33711 | struct event_trigger_data *test; | |
b3bbd485 | 33712 | @@ -1641,7 +5682,8 @@ hist_enable_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33713 | } |
33714 | ||
33715 | static void | |
33716 | -hist_enable_count_trigger(struct event_trigger_data *data, void *rec) | |
33717 | +hist_enable_count_trigger(struct event_trigger_data *data, void *rec, | |
33718 | + struct ring_buffer_event *event) | |
33719 | { | |
33720 | if (!data->count) | |
33721 | return; | |
b3bbd485 | 33722 | @@ -1649,7 +5691,7 @@ hist_enable_count_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33723 | if (data->count != -1) |
33724 | (data->count)--; | |
33725 | ||
33726 | - hist_enable_trigger(data, rec); | |
33727 | + hist_enable_trigger(data, rec, event); | |
33728 | } | |
33729 | ||
33730 | static struct event_trigger_ops hist_enable_trigger_ops = { | |
b3bbd485 | 33731 | @@ -1754,3 +5796,31 @@ __init int register_trigger_hist_enable_disable_cmds(void) |
e4b2b4a8 JK |
33732 | |
33733 | return ret; | |
33734 | } | |
1a6e0f06 | 33735 | + |
e4b2b4a8 JK |
33736 | +static __init int trace_events_hist_init(void) |
33737 | +{ | |
33738 | + struct dentry *entry = NULL; | |
33739 | + struct dentry *d_tracer; | |
33740 | + int err = 0; | |
1a6e0f06 | 33741 | + |
e4b2b4a8 JK |
33742 | + d_tracer = tracing_init_dentry(); |
33743 | + if (IS_ERR(d_tracer)) { | |
33744 | + err = PTR_ERR(d_tracer); | |
33745 | + goto err; | |
1a6e0f06 JK |
33746 | + } |
33747 | + | |
e4b2b4a8 JK |
33748 | + entry = tracefs_create_file("synthetic_events", 0644, d_tracer, |
33749 | + NULL, &synth_events_fops); | |
33750 | + if (!entry) { | |
33751 | + err = -ENODEV; | |
33752 | + goto err; | |
1a6e0f06 JK |
33753 | + } |
33754 | + | |
e4b2b4a8 JK |
33755 | + return err; |
33756 | + err: | |
33757 | + pr_warn("Could not create tracefs 'synthetic_events' entry\n"); | |
33758 | + | |
33759 | + return err; | |
1a6e0f06 | 33760 | +} |
1a6e0f06 | 33761 | + |
e4b2b4a8 | 33762 | +fs_initcall(trace_events_hist_init); |
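The initcall registers the 'synthetic_events' control file in tracefs. Once it exists, a synthetic event is defined by writing its name and field list into that file. A hedged usage sketch from userspace; the mount point, event name and field list are examples, not part of this patch:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path =
                    "/sys/kernel/debug/tracing/synthetic_events";
            const char *def =
                    "wakeup_latency u64 lat; pid_t pid; int prio\n";
            int fd = open(path, O_WRONLY | O_APPEND);

            if (fd < 0) {
                    perror("open synthetic_events");
                    return 1;
            }
            if (write(fd, def, strlen(def)) < 0)
                    perror("write");
            return close(fd) != 0;
    }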
b3bbd485 JK |
33763 | diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c |
33764 | index 43254c5e7e16..24d42350d738 100644 | |
33765 | --- a/kernel/trace/trace_events_trigger.c | |
33766 | +++ b/kernel/trace/trace_events_trigger.c | |
33767 | @@ -63,7 +63,8 @@ void trigger_data_free(struct event_trigger_data *data) | |
e4b2b4a8 JK |
33768 | * any trigger that should be deferred, ETT_NONE if nothing to defer. |
33769 | */ | |
33770 | enum event_trigger_type | |
33771 | -event_triggers_call(struct trace_event_file *file, void *rec) | |
33772 | +event_triggers_call(struct trace_event_file *file, void *rec, | |
33773 | + struct ring_buffer_event *event) | |
33774 | { | |
33775 | struct event_trigger_data *data; | |
33776 | enum event_trigger_type tt = ETT_NONE; | |
b3bbd485 | 33777 | @@ -76,7 +77,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) |
e4b2b4a8 JK |
33778 | if (data->paused) |
33779 | continue; | |
33780 | if (!rec) { | |
33781 | - data->ops->func(data, rec); | |
33782 | + data->ops->func(data, rec, event); | |
33783 | continue; | |
33784 | } | |
33785 | filter = rcu_dereference_sched(data->filter); | |
b3bbd485 | 33786 | @@ -86,7 +87,7 @@ event_triggers_call(struct trace_event_file *file, void *rec) |
e4b2b4a8 JK |
33787 | tt |= data->cmd_ops->trigger_type; |
33788 | continue; | |
33789 | } | |
33790 | - data->ops->func(data, rec); | |
33791 | + data->ops->func(data, rec, event); | |
33792 | } | |
33793 | return tt; | |
33794 | } | |
b3bbd485 | 33795 | @@ -108,7 +109,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); |
e4b2b4a8 JK |
33796 | void |
33797 | event_triggers_post_call(struct trace_event_file *file, | |
33798 | enum event_trigger_type tt, | |
33799 | - void *rec) | |
33800 | + void *rec, struct ring_buffer_event *event) | |
33801 | { | |
33802 | struct event_trigger_data *data; | |
33803 | ||
b3bbd485 | 33804 | @@ -116,7 +117,7 @@ event_triggers_post_call(struct trace_event_file *file, |
e4b2b4a8 JK |
33805 | if (data->paused) |
33806 | continue; | |
33807 | if (data->cmd_ops->trigger_type & tt) | |
33808 | - data->ops->func(data, rec); | |
33809 | + data->ops->func(data, rec, event); | |
33810 | } | |
33811 | } | |
33812 | EXPORT_SYMBOL_GPL(event_triggers_post_call); | |
b3bbd485 | 33813 | @@ -914,8 +915,15 @@ void set_named_trigger_data(struct event_trigger_data *data, |
e4b2b4a8 JK |
33814 | data->named_data = named_data; |
33815 | } | |
33816 | ||
33817 | +struct event_trigger_data * | |
33818 | +get_named_trigger_data(struct event_trigger_data *data) | |
1a6e0f06 | 33819 | +{ |
e4b2b4a8 | 33820 | + return data->named_data; |
1a6e0f06 JK |
33821 | +} |
33822 | + | |
e4b2b4a8 JK |
33823 | static void |
33824 | -traceon_trigger(struct event_trigger_data *data, void *rec) | |
33825 | +traceon_trigger(struct event_trigger_data *data, void *rec, | |
33826 | + struct ring_buffer_event *event) | |
33827 | { | |
33828 | if (tracing_is_on()) | |
33829 | return; | |
b3bbd485 | 33830 | @@ -924,7 +932,8 @@ traceon_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33831 | } |
33832 | ||
33833 | static void | |
33834 | -traceon_count_trigger(struct event_trigger_data *data, void *rec) | |
33835 | +traceon_count_trigger(struct event_trigger_data *data, void *rec, | |
33836 | + struct ring_buffer_event *event) | |
33837 | { | |
33838 | if (tracing_is_on()) | |
33839 | return; | |
b3bbd485 | 33840 | @@ -939,7 +948,8 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33841 | } |
33842 | ||
33843 | static void | |
33844 | -traceoff_trigger(struct event_trigger_data *data, void *rec) | |
33845 | +traceoff_trigger(struct event_trigger_data *data, void *rec, | |
33846 | + struct ring_buffer_event *event) | |
33847 | { | |
33848 | if (!tracing_is_on()) | |
33849 | return; | |
b3bbd485 | 33850 | @@ -948,7 +958,8 @@ traceoff_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33851 | } |
33852 | ||
33853 | static void | |
33854 | -traceoff_count_trigger(struct event_trigger_data *data, void *rec) | |
33855 | +traceoff_count_trigger(struct event_trigger_data *data, void *rec, | |
33856 | + struct ring_buffer_event *event) | |
33857 | { | |
33858 | if (!tracing_is_on()) | |
33859 | return; | |
b3bbd485 | 33860 | @@ -1045,7 +1056,8 @@ static struct event_command trigger_traceoff_cmd = { |
1a6e0f06 | 33861 | |
e4b2b4a8 JK |
33862 | #ifdef CONFIG_TRACER_SNAPSHOT |
33863 | static void | |
33864 | -snapshot_trigger(struct event_trigger_data *data, void *rec) | |
33865 | +snapshot_trigger(struct event_trigger_data *data, void *rec, | |
33866 | + struct ring_buffer_event *event) | |
33867 | { | |
33868 | struct trace_event_file *file = data->private_data; | |
33869 | ||
b3bbd485 | 33870 | @@ -1056,7 +1068,8 @@ snapshot_trigger(struct event_trigger_data *data, void *rec) |
1a6e0f06 | 33871 | } |
1a6e0f06 | 33872 | |
e4b2b4a8 JK |
33873 | static void |
33874 | -snapshot_count_trigger(struct event_trigger_data *data, void *rec) | |
33875 | +snapshot_count_trigger(struct event_trigger_data *data, void *rec, | |
33876 | + struct ring_buffer_event *event) | |
33877 | { | |
33878 | if (!data->count) | |
33879 | return; | |
b3bbd485 | 33880 | @@ -1064,7 +1077,7 @@ snapshot_count_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33881 | if (data->count != -1) |
33882 | (data->count)--; | |
1a6e0f06 | 33883 | |
e4b2b4a8 JK |
33884 | - snapshot_trigger(data, rec); |
33885 | + snapshot_trigger(data, rec, event); | |
33886 | } | |
33887 | ||
33888 | static int | |
b3bbd485 | 33889 | @@ -1143,13 +1156,15 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } |
e4b2b4a8 JK |
33890 | #define STACK_SKIP 3 |
33891 | ||
33892 | static void | |
33893 | -stacktrace_trigger(struct event_trigger_data *data, void *rec) | |
33894 | +stacktrace_trigger(struct event_trigger_data *data, void *rec, | |
33895 | + struct ring_buffer_event *event) | |
1a6e0f06 | 33896 | { |
e4b2b4a8 | 33897 | trace_dump_stack(STACK_SKIP); |
1a6e0f06 JK |
33898 | } |
33899 | ||
e4b2b4a8 JK |
33900 | static void |
33901 | -stacktrace_count_trigger(struct event_trigger_data *data, void *rec) | |
33902 | +stacktrace_count_trigger(struct event_trigger_data *data, void *rec, | |
33903 | + struct ring_buffer_event *event) | |
33904 | { | |
33905 | if (!data->count) | |
33906 | return; | |
b3bbd485 | 33907 | @@ -1157,7 +1172,7 @@ stacktrace_count_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33908 | if (data->count != -1) |
33909 | (data->count)--; | |
33910 | ||
33911 | - stacktrace_trigger(data, rec); | |
33912 | + stacktrace_trigger(data, rec, event); | |
1a6e0f06 JK |
33913 | } |
33914 | ||
e4b2b4a8 | 33915 | static int |
b3bbd485 | 33916 | @@ -1219,7 +1234,8 @@ static __init void unregister_trigger_traceon_traceoff_cmds(void) |
e4b2b4a8 JK |
33917 | } |
33918 | ||
33919 | static void | |
33920 | -event_enable_trigger(struct event_trigger_data *data, void *rec) | |
33921 | +event_enable_trigger(struct event_trigger_data *data, void *rec, | |
33922 | + struct ring_buffer_event *event) | |
33923 | { | |
33924 | struct enable_trigger_data *enable_data = data->private_data; | |
33925 | ||
b3bbd485 | 33926 | @@ -1230,7 +1246,8 @@ event_enable_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33927 | } |
33928 | ||
33929 | static void | |
33930 | -event_enable_count_trigger(struct event_trigger_data *data, void *rec) | |
33931 | +event_enable_count_trigger(struct event_trigger_data *data, void *rec, | |
33932 | + struct ring_buffer_event *event) | |
33933 | { | |
33934 | struct enable_trigger_data *enable_data = data->private_data; | |
33935 | ||
b3bbd485 | 33936 | @@ -1244,7 +1261,7 @@ event_enable_count_trigger(struct event_trigger_data *data, void *rec) |
e4b2b4a8 JK |
33937 | if (data->count != -1) |
33938 | (data->count)--; | |
33939 | ||
33940 | - event_enable_trigger(data, rec); | |
33941 | + event_enable_trigger(data, rec, event); | |
33942 | } | |
33943 | ||
33944 | int event_enable_trigger_print(struct seq_file *m, | |
b3bbd485 JK |
33945 | diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c |
33946 | index d7c8e4ec3d9d..518c61a1bceb 100644 | |
33947 | --- a/kernel/trace/trace_hwlat.c | |
33948 | +++ b/kernel/trace/trace_hwlat.c | |
33949 | @@ -279,7 +279,7 @@ static void move_to_next_cpu(void) | |
e4b2b4a8 JK |
33950 | * of this thread, then stop migrating for the duration |
33951 | * of the current test. | |
33952 | */ | |
33953 | - if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) | |
33954 | + if (!cpumask_equal(current_mask, current->cpus_ptr)) | |
33955 | goto disable; | |
1a6e0f06 | 33956 | |
e4b2b4a8 | 33957 | get_online_cpus(); |
b3bbd485 JK |
33958 | diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c |
33959 | index ea20274a105a..3c40d4174052 100644 | |
33960 | --- a/kernel/trace/trace_kprobe.c | |
33961 | +++ b/kernel/trace/trace_kprobe.c | |
33962 | @@ -918,8 +918,8 @@ static int probes_open(struct inode *inode, struct file *file) | |
e4b2b4a8 JK |
33963 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
33964 | size_t count, loff_t *ppos) | |
33965 | { | |
33966 | - return traceprobe_probes_write(file, buffer, count, ppos, | |
33967 | - create_trace_kprobe); | |
33968 | + return trace_parse_run_command(file, buffer, count, ppos, | |
33969 | + create_trace_kprobe); | |
33970 | } | |
33971 | ||
33972 | static const struct file_operations kprobe_events_ops = { | |
b3bbd485 | 33973 | @@ -1444,9 +1444,9 @@ static __init int kprobe_trace_self_tests_init(void) |
e4b2b4a8 JK |
33974 | |
33975 | pr_info("Testing kprobe tracing: "); | |
33976 | ||
33977 | - ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target " | |
33978 | - "$stack $stack0 +0($stack)", | |
33979 | - create_trace_kprobe); | |
33980 | + ret = trace_run_command("p:testprobe kprobe_trace_selftest_target " | |
33981 | + "$stack $stack0 +0($stack)", | |
33982 | + create_trace_kprobe); | |
33983 | if (WARN_ON_ONCE(ret)) { | |
33984 | pr_warn("error on probing function entry.\n"); | |
33985 | warn++; | |
b3bbd485 | 33986 | @@ -1466,8 +1466,8 @@ static __init int kprobe_trace_self_tests_init(void) |
e4b2b4a8 JK |
33987 | } |
33988 | } | |
1a6e0f06 | 33989 | |
e4b2b4a8 JK |
33990 | - ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target " |
33991 | - "$retval", create_trace_kprobe); | |
33992 | + ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target " | |
33993 | + "$retval", create_trace_kprobe); | |
33994 | if (WARN_ON_ONCE(ret)) { | |
33995 | pr_warn("error on probing function return.\n"); | |
33996 | warn++; | |
b3bbd485 | 33997 | @@ -1537,13 +1537,13 @@ static __init int kprobe_trace_self_tests_init(void) |
e4b2b4a8 JK |
33998 | disable_trace_kprobe(tk, file); |
33999 | } | |
1a6e0f06 | 34000 | |
e4b2b4a8 JK |
34001 | - ret = traceprobe_command("-:testprobe", create_trace_kprobe); |
34002 | + ret = trace_run_command("-:testprobe", create_trace_kprobe); | |
34003 | if (WARN_ON_ONCE(ret)) { | |
34004 | pr_warn("error on deleting a probe.\n"); | |
34005 | warn++; | |
34006 | } | |
34007 | ||
34008 | - ret = traceprobe_command("-:testprobe2", create_trace_kprobe); | |
34009 | + ret = trace_run_command("-:testprobe2", create_trace_kprobe); | |
34010 | if (WARN_ON_ONCE(ret)) { | |
34011 | pr_warn("error on deleting a probe.\n"); | |
34012 | warn++; | |
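Both helpers parse the same command text that user space writes to the tracefs control files, so the selftest strings above double as usage examples: writing "p:myprobe kprobe_trace_selftest_target $stack $stack0" to kprobe_events creates a probe, and writing "-:myprobe" removes it again.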
b3bbd485 JK |
34013 | diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c |
34014 | index 4500b00e4e36..74a4bfc2c6b7 100644 | |
34015 | --- a/kernel/trace/trace_output.c | |
34016 | +++ b/kernel/trace/trace_output.c | |
34017 | @@ -447,6 +447,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) | |
1a6e0f06 JK |
34018 | { |
34019 | char hardsoft_irq; | |
34020 | char need_resched; | |
34021 | + char need_resched_lazy; | |
34022 | char irqs_off; | |
34023 | int hardirq; | |
34024 | int softirq; | |
b3bbd485 | 34025 | @@ -477,6 +478,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
1a6e0f06 JK |
34026 | break; |
34027 | } | |
34028 | ||
34029 | + need_resched_lazy = | |
34030 | + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; | |
34031 | + | |
34032 | hardsoft_irq = | |
34033 | (nmi && hardirq) ? 'Z' : | |
34034 | nmi ? 'z' : | |
b3bbd485 | 34035 | @@ -485,14 +489,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) |
1a6e0f06 JK |
34036 | softirq ? 's' : |
34037 | '.' ; | |
34038 | ||
e4b2b4a8 JK |
34039 | - trace_seq_printf(s, "%c%c%c", |
34040 | - irqs_off, need_resched, hardsoft_irq); | |
34041 | + trace_seq_printf(s, "%c%c%c%c", | |
34042 | + irqs_off, need_resched, need_resched_lazy, | |
34043 | + hardsoft_irq); | |
34044 | ||
34045 | if (entry->preempt_count) | |
34046 | trace_seq_printf(s, "%x", entry->preempt_count); | |
34047 | else | |
34048 | trace_seq_putc(s, '.'); | |
34049 | ||
34050 | + if (entry->preempt_lazy_count) | |
34051 | + trace_seq_printf(s, "%x", entry->preempt_lazy_count); | |
34052 | + else | |
34053 | + trace_seq_putc(s, '.'); | |
34054 | + | |
34055 | + if (entry->migrate_disable) | |
34056 | + trace_seq_printf(s, "%x", entry->migrate_disable); | |
34057 | + else | |
34058 | + trace_seq_putc(s, '.'); | |
34059 | + | |
34060 | return !trace_seq_has_overflowed(s); | |
34061 | } | |
34062 | ||
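With these additions, the latency-format field block grows from three flag characters plus one count to four flag characters plus three counts. A hypothetical block such as "dNLh1.2" would decode as: interrupts off ('d'), need-resched set ('N'), lazy need-resched set ('L'), hardirq context ('h'), preempt count 1, lazy preempt count 0 ('.'), migrate-disable depth 2.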
b3bbd485 JK |
34063 | diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c |
34064 | index fe4513330412..daf54bda4dc8 100644 | |
34065 | --- a/kernel/trace/trace_probe.c | |
34066 | +++ b/kernel/trace/trace_probe.c | |
34067 | @@ -621,92 +621,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg) | |
e4b2b4a8 JK |
34068 | kfree(arg->comm); |
34069 | } | |
34070 | ||
34071 | -int traceprobe_command(const char *buf, int (*createfn)(int, char **)) | |
34072 | -{ | |
34073 | - char **argv; | |
34074 | - int argc, ret; | |
34075 | - | |
34076 | - argc = 0; | |
34077 | - ret = 0; | |
34078 | - argv = argv_split(GFP_KERNEL, buf, &argc); | |
34079 | - if (!argv) | |
34080 | - return -ENOMEM; | |
34081 | - | |
34082 | - if (argc) | |
34083 | - ret = createfn(argc, argv); | |
34084 | - | |
34085 | - argv_free(argv); | |
34086 | - | |
34087 | - return ret; | |
34088 | -} | |
34089 | - | |
34090 | -#define WRITE_BUFSIZE 4096 | |
34091 | - | |
34092 | -ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, | |
34093 | - size_t count, loff_t *ppos, | |
34094 | - int (*createfn)(int, char **)) | |
34095 | -{ | |
34096 | - char *kbuf, *buf, *tmp; | |
34097 | - int ret = 0; | |
34098 | - size_t done = 0; | |
34099 | - size_t size; | |
34100 | - | |
34101 | - kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | |
34102 | - if (!kbuf) | |
34103 | - return -ENOMEM; | |
34104 | - | |
34105 | - while (done < count) { | |
34106 | - size = count - done; | |
34107 | - | |
34108 | - if (size >= WRITE_BUFSIZE) | |
34109 | - size = WRITE_BUFSIZE - 1; | |
34110 | - | |
34111 | - if (copy_from_user(kbuf, buffer + done, size)) { | |
34112 | - ret = -EFAULT; | |
34113 | - goto out; | |
34114 | - } | |
34115 | - kbuf[size] = '\0'; | |
34116 | - buf = kbuf; | |
34117 | - do { | |
34118 | - tmp = strchr(buf, '\n'); | |
34119 | - if (tmp) { | |
34120 | - *tmp = '\0'; | |
34121 | - size = tmp - buf + 1; | |
34122 | - } else { | |
34123 | - size = strlen(buf); | |
34124 | - if (done + size < count) { | |
34125 | - if (buf != kbuf) | |
34126 | - break; | |
34127 | - /* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */ | |
34128 | - pr_warn("Line length is too long: Should be less than %d\n", | |
34129 | - WRITE_BUFSIZE - 2); | |
34130 | - ret = -EINVAL; | |
34131 | - goto out; | |
34132 | - } | |
34133 | - } | |
34134 | - done += size; | |
34135 | - | |
34136 | - /* Remove comments */ | |
34137 | - tmp = strchr(buf, '#'); | |
34138 | - | |
34139 | - if (tmp) | |
34140 | - *tmp = '\0'; | |
34141 | - | |
34142 | - ret = traceprobe_command(buf, createfn); | |
34143 | - if (ret) | |
34144 | - goto out; | |
34145 | - buf += size; | |
34146 | - | |
34147 | - } while (done < count); | |
34148 | - } | |
34149 | - ret = done; | |
34150 | - | |
34151 | -out: | |
34152 | - kfree(kbuf); | |
34153 | - | |
34154 | - return ret; | |
34155 | -} | |
34156 | - | |
34157 | static int __set_print_fmt(struct trace_probe *tp, char *buf, int len, | |
34158 | bool is_return) | |
34159 | { | |
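The two helpers removed above were generalized and moved out of the probe code; the kprobe and uprobe call sites in this patch now use trace_run_command() and trace_parse_run_command() instead. A minimal sketch of the single-command runner, assuming it keeps the semantics of the deleted traceprobe_command():

	/* Sketch of the generic replacement (assumed to live in
	 * kernel/trace/trace.c); the body mirrors the deleted
	 * traceprobe_command(): split one command line into argv and
	 * hand it to the subsystem's create function. */
	int trace_run_command(const char *buf, int (*createfn)(int, char **))
	{
		char **argv;
		int argc = 0, ret = 0;

		argv = argv_split(GFP_KERNEL, buf, &argc);
		if (!argv)
			return -ENOMEM;

		if (argc)
			ret = createfn(argc, argv);

		argv_free(argv);
		return ret;
	}

trace_parse_run_command() is assumed to carry over the line-splitting, length-check and '#'-comment handling of the deleted traceprobe_probes_write() unchanged.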
b3bbd485 JK |
34160 | diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h |
34161 | index dc39472ca9e4..a0d750e3d17c 100644 | |
34162 | --- a/kernel/trace/trace_probe.h | |
34163 | +++ b/kernel/trace/trace_probe.h | |
e4b2b4a8 JK |
34164 | @@ -42,7 +42,6 @@ |
34165 | ||
34166 | #define MAX_TRACE_ARGS 128 | |
34167 | #define MAX_ARGSTR_LEN 63 | |
34168 | -#define MAX_EVENT_NAME_LEN 64 | |
34169 | #define MAX_STRING_SIZE PATH_MAX | |
34170 | ||
34171 | /* Reserved field names */ | |
b3bbd485 | 34172 | @@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg); |
e4b2b4a8 JK |
34173 | |
34174 | extern int traceprobe_split_symbol_offset(char *symbol, long *offset); | |
34175 | ||
34176 | -extern ssize_t traceprobe_probes_write(struct file *file, | |
34177 | - const char __user *buffer, size_t count, loff_t *ppos, | |
34178 | - int (*createfn)(int, char**)); | |
34179 | - | |
34180 | -extern int traceprobe_command(const char *buf, int (*createfn)(int, char**)); | |
34181 | - | |
34182 | /* Sum up total data length for dynamic arrays (strings) */ | |
34183 | static nokprobe_inline int | |
34184 | __get_data_size(struct trace_probe *tp, struct pt_regs *regs) | |
b3bbd485 JK |
34185 | diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c |
34186 | index ea0d90a31fc9..2ccfbb8efeb2 100644 | |
34187 | --- a/kernel/trace/trace_uprobe.c | |
34188 | +++ b/kernel/trace/trace_uprobe.c | |
34189 | @@ -647,7 +647,7 @@ static int probes_open(struct inode *inode, struct file *file) | |
e4b2b4a8 JK |
34190 | static ssize_t probes_write(struct file *file, const char __user *buffer, |
34191 | size_t count, loff_t *ppos) | |
34192 | { | |
34193 | - return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe); | |
34194 | + return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe); | |
34195 | } | |
34196 | ||
34197 | static const struct file_operations uprobe_events_ops = { | |
b3bbd485 JK |
34198 | diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c |
34199 | index 305039b122fa..5cadb1b8b5fe 100644 | |
34200 | --- a/kernel/trace/tracing_map.c | |
34201 | +++ b/kernel/trace/tracing_map.c | |
34202 | @@ -66,6 +66,73 @@ u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i) | |
e4b2b4a8 JK |
34203 | return (u64)atomic64_read(&elt->fields[i].sum); |
34204 | } | |
34205 | ||
34206 | +/** | |
34207 | + * tracing_map_set_var - Assign a tracing_map_elt's variable field | |
34208 | + * @elt: The tracing_map_elt | |
34209 | + * @i: The index of the given variable associated with the tracing_map_elt | |
34210 | + * @n: The value to assign | |
34211 | + * | |
34212 | + * Assign n to variable i associated with the specified tracing_map_elt | |
34213 | + * instance. The index i is the index returned by the call to | |
34214 | + * tracing_map_add_var() when the tracing map was set up. | |
34215 | + */ | |
34216 | +void tracing_map_set_var(struct tracing_map_elt *elt, unsigned int i, u64 n) | |
34217 | +{ | |
34218 | + atomic64_set(&elt->vars[i], n); | |
34219 | + elt->var_set[i] = true; | |
34220 | +} | |
34221 | + | |
34222 | +/** | |
34223 | + * tracing_map_var_set - Return whether or not a variable has been set | |
34224 | + * @elt: The tracing_map_elt | |
34225 | + * @i: The index of the given variable associated with the tracing_map_elt | |
34226 | + * | |
34227 | + * Return true if the variable has been set, false otherwise. The | |
34228 | + * index i is the index returned by the call to tracing_map_add_var() | |
34229 | + * when the tracing map was set up. | |
34230 | + */ | |
34231 | +bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i) | |
34232 | +{ | |
34233 | + return elt->var_set[i]; | |
34234 | +} | |
34235 | + | |
34236 | +/** | |
34237 | + * tracing_map_read_var - Return the value of a tracing_map_elt's variable field | |
34238 | + * @elt: The tracing_map_elt | |
34239 | + * @i: The index of the given variable associated with the tracing_map_elt | |
34240 | + * | |
34241 | + * Retrieve the value of the variable i associated with the specified | |
34242 | + * tracing_map_elt instance. The index i is the index returned by the | |
34243 | + * call to tracing_map_add_var() when the tracing map was set | |
34244 | + * up. | |
34245 | + * | |
34246 | + * Return: The variable value associated with field i for elt. | |
34247 | + */ | |
34248 | +u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i) | |
34249 | +{ | |
34250 | + return (u64)atomic64_read(&elt->vars[i]); | |
34251 | +} | |
34252 | + | |
34253 | +/** | |
34254 | + * tracing_map_read_var_once - Return and reset a tracing_map_elt's variable field | |
34255 | + * @elt: The tracing_map_elt | |
34256 | + * @i: The index of the given variable associated with the tracing_map_elt | |
34257 | + * | |
34258 | + * Retrieve the value of the variable i associated with the specified | |
34259 | + * tracing_map_elt instance, and reset the variable to the 'not set' | |
34260 | + * state. The index i is the index returned by the call to | |
34261 | + * tracing_map_add_var() when the tracing map was set up. The reset | |
34262 | + * essentially makes the variable a read-once variable if it's only | |
34263 | + * accessed using this function. | |
34264 | + * | |
34265 | + * Return: The variable value associated with field i for elt. | |
34266 | + */ | |
34267 | +u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i) | |
34268 | +{ | |
34269 | + elt->var_set[i] = false; | |
34270 | + return (u64)atomic64_read(&elt->vars[i]); | |
34271 | +} | |
34272 | + | |
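Together the four accessors support a begin/end pairing pattern on a single map element. A hypothetical consumer (all names below are illustrative, not part of this patch) might compute a latency like this:

	/* Illustrative only: pair a 'begin' timestamp with an 'end'
	 * event on the same tracing_map element. */
	static int ts_var_idx;	/* hypothetical: saved at map setup */

	static int example_setup(struct tracing_map *map)
	{
		ts_var_idx = tracing_map_add_var(map);	/* reserve a per-elt var */
		return ts_var_idx < 0 ? ts_var_idx : 0;
	}

	static void example_begin(struct tracing_map_elt *elt, u64 now)
	{
		tracing_map_set_var(elt, ts_var_idx, now);
	}

	static u64 example_end(struct tracing_map_elt *elt, u64 now)
	{
		if (!tracing_map_var_set(elt, ts_var_idx))
			return 0;	/* no matching begin recorded */
		/* read-once: clears the set state so each begin pairs
		 * with at most one end */
		return now - tracing_map_read_var_once(elt, ts_var_idx);
	}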
34273 | int tracing_map_cmp_string(void *val_a, void *val_b) | |
34274 | { | |
34275 | char *a = val_a; | |
b3bbd485 JK |
34276 | @@ -170,6 +237,28 @@ int tracing_map_add_sum_field(struct tracing_map *map) |
34277 | return tracing_map_add_field(map, tracing_map_cmp_atomic64); | |
e4b2b4a8 JK |
34278 | } |
34279 | ||
b3bbd485 | 34280 | +/** |
e4b2b4a8 JK |
34281 | + * tracing_map_add_var - Add a field describing a tracing_map var |
34282 | + * @map: The tracing_map | |
34283 | + * | |
34284 | + * Add a var to the map and return the index identifying it in the map | |
34285 | + * and associated tracing_map_elts. This is the index used, for |
34286 | + * instance, to update a var for a particular tracing_map_elt using |
34287 | + * tracing_map_set_var() or to read it via tracing_map_read_var(). |
34288 | + * | |
34289 | + * Return: The index identifying the var in the map and associated | |
34290 | + * tracing_map_elts, or -EINVAL on error. | |
34291 | + */ | |
34292 | +int tracing_map_add_var(struct tracing_map *map) | |
34293 | +{ | |
34294 | + int ret = -EINVAL; | |
34295 | + | |
34296 | + if (map->n_vars < TRACING_MAP_VARS_MAX) | |
34297 | + ret = map->n_vars++; | |
34298 | + | |
34299 | + return ret; | |
34300 | +} | |
34301 | + | |
b3bbd485 | 34302 | /** |
e4b2b4a8 JK |
34303 | * tracing_map_add_key_field - Add a field describing a tracing_map key |
34304 | * @map: The tracing_map | |
b3bbd485 | 34305 | @@ -280,6 +369,11 @@ static void tracing_map_elt_clear(struct tracing_map_elt *elt) |
e4b2b4a8 JK |
34306 | if (elt->fields[i].cmp_fn == tracing_map_cmp_atomic64) |
34307 | atomic64_set(&elt->fields[i].sum, 0); | |
34308 | ||
34309 | + for (i = 0; i < elt->map->n_vars; i++) { | |
34310 | + atomic64_set(&elt->vars[i], 0); | |
34311 | + elt->var_set[i] = false; | |
34312 | + } | |
34313 | + | |
34314 | if (elt->map->ops && elt->map->ops->elt_clear) | |
34315 | elt->map->ops->elt_clear(elt); | |
34316 | } | |
b3bbd485 | 34317 | @@ -306,6 +400,8 @@ static void tracing_map_elt_free(struct tracing_map_elt *elt) |
e4b2b4a8 JK |
34318 | if (elt->map->ops && elt->map->ops->elt_free) |
34319 | elt->map->ops->elt_free(elt); | |
34320 | kfree(elt->fields); | |
34321 | + kfree(elt->vars); | |
34322 | + kfree(elt->var_set); | |
34323 | kfree(elt->key); | |
34324 | kfree(elt); | |
34325 | } | |
b3bbd485 | 34326 | @@ -333,6 +429,18 @@ static struct tracing_map_elt *tracing_map_elt_alloc(struct tracing_map *map) |
e4b2b4a8 JK |
34327 | goto free; |
34328 | } | |
34329 | ||
34330 | + elt->vars = kcalloc(map->n_vars, sizeof(*elt->vars), GFP_KERNEL); | |
34331 | + if (!elt->vars) { | |
34332 | + err = -ENOMEM; | |
34333 | + goto free; | |
34334 | + } | |
34335 | + | |
34336 | + elt->var_set = kcalloc(map->n_vars, sizeof(*elt->var_set), GFP_KERNEL); | |
34337 | + if (!elt->var_set) { | |
34338 | + err = -ENOMEM; | |
34339 | + goto free; | |
34340 | + } | |
34341 | + | |
34342 | tracing_map_elt_init_fields(elt); | |
34343 | ||
34344 | if (map->ops && map->ops->elt_alloc) { | |
b3bbd485 | 34345 | @@ -414,7 +522,9 @@ static inline struct tracing_map_elt * |
e4b2b4a8 JK |
34346 | __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) |
34347 | { | |
34348 | u32 idx, key_hash, test_key; | |
34349 | + int dup_try = 0; | |
34350 | struct tracing_map_entry *entry; | |
34351 | + struct tracing_map_elt *val; | |
34352 | ||
34353 | key_hash = jhash(key, map->key_size, 0); | |
34354 | if (key_hash == 0) | |
b3bbd485 | 34355 | @@ -426,10 +536,33 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) |
e4b2b4a8 JK |
34356 | entry = TRACING_MAP_ENTRY(map->map, idx); |
34357 | test_key = entry->key; | |
34358 | ||
34359 | - if (test_key && test_key == key_hash && entry->val && | |
34360 | - keys_match(key, entry->val->key, map->key_size)) { | |
34361 | - atomic64_inc(&map->hits); | |
34362 | - return entry->val; | |
34363 | + if (test_key && test_key == key_hash) { | |
34364 | + val = READ_ONCE(entry->val); | |
34365 | + if (val && | |
34366 | + keys_match(key, val->key, map->key_size)) { | |
34367 | + if (!lookup_only) | |
34368 | + atomic64_inc(&map->hits); | |
34369 | + return val; | |
34370 | + } else if (unlikely(!val)) { | |
34371 | + /* | |
34372 | + * The key is present, but val (the pointer to the |
34373 | + * elt struct) is still NULL, which means some other |
34374 | + * thread is in the process of inserting an |
34375 | + * element. | |
34376 | + * | |
34377 | + * On top of that, its key_hash is the same as the |
34378 | + * one being inserted right now, so it's |
34379 | + * possible that the element has the same |
34380 | + * key as well. | |
34381 | + */ | |
34382 | + | |
34383 | + dup_try++; | |
34384 | + if (dup_try > map->map_size) { | |
34385 | + atomic64_inc(&map->drops); | |
34386 | + break; | |
34387 | + } | |
34388 | + continue; | |
34389 | + } | |
34390 | } | |
34391 | ||
34392 | if (!test_key) { | |
b3bbd485 | 34393 | @@ -451,6 +584,13 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) |
e4b2b4a8 JK |
34394 | atomic64_inc(&map->hits); |
34395 | ||
34396 | return entry->val; | |
34397 | + } else { | |
34398 | + /* | |
34399 | + * cmpxchg() failed. Loop around once | |
34400 | + * more to check what key was inserted. | |
34401 | + */ | |
34402 | + dup_try++; | |
34403 | + continue; | |
34404 | } | |
34405 | } | |
34406 | ||
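The retry logic is needed because insertion publishes the key hash before the element pointer: a concurrent reader can observe a claimed slot whose val is still NULL. A single-slot userspace analogue of the reader side, as a sketch (C11 atomics standing in for READ_ONCE()/cmpxchg(); not kernel code):

	#include <stdatomic.h>
	#include <stddef.h>

	struct slot {
		_Atomic unsigned int key;	/* 0 = free; else a key hash */
		_Atomic(void *) val;		/* filled in after key is claimed */
	};

	/* A matching hash with a NULL val means an insert is mid-flight
	 * on another thread; retry a bounded number of times, then give
	 * up -- mirroring dup_try and map->drops above. */
	static void *lookup(struct slot *s, unsigned int hash,
			    unsigned int max_tries)
	{
		unsigned int tries = 0;

		while (atomic_load(&s->key) == hash) {
			void *v = atomic_load(&s->val);

			if (v)
				return v;	/* insert completed */
			if (++tries > max_tries)
				return NULL;	/* give up: count a drop */
		}
		return NULL;			/* empty or different key */
	}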
b3bbd485 | 34407 | @@ -815,67 +955,15 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) |
e4b2b4a8 JK |
34408 | return sort_entry; |
34409 | } | |
34410 | ||
34411 | -static struct tracing_map_elt *copy_elt(struct tracing_map_elt *elt) | |
34412 | -{ | |
34413 | - struct tracing_map_elt *dup_elt; | |
34414 | - unsigned int i; | |
34415 | - | |
34416 | - dup_elt = tracing_map_elt_alloc(elt->map); | |
34417 | - if (IS_ERR(dup_elt)) | |
34418 | - return NULL; | |
34419 | - | |
34420 | - if (elt->map->ops && elt->map->ops->elt_copy) | |
34421 | - elt->map->ops->elt_copy(dup_elt, elt); | |
34422 | - | |
34423 | - dup_elt->private_data = elt->private_data; | |
34424 | - memcpy(dup_elt->key, elt->key, elt->map->key_size); | |
34425 | - | |
34426 | - for (i = 0; i < elt->map->n_fields; i++) { | |
34427 | - atomic64_set(&dup_elt->fields[i].sum, | |
34428 | - atomic64_read(&elt->fields[i].sum)); | |
34429 | - dup_elt->fields[i].cmp_fn = elt->fields[i].cmp_fn; | |
34430 | - } | |
34431 | - | |
34432 | - return dup_elt; | |
34433 | -} | |
34434 | - | |
34435 | -static int merge_dup(struct tracing_map_sort_entry **sort_entries, | |
34436 | - unsigned int target, unsigned int dup) | |
34437 | -{ | |
34438 | - struct tracing_map_elt *target_elt, *elt; | |
34439 | - bool first_dup = (target - dup) == 1; | |
34440 | - int i; | |
34441 | - | |
34442 | - if (first_dup) { | |
34443 | - elt = sort_entries[target]->elt; | |
34444 | - target_elt = copy_elt(elt); | |
34445 | - if (!target_elt) | |
34446 | - return -ENOMEM; | |
34447 | - sort_entries[target]->elt = target_elt; | |
34448 | - sort_entries[target]->elt_copied = true; | |
34449 | - } else | |
34450 | - target_elt = sort_entries[target]->elt; | |
34451 | - | |
34452 | - elt = sort_entries[dup]->elt; | |
34453 | - | |
34454 | - for (i = 0; i < elt->map->n_fields; i++) | |
34455 | - atomic64_add(atomic64_read(&elt->fields[i].sum), | |
34456 | - &target_elt->fields[i].sum); | |
34457 | - | |
34458 | - sort_entries[dup]->dup = true; | |
34459 | - | |
34460 | - return 0; | |
34461 | -} | |
34462 | - | |
34463 | -static int merge_dups(struct tracing_map_sort_entry **sort_entries, | |
34464 | +static void detect_dups(struct tracing_map_sort_entry **sort_entries, | |
34465 | int n_entries, unsigned int key_size) | |
34466 | { | |
34467 | unsigned int dups = 0, total_dups = 0; | |
34468 | - int err, i, j; | |
34469 | + int i; | |
34470 | void *key; | |
34471 | ||
34472 | if (n_entries < 2) | |
34473 | - return total_dups; | |
34474 | + return; | |
1a6e0f06 | 34475 | |
e4b2b4a8 JK |
34476 | sort(sort_entries, n_entries, sizeof(struct tracing_map_sort_entry *), |
34477 | (int (*)(const void *, const void *))cmp_entries_dup, NULL); | |
b3bbd485 | 34478 | @@ -884,30 +972,14 @@ static int merge_dups(struct tracing_map_sort_entry **sort_entries, |
e4b2b4a8 JK |
34479 | for (i = 1; i < n_entries; i++) { |
34480 | if (!memcmp(sort_entries[i]->key, key, key_size)) { | |
34481 | dups++; total_dups++; | |
34482 | - err = merge_dup(sort_entries, i - dups, i); | |
34483 | - if (err) | |
34484 | - return err; | |
34485 | continue; | |
34486 | } | |
34487 | key = sort_entries[i]->key; | |
34488 | dups = 0; | |
34489 | } | |
1a6e0f06 | 34490 | |
e4b2b4a8 JK |
34491 | - if (!total_dups) |
34492 | - return total_dups; | |
34493 | - | |
34494 | - for (i = 0, j = 0; i < n_entries; i++) { | |
34495 | - if (!sort_entries[i]->dup) { | |
34496 | - sort_entries[j] = sort_entries[i]; | |
34497 | - if (j++ != i) | |
34498 | - sort_entries[i] = NULL; | |
34499 | - } else { | |
34500 | - destroy_sort_entry(sort_entries[i]); | |
34501 | - sort_entries[i] = NULL; | |
34502 | - } | |
34503 | - } | |
34504 | - | |
34505 | - return total_dups; | |
34506 | + WARN_ONCE(total_dups > 0, | |
34507 | + "Duplicates detected: %d\n", total_dups); | |
1a6e0f06 JK |
34508 | } |
34509 | ||
e4b2b4a8 | 34510 | static bool is_key(struct tracing_map *map, unsigned int field_idx) |
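Since the insert path above can no longer create two elements for the same key, duplicate sort entries now indicate a bug rather than an expected condition. The merge machinery is therefore reduced to the WARN_ONCE() above, and the copy helpers it needed, including the elt_copy callback removed from tracing_map.h further down, go away with it.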
b3bbd485 | 34511 | @@ -1033,10 +1105,7 @@ int tracing_map_sort_entries(struct tracing_map *map, |
e4b2b4a8 JK |
34512 | return 1; |
34513 | } | |
34514 | ||
34515 | - ret = merge_dups(entries, n_entries, map->key_size); | |
34516 | - if (ret < 0) | |
34517 | - goto free; | |
34518 | - n_entries -= ret; | |
34519 | + detect_dups(entries, n_entries, map->key_size); | |
34520 | ||
34521 | if (is_key(map, sort_keys[0].field_idx)) | |
34522 | cmp_entries_fn = cmp_entries_key; | |
b3bbd485 JK |
34523 | diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h |
34524 | index ab0ca77331d0..053eb92b2d31 100644 | |
34525 | --- a/kernel/trace/tracing_map.h | |
34526 | +++ b/kernel/trace/tracing_map.h | |
e4b2b4a8 JK |
34527 | @@ -6,10 +6,11 @@ |
34528 | #define TRACING_MAP_BITS_MAX 17 | |
34529 | #define TRACING_MAP_BITS_MIN 7 | |
34530 | ||
34531 | -#define TRACING_MAP_KEYS_MAX 2 | |
34532 | +#define TRACING_MAP_KEYS_MAX 3 | |
34533 | #define TRACING_MAP_VALS_MAX 3 | |
34534 | #define TRACING_MAP_FIELDS_MAX (TRACING_MAP_KEYS_MAX + \ | |
34535 | TRACING_MAP_VALS_MAX) | |
34536 | +#define TRACING_MAP_VARS_MAX 16 | |
34537 | #define TRACING_MAP_SORT_KEYS_MAX 2 | |
34538 | ||
34539 | typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); | |
b3bbd485 | 34540 | @@ -137,6 +138,8 @@ struct tracing_map_field { |
e4b2b4a8 JK |
34541 | struct tracing_map_elt { |
34542 | struct tracing_map *map; | |
34543 | struct tracing_map_field *fields; | |
34544 | + atomic64_t *vars; | |
34545 | + bool *var_set; | |
34546 | void *key; | |
34547 | void *private_data; | |
34548 | }; | |
b3bbd485 | 34549 | @@ -192,6 +195,7 @@ struct tracing_map { |
e4b2b4a8 JK |
34550 | int key_idx[TRACING_MAP_KEYS_MAX]; |
34551 | unsigned int n_keys; | |
34552 | struct tracing_map_sort_key sort_key; | |
34553 | + unsigned int n_vars; | |
34554 | atomic64_t hits; | |
34555 | atomic64_t drops; | |
34556 | }; | |
b3bbd485 | 34557 | @@ -215,11 +219,6 @@ struct tracing_map { |
e4b2b4a8 JK |
34558 | * Element allocation occurs before tracing begins, when the |
34559 | * tracing_map_init() call is made by client code. | |
34560 | * | |
34561 | - * @elt_copy: At certain points in the lifetime of an element, it may | |
34562 | - * need to be copied. The copy should include a copy of the | |
34563 | - * client-allocated data, which can be copied into the 'to' | |
34564 | - * element from the 'from' element. | |
34565 | - * | |
34566 | * @elt_free: When a tracing_map_elt is freed, this function is called | |
34567 | * and allows client-allocated per-element data to be freed. | |
34568 | * | |
b3bbd485 | 34569 | @@ -233,8 +232,6 @@ struct tracing_map { |
e4b2b4a8 JK |
34570 | */ |
34571 | struct tracing_map_ops { | |
34572 | int (*elt_alloc)(struct tracing_map_elt *elt); | |
34573 | - void (*elt_copy)(struct tracing_map_elt *to, | |
34574 | - struct tracing_map_elt *from); | |
34575 | void (*elt_free)(struct tracing_map_elt *elt); | |
34576 | void (*elt_clear)(struct tracing_map_elt *elt); | |
34577 | void (*elt_init)(struct tracing_map_elt *elt); | |
b3bbd485 | 34578 | @@ -248,6 +245,7 @@ tracing_map_create(unsigned int map_bits, |
e4b2b4a8 JK |
34579 | extern int tracing_map_init(struct tracing_map *map); |
34580 | ||
34581 | extern int tracing_map_add_sum_field(struct tracing_map *map); | |
34582 | +extern int tracing_map_add_var(struct tracing_map *map); | |
34583 | extern int tracing_map_add_key_field(struct tracing_map *map, | |
34584 | unsigned int offset, | |
34585 | tracing_map_cmp_fn_t cmp_fn); | |
b3bbd485 | 34586 | @@ -267,7 +265,13 @@ extern int tracing_map_cmp_none(void *val_a, void *val_b); |
e4b2b4a8 JK |
34587 | |
34588 | extern void tracing_map_update_sum(struct tracing_map_elt *elt, | |
34589 | unsigned int i, u64 n); | |
34590 | +extern void tracing_map_set_var(struct tracing_map_elt *elt, | |
34591 | + unsigned int i, u64 n); | |
34592 | +extern bool tracing_map_var_set(struct tracing_map_elt *elt, unsigned int i); | |
34593 | extern u64 tracing_map_read_sum(struct tracing_map_elt *elt, unsigned int i); | |
34594 | +extern u64 tracing_map_read_var(struct tracing_map_elt *elt, unsigned int i); | |
34595 | +extern u64 tracing_map_read_var_once(struct tracing_map_elt *elt, unsigned int i); | |
34596 | + | |
34597 | extern void tracing_map_set_field_descr(struct tracing_map *map, | |
34598 | unsigned int i, | |
34599 | unsigned int key_offset, | |
b3bbd485 JK |
34600 | diff --git a/kernel/user.c b/kernel/user.c |
34601 | index 00281add65b2..f4cf1841f2fd 100644 | |
34602 | --- a/kernel/user.c | |
34603 | +++ b/kernel/user.c | |
34604 | @@ -162,11 +162,11 @@ void free_uid(struct user_struct *up) | |
1a6e0f06 JK |
34605 | if (!up) |
34606 | return; | |
34607 | ||
34608 | - local_irq_save(flags); | |
34609 | + local_irq_save_nort(flags); | |
34610 | if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) | |
34611 | free_user(up, flags); | |
34612 | else | |
34613 | - local_irq_restore(flags); | |
34614 | + local_irq_restore_nort(flags); | |
34615 | } | |
34616 | ||
34617 | struct user_struct *alloc_uid(kuid_t uid) | |
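The _nort ("no RT") variants come from the RT series' IRQ helper set. On RT, uidhash_lock is a sleeping spinlock that must not be taken with interrupts hard-disabled, so the _nort forms compile away there and fall back to the plain operations otherwise. A simplified sketch of the assumed definitions (the real ones carry extra annotations):

	#ifdef CONFIG_PREEMPT_RT_FULL
	/* RT: leave interrupts alone; the lock taken below is a
	 * sleeping lock anyway */
	# define local_irq_save_nort(flags)	do { local_save_flags(flags); } while (0)
	# define local_irq_restore_nort(flags)	do { (void)(flags); } while (0)
	#else
	/* !RT: identical to the code being replaced */
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	#endif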
b3bbd485 JK |
34618 | diff --git a/kernel/watchdog.c b/kernel/watchdog.c |
34619 | index 087994b23f8b..ea4c09109ce4 100644 | |
34620 | --- a/kernel/watchdog.c | |
34621 | +++ b/kernel/watchdog.c | |
34622 | @@ -462,7 +462,7 @@ static void watchdog_enable(unsigned int cpu) | |
e4b2b4a8 JK |
34623 | * Start the timer first to prevent the NMI watchdog triggering |
34624 | * before the timer has a chance to fire. | |
34625 | */ | |
34626 | - hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
34627 | + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); | |
34628 | hrtimer->function = watchdog_timer_fn; | |
34629 | hrtimer_start(hrtimer, ns_to_ktime(sample_period), | |
34630 | HRTIMER_MODE_REL_PINNED); | |
b3bbd485 JK |
34631 | diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c |
34632 | index 4ece6028007a..210dccc57c04 100644 | |
34633 | --- a/kernel/watchdog_hld.c | |
34634 | +++ b/kernel/watchdog_hld.c | |
34635 | @@ -24,6 +24,8 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); | |
e4b2b4a8 JK |
34636 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
34637 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | |
34638 | static DEFINE_PER_CPU(struct perf_event *, dead_event); | |
1a6e0f06 JK |
34639 | +static DEFINE_RAW_SPINLOCK(watchdog_output_lock); |
34640 | + | |
e4b2b4a8 JK |
34641 | static struct cpumask dead_events_mask; |
34642 | ||
34643 | static unsigned long hardlockup_allcpu_dumped; | |
b3bbd485 | 34644 | @@ -134,6 +136,13 @@ static void watchdog_overflow_callback(struct perf_event *event, |
1a6e0f06 JK |
34645 | /* only print hardlockups once */ |
34646 | if (__this_cpu_read(hard_watchdog_warn) == true) | |
34647 | return; | |
34648 | + /* | |
34649 | + * If early-printk is enabled then make sure we do not | |
34650 | + * lock up in printk() and kill console logging: | |
34651 | + */ | |
34652 | + printk_kill(); | |
34653 | + | |
34654 | + raw_spin_lock(&watchdog_output_lock); | |
34655 | ||
34656 | pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu); | |
34657 | print_modules(); | |
b3bbd485 | 34658 | @@ -151,6 +160,7 @@ static void watchdog_overflow_callback(struct perf_event *event, |
1a6e0f06 JK |
34659 | !test_and_set_bit(0, &hardlockup_allcpu_dumped)) |
34660 | trigger_allbutself_cpu_backtrace(); | |
34661 | ||
34662 | + raw_spin_unlock(&watchdog_output_lock); | |
34663 | if (hardlockup_panic) | |
34664 | nmi_panic(regs, "Hard LOCKUP"); | |
34665 | ||
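Two RT-specific guards are added here: printk_kill() (an RT-series helper) diverts printk() to the lock-free early-console path so the NMI handler cannot deadlock on printk's internal locks, and watchdog_output_lock is deliberately a raw spinlock, since a normal spinlock is a sleeping lock on RT and cannot be taken from NMI context; it also keeps multi-line lockup reports from two CPUs from interleaving.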
b3bbd485 JK |
34666 | diff --git a/kernel/workqueue.c b/kernel/workqueue.c |
34667 | index 08bc551976b2..76297cce5602 100644 | |
34668 | --- a/kernel/workqueue.c | |
34669 | +++ b/kernel/workqueue.c | |
e4b2b4a8 | 34670 | @@ -49,6 +49,8 @@ |
1a6e0f06 JK |
34671 | #include <linux/moduleparam.h> |
34672 | #include <linux/uaccess.h> | |
e4b2b4a8 | 34673 | #include <linux/nmi.h> |
1a6e0f06 JK |
34674 | +#include <linux/locallock.h> |
34675 | +#include <linux/delay.h> | |
34676 | ||
34677 | #include "workqueue_internal.h" | |
34678 | ||
b3bbd485 | 34679 | @@ -123,11 +125,16 @@ enum { |
1a6e0f06 JK |
34680 | * cpu or grabbing pool->lock is enough for read access. If |
34681 | * POOL_DISASSOCIATED is set, it's identical to L. | |
34682 | * | |
34683 | + * On RT we need the extra protection via rt_lock_idle_list() for | |
34684 | + * the list manipulations against read access from | |
34685 | + * wq_worker_sleeping(). All other places are nicely serialized via | |
34686 | + * pool->lock. | |
34687 | + * | |
34688 | * A: pool->attach_mutex protected. | |
34689 | * | |
34690 | * PL: wq_pool_mutex protected. | |
34691 | * | |
34692 | - * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. | |
34693 | + * PR: wq_pool_mutex protected for writes. RCU protected for reads. | |
34694 | * | |
34695 | * PW: wq_pool_mutex and wq->mutex protected for writes. Either for reads. | |
34696 | * | |
b3bbd485 | 34697 | @@ -136,7 +143,7 @@ enum { |
1a6e0f06 JK |
34698 | * |
34699 | * WQ: wq->mutex protected. | |
34700 | * | |
34701 | - * WR: wq->mutex protected for writes. Sched-RCU protected for reads. | |
34702 | + * WR: wq->mutex protected for writes. RCU protected for reads. | |
34703 | * | |
34704 | * MD: wq_mayday_lock protected. | |
34705 | */ | |
b3bbd485 | 34706 | @@ -186,7 +193,7 @@ struct worker_pool { |
1a6e0f06 JK |
34707 | atomic_t nr_running ____cacheline_aligned_in_smp; |
34708 | ||
34709 | /* | |
34710 | - * Destruction of pool is sched-RCU protected to allow dereferences | |
34711 | + * Destruction of pool is RCU protected to allow dereferences | |
34712 | * from get_work_pool(). | |
34713 | */ | |
34714 | struct rcu_head rcu; | |
b3bbd485 | 34715 | @@ -215,7 +222,7 @@ struct pool_workqueue { |
1a6e0f06 JK |
34716 | /* |
34717 | * Release of unbound pwq is punted to system_wq. See put_pwq() | |
34718 | * and pwq_unbound_release_workfn() for details. pool_workqueue | |
34719 | - * itself is also sched-RCU protected so that the first pwq can be | |
34720 | + * itself is also RCU protected so that the first pwq can be | |
34721 | * determined without grabbing wq->mutex. | |
34722 | */ | |
34723 | struct work_struct unbound_release_work; | |
b3bbd485 | 34724 | @@ -352,6 +359,8 @@ EXPORT_SYMBOL_GPL(system_power_efficient_wq); |
1a6e0f06 JK |
34725 | struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly; |
34726 | EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); | |
34727 | ||
34728 | +static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock); | |
34729 | + | |
34730 | static int worker_thread(void *__worker); | |
34731 | static void workqueue_sysfs_unregister(struct workqueue_struct *wq); | |
34732 | ||
b3bbd485 | 34733 | @@ -359,20 +368,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); |
1a6e0f06 JK |
34734 | #include <trace/events/workqueue.h> |
34735 | ||
34736 | #define assert_rcu_or_pool_mutex() \ | |
34737 | - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ | |
34738 | + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ | |
34739 | !lockdep_is_held(&wq_pool_mutex), \ | |
34740 | - "sched RCU or wq_pool_mutex should be held") | |
34741 | + "RCU or wq_pool_mutex should be held") | |
34742 | ||
34743 | #define assert_rcu_or_wq_mutex(wq) \ | |
34744 | - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ | |
34745 | + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ | |
34746 | !lockdep_is_held(&wq->mutex), \ | |
34747 | - "sched RCU or wq->mutex should be held") | |
34748 | + "RCU or wq->mutex should be held") | |
34749 | ||
34750 | #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ | |
34751 | - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ | |
34752 | + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ | |
34753 | !lockdep_is_held(&wq->mutex) && \ | |
34754 | !lockdep_is_held(&wq_pool_mutex), \ | |
34755 | - "sched RCU, wq->mutex or wq_pool_mutex should be held") | |
34756 | + "RCU, wq->mutex or wq_pool_mutex should be held") | |
34757 | ||
34758 | #define for_each_cpu_worker_pool(pool, cpu) \ | |
34759 | for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ | |
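The rule applied throughout this file is that readers and the deferred-free side must use matching RCU flavors: since the read sides become preemptible rcu_read_lock() sections, every call_rcu_sched() on these objects must become call_rcu(). The pairing in miniature (generic RCU usage, not code from this patch):

	struct foo {
		int val;
		struct rcu_head rcu;
	};

	static DEFINE_MUTEX(foo_mutex);
	static struct foo __rcu *global_foo;

	static int read_foo(void)
	{
		struct foo *f;
		int v = -1;

		rcu_read_lock();		/* preemptible on RT */
		f = rcu_dereference(global_foo);
		if (f)
			v = f->val;
		rcu_read_unlock();
		return v;
	}

	static void free_foo(struct rcu_head *head)
	{
		kfree(container_of(head, struct foo, rcu));
	}

	static void replace_foo(struct foo *nf)
	{
		struct foo *old;

		mutex_lock(&foo_mutex);
		old = rcu_dereference_protected(global_foo,
						lockdep_is_held(&foo_mutex));
		rcu_assign_pointer(global_foo, nf);
		mutex_unlock(&foo_mutex);
		if (old)
			call_rcu(&old->rcu, free_foo);	/* same flavor as readers */
	}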
b3bbd485 | 34760 | @@ -384,7 +393,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); |
1a6e0f06 JK |
34761 | * @pool: iteration cursor |
34762 | * @pi: integer used for iteration | |
34763 | * | |
34764 | - * This must be called either with wq_pool_mutex held or sched RCU read | |
34765 | + * This must be called either with wq_pool_mutex held or RCU read | |
34766 | * locked. If the pool needs to be used beyond the locking in effect, the | |
34767 | * caller is responsible for guaranteeing that the pool stays online. | |
34768 | * | |
b3bbd485 | 34769 | @@ -416,7 +425,7 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); |
1a6e0f06 JK |
34770 | * @pwq: iteration cursor |
34771 | * @wq: the target workqueue | |
34772 | * | |
34773 | - * This must be called either with wq->mutex held or sched RCU read locked. | |
34774 | + * This must be called either with wq->mutex held or RCU read locked. | |
34775 | * If the pwq needs to be used beyond the locking in effect, the caller is | |
34776 | * responsible for guaranteeing that the pwq stays online. | |
34777 | * | |
b3bbd485 | 34778 | @@ -428,6 +437,31 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); |
1a6e0f06 JK |
34779 | if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ |
34780 | else | |
34781 | ||
34782 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
34783 | +static inline void rt_lock_idle_list(struct worker_pool *pool) | |
34784 | +{ | |
34785 | + preempt_disable(); | |
34786 | +} | |
34787 | +static inline void rt_unlock_idle_list(struct worker_pool *pool) | |
34788 | +{ | |
34789 | + preempt_enable(); | |
34790 | +} | |
34791 | +static inline void sched_lock_idle_list(struct worker_pool *pool) { } | |
34792 | +static inline void sched_unlock_idle_list(struct worker_pool *pool) { } | |
34793 | +#else | |
34794 | +static inline void rt_lock_idle_list(struct worker_pool *pool) { } | |
34795 | +static inline void rt_unlock_idle_list(struct worker_pool *pool) { } | |
34796 | +static inline void sched_lock_idle_list(struct worker_pool *pool) | |
34797 | +{ | |
34798 | + spin_lock_irq(&pool->lock); | |
34799 | +} | |
34800 | +static inline void sched_unlock_idle_list(struct worker_pool *pool) | |
34801 | +{ | |
34802 | + spin_unlock_irq(&pool->lock); | |
34803 | +} | |
34804 | +#endif | |
34805 | + | |
34806 | + | |
34807 | #ifdef CONFIG_DEBUG_OBJECTS_WORK | |
34808 | ||
34809 | static struct debug_obj_descr work_debug_descr; | |
b3bbd485 | 34810 | @@ -552,7 +586,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) |
1a6e0f06 JK |
34811 | * @wq: the target workqueue |
34812 | * @node: the node ID | |
34813 | * | |
34814 | - * This must be called with any of wq_pool_mutex, wq->mutex or sched RCU | |
34815 | + * This must be called with any of wq_pool_mutex, wq->mutex or RCU | |
34816 | * read locked. | |
34817 | * If the pwq needs to be used beyond the locking in effect, the caller is | |
34818 | * responsible for guaranteeing that the pwq stays online. | |
b3bbd485 | 34819 | @@ -696,8 +730,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) |
1a6e0f06 JK |
34820 | * @work: the work item of interest |
34821 | * | |
34822 | * Pools are created and destroyed under wq_pool_mutex, and allows read | |
34823 | - * access under sched-RCU read lock. As such, this function should be | |
34824 | - * called under wq_pool_mutex or with preemption disabled. | |
34825 | + * access under RCU read lock. As such, this function should be | |
34826 | + * called under wq_pool_mutex or inside an rcu_read_lock() region. |
34827 | * | |
34828 | * All fields of the returned pool are accessible as long as the above | |
34829 | * mentioned locking is in effect. If the returned pool needs to be used | |
b3bbd485 | 34830 | @@ -834,50 +868,45 @@ static struct worker *first_idle_worker(struct worker_pool *pool) |
1a6e0f06 JK |
34831 | */ |
34832 | static void wake_up_worker(struct worker_pool *pool) | |
34833 | { | |
34834 | - struct worker *worker = first_idle_worker(pool); | |
34835 | + struct worker *worker; | |
34836 | + | |
34837 | + rt_lock_idle_list(pool); | |
34838 | + | |
34839 | + worker = first_idle_worker(pool); | |
34840 | ||
34841 | if (likely(worker)) | |
34842 | wake_up_process(worker->task); | |
34843 | + | |
34844 | + rt_unlock_idle_list(pool); | |
34845 | } | |
34846 | ||
34847 | /** | |
34848 | - * wq_worker_waking_up - a worker is waking up | |
34849 | + * wq_worker_running - a worker is running again | |
34850 | * @task: task waking up | |
34851 | - * @cpu: CPU @task is waking up to | |
b3bbd485 | 34852 | * |
1a6e0f06 JK |
34853 | - * This function is called during try_to_wake_up() when a worker is |
34854 | - * being awoken. | |
b3bbd485 | 34855 | - * |
1a6e0f06 JK |
34856 | - * CONTEXT: |
34857 | - * spin_lock_irq(rq->lock) | |
34858 | + * This function is called when a worker returns from schedule() | |
34859 | */ | |
34860 | -void wq_worker_waking_up(struct task_struct *task, int cpu) | |
34861 | +void wq_worker_running(struct task_struct *task) | |
34862 | { | |
34863 | struct worker *worker = kthread_data(task); | |
34864 | ||
34865 | - if (!(worker->flags & WORKER_NOT_RUNNING)) { | |
34866 | - WARN_ON_ONCE(worker->pool->cpu != cpu); | |
34867 | + if (!worker->sleeping) | |
34868 | + return; | |
34869 | + if (!(worker->flags & WORKER_NOT_RUNNING)) | |
34870 | atomic_inc(&worker->pool->nr_running); | |
34871 | - } | |
34872 | + worker->sleeping = 0; | |
34873 | } | |
34874 | ||
34875 | /** | |
34876 | * wq_worker_sleeping - a worker is going to sleep | |
34877 | * @task: task going to sleep | |
34878 | * | |
34879 | - * This function is called during schedule() when a busy worker is | |
34880 | - * going to sleep. Worker on the same cpu can be woken up by | |
34881 | - * returning pointer to its task. | |
34882 | - * | |
34883 | - * CONTEXT: | |
34884 | - * spin_lock_irq(rq->lock) | |
34885 | - * | |
34886 | - * Return: | |
34887 | - * Worker task on @cpu to wake up, %NULL if none. | |
34888 | + * This function is called from schedule() when a busy worker is | |
34889 | + * going to sleep. | |
34890 | */ | |
34891 | -struct task_struct *wq_worker_sleeping(struct task_struct *task) | |
34892 | +void wq_worker_sleeping(struct task_struct *task) | |
34893 | { | |
34894 | - struct worker *worker = kthread_data(task), *to_wakeup = NULL; | |
34895 | + struct worker *worker = kthread_data(task); | |
34896 | struct worker_pool *pool; | |
34897 | ||
34898 | /* | |
b3bbd485 | 34899 | @@ -886,29 +915,26 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task) |
1a6e0f06 JK |
34900 | * checking NOT_RUNNING. |
34901 | */ | |
34902 | if (worker->flags & WORKER_NOT_RUNNING) | |
34903 | - return NULL; | |
34904 | + return; | |
34905 | ||
34906 | pool = worker->pool; | |
34907 | ||
34908 | - /* this can only happen on the local cpu */ | |
34909 | - if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) | |
34910 | - return NULL; | |
34911 | + if (WARN_ON_ONCE(worker->sleeping)) | |
34912 | + return; | |
34913 | + | |
34914 | + worker->sleeping = 1; | |
34915 | ||
34916 | /* | |
34917 | * The counterpart of the following dec_and_test, implied mb, | |
34918 | * worklist not empty test sequence is in insert_work(). | |
34919 | * Please read comment there. | |
34920 | - * | |
34921 | - * NOT_RUNNING is clear. This means that we're bound to and | |
34922 | - * running on the local cpu w/ rq lock held and preemption | |
34923 | - * disabled, which in turn means that none else could be | |
34924 | - * manipulating idle_list, so dereferencing idle_list without pool | |
34925 | - * lock is safe. | |
34926 | */ | |
34927 | if (atomic_dec_and_test(&pool->nr_running) && | |
34928 | - !list_empty(&pool->worklist)) | |
34929 | - to_wakeup = first_idle_worker(pool); | |
34930 | - return to_wakeup ? to_wakeup->task : NULL; | |
34931 | + !list_empty(&pool->worklist)) { | |
34932 | + sched_lock_idle_list(pool); | |
34933 | + wake_up_worker(pool); | |
34934 | + sched_unlock_idle_list(pool); | |
34935 | + } | |
34936 | } | |
34937 | ||
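The reworked hooks move the wake-up out of the scheduler core: instead of try_to_wake_up() and schedule() peeking at the idle list with rq->lock held, schedule() simply brackets itself with the two calls, both running in normal preemptible task context. A sketch of the expected sched/core.c side (assumed from the new API; not shown in this hunk):

	static inline void sched_submit_work(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);	/* may wake an idle worker */
	}

	static inline void sched_update_worker(struct task_struct *tsk)
	{
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);		/* re-account nr_running */
	}

	asmlinkage __visible void __sched schedule(void)
	{
		struct task_struct *tsk = current;

		sched_submit_work(tsk);
		do {
			preempt_disable();
			__schedule(false);
			sched_preempt_enable_no_resched();
		} while (need_resched());
		sched_update_worker(tsk);
	}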
34938 | /** | |
b3bbd485 | 34939 | @@ -1102,12 +1128,14 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) |
1a6e0f06 JK |
34940 | { |
34941 | if (pwq) { | |
34942 | /* | |
34943 | - * As both pwqs and pools are sched-RCU protected, the | |
34944 | + * As both pwqs and pools are RCU protected, the | |
34945 | * following lock operations are safe. | |
34946 | */ | |
34947 | - spin_lock_irq(&pwq->pool->lock); | |
c7c16703 | 34948 | + rcu_read_lock(); |
1a6e0f06 JK |
34949 | + local_spin_lock_irq(pendingb_lock, &pwq->pool->lock); |
34950 | put_pwq(pwq); | |
34951 | - spin_unlock_irq(&pwq->pool->lock); | |
34952 | + local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock); | |
c7c16703 | 34953 | + rcu_read_unlock(); |
1a6e0f06 JK |
34954 | } |
34955 | } | |
34956 | ||
b3bbd485 | 34957 | @@ -1211,7 +1239,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, |
1a6e0f06 JK |
34958 | struct worker_pool *pool; |
34959 | struct pool_workqueue *pwq; | |
34960 | ||
34961 | - local_irq_save(*flags); | |
34962 | + local_lock_irqsave(pendingb_lock, *flags); | |
34963 | ||
34964 | /* try to steal the timer if it exists */ | |
34965 | if (is_dwork) { | |
b3bbd485 | 34966 | @@ -1230,6 +1258,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, |
1a6e0f06 JK |
34967 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) |
34968 | return 0; | |
34969 | ||
34970 | + rcu_read_lock(); | |
34971 | /* | |
34972 | * The queueing is in progress, or it is already queued. Try to | |
34973 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | |
b3bbd485 | 34974 | @@ -1268,14 +1297,16 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, |
1a6e0f06 JK |
34975 | set_work_pool_and_keep_pending(work, pool->id); |
34976 | ||
34977 | spin_unlock(&pool->lock); | |
34978 | + rcu_read_unlock(); | |
34979 | return 1; | |
34980 | } | |
34981 | spin_unlock(&pool->lock); | |
34982 | fail: | |
34983 | - local_irq_restore(*flags); | |
34984 | + rcu_read_unlock(); | |
34985 | + local_unlock_irqrestore(pendingb_lock, *flags); | |
34986 | if (work_is_canceling(work)) | |
34987 | return -ENOENT; | |
34988 | - cpu_relax(); | |
34989 | + cpu_chill(); | |
34990 | return -EAGAIN; | |
34991 | } | |
34992 | ||
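cpu_chill() replaces the cpu_relax() busy-wait because on RT the task holding PENDING may itself be preempted by the spinning (possibly higher-priority) canceller, turning the retry loop into a livelock. A simplified sketch of the RT definition (assumed; the real helper also deals with the freezer):

	#ifdef CONFIG_PREEMPT_RT_FULL
	/* Sleep for a short, fixed interval instead of spinning, so the
	 * preempted PENDING owner gets a chance to run. */
	void cpu_chill(void)
	{
		ktime_t chill_time = ktime_set(0, NSEC_PER_MSEC);

		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL);
	}
	#else
	# define cpu_chill()	cpu_relax()
	#endif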
b3bbd485 | 34993 | @@ -1377,7 +1408,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
34994 | * queued or lose PENDING. Grabbing PENDING and queueing should |
34995 | * happen with IRQ disabled. | |
34996 | */ | |
34997 | - WARN_ON_ONCE(!irqs_disabled()); | |
34998 | + WARN_ON_ONCE_NONRT(!irqs_disabled()); | |
34999 | ||
35000 | debug_work_activate(work); | |
35001 | ||
b3bbd485 | 35002 | @@ -1385,6 +1416,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35003 | if (unlikely(wq->flags & __WQ_DRAINING) && |
35004 | WARN_ON_ONCE(!is_chained_work(wq))) | |
35005 | return; | |
35006 | + rcu_read_lock(); | |
35007 | retry: | |
35008 | if (req_cpu == WORK_CPU_UNBOUND) | |
35009 | cpu = wq_select_unbound_cpu(raw_smp_processor_id()); | |
b3bbd485 | 35010 | @@ -1441,10 +1473,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35011 | /* pwq determined, queue */ |
35012 | trace_workqueue_queue_work(req_cpu, pwq, work); | |
35013 | ||
35014 | - if (WARN_ON(!list_empty(&work->entry))) { | |
35015 | - spin_unlock(&pwq->pool->lock); | |
35016 | - return; | |
35017 | - } | |
35018 | + if (WARN_ON(!list_empty(&work->entry))) | |
35019 | + goto out; | |
35020 | ||
35021 | pwq->nr_in_flight[pwq->work_color]++; | |
35022 | work_flags = work_color_to_flags(pwq->work_color); | |
b3bbd485 | 35023 | @@ -1462,7 +1492,9 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35024 | |
35025 | insert_work(pwq, work, worklist, work_flags); | |
35026 | ||
35027 | +out: | |
35028 | spin_unlock(&pwq->pool->lock); | |
35029 | + rcu_read_unlock(); | |
35030 | } | |
35031 | ||
35032 | /** | |
b3bbd485 | 35033 | @@ -1482,14 +1514,14 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35034 | bool ret = false; |
35035 | unsigned long flags; | |
35036 | ||
35037 | - local_irq_save(flags); | |
35038 | + local_lock_irqsave(pendingb_lock,flags); | |
35039 | ||
35040 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | |
35041 | __queue_work(cpu, wq, work); | |
35042 | ret = true; | |
35043 | } | |
35044 | ||
35045 | - local_irq_restore(flags); | |
35046 | + local_unlock_irqrestore(pendingb_lock, flags); | |
35047 | return ret; | |
35048 | } | |
35049 | EXPORT_SYMBOL(queue_work_on); | |
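pendingb_lock is a "local lock" from the RT series: a per-CPU lock that documents and preserves the old local_irq_save() exclusion on !RT while keeping the section preemptible on RT. A much-simplified sketch of the semantics (the real include/linux/locallock.h also tracks owner and nesting and uses migrate_disable()):

	struct local_irq_lock {
		spinlock_t lock;	/* a sleeping lock on RT */
	};

	#ifdef CONFIG_PREEMPT_RT_FULL
	/* RT: take this CPU's lock, leave interrupts enabled */
	# define local_lock_irqsave(lvar, flags)			\
		do {							\
			(flags) = 0;					\
			spin_lock(&this_cpu_ptr(&(lvar))->lock);	\
		} while (0)
	# define local_unlock_irqrestore(lvar, flags)			\
		do {							\
			spin_unlock(&this_cpu_ptr(&(lvar))->lock);	\
			(void)(flags);					\
		} while (0)
	#else
	/* !RT: exactly the code it replaced */
	# define local_lock_irqsave(lvar, flags)	local_irq_save(flags)
	# define local_unlock_irqrestore(lvar, flags)	local_irq_restore(flags)
	#endif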
b3bbd485 | 35050 | @@ -1498,8 +1530,11 @@ void delayed_work_timer_fn(unsigned long __data) |
e4b2b4a8 JK |
35051 | { |
35052 | struct delayed_work *dwork = (struct delayed_work *)__data; | |
35053 | ||
35054 | + /* XXX */ | |
35055 | + /* local_lock(pendingb_lock); */ | |
35056 | /* should have been called from irqsafe timer with irq already off */ | |
35057 | __queue_work(dwork->cpu, dwork->wq, &dwork->work); | |
35058 | + /* local_unlock(pendingb_lock); */ | |
35059 | } | |
35060 | EXPORT_SYMBOL(delayed_work_timer_fn); | |
35061 | ||
b3bbd485 | 35062 | @@ -1555,14 +1590,14 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35063 | unsigned long flags; |
35064 | ||
35065 | /* read the comment in __queue_work() */ | |
35066 | - local_irq_save(flags); | |
35067 | + local_lock_irqsave(pendingb_lock, flags); | |
35068 | ||
35069 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | |
35070 | __queue_delayed_work(cpu, wq, dwork, delay); | |
35071 | ret = true; | |
35072 | } | |
35073 | ||
35074 | - local_irq_restore(flags); | |
35075 | + local_unlock_irqrestore(pendingb_lock, flags); | |
35076 | return ret; | |
35077 | } | |
35078 | EXPORT_SYMBOL(queue_delayed_work_on); | |
b3bbd485 | 35079 | @@ -1597,7 +1632,7 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, |
1a6e0f06 JK |
35080 | |
35081 | if (likely(ret >= 0)) { | |
35082 | __queue_delayed_work(cpu, wq, dwork, delay); | |
35083 | - local_irq_restore(flags); | |
35084 | + local_unlock_irqrestore(pendingb_lock, flags); | |
35085 | } | |
35086 | ||
35087 | /* -ENOENT from try_to_grab_pending() becomes %true */ | |
b3bbd485 | 35088 | @@ -1630,7 +1665,9 @@ static void worker_enter_idle(struct worker *worker) |
1a6e0f06 JK |
35089 | worker->last_active = jiffies; |
35090 | ||
35091 | /* idle_list is LIFO */ | |
35092 | + rt_lock_idle_list(pool); | |
35093 | list_add(&worker->entry, &pool->idle_list); | |
35094 | + rt_unlock_idle_list(pool); | |
35095 | ||
35096 | if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) | |
35097 | mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); | |
b3bbd485 | 35098 | @@ -1663,7 +1700,9 @@ static void worker_leave_idle(struct worker *worker) |
1a6e0f06 JK |
35099 | return; |
35100 | worker_clr_flags(worker, WORKER_IDLE); | |
35101 | pool->nr_idle--; | |
35102 | + rt_lock_idle_list(pool); | |
35103 | list_del_init(&worker->entry); | |
35104 | + rt_unlock_idle_list(pool); | |
35105 | } | |
35106 | ||
35107 | static struct worker *alloc_worker(int node) | |
b3bbd485 | 35108 | @@ -1829,7 +1868,9 @@ static void destroy_worker(struct worker *worker) |
1a6e0f06 JK |
35109 | pool->nr_workers--; |
35110 | pool->nr_idle--; | |
35111 | ||
35112 | + rt_lock_idle_list(pool); | |
35113 | list_del_init(&worker->entry); | |
35114 | + rt_unlock_idle_list(pool); | |
35115 | worker->flags |= WORKER_DIE; | |
35116 | wake_up_process(worker->task); | |
35117 | } | |
b3bbd485 | 35118 | @@ -2815,14 +2856,14 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) |
1a6e0f06 JK |
35119 | |
35120 | might_sleep(); | |
35121 | ||
35122 | - local_irq_disable(); | |
35123 | + rcu_read_lock(); | |
35124 | pool = get_work_pool(work); | |
35125 | if (!pool) { | |
35126 | - local_irq_enable(); | |
35127 | + rcu_read_unlock(); | |
35128 | return false; | |
35129 | } | |
35130 | ||
35131 | - spin_lock(&pool->lock); | |
35132 | + spin_lock_irq(&pool->lock); | |
35133 | /* see the comment in try_to_grab_pending() with the same code */ | |
35134 | pwq = get_work_pwq(work); | |
35135 | if (pwq) { | |
b3bbd485 | 35136 | @@ -2853,10 +2894,11 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) |
e4b2b4a8 JK |
35137 | lock_map_acquire(&pwq->wq->lockdep_map); |
35138 | lock_map_release(&pwq->wq->lockdep_map); | |
35139 | } | |
1a6e0f06 JK |
35140 | - |
35141 | + rcu_read_unlock(); | |
35142 | return true; | |
35143 | already_gone: | |
35144 | spin_unlock_irq(&pool->lock); | |
35145 | + rcu_read_unlock(); | |
35146 | return false; | |
35147 | } | |
35148 | ||
b3bbd485 | 35149 | @@ -2946,7 +2988,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) |
1a6e0f06 JK |
35150 | |
35151 | /* tell other tasks trying to grab @work to back off */ | |
35152 | mark_work_canceling(work); | |
35153 | - local_irq_restore(flags); | |
35154 | + local_unlock_irqrestore(pendingb_lock, flags); | |
35155 | ||
e4b2b4a8 JK |
35156 | /* |
35157 | * This allows canceling during early boot. We know that @work | |
b3bbd485 | 35158 | @@ -3007,10 +3049,10 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); |
1a6e0f06 JK |
35159 | */ |
35160 | bool flush_delayed_work(struct delayed_work *dwork) | |
35161 | { | |
35162 | - local_irq_disable(); | |
35163 | + local_lock_irq(pendingb_lock); | |
35164 | if (del_timer_sync(&dwork->timer)) | |
35165 | __queue_work(dwork->cpu, dwork->wq, &dwork->work); | |
35166 | - local_irq_enable(); | |
35167 | + local_unlock_irq(pendingb_lock); | |
35168 | return flush_work(&dwork->work); | |
35169 | } | |
35170 | EXPORT_SYMBOL(flush_delayed_work); | |
b3bbd485 | 35171 | @@ -3028,7 +3070,7 @@ static bool __cancel_work(struct work_struct *work, bool is_dwork) |
c7c16703 | 35172 | return false; |
1a6e0f06 | 35173 | |
c7c16703 | 35174 | set_work_pool_and_clear_pending(work, get_work_pool_id(work)); |
1a6e0f06 JK |
35175 | - local_irq_restore(flags); |
35176 | + local_unlock_irqrestore(pendingb_lock, flags); | |
35177 | return ret; | |
35178 | } | |
c7c16703 | 35179 | |
b3bbd485 | 35180 | @@ -3284,7 +3326,7 @@ static void rcu_free_pool(struct rcu_head *rcu) |
1a6e0f06 JK |
35181 | * put_unbound_pool - put a worker_pool |
35182 | * @pool: worker_pool to put | |
35183 | * | |
35184 | - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU | |
35185 | + * Put @pool. If its refcnt reaches zero, it gets destroyed in RCU | |
35186 | * safe manner. get_unbound_pool() calls this function on its failure path | |
35187 | * and this function should be able to release pools which went through, | |
35188 | * successfully or not, init_worker_pool(). | |
b3bbd485 | 35189 | @@ -3338,8 +3380,8 @@ static void put_unbound_pool(struct worker_pool *pool) |
1a6e0f06 JK |
35190 | del_timer_sync(&pool->idle_timer); |
35191 | del_timer_sync(&pool->mayday_timer); | |
35192 | ||
35193 | - /* sched-RCU protected to allow dereferences from get_work_pool() */ | |
35194 | - call_rcu_sched(&pool->rcu, rcu_free_pool); | |
35195 | + /* RCU protected to allow dereferences from get_work_pool() */ | |
35196 | + call_rcu(&pool->rcu, rcu_free_pool); | |
35197 | } | |
35198 | ||
35199 | /** | |
b3bbd485 | 35200 | @@ -3446,14 +3488,14 @@ static void pwq_unbound_release_workfn(struct work_struct *work) |
1a6e0f06 JK |
35201 | put_unbound_pool(pool); |
35202 | mutex_unlock(&wq_pool_mutex); | |
35203 | ||
35204 | - call_rcu_sched(&pwq->rcu, rcu_free_pwq); | |
35205 | + call_rcu(&pwq->rcu, rcu_free_pwq); | |
35206 | ||
35207 | /* | |
35208 | * If we're the last pwq going away, @wq is already dead and no one | |
35209 | * is gonna access it anymore. Schedule RCU free. | |
35210 | */ | |
35211 | if (is_last) | |
35212 | - call_rcu_sched(&wq->rcu, rcu_free_wq); | |
35213 | + call_rcu(&wq->rcu, rcu_free_wq); | |
35214 | } | |
35215 | ||
35216 | /** | |
b3bbd485 | 35217 | @@ -4128,7 +4170,7 @@ void destroy_workqueue(struct workqueue_struct *wq) |
1a6e0f06 JK |
35218 | * The base ref is never dropped on per-cpu pwqs. Directly |
35219 | * schedule RCU free. | |
35220 | */ | |
35221 | - call_rcu_sched(&wq->rcu, rcu_free_wq); | |
35222 | + call_rcu(&wq->rcu, rcu_free_wq); | |
35223 | } else { | |
35224 | /* | |
35225 | * We're the sole accessor of @wq at this point. Directly | |
b3bbd485 | 35226 | @@ -4238,7 +4280,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) |
1a6e0f06 JK |
35227 | struct pool_workqueue *pwq; |
35228 | bool ret; | |
35229 | ||
35230 | - rcu_read_lock_sched(); | |
35231 | + rcu_read_lock(); | |
35232 | + preempt_disable(); | |
35233 | ||
35234 | if (cpu == WORK_CPU_UNBOUND) | |
35235 | cpu = smp_processor_id(); | |
b3bbd485 | 35236 | @@ -4249,7 +4292,8 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) |
1a6e0f06 JK |
35237 | pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); |
35238 | ||
35239 | ret = !list_empty(&pwq->delayed_works); | |
35240 | - rcu_read_unlock_sched(); | |
35241 | + preempt_enable(); | |
35242 | + rcu_read_unlock(); | |
35243 | ||
35244 | return ret; | |
35245 | } | |
b3bbd485 | 35246 | @@ -4275,15 +4319,15 @@ unsigned int work_busy(struct work_struct *work) |
1a6e0f06 JK |
35247 | if (work_pending(work)) |
35248 | ret |= WORK_BUSY_PENDING; | |
35249 | ||
35250 | - local_irq_save(flags); | |
35251 | + rcu_read_lock(); | |
35252 | pool = get_work_pool(work); | |
35253 | if (pool) { | |
35254 | - spin_lock(&pool->lock); | |
35255 | + spin_lock_irqsave(&pool->lock, flags); | |
35256 | if (find_worker_executing_work(pool, work)) | |
35257 | ret |= WORK_BUSY_RUNNING; | |
35258 | - spin_unlock(&pool->lock); | |
35259 | + spin_unlock_irqrestore(&pool->lock, flags); | |
35260 | } | |
35261 | - local_irq_restore(flags); | |
35262 | + rcu_read_unlock(); | |
35263 | ||
35264 | return ret; | |
35265 | } | |
b3bbd485 | 35266 | @@ -4472,7 +4516,7 @@ void show_workqueue_state(void) |
1a6e0f06 JK |
35267 | unsigned long flags; |
35268 | int pi; | |
35269 | ||
35270 | - rcu_read_lock_sched(); | |
35271 | + rcu_read_lock(); | |
35272 | ||
35273 | pr_info("Showing busy workqueues and worker pools:\n"); | |
35274 | ||
b3bbd485 | 35275 | @@ -4537,7 +4581,7 @@ void show_workqueue_state(void) |
e4b2b4a8 | 35276 | touch_nmi_watchdog(); |
1a6e0f06 JK |
35277 | } |
35278 | ||
35279 | - rcu_read_unlock_sched(); | |
35280 | + rcu_read_unlock(); | |
35281 | } | |
35282 | ||
35283 | /* | |
b3bbd485 | 35284 | @@ -4898,16 +4942,16 @@ bool freeze_workqueues_busy(void) |
1a6e0f06 JK |
35285 | * nr_active is monotonically decreasing. It's safe |
35286 | * to peek without lock. | |
35287 | */ | |
35288 | - rcu_read_lock_sched(); | |
35289 | + rcu_read_lock(); | |
35290 | for_each_pwq(pwq, wq) { | |
35291 | WARN_ON_ONCE(pwq->nr_active < 0); | |
35292 | if (pwq->nr_active) { | |
35293 | busy = true; | |
35294 | - rcu_read_unlock_sched(); | |
35295 | + rcu_read_unlock(); | |
35296 | goto out_unlock; | |
35297 | } | |
35298 | } | |
35299 | - rcu_read_unlock_sched(); | |
35300 | + rcu_read_unlock(); | |
35301 | } | |
35302 | out_unlock: | |
35303 | mutex_unlock(&wq_pool_mutex); | |
b3bbd485 | 35304 | @@ -5097,7 +5141,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, |
1a6e0f06 JK |
35305 | const char *delim = ""; |
35306 | int node, written = 0; | |
35307 | ||
35308 | - rcu_read_lock_sched(); | |
35309 | + get_online_cpus(); | |
35310 | + rcu_read_lock(); | |
35311 | for_each_node(node) { | |
35312 | written += scnprintf(buf + written, PAGE_SIZE - written, | |
35313 | "%s%d:%d", delim, node, | |
b3bbd485 | 35314 | @@ -5105,7 +5150,8 @@ static ssize_t wq_pool_ids_show(struct device *dev, |
1a6e0f06 JK |
35315 | delim = " "; |
35316 | } | |
35317 | written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); | |
35318 | - rcu_read_unlock_sched(); | |
35319 | + rcu_read_unlock(); | |
35320 | + put_online_cpus(); | |
35321 | ||
35322 | return written; | |
35323 | } | |
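Every workqueue hunk above makes the same substitution: sched-RCU, whose read side is any preempt-disabled region and whose write side frees via call_rcu_sched(), becomes plain RCU, which stays valid once RT makes those regions preemptible. Where the old sched-RCU section also relied on staying on one CPU, an explicit preempt_disable() is added next to rcu_read_lock(), as in workqueue_congested(). A hedged sketch of the target pattern; struct item and both helpers are illustrative:

    #include <linux/kernel.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct item {
            int data;
            struct rcu_head rcu;
    };

    static struct item __rcu *cur_item;

    static int read_item(void)
    {
            struct item *p;
            int val = -1;

            rcu_read_lock();                  /* was rcu_read_lock_sched() */
            p = rcu_dereference(cur_item);
            if (p)
                    val = p->data;
            rcu_read_unlock();
            return val;
    }

    static void free_item(struct rcu_head *head)
    {
            kfree(container_of(head, struct item, rcu));
    }

    static void replace_item(struct item *new_item)
    {
            struct item *old;

            /* updates assumed serialized by the caller */
            old = rcu_dereference_protected(cur_item, 1);
            rcu_assign_pointer(cur_item, new_item);
            if (old)
                    call_rcu(&old->rcu, free_item);   /* was call_rcu_sched() */
    }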
b3bbd485 JK |
35324 | diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h |
35325 | index d390d1be3748..2dbcfe9bc364 100644 | |
35326 | --- a/kernel/workqueue_internal.h | |
35327 | +++ b/kernel/workqueue_internal.h | |
35328 | @@ -45,6 +45,7 @@ struct worker { | |
1a6e0f06 JK |
35329 | unsigned long last_active; /* L: last active timestamp */ |
35330 | unsigned int flags; /* X: flags */ | |
35331 | int id; /* I: worker id */ | |
35332 | + int sleeping; /* None */ | |
35333 | ||
35334 | /* | |
35335 | * Opaque string set with work_set_desc(). Printed out with task | |
b3bbd485 | 35336 | @@ -70,7 +71,7 @@ static inline struct worker *current_wq_worker(void) |
1a6e0f06 JK |
35337 | * Scheduler hooks for concurrency managed workqueue. Only to be used from |
35338 | * sched/core.c and workqueue.c. | |
35339 | */ | |
35340 | -void wq_worker_waking_up(struct task_struct *task, int cpu); | |
35341 | -struct task_struct *wq_worker_sleeping(struct task_struct *task); | |
35342 | +void wq_worker_running(struct task_struct *task); | |
35343 | +void wq_worker_sleeping(struct task_struct *task); | |
35344 | ||
35345 | #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ | |
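The hook pair is renamed and loses its return values: on RT a worker can pass through the sleep/wake path more than once, and the new per-worker sleeping field is evidently what keeps the accounting idempotent. The following is a simplified, hypothetical reconstruction of that guard, not the patch's actual bodies in kernel/workqueue.c:

    #include <linux/kthread.h>
    #include "workqueue_internal.h"

    void wq_worker_sleeping(struct task_struct *task)
    {
            struct worker *worker = kthread_data(task);

            if (worker->sleeping)
                    return;             /* already accounted as sleeping */
            worker->sleeping = 1;
            /* decrement the pool's count of running workers here */
    }

    void wq_worker_running(struct task_struct *task)
    {
            struct worker *worker = kthread_data(task);

            if (!worker->sleeping)
                    return;
            /* re-increment the running count here */
            worker->sleeping = 0;
    }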
b3bbd485 JK |
35346 | diff --git a/lib/Kconfig b/lib/Kconfig |
35347 | index b1445b22a6de..9ab51b78991a 100644 | |
35348 | --- a/lib/Kconfig | |
35349 | +++ b/lib/Kconfig | |
35350 | @@ -428,6 +428,7 @@ config CHECK_SIGNATURE | |
35351 | ||
35352 | config CPUMASK_OFFSTACK | |
35353 | bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS | |
35354 | + depends on !PREEMPT_RT_FULL | |
35355 | help | |
35356 | Use dynamic allocation for cpumask_var_t, instead of putting | |
35357 | them on the stack. This is a bit more expensive, but avoids | |
35358 | diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug | |
35359 | index 62d0e25c054c..401b7ed164b5 100644 | |
35360 | --- a/lib/Kconfig.debug | |
35361 | +++ b/lib/Kconfig.debug | |
35362 | @@ -1197,7 +1197,7 @@ config DEBUG_ATOMIC_SLEEP | |
35363 | ||
35364 | config DEBUG_LOCKING_API_SELFTESTS | |
35365 | bool "Locking API boot-time self-tests" | |
35366 | - depends on DEBUG_KERNEL | |
35367 | + depends on DEBUG_KERNEL && !PREEMPT_RT_FULL | |
35368 | help | |
35369 | Say Y here if you want the kernel to run a short self-test during | |
35370 | bootup. The self-test checks whether common types of locking bugs | |
35371 | diff --git a/lib/debugobjects.c b/lib/debugobjects.c | |
35372 | index 99308479b1c8..161da6c6e173 100644 | |
35373 | --- a/lib/debugobjects.c | |
35374 | +++ b/lib/debugobjects.c | |
35375 | @@ -339,7 +339,10 @@ __debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack) | |
1a6e0f06 JK |
35376 | struct debug_obj *obj; |
35377 | unsigned long flags; | |
35378 | ||
35379 | - fill_pool(); | |
35380 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
35381 | + if (preempt_count() == 0 && !irqs_disabled()) | |
35382 | +#endif | |
35383 | + fill_pool(); | |
35384 | ||
35385 | db = get_bucket((unsigned long) addr); | |
35386 | ||
b3bbd485 JK |
35387 | diff --git a/lib/irq_poll.c b/lib/irq_poll.c |
35388 | index 86a709954f5a..9c069ef83d6d 100644 | |
35389 | --- a/lib/irq_poll.c | |
35390 | +++ b/lib/irq_poll.c | |
35391 | @@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop) | |
1a6e0f06 JK |
35392 | list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); |
35393 | __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); | |
35394 | local_irq_restore(flags); | |
35395 | + preempt_check_resched_rt(); | |
35396 | } | |
35397 | EXPORT_SYMBOL(irq_poll_sched); | |
35398 | ||
b3bbd485 | 35399 | @@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop) |
1a6e0f06 JK |
35400 | local_irq_save(flags); |
35401 | __irq_poll_complete(iop); | |
35402 | local_irq_restore(flags); | |
35403 | + preempt_check_resched_rt(); | |
35404 | } | |
35405 | EXPORT_SYMBOL(irq_poll_complete); | |
35406 | ||
b3bbd485 | 35407 | @@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) |
1a6e0f06 JK |
35408 | } |
35409 | ||
35410 | local_irq_enable(); | |
35411 | + preempt_check_resched_rt(); | |
35412 | ||
35413 | /* Even though interrupts have been re-enabled, this | |
35414 | * access is safe because interrupts can only add new | |
b3bbd485 | 35415 | @@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) |
1a6e0f06 JK |
35416 | __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); |
35417 | ||
35418 | local_irq_enable(); | |
35419 | + preempt_check_resched_rt(); | |
35420 | } | |
35421 | ||
35422 | /** | |
b3bbd485 | 35423 | @@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu) |
c7c16703 JK |
35424 | this_cpu_ptr(&blk_cpu_iopoll)); |
35425 | __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); | |
35426 | local_irq_enable(); | |
35427 | + preempt_check_resched_rt(); | |
1a6e0f06 | 35428 | |
c7c16703 JK |
35429 | return 0; |
35430 | } | |
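All four irq_poll hunks append the same call: after a softirq is raised inside an IRQs-off region, preempt_check_resched_rt() runs once interrupts are back on, so the wakeup of the (threaded, on RT) softirq handler is not delayed until the next scheduling point. The helper is provided by the RT patch itself and is a no-op on !RT kernels. Condensed sketch of the idiom:

    #include <linux/interrupt.h>

    static void kick_irq_poll(void)
    {
            unsigned long flags;

            local_irq_save(flags);
            __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ);
            local_irq_restore(flags);
            preempt_check_resched_rt();   /* RT: reschedule if ksoftirqd was woken */
    }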
b3bbd485 JK |
35431 | diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c |
35432 | index b5c1293ce147..075e225f4111 100644 | |
35433 | --- a/lib/locking-selftest.c | |
35434 | +++ b/lib/locking-selftest.c | |
35435 | @@ -742,6 +742,8 @@ GENERATE_TESTCASE(init_held_rtmutex); | |
1a6e0f06 JK |
35436 | #include "locking-selftest-spin-hardirq.h" |
35437 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) | |
35438 | ||
35439 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35440 | + | |
35441 | #include "locking-selftest-rlock-hardirq.h" | |
35442 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) | |
35443 | ||
b3bbd485 | 35444 | @@ -757,9 +759,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) |
1a6e0f06 JK |
35445 | #include "locking-selftest-wlock-softirq.h" |
35446 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) | |
35447 | ||
35448 | +#endif | |
35449 | + | |
35450 | #undef E1 | |
35451 | #undef E2 | |
35452 | ||
35453 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35454 | /* | |
35455 | * Enabling hardirqs with a softirq-safe lock held: | |
35456 | */ | |
b3bbd485 | 35457 | @@ -792,6 +797,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) |
1a6e0f06 JK |
35458 | #undef E1 |
35459 | #undef E2 | |
35460 | ||
35461 | +#endif | |
35462 | + | |
35463 | /* | |
35464 | * Enabling irqs with an irq-safe lock held: | |
35465 | */ | |
b3bbd485 | 35466 | @@ -815,6 +822,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) |
1a6e0f06 JK |
35467 | #include "locking-selftest-spin-hardirq.h" |
35468 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) | |
35469 | ||
35470 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35471 | + | |
35472 | #include "locking-selftest-rlock-hardirq.h" | |
35473 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) | |
35474 | ||
b3bbd485 | 35475 | @@ -830,6 +839,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) |
1a6e0f06 JK |
35476 | #include "locking-selftest-wlock-softirq.h" |
35477 | GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) | |
35478 | ||
35479 | +#endif | |
35480 | + | |
35481 | #undef E1 | |
35482 | #undef E2 | |
35483 | ||
b3bbd485 | 35484 | @@ -861,6 +872,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) |
1a6e0f06 JK |
35485 | #include "locking-selftest-spin-hardirq.h" |
35486 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) | |
35487 | ||
35488 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35489 | + | |
35490 | #include "locking-selftest-rlock-hardirq.h" | |
35491 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) | |
35492 | ||
b3bbd485 | 35493 | @@ -876,6 +889,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) |
1a6e0f06 JK |
35494 | #include "locking-selftest-wlock-softirq.h" |
35495 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) | |
35496 | ||
35497 | +#endif | |
35498 | + | |
35499 | #undef E1 | |
35500 | #undef E2 | |
35501 | #undef E3 | |
b3bbd485 | 35502 | @@ -909,6 +924,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) |
1a6e0f06 JK |
35503 | #include "locking-selftest-spin-hardirq.h" |
35504 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) | |
35505 | ||
35506 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35507 | + | |
35508 | #include "locking-selftest-rlock-hardirq.h" | |
35509 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) | |
35510 | ||
b3bbd485 | 35511 | @@ -924,10 +941,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) |
1a6e0f06 JK |
35512 | #include "locking-selftest-wlock-softirq.h" |
35513 | GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) | |
35514 | ||
35515 | +#endif | |
35516 | + | |
35517 | #undef E1 | |
35518 | #undef E2 | |
35519 | #undef E3 | |
35520 | ||
35521 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35522 | + | |
35523 | /* | |
35524 | * read-lock / write-lock irq inversion. | |
35525 | * | |
b3bbd485 | 35526 | @@ -990,6 +1011,10 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) |
1a6e0f06 JK |
35527 | #undef E2 |
35528 | #undef E3 | |
35529 | ||
35530 | +#endif | |
35531 | + | |
35532 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35533 | + | |
35534 | /* | |
35535 | * read-lock / write-lock recursion that is actually safe. | |
35536 | */ | |
b3bbd485 | 35537 | @@ -1028,6 +1053,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft) |
1a6e0f06 JK |
35538 | #undef E2 |
35539 | #undef E3 | |
35540 | ||
35541 | +#endif | |
35542 | + | |
35543 | /* | |
35544 | * read-lock / write-lock recursion that is unsafe. | |
35545 | */ | |
b3bbd485 | 35546 | @@ -2057,6 +2084,7 @@ void locking_selftest(void) |
1a6e0f06 JK |
35547 | |
35548 | printk(" --------------------------------------------------------------------------\n"); | |
35549 | ||
35550 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35551 | /* | |
35552 | * irq-context testcases: | |
35553 | */ | |
b3bbd485 | 35554 | @@ -2069,6 +2097,28 @@ void locking_selftest(void) |
1a6e0f06 JK |
35555 | |
35556 | DO_TESTCASE_6x2("irq read-recursion", irq_read_recursion); | |
35557 | // DO_TESTCASE_6x2B("irq read-recursion #2", irq_read_recursion2); | |
35558 | +#else | |
35559 | + /* On -rt, we only do hardirq context test for raw spinlock */ | |
35560 | + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); | |
35561 | + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); | |
35562 | + | |
35563 | + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); | |
35564 | + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); | |
35565 | + | |
35566 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); | |
35567 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); | |
35568 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); | |
35569 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); | |
35570 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); | |
35571 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); | |
35572 | + | |
35573 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); | |
35574 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); | |
35575 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); | |
35576 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); | |
35577 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); | |
35578 | + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); | |
35579 | +#endif | |
35580 | ||
35581 | ww_tests(); | |
35582 | ||
b3bbd485 JK |
35583 | diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c |
35584 | index 6016f1deb1f5..cdd43086b55b 100644 | |
35585 | --- a/lib/percpu_ida.c | |
35586 | +++ b/lib/percpu_ida.c | |
e4b2b4a8 | 35587 | @@ -27,6 +27,9 @@ |
1a6e0f06 JK |
35588 | #include <linux/string.h> |
35589 | #include <linux/spinlock.h> | |
35590 | #include <linux/percpu_ida.h> | |
35591 | +#include <linux/locallock.h> | |
35592 | + | |
35593 | +static DEFINE_LOCAL_IRQ_LOCK(irq_off_lock); | |
35594 | ||
35595 | struct percpu_ida_cpu { | |
35596 | /* | |
b3bbd485 | 35597 | @@ -149,13 +152,13 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) |
1a6e0f06 JK |
35598 | unsigned long flags; |
35599 | int tag; | |
35600 | ||
35601 | - local_irq_save(flags); | |
35602 | + local_lock_irqsave(irq_off_lock, flags); | |
35603 | tags = this_cpu_ptr(pool->tag_cpu); | |
35604 | ||
35605 | /* Fastpath */ | |
35606 | tag = alloc_local_tag(tags); | |
35607 | if (likely(tag >= 0)) { | |
35608 | - local_irq_restore(flags); | |
35609 | + local_unlock_irqrestore(irq_off_lock, flags); | |
35610 | return tag; | |
35611 | } | |
35612 | ||
b3bbd485 | 35613 | @@ -174,6 +177,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) |
1a6e0f06 JK |
35614 | |
35615 | if (!tags->nr_free) | |
35616 | alloc_global_tags(pool, tags); | |
35617 | + | |
35618 | if (!tags->nr_free) | |
35619 | steal_tags(pool, tags); | |
35620 | ||
b3bbd485 | 35621 | @@ -185,7 +189,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) |
1a6e0f06 JK |
35622 | } |
35623 | ||
35624 | spin_unlock(&pool->lock); | |
35625 | - local_irq_restore(flags); | |
35626 | + local_unlock_irqrestore(irq_off_lock, flags); | |
35627 | ||
35628 | if (tag >= 0 || state == TASK_RUNNING) | |
35629 | break; | |
b3bbd485 | 35630 | @@ -197,7 +201,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) |
1a6e0f06 JK |
35631 | |
35632 | schedule(); | |
35633 | ||
35634 | - local_irq_save(flags); | |
35635 | + local_lock_irqsave(irq_off_lock, flags); | |
35636 | tags = this_cpu_ptr(pool->tag_cpu); | |
35637 | } | |
35638 | if (state != TASK_RUNNING) | |
b3bbd485 | 35639 | @@ -222,7 +226,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) |
1a6e0f06 JK |
35640 | |
35641 | BUG_ON(tag >= pool->nr_tags); | |
35642 | ||
35643 | - local_irq_save(flags); | |
35644 | + local_lock_irqsave(irq_off_lock, flags); | |
35645 | tags = this_cpu_ptr(pool->tag_cpu); | |
35646 | ||
35647 | spin_lock(&tags->lock); | |
b3bbd485 | 35648 | @@ -254,7 +258,7 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) |
1a6e0f06 JK |
35649 | spin_unlock(&pool->lock); |
35650 | } | |
35651 | ||
35652 | - local_irq_restore(flags); | |
35653 | + local_unlock_irqrestore(irq_off_lock, flags); | |
35654 | } | |
35655 | EXPORT_SYMBOL_GPL(percpu_ida_free); | |
35656 | ||
b3bbd485 | 35657 | @@ -346,7 +350,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, |
1a6e0f06 JK |
35658 | struct percpu_ida_cpu *remote; |
35659 | unsigned cpu, i, err = 0; | |
35660 | ||
35661 | - local_irq_save(flags); | |
35662 | + local_lock_irqsave(irq_off_lock, flags); | |
35663 | for_each_possible_cpu(cpu) { | |
35664 | remote = per_cpu_ptr(pool->tag_cpu, cpu); | |
35665 | spin_lock(&remote->lock); | |
b3bbd485 | 35666 | @@ -368,7 +372,7 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, |
1a6e0f06 JK |
35667 | } |
35668 | spin_unlock(&pool->lock); | |
35669 | out: | |
35670 | - local_irq_restore(flags); | |
35671 | + local_unlock_irqrestore(irq_off_lock, flags); | |
35672 | return err; | |
35673 | } | |
35674 | EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); | |
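Beyond the mechanical save-to-lock substitution, percpu_ida_alloc() shows the part that only works because irq_off_lock is a real lock object: it is dropped across schedule() and retaken afterwards, refetching the per-CPU pointer since the task may wake on another CPU. A condensed sketch of that loop (wait-queue handling omitted; alloc_local_tag() is the file's own fast-path helper):

    static int alloc_tag_sleeping(struct percpu_ida *pool)
    {
            struct percpu_ida_cpu *tags;
            unsigned long flags;
            int tag;

            local_lock_irqsave(irq_off_lock, flags);
            tags = this_cpu_ptr(pool->tag_cpu);
            while ((tag = alloc_local_tag(tags)) < 0) {
                    local_unlock_irqrestore(irq_off_lock, flags);
                    schedule();                          /* legal: lock not held */
                    local_lock_irqsave(irq_off_lock, flags);
                    tags = this_cpu_ptr(pool->tag_cpu);  /* CPU may have changed */
            }
            local_unlock_irqrestore(irq_off_lock, flags);
            return tag;
    }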
b3bbd485 JK |
35675 | diff --git a/lib/radix-tree.c b/lib/radix-tree.c |
35676 | index d172f0341b80..c1da1109a107 100644 | |
35677 | --- a/lib/radix-tree.c | |
35678 | +++ b/lib/radix-tree.c | |
e4b2b4a8 | 35679 | @@ -37,7 +37,7 @@ |
1f39f580 | 35680 | #include <linux/rcupdate.h> |
e4b2b4a8 JK |
35681 | #include <linux/slab.h> |
35682 | #include <linux/string.h> | |
1f39f580 JK |
35683 | - |
35684 | +#include <linux/locallock.h> | |
35685 | ||
35686 | /* Number of nodes in fully populated tree of given height */ | |
35687 | static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; | |
b3bbd485 | 35688 | @@ -86,6 +86,7 @@ struct radix_tree_preload { |
1f39f580 JK |
35689 | struct radix_tree_node *nodes; |
35690 | }; | |
35691 | static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; | |
35692 | +static DEFINE_LOCAL_IRQ_LOCK(radix_tree_preloads_lock); | |
35693 | ||
e4b2b4a8 | 35694 | static inline struct radix_tree_node *entry_to_node(void *ptr) |
1f39f580 | 35695 | { |
b3bbd485 | 35696 | @@ -404,12 +405,13 @@ radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent, |
1a6e0f06 JK |
35697 | * succeed in getting a node here (and never reach |
35698 | * kmem_cache_alloc) | |
35699 | */ | |
35700 | - rtp = this_cpu_ptr(&radix_tree_preloads); | |
1f39f580 | 35701 | + rtp = &get_locked_var(radix_tree_preloads_lock, radix_tree_preloads); |
1a6e0f06 JK |
35702 | if (rtp->nr) { |
35703 | ret = rtp->nodes; | |
e4b2b4a8 | 35704 | rtp->nodes = ret->parent; |
1a6e0f06 JK |
35705 | rtp->nr--; |
35706 | } | |
1f39f580 | 35707 | + put_locked_var(radix_tree_preloads_lock, radix_tree_preloads); |
1a6e0f06 JK |
35708 | /* |
35709 | * Update the allocation stack trace as this is more useful | |
35710 | * for debugging. | |
b3bbd485 | 35711 | @@ -475,14 +477,14 @@ static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) |
1f39f580 JK |
35712 | */ |
35713 | gfp_mask &= ~__GFP_ACCOUNT; | |
35714 | ||
35715 | - preempt_disable(); | |
35716 | + local_lock(radix_tree_preloads_lock); | |
35717 | rtp = this_cpu_ptr(&radix_tree_preloads); | |
35718 | while (rtp->nr < nr) { | |
35719 | - preempt_enable(); | |
35720 | + local_unlock(radix_tree_preloads_lock); | |
35721 | node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); | |
35722 | if (node == NULL) | |
35723 | goto out; | |
35724 | - preempt_disable(); | |
35725 | + local_lock(radix_tree_preloads_lock); | |
35726 | rtp = this_cpu_ptr(&radix_tree_preloads); | |
35727 | if (rtp->nr < nr) { | |
e4b2b4a8 | 35728 | node->parent = rtp->nodes; |
b3bbd485 | 35729 | @@ -524,7 +526,7 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) |
1f39f580 JK |
35730 | if (gfpflags_allow_blocking(gfp_mask)) |
35731 | return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); | |
35732 | /* Preloading doesn't help anything with this gfp mask, skip it */ | |
35733 | - preempt_disable(); | |
35734 | + local_lock(radix_tree_preloads_lock); | |
35735 | return 0; | |
1a6e0f06 | 35736 | } |
1f39f580 | 35737 | EXPORT_SYMBOL(radix_tree_maybe_preload); |
b3bbd485 | 35738 | @@ -562,7 +564,7 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) |
1a6e0f06 | 35739 | |
1f39f580 JK |
35740 | /* Preloading doesn't help anything with this gfp mask, skip it */ |
35741 | if (!gfpflags_allow_blocking(gfp_mask)) { | |
35742 | - preempt_disable(); | |
35743 | + local_lock(radix_tree_preloads_lock); | |
35744 | return 0; | |
35745 | } | |
1a6e0f06 | 35746 | |
b3bbd485 | 35747 | @@ -596,6 +598,12 @@ int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) |
1a6e0f06 JK |
35748 | return __radix_tree_preload(gfp_mask, nr_nodes); |
35749 | } | |
1a6e0f06 | 35750 | |
1f39f580 JK |
35751 | +void radix_tree_preload_end(void) |
35752 | +{ | |
35753 | + local_unlock(radix_tree_preloads_lock); | |
35754 | +} | |
35755 | +EXPORT_SYMBOL(radix_tree_preload_end); | |
35756 | + | |
e4b2b4a8 JK |
35757 | static unsigned radix_tree_load_root(const struct radix_tree_root *root, |
35758 | struct radix_tree_node **nodep, unsigned long *maxindex) | |
35759 | { | |
b3bbd485 | 35760 | @@ -2105,10 +2113,16 @@ EXPORT_SYMBOL(radix_tree_tagged); |
e4b2b4a8 JK |
35761 | void idr_preload(gfp_t gfp_mask) |
35762 | { | |
35763 | if (__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE)) | |
35764 | - preempt_disable(); | |
35765 | + local_lock(radix_tree_preloads_lock); | |
35766 | } | |
35767 | EXPORT_SYMBOL(idr_preload); | |
35768 | ||
35769 | +void idr_preload_end(void) | |
35770 | +{ | |
35771 | + local_unlock(radix_tree_preloads_lock); | |
35772 | +} | |
35773 | +EXPORT_SYMBOL(idr_preload_end); | |
35774 | + | |
35775 | /** | |
35776 | * ida_pre_get - reserve resources for ida allocation | |
35777 | * @ida: ida handle | |
b3bbd485 | 35778 | @@ -2125,7 +2139,7 @@ int ida_pre_get(struct ida *ida, gfp_t gfp) |
e4b2b4a8 JK |
35779 | * to return to the ida_pre_get() step. |
35780 | */ | |
35781 | if (!__radix_tree_preload(gfp, IDA_PRELOAD_SIZE)) | |
35782 | - preempt_enable(); | |
35783 | + local_unlock(radix_tree_preloads_lock); | |
35784 | ||
35785 | if (!this_cpu_read(ida_bitmap)) { | |
35786 | struct ida_bitmap *bitmap = kmalloc(sizeof(*bitmap), gfp); | |
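Callers see no interface change from moving radix_tree_preload_end() and idr_preload_end() out of line; they now release radix_tree_preloads_lock where mainline simply calls preempt_enable(). The canonical preload pattern still reads as below; the tree, lock, and function here are hypothetical:

    #include <linux/radix-tree.h>
    #include <linux/spinlock.h>

    static RADIX_TREE(my_tree, GFP_ATOMIC);
    static DEFINE_SPINLOCK(my_tree_lock);

    static int insert_item(unsigned long index, void *item)
    {
            int err;

            err = radix_tree_preload(GFP_KERNEL);  /* may sleep, fills the pool */
            if (err)
                    return err;
            spin_lock(&my_tree_lock);
            err = radix_tree_insert(&my_tree, index, item);
            spin_unlock(&my_tree_lock);
            radix_tree_preload_end();              /* drops the preload local lock */
            return err;
    }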
b3bbd485 JK |
35787 | diff --git a/lib/scatterlist.c b/lib/scatterlist.c |
35788 | index be7b4dd6b68d..d06c15d3d186 100644 | |
35789 | --- a/lib/scatterlist.c | |
35790 | +++ b/lib/scatterlist.c | |
35791 | @@ -620,7 +620,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) | |
1a6e0f06 JK |
35792 | flush_kernel_dcache_page(miter->page); |
35793 | ||
35794 | if (miter->__flags & SG_MITER_ATOMIC) { | |
35795 | - WARN_ON_ONCE(preemptible()); | |
35796 | + WARN_ON_ONCE(!pagefault_disabled()); | |
35797 | kunmap_atomic(miter->addr); | |
35798 | } else | |
35799 | kunmap(miter->page); | |
b3bbd485 JK |
35800 | diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c |
35801 | index 835cc6df2776..6f4a4ae881c8 100644 | |
35802 | --- a/lib/smp_processor_id.c | |
35803 | +++ b/lib/smp_processor_id.c | |
35804 | @@ -23,7 +23,7 @@ notrace static unsigned int check_preemption_disabled(const char *what1, | |
e4b2b4a8 JK |
35805 | * Kernel threads bound to a single CPU can safely use |
35806 | * smp_processor_id(): | |
35807 | */ | |
35808 | - if (cpumask_equal(¤t->cpus_allowed, cpumask_of(this_cpu))) | |
35809 | + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) | |
35810 | goto out; | |
1a6e0f06 | 35811 | |
e4b2b4a8 | 35812 | /* |
b3bbd485 JK |
35813 | diff --git a/lib/timerqueue.c b/lib/timerqueue.c |
35814 | index 4a720ed4fdaf..0d54bcbc8170 100644 | |
35815 | --- a/lib/timerqueue.c | |
35816 | +++ b/lib/timerqueue.c | |
e4b2b4a8 JK |
35817 | @@ -33,8 +33,9 @@ |
35818 | * @head: head of timerqueue | |
35819 | * @node: timer node to be added | |
35820 | * | |
35821 | - * Adds the timer node to the timerqueue, sorted by the | |
35822 | - * node's expires value. | |
35823 | + * Adds the timer node to the timerqueue, sorted by the node's expires | |
35824 | + * value. Returns true if the newly added timer is the first expiring timer in | |
35825 | + * the queue. | |
35826 | */ | |
35827 | bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) | |
35828 | { | |
b3bbd485 | 35829 | @@ -70,7 +71,8 @@ EXPORT_SYMBOL_GPL(timerqueue_add); |
e4b2b4a8 JK |
35830 | * @head: head of timerqueue |
35831 | * @node: timer node to be removed | |
35832 | * | |
35833 | - * Removes the timer node from the timerqueue. | |
35834 | + * Removes the timer node from the timerqueue. Returns true if the queue is | |
35835 | + * not empty after the remove. | |
35836 | */ | |
35837 | bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) | |
35838 | { | |
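The two kerneldoc fixes only document behaviour the functions already had: timerqueue_add() returns true when the new node becomes the earliest expiry, and timerqueue_del() returns true while the queue remains non-empty. A short usage sketch; the reprogramming step is hypothetical:

    #include <linux/timerqueue.h>

    static struct timerqueue_head event_queue;  /* timerqueue_init_head() elsewhere */

    static void queue_event(struct timerqueue_node *node, ktime_t expires)
    {
            node->expires = expires;
            if (timerqueue_add(&event_queue, node)) {
                    /* new earliest deadline: a driver would reprogram its
                     * hardware comparator here */
            }
    }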
b3bbd485 JK |
35839 | diff --git a/localversion-rt b/localversion-rt |
35840 | new file mode 100644 | |
5dd41b01 | 35841 | index 000000000000..8a777ac42aab |
b3bbd485 JK |
35842 | --- /dev/null |
35843 | +++ b/localversion-rt | |
1a6e0f06 | 35844 | @@ -0,0 +1 @@ |
5dd41b01 | 35845 | +-rt47 |
b3bbd485 JK |
35846 | diff --git a/mm/Kconfig b/mm/Kconfig |
35847 | index 59efbd3337e0..3df123c0bc3f 100644 | |
35848 | --- a/mm/Kconfig | |
35849 | +++ b/mm/Kconfig | |
35850 | @@ -385,7 +385,7 @@ config NOMMU_INITIAL_TRIM_EXCESS | |
35851 | ||
35852 | config TRANSPARENT_HUGEPAGE | |
35853 | bool "Transparent Hugepage Support" | |
35854 | - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE | |
35855 | + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT_FULL | |
35856 | select COMPACTION | |
35857 | select RADIX_TREE_MULTIORDER | |
35858 | help | |
35859 | diff --git a/mm/backing-dev.c b/mm/backing-dev.c | |
35860 | index 9386c98dac12..5e9d804c37cb 100644 | |
35861 | --- a/mm/backing-dev.c | |
35862 | +++ b/mm/backing-dev.c | |
35863 | @@ -470,9 +470,9 @@ void wb_congested_put(struct bdi_writeback_congested *congested) | |
1a6e0f06 JK |
35864 | { |
35865 | unsigned long flags; | |
35866 | ||
35867 | - local_irq_save(flags); | |
35868 | + local_irq_save_nort(flags); | |
35869 | if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { | |
35870 | - local_irq_restore(flags); | |
35871 | + local_irq_restore_nort(flags); | |
35872 | return; | |
35873 | } | |
35874 | ||
b3bbd485 JK |
35875 | diff --git a/mm/compaction.c b/mm/compaction.c |
35876 | index 85395dc6eb13..d6c8ed009e93 100644 | |
35877 | --- a/mm/compaction.c | |
35878 | +++ b/mm/compaction.c | |
35879 | @@ -1634,10 +1634,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |
1a6e0f06 JK |
35880 | block_start_pfn(cc->migrate_pfn, cc->order); |
35881 | ||
35882 | if (cc->last_migrated_pfn < current_block_start) { | |
35883 | - cpu = get_cpu(); | |
35884 | + cpu = get_cpu_light(); | |
35885 | + local_lock_irq(swapvec_lock); | |
35886 | lru_add_drain_cpu(cpu); | |
35887 | + local_unlock_irq(swapvec_lock); | |
35888 | drain_local_pages(zone); | |
35889 | - put_cpu(); | |
35890 | + put_cpu_light(); | |
35891 | /* No more flushing until we migrate again */ | |
35892 | cc->last_migrated_pfn = 0; | |
35893 | } | |
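get_cpu() disables preemption, which on RT would forbid taking the sleeping swapvec_lock right after it; the patch's get_cpu_light() keeps the task on its current CPU (via migrate_disable() on RT) while remaining preemptible. A sketch of the pairing as used above, assuming the patch's swapvec_lock declaration is in scope:

    static void drain_local_lru(void)
    {
            int cpu;

            cpu = get_cpu_light();          /* RT: migrate_disable() */
            local_lock_irq(swapvec_lock);   /* guards this CPU's lru pagevecs */
            lru_add_drain_cpu(cpu);
            local_unlock_irq(swapvec_lock);
            put_cpu_light();
    }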
b3bbd485 JK |
35894 | diff --git a/mm/filemap.c b/mm/filemap.c |
35895 | index e2e738cc08b1..c47070dae8b9 100644 | |
35896 | --- a/mm/filemap.c | |
35897 | +++ b/mm/filemap.c | |
e4b2b4a8 JK |
35898 | @@ -110,6 +110,7 @@ |
35899 | * ->i_mmap_rwsem | |
35900 | * ->tasklist_lock (memory_failure, collect_procs_ao) | |
35901 | */ | |
35902 | +DECLARE_LOCAL_IRQ_LOCK(shadow_nodes_lock); | |
35903 | ||
35904 | static int page_cache_tree_insert(struct address_space *mapping, | |
35905 | struct page *page, void **shadowp) | |
b3bbd485 | 35906 | @@ -133,8 +134,10 @@ static int page_cache_tree_insert(struct address_space *mapping, |
e4b2b4a8 JK |
35907 | if (shadowp) |
35908 | *shadowp = p; | |
1a6e0f06 | 35909 | } |
e4b2b4a8 JK |
35910 | + local_lock(shadow_nodes_lock); |
35911 | __radix_tree_replace(&mapping->page_tree, node, slot, page, | |
35912 | - workingset_update_node, mapping); | |
35913 | + __workingset_update_node, mapping); | |
35914 | + local_unlock(shadow_nodes_lock); | |
35915 | mapping->nrpages++; | |
1a6e0f06 JK |
35916 | return 0; |
35917 | } | |
b3bbd485 | 35918 | @@ -151,6 +154,7 @@ static void page_cache_tree_delete(struct address_space *mapping, |
e4b2b4a8 JK |
35919 | VM_BUG_ON_PAGE(PageTail(page), page); |
35920 | VM_BUG_ON_PAGE(nr != 1 && shadow, page); | |
35921 | ||
35922 | + local_lock(shadow_nodes_lock); | |
35923 | for (i = 0; i < nr; i++) { | |
35924 | struct radix_tree_node *node; | |
35925 | void **slot; | |
b3bbd485 | 35926 | @@ -162,8 +166,9 @@ static void page_cache_tree_delete(struct address_space *mapping, |
1a6e0f06 | 35927 | |
e4b2b4a8 JK |
35928 | radix_tree_clear_tags(&mapping->page_tree, node, slot); |
35929 | __radix_tree_replace(&mapping->page_tree, node, slot, shadow, | |
35930 | - workingset_update_node, mapping); | |
35931 | + __workingset_update_node, mapping); | |
35932 | } | |
35933 | + local_unlock(shadow_nodes_lock); | |
35934 | ||
35935 | if (shadow) { | |
35936 | mapping->nrexceptional += nr; | |
b3bbd485 JK |
35937 | diff --git a/mm/highmem.c b/mm/highmem.c |
35938 | index 59db3223a5d6..22aa3ddbd87b 100644 | |
35939 | --- a/mm/highmem.c | |
35940 | +++ b/mm/highmem.c | |
e4b2b4a8 | 35941 | @@ -30,10 +30,11 @@ |
1a6e0f06 JK |
35942 | #include <linux/kgdb.h> |
35943 | #include <asm/tlbflush.h> | |
35944 | ||
35945 | - | |
35946 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35947 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) | |
35948 | DEFINE_PER_CPU(int, __kmap_atomic_idx); | |
35949 | #endif | |
35950 | +#endif | |
35951 | ||
35952 | /* | |
35953 | * Virtual_count is not a pure "count". | |
b3bbd485 | 35954 | @@ -108,8 +109,9 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) |
1a6e0f06 JK |
35955 | unsigned long totalhigh_pages __read_mostly; |
35956 | EXPORT_SYMBOL(totalhigh_pages); | |
35957 | ||
35958 | - | |
35959 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
35960 | EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); | |
35961 | +#endif | |
35962 | ||
35963 | unsigned int nr_free_highpages (void) | |
35964 | { | |
b3bbd485 JK |
35965 | diff --git a/mm/memcontrol.c b/mm/memcontrol.c |
35966 | index 6a9a7e1066ef..3cc297730103 100644 | |
35967 | --- a/mm/memcontrol.c | |
35968 | +++ b/mm/memcontrol.c | |
e4b2b4a8 | 35969 | @@ -69,6 +69,7 @@ |
1a6e0f06 JK |
35970 | #include <net/sock.h> |
35971 | #include <net/ip.h> | |
35972 | #include "slab.h" | |
35973 | +#include <linux/locallock.h> | |
35974 | ||
e4b2b4a8 | 35975 | #include <linux/uaccess.h> |
1a6e0f06 | 35976 | |
b3bbd485 | 35977 | @@ -94,6 +95,8 @@ int do_swap_account __read_mostly; |
1a6e0f06 JK |
35978 | #define do_swap_account 0 |
35979 | #endif | |
35980 | ||
35981 | +static DEFINE_LOCAL_IRQ_LOCK(event_lock); | |
35982 | + | |
35983 | /* Whether legacy memory+swap accounting is active */ | |
35984 | static bool do_memsw_account(void) | |
35985 | { | |
b3bbd485 | 35986 | @@ -1831,7 +1834,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) |
e4b2b4a8 JK |
35987 | * as well as workers from this path always operate on the local |
35988 | * per-cpu data. CPU up doesn't touch memcg_stock at all. | |
35989 | */ | |
1a6e0f06 JK |
35990 | - curcpu = get_cpu(); |
35991 | + curcpu = get_cpu_light(); | |
35992 | for_each_online_cpu(cpu) { | |
35993 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | |
35994 | struct mem_cgroup *memcg; | |
b3bbd485 | 35995 | @@ -1851,7 +1854,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) |
1a6e0f06 | 35996 | } |
e4b2b4a8 | 35997 | css_put(&memcg->css); |
1a6e0f06 JK |
35998 | } |
35999 | - put_cpu(); | |
36000 | + put_cpu_light(); | |
1a6e0f06 JK |
36001 | mutex_unlock(&percpu_charge_mutex); |
36002 | } | |
e4b2b4a8 | 36003 | |
b3bbd485 | 36004 | @@ -4631,12 +4634,12 @@ static int mem_cgroup_move_account(struct page *page, |
1a6e0f06 JK |
36005 | |
36006 | ret = 0; | |
36007 | ||
36008 | - local_irq_disable(); | |
36009 | + local_lock_irq(event_lock); | |
36010 | mem_cgroup_charge_statistics(to, page, compound, nr_pages); | |
36011 | memcg_check_events(to, page); | |
36012 | mem_cgroup_charge_statistics(from, page, compound, -nr_pages); | |
36013 | memcg_check_events(from, page); | |
36014 | - local_irq_enable(); | |
36015 | + local_unlock_irq(event_lock); | |
36016 | out_unlock: | |
36017 | unlock_page(page); | |
36018 | out: | |
b3bbd485 | 36019 | @@ -5579,10 +5582,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
1a6e0f06 JK |
36020 | |
36021 | commit_charge(page, memcg, lrucare); | |
36022 | ||
36023 | - local_irq_disable(); | |
36024 | + local_lock_irq(event_lock); | |
36025 | mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); | |
36026 | memcg_check_events(memcg, page); | |
36027 | - local_irq_enable(); | |
36028 | + local_unlock_irq(event_lock); | |
36029 | ||
36030 | if (do_memsw_account() && PageSwapCache(page)) { | |
36031 | swp_entry_t entry = { .val = page_private(page) }; | |
b3bbd485 | 36032 | @@ -5651,7 +5654,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) |
e4b2b4a8 | 36033 | memcg_oom_recover(ug->memcg); |
1a6e0f06 JK |
36034 | } |
36035 | ||
36036 | - local_irq_save(flags); | |
36037 | + local_lock_irqsave(event_lock, flags); | |
e4b2b4a8 JK |
36038 | __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); |
36039 | __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); | |
36040 | __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); | |
b3bbd485 | 36041 | @@ -5659,7 +5662,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) |
e4b2b4a8 JK |
36042 | __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); |
36043 | __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); | |
36044 | memcg_check_events(ug->memcg, ug->dummy_page); | |
1a6e0f06 JK |
36045 | - local_irq_restore(flags); |
36046 | + local_unlock_irqrestore(event_lock, flags); | |
36047 | ||
e4b2b4a8 JK |
36048 | if (!mem_cgroup_is_root(ug->memcg)) |
36049 | css_put_many(&ug->memcg->css, nr_pages); | |
b3bbd485 | 36050 | @@ -5822,10 +5825,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) |
1a6e0f06 JK |
36051 | |
36052 | commit_charge(newpage, memcg, false); | |
36053 | ||
36054 | - local_irq_save(flags); | |
36055 | + local_lock_irqsave(event_lock, flags); | |
36056 | mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); | |
36057 | memcg_check_events(memcg, newpage); | |
36058 | - local_irq_restore(flags); | |
36059 | + local_unlock_irqrestore(event_lock, flags); | |
36060 | } | |
36061 | ||
36062 | DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); | |
b3bbd485 | 36063 | @@ -6017,6 +6020,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
1a6e0f06 | 36064 | struct mem_cgroup *memcg, *swap_memcg; |
e4b2b4a8 | 36065 | unsigned int nr_entries; |
1a6e0f06 JK |
36066 | unsigned short oldid; |
36067 | + unsigned long flags; | |
36068 | ||
36069 | VM_BUG_ON_PAGE(PageLRU(page), page); | |
36070 | VM_BUG_ON_PAGE(page_count(page), page); | |
b3bbd485 | 36071 | @@ -6062,13 +6066,17 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
1a6e0f06 JK |
36072 | * important here to have the interrupts disabled because it is the |
36073 | * only synchronisation we have for updating the per-CPU variables.
36074 | */ | |
36075 | + local_lock_irqsave(event_lock, flags); | |
36076 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
36077 | VM_BUG_ON(!irqs_disabled()); | |
36078 | +#endif | |
e4b2b4a8 JK |
36079 | mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), |
36080 | -nr_entries); | |
1a6e0f06 JK |
36081 | memcg_check_events(memcg, page); |
36082 | ||
36083 | if (!mem_cgroup_is_root(memcg)) | |
e4b2b4a8 | 36084 | css_put_many(&memcg->css, nr_entries); |
1a6e0f06 JK |
36085 | + local_unlock_irqrestore(event_lock, flags); |
36086 | } | |
36087 | ||
e4b2b4a8 | 36088 | /** |
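One subtlety above: mem_cgroup_swapout() used to assert VM_BUG_ON(!irqs_disabled()) because its callers ran with interrupts off. Under event_lock that is only true on !RT, so the assertion is compiled out for RT rather than relaxed for everyone. Generic shape of the pattern, with hypothetical names:

    #include <linux/locallock.h>
    #include <linux/mmdebug.h>

    static DEFINE_LOCAL_IRQ_LOCK(stats_lock);

    static void update_stats(void)
    {
            unsigned long flags;

            local_lock_irqsave(stats_lock, flags);
    #ifndef CONFIG_PREEMPT_RT_BASE
            VM_BUG_ON(!irqs_disabled());  /* only holds when the lock masks IRQs */
    #endif
            /* per-CPU statistics update */
            local_unlock_irqrestore(stats_lock, flags);
    }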
b3bbd485 JK |
36089 | diff --git a/mm/mmu_context.c b/mm/mmu_context.c |
36090 | index 3e612ae748e9..d0ccc070979f 100644 | |
36091 | --- a/mm/mmu_context.c | |
36092 | +++ b/mm/mmu_context.c | |
36093 | @@ -25,6 +25,7 @@ void use_mm(struct mm_struct *mm) | |
1a6e0f06 JK |
36094 | struct task_struct *tsk = current; |
36095 | ||
36096 | task_lock(tsk); | |
36097 | + preempt_disable_rt(); | |
36098 | active_mm = tsk->active_mm; | |
36099 | if (active_mm != mm) { | |
e4b2b4a8 | 36100 | mmgrab(mm); |
b3bbd485 | 36101 | @@ -32,6 +33,7 @@ void use_mm(struct mm_struct *mm) |
1a6e0f06 JK |
36102 | } |
36103 | tsk->mm = mm; | |
36104 | switch_mm(active_mm, mm, tsk); | |
36105 | + preempt_enable_rt(); | |
36106 | task_unlock(tsk); | |
36107 | #ifdef finish_arch_post_lock_switch | |
36108 | finish_arch_post_lock_switch(); | |
b3bbd485 | 36109 | diff --git a/mm/page_alloc.c b/mm/page_alloc.c |
5dd41b01 | 36110 | index a604b5da6755..525a6f2d5144 100644 |
b3bbd485 JK |
36111 | --- a/mm/page_alloc.c |
36112 | +++ b/mm/page_alloc.c | |
1a6e0f06 | 36113 | @@ -61,6 +61,7 @@ |
1a6e0f06 JK |
36114 | #include <linux/hugetlb.h> |
36115 | #include <linux/sched/rt.h> | |
e4b2b4a8 | 36116 | #include <linux/sched/mm.h> |
1a6e0f06 JK |
36117 | +#include <linux/locallock.h> |
36118 | #include <linux/page_owner.h> | |
36119 | #include <linux/kthread.h> | |
36120 | #include <linux/memcontrol.h> | |
b3bbd485 | 36121 | @@ -286,6 +287,18 @@ EXPORT_SYMBOL(nr_node_ids); |
1a6e0f06 JK |
36122 | EXPORT_SYMBOL(nr_online_nodes); |
36123 | #endif | |
36124 | ||
36125 | +static DEFINE_LOCAL_IRQ_LOCK(pa_lock); | |
36126 | + | |
36127 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
36128 | +# define cpu_lock_irqsave(cpu, flags) \ | |
36129 | + local_lock_irqsave_on(pa_lock, flags, cpu) | |
36130 | +# define cpu_unlock_irqrestore(cpu, flags) \ | |
36131 | + local_unlock_irqrestore_on(pa_lock, flags, cpu) | |
36132 | +#else | |
36133 | +# define cpu_lock_irqsave(cpu, flags) local_irq_save(flags) | |
36134 | +# define cpu_unlock_irqrestore(cpu, flags) local_irq_restore(flags) | |
36135 | +#endif | |
36136 | + | |
36137 | int page_group_by_mobility_disabled __read_mostly; | |
36138 | ||
36139 | #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT | |
b3bbd485 | 36140 | @@ -1094,7 +1107,7 @@ static bool bulkfree_pcp_prepare(struct page *page) |
1a6e0f06 JK |
36141 | #endif /* CONFIG_DEBUG_VM */ |
36142 | ||
36143 | /* | |
36144 | - * Frees a number of pages from the PCP lists | |
36145 | + * Frees a number of pages which have been collected from the pcp lists. | |
36146 | * Assumes all pages on list are in same zone, and of same order. | |
36147 | * count is the number of pages to free. | |
36148 | * | |
b3bbd485 | 36149 | @@ -1105,15 +1118,53 @@ static bool bulkfree_pcp_prepare(struct page *page) |
1a6e0f06 JK |
36150 | * pinned" detection logic. |
36151 | */ | |
36152 | static void free_pcppages_bulk(struct zone *zone, int count, | |
36153 | - struct per_cpu_pages *pcp) | |
36154 | + struct list_head *list) | |
36155 | { | |
36156 | - int migratetype = 0; | |
36157 | - int batch_free = 0; | |
1a6e0f06 JK |
36158 | bool isolated_pageblocks; |
36159 | + unsigned long flags; | |
1a6e0f06 JK |
36160 | |
36161 | - spin_lock(&zone->lock); | |
e4b2b4a8 | 36162 | + spin_lock_irqsave(&zone->lock, flags); |
1a6e0f06 | 36163 | isolated_pageblocks = has_isolate_pageblock(zone); |
1a6e0f06 JK |
36164 | |
36165 | + while (!list_empty(list)) { | |
36166 | + struct page *page; | |
e4b2b4a8 | 36167 | + int mt; /* migratetype of the to-be-freed page */ |
1a6e0f06 JK |
36168 | + |
36169 | + page = list_first_entry(list, struct page, lru); | |
36170 | + /* must delete as __free_one_page list manipulates */ | |
36171 | + list_del(&page->lru); | |
36172 | + | |
36173 | + mt = get_pcppage_migratetype(page); | |
36174 | + /* MIGRATE_ISOLATE page should not go to pcplists */ | |
36175 | + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); | |
36176 | + /* Pageblock could have been isolated meanwhile */ | |
36177 | + if (unlikely(isolated_pageblocks)) | |
36178 | + mt = get_pageblock_migratetype(page); | |
36179 | + | |
36180 | + if (bulkfree_pcp_prepare(page)) | |
36181 | + continue; | |
36182 | + | |
36183 | + __free_one_page(page, page_to_pfn(page), zone, 0, mt); | |
36184 | + trace_mm_page_pcpu_drain(page, 0, mt); | |
36185 | + count--; | |
36186 | + } | |
36187 | + WARN_ON(count != 0); | |
36188 | + spin_unlock_irqrestore(&zone->lock, flags); | |
36189 | +} | |
36190 | + | |
36191 | +/* | |
36192 | + * Moves a number of pages from the PCP lists to free list which | |
36193 | + * is freed outside of the locked region. | |
36194 | + * | |
36195 | + * Assumes all pages on list are in same zone, and of same order. | |
36196 | + * count is the number of pages to free. | |
36197 | + */ | |
36198 | +static void isolate_pcp_pages(int count, struct per_cpu_pages *src, | |
36199 | + struct list_head *dst) | |
36200 | +{ | |
36201 | + int migratetype = 0; | |
36202 | + int batch_free = 0; | |
36203 | + | |
36204 | while (count) { | |
36205 | struct page *page; | |
36206 | struct list_head *list; | |
b3bbd485 | 36207 | @@ -1129,7 +1180,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, |
1a6e0f06 JK |
36208 | batch_free++; |
36209 | if (++migratetype == MIGRATE_PCPTYPES) | |
36210 | migratetype = 0; | |
36211 | - list = &pcp->lists[migratetype]; | |
36212 | + list = &src->lists[migratetype]; | |
36213 | } while (list_empty(list)); | |
36214 | ||
36215 | /* This is the only non-empty list. Free them all. */ | |
b3bbd485 | 36216 | @@ -1137,27 +1188,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, |
1a6e0f06 JK |
36217 | batch_free = count; |
36218 | ||
36219 | do { | |
36220 | - int mt; /* migratetype of the to-be-freed page */ | |
36221 | - | |
36222 | page = list_last_entry(list, struct page, lru); | |
36223 | - /* must delete as __free_one_page list manipulates */ | |
36224 | list_del(&page->lru); | |
36225 | ||
36226 | - mt = get_pcppage_migratetype(page); | |
36227 | - /* MIGRATE_ISOLATE page should not go to pcplists */ | |
36228 | - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); | |
36229 | - /* Pageblock could have been isolated meanwhile */ | |
36230 | - if (unlikely(isolated_pageblocks)) | |
36231 | - mt = get_pageblock_migratetype(page); | |
36232 | - | |
36233 | - if (bulkfree_pcp_prepare(page)) | |
36234 | - continue; | |
36235 | - | |
36236 | - __free_one_page(page, page_to_pfn(page), zone, 0, mt); | |
36237 | - trace_mm_page_pcpu_drain(page, 0, mt); | |
36238 | + list_add(&page->lru, dst); | |
36239 | } while (--count && --batch_free && !list_empty(list)); | |
36240 | } | |
36241 | - spin_unlock(&zone->lock); | |
36242 | } | |
36243 | ||
36244 | static void free_one_page(struct zone *zone, | |
b3bbd485 | 36245 | @@ -1165,13 +1201,15 @@ static void free_one_page(struct zone *zone, |
e4b2b4a8 | 36246 | unsigned int order, |
1a6e0f06 JK |
36247 | int migratetype) |
36248 | { | |
1a6e0f06 JK |
36249 | - spin_lock(&zone->lock); |
36250 | + unsigned long flags; | |
36251 | + | |
36252 | + spin_lock_irqsave(&zone->lock, flags); | |
e4b2b4a8 JK |
36253 | if (unlikely(has_isolate_pageblock(zone) || |
36254 | is_migrate_isolate(migratetype))) { | |
1a6e0f06 JK |
36255 | migratetype = get_pfnblock_migratetype(page, pfn); |
36256 | } | |
36257 | __free_one_page(page, pfn, zone, order, migratetype); | |
36258 | - spin_unlock(&zone->lock); | |
36259 | + spin_unlock_irqrestore(&zone->lock, flags); | |
36260 | } | |
36261 | ||
36262 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, | |
b3bbd485 | 36263 | @@ -1257,10 +1295,10 @@ static void __free_pages_ok(struct page *page, unsigned int order) |
1a6e0f06 JK |
36264 | return; |
36265 | ||
36266 | migratetype = get_pfnblock_migratetype(page, pfn); | |
36267 | - local_irq_save(flags); | |
36268 | + local_lock_irqsave(pa_lock, flags); | |
36269 | __count_vm_events(PGFREE, 1 << order); | |
36270 | free_one_page(page_zone(page), page, pfn, order, migratetype); | |
36271 | - local_irq_restore(flags); | |
36272 | + local_unlock_irqrestore(pa_lock, flags); | |
36273 | } | |
36274 | ||
36275 | static void __init __free_pages_boot_core(struct page *page, unsigned int order) | |
b3bbd485 | 36276 | @@ -2378,16 +2416,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1a6e0f06 JK |
36277 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
36278 | { | |
36279 | unsigned long flags; | |
36280 | + LIST_HEAD(dst); | |
36281 | int to_drain, batch; | |
36282 | ||
36283 | - local_irq_save(flags); | |
36284 | + local_lock_irqsave(pa_lock, flags); | |
36285 | batch = READ_ONCE(pcp->batch); | |
36286 | to_drain = min(pcp->count, batch); | |
36287 | if (to_drain > 0) { | |
36288 | - free_pcppages_bulk(zone, to_drain, pcp); | |
36289 | + isolate_pcp_pages(to_drain, pcp, &dst); | |
36290 | pcp->count -= to_drain; | |
36291 | } | |
36292 | - local_irq_restore(flags); | |
36293 | + local_unlock_irqrestore(pa_lock, flags); | |
36294 | + free_pcppages_bulk(zone, to_drain, &dst); | |
36295 | } | |
36296 | #endif | |
36297 | ||
b3bbd485 | 36298 | @@ -2403,16 +2443,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
1a6e0f06 JK |
36299 | unsigned long flags; |
36300 | struct per_cpu_pageset *pset; | |
36301 | struct per_cpu_pages *pcp; | |
36302 | + LIST_HEAD(dst); | |
36303 | + int count; | |
36304 | ||
36305 | - local_irq_save(flags); | |
36306 | + cpu_lock_irqsave(cpu, flags); | |
36307 | pset = per_cpu_ptr(zone->pageset, cpu); | |
36308 | ||
36309 | pcp = &pset->pcp; | |
36310 | - if (pcp->count) { | |
36311 | - free_pcppages_bulk(zone, pcp->count, pcp); | |
36312 | + count = pcp->count; | |
36313 | + if (count) { | |
36314 | + isolate_pcp_pages(count, pcp, &dst); | |
36315 | pcp->count = 0; | |
36316 | } | |
36317 | - local_irq_restore(flags); | |
36318 | + cpu_unlock_irqrestore(cpu, flags); | |
36319 | + if (count) | |
36320 | + free_pcppages_bulk(zone, count, &dst); | |
36321 | } | |
36322 | ||
36323 | /* | |
b3bbd485 | 36324 | @@ -2447,6 +2492,7 @@ void drain_local_pages(struct zone *zone) |
e4b2b4a8 JK |
36325 | drain_pages(cpu); |
36326 | } | |
36327 | ||
36328 | +#ifndef CONFIG_PREEMPT_RT_BASE | |
36329 | static void drain_local_pages_wq(struct work_struct *work) | |
36330 | { | |
36331 | /* | |
b3bbd485 | 36332 | @@ -2460,6 +2506,7 @@ static void drain_local_pages_wq(struct work_struct *work) |
e4b2b4a8 JK |
36333 | drain_local_pages(NULL); |
36334 | preempt_enable(); | |
36335 | } | |
36336 | +#endif | |
36337 | ||
36338 | /* | |
36339 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. | |
b3bbd485 | 36340 | @@ -2526,7 +2573,14 @@ void drain_all_pages(struct zone *zone) |
1a6e0f06 JK |
36341 | else |
36342 | cpumask_clear_cpu(cpu, &cpus_with_pcps); | |
36343 | } | |
e4b2b4a8 JK |
36344 | - |
36345 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
1a6e0f06 JK |
36346 | + for_each_cpu(cpu, &cpus_with_pcps) { |
36347 | + if (zone) | |
36348 | + drain_pages_zone(cpu, zone); | |
36349 | + else | |
36350 | + drain_pages(cpu); | |
36351 | + } | |
e4b2b4a8 JK |
36352 | +#else |
36353 | for_each_cpu(cpu, &cpus_with_pcps) { | |
36354 | struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); | |
36355 | INIT_WORK(work, drain_local_pages_wq); | |
b3bbd485 | 36356 | @@ -2534,6 +2588,7 @@ void drain_all_pages(struct zone *zone) |
e4b2b4a8 JK |
36357 | } |
36358 | for_each_cpu(cpu, &cpus_with_pcps) | |
36359 | flush_work(per_cpu_ptr(&pcpu_drain, cpu)); | |
1a6e0f06 | 36360 | +#endif |
1a6e0f06 | 36361 | |
e4b2b4a8 JK |
36362 | mutex_unlock(&pcpu_drain_mutex); |
36363 | } | |
b3bbd485 | 36364 | @@ -2610,7 +2665,7 @@ void free_hot_cold_page(struct page *page, bool cold) |
1a6e0f06 JK |
36365 | |
36366 | migratetype = get_pfnblock_migratetype(page, pfn); | |
36367 | set_pcppage_migratetype(page, migratetype); | |
36368 | - local_irq_save(flags); | |
36369 | + local_lock_irqsave(pa_lock, flags); | |
36370 | __count_vm_event(PGFREE); | |
36371 | ||
36372 | /* | |
b3bbd485 | 36373 | @@ -2636,12 +2691,17 @@ void free_hot_cold_page(struct page *page, bool cold) |
1a6e0f06 JK |
36374 | pcp->count++; |
36375 | if (pcp->count >= pcp->high) { | |
36376 | unsigned long batch = READ_ONCE(pcp->batch); | |
36377 | - free_pcppages_bulk(zone, batch, pcp); | |
36378 | + LIST_HEAD(dst); | |
36379 | + | |
36380 | + isolate_pcp_pages(batch, pcp, &dst); | |
36381 | pcp->count -= batch; | |
36382 | + local_unlock_irqrestore(pa_lock, flags); | |
36383 | + free_pcppages_bulk(zone, batch, &dst); | |
36384 | + return; | |
36385 | } | |
36386 | ||
36387 | out: | |
36388 | - local_irq_restore(flags); | |
36389 | + local_unlock_irqrestore(pa_lock, flags); | |
36390 | } | |
36391 | ||
36392 | /* | |
b3bbd485 | 36393 | @@ -2789,7 +2849,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
e4b2b4a8 JK |
36394 | struct page *page; |
36395 | unsigned long flags; | |
1a6e0f06 | 36396 | |
e4b2b4a8 JK |
36397 | - local_irq_save(flags); |
36398 | + local_lock_irqsave(pa_lock, flags); | |
36399 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | |
36400 | list = &pcp->lists[migratetype]; | |
36401 | page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); | |
b3bbd485 | 36402 | @@ -2797,7 +2857,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, |
e4b2b4a8 JK |
36403 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
36404 | zone_statistics(preferred_zone, zone); | |
1a6e0f06 | 36405 | } |
e4b2b4a8 JK |
36406 | - local_irq_restore(flags); |
36407 | + local_unlock_irqrestore(pa_lock, flags); | |
36408 | return page; | |
36409 | } | |
36410 | ||
b3bbd485 | 36411 | @@ -2824,7 +2884,7 @@ struct page *rmqueue(struct zone *preferred_zone, |
e4b2b4a8 JK |
36412 | * allocate greater than order-1 page units with __GFP_NOFAIL. |
36413 | */ | |
36414 | WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); | |
36415 | - spin_lock_irqsave(&zone->lock, flags); | |
36416 | + local_spin_lock_irqsave(pa_lock, &zone->lock, flags); | |
36417 | ||
36418 | do { | |
36419 | page = NULL; | |
b3bbd485 | 36420 | @@ -2844,14 +2904,14 @@ struct page *rmqueue(struct zone *preferred_zone, |
1a6e0f06 JK |
36421 | |
36422 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); | |
e4b2b4a8 | 36423 | zone_statistics(preferred_zone, zone); |
1a6e0f06 JK |
36424 | - local_irq_restore(flags); |
36425 | + local_unlock_irqrestore(pa_lock, flags); | |
36426 | ||
e4b2b4a8 JK |
36427 | out: |
36428 | VM_BUG_ON_PAGE(page && bad_range(zone, page), page); | |
1a6e0f06 JK |
36429 | return page; |
36430 | ||
36431 | failed: | |
36432 | - local_irq_restore(flags); | |
36433 | + local_unlock_irqrestore(pa_lock, flags); | |
36434 | return NULL; | |
36435 | } | |
36436 | ||
5dd41b01 | 36437 | @@ -6785,8 +6845,9 @@ void __init free_area_init(unsigned long *zones_size) |
1a6e0f06 | 36438 | |
e4b2b4a8 | 36439 | static int page_alloc_cpu_dead(unsigned int cpu) |
1a6e0f06 | 36440 | { |
e4b2b4a8 JK |
36441 | - |
36442 | + local_lock_irq_on(swapvec_lock, cpu); | |
36443 | lru_add_drain_cpu(cpu); | |
36444 | + local_unlock_irq_on(swapvec_lock, cpu); | |
36445 | drain_pages(cpu); | |
1a6e0f06 | 36446 | |
e4b2b4a8 | 36447 | /* |
5dd41b01 | 36448 | @@ -7690,7 +7751,7 @@ void zone_pcp_reset(struct zone *zone) |
1a6e0f06 JK |
36449 | struct per_cpu_pageset *pset; |
36450 | ||
36451 | /* avoid races with drain_pages() */ | |
36452 | - local_irq_save(flags); | |
36453 | + local_lock_irqsave(pa_lock, flags); | |
36454 | if (zone->pageset != &boot_pageset) { | |
36455 | for_each_online_cpu(cpu) { | |
36456 | pset = per_cpu_ptr(zone->pageset, cpu); | |
5dd41b01 | 36457 | @@ -7699,7 +7760,7 @@ void zone_pcp_reset(struct zone *zone) |
1a6e0f06 JK |
36458 | free_percpu(zone->pageset); |
36459 | zone->pageset = &boot_pageset; | |
36460 | } | |
36461 | - local_irq_restore(flags); | |
36462 | + local_unlock_irqrestore(pa_lock, flags); | |
36463 | } | |
36464 | ||
36465 | #ifdef CONFIG_MEMORY_HOTREMOVE | |
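The structural change running through the page_alloc.c hunks is a two-phase drain: isolate_pcp_pages() detaches pages from the per-CPU lists onto a stack-local list while pa_lock is held, and free_pcppages_bulk() then takes zone->lock (now with its own irqsave) to do the expensive buddy merging outside the per-CPU critical section. That bounds how long the hot lock is held. A generic sketch of the shape, with illustrative types:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct obj {
            struct list_head lru;
    };

    static void drain_in_two_phases(struct list_head *pcplist,
                                    spinlock_t *hot, spinlock_t *cold)
    {
            LIST_HEAD(tofree);
            struct obj *o, *tmp;
            unsigned long flags;

            spin_lock_irqsave(hot, flags);
            list_splice_init(pcplist, &tofree);  /* O(1) under the hot lock */
            spin_unlock_irqrestore(hot, flags);

            spin_lock_irqsave(cold, flags);
            list_for_each_entry_safe(o, tmp, &tofree, lru) {
                    list_del(&o->lru);
                    /* expensive per-object free/merge work */
            }
            spin_unlock_irqrestore(cold, flags);
    }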
b3bbd485 JK |
36466 | diff --git a/mm/slab.h b/mm/slab.h |
36467 | index 485d9fbb8802..f3b06c48bf39 100644 | |
36468 | --- a/mm/slab.h | |
36469 | +++ b/mm/slab.h | |
36470 | @@ -451,7 +451,11 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |
1a6e0f06 JK |
36471 | * The slab lists for all objects. |
36472 | */ | |
36473 | struct kmem_cache_node { | |
36474 | +#ifdef CONFIG_SLUB | |
36475 | + raw_spinlock_t list_lock; | |
36476 | +#else | |
36477 | spinlock_t list_lock; | |
36478 | +#endif | |
36479 | ||
36480 | #ifdef CONFIG_SLAB | |
36481 | struct list_head slabs_partial; /* partial list first, better asm code */ | |
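On RT, spinlock_t turns into a sleeping rt_mutex while raw_spinlock_t keeps true IRQ-off spinning. SLUB's list_lock is switched to raw because it is taken deep in allocator paths that already run with interrupts disabled, where a sleeping lock would be illegal; the hold times there are short and bounded, which is the usual justification for a raw lock. A minimal illustration:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    static DEFINE_RAW_SPINLOCK(node_list_lock);  /* still spins on RT */
    static LIST_HEAD(partial_list);

    static void add_partial_example(struct list_head *entry)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&node_list_lock, flags);
            list_add(entry, &partial_list);      /* O(1), bounded hold time */
            raw_spin_unlock_irqrestore(&node_list_lock, flags);
    }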
b3bbd485 | 36482 | diff --git a/mm/slub.c b/mm/slub.c |
5dd41b01 | 36483 | index 220d42e592ef..9b337c28dd1f 100644 |
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1179,7 +1179,7 @@ static noinline int free_debug_processing(
 	unsigned long uninitialized_var(flags);
 	int ret = 0;
 
-	spin_lock_irqsave(&n->list_lock, flags);
+	raw_spin_lock_irqsave(&n->list_lock, flags);
 	slab_lock(page);
 
 	if (s->flags & SLAB_CONSISTENCY_CHECKS) {
@@ -1214,7 +1214,7 @@ static noinline int free_debug_processing(
 			 bulk_cnt, cnt);
 
 	slab_unlock(page);
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	if (!ret)
 		slab_fix(s, "Object at 0x%p not freed", object);
 	return ret;
@@ -1342,6 +1342,12 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
 
 #endif /* CONFIG_SLUB_DEBUG */
 
+struct slub_free_list {
+	raw_spinlock_t		lock;
+	struct list_head	list;
+};
+static DEFINE_PER_CPU(struct slub_free_list, slub_free_list);
+
 /*
  * Hooks for other subsystems that check memory allocations. In a typical
  * production configuration these hooks all should produce no code at all.
@@ -1561,10 +1567,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	void *start, *p;
 	int idx, order;
 	bool shuffle;
+	bool enableirqs = false;
 
 	flags &= gfp_allowed_mask;
 
 	if (gfpflags_allow_blocking(flags))
+		enableirqs = true;
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (system_state > SYSTEM_BOOTING)
+		enableirqs = true;
+#endif
+	if (enableirqs)
 		local_irq_enable();
 
 	flags |= s->allocflags;
@@ -1623,7 +1636,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	page->frozen = 1;
 
 out:
-	if (gfpflags_allow_blocking(flags))
+	if (enableirqs)
 		local_irq_disable();
 	if (!page)
 		return NULL;
@@ -1681,6 +1694,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__free_pages(page, order);
 }
 
+static void free_delayed(struct list_head *h)
+{
+	while(!list_empty(h)) {
+		struct page *page = list_first_entry(h, struct page, lru);
+
+		list_del(&page->lru);
+		__free_slab(page->slab_cache, page);
+	}
+}
+
 #define need_reserve_slab_rcu						\
 	(sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
 
@@ -1712,6 +1735,12 @@ static void free_slab(struct kmem_cache *s, struct page *page)
 		}
 
 		call_rcu(head, rcu_free_slab);
+	} else if (irqs_disabled()) {
+		struct slub_free_list *f = this_cpu_ptr(&slub_free_list);
+
+		raw_spin_lock(&f->lock);
+		list_add(&page->lru, &f->list);
+		raw_spin_unlock(&f->lock);
 	} else
 		__free_slab(s, page);
 }
@@ -1819,7 +1848,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 	if (!n || !n->nr_partial)
 		return NULL;
 
-	spin_lock(&n->list_lock);
+	raw_spin_lock(&n->list_lock);
 	list_for_each_entry_safe(page, page2, &n->partial, lru) {
 		void *t;
 
@@ -1844,7 +1873,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
 			break;
 
 	}
-	spin_unlock(&n->list_lock);
+	raw_spin_unlock(&n->list_lock);
 	return object;
 }
 
@@ -2090,7 +2119,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 			 * that acquire_slab() will see a slab page that
 			 * is frozen
 			 */
-			spin_lock(&n->list_lock);
+			raw_spin_lock(&n->list_lock);
 		}
 	} else {
 		m = M_FULL;
@@ -2101,7 +2130,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 			 * slabs from diagnostic functions will not see
 			 * any frozen slabs.
 			 */
-			spin_lock(&n->list_lock);
+			raw_spin_lock(&n->list_lock);
 		}
 	}
 
@@ -2136,7 +2165,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
 		goto redo;
 
 	if (lock)
-		spin_unlock(&n->list_lock);
+		raw_spin_unlock(&n->list_lock);
 
 	if (m == M_FREE) {
 		stat(s, DEACTIVATE_EMPTY);
@@ -2171,10 +2200,10 @@ static void unfreeze_partials(struct kmem_cache *s,
 		n2 = get_node(s, page_to_nid(page));
 		if (n != n2) {
 			if (n)
-				spin_unlock(&n->list_lock);
+				raw_spin_unlock(&n->list_lock);
 
 			n = n2;
-			spin_lock(&n->list_lock);
+			raw_spin_lock(&n->list_lock);
 		}
 
 		do {
@@ -2203,7 +2232,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 	}
 
 	if (n)
-		spin_unlock(&n->list_lock);
+		raw_spin_unlock(&n->list_lock);
 
 	while (discard_page) {
 		page = discard_page;
@@ -2242,14 +2271,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 			pobjects = oldpage->pobjects;
 			pages = oldpage->pages;
 			if (drain && pobjects > s->cpu_partial) {
+				struct slub_free_list *f;
 				unsigned long flags;
+				LIST_HEAD(tofree);
 				/*
 				 * partial array is full. Move the existing
 				 * set to the per node partial list.
 				 */
 				local_irq_save(flags);
 				unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+				f = this_cpu_ptr(&slub_free_list);
+				raw_spin_lock(&f->lock);
+				list_splice_init(&f->list, &tofree);
+				raw_spin_unlock(&f->lock);
 				local_irq_restore(flags);
+				free_delayed(&tofree);
 				oldpage = NULL;
 				pobjects = 0;
 				pages = 0;
@@ -2319,7 +2355,22 @@ static bool has_cpu_slab(int cpu, void *info)
 
 static void flush_all(struct kmem_cache *s)
 {
+	LIST_HEAD(tofree);
+	int cpu;
+
 	on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC);
+	for_each_online_cpu(cpu) {
+		struct slub_free_list *f;
+
+		if (!has_cpu_slab(cpu, s))
+			continue;
+
+		f = &per_cpu(slub_free_list, cpu);
+		raw_spin_lock_irq(&f->lock);
+		list_splice_init(&f->list, &tofree);
+		raw_spin_unlock_irq(&f->lock);
+		free_delayed(&tofree);
+	}
 }
 
 /*
@@ -2374,10 +2425,10 @@ static unsigned long count_partial(struct kmem_cache_node *n,
 	unsigned long x = 0;
 	struct page *page;
 
-	spin_lock_irqsave(&n->list_lock, flags);
+	raw_spin_lock_irqsave(&n->list_lock, flags);
 	list_for_each_entry(page, &n->partial, lru)
 		x += get_count(page);
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	return x;
 }
 #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */
@@ -2515,8 +2566,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
  * already disabled (which is the case for bulk allocation).
  */
 static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
-			  unsigned long addr, struct kmem_cache_cpu *c)
+			  unsigned long addr, struct kmem_cache_cpu *c,
+			  struct list_head *to_free)
 {
+	struct slub_free_list *f;
 	void *freelist;
 	struct page *page;
 
@@ -2572,6 +2625,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	VM_BUG_ON(!c->page->frozen);
 	c->freelist = get_freepointer(s, freelist);
 	c->tid = next_tid(c->tid);
+
+out:
+	f = this_cpu_ptr(&slub_free_list);
+	raw_spin_lock(&f->lock);
+	list_splice_init(&f->list, to_free);
+	raw_spin_unlock(&f->lock);
+
 	return freelist;
 
 new_slab:
@@ -2587,7 +2647,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 	if (unlikely(!freelist)) {
 		slab_out_of_memory(s, gfpflags, node);
-		return NULL;
+		goto out;
 	}
 
 	page = c->page;
@@ -2600,7 +2660,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto new_slab;	/* Slab failed checks. Next slab needed */
 
 	deactivate_slab(s, page, get_freepointer(s, freelist), c);
-	return freelist;
+	goto out;
 }
 
 /*
@@ -2612,6 +2672,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void *p;
 	unsigned long flags;
+	LIST_HEAD(tofree);
 
 	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2623,8 +2684,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	c = this_cpu_ptr(s->cpu_slab);
 #endif
 
-	p = ___slab_alloc(s, gfpflags, node, addr, c);
+	p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree);
 	local_irq_restore(flags);
+	free_delayed(&tofree);
 	return p;
 }
 
@@ -2810,7 +2872,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 
 	do {
 		if (unlikely(n)) {
-			spin_unlock_irqrestore(&n->list_lock, flags);
+			raw_spin_unlock_irqrestore(&n->list_lock, flags);
 			n = NULL;
 		}
 		prior = page->freelist;
@@ -2842,7 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 				 * Otherwise the list_lock will synchronize with
 				 * other processors updating the list of slabs.
 				 */
-				spin_lock_irqsave(&n->list_lock, flags);
+				raw_spin_lock_irqsave(&n->list_lock, flags);
 
 			}
 		}
@@ -2884,7 +2946,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		add_partial(n, page, DEACTIVATE_TO_TAIL);
 		stat(s, FREE_ADD_PARTIAL);
 	}
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	return;
 
 slab_empty:
@@ -2899,7 +2961,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		remove_full(s, n, page);
 	}
 
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	stat(s, FREE_SLAB);
 	discard_slab(s, page);
 }
@@ -3104,6 +3166,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			  void **p)
 {
 	struct kmem_cache_cpu *c;
+	LIST_HEAD(to_free);
 	int i;
 
 	/* memcg and kmem_cache debug support */
@@ -3127,7 +3190,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 * of re-populating per CPU c->freelist
 			 */
 			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
-					    _RET_IP_, c);
+					    _RET_IP_, c, &to_free);
 			if (unlikely(!p[i]))
 				goto error;
 
@@ -3139,6 +3202,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	}
 	c->tid = next_tid(c->tid);
 	local_irq_enable();
+	free_delayed(&to_free);
 
 	/* Clear memory outside IRQ disabled fastpath loop */
 	if (unlikely(flags & __GFP_ZERO)) {
@@ -3153,6 +3217,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	return i;
 error:
 	local_irq_enable();
+	free_delayed(&to_free);
 	slab_post_alloc_hook(s, flags, i, p);
 	__kmem_cache_free_bulk(s, i, p);
 	return 0;
@@ -3286,7 +3351,7 @@ static void
 init_kmem_cache_node(struct kmem_cache_node *n)
 {
 	n->nr_partial = 0;
-	spin_lock_init(&n->list_lock);
+	raw_spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
#ifdef CONFIG_SLUB_DEBUG
 	atomic_long_set(&n->nr_slabs, 0);
@@ -3640,6 +3705,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 			      const char *text)
 {
 #ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_PREEMPT_RT_BASE
+	/* XXX move out of irq-off section */
+	slab_err(s, page, text, s->name);
+#else
 	void *addr = page_address(page);
 	void *p;
 	unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
@@ -3660,6 +3729,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 	slab_unlock(page);
 	kfree(map);
 #endif
+#endif
 }
 
 /*
@@ -3673,7 +3743,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 	struct page *page, *h;
 
 	BUG_ON(irqs_disabled());
-	spin_lock_irq(&n->list_lock);
+	raw_spin_lock_irq(&n->list_lock);
 	list_for_each_entry_safe(page, h, &n->partial, lru) {
 		if (!page->inuse) {
 			remove_partial(n, page);
@@ -3683,7 +3753,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 				"Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
-	spin_unlock_irq(&n->list_lock);
+	raw_spin_unlock_irq(&n->list_lock);
 
 	list_for_each_entry_safe(page, h, &discard, lru)
 		discard_slab(s, page);
@@ -3927,7 +3997,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 		for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
 			INIT_LIST_HEAD(promote + i);
 
-		spin_lock_irqsave(&n->list_lock, flags);
+		raw_spin_lock_irqsave(&n->list_lock, flags);
 
 		/*
 		 * Build lists of slabs to discard or promote.
@@ -3958,7 +4028,7 @@ int __kmem_cache_shrink(struct kmem_cache *s)
 		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
 			list_splice(promote + i, &n->partial);
 
-		spin_unlock_irqrestore(&n->list_lock, flags);
+		raw_spin_unlock_irqrestore(&n->list_lock, flags);
 
 		/* Release empty slabs */
 		list_for_each_entry_safe(page, t, &discard, lru)
@@ -4171,6 +4241,12 @@ void __init kmem_cache_init(void)
 {
 	static __initdata struct kmem_cache boot_kmem_cache,
 		boot_kmem_cache_node;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock);
+		INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list);
+	}
 
 	if (debug_guardpage_minorder())
 		slub_max_order = 0;
@@ -4379,7 +4455,7 @@ static int validate_slab_node(struct kmem_cache *s,
 	struct page *page;
 	unsigned long flags;
 
-	spin_lock_irqsave(&n->list_lock, flags);
+	raw_spin_lock_irqsave(&n->list_lock, flags);
 
 	list_for_each_entry(page, &n->partial, lru) {
 		validate_slab_slab(s, page, map);
@@ -4401,7 +4477,7 @@ static int validate_slab_node(struct kmem_cache *s,
 		       s->name, count, atomic_long_read(&n->nr_slabs));
 
 out:
-	spin_unlock_irqrestore(&n->list_lock, flags);
+	raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	return count;
 }
 
@@ -4589,12 +4665,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
 		if (!atomic_long_read(&n->nr_slabs))
 			continue;
 
-		spin_lock_irqsave(&n->list_lock, flags);
+		raw_spin_lock_irqsave(&n->list_lock, flags);
 		list_for_each_entry(page, &n->partial, lru)
 			process_slab(&t, s, page, alloc, map);
 		list_for_each_entry(page, &n->full, lru)
 			process_slab(&t, s, page, alloc, map);
-		spin_unlock_irqrestore(&n->list_lock, flags);
+		raw_spin_unlock_irqrestore(&n->list_lock, flags);
 	}
 
	for (i = 0; i < t.count; i++) {
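Note on the slub.c hunks above: besides converting n->list_lock to a raw spinlock (so it stays a true spinning lock on PREEMPT_RT), the patch arranges that __free_slab() is never called with interrupts disabled; such pages are parked on the per-CPU slub_free_list and reaped by free_delayed() once interrupts are enabled again. Below is a minimal user-space sketch of that deferral idiom; the names mirror the patch, but the code is illustrative, not kernel API:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct deferred {                    /* stand-in for struct slub_free_list */
		struct deferred *next;
		void *obj;
	};

	static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct deferred *free_list;   /* would be per-CPU in the kernel */

	/* Called where freeing directly is forbidden ("interrupts disabled"). */
	static void free_later(void *obj)
	{
		struct deferred *d = malloc(sizeof(*d));

		d->obj = obj;
		pthread_mutex_lock(&free_lock);
		d->next = free_list;
		free_list = d;
		pthread_mutex_unlock(&free_lock);
	}

	/* Called once it is safe again ("interrupts enabled"): splice and reap. */
	static void free_delayed(void)
	{
		pthread_mutex_lock(&free_lock);
		struct deferred *d = free_list;
		free_list = NULL;
		pthread_mutex_unlock(&free_lock);

		while (d) {
			struct deferred *next = d->next;

			free(d->obj);        /* the real __free_slab() step */
			free(d);
			d = next;
		}
	}

	int main(void)
	{
		free_later(malloc(32));
		free_later(malloc(64));
		free_delayed();
		puts("drained");
		return 0;
	}

The same splice-then-drain-after-unlock shape appears in __slab_alloc(), kmem_cache_alloc_bulk() and flush_all() in the hunks above.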
diff --git a/mm/swap.c b/mm/swap.c
index a77d68f2c1b6..30d62efe001b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
 #include <linux/uio.h>
+#include <linux/locallock.h>
 #include <linux/hugetlb.h>
 #include <linux/page_idle.h>
 
@@ -50,6 +51,8 @@ static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
 #endif
+static DEFINE_LOCAL_IRQ_LOCK(rotate_lock);
+DEFINE_LOCAL_IRQ_LOCK(swapvec_lock);
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -252,11 +255,11 @@ void rotate_reclaimable_page(struct page *page)
 		unsigned long flags;
 
 		get_page(page);
-		local_irq_save(flags);
+		local_lock_irqsave(rotate_lock, flags);
 		pvec = this_cpu_ptr(&lru_rotate_pvecs);
 		if (!pagevec_add(pvec, page) || PageCompound(page))
 			pagevec_move_tail(pvec);
-		local_irq_restore(flags);
+		local_unlock_irqrestore(rotate_lock, flags);
 	}
 }
 
@@ -306,12 +309,13 @@ void activate_page(struct page *page)
 {
 	page = compound_head(page);
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
-		struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+		struct pagevec *pvec = &get_locked_var(swapvec_lock,
+						       activate_page_pvecs);
 
 		get_page(page);
 		if (!pagevec_add(pvec, page) || PageCompound(page))
 			pagevec_lru_move_fn(pvec, __activate_page, NULL);
-		put_cpu_var(activate_page_pvecs);
+		put_locked_var(swapvec_lock, activate_page_pvecs);
 	}
 }
 
@@ -338,7 +342,7 @@ void activate_page(struct page *page)
 
 static void __lru_cache_activate_page(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+	struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
 	int i;
 
 	/*
@@ -360,7 +364,7 @@ static void __lru_cache_activate_page(struct page *page)
 		}
 	}
 
-	put_cpu_var(lru_add_pvec);
+	put_locked_var(swapvec_lock, lru_add_pvec);
 }
 
 /*
@@ -402,12 +406,12 @@ EXPORT_SYMBOL(mark_page_accessed);
 
 static void __lru_cache_add(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+	struct pagevec *pvec = &get_locked_var(swapvec_lock, lru_add_pvec);
 
 	get_page(page);
 	if (!pagevec_add(pvec, page) || PageCompound(page))
 		__pagevec_lru_add(pvec);
-	put_cpu_var(lru_add_pvec);
+	put_locked_var(swapvec_lock, lru_add_pvec);
 }
 
 /**
@@ -613,9 +617,15 @@ void lru_add_drain_cpu(int cpu)
 		unsigned long flags;
 
 		/* No harm done if a racing interrupt already did this */
-		local_irq_save(flags);
+#ifdef CONFIG_PREEMPT_RT_BASE
+		local_lock_irqsave_on(rotate_lock, flags, cpu);
 		pagevec_move_tail(pvec);
-		local_irq_restore(flags);
+		local_unlock_irqrestore_on(rotate_lock, flags, cpu);
+#else
+		local_lock_irqsave(rotate_lock, flags);
+		pagevec_move_tail(pvec);
+		local_unlock_irqrestore(rotate_lock, flags);
+#endif
 	}
 
 	pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
@@ -647,11 +657,12 @@ void deactivate_file_page(struct page *page)
 		return;
 
 	if (likely(get_page_unless_zero(page))) {
-		struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+		struct pagevec *pvec = &get_locked_var(swapvec_lock,
+						       lru_deactivate_file_pvecs);
 
 		if (!pagevec_add(pvec, page) || PageCompound(page))
 			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
-		put_cpu_var(lru_deactivate_file_pvecs);
+		put_locked_var(swapvec_lock, lru_deactivate_file_pvecs);
 	}
 }
 
@@ -666,21 +677,32 @@ void mark_page_lazyfree(struct page *page)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
 	    !PageSwapCache(page) && !PageUnevictable(page)) {
-		struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+		struct pagevec *pvec = &get_locked_var(swapvec_lock,
+						       lru_lazyfree_pvecs);
 
 		get_page(page);
 		if (!pagevec_add(pvec, page) || PageCompound(page))
 			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
-		put_cpu_var(lru_lazyfree_pvecs);
+		put_locked_var(swapvec_lock, lru_lazyfree_pvecs);
 	}
 }
 
 void lru_add_drain(void)
 {
-	lru_add_drain_cpu(get_cpu());
-	put_cpu();
+	lru_add_drain_cpu(local_lock_cpu(swapvec_lock));
+	local_unlock_cpu(swapvec_lock);
+}
+
+#ifdef CONFIG_PREEMPT_RT_BASE
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+	local_lock_on(swapvec_lock, cpu);
+	lru_add_drain_cpu(cpu);
+	local_unlock_on(swapvec_lock, cpu);
 }
 
+#else
+
 static void lru_add_drain_per_cpu(struct work_struct *dummy)
 {
 	lru_add_drain();
@@ -688,6 +710,16 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
 
 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 
+static inline void remote_lru_add_drain(int cpu, struct cpumask *has_work)
+{
+	struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+
+	INIT_WORK(work, lru_add_drain_per_cpu);
+	queue_work_on(cpu, mm_percpu_wq, work);
+	cpumask_set_cpu(cpu, has_work);
+}
+#endif
+
 void lru_add_drain_all_cpuslocked(void)
 {
 	static DEFINE_MUTEX(lock);
@@ -705,21 +737,19 @@ void lru_add_drain_all_cpuslocked(void)
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
-		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 
 		if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 		    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
 		    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
-		    need_activate_page_drain(cpu)) {
-			INIT_WORK(work, lru_add_drain_per_cpu);
-			queue_work_on(cpu, mm_percpu_wq, work);
-			cpumask_set_cpu(cpu, &has_work);
-		}
+		    need_activate_page_drain(cpu))
+			remote_lru_add_drain(cpu, &has_work);
 	}
 
+#ifndef CONFIG_PREEMPT_RT_BASE
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
+#endif
 
 	mutex_unlock(&lock);
 }
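The swap.c conversion replaces get_cpu_var()/put_cpu_var() pairs, which pin the task by disabling preemption, with get_locked_var()/put_locked_var() on the named local lock swapvec_lock, so the pagevec sections stay preemptible on RT while remaining serialized per CPU. Roughly, the pattern looks like this in user space; a sketch with made-up names, not the kernel's locallock API:

	#include <pthread.h>
	#include <stdio.h>

	#define NR_CPUS 4

	struct pagevec { int nr; };           /* trimmed stand-in */

	/* One lock and one pagevec per CPU; on RT the lock provides the
	 * exclusion that preempt-disable used to give. */
	static pthread_mutex_t swapvec_lock[NR_CPUS] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	};
	static struct pagevec lru_add_pvec[NR_CPUS];

	static struct pagevec *get_locked_var(int cpu)
	{
		pthread_mutex_lock(&swapvec_lock[cpu]);  /* may sleep on RT: fine */
		return &lru_add_pvec[cpu];
	}

	static void put_locked_var(int cpu)
	{
		pthread_mutex_unlock(&swapvec_lock[cpu]);
	}

	int main(void)
	{
		int cpu = 0;                  /* the kernel derives this itself */
		struct pagevec *pvec = get_locked_var(cpu);

		pvec->nr++;                   /* the __lru_cache_add() work */
		put_locked_var(cpu);
		printf("pvec->nr = %d\n", lru_add_pvec[cpu].nr);
		return 0;
	}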
diff --git a/mm/truncate.c b/mm/truncate.c
index 2330223841fb..d0c8e6c8fef5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -41,8 +41,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
 		goto unlock;
 	if (*slot != entry)
 		goto unlock;
+	local_lock(shadow_nodes_lock);
 	__radix_tree_replace(&mapping->page_tree, node, slot, NULL,
-			     workingset_update_node, mapping);
+			     __workingset_update_node, mapping);
+	local_unlock(shadow_nodes_lock);
 	mapping->nrexceptional--;
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9ff21a12ea00..95c83b291548 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -865,7 +865,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	struct vmap_block *vb;
 	struct vmap_area *va;
 	unsigned long vb_idx;
-	int node, err;
+	int node, err, cpu;
 	void *vaddr;
 
 	node = numa_node_id();
@@ -908,11 +908,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 	BUG_ON(err);
 	radix_tree_preload_end();
 
-	vbq = &get_cpu_var(vmap_block_queue);
+	cpu = get_cpu_light();
+	vbq = this_cpu_ptr(&vmap_block_queue);
 	spin_lock(&vbq->lock);
 	list_add_tail_rcu(&vb->free_list, &vbq->free);
 	spin_unlock(&vbq->lock);
-	put_cpu_var(vmap_block_queue);
+	put_cpu_light();
 
 	return vaddr;
 }
@@ -981,6 +982,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 	struct vmap_block *vb;
 	void *vaddr = NULL;
 	unsigned int order;
+	int cpu;
 
 	BUG_ON(offset_in_page(size));
 	BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -995,7 +997,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 	order = get_order(size);
 
 	rcu_read_lock();
-	vbq = &get_cpu_var(vmap_block_queue);
+	cpu = get_cpu_light();
+	vbq = this_cpu_ptr(&vmap_block_queue);
 	list_for_each_entry_rcu(vb, &vbq->free, free_list) {
 		unsigned long pages_off;
 
@@ -1018,7 +1021,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 		break;
 	}
 
-	put_cpu_var(vmap_block_queue);
+	put_cpu_light();
 	rcu_read_unlock();
 
 	/* Allocate new block if nothing was found */
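In vmalloc.c, get_cpu_var(vmap_block_queue) becomes get_cpu_light() plus this_cpu_ptr(): the CPU id only selects a queue for locality, while vbq->lock provides the actual exclusion, so on RT the section merely disables migration instead of preemption. A rough user-space sketch of "CPU id as a locality hint, lock as the guarantee" (assumed helper names, GCC-style range initializer):

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>
	#include <stdio.h>

	#define NR_QUEUES 4

	struct queue {
		pthread_mutex_t lock;
		int len;
	};

	/* One queue per CPU, like vmap_block_queue. */
	static struct queue vmap_block_queue[NR_QUEUES] = {
		[0 ... NR_QUEUES - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 },
	};

	int main(void)
	{
		/* Like get_cpu_light(): the CPU id merely picks a queue for
		 * locality; the lock, not preempt-disable, makes the update
		 * safe, so being migrated mid-section would be harmless. */
		int cpu = sched_getcpu();
		struct queue *q = &vmap_block_queue[cpu < 0 ? 0 : cpu % NR_QUEUES];

		pthread_mutex_lock(&q->lock);
		q->len++;
		pthread_mutex_unlock(&q->lock);
		printf("len=%d\n", q->len);
		return 0;
	}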
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 527ae727d547..ae6446b054d3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -249,6 +249,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 	long x;
 	long t;
 
+	preempt_disable_rt();
 	x = delta + __this_cpu_read(*p);
 
 	t = __this_cpu_read(pcp->stat_threshold);
@@ -258,6 +259,7 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
 		x = 0;
 	}
 	__this_cpu_write(*p, x);
+	preempt_enable_rt();
 }
 EXPORT_SYMBOL(__mod_zone_page_state);
 
@@ -269,6 +271,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 	long x;
 	long t;
 
+	preempt_disable_rt();
 	x = delta + __this_cpu_read(*p);
 
 	t = __this_cpu_read(pcp->stat_threshold);
@@ -278,6 +281,7 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
 		x = 0;
 	}
 	__this_cpu_write(*p, x);
+	preempt_enable_rt();
 }
 EXPORT_SYMBOL(__mod_node_page_state);
 
@@ -310,6 +314,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	s8 v, t;
 
+	preempt_disable_rt();
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v > t)) {
@@ -318,6 +323,7 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 		zone_page_state_add(v + overstep, zone, item);
 		__this_cpu_write(*p, -overstep);
 	}
+	preempt_enable_rt();
 }
 
 void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -326,6 +332,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
 	s8 v, t;
 
+	preempt_disable_rt();
 	v = __this_cpu_inc_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v > t)) {
@@ -334,6 +341,7 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 		node_page_state_add(v + overstep, pgdat, item);
 		__this_cpu_write(*p, -overstep);
 	}
+	preempt_enable_rt();
 }
 
 void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
@@ -354,6 +362,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 	s8 __percpu *p = pcp->vm_stat_diff + item;
 	s8 v, t;
 
+	preempt_disable_rt();
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v < - t)) {
@@ -362,6 +371,7 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
 		zone_page_state_add(v - overstep, zone, item);
 		__this_cpu_write(*p, overstep);
 	}
+	preempt_enable_rt();
 }
 
 void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
@@ -370,6 +380,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 	s8 __percpu *p = pcp->vm_node_stat_diff + item;
 	s8 v, t;
 
+	preempt_disable_rt();
 	v = __this_cpu_dec_return(*p);
 	t = __this_cpu_read(pcp->stat_threshold);
 	if (unlikely(v < - t)) {
@@ -378,6 +389,7 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
 		node_page_state_add(v - overstep, pgdat, item);
 		__this_cpu_write(*p, overstep);
 	}
+	preempt_enable_rt();
 }
 
 void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
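The vmstat.c counters are per-CPU deltas folded into a global counter once they cross a threshold; the read-modify-write of the delta must not be preempted midway, which mainline guarantees by running with IRQs or preemption off and which RT restores with the preempt_disable_rt()/preempt_enable_rt() pair. The folding logic itself, sketched in plain C (single CPU shown, a mutex standing in for the RT-only preemption guard):

	#include <pthread.h>
	#include <stdio.h>

	static long zone_page_state;      /* global counter */
	static long vm_stat_diff;         /* per-CPU delta (one CPU shown) */
	static const long stat_threshold = 32;
	static pthread_mutex_t rt_lock = PTHREAD_MUTEX_INITIALIZER;

	/* Mirrors the shape of __mod_zone_page_state(): accumulate into
	 * the per-CPU delta and fold into the global counter once the
	 * threshold is exceeded.  The lock models preempt_disable_rt(),
	 * keeping the read-modify-write of the delta atomic. */
	static void mod_zone_page_state(long delta)
	{
		pthread_mutex_lock(&rt_lock);       /* preempt_disable_rt() */
		long x = delta + vm_stat_diff;

		if (x > stat_threshold || x < -stat_threshold) {
			zone_page_state += x;       /* zone_page_state_add() */
			x = 0;
		}
		vm_stat_diff = x;
		pthread_mutex_unlock(&rt_lock);     /* preempt_enable_rt() */
	}

	int main(void)
	{
		for (int i = 0; i < 100; i++)
			mod_zone_page_state(1);
		printf("global=%ld per-cpu=%ld\n", zone_page_state, vm_stat_diff);
		return 0;
	}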
diff --git a/mm/workingset.c b/mm/workingset.c
index b997c9de28f6..e252cc69a3d4 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -338,9 +338,10 @@ void workingset_activation(struct page *page)
  * point where they would still be useful.
  */
 
-static struct list_lru shadow_nodes;
+static struct list_lru __shadow_nodes;
+DEFINE_LOCAL_IRQ_LOCK(shadow_nodes_lock);
 
-void workingset_update_node(struct radix_tree_node *node, void *private)
+void __workingset_update_node(struct radix_tree_node *node, void *private)
 {
 	struct address_space *mapping = private;
 
@@ -358,10 +359,10 @@ void workingset_update_node(struct radix_tree_node *node, void *private)
 	 */
 	if (node->count && node->count == node->exceptional) {
 		if (list_empty(&node->private_list))
-			list_lru_add(&shadow_nodes, &node->private_list);
+			list_lru_add(&__shadow_nodes, &node->private_list);
 	} else {
 		if (!list_empty(&node->private_list))
-			list_lru_del(&shadow_nodes, &node->private_list);
+			list_lru_del(&__shadow_nodes, &node->private_list);
 	}
 }
 
@@ -373,9 +374,9 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 	unsigned long cache;
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
-	local_irq_disable();
-	nodes = list_lru_shrink_count(&shadow_nodes, sc);
-	local_irq_enable();
+	local_lock_irq(shadow_nodes_lock);
+	nodes = list_lru_shrink_count(&__shadow_nodes, sc);
+	local_unlock_irq(shadow_nodes_lock);
 
 	/*
	 * Approximate a reasonable limit for the radix tree nodes
@@ -475,15 +476,15 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
 		goto out_invalid;
 	inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
 	__radix_tree_delete_node(&mapping->page_tree, node,
-				 workingset_update_node, mapping);
+				 __workingset_update_node, mapping);
 
 out_invalid:
 	spin_unlock(&mapping->tree_lock);
 	ret = LRU_REMOVED_RETRY;
 out:
-	local_irq_enable();
+	local_unlock_irq(shadow_nodes_lock);
 	cond_resched();
-	local_irq_disable();
+	local_lock_irq(shadow_nodes_lock);
 	spin_lock(lru_lock);
 	return ret;
 }
@@ -494,9 +495,9 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 	unsigned long ret;
 
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
-	local_irq_disable();
-	ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL);
-	local_irq_enable();
+	local_lock_irq(shadow_nodes_lock);
+	ret = list_lru_shrink_walk(&__shadow_nodes, sc, shadow_lru_isolate, NULL);
+	local_unlock_irq(shadow_nodes_lock);
 	return ret;
 }
 
@@ -534,7 +535,7 @@ static int __init workingset_init(void)
 	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
 		timestamp_bits, max_order, bucket_order);
 
-	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key);
+	ret = __list_lru_init(&__shadow_nodes, true, &shadow_nodes_key);
 	if (ret)
 		goto err;
 	ret = register_shrinker(&workingset_shadow_shrinker);
@@ -542,7 +543,7 @@ static int __init workingset_init(void)
 		goto err_list_lru;
 	return 0;
 err_list_lru:
-	list_lru_destroy(&shadow_nodes);
+	list_lru_destroy(&__shadow_nodes);
 err:
 	return ret;
 }
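Taken together, the truncate.c and workingset.c hunks give the shadow-node LRU a named lock: shadow_nodes is renamed __shadow_nodes, the update callback gets a __ prefix, and every path that previously relied on bare local_irq_disable() now takes shadow_nodes_lock, which may sleep on RT. The underlying idiom, as a small user-space sketch (illustrative names):

	#include <pthread.h>
	#include <stdio.h>

	/* The RT idiom: give the critical section a *named* lock object
	 * instead of relying on local_irq_disable().  On non-RT the lock
	 * can compile down to irq-disable; on RT it is a sleeping lock. */
	static pthread_mutex_t shadow_nodes_lock = PTHREAD_MUTEX_INITIALIZER;
	static long __shadow_nodes;   /* '__' prefix: touch only under the lock */

	static void add_shadow_node(void)
	{
		pthread_mutex_lock(&shadow_nodes_lock);   /* was: local_irq_disable() */
		__shadow_nodes++;
		pthread_mutex_unlock(&shadow_nodes_lock); /* was: local_irq_enable()  */
	}

	static long count_shadow_nodes(void)
	{
		pthread_mutex_lock(&shadow_nodes_lock);
		long nodes = __shadow_nodes;
		pthread_mutex_unlock(&shadow_nodes_lock);
		return nodes;
	}

	int main(void)
	{
		add_shadow_node();
		printf("%ld\n", count_shadow_nodes());
		return 0;
	}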
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 685049a9048d..8d1489fd1dbc 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,7 @@
 #include <linux/mount.h>
 #include <linux/migrate.h>
 #include <linux/pagemap.h>
+#include <linux/locallock.h>
 
 #define ZSPAGE_MAGIC	0x58
 
@@ -70,9 +71,22 @@
  */
 #define ZS_MAX_ZSPAGE_ORDER 2
 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
-
 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+
+struct zsmalloc_handle {
+	unsigned long addr;
+	struct mutex lock;
+};
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
+
+#else
+
+#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long))
+#endif
+
 /*
  * Object location (<PFN>, <obj_idx>) is encoded as
  * as single (unsigned long) handle value.
@@ -320,7 +334,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
 
 static int create_cache(struct zs_pool *pool)
 {
-	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
+	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE,
					0, 0, NULL);
 	if (!pool->handle_cachep)
 		return 1;
@@ -344,10 +358,27 @@ static void destroy_cache(struct zs_pool *pool)
 
 static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
-	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-			gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+	void *p;
+
+	p = kmem_cache_alloc(pool->handle_cachep,
+			     gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
+#ifdef CONFIG_PREEMPT_RT_FULL
+	if (p) {
+		struct zsmalloc_handle *zh = p;
+
+		mutex_init(&zh->lock);
+	}
+#endif
+	return (unsigned long)p;
 }
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
+{
+	return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
+}
+#endif
+
 static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
 {
 	kmem_cache_free(pool->handle_cachep, (void *)handle);
@@ -366,12 +397,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
 
 static void record_obj(unsigned long handle, unsigned long obj)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	WRITE_ONCE(zh->addr, obj);
+#else
 	/*
	 * lsb of @obj represents handle lock while other bits
	 * represent object value the handle is pointing so
	 * updating shouldn't do store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
+#endif
 }
 
 /* zpool driver */
@@ -460,6 +497,7 @@ MODULE_ALIAS("zpool-zsmalloc");
 
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
 static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
+static DEFINE_LOCAL_IRQ_LOCK(zs_map_area_lock);
 
 static bool is_zspage_isolated(struct zspage *zspage)
 {
@@ -898,7 +936,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
 
 static unsigned long handle_to_obj(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return zh->addr;
+#else
 	return *(unsigned long *)handle;
+#endif
 }
 
 static unsigned long obj_to_head(struct page *page, void *obj)
@@ -912,22 +956,46 @@ static unsigned long obj_to_head(struct page *page, void *obj)
 
 static inline int testpin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return mutex_is_locked(&zh->lock);
+#else
 	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static inline int trypin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return mutex_trylock(&zh->lock);
+#else
 	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static void pin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return mutex_lock(&zh->lock);
+#else
	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static void unpin_tag(unsigned long handle)
 {
+#ifdef CONFIG_PREEMPT_RT_FULL
+	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
+
+	return mutex_unlock(&zh->lock);
+#else
	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
+#endif
 }
 
 static void reset_page(struct page *page)
@@ -1365,7 +1433,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	class = pool->size_class[class_idx];
 	off = (class->size * obj_idx) & ~PAGE_MASK;
 
-	area = &get_cpu_var(zs_map_area);
+	area = &get_locked_var(zs_map_area_lock, zs_map_area);
 	area->vm_mm = mm;
 	if (off + class->size <= PAGE_SIZE) {
 		/* this object is contained entirely within a page */
@@ -1419,7 +1487,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 
 		__zs_unmap_object(area, pages, off, class->size);
 	}
-	put_cpu_var(zs_map_area);
+	put_locked_var(zs_map_area_lock, zs_map_area);
 
 	migrate_read_unlock(zspage);
 	unpin_tag(handle);
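On PREEMPT_RT_FULL a zsmalloc handle grows from a bare unsigned long into a struct zsmalloc_handle carrying the object word plus a mutex, so pin_tag() and friends take a sleeping mutex instead of spinning on HANDLE_PIN_BIT. A compilable user-space sketch of the RT variant (a pthread mutex in place of the kernel mutex; names follow the patch, but the code is illustrative):

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* RT variant: the handle is a real struct, not a tagged word. */
	struct zsmalloc_handle {
		unsigned long addr;     /* the encoded object location */
		pthread_mutex_t lock;   /* replaces the HANDLE_PIN_BIT bit spinlock */
	};

	static struct zsmalloc_handle *cache_alloc_handle(void)
	{
		struct zsmalloc_handle *zh = malloc(sizeof(*zh));

		pthread_mutex_init(&zh->lock, NULL);  /* mutex_init() in the patch */
		return zh;
	}

	static void record_obj(struct zsmalloc_handle *zh, unsigned long obj)
	{
		zh->addr = obj;         /* WRITE_ONCE(zh->addr, obj) in the patch */
	}

	static void pin_tag(struct zsmalloc_handle *zh)
	{
		pthread_mutex_lock(&zh->lock);        /* may sleep: fine on RT */
	}

	static void unpin_tag(struct zsmalloc_handle *zh)
	{
		pthread_mutex_unlock(&zh->lock);
	}

	int main(void)
	{
		struct zsmalloc_handle *zh = cache_alloc_handle();

		record_obj(zh, 0x1234);
		pin_tag(zh);
		printf("pinned obj at %#lx\n", zh->addr);
		unpin_tag(zh);
		free(zh);
		return 0;
	}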
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index c10bdf63eae7..84a49f2bcfbc 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -38,7 +38,6 @@
 
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/rwlock.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
 #include <net/9p/transport.h>
diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..9c7b38379c09 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -272,7 +272,7 @@ config CGROUP_NET_CLASSID
 
 config NET_RX_BUSY_POLL
 	bool
-	default y
+	default y if !PREEMPT_RT_FULL
 
 config BQL
 	bool
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 65d734c165bd..923e9a271872 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -251,15 +251,13 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
 }
 
 /* Send frame to sockets with specific channel */
-void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
-			 int flag, struct sock *skip_sk)
+static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
+				  int flag, struct sock *skip_sk)
 {
 	struct sock *sk;
 
 	BT_DBG("channel %u len %d", channel, skb->len);
 
-	read_lock(&hci_sk_list.lock);
-
 	sk_for_each(sk, &hci_sk_list.head) {
 		struct sk_buff *nskb;
 
@@ -285,6 +283,13 @@ void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
 		kfree_skb(nskb);
 	}
 
+}
+
+void hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
+			 int flag, struct sock *skip_sk)
+{
+	read_lock(&hci_sk_list.lock);
+	__hci_send_to_channel(channel, skb, flag, skip_sk);
 	read_unlock(&hci_sk_list.lock);
 }
 
@@ -388,8 +393,8 @@ void hci_send_monitor_ctrl_event(struct hci_dev *hdev, u16 event,
 	hdr->index = index;
 	hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
 
-	hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
-			    HCI_SOCK_TRUSTED, NULL);
+	__hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+			      HCI_SOCK_TRUSTED, NULL);
 	kfree_skb(skb);
 }
 
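The hci_sock.c change is the classic "double-underscore" split: the list walk moves into __hci_send_to_channel(), which expects hci_sk_list.lock to be held, while the public wrapper takes the lock itself; the monitor path, which is invoked with the lock already held for reading, then delivers without acquiring it a second time — a recursive read_lock would deadlock once rwlocks become sleeping locks on RT. The shape of the refactor, sketched in user space (hypothetical names):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t hci_sk_list_lock = PTHREAD_RWLOCK_INITIALIZER;

	/* Double-underscore variant: caller must already hold the lock. */
	static void __send_to_channel(const char *msg)
	{
		printf("deliver: %s\n", msg);  /* walk hci_sk_list.head here */
	}

	/* Public variant: takes the lock itself. */
	static void send_to_channel(const char *msg)
	{
		pthread_rwlock_rdlock(&hci_sk_list_lock);
		__send_to_channel(msg);
		pthread_rwlock_unlock(&hci_sk_list_lock);
	}

	/* A caller that already holds the lock for its own work now uses
	 * the __ variant instead of recursively taking the lock. */
	static void send_monitor_event(const char *msg)
	{
		pthread_rwlock_rdlock(&hci_sk_list_lock);
		__send_to_channel(msg);  /* was: send_to_channel() -> recursion */
		pthread_rwlock_unlock(&hci_sk_list_lock);
	}

	int main(void)
	{
		send_to_channel("event A");
		send_monitor_event("event B");
		return 0;
	}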
37655 | diff --git a/net/can/bcm.c b/net/can/bcm.c |
37656 | index 13690334efa3..9cc67ac257f1 100644 | |
37657 | --- a/net/can/bcm.c | |
37658 | +++ b/net/can/bcm.c | |
37659 | @@ -102,7 +102,6 @@ struct bcm_op { | |
e4b2b4a8 JK |
37660 | unsigned long frames_abs, frames_filtered; |
37661 | struct bcm_timeval ival1, ival2; | |
37662 | struct hrtimer timer, thrtimer; | |
37663 | - struct tasklet_struct tsklet, thrtsklet; | |
37664 | ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; | |
37665 | int rx_ifindex; | |
37666 | int cfsiz; | |
b3bbd485 | 37667 | @@ -364,25 +363,34 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, |
e4b2b4a8 JK |
37668 | } |
37669 | } | |
37670 | ||
37671 | -static void bcm_tx_start_timer(struct bcm_op *op) | |
37672 | +static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt) | |
37673 | { | |
37674 | + ktime_t ival; | |
37675 | + | |
37676 | if (op->kt_ival1 && op->count) | |
37677 | - hrtimer_start(&op->timer, | |
37678 | - ktime_add(ktime_get(), op->kt_ival1), | |
37679 | - HRTIMER_MODE_ABS); | |
37680 | + ival = op->kt_ival1; | |
37681 | else if (op->kt_ival2) | |
37682 | - hrtimer_start(&op->timer, | |
37683 | - ktime_add(ktime_get(), op->kt_ival2), | |
37684 | - HRTIMER_MODE_ABS); | |
37685 | + ival = op->kt_ival2; | |
37686 | + else | |
37687 | + return false; | |
37688 | + | |
37689 | + hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival)); | |
37690 | + return true; | |
37691 | } | |
37692 | ||
37693 | -static void bcm_tx_timeout_tsklet(unsigned long data) | |
37694 | +static void bcm_tx_start_timer(struct bcm_op *op) | |
37695 | { | |
37696 | - struct bcm_op *op = (struct bcm_op *)data; | |
37697 | + if (bcm_tx_set_expiry(op, &op->timer)) | |
37698 | + hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT); | |
37699 | +} | |
37700 | + | |
37701 | +/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ | |
37702 | +static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) | |
37703 | +{ | |
37704 | + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); | |
37705 | struct bcm_msg_head msg_head; | |
37706 | ||
37707 | if (op->kt_ival1 && (op->count > 0)) { | |
37708 | - | |
37709 | op->count--; | |
37710 | if (!op->count && (op->flags & TX_COUNTEVT)) { | |
37711 | ||
b3bbd485 | 37712 | @@ -399,22 +407,12 @@ static void bcm_tx_timeout_tsklet(unsigned long data) |
e4b2b4a8 JK |
37713 | } |
37714 | bcm_can_tx(op); | |
37715 | ||
37716 | - } else if (op->kt_ival2) | |
37717 | + } else if (op->kt_ival2) { | |
37718 | bcm_can_tx(op); | |
37719 | + } | |
37720 | ||
37721 | - bcm_tx_start_timer(op); | |
37722 | -} | |
37723 | - | |
37724 | -/* | |
37725 | - * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions | |
37726 | - */ | |
37727 | -static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) | |
37728 | -{ | |
37729 | - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); | |
37730 | - | |
37731 | - tasklet_schedule(&op->tsklet); | |
37732 | - | |
37733 | - return HRTIMER_NORESTART; | |
37734 | + return bcm_tx_set_expiry(op, &op->timer) ? | |
37735 | + HRTIMER_RESTART : HRTIMER_NORESTART; | |
37736 | } | |
37737 | ||
37738 | /* | |
b3bbd485 | 37739 | @@ -480,7 +478,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op, |
e4b2b4a8 JK |
37740 | /* do not send the saved data - only start throttle timer */ |
37741 | hrtimer_start(&op->thrtimer, | |
37742 | ktime_add(op->kt_lastmsg, op->kt_ival2), | |
37743 | - HRTIMER_MODE_ABS); | |
37744 | + HRTIMER_MODE_ABS_SOFT); | |
37745 | return; | |
37746 | } | |
37747 | ||
b3bbd485 | 37748 | @@ -539,14 +537,21 @@ static void bcm_rx_starttimer(struct bcm_op *op) |
e4b2b4a8 JK |
37749 | return; |
37750 | ||
37751 | if (op->kt_ival1) | |
37752 | - hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); | |
37753 | + hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT); | |
37754 | } | |
37755 | ||
37756 | -static void bcm_rx_timeout_tsklet(unsigned long data) | |
37757 | +/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ | |
37758 | +static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) | |
37759 | { | |
37760 | - struct bcm_op *op = (struct bcm_op *)data; | |
37761 | + struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); | |
37762 | struct bcm_msg_head msg_head; | |
37763 | ||
37764 | + /* if user wants to be informed, when cyclic CAN-Messages come back */ | |
37765 | + if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { | |
37766 | + /* clear received CAN frames to indicate 'nothing received' */ | |
37767 | + memset(op->last_frames, 0, op->nframes * op->cfsiz); | |
37768 | + } | |
37769 | + | |
37770 | /* create notification to user */ | |
37771 | msg_head.opcode = RX_TIMEOUT; | |
37772 | msg_head.flags = op->flags; | |
b3bbd485 | 37773 | @@ -557,25 +562,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data) |
e4b2b4a8 JK |
37774 | msg_head.nframes = 0; |
37775 | ||
37776 | bcm_send_to_user(op, &msg_head, NULL, 0); | |
37777 | -} | |
37778 | - | |
37779 | -/* | |
37780 | - * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out | |
37781 | - */ | |
37782 | -static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) | |
37783 | -{ | |
37784 | - struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); | |
37785 | - | |
37786 | - /* schedule before NET_RX_SOFTIRQ */ | |
37787 | - tasklet_hi_schedule(&op->tsklet); | |
37788 | - | |
37789 | - /* no restart of the timer is done here! */ | |
37790 | - | |
37791 | - /* if user wants to be informed, when cyclic CAN-Messages come back */ | |
37792 | - if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { | |
37793 | - /* clear received CAN frames to indicate 'nothing received' */ | |
37794 | - memset(op->last_frames, 0, op->nframes * op->cfsiz); | |
37795 | - } | |
37796 | ||
37797 | return HRTIMER_NORESTART; | |
37798 | } | |
b3bbd485 | 37799 | @@ -583,14 +569,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) |
e4b2b4a8 JK |
37800 | /* |
37801 | * bcm_rx_do_flush - helper for bcm_rx_thr_flush | |
37802 | */ | |
37803 | -static inline int bcm_rx_do_flush(struct bcm_op *op, int update, | |
37804 | - unsigned int index) | |
37805 | +static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index) | |
37806 | { | |
37807 | struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; | |
37808 | ||
37809 | if ((op->last_frames) && (lcf->flags & RX_THR)) { | |
37810 | - if (update) | |
37811 | - bcm_rx_changed(op, lcf); | |
37812 | + bcm_rx_changed(op, lcf); | |
37813 | return 1; | |
37814 | } | |
37815 | return 0; | |
b3bbd485 | 37816 | @@ -598,11 +582,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update, |
e4b2b4a8 JK |
37817 | |
37818 | /* | |
37819 | * bcm_rx_thr_flush - Check for throttled data and send it to the userspace | |
37820 | - * | |
37821 | - * update == 0 : just check if throttled data is available (any irq context) | |
37822 | - * update == 1 : check and send throttled data to userspace (soft_irq context) | |
37823 | */ | |
37824 | -static int bcm_rx_thr_flush(struct bcm_op *op, int update) | |
37825 | +static int bcm_rx_thr_flush(struct bcm_op *op) | |
37826 | { | |
37827 | int updated = 0; | |
37828 | ||
b3bbd485 | 37829 | @@ -611,24 +592,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update) |
e4b2b4a8 JK |
37830 | |
37831 | /* for MUX filter we start at index 1 */ | |
37832 | for (i = 1; i < op->nframes; i++) | |
37833 | - updated += bcm_rx_do_flush(op, update, i); | |
37834 | + updated += bcm_rx_do_flush(op, i); | |
37835 | ||
37836 | } else { | |
37837 | /* for RX_FILTER_ID and simple filter */ | |
37838 | - updated += bcm_rx_do_flush(op, update, 0); | |
37839 | + updated += bcm_rx_do_flush(op, 0); | |
37840 | } | |
37841 | ||
37842 | return updated; | |
37843 | } | |
37844 | ||
37845 | -static void bcm_rx_thr_tsklet(unsigned long data) | |
37846 | -{ | |
37847 | - struct bcm_op *op = (struct bcm_op *)data; | |
37848 | - | |
37849 | - /* push the changed data to the userspace */ | |
37850 | - bcm_rx_thr_flush(op, 1); | |
37851 | -} | |
37852 | - | |
37853 | /* | |
37854 | * bcm_rx_thr_handler - the time for blocked content updates is over now: | |
37855 | * Check for throttled data and send it to the userspace | |
b3bbd485 | 37856 | @@ -637,9 +610,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) |
e4b2b4a8 JK |
37857 | { |
37858 | struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); | |
37859 | ||
37860 | - tasklet_schedule(&op->thrtsklet); | |
37861 | - | |
37862 | - if (bcm_rx_thr_flush(op, 0)) { | |
37863 | + if (bcm_rx_thr_flush(op)) { | |
37864 | hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); | |
37865 | return HRTIMER_RESTART; | |
37866 | } else { | |
b3bbd485 | 37867 | @@ -735,23 +706,8 @@ static struct bcm_op *bcm_find_op(struct list_head *ops, |
e4b2b4a8 JK |
37868 | |
37869 | static void bcm_remove_op(struct bcm_op *op) | |
37870 | { | |
37871 | - if (op->tsklet.func) { | |
37872 | - while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || | |
37873 | - test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || | |
37874 | - hrtimer_active(&op->timer)) { | |
37875 | - hrtimer_cancel(&op->timer); | |
37876 | - tasklet_kill(&op->tsklet); | |
37877 | - } | |
37878 | - } | |
37879 | - | |
37880 | - if (op->thrtsklet.func) { | |
37881 | - while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || | |
37882 | - test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || | |
37883 | - hrtimer_active(&op->thrtimer)) { | |
37884 | - hrtimer_cancel(&op->thrtimer); | |
37885 | - tasklet_kill(&op->thrtsklet); | |
37886 | - } | |
37887 | - } | |
37888 | + hrtimer_cancel(&op->timer); | |
37889 | + hrtimer_cancel(&op->thrtimer); | |
37890 | ||
37891 | if ((op->frames) && (op->frames != &op->sframe)) | |
37892 | kfree(op->frames); | |
b3bbd485 | 37893 | @@ -979,15 +935,13 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, |
e4b2b4a8 JK |
37894 | op->ifindex = ifindex; |
37895 | ||
37896 | /* initialize uninitialized (kzalloc) structure */ | |
37897 | - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
37898 | + hrtimer_init(&op->timer, CLOCK_MONOTONIC, | |
37899 | + HRTIMER_MODE_REL_SOFT); | |
37900 | op->timer.function = bcm_tx_timeout_handler; | |
37901 | ||
37902 | - /* initialize tasklet for tx countevent notification */ | |
37903 | - tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, | |
37904 | - (unsigned long) op); | |
37905 | - | |
37906 | /* currently unused in tx_ops */ | |
37907 | - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
37908 | + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, | |
37909 | + HRTIMER_MODE_REL_SOFT); | |
37910 | ||
37911 | /* add this bcm_op to the list of the tx_ops */ | |
37912 | list_add(&op->list, &bo->tx_ops); | |
b3bbd485 | 37913 | @@ -1150,20 +1104,14 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, |
e4b2b4a8 JK |
37914 | op->rx_ifindex = ifindex; |
37915 | ||
37916 | /* initialize uninitialized (kzalloc) structure */ | |
37917 | - hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
37918 | + hrtimer_init(&op->timer, CLOCK_MONOTONIC, | |
37919 | + HRTIMER_MODE_REL_SOFT); | |
37920 | op->timer.function = bcm_rx_timeout_handler; | |
37921 | ||
37922 | - /* initialize tasklet for rx timeout notification */ | |
37923 | - tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, | |
37924 | - (unsigned long) op); | |
37925 | - | |
37926 | - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
37927 | + hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, | |
37928 | + HRTIMER_MODE_REL_SOFT); | |
37929 | op->thrtimer.function = bcm_rx_thr_handler; | |
37930 | ||
37931 | - /* initialize tasklet for rx throttle handling */ | |
37932 | - tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, | |
37933 | - (unsigned long) op); | |
37934 | - | |
37935 | /* add this bcm_op to the list of the rx_ops */ | |
37936 | list_add(&op->list, &bo->rx_ops); | |
37937 | ||
b3bbd485 | 37938 | @@ -1209,12 +1157,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, |
e4b2b4a8 JK |
37939 | */ |
37940 | op->kt_lastmsg = 0; | |
37941 | hrtimer_cancel(&op->thrtimer); | |
37942 | - bcm_rx_thr_flush(op, 1); | |
37943 | + bcm_rx_thr_flush(op); | |
37944 | } | |
37945 | ||
37946 | if ((op->flags & STARTTIMER) && op->kt_ival1) | |
37947 | hrtimer_start(&op->timer, op->kt_ival1, | |
37948 | - HRTIMER_MODE_REL); | |
37949 | + HRTIMER_MODE_REL_SOFT); | |
37950 | } | |
37951 | ||
37952 | /* now we can register for can_ids, if we added a new bcm_op */ | |
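Taken together, the bcm.c hunks above retire the tx/rx tasklets entirely: both op timers are switched to HRTIMER_MODE_REL_SOFT, so their expiry callbacks already run in softirq context and the tasklet indirection (and the fragile cancel/poll loop in bcm_remove_op()) becomes unnecessary. A minimal sketch of the resulting pattern, assuming the soft-mode hrtimer support introduced earlier in this series; the names below are illustrative, not taken from bcm.c:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    struct my_op {
            struct hrtimer timer;
            ktime_t interval;
    };

    /* Expires in softirq context because of HRTIMER_MODE_REL_SOFT,
     * i.e. the same context the tasklet used to provide. */
    static enum hrtimer_restart my_timeout(struct hrtimer *t)
    {
            struct my_op *op = container_of(t, struct my_op, timer);

            /* ... do the work formerly done in the tasklet ... */

            hrtimer_forward_now(t, op->interval);
            return HRTIMER_RESTART;
    }

    static void my_op_init(struct my_op *op)
    {
            hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
            op->timer.function = my_timeout;
    }

    static void my_op_remove(struct my_op *op)
    {
            /* hrtimer_cancel() waits for a running callback to finish,
             * so no TASKLET_STATE_SCHED/RUN polling loop is needed. */
            hrtimer_cancel(&op->timer);
    }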
b3bbd485 | 37953 | diff --git a/net/core/dev.c b/net/core/dev.c |
5dd41b01 | 37954 | index e8a66ad6d07c..fa9642bb0482 100644 |
b3bbd485 JK |
37955 | --- a/net/core/dev.c |
37956 | +++ b/net/core/dev.c | |
37957 | @@ -195,6 +195,7 @@ static unsigned int napi_gen_id = NR_CPUS; | |
1a6e0f06 JK |
37958 | static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); |
37959 | ||
37960 | static seqcount_t devnet_rename_seq; | |
37961 | +static DEFINE_MUTEX(devnet_rename_mutex); | |
37962 | ||
37963 | static inline void dev_base_seq_inc(struct net *net) | |
37964 | { | |
b3bbd485 | 37965 | @@ -217,14 +218,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) |
1a6e0f06 JK |
37966 | static inline void rps_lock(struct softnet_data *sd) |
37967 | { | |
37968 | #ifdef CONFIG_RPS | |
37969 | - spin_lock(&sd->input_pkt_queue.lock); | |
37970 | + raw_spin_lock(&sd->input_pkt_queue.raw_lock); | |
37971 | #endif | |
37972 | } | |
37973 | ||
37974 | static inline void rps_unlock(struct softnet_data *sd) | |
37975 | { | |
37976 | #ifdef CONFIG_RPS | |
37977 | - spin_unlock(&sd->input_pkt_queue.lock); | |
37978 | + raw_spin_unlock(&sd->input_pkt_queue.raw_lock); | |
37979 | #endif | |
37980 | } | |
37981 | ||
b3bbd485 | 37982 | @@ -920,7 +921,8 @@ int netdev_get_name(struct net *net, char *name, int ifindex) |
1a6e0f06 JK |
37983 | strcpy(name, dev->name); |
37984 | rcu_read_unlock(); | |
37985 | if (read_seqcount_retry(&devnet_rename_seq, seq)) { | |
37986 | - cond_resched(); | |
37987 | + mutex_lock(&devnet_rename_mutex); | |
37988 | + mutex_unlock(&devnet_rename_mutex); | |
37989 | goto retry; | |
37990 | } | |
37991 | ||
b3bbd485 | 37992 | @@ -1189,20 +1191,17 @@ int dev_change_name(struct net_device *dev, const char *newname) |
1a6e0f06 JK |
37993 | if (dev->flags & IFF_UP) |
37994 | return -EBUSY; | |
37995 | ||
37996 | - write_seqcount_begin(&devnet_rename_seq); | |
37997 | + mutex_lock(&devnet_rename_mutex); | |
37998 | + __raw_write_seqcount_begin(&devnet_rename_seq); | |
37999 | ||
38000 | - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { | |
38001 | - write_seqcount_end(&devnet_rename_seq); | |
38002 | - return 0; | |
38003 | - } | |
38004 | + if (strncmp(newname, dev->name, IFNAMSIZ) == 0) | |
38005 | + goto outunlock; | |
38006 | ||
38007 | memcpy(oldname, dev->name, IFNAMSIZ); | |
38008 | ||
38009 | err = dev_get_valid_name(net, dev, newname); | |
38010 | - if (err < 0) { | |
38011 | - write_seqcount_end(&devnet_rename_seq); | |
38012 | - return err; | |
38013 | - } | |
38014 | + if (err < 0) | |
38015 | + goto outunlock; | |
38016 | ||
38017 | if (oldname[0] && !strchr(oldname, '%')) | |
38018 | netdev_info(dev, "renamed from %s\n", oldname); | |
b3bbd485 | 38019 | @@ -1215,11 +1214,12 @@ int dev_change_name(struct net_device *dev, const char *newname) |
1a6e0f06 JK |
38020 | if (ret) { |
38021 | memcpy(dev->name, oldname, IFNAMSIZ); | |
38022 | dev->name_assign_type = old_assign_type; | |
38023 | - write_seqcount_end(&devnet_rename_seq); | |
38024 | - return ret; | |
38025 | + err = ret; | |
38026 | + goto outunlock; | |
38027 | } | |
38028 | ||
38029 | - write_seqcount_end(&devnet_rename_seq); | |
38030 | + __raw_write_seqcount_end(&devnet_rename_seq); | |
38031 | + mutex_unlock(&devnet_rename_mutex); | |
38032 | ||
38033 | netdev_adjacent_rename_links(dev, oldname); | |
38034 | ||
b3bbd485 | 38035 | @@ -1240,7 +1240,8 @@ int dev_change_name(struct net_device *dev, const char *newname) |
1a6e0f06 JK |
38036 | /* err >= 0 after dev_alloc_name() or stores the first errno */ |
38037 | if (err >= 0) { | |
38038 | err = ret; | |
38039 | - write_seqcount_begin(&devnet_rename_seq); | |
38040 | + mutex_lock(&devnet_rename_mutex); | |
38041 | + __raw_write_seqcount_begin(&devnet_rename_seq); | |
38042 | memcpy(dev->name, oldname, IFNAMSIZ); | |
38043 | memcpy(oldname, newname, IFNAMSIZ); | |
38044 | dev->name_assign_type = old_assign_type; | |
b3bbd485 | 38045 | @@ -1253,6 +1254,11 @@ int dev_change_name(struct net_device *dev, const char *newname) |
1a6e0f06 JK |
38046 | } |
38047 | ||
38048 | return err; | |
38049 | + | |
38050 | +outunlock: | |
38051 | + __raw_write_seqcount_end(&devnet_rename_seq); | |
38052 | + mutex_unlock(&devnet_rename_mutex); | |
38053 | + return err; | |
38054 | } | |
38055 | ||
38056 | /** | |
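The devnet_rename hunks pair the existing seqcount with a mutex. The reason: a plain seqcount writer must not be preempted while readers spin on the sequence, but on PREEMPT_RT dev_change_name() can be preempted inside the write section, leaving netdev_get_name() spinning against a blocked writer. With the mutex, writers are serialized and may sleep, and a reader that hits a retry takes and immediately releases the mutex, i.e. blocks until the writer is done instead of burning CPU. A condensed sketch of the pattern; the data and function names are placeholders, not the dev.c symbols:

    #include <linux/mutex.h>
    #include <linux/seqlock.h>

    static seqcount_t name_seq = SEQCNT_ZERO(name_seq);
    static DEFINE_MUTEX(name_mutex);        /* serializes writers */

    static void name_update(void)
    {
            mutex_lock(&name_mutex);
            __raw_write_seqcount_begin(&name_seq);
            /* ... modify the protected data; may sleep on RT ... */
            __raw_write_seqcount_end(&name_seq);
            mutex_unlock(&name_mutex);
    }

    static void name_snapshot(void)
    {
            unsigned int seq;
    retry:
            seq = read_seqcount_begin(&name_seq);
            /* ... copy the protected data ... */
            if (read_seqcount_retry(&name_seq, seq)) {
                    /* wait for the (possibly preempted) writer */
                    mutex_lock(&name_mutex);
                    mutex_unlock(&name_mutex);
                    goto retry;
            }
    }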
5dd41b01 | 38057 | @@ -2460,6 +2466,7 @@ static void __netif_reschedule(struct Qdisc *q) |
1a6e0f06 JK |
38058 | sd->output_queue_tailp = &q->next_sched; |
38059 | raise_softirq_irqoff(NET_TX_SOFTIRQ); | |
38060 | local_irq_restore(flags); | |
38061 | + preempt_check_resched_rt(); | |
38062 | } | |
38063 | ||
38064 | void __netif_schedule(struct Qdisc *q) | |
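preempt_check_resched_rt(), added after several raise_softirq_irqoff()/local_irq_restore() pairs in this file, exists because softirqs on PREEMPT_RT are handled in thread context: raising one may wake the softirq thread, and since that wakeup happened with interrupts disabled, an explicit resched check right after they are re-enabled lets the thread run without waiting for the next natural preemption point. Elsewhere in this series it is defined roughly as follows (a sketch from the series' preempt.h, not verbatim):

    #ifdef CONFIG_PREEMPT_RT_BASE
    # define preempt_check_resched_rt()     preempt_check_resched()
    #else
    # define preempt_check_resched_rt()     barrier()
    #endif

On !RT it compiles to nothing, so these hot paths are unchanged there.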
5dd41b01 | 38065 | @@ -2522,6 +2529,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) |
1a6e0f06 JK |
38066 | __this_cpu_write(softnet_data.completion_queue, skb); |
38067 | raise_softirq_irqoff(NET_TX_SOFTIRQ); | |
38068 | local_irq_restore(flags); | |
38069 | + preempt_check_resched_rt(); | |
38070 | } | |
38071 | EXPORT_SYMBOL(__dev_kfree_skb_irq); | |
38072 | ||
5dd41b01 | 38073 | @@ -3197,7 +3205,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, |
1a6e0f06 JK |
38074 | * This permits qdisc->running owner to get the lock more |
38075 | * often and dequeue packets faster. | |
38076 | */ | |
38077 | +#ifdef CONFIG_PREEMPT_RT_FULL | |
38078 | + contended = true; | |
38079 | +#else | |
38080 | contended = qdisc_is_running(q); | |
38081 | +#endif | |
38082 | if (unlikely(contended)) | |
38083 | spin_lock(&q->busylock); | |
38084 | ||
5dd41b01 | 38085 | @@ -3268,8 +3280,10 @@ static void skb_update_prio(struct sk_buff *skb) |
1a6e0f06 JK |
38086 | #define skb_update_prio(skb) |
38087 | #endif | |
38088 | ||
38089 | +#ifndef CONFIG_PREEMPT_RT_FULL | |
38090 | DEFINE_PER_CPU(int, xmit_recursion); | |
38091 | EXPORT_SYMBOL(xmit_recursion); | |
38092 | +#endif | |
38093 | ||
38094 | /** | |
38095 | * dev_loopback_xmit - loop back @skb | |
5dd41b01 | 38096 | @@ -3509,9 +3523,12 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) |
e4b2b4a8 | 38097 | if (dev->flags & IFF_UP) { |
1a6e0f06 JK |
38098 | int cpu = smp_processor_id(); /* ok because BHs are off */ |
38099 | ||
e4b2b4a8 JK |
38100 | +#ifdef CONFIG_PREEMPT_RT_FULL |
38101 | + if (txq->xmit_lock_owner != current) { | |
38102 | +#else | |
1a6e0f06 JK |
38103 | if (txq->xmit_lock_owner != cpu) { |
38104 | - if (unlikely(__this_cpu_read(xmit_recursion) > | |
38105 | - XMIT_RECURSION_LIMIT)) | |
e4b2b4a8 | 38106 | +#endif |
1a6e0f06 JK |
38107 | + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) |
38108 | goto recursion_alert; | |
38109 | ||
38110 | skb = validate_xmit_skb(skb, dev); | |
5dd41b01 | 38111 | @@ -3521,9 +3538,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) |
1a6e0f06 JK |
38112 | HARD_TX_LOCK(dev, txq, cpu); |
38113 | ||
38114 | if (!netif_xmit_stopped(txq)) { | |
38115 | - __this_cpu_inc(xmit_recursion); | |
38116 | + xmit_rec_inc(); | |
38117 | skb = dev_hard_start_xmit(skb, dev, txq, &rc); | |
38118 | - __this_cpu_dec(xmit_recursion); | |
38119 | + xmit_rec_dec(); | |
38120 | if (dev_xmit_complete(rc)) { | |
38121 | HARD_TX_UNLOCK(dev, txq); | |
38122 | goto out; | |
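Both changes in this hunk follow from the same RT fact: a task transmitting with BHs disabled is still preemptible, so another task on the same CPU can enter the xmit path and see the first task's per-CPU recursion count, falsely tripping the recursion limit; likewise an xmit_lock_owner recorded as a CPU id cannot distinguish genuine recursion by the same task from an innocent second task. Hence lock ownership is compared against current and the recursion depth moves from a per-CPU counter into the task. The xmit_rec_*() helpers come from netdevice.h changes elsewhere in this series; approximately (sketch, the RT branch assumes a task_struct field added by the series):

    #ifdef CONFIG_PREEMPT_RT_FULL
    /* recursion depth lives in task_struct on RT */
    static inline int  xmit_rec_read(void) { return current->xmit_recursion; }
    static inline void xmit_rec_inc(void)  { current->xmit_recursion++; }
    static inline void xmit_rec_dec(void)  { current->xmit_recursion--; }
    #else
    DECLARE_PER_CPU(int, xmit_recursion);
    static inline int  xmit_rec_read(void) { return __this_cpu_read(xmit_recursion); }
    static inline void xmit_rec_inc(void)  { __this_cpu_inc(xmit_recursion); }
    static inline void xmit_rec_dec(void)  { __this_cpu_dec(xmit_recursion); }
    #endif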
5dd41b01 | 38123 | @@ -3904,6 +3921,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, |
1a6e0f06 JK |
38124 | rps_unlock(sd); |
38125 | ||
38126 | local_irq_restore(flags); | |
38127 | + preempt_check_resched_rt(); | |
38128 | ||
38129 | atomic_long_inc(&skb->dev->rx_dropped); | |
38130 | kfree_skb(skb); | |
5dd41b01 | 38131 | @@ -4056,7 +4074,7 @@ static int netif_rx_internal(struct sk_buff *skb) |
1a6e0f06 JK |
38132 | struct rps_dev_flow voidflow, *rflow = &voidflow; |
38133 | int cpu; | |
38134 | ||
38135 | - preempt_disable(); | |
38136 | + migrate_disable(); | |
38137 | rcu_read_lock(); | |
38138 | ||
38139 | cpu = get_rps_cpu(skb->dev, skb, &rflow); | |
5dd41b01 | 38140 | @@ -4066,14 +4084,14 @@ static int netif_rx_internal(struct sk_buff *skb) |
1a6e0f06 JK |
38141 | ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); |
38142 | ||
38143 | rcu_read_unlock(); | |
38144 | - preempt_enable(); | |
38145 | + migrate_enable(); | |
38146 | } else | |
38147 | #endif | |
38148 | { | |
38149 | unsigned int qtail; | |
e4b2b4a8 | 38150 | |
1a6e0f06 JK |
38151 | - ret = enqueue_to_backlog(skb, get_cpu(), &qtail); |
38152 | - put_cpu(); | |
38153 | + ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); | |
38154 | + put_cpu_light(); | |
38155 | } | |
38156 | return ret; | |
38157 | } | |
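get_cpu()/preempt_disable() forbid the sleeping spinlocks taken further down in enqueue_to_backlog() on RT. The migrate_disable()/*_light() variants used here instead pin the task to its current CPU while leaving it preemptible, which is all these paths need: a stable CPU id, not atomicity. Elsewhere in this series they are defined essentially as (sketch of the series' smp.h helpers):

    #define get_cpu_light()  ({ migrate_disable(); smp_processor_id(); })
    #define put_cpu_light()  migrate_enable()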
5dd41b01 | 38158 | @@ -4107,11 +4125,9 @@ int netif_rx_ni(struct sk_buff *skb) |
1a6e0f06 JK |
38159 | |
38160 | trace_netif_rx_ni_entry(skb); | |
38161 | ||
38162 | - preempt_disable(); | |
38163 | + local_bh_disable(); | |
38164 | err = netif_rx_internal(skb); | |
38165 | - if (local_softirq_pending()) | |
38166 | - do_softirq(); | |
38167 | - preempt_enable(); | |
38168 | + local_bh_enable(); | |
38169 | ||
38170 | return err; | |
38171 | } | |
5dd41b01 | 38172 | @@ -4629,7 +4645,7 @@ static void flush_backlog(struct work_struct *work) |
1a6e0f06 | 38173 | skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { |
c7c16703 | 38174 | if (skb->dev->reg_state == NETREG_UNREGISTERING) { |
1a6e0f06 JK |
38175 | __skb_unlink(skb, &sd->input_pkt_queue); |
38176 | - kfree_skb(skb); | |
38177 | + __skb_queue_tail(&sd->tofree_queue, skb); | |
38178 | input_queue_head_incr(sd); | |
38179 | } | |
38180 | } | |
5dd41b01 | 38181 | @@ -4639,11 +4655,14 @@ static void flush_backlog(struct work_struct *work) |
1a6e0f06 | 38182 | skb_queue_walk_safe(&sd->process_queue, skb, tmp) { |
c7c16703 | 38183 | if (skb->dev->reg_state == NETREG_UNREGISTERING) { |
1a6e0f06 JK |
38184 | __skb_unlink(skb, &sd->process_queue); |
38185 | - kfree_skb(skb); | |
38186 | + __skb_queue_tail(&sd->tofree_queue, skb); | |
38187 | input_queue_head_incr(sd); | |
38188 | } | |
38189 | } | |
1a6e0f06 JK |
38190 | + if (!skb_queue_empty(&sd->tofree_queue)) |
38191 | + raise_softirq_irqoff(NET_RX_SOFTIRQ); | |
c7c16703 JK |
38192 | local_bh_enable(); |
38193 | + | |
1a6e0f06 JK |
38194 | } |
38195 | ||
c7c16703 | 38196 | static void flush_all_backlogs(void) |
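flush_backlog() runs with the backlog queue lock held as a raw spinlock (see the rps_lock() change at the top of this file), and kfree_skb() may take sleeping locks on RT, so freeing inline is no longer allowed there. The skbs are instead parked on the new per-CPU sd->tofree_queue and NET_RX is raised; net_rx_action() (see the hunks below) splices that queue and frees the skbs in softirq context, where sleeping is fine on RT. The two halves of the idiom, reduced to a sketch with placeholder names:

    #include <linux/skbuff.h>
    #include <linux/interrupt.h>

    /* Producer: called with a raw spinlock held, IRQs off; no
     * kfree_skb() here, it may take sleeping locks on RT. */
    static void defer_free(struct sk_buff_head *tofree, struct sk_buff *skb)
    {
            __skb_queue_tail(tofree, skb);
            raise_softirq_irqoff(NET_RX_SOFTIRQ);
    }

    /* Consumer: softirq context, preemptible on RT, freeing is fine. */
    static void drain_deferred(struct sk_buff_head *tofree)
    {
            struct sk_buff *skb;

            while ((skb = __skb_dequeue(tofree)))
                    kfree_skb(skb);
    }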
5dd41b01 | 38197 | @@ -5153,12 +5172,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) |
1a6e0f06 JK |
38198 | sd->rps_ipi_list = NULL; |
38199 | ||
38200 | local_irq_enable(); | |
38201 | + preempt_check_resched_rt(); | |
38202 | ||
38203 | /* Send pending IPI's to kick RPS processing on remote cpus. */ | |
e4b2b4a8 | 38204 | net_rps_send_ipi(remsd); |
1a6e0f06 JK |
38205 | } else |
38206 | #endif | |
38207 | local_irq_enable(); | |
38208 | + preempt_check_resched_rt(); | |
38209 | } | |
38210 | ||
38211 | static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) | |
5dd41b01 | 38212 | @@ -5188,7 +5209,9 @@ static int process_backlog(struct napi_struct *napi, int quota) |
c7c16703 JK |
38213 | while (again) { |
38214 | struct sk_buff *skb; | |
38215 | ||
38216 | + local_irq_disable(); | |
38217 | while ((skb = __skb_dequeue(&sd->process_queue))) { | |
38218 | + local_irq_enable(); | |
38219 | rcu_read_lock(); | |
38220 | __netif_receive_skb(skb); | |
38221 | rcu_read_unlock(); | |
5dd41b01 | 38222 | @@ -5196,9 +5219,9 @@ static int process_backlog(struct napi_struct *napi, int quota) |
c7c16703 JK |
38223 | if (++work >= quota) |
38224 | return work; | |
38225 | ||
38226 | + local_irq_disable(); | |
38227 | } | |
38228 | ||
38229 | - local_irq_disable(); | |
38230 | rps_lock(sd); | |
38231 | if (skb_queue_empty(&sd->input_pkt_queue)) { | |
38232 | /* | |
5dd41b01 | 38233 | @@ -5236,6 +5259,7 @@ void __napi_schedule(struct napi_struct *n) |
1a6e0f06 JK |
38234 | local_irq_save(flags); |
38235 | ____napi_schedule(this_cpu_ptr(&softnet_data), n); | |
38236 | local_irq_restore(flags); | |
38237 | + preempt_check_resched_rt(); | |
38238 | } | |
38239 | EXPORT_SYMBOL(__napi_schedule); | |
38240 | ||
5dd41b01 | 38241 | @@ -5272,6 +5296,7 @@ bool napi_schedule_prep(struct napi_struct *n) |
e4b2b4a8 JK |
38242 | } |
38243 | EXPORT_SYMBOL(napi_schedule_prep); | |
38244 | ||
c7c16703 JK |
38245 | +#ifndef CONFIG_PREEMPT_RT_FULL |
38246 | /** | |
38247 | * __napi_schedule_irqoff - schedule for receive | |
38248 | * @n: entry to schedule | |
5dd41b01 | 38249 | @@ -5283,6 +5308,7 @@ void __napi_schedule_irqoff(struct napi_struct *n) |
c7c16703 JK |
38250 | ____napi_schedule(this_cpu_ptr(&softnet_data), n); |
38251 | } | |
38252 | EXPORT_SYMBOL(__napi_schedule_irqoff); | |
38253 | +#endif | |
38254 | ||
e4b2b4a8 | 38255 | bool napi_complete_done(struct napi_struct *n, int work_done) |
c7c16703 | 38256 | { |
5dd41b01 | 38257 | @@ -5637,13 +5663,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) |
e4b2b4a8 JK |
38258 | unsigned long time_limit = jiffies + |
38259 | usecs_to_jiffies(netdev_budget_usecs); | |
c7c16703 JK |
38260 | int budget = netdev_budget; |
38261 | + struct sk_buff_head tofree_q; | |
38262 | + struct sk_buff *skb; | |
38263 | LIST_HEAD(list); | |
38264 | LIST_HEAD(repoll); | |
38265 | ||
38266 | + __skb_queue_head_init(&tofree_q); | |
38267 | + | |
38268 | local_irq_disable(); | |
38269 | + skb_queue_splice_init(&sd->tofree_queue, &tofree_q); | |
38270 | list_splice_init(&sd->poll_list, &list); | |
38271 | local_irq_enable(); | |
38272 | ||
38273 | + while ((skb = __skb_dequeue(&tofree_q))) | |
38274 | + kfree_skb(skb); | |
38275 | + | |
38276 | for (;;) { | |
38277 | struct napi_struct *n; | |
38278 | ||
5dd41b01 | 38279 | @@ -5673,7 +5707,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h) |
1a6e0f06 JK |
38280 | list_splice_tail(&repoll, &list); |
38281 | list_splice(&list, &sd->poll_list); | |
38282 | if (!list_empty(&sd->poll_list)) | |
38283 | - __raise_softirq_irqoff(NET_RX_SOFTIRQ); | |
38284 | + __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ); | |
38285 | ||
38286 | net_rps_action_and_irq_enable(sd); | |
e4b2b4a8 | 38287 | out: |
5dd41b01 | 38288 | @@ -7502,7 +7536,7 @@ static void netdev_init_one_queue(struct net_device *dev, |
e4b2b4a8 JK |
38289 | /* Initialize queue lock */ |
38290 | spin_lock_init(&queue->_xmit_lock); | |
38291 | netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); | |
38292 | - queue->xmit_lock_owner = -1; | |
38293 | + netdev_queue_clear_owner(queue); | |
38294 | netdev_queue_numa_node_write(queue, NUMA_NO_NODE); | |
38295 | queue->dev = dev; | |
38296 | #ifdef CONFIG_BQL | |
5dd41b01 | 38297 | @@ -8442,6 +8476,7 @@ static int dev_cpu_dead(unsigned int oldcpu) |
1a6e0f06 JK |
38298 | |
38299 | raise_softirq_irqoff(NET_TX_SOFTIRQ); | |
38300 | local_irq_enable(); | |
38301 | + preempt_check_resched_rt(); | |
38302 | ||
e4b2b4a8 JK |
38303 | #ifdef CONFIG_RPS |
38304 | remsd = oldsd->rps_ipi_list; | |
5dd41b01 | 38305 | @@ -8455,10 +8490,13 @@ static int dev_cpu_dead(unsigned int oldcpu) |
1a6e0f06 JK |
38306 | netif_rx_ni(skb); |
38307 | input_queue_head_incr(oldsd); | |
38308 | } | |
38309 | - while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { | |
38310 | + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { | |
38311 | netif_rx_ni(skb); | |
38312 | input_queue_head_incr(oldsd); | |
38313 | } | |
38314 | + while ((skb = __skb_dequeue(&oldsd->tofree_queue))) { | |
38315 | + kfree_skb(skb); | |
38316 | + } | |
38317 | ||
e4b2b4a8 | 38318 | return 0; |
1a6e0f06 | 38319 | } |
5dd41b01 | 38320 | @@ -8762,8 +8800,9 @@ static int __init net_dev_init(void) |
c7c16703 JK |
38321 | |
38322 | INIT_WORK(flush, flush_backlog); | |
1a6e0f06 JK |
38323 | |
38324 | - skb_queue_head_init(&sd->input_pkt_queue); | |
38325 | - skb_queue_head_init(&sd->process_queue); | |
38326 | + skb_queue_head_init_raw(&sd->input_pkt_queue); | |
38327 | + skb_queue_head_init_raw(&sd->process_queue); | |
38328 | + skb_queue_head_init_raw(&sd->tofree_queue); | |
38329 | INIT_LIST_HEAD(&sd->poll_list); | |
38330 | sd->output_queue_tailp = &sd->output_queue; | |
38331 | #ifdef CONFIG_RPS | |
b3bbd485 JK |
38332 | diff --git a/net/core/filter.c b/net/core/filter.c |
38333 | index d5158a10ac8f..ad96ec78f7b8 100644 | |
38334 | --- a/net/core/filter.c | |
38335 | +++ b/net/core/filter.c | |
38336 | @@ -1696,7 +1696,7 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) | |
1a6e0f06 JK |
38337 | { |
38338 | int ret; | |
38339 | ||
38340 | - if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) { | |
38341 | + if (unlikely(xmit_rec_read() > XMIT_RECURSION_LIMIT)) { | |
38342 | net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n"); | |
38343 | kfree_skb(skb); | |
38344 | return -ENETDOWN; | |
b3bbd485 | 38345 | @@ -1704,9 +1704,9 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) |
1a6e0f06 JK |
38346 | |
38347 | skb->dev = dev; | |
38348 | ||
38349 | - __this_cpu_inc(xmit_recursion); | |
38350 | + xmit_rec_inc(); | |
38351 | ret = dev_queue_xmit(skb); | |
38352 | - __this_cpu_dec(xmit_recursion); | |
38353 | + xmit_rec_dec(); | |
38354 | ||
38355 | return ret; | |
38356 | } | |
b3bbd485 JK |
38357 | diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c |
38358 | index 7f980bd7426e..7250106015ef 100644 | |
38359 | --- a/net/core/gen_estimator.c | |
38360 | +++ b/net/core/gen_estimator.c | |
e4b2b4a8 JK |
38361 | @@ -46,7 +46,7 @@ |
38362 | struct net_rate_estimator { | |
1a6e0f06 | 38363 | struct gnet_stats_basic_packed *bstats; |
1a6e0f06 JK |
38364 | spinlock_t *stats_lock; |
38365 | - seqcount_t *running; | |
38366 | + net_seqlock_t *running; | |
e4b2b4a8 JK |
38367 | struct gnet_stats_basic_cpu __percpu *cpu_bstats; |
38368 | u8 ewma_log; | |
38369 | u8 intvl_log; /* period : (250ms << intvl_log) */ | |
b3bbd485 | 38370 | @@ -129,7 +129,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, |
1a6e0f06 | 38371 | struct gnet_stats_basic_cpu __percpu *cpu_bstats, |
e4b2b4a8 | 38372 | struct net_rate_estimator __rcu **rate_est, |
1a6e0f06 JK |
38373 | spinlock_t *stats_lock, |
38374 | - seqcount_t *running, | |
38375 | + net_seqlock_t *running, | |
38376 | struct nlattr *opt) | |
38377 | { | |
e4b2b4a8 | 38378 | struct gnet_estimator *parm = nla_data(opt); |
b3bbd485 | 38379 | @@ -222,7 +222,7 @@ int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, |
1a6e0f06 | 38380 | struct gnet_stats_basic_cpu __percpu *cpu_bstats, |
e4b2b4a8 | 38381 | struct net_rate_estimator __rcu **rate_est, |
1a6e0f06 JK |
38382 | spinlock_t *stats_lock, |
38383 | - seqcount_t *running, struct nlattr *opt) | |
38384 | + net_seqlock_t *running, struct nlattr *opt) | |
38385 | { | |
e4b2b4a8 JK |
38386 | return gen_new_estimator(bstats, cpu_bstats, rate_est, |
38387 | stats_lock, running, opt); | |
b3bbd485 JK |
38388 | diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c |
38389 | index 441c04adedba..07f9a6a1f8e4 100644 | |
38390 | --- a/net/core/gen_stats.c | |
38391 | +++ b/net/core/gen_stats.c | |
38392 | @@ -142,7 +142,7 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, | |
1a6e0f06 JK |
38393 | } |
38394 | ||
38395 | void | |
38396 | -__gnet_stats_copy_basic(const seqcount_t *running, | |
38397 | +__gnet_stats_copy_basic(net_seqlock_t *running, | |
38398 | struct gnet_stats_basic_packed *bstats, | |
38399 | struct gnet_stats_basic_cpu __percpu *cpu, | |
38400 | struct gnet_stats_basic_packed *b) | |
b3bbd485 | 38401 | @@ -155,10 +155,10 @@ __gnet_stats_copy_basic(const seqcount_t *running, |
1a6e0f06 JK |
38402 | } |
38403 | do { | |
38404 | if (running) | |
38405 | - seq = read_seqcount_begin(running); | |
38406 | + seq = net_seq_begin(running); | |
38407 | bstats->bytes = b->bytes; | |
38408 | bstats->packets = b->packets; | |
38409 | - } while (running && read_seqcount_retry(running, seq)); | |
38410 | + } while (running && net_seq_retry(running, seq)); | |
38411 | } | |
38412 | EXPORT_SYMBOL(__gnet_stats_copy_basic); | |
38413 | ||
b3bbd485 | 38414 | @@ -176,7 +176,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic); |
1a6e0f06 JK |
38415 | * if the room in the socket buffer was not sufficient. |
38416 | */ | |
38417 | int | |
38418 | -gnet_stats_copy_basic(const seqcount_t *running, | |
38419 | +gnet_stats_copy_basic(net_seqlock_t *running, | |
38420 | struct gnet_dump *d, | |
38421 | struct gnet_stats_basic_cpu __percpu *cpu, | |
38422 | struct gnet_stats_basic_packed *b) | |
b3bbd485 JK |
38423 | diff --git a/net/core/pktgen.c b/net/core/pktgen.c |
38424 | index 6e1e10ff433a..c1ae4075e0ed 100644 | |
38425 | --- a/net/core/pktgen.c | |
38426 | +++ b/net/core/pktgen.c | |
38427 | @@ -2252,7 +2252,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) | |
e4b2b4a8 JK |
38428 | s64 remaining; |
38429 | struct hrtimer_sleeper t; | |
38430 | ||
38431 | - hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); | |
38432 | + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS, | |
38433 | + current); | |
38434 | hrtimer_set_expires(&t.timer, spin_until); | |
38435 | ||
38436 | remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); | |
b3bbd485 | 38437 | @@ -2267,7 +2268,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) |
e4b2b4a8 JK |
38438 | } while (ktime_compare(end_time, spin_until) < 0); |
38439 | } else { | |
38440 | /* see do_nanosleep */ | |
38441 | - hrtimer_init_sleeper(&t, current); | |
38442 | do { | |
38443 | set_current_state(TASK_INTERRUPTIBLE); | |
38444 | hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); | |
b3bbd485 | 38445 | diff --git a/net/core/skbuff.c b/net/core/skbuff.c |
5dd41b01 | 38446 | index 9f80b947f53b..c0f23b8dcfc6 100644 |
b3bbd485 JK |
38447 | --- a/net/core/skbuff.c |
38448 | +++ b/net/core/skbuff.c | |
e4b2b4a8 | 38449 | @@ -63,6 +63,7 @@ |
1a6e0f06 JK |
38450 | #include <linux/errqueue.h> |
38451 | #include <linux/prefetch.h> | |
38452 | #include <linux/if_vlan.h> | |
38453 | +#include <linux/locallock.h> | |
38454 | ||
38455 | #include <net/protocol.h> | |
38456 | #include <net/dst.h> | |
b3bbd485 | 38457 | @@ -330,6 +331,8 @@ struct napi_alloc_cache { |
1a6e0f06 JK |
38458 | |
38459 | static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); | |
38460 | static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); | |
38461 | +static DEFINE_LOCAL_IRQ_LOCK(netdev_alloc_lock); | |
38462 | +static DEFINE_LOCAL_IRQ_LOCK(napi_alloc_cache_lock); | |
38463 | ||
38464 | static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) | |
38465 | { | |
b3bbd485 | 38466 | @@ -337,10 +340,10 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) |
1a6e0f06 JK |
38467 | unsigned long flags; |
38468 | void *data; | |
38469 | ||
38470 | - local_irq_save(flags); | |
38471 | + local_lock_irqsave(netdev_alloc_lock, flags); | |
38472 | nc = this_cpu_ptr(&netdev_alloc_cache); | |
e4b2b4a8 | 38473 | data = page_frag_alloc(nc, fragsz, gfp_mask); |
1a6e0f06 JK |
38474 | - local_irq_restore(flags); |
38475 | + local_unlock_irqrestore(netdev_alloc_lock, flags); | |
38476 | return data; | |
38477 | } | |
38478 | ||
b3bbd485 | 38479 | @@ -359,9 +362,13 @@ EXPORT_SYMBOL(netdev_alloc_frag); |
1a6e0f06 JK |
38480 | |
38481 | static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) | |
38482 | { | |
38483 | - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); | |
38484 | + struct napi_alloc_cache *nc; | |
38485 | + void *data; | |
38486 | ||
e4b2b4a8 | 38487 | - return page_frag_alloc(&nc->page, fragsz, gfp_mask); |
1a6e0f06 | 38488 | + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); |
e4b2b4a8 | 38489 | + data = page_frag_alloc(&nc->page, fragsz, gfp_mask); |
1a6e0f06 JK |
38490 | + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); |
38491 | + return data; | |
38492 | } | |
38493 | ||
38494 | void *napi_alloc_frag(unsigned int fragsz) | |
b3bbd485 | 38495 | @@ -408,13 +415,13 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, |
1a6e0f06 JK |
38496 | if (sk_memalloc_socks()) |
38497 | gfp_mask |= __GFP_MEMALLOC; | |
38498 | ||
38499 | - local_irq_save(flags); | |
38500 | + local_lock_irqsave(netdev_alloc_lock, flags); | |
38501 | ||
38502 | nc = this_cpu_ptr(&netdev_alloc_cache); | |
e4b2b4a8 | 38503 | data = page_frag_alloc(nc, len, gfp_mask); |
1a6e0f06 JK |
38504 | pfmemalloc = nc->pfmemalloc; |
38505 | ||
38506 | - local_irq_restore(flags); | |
38507 | + local_unlock_irqrestore(netdev_alloc_lock, flags); | |
38508 | ||
38509 | if (unlikely(!data)) | |
38510 | return NULL; | |
b3bbd485 | 38511 | @@ -455,9 +462,10 @@ EXPORT_SYMBOL(__netdev_alloc_skb); |
1a6e0f06 JK |
38512 | struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, |
38513 | gfp_t gfp_mask) | |
38514 | { | |
38515 | - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); | |
38516 | + struct napi_alloc_cache *nc; | |
38517 | struct sk_buff *skb; | |
38518 | void *data; | |
38519 | + bool pfmemalloc; | |
38520 | ||
38521 | len += NET_SKB_PAD + NET_IP_ALIGN; | |
38522 | ||
b3bbd485 | 38523 | @@ -475,7 +483,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, |
1a6e0f06 JK |
38524 | if (sk_memalloc_socks()) |
38525 | gfp_mask |= __GFP_MEMALLOC; | |
38526 | ||
38527 | + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
e4b2b4a8 | 38528 | data = page_frag_alloc(&nc->page, len, gfp_mask); |
1a6e0f06 JK |
38529 | + pfmemalloc = nc->page.pfmemalloc; |
38530 | + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
38531 | if (unlikely(!data)) | |
38532 | return NULL; | |
38533 | ||
b3bbd485 | 38534 | @@ -486,7 +497,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, |
1a6e0f06 JK |
38535 | } |
38536 | ||
38537 | /* use OR instead of assignment to avoid clearing of bits in mask */ | |
38538 | - if (nc->page.pfmemalloc) | |
38539 | + if (pfmemalloc) | |
38540 | skb->pfmemalloc = 1; | |
38541 | skb->head_frag = 1; | |
38542 | ||
b3bbd485 | 38543 | @@ -718,23 +729,26 @@ void __consume_stateless_skb(struct sk_buff *skb) |
1a6e0f06 JK |
38544 | |
38545 | void __kfree_skb_flush(void) | |
38546 | { | |
38547 | - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); | |
38548 | + struct napi_alloc_cache *nc; | |
38549 | ||
38550 | + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
38551 | /* flush skb_cache if containing objects */ | |
38552 | if (nc->skb_count) { | |
38553 | kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count, | |
38554 | nc->skb_cache); | |
38555 | nc->skb_count = 0; | |
38556 | } | |
38557 | + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
38558 | } | |
38559 | ||
38560 | static inline void _kfree_skb_defer(struct sk_buff *skb) | |
38561 | { | |
38562 | - struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); | |
38563 | + struct napi_alloc_cache *nc; | |
38564 | ||
38565 | /* drop skb->head and call any destructors for packet */ | |
38566 | skb_release_all(skb); | |
38567 | ||
38568 | + nc = &get_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
38569 | /* record skb to CPU local list */ | |
38570 | nc->skb_cache[nc->skb_count++] = skb; | |
38571 | ||
b3bbd485 | 38572 | @@ -749,6 +763,7 @@ static inline void _kfree_skb_defer(struct sk_buff *skb) |
1a6e0f06 JK |
38573 | nc->skb_cache); |
38574 | nc->skb_count = 0; | |
38575 | } | |
38576 | + put_locked_var(napi_alloc_cache_lock, napi_alloc_cache); | |
38577 | } | |
38578 | void __kfree_skb_defer(struct sk_buff *skb) | |
38579 | { | |
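All four allocator/free paths above trade local_irq_save() (or a bare this_cpu_ptr()) for a local lock around the per-CPU caches. On !RT a local lock degrades to exactly the old behavior (IRQs or preemption off); on RT it is a per-CPU sleeping lock, so the cache stays consistent while the section remains preemptible. Note the correctness detail in __napi_alloc_skb(): pfmemalloc is sampled inside the locked region, because nc may no longer be the right per-CPU pointer once the lock is dropped. The usage pattern, reduced to its core (locallock.h and get_locked_var()/put_locked_var() come from this series; the cache below is a placeholder):

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    struct my_cache { int count; };
    static DEFINE_PER_CPU(struct my_cache, my_cache);
    static DEFINE_LOCAL_IRQ_LOCK(my_cache_lock);

    static void my_cache_op(void)
    {
            struct my_cache *c;

            /* !RT: local_irq_save + this_cpu_ptr; RT: per-CPU rtmutex */
            c = &get_locked_var(my_cache_lock, my_cache);
            c->count++;
            put_locked_var(my_cache_lock, my_cache);
    }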
b3bbd485 JK |
38580 | diff --git a/net/core/sock.c b/net/core/sock.c |
38581 | index 68d08ed5521e..ee242ff5d4b1 100644 | |
38582 | --- a/net/core/sock.c | |
38583 | +++ b/net/core/sock.c | |
38584 | @@ -2757,12 +2757,11 @@ void lock_sock_nested(struct sock *sk, int subclass) | |
1a6e0f06 JK |
38585 | if (sk->sk_lock.owned) |
38586 | __lock_sock(sk); | |
38587 | sk->sk_lock.owned = 1; | |
38588 | - spin_unlock(&sk->sk_lock.slock); | |
38589 | + spin_unlock_bh(&sk->sk_lock.slock); | |
38590 | /* | |
38591 | * The sk_lock has mutex_lock() semantics here: | |
38592 | */ | |
38593 | mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); | |
38594 | - local_bh_enable(); | |
38595 | } | |
38596 | EXPORT_SYMBOL(lock_sock_nested); | |
38597 | ||
b3bbd485 JK |
38598 | diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c |
38599 | index 3c1570d3e22f..0310ea93f877 100644 | |
38600 | --- a/net/ipv4/icmp.c | |
38601 | +++ b/net/ipv4/icmp.c | |
e4b2b4a8 | 38602 | @@ -77,6 +77,7 @@ |
1a6e0f06 JK |
38603 | #include <linux/string.h> |
38604 | #include <linux/netfilter_ipv4.h> | |
38605 | #include <linux/slab.h> | |
38606 | +#include <linux/locallock.h> | |
38607 | #include <net/snmp.h> | |
38608 | #include <net/ip.h> | |
38609 | #include <net/route.h> | |
b3bbd485 | 38610 | @@ -204,6 +205,8 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; |
1a6e0f06 JK |
38611 | * |
38612 | * On SMP we have one ICMP socket per-cpu. | |
38613 | */ | |
38614 | +static DEFINE_LOCAL_IRQ_LOCK(icmp_sk_lock); | |
38615 | + | |
38616 | static struct sock *icmp_sk(struct net *net) | |
38617 | { | |
38618 | return *this_cpu_ptr(net->ipv4.icmp_sk); | |
b3bbd485 | 38619 | @@ -214,12 +217,16 @@ static inline struct sock *icmp_xmit_lock(struct net *net) |
e4b2b4a8 JK |
38620 | { |
38621 | struct sock *sk; | |
1a6e0f06 | 38622 | |
e4b2b4a8 JK |
38623 | + if (!local_trylock(icmp_sk_lock)) |
38624 | + return NULL; | |
38625 | + | |
1a6e0f06 JK |
38626 | sk = icmp_sk(net); |
38627 | ||
38628 | if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { | |
38629 | /* This can happen if the output path signals a | |
38630 | * dst_link_failure() for an outgoing ICMP packet. | |
38631 | */ | |
38632 | + local_unlock(icmp_sk_lock); | |
1a6e0f06 JK |
38633 | return NULL; |
38634 | } | |
e4b2b4a8 | 38635 | return sk; |
b3bbd485 | 38636 | @@ -228,6 +235,7 @@ static inline struct sock *icmp_xmit_lock(struct net *net) |
1a6e0f06 JK |
38637 | static inline void icmp_xmit_unlock(struct sock *sk) |
38638 | { | |
e4b2b4a8 | 38639 | spin_unlock(&sk->sk_lock.slock); |
1a6e0f06 JK |
38640 | + local_unlock(icmp_sk_lock); |
38641 | } | |
38642 | ||
38643 | int sysctl_icmp_msgs_per_sec __read_mostly = 1000; | |
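On RT, BH-disabled code is preemptible, so the per-CPU ICMP socket is no longer implicitly owned by whoever is running on the CPU; icmp_sk_lock makes that ownership explicit. local_trylock() is used rather than local_lock() because icmp_xmit_lock() already has a failure path (callers simply drop the packet) and must never spin or sleep here — the same function can be re-entered on this CPU via dst_link_failure() on the output path. A generic sketch of the trylock idiom, with placeholder names:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    static DEFINE_LOCAL_IRQ_LOCK(res_lock);
    static DEFINE_PER_CPU(int, res);

    /* Returns false if the per-CPU resource is busy on this CPU;
     * the caller must be able to back off (ICMP just drops). */
    static bool res_try_use(void)
    {
            if (!local_trylock(res_lock))
                    return false;
            this_cpu_inc(res);
            local_unlock(res_lock);
            return true;
    }

The tcp_ipv4.c hunks below apply the same pattern with a plain local_lock(), since those reply paths may wait for the lock.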
b3bbd485 | 38644 | diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c |
5dd41b01 | 38645 | index 31b34c0c2d5f..851f241e70b5 100644 |
b3bbd485 JK |
38646 | --- a/net/ipv4/tcp_ipv4.c |
38647 | +++ b/net/ipv4/tcp_ipv4.c | |
1a6e0f06 JK |
38648 | @@ -62,6 +62,7 @@ |
38649 | #include <linux/init.h> | |
38650 | #include <linux/times.h> | |
38651 | #include <linux/slab.h> | |
38652 | +#include <linux/locallock.h> | |
38653 | ||
38654 | #include <net/net_namespace.h> | |
38655 | #include <net/icmp.h> | |
b3bbd485 | 38656 | @@ -580,6 +581,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) |
1a6e0f06 JK |
38657 | } |
38658 | EXPORT_SYMBOL(tcp_v4_send_check); | |
38659 | ||
38660 | +static DEFINE_LOCAL_IRQ_LOCK(tcp_sk_lock); | |
38661 | /* | |
38662 | * This routine will send an RST to the other tcp. | |
38663 | * | |
b3bbd485 | 38664 | @@ -710,6 +712,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) |
1a6e0f06 | 38665 | arg.tos = ip_hdr(skb)->tos; |
e4b2b4a8 | 38666 | arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); |
1a6e0f06 | 38667 | local_bh_disable(); |
e4b2b4a8 | 38668 | + local_lock(tcp_sk_lock); |
1a6e0f06 JK |
38669 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
38670 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | |
e4b2b4a8 | 38671 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
b3bbd485 | 38672 | @@ -717,6 +720,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) |
e4b2b4a8 | 38673 | |
1a6e0f06 JK |
38674 | __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); |
38675 | __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); | |
1a6e0f06 | 38676 | + local_unlock(tcp_sk_lock); |
e4b2b4a8 | 38677 | local_bh_enable(); |
1a6e0f06 JK |
38678 | |
38679 | #ifdef CONFIG_TCP_MD5SIG | |
b3bbd485 | 38680 | @@ -796,12 +800,14 @@ static void tcp_v4_send_ack(const struct sock *sk, |
1a6e0f06 | 38681 | arg.tos = tos; |
e4b2b4a8 | 38682 | arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); |
1a6e0f06 | 38683 | local_bh_disable(); |
e4b2b4a8 | 38684 | + local_lock(tcp_sk_lock); |
1a6e0f06 JK |
38685 | ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), |
38686 | skb, &TCP_SKB_CB(skb)->header.h4.opt, | |
e4b2b4a8 JK |
38687 | ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, |
38688 | &arg, arg.iov[0].iov_len); | |
1a6e0f06 JK |
38689 | |
38690 | __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); | |
1a6e0f06 | 38691 | + local_unlock(tcp_sk_lock); |
e4b2b4a8 | 38692 | local_bh_enable(); |
1a6e0f06 JK |
38693 | } |
38694 | ||
b3bbd485 JK |
38695 | diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c |
38696 | index dddd498e1338..8f39b8162df8 100644 | |
38697 | --- a/net/mac80211/rx.c | |
38698 | +++ b/net/mac80211/rx.c | |
38699 | @@ -4252,7 +4252,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, | |
1a6e0f06 JK |
38700 | struct ieee80211_supported_band *sband; |
38701 | struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); | |
38702 | ||
38703 | - WARN_ON_ONCE(softirq_count() == 0); | |
38704 | + WARN_ON_ONCE_NONRT(softirq_count() == 0); | |
38705 | ||
38706 | if (WARN_ON(status->band >= NUM_NL80211_BANDS)) | |
38707 | goto drop; | |
b3bbd485 JK |
38708 | diff --git a/net/netfilter/core.c b/net/netfilter/core.c |
38709 | index 52cd2901a097..c63e937b6676 100644 | |
38710 | --- a/net/netfilter/core.c | |
38711 | +++ b/net/netfilter/core.c | |
e4b2b4a8 JK |
38712 | @@ -21,6 +21,7 @@ |
38713 | #include <linux/inetdevice.h> | |
1a6e0f06 JK |
38714 | #include <linux/proc_fs.h> |
38715 | #include <linux/mutex.h> | |
1a6e0f06 | 38716 | +#include <linux/locallock.h> |
e4b2b4a8 | 38717 | #include <linux/mm.h> |
c7c16703 | 38718 | #include <linux/rcupdate.h> |
1a6e0f06 | 38719 | #include <net/net_namespace.h> |
e4b2b4a8 | 38720 | @@ -28,6 +29,11 @@ |
1a6e0f06 JK |
38721 | |
38722 | #include "nf_internals.h" | |
38723 | ||
38724 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
38725 | +DEFINE_LOCAL_IRQ_LOCK(xt_write_lock); | |
38726 | +EXPORT_PER_CPU_SYMBOL(xt_write_lock); | |
38727 | +#endif | |
38728 | + | |
38729 | static DEFINE_MUTEX(afinfo_mutex); | |
38730 | ||
38731 | const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; | |
b3bbd485 | 38732 | diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c |
5dd41b01 | 38733 | index 8d1a7c900393..f1f56be3b061 100644 |
b3bbd485 JK |
38734 | --- a/net/packet/af_packet.c |
38735 | +++ b/net/packet/af_packet.c | |
1a6e0f06 JK |
38736 | @@ -63,6 +63,7 @@ |
38737 | #include <linux/if_packet.h> | |
38738 | #include <linux/wireless.h> | |
38739 | #include <linux/kernel.h> | |
38740 | +#include <linux/delay.h> | |
38741 | #include <linux/kmod.h> | |
38742 | #include <linux/slab.h> | |
38743 | #include <linux/vmalloc.h> | |
b3bbd485 | 38744 | @@ -707,7 +708,7 @@ static void prb_retire_rx_blk_timer_expired(unsigned long data) |
1a6e0f06 JK |
38745 | if (BLOCK_NUM_PKTS(pbd)) { |
38746 | while (atomic_read(&pkc->blk_fill_in_prog)) { | |
38747 | /* Waiting for skb_copy_bits to finish... */ | |
38748 | - cpu_relax(); | |
38749 | + cpu_chill(); | |
38750 | } | |
38751 | } | |
38752 | ||
b3bbd485 | 38753 | @@ -969,7 +970,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, |
1a6e0f06 JK |
38754 | if (!(status & TP_STATUS_BLK_TMO)) { |
38755 | while (atomic_read(&pkc->blk_fill_in_prog)) { | |
38756 | /* Waiting for skb_copy_bits to finish... */ | |
38757 | - cpu_relax(); | |
38758 | + cpu_chill(); | |
38759 | } | |
38760 | } | |
38761 | prb_close_block(pkc, pbd, po, status); | |
b3bbd485 JK |
38762 | diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c |
38763 | index 9a3c54e659e9..2a95f1d587ac 100644 | |
38764 | --- a/net/rds/ib_rdma.c | |
38765 | +++ b/net/rds/ib_rdma.c | |
1a6e0f06 JK |
38766 | @@ -34,6 +34,7 @@ |
38767 | #include <linux/slab.h> | |
38768 | #include <linux/rculist.h> | |
38769 | #include <linux/llist.h> | |
38770 | +#include <linux/delay.h> | |
38771 | ||
38772 | #include "rds_single_path.h" | |
38773 | #include "ib_mr.h" | |
b3bbd485 | 38774 | @@ -210,7 +211,7 @@ static inline void wait_clean_list_grace(void) |
1a6e0f06 JK |
38775 | for_each_online_cpu(cpu) { |
38776 | flag = &per_cpu(clean_list_grace, cpu); | |
38777 | while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) | |
38778 | - cpu_relax(); | |
38779 | + cpu_chill(); | |
38780 | } | |
38781 | } | |
38782 | ||
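Both busy-wait loops converted here (af_packet above and rds) spin until another context clears a flag. On RT the flag holder may be a preempted, lower-priority task on the same CPU, so cpu_relax() can livelock; cpu_chill(), provided by this series via linux/delay.h, sleeps briefly instead, letting the holder run (on !RT it falls back to cpu_relax()). Usage sketch, with a placeholder flag:

    #include <linux/delay.h>    /* cpu_chill() lives here in this series */

    /* RT-safe wait for another context to clear a busy bit */
    while (test_bit(MY_BUSY_BIT, &flags))
            cpu_chill();        /* short sleep on RT; never a hard spin */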
b3bbd485 JK |
38783 | diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c |
38784 | index e9f428351293..c4479afe8ae7 100644 | |
38785 | --- a/net/rxrpc/security.c | |
38786 | +++ b/net/rxrpc/security.c | |
1a6e0f06 JK |
38787 | @@ -19,9 +19,6 @@ |
38788 | #include <keys/rxrpc-type.h> | |
38789 | #include "ar-internal.h" | |
38790 | ||
38791 | -static LIST_HEAD(rxrpc_security_methods); | |
38792 | -static DECLARE_RWSEM(rxrpc_security_sem); | |
38793 | - | |
38794 | static const struct rxrpc_security *rxrpc_security_types[] = { | |
38795 | [RXRPC_SECURITY_NONE] = &rxrpc_no_security, | |
38796 | #ifdef CONFIG_RXKAD | |
b3bbd485 | 38797 | diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c |
5dd41b01 | 38798 | index cd69aa067543..73348ac5019f 100644 |
b3bbd485 JK |
38799 | --- a/net/sched/sch_api.c |
38800 | +++ b/net/sched/sch_api.c | |
38801 | @@ -1081,7 +1081,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, | |
1a6e0f06 JK |
38802 | rcu_assign_pointer(sch->stab, stab); |
38803 | } | |
38804 | if (tca[TCA_RATE]) { | |
38805 | - seqcount_t *running; | |
38806 | + net_seqlock_t *running; | |
38807 | ||
38808 | err = -EOPNOTSUPP; | |
38809 | if (sch->flags & TCQ_F_MQROOT) | |
b3bbd485 JK |
38810 | diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c |
38811 | index 79549baf5804..341f7895659c 100644 | |
38812 | --- a/net/sched/sch_generic.c | |
38813 | +++ b/net/sched/sch_generic.c | |
38814 | @@ -429,7 +429,11 @@ struct Qdisc noop_qdisc = { | |
c7c16703 | 38815 | .ops = &noop_qdisc_ops, |
1a6e0f06 JK |
38816 | .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), |
38817 | .dev_queue = &noop_netdev_queue, | |
38818 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
38819 | + .running = __SEQLOCK_UNLOCKED(noop_qdisc.running), | |
38820 | +#else | |
38821 | .running = SEQCNT_ZERO(noop_qdisc.running), | |
38822 | +#endif | |
38823 | .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), | |
38824 | }; | |
38825 | EXPORT_SYMBOL(noop_qdisc); | |
b3bbd485 | 38826 | @@ -628,9 +632,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, |
1a6e0f06 JK |
38827 | lockdep_set_class(&sch->busylock, |
38828 | dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); | |
38829 | ||
38830 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
38831 | + seqlock_init(&sch->running); | |
38832 | + lockdep_set_class(&sch->running.seqcount, | |
38833 | + dev->qdisc_running_key ?: &qdisc_running_key); | |
38834 | + lockdep_set_class(&sch->running.lock, | |
38835 | + dev->qdisc_running_key ?: &qdisc_running_key); | |
38836 | +#else | |
38837 | seqcount_init(&sch->running); | |
38838 | lockdep_set_class(&sch->running, | |
38839 | dev->qdisc_running_key ?: &qdisc_running_key); | |
38840 | +#endif | |
38841 | ||
38842 | sch->ops = ops; | |
38843 | sch->enqueue = ops->enqueue; | |
b3bbd485 | 38844 | @@ -933,7 +945,7 @@ void dev_deactivate_many(struct list_head *head) |
1a6e0f06 | 38845 | /* Wait for outstanding qdisc_run calls. */ |
e4b2b4a8 | 38846 | list_for_each_entry(dev, head, close_list) { |
1a6e0f06 JK |
38847 | while (some_qdisc_is_busy(dev)) |
38848 | - yield(); | |
38849 | + msleep(1); | |
e4b2b4a8 JK |
38850 | /* The new qdisc is assigned at this point so we can safely |
38851 | * unwind stale skb lists and qdisc statistics | |
38852 | */ | |
b3bbd485 JK |
38853 | diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c |
38854 | index d16a8b423c20..cedaf909eb97 100644 | |
38855 | --- a/net/sunrpc/svc_xprt.c | |
38856 | +++ b/net/sunrpc/svc_xprt.c | |
38857 | @@ -396,7 +396,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) | |
1a6e0f06 JK |
38858 | goto out; |
38859 | } | |
38860 | ||
38861 | - cpu = get_cpu(); | |
38862 | + cpu = get_cpu_light(); | |
38863 | pool = svc_pool_for_cpu(xprt->xpt_server, cpu); | |
38864 | ||
38865 | atomic_long_inc(&pool->sp_stats.packets); | |
b3bbd485 | 38866 | @@ -432,7 +432,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) |
1a6e0f06 JK |
38867 | |
38868 | atomic_long_inc(&pool->sp_stats.threads_woken); | |
38869 | wake_up_process(rqstp->rq_task); | |
38870 | - put_cpu(); | |
38871 | + put_cpu_light(); | |
38872 | goto out; | |
38873 | } | |
38874 | rcu_read_unlock(); | |
b3bbd485 | 38875 | @@ -453,7 +453,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) |
1a6e0f06 JK |
38876 | goto redo_search; |
38877 | } | |
38878 | rqstp = NULL; | |
38879 | - put_cpu(); | |
38880 | + put_cpu_light(); | |
38881 | out: | |
38882 | trace_svc_xprt_do_enqueue(xprt, rqstp); | |
38883 | } | |
b3bbd485 JK |
38884 | diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c |
38885 | index 6c4ec69e11a0..77f52dc790ec 100644 | |
38886 | --- a/net/xfrm/xfrm_state.c | |
38887 | +++ b/net/xfrm/xfrm_state.c | |
38888 | @@ -427,7 +427,7 @@ static void xfrm_put_mode(struct xfrm_mode *mode) | |
e4b2b4a8 JK |
38889 | |
38890 | static void xfrm_state_gc_destroy(struct xfrm_state *x) | |
38891 | { | |
38892 | - tasklet_hrtimer_cancel(&x->mtimer); | |
38893 | + hrtimer_cancel(&x->mtimer); | |
38894 | del_timer_sync(&x->rtimer); | |
38895 | kfree(x->aead); | |
38896 | kfree(x->aalg); | |
b3bbd485 | 38897 | @@ -472,8 +472,8 @@ static void xfrm_state_gc_task(struct work_struct *work) |
e4b2b4a8 JK |
38898 | |
38899 | static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) | |
38900 | { | |
38901 | - struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer); | |
38902 | - struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer); | |
38903 | + struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer); | |
38904 | + enum hrtimer_restart ret = HRTIMER_NORESTART; | |
38905 | unsigned long now = get_seconds(); | |
38906 | long next = LONG_MAX; | |
38907 | int warn = 0; | |
b3bbd485 | 38908 | @@ -537,7 +537,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) |
e4b2b4a8 JK |
38909 | km_state_expired(x, 0, 0); |
38910 | resched: | |
38911 | if (next != LONG_MAX) { | |
38912 | - tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL); | |
38913 | + hrtimer_forward_now(&x->mtimer, ktime_set(next, 0)); | |
38914 | + ret = HRTIMER_RESTART; | |
38915 | } | |
38916 | ||
38917 | goto out; | |
b3bbd485 | 38918 | @@ -554,7 +555,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) |
e4b2b4a8 JK |
38919 | |
38920 | out: | |
38921 | spin_unlock(&x->lock); | |
38922 | - return HRTIMER_NORESTART; | |
38923 | + return ret; | |
38924 | } | |
38925 | ||
38926 | static void xfrm_replay_timer_handler(unsigned long data); | |
b3bbd485 | 38927 | @@ -573,8 +574,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) |
e4b2b4a8 JK |
38928 | INIT_HLIST_NODE(&x->bydst); |
38929 | INIT_HLIST_NODE(&x->bysrc); | |
38930 | INIT_HLIST_NODE(&x->byspi); | |
38931 | - tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, | |
38932 | - CLOCK_BOOTTIME, HRTIMER_MODE_ABS); | |
38933 | + hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); | |
38934 | + x->mtimer.function = xfrm_timer_handler; | |
38935 | setup_timer(&x->rtimer, xfrm_replay_timer_handler, | |
38936 | (unsigned long)x); | |
38937 | x->curlft.add_time = get_seconds(); | |
b3bbd485 | 38938 | @@ -1031,7 +1032,9 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, |
e4b2b4a8 JK |
38939 | hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); |
38940 | } | |
38941 | x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; | |
38942 | - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); | |
38943 | + hrtimer_start(&x->mtimer, | |
38944 | + ktime_set(net->xfrm.sysctl_acq_expires, 0), | |
38945 | + HRTIMER_MODE_REL_SOFT); | |
38946 | net->xfrm.state_num++; | |
38947 | xfrm_hash_grow_check(net, x->bydst.next != NULL); | |
38948 | spin_unlock_bh(&net->xfrm.xfrm_state_lock); | |
b3bbd485 | 38949 | @@ -1142,7 +1145,7 @@ static void __xfrm_state_insert(struct xfrm_state *x) |
e4b2b4a8 JK |
38950 | hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); |
38951 | } | |
38952 | ||
38953 | - tasklet_hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); | |
38954 | + hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); | |
38955 | if (x->replay_maxage) | |
38956 | mod_timer(&x->rtimer, jiffies + x->replay_maxage); | |
38957 | ||
b3bbd485 | 38958 | @@ -1246,7 +1249,9 @@ static struct xfrm_state *__find_acq_core(struct net *net, |
e4b2b4a8 JK |
38959 | x->mark.m = m->m; |
38960 | x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; | |
38961 | xfrm_state_hold(x); | |
38962 | - tasklet_hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL); | |
38963 | + hrtimer_start(&x->mtimer, | |
38964 | + ktime_set(net->xfrm.sysctl_acq_expires, 0), | |
38965 | + HRTIMER_MODE_REL_SOFT); | |
38966 | list_add(&x->km.all, &net->xfrm.state_all); | |
38967 | hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); | |
38968 | h = xfrm_src_hash(net, daddr, saddr, family); | |
b3bbd485 | 38969 | @@ -1546,7 +1551,8 @@ int xfrm_state_update(struct xfrm_state *x) |
e4b2b4a8 JK |
38970 | memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); |
38971 | x1->km.dying = 0; | |
38972 | ||
38973 | - tasklet_hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL); | |
38974 | + hrtimer_start(&x1->mtimer, ktime_set(1, 0), | |
38975 | + HRTIMER_MODE_REL_SOFT); | |
38976 | if (x1->curlft.use_time) | |
38977 | xfrm_state_check_expire(x1); | |
38978 | ||
b3bbd485 | 38979 | @@ -1570,7 +1576,7 @@ int xfrm_state_check_expire(struct xfrm_state *x) |
e4b2b4a8 JK |
38980 | if (x->curlft.bytes >= x->lft.hard_byte_limit || |
38981 | x->curlft.packets >= x->lft.hard_packet_limit) { | |
38982 | x->km.state = XFRM_STATE_EXPIRED; | |
38983 | - tasklet_hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL); | |
38984 | + hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT); | |
38985 | return -EINVAL; | |
38986 | } | |
38987 | ||
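The xfrm conversion shows the second idiom for retiring tasklet_hrtimer: instead of re-arming with tasklet_hrtimer_start() from inside the handler, the callback forwards its own expiry and returns HRTIMER_RESTART; one-shot arming from process context uses hrtimer_start(..., HRTIMER_MODE_*_SOFT). Distilled into a sketch — the predicate and interval are hypothetical placeholders:

    #include <linux/hrtimer.h>

    /* Re-arming from inside a soft-mode callback: forward the expiry
     * and return HRTIMER_RESTART instead of calling *_start() again. */
    static enum hrtimer_restart my_handler(struct hrtimer *me)
    {
            if (!more_work_pending())           /* placeholder predicate */
                    return HRTIMER_NORESTART;

            hrtimer_forward_now(me, ms_to_ktime(250));
            return HRTIMER_RESTART;
    }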
b3bbd485 JK |
38988 | diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c |
38989 | index 5522692100ba..8b4be8e1802a 100644 | |
38990 | --- a/samples/trace_events/trace-events-sample.c | |
38991 | +++ b/samples/trace_events/trace-events-sample.c | |
38992 | @@ -33,7 +33,7 @@ static void simple_thread_func(int cnt) | |
e4b2b4a8 JK |
38993 | |
38994 | /* Silly tracepoints */ | |
38995 | trace_foo_bar("hello", cnt, array, random_strings[len], | |
38996 | - &current->cpus_allowed); | |
38997 | + current->cpus_ptr); | |
38998 | ||
38999 | trace_foo_with_template_simple("HELLO", cnt); | |
39000 | ||
b3bbd485 JK |
39001 | diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h |
39002 | index 959199c3147e..3e68004ed345 100755 | |
39003 | --- a/scripts/mkcompile_h | |
39004 | +++ b/scripts/mkcompile_h | |
39005 | @@ -5,7 +5,8 @@ TARGET=$1 | |
1a6e0f06 JK |
39006 | ARCH=$2 |
39007 | SMP=$3 | |
39008 | PREEMPT=$4 | |
39009 | -CC=$5 | |
39010 | +RT=$5 | |
39011 | +CC=$6 | |
39012 | ||
39013 | vecho() { [ "${quiet}" = "silent_" ] || echo "$@" ; } | |
39014 | ||
b3bbd485 | 39015 | @@ -58,6 +59,7 @@ UTS_VERSION="#$VERSION" |
1a6e0f06 JK |
39016 | CONFIG_FLAGS="" |
39017 | if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi | |
39018 | if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi | |
39019 | +if [ -n "$RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS RT"; fi | |
39020 | UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" | |
39021 | ||
39022 | # Truncate to maximum length | |
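With this change an RT-configured build advertises itself in the UTS version string: the flags section gains an "RT" token alongside SMP and PREEMPT, so `uname -v` reports something like "#1 SMP PREEMPT RT" followed by the build timestamp (the exact string depends on the configuration).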
b3bbd485 JK |
39023 | diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h |
39024 | index 05fb3305671e..b26c16b02662 100644 | |
39025 | --- a/security/apparmor/include/path.h | |
39026 | +++ b/security/apparmor/include/path.h | |
39027 | @@ -39,9 +39,10 @@ struct aa_buffers { | |
e4b2b4a8 JK |
39028 | }; |
39029 | ||
39030 | #include <linux/percpu.h> | |
39031 | -#include <linux/preempt.h> | |
39032 | +#include <linux/locallock.h> | |
39033 | ||
39034 | DECLARE_PER_CPU(struct aa_buffers, aa_buffers); | |
39035 | +DECLARE_LOCAL_IRQ_LOCK(aa_buffers_lock); | |
39036 | ||
39037 | #define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) | |
39038 | #define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n | |
b3bbd485 | 39039 | @@ -55,12 +56,24 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers); |
e4b2b4a8 JK |
39040 | |
39041 | #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++) | |
39042 | ||
39043 | -#ifdef CONFIG_DEBUG_PREEMPT | |
39044 | +#ifdef CONFIG_PREEMPT_RT_BASE | |
39045 | + | |
39046 | +static inline void AA_BUG_PREEMPT_ENABLED(const char *s) | |
39047 | +{ | |
39048 | + struct local_irq_lock *lv; | |
39049 | + | |
39050 | + lv = this_cpu_ptr(&aa_buffers_lock); | |
39051 | + WARN_ONCE(lv->owner != current, | |
39052 | + "__get_buffer without aa_buffers_lock\n"); | |
39053 | +} | |
39054 | + | |
39055 | +#elif defined(CONFIG_DEBUG_PREEMPT) | |
39056 | #define AA_BUG_PREEMPT_ENABLED(X) AA_BUG(preempt_count() <= 0, X) | |
39057 | #else | |
39058 | #define AA_BUG_PREEMPT_ENABLED(X) /* nop */ | |
39059 | #endif | |
39060 | ||
39061 | + | |
39062 | #define __get_buffer(N) ({ \ | |
39063 | struct aa_buffers *__cpu_var; \ | |
39064 | AA_BUG_PREEMPT_ENABLED("__get_buffer without preempt disabled"); \ | |
b3bbd485 | 39065 | @@ -73,14 +86,14 @@ DECLARE_PER_CPU(struct aa_buffers, aa_buffers); |
e4b2b4a8 JK |
39066 | |
39067 | #define get_buffers(X...) \ | |
39068 | do { \ | |
39069 | - preempt_disable(); \ | |
39070 | + local_lock(aa_buffers_lock); \ | |
39071 | __get_buffers(X); \ | |
39072 | } while (0) | |
39073 | ||
39074 | #define put_buffers(X, Y...) \ | |
39075 | do { \ | |
39076 | __put_buffers(X, Y); \ | |
39077 | - preempt_enable(); \ | |
39078 | + local_unlock(aa_buffers_lock); \ | |
39079 | } while (0) | |
39080 | ||
39081 | #endif /* __AA_PATH_H */ | |
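get_buffers()/put_buffers() previously relied on preempt_disable() to keep the per-CPU path buffers private; under RT that section must stay preemptible (path lookup may sleep), so it becomes a local lock, and the debug assertion changes accordingly: instead of checking preempt_count(), it checks that current owns aa_buffers_lock via the local_irq_lock owner field. The general shape of such a locked per-CPU buffer, as a sketch with placeholder names:

    #include <linux/locallock.h>
    #include <linux/percpu.h>

    struct pcpu_buf { char data[256]; };
    static DEFINE_PER_CPU(struct pcpu_buf, pcpu_buf);
    static DEFINE_LOCAL_IRQ_LOCK(pcpu_buf_lock);

    static void use_buffer(void)
    {
            struct pcpu_buf *b;

            local_lock(pcpu_buf_lock);  /* preempt_disable() on !RT,
                                         * per-CPU rtmutex on RT */
            b = this_cpu_ptr(&pcpu_buf);
            /* ... use b->data; the section may sleep on RT ... */
            local_unlock(pcpu_buf_lock);
    }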
b3bbd485 JK |
39082 | diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c |
39083 | index 1346ee5be04f..aa7e4dee107b 100644 | |
39084 | --- a/security/apparmor/lsm.c | |
39085 | +++ b/security/apparmor/lsm.c | |
e4b2b4a8 JK |
39086 | @@ -44,7 +44,7 @@ |
39087 | int apparmor_initialized; | |
39088 | ||
39089 | DEFINE_PER_CPU(struct aa_buffers, aa_buffers); | |
39090 | - | |
39091 | +DEFINE_LOCAL_IRQ_LOCK(aa_buffers_lock); | |
39092 | ||
39093 | /* | |
39094 | * LSM hook functions | |
b3bbd485 JK |
39095 | diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c |
39096 | index ab3bf36786b6..f0bb7c9aa4be 100644 | |
39097 | --- a/sound/core/pcm_native.c | |
39098 | +++ b/sound/core/pcm_native.c | |
39099 | @@ -148,7 +148,7 @@ EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock); | |
1a6e0f06 JK |
39100 | void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream) |
39101 | { | |
39102 | if (!substream->pcm->nonatomic) | |
39103 | - local_irq_disable(); | |
39104 | + local_irq_disable_nort(); | |
39105 | snd_pcm_stream_lock(substream); | |
39106 | } | |
39107 | EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq); | |
b3bbd485 | 39108 | @@ -163,7 +163,7 @@ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream) |
1a6e0f06 JK |
39109 | { |
39110 | snd_pcm_stream_unlock(substream); | |
39111 | if (!substream->pcm->nonatomic) | |
39112 | - local_irq_enable(); | |
39113 | + local_irq_enable_nort(); | |
39114 | } | |
39115 | EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq); | |
39116 | ||
b3bbd485 | 39117 | @@ -171,7 +171,7 @@ unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) |
1a6e0f06 JK |
39118 | { |
39119 | unsigned long flags = 0; | |
39120 | if (!substream->pcm->nonatomic) | |
39121 | - local_irq_save(flags); | |
39122 | + local_irq_save_nort(flags); | |
39123 | snd_pcm_stream_lock(substream); | |
39124 | return flags; | |
39125 | } | |
b3bbd485 | 39126 | @@ -189,7 +189,7 @@ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, |
1a6e0f06 JK |
39127 | { |
39128 | snd_pcm_stream_unlock(substream); | |
39129 | if (!substream->pcm->nonatomic) | |
39130 | - local_irq_restore(flags); | |
39131 | + local_irq_restore_nort(flags); | |
39132 | } | |
39133 | EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore); | |
39134 | ||
b3bbd485 JK |
39135 | diff --git a/sound/drivers/dummy.c b/sound/drivers/dummy.c |
39136 | index c0939a0164a6..549e014ecc0d 100644 | |
39137 | --- a/sound/drivers/dummy.c | |
39138 | +++ b/sound/drivers/dummy.c | |
39139 | @@ -376,17 +376,9 @@ struct dummy_hrtimer_pcm { | |
e4b2b4a8 JK |
39140 | ktime_t period_time; |
39141 | atomic_t running; | |
39142 | struct hrtimer timer; | |
39143 | - struct tasklet_struct tasklet; | |
39144 | struct snd_pcm_substream *substream; | |
39145 | }; | |
39146 | ||
39147 | -static void dummy_hrtimer_pcm_elapsed(unsigned long priv) | |
39148 | -{ | |
39149 | - struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv; | |
39150 | - if (atomic_read(&dpcm->running)) | |
39151 | - snd_pcm_period_elapsed(dpcm->substream); | |
39152 | -} | |
39153 | - | |
39154 | static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) | |
39155 | { | |
39156 | struct dummy_hrtimer_pcm *dpcm; | |
b3bbd485 | 39157 | @@ -394,7 +386,14 @@ static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) |
e4b2b4a8 JK |
39158 | dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer); |
39159 | if (!atomic_read(&dpcm->running)) | |
39160 | return HRTIMER_NORESTART; | |
39161 | - tasklet_schedule(&dpcm->tasklet); | |
39162 | + /* | |
39163 | + * In cases of XRUN and draining, this calls .trigger to stop PCM | |
39164 | + * substream. | |
39165 | + */ | |
39166 | + snd_pcm_period_elapsed(dpcm->substream); | |
39167 | + if (!atomic_read(&dpcm->running)) | |
39168 | + return HRTIMER_NORESTART; | |
39169 | + | |
39170 | hrtimer_forward_now(timer, dpcm->period_time); | |
39171 | return HRTIMER_RESTART; | |
39172 | } | |
b3bbd485 | 39173 | @@ -404,7 +403,7 @@ static int dummy_hrtimer_start(struct snd_pcm_substream *substream) |
e4b2b4a8 JK |
39174 | struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; |
39175 | ||
39176 | dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer); | |
39177 | - hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL); | |
39178 | + hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL_SOFT); | |
39179 | atomic_set(&dpcm->running, 1); | |
39180 | return 0; | |
39181 | } | |
b3bbd485 | 39182 | @@ -414,14 +413,14 @@ static int dummy_hrtimer_stop(struct snd_pcm_substream *substream) |
e4b2b4a8 JK |
39183 | struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; |
39184 | ||
39185 | atomic_set(&dpcm->running, 0); | |
39186 | - hrtimer_cancel(&dpcm->timer); | |
39187 | + if (!hrtimer_callback_running(&dpcm->timer)) | |
39188 | + hrtimer_cancel(&dpcm->timer); | |
39189 | return 0; | |
39190 | } | |
39191 | ||
39192 | static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm) | |
39193 | { | |
39194 | hrtimer_cancel(&dpcm->timer); | |
39195 | - tasklet_kill(&dpcm->tasklet); | |
39196 | } | |
39197 | ||
39198 | static snd_pcm_uframes_t | |
b3bbd485 | 39199 | @@ -466,12 +465,10 @@ static int dummy_hrtimer_create(struct snd_pcm_substream *substream) |
e4b2b4a8 JK |
39200 | if (!dpcm) |
39201 | return -ENOMEM; | |
39202 | substream->runtime->private_data = dpcm; | |
39203 | - hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | |
39204 | + hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); | |
39205 | dpcm->timer.function = dummy_hrtimer_callback; | |
39206 | dpcm->substream = substream; | |
39207 | atomic_set(&dpcm->running, 0); | |
39208 | - tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed, | |
39209 | - (unsigned long)dpcm); | |
39210 | return 0; | |
39211 | } | |
39212 | ||
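The dummy.c change above removes the tasklet indirection: once the timer is initialized with HRTIMER_MODE_REL_SOFT, its callback already runs in softirq context, so snd_pcm_period_elapsed() can be called directly, and the second atomic_read() check catches a trigger-stop issued from inside that call. A minimal sketch of the soft-hrtimer pattern, using hypothetical names (demo_timer, demo_cb, do_periodic_work), not code from this patch:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct hrtimer demo_timer;   /* hypothetical periodic timer */
    static ktime_t demo_period;

    static enum hrtimer_restart demo_cb(struct hrtimer *t)
    {
            /* Invoked from the hrtimer softirq, not hard-IRQ context,
             * because the timer was armed with a _SOFT mode. */
            do_periodic_work();          /* assumed helper */
            hrtimer_forward_now(t, demo_period);
            return HRTIMER_RESTART;
    }

    static void demo_start(void)
    {
            demo_period = ms_to_ktime(10);
            hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
            demo_timer.function = demo_cb;
            hrtimer_start(&demo_timer, demo_period, HRTIMER_MODE_REL_SOFT);
    }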
b3bbd485 JK |
39213 | diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions |
39214 | index 6a4982d029bf..843c2b0d948e 100644 | |
39215 | --- a/tools/testing/selftests/ftrace/test.d/functions | |
39216 | +++ b/tools/testing/selftests/ftrace/test.d/functions | |
39217 | @@ -70,6 +70,13 @@ disable_events() { | |
e4b2b4a8 JK |
39218 | echo 0 > events/enable |
39219 | } | |
39220 | ||
39221 | +clear_synthetic_events() { # reset all current synthetic events | |
39222 | + grep -v ^# synthetic_events | | |
39223 | + while read line; do | |
39224 | + echo "!$line" >> synthetic_events | |
39225 | + done | |
39226 | +} | |
39227 | + | |
39228 | initialize_ftrace() { # Reset ftrace to initial-state | |
39229 | # As the initial state, ftrace will be set to nop tracer, | |
39230 | # no events, no triggers, no filters, no function filters, | |
b3bbd485 JK |
39231 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc |
39232 | new file mode 100644 | |
39233 | index 000000000000..786dce7e48be | |
39234 | --- /dev/null | |
39235 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-extended-error-support.tc | |
e4b2b4a8 JK |
39236 | @@ -0,0 +1,39 @@ |
39237 | +#!/bin/sh | |
39238 | +# description: event trigger - test extended error support | |
39239 | + | |
39240 | + | |
39241 | +do_reset() { | |
39242 | + reset_trigger | |
39243 | + echo > set_event | |
39244 | + clear_trace | |
39245 | +} | |
39246 | + | |
39247 | +fail() { #msg | |
39248 | + do_reset | |
39249 | + echo $1 | |
39250 | + exit_fail | |
39251 | +} | |
39252 | + | |
39253 | +if [ ! -f set_event ]; then | |
39254 | + echo "event tracing is not supported" | |
39255 | + exit_unsupported | |
39256 | +fi | |
39257 | + | |
39258 | +if [ ! -f synthetic_events ]; then | |
39259 | + echo "synthetic event is not supported" | |
39260 | + exit_unsupported | |
39261 | +fi | |
39262 | + | |
39263 | +reset_tracer | |
39264 | +do_reset | |
39265 | + | |
39266 | +echo "Test extended error support" | |
39267 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger | |
39268 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger 2> /dev/null | |
39269 | +if ! grep -q "ERROR:" events/sched/sched_wakeup/hist; then | |
39270 | + fail "Failed to generate extended error in histogram" | |
39271 | +fi | |
39272 | + | |
39273 | +do_reset | |
39274 | + | |
39275 | +exit 0 | |
b3bbd485 JK |
39276 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc |
39277 | new file mode 100644 | |
39278 | index 000000000000..7fd5b4a8f060 | |
39279 | --- /dev/null | |
39280 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc | |
e4b2b4a8 JK |
39281 | @@ -0,0 +1,54 @@ |
39282 | +#!/bin/sh | |
39283 | +# description: event trigger - test field variable support | |
39284 | + | |
39285 | +do_reset() { | |
39286 | + reset_trigger | |
39287 | + echo > set_event | |
39288 | + clear_trace | |
39289 | +} | |
39290 | + | |
39291 | +fail() { #msg | |
39292 | + do_reset | |
39293 | + echo $1 | |
39294 | + exit_fail | |
39295 | +} | |
39296 | + | |
39297 | +if [ ! -f set_event ]; then | |
39298 | + echo "event tracing is not supported" | |
39299 | + exit_unsupported | |
39300 | +fi | |
39301 | + | |
39302 | +if [ ! -f synthetic_events ]; then | |
39303 | + echo "synthetic event is not supported" | |
39304 | + exit_unsupported | |
39305 | +fi | |
39306 | + | |
39307 | +clear_synthetic_events | |
39308 | +reset_tracer | |
39309 | +do_reset | |
39310 | + | |
39311 | +echo "Test field variable support" | |
39312 | + | |
39313 | +echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events | |
39314 | +echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger | |
39315 | +echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger | |
39316 | +echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger | |
39317 | + | |
39318 | +ping localhost -c 3 | |
39319 | +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then | |
39320 | + fail "Failed to create inter-event histogram" | |
39321 | +fi | |
39322 | + | |
39323 | +if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then | |
39324 | + fail "Failed to create histogram with field variable" | |
39325 | +fi | |
39326 | + | |
39327 | +echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger | |
39328 | + | |
39329 | +if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then | |
39330 | + fail "Failed to remove histogram with field variable" | |
39331 | +fi | |
39332 | + | |
39333 | +do_reset | |
39334 | + | |
39335 | +exit 0 | |
b3bbd485 JK |
39336 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc |
39337 | new file mode 100644 | |
39338 | index 000000000000..c93dbe38b5df | |
39339 | --- /dev/null | |
39340 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc | |
e4b2b4a8 JK |
39341 | @@ -0,0 +1,58 @@ |
39342 | +#!/bin/sh | |
39343 | +# description: event trigger - test inter-event combined histogram trigger | |
39344 | + | |
39345 | +do_reset() { | |
39346 | + reset_trigger | |
39347 | + echo > set_event | |
39348 | + clear_trace | |
39349 | +} | |
39350 | + | |
39351 | +fail() { #msg | |
39352 | + do_reset | |
39353 | + echo $1 | |
39354 | + exit_fail | |
39355 | +} | |
39356 | + | |
39357 | +if [ ! -f set_event ]; then | |
39358 | + echo "event tracing is not supported" | |
39359 | + exit_unsupported | |
39360 | +fi | |
39361 | + | |
39362 | +if [ ! -f synthetic_events ]; then | |
39363 | + echo "synthetic event is not supported" | |
39364 | + exit_unsupported | |
39365 | +fi | |
39366 | + | |
39367 | +reset_tracer | |
39368 | +do_reset | |
39369 | +clear_synthetic_events | |
39370 | + | |
39371 | +echo "Test create synthetic event" | |
39372 | + | |
39373 | +echo 'waking_latency u64 lat pid_t pid' > synthetic_events | |
39374 | +if [ ! -d events/synthetic/waking_latency ]; then | |
39375 | + fail "Failed to create waking_latency synthetic event" | |
39376 | +fi | |
39377 | + | |
39378 | +echo "Test combined histogram" | |
39379 | + | |
39380 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger | |
39381 | +echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger | |
39382 | +echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger | |
39383 | + | |
39384 | +echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events | |
39385 | +echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger | |
39386 | +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger | |
39387 | + | |
39388 | +echo 'waking+wakeup_latency u64 lat; pid_t pid' >> synthetic_events | |
39389 | +echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking+wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger | |
39390 | +echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking+wakeup_latency/trigger | |
39391 | + | |
39392 | +ping localhost -c 3 | |
39393 | +if ! grep -q "pid:" events/synthetic/waking+wakeup_latency/hist; then | |
39394 | + fail "Failed to create combined histogram" | |
39395 | +fi | |
39396 | + | |
39397 | +do_reset | |
39398 | + | |
39399 | +exit 0 | |
b3bbd485 JK |
39400 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc |
39401 | new file mode 100644 | |
39402 | index 000000000000..e84e7d048566 | |
39403 | --- /dev/null | |
39404 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc | |
e4b2b4a8 JK |
39405 | @@ -0,0 +1,50 @@ |
39406 | +#!/bin/sh | |
39407 | +# description: event trigger - test inter-event histogram trigger onmatch action | |
39408 | + | |
39409 | +do_reset() { | |
39410 | + reset_trigger | |
39411 | + echo > set_event | |
39412 | + clear_trace | |
39413 | +} | |
39414 | + | |
39415 | +fail() { #msg | |
39416 | + do_reset | |
39417 | + echo $1 | |
39418 | + exit_fail | |
39419 | +} | |
39420 | + | |
39421 | +if [ ! -f set_event ]; then | |
39422 | + echo "event tracing is not supported" | |
39423 | + exit_unsupported | |
39424 | +fi | |
39425 | + | |
39426 | +if [ ! -f synthetic_events ]; then | |
39427 | + echo "synthetic event is not supported" | |
39428 | + exit_unsupported | |
39429 | +fi | |
39430 | + | |
39431 | +clear_synthetic_events | |
39432 | +reset_tracer | |
39433 | +do_reset | |
39434 | + | |
39435 | +echo "Test create synthetic event" | |
39436 | + | |
39437 | +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events | |
39438 | +if [ ! -d events/synthetic/wakeup_latency ]; then | |
39439 | + fail "Failed to create wakeup_latency synthetic event" | |
39440 | +fi | |
39441 | + | |
39442 | +echo "Test create histogram for synthetic event" | |
39443 | +echo "Test histogram variables, simple expression support and onmatch action" | |
39444 | + | |
39445 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger | |
39446 | +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger | |
39447 | +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger | |
39448 | +ping localhost -c 5 | |
39449 | +if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then | |
39450 | + fail "Failed to create onmatch action inter-event histogram" | |
39451 | +fi | |
39452 | + | |
39453 | +do_reset | |
39454 | + | |
39455 | +exit 0 | |
b3bbd485 JK |
39456 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc |
39457 | new file mode 100644 | |
39458 | index 000000000000..7907d8aacde3 | |
39459 | --- /dev/null | |
39460 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc | |
e4b2b4a8 JK |
39461 | @@ -0,0 +1,50 @@ |
39462 | +#!/bin/sh | |
39463 | +# description: event trigger - test inter-event histogram trigger onmatch-onmax action | |
39464 | + | |
39465 | +do_reset() { | |
39466 | + reset_trigger | |
39467 | + echo > set_event | |
39468 | + clear_trace | |
39469 | +} | |
39470 | + | |
39471 | +fail() { #msg | |
39472 | + do_reset | |
39473 | + echo $1 | |
39474 | + exit_fail | |
39475 | +} | |
39476 | + | |
39477 | +if [ ! -f set_event ]; then | |
39478 | + echo "event tracing is not supported" | |
39479 | + exit_unsupported | |
39480 | +fi | |
39481 | + | |
39482 | +if [ ! -f synthetic_events ]; then | |
39483 | + echo "synthetic event is not supported" | |
39484 | + exit_unsupported | |
39485 | +fi | |
39486 | + | |
39487 | +clear_synthetic_events | |
39488 | +reset_tracer | |
39489 | +do_reset | |
39490 | + | |
39491 | +echo "Test create synthetic event" | |
39492 | + | |
39493 | +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events | |
39494 | +if [ ! -d events/synthetic/wakeup_latency ]; then | |
39495 | + fail "Failed to create wakeup_latency synthetic event" | |
39496 | +fi | |
39497 | + | |
39498 | +echo "Test create histogram for synthetic event" | |
39499 | +echo "Test histogram variables, simple expression support and onmatch-onmax action" | |
39500 | + | |
39501 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger | |
39502 | +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger | |
39503 | +echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger | |
39504 | +ping localhost -c 5 | |
39505 | +if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then | |
39506 | + fail "Failed to create onmatch-onmax action inter-event histogram" | |
39507 | +fi | |
39508 | + | |
39509 | +do_reset | |
39510 | + | |
39511 | +exit 0 | |
b3bbd485 JK |
39512 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc |
39513 | new file mode 100644 | |
39514 | index 000000000000..38b7ed6242b2 | |
39515 | --- /dev/null | |
39516 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc | |
e4b2b4a8 JK |
39517 | @@ -0,0 +1,48 @@ |
39518 | +#!/bin/sh | |
39519 | +# description: event trigger - test inter-event histogram trigger onmax action | |
39520 | + | |
39521 | +do_reset() { | |
39522 | + reset_trigger | |
39523 | + echo > set_event | |
39524 | + clear_trace | |
39525 | +} | |
39526 | + | |
39527 | +fail() { #msg | |
39528 | + do_reset | |
39529 | + echo $1 | |
39530 | + exit_fail | |
39531 | +} | |
39532 | + | |
39533 | +if [ ! -f set_event ]; then | |
39534 | + echo "event tracing is not supported" | |
39535 | + exit_unsupported | |
39536 | +fi | |
39537 | + | |
39538 | +if [ ! -f synthetic_events ]; then | |
39539 | + echo "synthetic event is not supported" | |
39540 | + exit_unsupported | |
39541 | +fi | |
39542 | + | |
39543 | +clear_synthetic_events | |
39544 | +reset_tracer | |
39545 | +do_reset | |
39546 | + | |
39547 | +echo "Test create synthetic event" | |
39548 | + | |
39549 | +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events | |
39550 | +if [ ! -d events/synthetic/wakeup_latency ]; then | |
39551 | + fail "Failed to create wakeup_latency synthetic event" | |
39552 | +fi | |
39553 | + | |
39554 | +echo "Test onmax action" | |
39555 | + | |
39556 | +echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger | |
39557 | +echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger | |
39558 | +ping localhost -c 3 | |
39559 | +if ! grep -q "max:" events/sched/sched_switch/hist; then | |
39560 | + fail "Failed to create onmax action inter-event histogram" | |
39561 | +fi | |
39562 | + | |
39563 | +do_reset | |
39564 | + | |
39565 | +exit 0 | |
b3bbd485 JK |
39566 | diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc |
39567 | new file mode 100644 | |
39568 | index 000000000000..cef11377dcbd | |
39569 | --- /dev/null | |
39570 | +++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc | |
e4b2b4a8 JK |
39571 | @@ -0,0 +1,54 @@ |
39572 | +#!/bin/sh | |
39573 | +# description: event trigger - test synthetic event create remove | |
39574 | +do_reset() { | |
39575 | + reset_trigger | |
39576 | + echo > set_event | |
39577 | + clear_trace | |
39578 | +} | |
39579 | + | |
39580 | +fail() { #msg | |
39581 | + do_reset | |
39582 | + echo $1 | |
39583 | + exit_fail | |
39584 | +} | |
39585 | + | |
39586 | +if [ ! -f set_event ]; then | |
39587 | + echo "event tracing is not supported" | |
39588 | + exit_unsupported | |
39589 | +fi | |
39590 | + | |
39591 | +if [ ! -f synthetic_events ]; then | |
39592 | + echo "synthetic event is not supported" | |
39593 | + exit_unsupported | |
39594 | +fi | |
39595 | + | |
39596 | +clear_synthetic_events | |
39597 | +reset_tracer | |
39598 | +do_reset | |
39599 | + | |
39600 | +echo "Test create synthetic event" | |
39601 | + | |
39602 | +echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events | |
39603 | +if [ ! -d events/synthetic/wakeup_latency ]; then | |
39604 | + fail "Failed to create wakeup_latency synthetic event" | |
39605 | +fi | |
39606 | + | |
39607 | +reset_trigger | |
39608 | + | |
39609 | +echo "Test create synthetic event with an error" | |
39610 | +echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events 2> /dev/null | |
39611 | +if [ -d events/synthetic/wakeup_latency ]; then | |
39612 | + fail "Created wakeup_latency synthetic event with an invalid format" | |
39613 | +fi | |
39614 | + | |
39615 | +reset_trigger | |
39616 | + | |
39617 | +echo "Test remove synthetic event" | |
39618 | +echo '!wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events | |
39619 | +if [ -d events/synthetic/wakeup_latency ]; then | |
39620 | + fail "Failed to delete wakeup_latency synthetic event" | |
39621 | +fi | |
39622 | + | |
39623 | +do_reset | |
39624 | + | |
39625 | +exit 0 | |
b3bbd485 JK |
39626 | diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c |
39627 | index d5f1d8364571..c09e04130bfe 100644 | |
39628 | --- a/virt/kvm/arm/arm.c | |
39629 | +++ b/virt/kvm/arm/arm.c | |
39630 | @@ -69,7 +69,6 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); | |
e4b2b4a8 JK |
39631 | |
39632 | static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) | |
39633 | { | |
39634 | - BUG_ON(preemptible()); | |
39635 | __this_cpu_write(kvm_arm_running_vcpu, vcpu); | |
39636 | } | |
39637 | ||
b3bbd485 | 39638 | @@ -79,7 +78,6 @@ static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) |
e4b2b4a8 JK |
39639 | */ |
39640 | struct kvm_vcpu *kvm_arm_get_running_vcpu(void) | |
39641 | { | |
39642 | - BUG_ON(preemptible()); | |
39643 | return __this_cpu_read(kvm_arm_running_vcpu); | |
39644 | } | |
39645 | ||
b3bbd485 | 39646 | @@ -653,7 +651,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) |
e4b2b4a8 JK |
39647 | * involves poking the GIC, which must be done in a |
39648 | * non-preemptible context. | |
39649 | */ | |
39650 | - preempt_disable(); | |
39651 | + migrate_disable(); | |
39652 | ||
39653 | kvm_pmu_flush_hwstate(vcpu); | |
39654 | ||
b3bbd485 | 39655 | @@ -690,7 +688,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) |
e4b2b4a8 JK |
39656 | kvm_pmu_sync_hwstate(vcpu); |
39657 | kvm_timer_sync_hwstate(vcpu); | |
39658 | kvm_vgic_sync_hwstate(vcpu); | |
39659 | - preempt_enable(); | |
39660 | + migrate_enable(); | |
39661 | continue; | |
39662 | } | |
39663 | ||
b3bbd485 | 39664 | @@ -745,7 +743,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) |
e4b2b4a8 JK |
39665 | |
39666 | kvm_vgic_sync_hwstate(vcpu); | |
39667 | ||
39668 | - preempt_enable(); | |
39669 | + migrate_enable(); | |
39670 | ||
39671 | ret = handle_exit(vcpu, run, ret); | |
39672 | } |
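The virt/kvm/arm hunks drop the preemptible() assertions (the accessors may now legitimately run in a migrate-disabled but still preemptible section) and replace the preempt_disable()/preempt_enable() pair around the VCPU entry path with migrate_disable()/migrate_enable(). On PREEMPT_RT in this series, migrate_disable() pins the task to its current CPU while leaving preemption enabled, which keeps per-CPU state such as the running-vcpu pointer stable without a long non-preemptible section. A rough sketch of the pattern, illustrative only (some_percpu_var is a made-up variable):

    /* Keep per-CPU data stable without disabling preemption.
     * On PREEMPT_RT, migrate_disable() only forbids CPU migration;
     * non-RT configurations typically map it back to preempt_disable(). */
    migrate_disable();
    __this_cpu_write(some_percpu_var, val);  /* hypothetical per-CPU access */
    /* ... work that must stay on this CPU ... */
    migrate_enable();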