]>
Commit | Line | Data |
---|---|---|
7d3d7cb5 JB |
1 | Description: Make in-order command queues actually be in-order |
2 | ||
3 | When beignet added out-of-order execution support (7fd45f15), | |
4 | it made *all* queues use it, even ones that are nominally in-order. | |
5 | ||
6 | While using out-of-order queues is probably a good idea when possible | |
7 | (for performance), the OpenCL spec does not allow it to be the default. | |
8 | ||
9 | Author: Rebecca N. Palmer <rebecca_palmer@zoho.com> | |
10 | Forwarded: https://lists.freedesktop.org/archives/beignet/2018-July/009213.html | |
11 | ||
12 | --- a/src/cl_api.c | |
13 | +++ b/src/cl_api.c | |
14 | @@ -276,7 +276,7 @@ clEnqueueSVMFree (cl_command_queue command_queue, | |
15 | data->size = num_svm_pointers; | |
16 | data->ptr = user_data; | |
17 | ||
18 | - if (e_status == CL_COMPLETE) { | |
19 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
20 | // Sync mode, no need to queue event. | |
21 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
22 | if (err != CL_SUCCESS) { | |
23 | @@ -422,7 +422,7 @@ cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue, | |
24 | data->const_ptr = src_ptr; | |
25 | data->size = size; | |
26 | ||
27 | - if (e_status == CL_COMPLETE) { | |
28 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
29 | // Sync mode, no need to queue event. | |
30 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
31 | if (err != CL_SUCCESS) { | |
32 | @@ -434,6 +434,9 @@ cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue, | |
33 | break; | |
34 | } | |
35 | cl_command_queue_enqueue_event(command_queue, e); | |
36 | + if (blocking_copy) { | |
37 | + cl_event_wait_for_events_list(1, &e); | |
38 | + } | |
39 | } | |
40 | } while(0); | |
41 | ||
42 | @@ -511,7 +514,7 @@ cl_int clEnqueueSVMMemFill (cl_command_queue command_queue, | |
43 | data->pattern_size = pattern_size; | |
44 | data->size = size; | |
45 | ||
46 | - if (e_status == CL_COMPLETE) { | |
47 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
48 | // Sync mode, no need to queue event. | |
49 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
50 | if (err != CL_SUCCESS) { | |
51 | --- a/src/cl_api_kernel.c | |
52 | +++ b/src/cl_api_kernel.c | |
53 | @@ -223,6 +223,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, | |
54 | count *= global_wk_sz_rem[2] ? 2 : 1; | |
55 | ||
56 | const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem}; | |
57 | + cl_bool allow_immediate_submit = cl_command_queue_allow_bypass_submit(command_queue); | |
58 | /* Go through the at most 8 cases and euque if there is work items left */ | |
59 | for (i = 0; i < 2; i++) { | |
60 | for (j = 0; j < 2; j++) { | |
61 | @@ -263,7 +264,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, | |
62 | break; | |
63 | } | |
64 | ||
65 | - err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE); | |
66 | + err = cl_event_exec(e, ((allow_immediate_submit && event_status == CL_COMPLETE) ? CL_SUBMITTED : CL_QUEUED), CL_FALSE); | |
67 | if (err != CL_SUCCESS) { | |
68 | break; | |
69 | } | |
70 | --- a/src/cl_api_mem.c | |
71 | +++ b/src/cl_api_mem.c | |
72 | @@ -308,7 +308,7 @@ clEnqueueMapBuffer(cl_command_queue command_queue, | |
73 | if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) | |
74 | data->write_map = 1; | |
75 | ||
76 | - if (e_status == CL_COMPLETE) { | |
77 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
78 | // Sync mode, no need to queue event. | |
79 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
80 | if (err != CL_SUCCESS) { | |
81 | @@ -321,6 +321,9 @@ clEnqueueMapBuffer(cl_command_queue command_queue, | |
82 | } | |
83 | ||
84 | cl_command_queue_enqueue_event(command_queue, e); | |
85 | + if (blocking_map) { | |
86 | + cl_event_wait_for_events_list(1, &e); | |
87 | + } | |
88 | } | |
89 | ||
90 | ptr = data->ptr; | |
91 | @@ -393,7 +396,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue, | |
92 | data->mem_obj = memobj; | |
93 | data->ptr = mapped_ptr; | |
94 | ||
95 | - if (e_status == CL_COMPLETE) { // No need to wait | |
96 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { // No need to wait | |
97 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
98 | if (err != CL_SUCCESS) { | |
99 | break; | |
100 | @@ -495,7 +498,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue, | |
101 | data->offset = offset; | |
102 | data->size = size; | |
103 | ||
104 | - if (e_status == CL_COMPLETE) { | |
105 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
106 | // Sync mode, no need to queue event. | |
107 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
108 | if (err != CL_SUCCESS) { | |
109 | @@ -507,6 +510,9 @@ clEnqueueReadBuffer(cl_command_queue command_queue, | |
110 | break; | |
111 | } | |
112 | cl_command_queue_enqueue_event(command_queue, e); | |
113 | + if (blocking_read) { | |
114 | + cl_event_wait_for_events_list(1, &e); | |
115 | + } | |
116 | } | |
117 | } while (0); | |
118 | ||
119 | @@ -598,7 +604,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue, | |
120 | data->offset = offset; | |
121 | data->size = size; | |
122 | ||
123 | - if (e_status == CL_COMPLETE) { | |
124 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
125 | // Sync mode, no need to queue event. | |
126 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
127 | if (err != CL_SUCCESS) { | |
128 | @@ -610,6 +616,9 @@ clEnqueueWriteBuffer(cl_command_queue command_queue, | |
129 | break; | |
130 | } | |
131 | cl_command_queue_enqueue_event(command_queue, e); | |
132 | + if (blocking_write) { | |
133 | + cl_event_wait_for_events_list(1, &e); | |
134 | + } | |
135 | } | |
136 | } while (0); | |
137 | ||
138 | @@ -747,7 +756,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue, | |
139 | data->host_row_pitch = host_row_pitch; | |
140 | data->host_slice_pitch = host_slice_pitch; | |
141 | ||
142 | - if (e_status == CL_COMPLETE) { | |
143 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
144 | // Sync mode, no need to queue event. | |
145 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
146 | if (err != CL_SUCCESS) { | |
147 | @@ -759,6 +768,9 @@ clEnqueueReadBufferRect(cl_command_queue command_queue, | |
148 | break; | |
149 | } | |
150 | cl_command_queue_enqueue_event(command_queue, e); | |
151 | + if (blocking_read) { | |
152 | + cl_event_wait_for_events_list(1, &e); | |
153 | + } | |
154 | } | |
155 | } while (0); | |
156 | ||
157 | @@ -898,7 +910,7 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue, | |
158 | data->host_row_pitch = host_row_pitch; | |
159 | data->host_slice_pitch = host_slice_pitch; | |
160 | ||
161 | - if (e_status == CL_COMPLETE) { | |
162 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
163 | // Sync mode, no need to queue event. | |
164 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
165 | if (err != CL_SUCCESS) { | |
166 | @@ -910,6 +922,9 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue, | |
167 | break; | |
168 | } | |
169 | cl_command_queue_enqueue_event(command_queue, e); | |
170 | + if (blocking_write) { | |
171 | + cl_event_wait_for_events_list(1, &e); | |
172 | + } | |
173 | } | |
174 | } while (0); | |
175 | ||
176 | @@ -1017,7 +1032,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue, | |
177 | break; | |
178 | } | |
179 | ||
180 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
181 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
182 | if (err != CL_SUCCESS) { | |
183 | break; | |
184 | } | |
185 | @@ -1207,7 +1222,7 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue, | |
186 | if (e_status < CL_COMPLETE) { // Error happend, cancel. | |
187 | err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; | |
188 | break; | |
189 | - } else if (e_status == CL_COMPLETE) { | |
190 | + } else if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
191 | err = cl_event_exec(e, CL_SUBMITTED, CL_FALSE); | |
192 | if (err != CL_SUCCESS) { | |
193 | break; | |
194 | @@ -1308,7 +1323,7 @@ clEnqueueFillBuffer(cl_command_queue command_queue, | |
195 | break; | |
196 | } | |
197 | ||
198 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
199 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
200 | if (err != CL_SUCCESS) { | |
201 | break; | |
202 | } | |
203 | @@ -1395,7 +1410,7 @@ clEnqueueMigrateMemObjects(cl_command_queue command_queue, | |
204 | break; | |
205 | } | |
206 | ||
207 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
208 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
209 | if (err != CL_SUCCESS) { | |
210 | break; | |
211 | } | |
212 | @@ -1574,7 +1589,7 @@ clEnqueueMapImage(cl_command_queue command_queue, | |
213 | if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) | |
214 | data->write_map = 1; | |
215 | ||
216 | - if (e_status == CL_COMPLETE) { | |
217 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
218 | // Sync mode, no need to queue event. | |
219 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
220 | if (err != CL_SUCCESS) { | |
221 | @@ -1587,6 +1602,9 @@ clEnqueueMapImage(cl_command_queue command_queue, | |
222 | } | |
223 | ||
224 | cl_command_queue_enqueue_event(command_queue, e); | |
225 | + if (blocking_map) { | |
226 | + cl_event_wait_for_events_list(1, &e); | |
227 | + } | |
228 | } | |
229 | ||
230 | ptr = data->ptr; | |
231 | @@ -1764,7 +1782,7 @@ clEnqueueReadImage(cl_command_queue command_queue, | |
232 | data->row_pitch = row_pitch; | |
233 | data->slice_pitch = slice_pitch; | |
234 | ||
235 | - if (e_status == CL_COMPLETE) { | |
236 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
237 | // Sync mode, no need to queue event. | |
238 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
239 | if (err != CL_SUCCESS) { | |
240 | @@ -1776,6 +1794,9 @@ clEnqueueReadImage(cl_command_queue command_queue, | |
241 | break; | |
242 | } | |
243 | cl_command_queue_enqueue_event(command_queue, e); | |
244 | + if (blocking_read) { | |
245 | + cl_event_wait_for_events_list(1, &e); | |
246 | + } | |
247 | } | |
248 | } while (0); | |
249 | ||
250 | @@ -1916,7 +1937,7 @@ clEnqueueWriteImage(cl_command_queue command_queue, | |
251 | data->row_pitch = row_pitch; | |
252 | data->slice_pitch = slice_pitch; | |
253 | ||
254 | - if (e_status == CL_COMPLETE) { | |
255 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
256 | // Sync mode, no need to queue event. | |
257 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
258 | if (err != CL_SUCCESS) { | |
259 | @@ -1928,6 +1949,9 @@ clEnqueueWriteImage(cl_command_queue command_queue, | |
260 | break; | |
261 | } | |
262 | cl_command_queue_enqueue_event(command_queue, e); | |
263 | + if (blocking_write) { | |
264 | + cl_event_wait_for_events_list(1, &e); | |
265 | + } | |
266 | } | |
267 | } while (0); | |
268 | ||
269 | @@ -2062,7 +2086,7 @@ clEnqueueCopyImage(cl_command_queue command_queue, | |
270 | break; | |
271 | } | |
272 | ||
273 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
274 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
275 | if (err != CL_SUCCESS) { | |
276 | break; | |
277 | } | |
278 | @@ -2173,7 +2197,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue, | |
279 | break; | |
280 | } | |
281 | ||
282 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
283 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
284 | if (err != CL_SUCCESS) { | |
285 | break; | |
286 | } | |
287 | @@ -2285,7 +2309,7 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue, | |
288 | break; | |
289 | } | |
290 | ||
291 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
292 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
293 | if (err != CL_SUCCESS) { | |
294 | break; | |
295 | } | |
296 | @@ -2395,7 +2419,7 @@ clEnqueueFillImage(cl_command_queue command_queue, | |
297 | break; | |
298 | } | |
299 | ||
300 | - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
301 | + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE); | |
302 | if (err != CL_SUCCESS) { | |
303 | break; | |
304 | } | |
305 | --- a/src/cl_command_queue.h | |
306 | +++ b/src/cl_command_queue.h | |
307 | @@ -103,6 +103,11 @@ extern cl_int cl_command_queue_wait_finish(cl_command_queue queue); | |
308 | extern cl_int cl_command_queue_wait_flush(cl_command_queue queue); | |
309 | /* Note: Must call this function with queue's lock. */ | |
310 | extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num); | |
311 | +/* Whether it is valid to call cl_event_exec directly, instead of cl_command_queue_enqueue_event */ | |
312 | +static inline cl_bool cl_command_queue_allow_bypass_submit(cl_command_queue queue){ | |
313 | + return (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)/* if out-of-order, always */ | |
314 | + || list_empty(&queue->worker.enqueued_events);/* if in-order, only if empty */ | |
315 | +} | |
316 | ||
317 | #endif /* __CL_COMMAND_QUEUE_H__ */ | |
318 | ||
319 | --- a/src/cl_command_queue_enqueue.c | |
320 | +++ b/src/cl_command_queue_enqueue.c | |
321 | @@ -65,6 +65,8 @@ worker_thread_function(void *Arg) | |
322 | if (cl_event_is_ready(e) <= CL_COMPLETE) { | |
323 | list_node_del(&e->enqueue_node); | |
324 | list_add_tail(&ready_list, &e->enqueue_node); | |
325 | + } else if(!(queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){ | |
326 | + break; /* in in-order mode, can't skip over non-ready events */ | |
327 | } | |
328 | } | |
329 | ||
330 | @@ -80,18 +82,20 @@ worker_thread_function(void *Arg) | |
331 | CL_OBJECT_UNLOCK(queue); | |
332 | ||
333 | /* Do the really job without lock.*/ | |
334 | - exec_status = CL_SUBMITTED; | |
335 | - list_for_each_safe(pos, n, &ready_list) | |
336 | - { | |
337 | - e = list_entry(pos, _cl_event, enqueue_node); | |
338 | - cl_event_exec(e, exec_status, CL_FALSE); | |
339 | - } | |
340 | + if (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { /* in in-order mode, need to get each all the way to CL_COMPLETE before starting the next one */ | |
341 | + exec_status = CL_SUBMITTED; | |
342 | + list_for_each_safe(pos, n, &ready_list) | |
343 | + { | |
344 | + e = list_entry(pos, _cl_event, enqueue_node); | |
345 | + cl_event_exec(e, exec_status, CL_FALSE); | |
346 | + } | |
347 | ||
348 | - /* Notify all waiting for flush. */ | |
349 | - CL_OBJECT_LOCK(queue); | |
350 | - worker->in_exec_status = CL_SUBMITTED; | |
351 | - CL_OBJECT_NOTIFY_COND(queue); | |
352 | - CL_OBJECT_UNLOCK(queue); | |
353 | + /* Notify all waiting for flush. */ | |
354 | + CL_OBJECT_LOCK(queue); | |
355 | + worker->in_exec_status = CL_SUBMITTED; | |
356 | + CL_OBJECT_NOTIFY_COND(queue); | |
357 | + CL_OBJECT_UNLOCK(queue); | |
358 | + } | |
359 | ||
360 | list_for_each_safe(pos, n, &ready_list) | |
361 | { | |
362 | --- a/src/cl_gl_api.c | |
363 | +++ b/src/cl_gl_api.c | |
364 | @@ -188,7 +188,7 @@ cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue, | |
365 | data = &e->exec_data; | |
366 | data->type = EnqueueReturnSuccesss; | |
367 | ||
368 | - if (e_status == CL_COMPLETE) { | |
369 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
370 | // Sync mode, no need to queue event. | |
371 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
372 | if (err != CL_SUCCESS) { | |
373 | @@ -274,7 +274,7 @@ cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue, | |
374 | data = &e->exec_data; | |
375 | data->type = EnqueueReturnSuccesss; | |
376 | ||
377 | - if (e_status == CL_COMPLETE) { | |
378 | + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { | |
379 | // Sync mode, no need to queue event. | |
380 | err = cl_event_exec(e, CL_COMPLETE, CL_FALSE); | |
381 | if (err != CL_SUCCESS) { |