1 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c |
2 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c 1970-01-01 01:00:00.000000000 +0100 | |
3 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c 2003-09-28 17:26:40.000000000 +0200 | |
4 | @@ -0,0 +1,1345 @@ | |
5 | +/* | |
6 | + * Copyright (c) 2002 Brian Foley | |
7 | + * Copyright (c) 2002 Dieter Shirley | |
8 | + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> | |
9 | + * | |
10 | + * This library is free software; you can redistribute it and/or | |
11 | + * modify it under the terms of the GNU Lesser General Public | |
12 | + * License as published by the Free Software Foundation; either | |
13 | + * version 2 of the License, or (at your option) any later version. | |
14 | + * | |
15 | + * This library is distributed in the hope that it will be useful, | |
16 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
18 | + * Lesser General Public License for more details. | |
19 | + * | |
20 | + * You should have received a copy of the GNU Lesser General Public | |
21 | + * License along with this library; if not, write to the Free Software | |
22 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
23 | + */ | |
24 | + | |
25 | +#include "../dsputil.h" | |
26 | + | |
27 | +#include "gcc_fixes.h" | |
28 | + | |
29 | +#include "dsputil_altivec.h" | |
30 | + | |
31 | +#ifdef CONFIG_DARWIN | |
32 | +#include <sys/sysctl.h> | |
33 | +#else /* CONFIG_DARWIN */ | |
34 | +#include <signal.h> | |
35 | +#include <setjmp.h> | |
36 | + | |
37 | +static sigjmp_buf jmpbuf; | |
38 | +static volatile sig_atomic_t canjump = 0; | |
39 | + | |
40 | +static void sigill_handler (int sig) | |
41 | +{ | |
42 | + if (!canjump) { | |
43 | + signal (sig, SIG_DFL); | |
44 | + raise (sig); | |
45 | + } | |
46 | + | |
47 | + canjump = 0; | |
48 | + siglongjmp (jmpbuf, 1); | |
49 | +} | |
50 | +#endif /* CONFIG_DARWIN */ | |
51 | + | |
52 | +int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | |
53 | +{ | |
54 | + int i; | |
55 | + int s __attribute__((aligned(16))); | |
56 | + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
57 | + vector unsigned char *tv; | |
58 | + vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; | |
59 | + vector unsigned int sad; | |
60 | + vector signed int sumdiffs; | |
61 | + | |
62 | + s = 0; | |
63 | + sad = (vector unsigned int)vec_splat_u32(0); | |
64 | + for(i=0;i<16;i++) { | |
65 | + /* | |
66 | + Read unaligned pixels into our vectors. The vectors are as follows: | |
67 | + pix1v: pix1[0]-pix1[15] | |
68 | + pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] | |
69 | + */ | |
70 | + tv = (vector unsigned char *) pix1; | |
71 | + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); | |
72 | + | |
73 | + tv = (vector unsigned char *) &pix2[0]; | |
74 | + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); | |
75 | + | |
76 | + tv = (vector unsigned char *) &pix2[1]; | |
77 | + pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); | |
78 | + | |
79 | + /* Calculate the average vector */ | |
80 | + avgv = vec_avg(pix2v, pix2iv); | |
81 | + | |
82 | + /* Calculate a sum of abs differences vector */ | |
83 | + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); | |
84 | + | |
85 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
86 | + sad = vec_sum4s(t5, sad); | |
87 | + | |
88 | + pix1 += line_size; | |
89 | + pix2 += line_size; | |
90 | + } | |
91 | + /* Sum up the four partial sums, and put the result into s */ | |
92 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
93 | + sumdiffs = vec_splat(sumdiffs, 3); | |
94 | + vec_ste(sumdiffs, 0, &s); | |
95 | + | |
96 | + return s; | |
97 | +} | |
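For reference, a scalar sketch of what the routine above computes: the SAD between pix1 and the rounded average of each pix2 byte with its right-hand neighbour, over a 16x16 block. It assumes vec_avg's round-up behaviour ((a+b+1)>>1); the helper name is illustrative and not part of the patch.

    #include <stdint.h>
    #include <stdlib.h>

    static int pix_abs16x16_x2_scalar(const uint8_t *pix1, const uint8_t *pix2,
                                      int line_size)
    {
        int s = 0;
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++) {
                int avg = (pix2[j] + pix2[j + 1] + 1) >> 1; /* like vec_avg */
                s += abs(pix1[j] - avg);
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }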
98 | + | |
99 | +int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | |
100 | +{ | |
101 | + int i; | |
102 | + int s __attribute__((aligned(16))); | |
103 | + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
104 | + vector unsigned char *tv; | |
105 | + vector unsigned char pix1v, pix2v, pix3v, avgv, t5; | |
106 | + vector unsigned int sad; | |
107 | + vector signed int sumdiffs; | |
108 | + uint8_t *pix3 = pix2 + line_size; | |
109 | + | |
110 | + s = 0; | |
111 | + sad = (vector unsigned int)vec_splat_u32(0); | |
112 | + | |
113 | + /* | |
114 | + Because pix3 = pix2 + line_size, the pix3 of one | |
115 | + iteration becomes pix2 in the next iteration. We use this | |
116 | + fact to avoid a potentially expensive unaligned read each | |
117 | + time around the loop. | |
118 | + Read unaligned pixels into our vectors. The vectors are as follows: | |
119 | + pix2v: pix2[0]-pix2[15] | |
120 | + (pix3v is read inside the loop and reused as pix2v in the next iteration) | |
121 | + */ | |
122 | + tv = (vector unsigned char *) &pix2[0]; | |
123 | + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); | |
124 | + | |
125 | + for(i=0;i<16;i++) { | |
126 | + /* | |
127 | + Read unaligned pixels into our vectors. The vectors are as follows: | |
128 | + pix1v: pix1[0]-pix1[15] | |
129 | + pix3v: pix3[0]-pix3[15] | |
130 | + */ | |
131 | + tv = (vector unsigned char *) pix1; | |
132 | + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); | |
133 | + | |
134 | + tv = (vector unsigned char *) &pix3[0]; | |
135 | + pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); | |
136 | + | |
137 | + /* Calculate the average vector */ | |
138 | + avgv = vec_avg(pix2v, pix3v); | |
139 | + | |
140 | + /* Calculate a sum of abs differences vector */ | |
141 | + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); | |
142 | + | |
143 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
144 | + sad = vec_sum4s(t5, sad); | |
145 | + | |
146 | + pix1 += line_size; | |
147 | + pix2v = pix3v; | |
148 | + pix3 += line_size; | |
149 | + | |
150 | + } | |
151 | + | |
152 | + /* Sum up the four partial sums, and put the result into s */ | |
153 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
154 | + sumdiffs = vec_splat(sumdiffs, 3); | |
155 | + vec_ste(sumdiffs, 0, &s); | |
156 | + return s; | |
157 | +} | |
158 | + | |
159 | +int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | |
160 | +{ | |
161 | + int i; | |
162 | + int s __attribute__((aligned(16))); | |
163 | + uint8_t *pix3 = pix2 + line_size; | |
164 | + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
165 | + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); | |
166 | + vector unsigned char *tv, avgv, t5; | |
167 | + vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; | |
168 | + vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; | |
169 | + vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; | |
170 | + vector unsigned short avghv, avglv; | |
171 | + vector unsigned short t1, t2, t3, t4; | |
172 | + vector unsigned int sad; | |
173 | + vector signed int sumdiffs; | |
174 | + | |
175 | + sad = (vector unsigned int)vec_splat_u32(0); | |
176 | + | |
177 | + s = 0; | |
178 | + | |
179 | + /* | |
180 | + Because pix3 = pix2 + line_size, the pix3 of one | |
181 | + iteration becomes pix2 in the next iteration. We use this | |
182 | + fact to avoid a potentially expensive unaligned read, as well | |
183 | + as some splitting and vector addition, each time around the loop. | |
184 | + Read unaligned pixels into our vectors. The vectors are as follows: | |
185 | + pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] | |
186 | + Split the pixel vectors into shorts | |
187 | + */ | |
188 | + tv = (vector unsigned char *) &pix2[0]; | |
189 | + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); | |
190 | + | |
191 | + tv = (vector unsigned char *) &pix2[1]; | |
192 | + pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); | |
193 | + | |
194 | + pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); | |
195 | + pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); | |
196 | + pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); | |
197 | + pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); | |
198 | + t1 = vec_add(pix2hv, pix2ihv); | |
199 | + t2 = vec_add(pix2lv, pix2ilv); | |
200 | + | |
201 | + for(i=0;i<16;i++) { | |
202 | + /* | |
203 | + Read unaligned pixels into our vectors. The vectors are as follows: | |
204 | + pix1v: pix1[0]-pix1[15] | |
205 | + pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] | |
206 | + */ | |
207 | + tv = (vector unsigned char *) pix1; | |
208 | + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); | |
209 | + | |
210 | + tv = (vector unsigned char *) &pix3[0]; | |
211 | + pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); | |
212 | + | |
213 | + tv = (vector unsigned char *) &pix3[1]; | |
214 | + pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); | |
215 | + | |
216 | + /* | |
217 | + Note that AltiVec does have vec_avg, but it averages element pairs | |
218 | + and rounds up. We could do avg(avg(a,b),avg(c,d)), but the double rounding | |
219 | + would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. | |
220 | + Instead, we have to split the pixel vectors into vectors of shorts, | |
221 | + and do the averaging by hand. | |
222 | + */ | |
223 | + | |
224 | + /* Split the pixel vectors into shorts */ | |
225 | + pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); | |
226 | + pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); | |
227 | + pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); | |
228 | + pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); | |
229 | + | |
230 | + /* Do the averaging on them */ | |
231 | + t3 = vec_add(pix3hv, pix3ihv); | |
232 | + t4 = vec_add(pix3lv, pix3ilv); | |
233 | + | |
234 | + avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); | |
235 | + avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); | |
236 | + | |
237 | + /* Pack the shorts back into a result */ | |
238 | + avgv = vec_pack(avghv, avglv); | |
239 | + | |
240 | + /* Calculate a sum of abs differences vector */ | |
241 | + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); | |
242 | + | |
243 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
244 | + sad = vec_sum4s(t5, sad); | |
245 | + | |
246 | + pix1 += line_size; | |
247 | + pix3 += line_size; | |
248 | + /* Transfer the calculated values for pix3 into pix2 */ | |
249 | + t1 = t3; | |
250 | + t2 = t4; | |
251 | + } | |
252 | + /* Sum up the four partial sums, and put the result into s */ | |
253 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
254 | + sumdiffs = vec_splat(sumdiffs, 3); | |
255 | + vec_ste(sumdiffs, 0, &s); | |
256 | + | |
257 | + return s; | |
258 | +} | |
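The rounding argument in the comment inside the routine above can be checked with a tiny stand-alone program (illustrative, not part of the patch):

    #include <stdio.h>

    /* vec_avg-style rounded-up average of two values */
    static unsigned avg2(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

    int main(void)
    {
        unsigned a = 3, b = 0, c = 0, d = 1;
        unsigned nested = avg2(avg2(a, b), avg2(c, d)); /* rounds up twice -> 2 */
        unsigned single = (a + b + c + d + 2) >> 2;     /* rounds once     -> 1 */
        printf("nested=%u single=%u\n", nested, single);
        return 0;
    }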
259 | + | |
260 | +int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | |
261 | +{ | |
262 | + int i; | |
263 | + int s __attribute__((aligned(16))); | |
264 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
265 | + vector unsigned char perm1, perm2, *pix1v, *pix2v; | |
266 | + vector unsigned char t1, t2, t3,t4, t5; | |
267 | + vector unsigned int sad; | |
268 | + vector signed int sumdiffs; | |
269 | + | |
270 | + sad = (vector unsigned int)vec_splat_u32(0); | |
271 | + | |
272 | + | |
273 | + for(i=0;i<16;i++) { | |
274 | + /* Read potentially unaligned pixels into t1 and t2 */ | |
275 | + perm1 = vec_lvsl(0, pix1); | |
276 | + pix1v = (vector unsigned char *) pix1; | |
277 | + perm2 = vec_lvsl(0, pix2); | |
278 | + pix2v = (vector unsigned char *) pix2; | |
279 | + t1 = vec_perm(pix1v[0], pix1v[1], perm1); | |
280 | + t2 = vec_perm(pix2v[0], pix2v[1], perm2); | |
281 | + | |
282 | + /* Calculate a sum of abs differences vector */ | |
283 | + t3 = vec_max(t1, t2); | |
284 | + t4 = vec_min(t1, t2); | |
285 | + t5 = vec_sub(t3, t4); | |
286 | + | |
287 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
288 | + sad = vec_sum4s(t5, sad); | |
289 | + | |
290 | + pix1 += line_size; | |
291 | + pix2 += line_size; | |
292 | + } | |
293 | + | |
294 | + /* Sum up the four partial sums, and put the result into s */ | |
295 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
296 | + sumdiffs = vec_splat(sumdiffs, 3); | |
297 | + vec_ste(sumdiffs, 0, &s); | |
298 | + | |
299 | + return s; | |
300 | +} | |
301 | + | |
302 | +int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) | |
303 | +{ | |
304 | + int i; | |
305 | + int s __attribute__((aligned(16))); | |
306 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
307 | + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; | |
308 | + vector unsigned char t1, t2, t3,t4, t5; | |
309 | + vector unsigned int sad; | |
310 | + vector signed int sumdiffs; | |
311 | + | |
312 | + sad = (vector unsigned int)vec_splat_u32(0); | |
313 | + | |
314 | + permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); | |
315 | + | |
316 | + for(i=0;i<8;i++) { | |
317 | + /* Read potentially unaligned pixels into t1 and t2 | |
318 | + Since we're reading 16 pixels, and actually only want 8, | |
319 | + mask out the last 8 pixels. The 0s don't change the sum. */ | |
320 | + perm1 = vec_lvsl(0, pix1); | |
321 | + pix1v = (vector unsigned char *) pix1; | |
322 | + perm2 = vec_lvsl(0, pix2); | |
323 | + pix2v = (vector unsigned char *) pix2; | |
324 | + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); | |
325 | + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); | |
326 | + | |
327 | + /* Calculate a sum of abs differences vector */ | |
328 | + t3 = vec_max(t1, t2); | |
329 | + t4 = vec_min(t1, t2); | |
330 | + t5 = vec_sub(t3, t4); | |
331 | + | |
332 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
333 | + sad = vec_sum4s(t5, sad); | |
334 | + | |
335 | + pix1 += line_size; | |
336 | + pix2 += line_size; | |
337 | + } | |
338 | + | |
339 | + /* Sum up the four partial sums, and put the result into s */ | |
340 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
341 | + sumdiffs = vec_splat(sumdiffs, 3); | |
342 | + vec_ste(sumdiffs, 0, &s); | |
343 | + | |
344 | + return s; | |
345 | +} | |
346 | + | |
347 | +int pix_norm1_altivec(uint8_t *pix, int line_size) | |
348 | +{ | |
349 | + int i; | |
350 | + int s __attribute__((aligned(16))); | |
351 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
352 | + vector unsigned char *tv; | |
353 | + vector unsigned char pixv; | |
354 | + vector unsigned int sv; | |
355 | + vector signed int sum; | |
356 | + | |
357 | + sv = (vector unsigned int)vec_splat_u32(0); | |
358 | + | |
359 | + s = 0; | |
360 | + for (i = 0; i < 16; i++) { | |
361 | + /* Read in the potentially unaligned pixels */ | |
362 | + tv = (vector unsigned char *) pix; | |
363 | + pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); | |
364 | + | |
365 | + /* Square the values, and add them to our sum */ | |
366 | + sv = vec_msum(pixv, pixv, sv); | |
367 | + | |
368 | + pix += line_size; | |
369 | + } | |
370 | + /* Sum up the four partial sums, and put the result into s */ | |
371 | + sum = vec_sums((vector signed int) sv, (vector signed int) zero); | |
372 | + sum = vec_splat(sum, 3); | |
373 | + vec_ste(sum, 0, &s); | |
374 | + | |
375 | + return s; | |
376 | +} | |
377 | + | |
378 | +/** | |
379 | + * Sum of Squared Errors for an 8x8 block. | |
380 | + * AltiVec-enhanced. | |
381 | + * It's the pix_abs8x8_altivec code above w/ squaring added. | |
382 | + */ | |
383 | +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) | |
384 | +{ | |
385 | + int i; | |
386 | + int s __attribute__((aligned(16))); | |
387 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
388 | + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; | |
389 | + vector unsigned char t1, t2, t3,t4, t5; | |
390 | + vector unsigned int sum; | |
391 | + vector signed int sumsqr; | |
392 | + | |
393 | + sum = (vector unsigned int)vec_splat_u32(0); | |
394 | + | |
395 | + permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); | |
396 | + | |
397 | + | |
398 | + for(i=0;i<8;i++) { | |
399 | + /* Read potentially unaligned pixels into t1 and t2 | |
400 | + Since we're reading 16 pixels, and actually only want 8, | |
401 | + mask out the last 8 pixels. The 0s don't change the sum. */ | |
402 | + perm1 = vec_lvsl(0, pix1); | |
403 | + pix1v = (vector unsigned char *) pix1; | |
404 | + perm2 = vec_lvsl(0, pix2); | |
405 | + pix2v = (vector unsigned char *) pix2; | |
406 | + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); | |
407 | + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); | |
408 | + | |
409 | + /* | |
410 | + Since we want to use unsigned chars, we can take advantage | |
411 | + of the fact that abs(a-b)^2 = (a-b)^2. | |
412 | + */ | |
413 | + | |
414 | + /* Calculate abs differences vector */ | |
415 | + t3 = vec_max(t1, t2); | |
416 | + t4 = vec_min(t1, t2); | |
417 | + t5 = vec_sub(t3, t4); | |
418 | + | |
419 | + /* Square the values and add them to our sum */ | |
420 | + sum = vec_msum(t5, t5, sum); | |
421 | + | |
422 | + pix1 += line_size; | |
423 | + pix2 += line_size; | |
424 | + } | |
425 | + | |
426 | + /* Sum up the four partial sums, and put the result into s */ | |
427 | + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
428 | + sumsqr = vec_splat(sumsqr, 3); | |
429 | + vec_ste(sumsqr, 0, &s); | |
430 | + | |
431 | + return s; | |
432 | +} | |
433 | + | |
434 | +/** | |
435 | + * Sum of Squared Errors for a 16x16 block. | |
436 | + * AltiVec-enhanced. | |
437 | + * It's the pix_abs16x16_altivec code above w/ squaring added. | |
438 | + */ | |
439 | +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) | |
440 | +{ | |
441 | + int i; | |
442 | + int s __attribute__((aligned(16))); | |
443 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
444 | + vector unsigned char perm1, perm2, *pix1v, *pix2v; | |
445 | + vector unsigned char t1, t2, t3,t4, t5; | |
446 | + vector unsigned int sum; | |
447 | + vector signed int sumsqr; | |
448 | + | |
449 | + sum = (vector unsigned int)vec_splat_u32(0); | |
450 | + | |
451 | + for(i=0;i<16;i++) { | |
452 | + /* Read potentially unaligned pixels into t1 and t2 */ | |
453 | + perm1 = vec_lvsl(0, pix1); | |
454 | + pix1v = (vector unsigned char *) pix1; | |
455 | + perm2 = vec_lvsl(0, pix2); | |
456 | + pix2v = (vector unsigned char *) pix2; | |
457 | + t1 = vec_perm(pix1v[0], pix1v[1], perm1); | |
458 | + t2 = vec_perm(pix2v[0], pix2v[1], perm2); | |
459 | + | |
460 | + /* | |
461 | + Since we want to use unsigned chars, we can take advantage | |
462 | + of the fact that abs(a-b)^2 = (a-b)^2. | |
463 | + */ | |
464 | + | |
465 | + /* Calculate abs differences vector */ | |
466 | + t3 = vec_max(t1, t2); | |
467 | + t4 = vec_min(t1, t2); | |
468 | + t5 = vec_sub(t3, t4); | |
469 | + | |
470 | + /* Square the values and add them to our sum */ | |
471 | + sum = vec_msum(t5, t5, sum); | |
472 | + | |
473 | + pix1 += line_size; | |
474 | + pix2 += line_size; | |
475 | + } | |
476 | + | |
477 | + /* Sum up the four partial sums, and put the result into s */ | |
478 | + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); | |
479 | + sumsqr = vec_splat(sumsqr, 3); | |
480 | + vec_ste(sumsqr, 0, &s); | |
481 | + | |
482 | + return s; | |
483 | +} | |
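A scalar sketch of the sum of squared errors computed above, for the 16x16 case; it relies on the same abs(a-b)^2 == (a-b)^2 observation noted in the code. The helper name is illustrative.

    #include <stdint.h>

    static int sse16_scalar(const uint8_t *pix1, const uint8_t *pix2, int line_size)
    {
        int s = 0;
        for (int i = 0; i < 16; i++) {
            for (int j = 0; j < 16; j++) {
                int d = pix1[j] - pix2[j]; /* abs(d)^2 == d^2 */
                s += d * d;
            }
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }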
484 | + | |
485 | +int pix_sum_altivec(uint8_t * pix, int line_size) | |
486 | +{ | |
487 | + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); | |
488 | + vector unsigned char perm, *pixv; | |
489 | + vector unsigned char t1; | |
490 | + vector unsigned int sad; | |
491 | + vector signed int sumdiffs; | |
492 | + | |
493 | + int i; | |
494 | + int s __attribute__((aligned(16))); | |
495 | + | |
496 | + sad = (vector unsigned int)vec_splat_u32(0); | |
497 | + | |
498 | + for (i = 0; i < 16; i++) { | |
499 | + /* Read the potentially unaligned 16 pixels into t1 */ | |
500 | + perm = vec_lvsl(0, pix); | |
501 | + pixv = (vector unsigned char *) pix; | |
502 | + t1 = vec_perm(pixv[0], pixv[1], perm); | |
503 | + | |
504 | + /* Add each 4 pixel group together and put 4 results into sad */ | |
505 | + sad = vec_sum4s(t1, sad); | |
506 | + | |
507 | + pix += line_size; | |
508 | + } | |
509 | + | |
510 | + /* Sum up the four partial sums, and put the result into s */ | |
511 | + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |
512 | + sumdiffs = vec_splat(sumdiffs, 3); | |
513 | + vec_ste(sumdiffs, 0, &s); | |
514 | + | |
515 | + return s; | |
516 | +} | |
517 | + | |
518 | +void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) | |
519 | +{ | |
520 | + int i; | |
521 | + vector unsigned char perm, bytes, *pixv; | |
522 | + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
523 | + vector signed short shorts; | |
524 | + | |
525 | + for(i=0;i<8;i++) | |
526 | + { | |
527 | + // Read potentially unaligned pixels. | |
528 | + // We're reading 16 pixels, and actually only want 8, | |
529 | + // but we simply ignore the extras. | |
530 | + perm = vec_lvsl(0, pixels); | |
531 | + pixv = (vector unsigned char *) pixels; | |
532 | + bytes = vec_perm(pixv[0], pixv[1], perm); | |
533 | + | |
534 | + // convert the bytes into shorts | |
535 | + shorts = (vector signed short)vec_mergeh(zero, bytes); | |
536 | + | |
537 | + // save the data to the block, we assume the block is 16-byte aligned | |
538 | + vec_st(shorts, i*16, (vector signed short*)block); | |
539 | + | |
540 | + pixels += line_size; | |
541 | + } | |
542 | +} | |
543 | + | |
544 | +void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, | |
545 | + const uint8_t *s2, int stride) | |
546 | +{ | |
547 | + int i; | |
548 | + vector unsigned char perm, bytes, *pixv; | |
549 | + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); | |
550 | + vector signed short shorts1, shorts2; | |
551 | + | |
552 | + for(i=0;i<4;i++) | |
553 | + { | |
554 | + // Read potentially unaligned pixels | |
555 | + // We're reading 16 pixels, and actually only want 8, | |
556 | + // but we simply ignore the extras. | |
557 | + perm = vec_lvsl(0, s1); | |
558 | + pixv = (vector unsigned char *) s1; | |
559 | + bytes = vec_perm(pixv[0], pixv[1], perm); | |
560 | + | |
561 | + // convert the bytes into shorts | |
562 | + shorts1 = (vector signed short)vec_mergeh(zero, bytes); | |
563 | + | |
564 | + // Do the same for the second block of pixels | |
565 | + perm = vec_lvsl(0, s2); | |
566 | + pixv = (vector unsigned char *) s2; | |
567 | + bytes = vec_perm(pixv[0], pixv[1], perm); | |
568 | + | |
569 | + // convert the bytes into shorts | |
570 | + shorts2 = (vector signed short)vec_mergeh(zero, bytes); | |
571 | + | |
572 | + // Do the subtraction | |
573 | + shorts1 = vec_sub(shorts1, shorts2); | |
574 | + | |
575 | + // save the data to the block, we assume the block is 16-byte aligned | |
576 | + vec_st(shorts1, 0, (vector signed short*)block); | |
577 | + | |
578 | + s1 += stride; | |
579 | + s2 += stride; | |
580 | + block += 8; | |
581 | + | |
582 | + | |
583 | + // The code below is a copy of the code above... This is a manual | |
584 | + // unroll. | |
585 | + | |
586 | + // Read potentially unaligned pixels | |
587 | + // We're reading 16 pixels, and actually only want 8, | |
588 | + // but we simply ignore the extras. | |
589 | + perm = vec_lvsl(0, s1); | |
590 | + pixv = (vector unsigned char *) s1; | |
591 | + bytes = vec_perm(pixv[0], pixv[1], perm); | |
592 | + | |
593 | + // convert the bytes into shorts | |
594 | + shorts1 = (vector signed short)vec_mergeh(zero, bytes); | |
595 | + | |
596 | + // Do the same for the second block of pixels | |
597 | + perm = vec_lvsl(0, s2); | |
598 | + pixv = (vector unsigned char *) s2; | |
599 | + bytes = vec_perm(pixv[0], pixv[1], perm); | |
600 | + | |
601 | + // convert the bytes into shorts | |
602 | + shorts2 = (vector signed short)vec_mergeh(zero, bytes); | |
603 | + | |
604 | + // Do the subtraction | |
605 | + shorts1 = vec_sub(shorts1, shorts2); | |
606 | + | |
607 | + // save the data to the block, we assume the block is 16-byte aligned | |
608 | + vec_st(shorts1, 0, (vector signed short*)block); | |
609 | + | |
610 | + s1 += stride; | |
611 | + s2 += stride; | |
612 | + block += 8; | |
613 | + } | |
614 | +} | |
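For comparison, a scalar sketch of diff_pixels over an 8x8 block, assuming DCTELEM is a 16-bit signed integer; the helper name is illustrative.

    #include <stdint.h>

    static void diff_pixels_scalar(int16_t *block, const uint8_t *s1,
                                   const uint8_t *s2, int stride)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++)
                block[j] = s1[j] - s2[j];
            block += 8;
            s1 += stride;
            s2 += stride;
        }
    }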
615 | + | |
616 | +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { | |
617 | + return pix_abs16x16_altivec(a,b,stride); | |
618 | +} | |
619 | + | |
620 | +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { | |
621 | + return pix_abs8x8_altivec(a,b,stride); | |
622 | +} | |
623 | + | |
624 | +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { | |
625 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
626 | + int i; | |
627 | + for(i=0; i+7<w; i++){ | |
628 | + dst[i+0] += src[i+0]; | |
629 | + dst[i+1] += src[i+1]; | |
630 | + dst[i+2] += src[i+2]; | |
631 | + dst[i+3] += src[i+3]; | |
632 | + dst[i+4] += src[i+4]; | |
633 | + dst[i+5] += src[i+5]; | |
634 | + dst[i+6] += src[i+6]; | |
635 | + dst[i+7] += src[i+7]; | |
636 | + } | |
637 | + for(; i<w; i++) | |
638 | + dst[i+0] += src[i+0]; | |
639 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
640 | + register int i; | |
641 | + register vector unsigned char vdst, vsrc; | |
642 | + | |
643 | + /* dst and src are 16-byte aligned (guaranteed) */ | |
644 | + for(i = 0 ; (i + 15) < w ; i+=16) | |
645 | + { | |
646 | + vdst = vec_ld(i, (unsigned char*)dst); | |
647 | + vsrc = vec_ld(i, (unsigned char*)src); | |
648 | + vdst = vec_add(vsrc, vdst); | |
649 | + vec_st(vdst, i, (unsigned char*)dst); | |
650 | + } | |
651 | + /* if w is not a multiple of 16, add the remaining bytes */ | |
652 | + for (; (i < w) ; i++) | |
653 | + { | |
654 | + dst[i] += src[i]; | |
655 | + } | |
656 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
657 | +} | |
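The intended semantics of add_bytes, written as a scalar sketch (illustrative, not part of the patch): every one of the w bytes of src is accumulated into dst.

    #include <stdint.h>

    static void add_bytes_scalar(uint8_t *dst, const uint8_t *src, int w)
    {
        for (int i = 0; i < w; i++)
            dst[i] += src[i];
    }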
658 | + | |
659 | +/* next one assumes that ((line_size % 16) == 0) */ | |
660 | +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
661 | +{ | |
662 | +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); | |
663 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
664 | + int i; | |
665 | + | |
666 | +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); | |
667 | + | |
668 | + for(i=0; i<h; i++) { | |
669 | + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); | |
670 | + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); | |
671 | + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); | |
672 | + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); | |
673 | + pixels+=line_size; | |
674 | + block +=line_size; | |
675 | + } | |
676 | + | |
677 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); | |
678 | + | |
679 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
680 | + register vector unsigned char pixelsv1, pixelsv2; | |
681 | + register vector unsigned char pixelsv1B, pixelsv2B; | |
682 | + register vector unsigned char pixelsv1C, pixelsv2C; | |
683 | + register vector unsigned char pixelsv1D, pixelsv2D; | |
684 | + | |
685 | + register vector unsigned char perm = vec_lvsl(0, pixels); | |
686 | + int i; | |
687 | + register int line_size_2 = line_size << 1; | |
688 | + register int line_size_3 = line_size + line_size_2; | |
689 | + register int line_size_4 = line_size << 2; | |
690 | + | |
691 | +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); | |
692 | +// hand-unrolling the loop by 4 gains about 15% | |
693 | +// minimum execution time goes from 74 to 60 cycles | |
694 | +// it's faster than -funroll-loops, but using | |
695 | +// -funroll-loops w/ this is bad - 74 cycles again. | |
696 | +// all this is on a 7450, tuning for the 7450 | |
697 | +#if 0 | |
698 | + for(i=0; i<h; i++) { | |
699 | + pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
700 | + pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
701 | + vec_st(vec_perm(pixelsv1, pixelsv2, perm), | |
702 | + 0, (unsigned char*)block); | |
703 | + pixels+=line_size; | |
704 | + block +=line_size; | |
705 | + } | |
706 | +#else | |
707 | + for(i=0; i<h; i+=4) { | |
708 | + pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
709 | + pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
710 | + pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); | |
711 | + pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels); | |
712 | + pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); | |
713 | + pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels); | |
714 | + pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); | |
715 | + pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels); | |
716 | + vec_st(vec_perm(pixelsv1, pixelsv2, perm), | |
717 | + 0, (unsigned char*)block); | |
718 | + vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), | |
719 | + line_size, (unsigned char*)block); | |
720 | + vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), | |
721 | + line_size_2, (unsigned char*)block); | |
722 | + vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), | |
723 | + line_size_3, (unsigned char*)block); | |
724 | + pixels+=line_size_4; | |
725 | + block +=line_size_4; | |
726 | + } | |
727 | +#endif | |
728 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); | |
729 | + | |
730 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
731 | +} | |
732 | + | |
733 | +/* next one assumes that ((line_size % 16) == 0) */ | |
734 | +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) | |
735 | +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
736 | +{ | |
737 | +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); | |
738 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
739 | + int i; | |
740 | + | |
741 | +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); | |
742 | + | |
743 | + for(i=0; i<h; i++) { | |
744 | + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); | |
745 | + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); | |
746 | + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); | |
747 | + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); | |
748 | + pixels+=line_size; | |
749 | + block +=line_size; | |
750 | + } | |
751 | + | |
752 | +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); | |
753 | + | |
754 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
755 | + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
756 | + register vector unsigned char perm = vec_lvsl(0, pixels); | |
757 | + int i; | |
758 | + | |
759 | +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); | |
760 | + | |
761 | + for(i=0; i<h; i++) { | |
762 | + pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
763 | + pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
764 | + blockv = vec_ld(0, block); | |
765 | + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); | |
766 | + blockv = vec_avg(blockv,pixelsv); | |
767 | + vec_st(blockv, 0, (unsigned char*)block); | |
768 | + pixels+=line_size; | |
769 | + block +=line_size; | |
770 | + } | |
771 | + | |
772 | +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); | |
773 | + | |
774 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
775 | +} | |
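The op_avg macro above computes the rounded-up per-byte average (a+b+1)>>1 on four packed bytes at once, via the identity (a|b) - ((a^b)>>1); the 0xFEFEFEFE mask keeps bits shifted out of one byte from leaking into the next. A small stand-alone check (illustrative, not part of the patch):

    #include <stdint.h>
    #include <assert.h>

    static uint32_t op_avg_ref(uint32_t a, uint32_t b)
    {
        return (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
    }

    int main(void)
    {
        /* exhaustive check for a single byte lane */
        for (uint32_t a = 0; a < 256; a++)
            for (uint32_t b = 0; b < 256; b++)
                assert(op_avg_ref(a, b) == ((a + b + 1) >> 1));
        return 0;
    }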
776 | + | |
777 | +/* next one assumes that ((line_size % 8) == 0) */ | |
778 | +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
779 | +{ | |
780 | +POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); | |
781 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
782 | + int i; | |
783 | +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); | |
784 | + for (i = 0; i < h; i++) { | |
785 | + *((uint32_t *) (block)) = | |
786 | + (((*((uint32_t *) (block))) | | |
787 | + ((((const struct unaligned_32 *) (pixels))->l))) - | |
788 | + ((((*((uint32_t *) (block))) ^ | |
789 | + ((((const struct unaligned_32 *) (pixels))-> | |
790 | + l))) & 0xFEFEFEFEUL) >> 1)); | |
791 | + *((uint32_t *) (block + 4)) = | |
792 | + (((*((uint32_t *) (block + 4))) | | |
793 | + ((((const struct unaligned_32 *) (pixels + 4))->l))) - | |
794 | + ((((*((uint32_t *) (block + 4))) ^ | |
795 | + ((((const struct unaligned_32 *) (pixels + | |
796 | + 4))-> | |
797 | + l))) & 0xFEFEFEFEUL) >> 1)); | |
798 | + pixels += line_size; | |
799 | + block += line_size; | |
800 | + } | |
801 | +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); | |
802 | + | |
803 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
804 | + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; | |
805 | + int i; | |
806 | + | |
807 | +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); | |
808 | + | |
809 | + for (i = 0; i < h; i++) { | |
810 | + /* | |
811 | + block is 8-byte aligned, so we're either in the | |
812 | + left half (16-byte aligned) or in the right half (not) | |
813 | + */ | |
814 | + int rightside = ((unsigned long)block & 0x0000000F); | |
815 | + | |
816 | + blockv = vec_ld(0, block); | |
817 | + pixelsv1 = vec_ld(0, (unsigned char*)pixels); | |
818 | + pixelsv2 = vec_ld(16, (unsigned char*)pixels); | |
819 | + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); | |
820 | + | |
821 | + if (rightside) | |
822 | + { | |
823 | + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); | |
824 | + } | |
825 | + else | |
826 | + { | |
827 | + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); | |
828 | + } | |
829 | + | |
830 | + blockv = vec_avg(blockv, pixelsv); | |
831 | + | |
832 | + vec_st(blockv, 0, block); | |
833 | + | |
834 | + pixels += line_size; | |
835 | + block += line_size; | |
836 | + } | |
837 | + | |
838 | +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); | |
839 | + | |
840 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
841 | +} | |
842 | + | |
843 | +/* next one assumes that ((line_size % 8) == 0) */ | |
844 | +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
845 | +{ | |
846 | +POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); | |
847 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
848 | + int j; | |
849 | +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); | |
850 | + for (j = 0; j < 2; j++) { | |
851 | + int i; | |
852 | + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
853 | + const uint32_t b = | |
854 | + (((const struct unaligned_32 *) (pixels + 1))->l); | |
855 | + uint32_t l0 = | |
856 | + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
857 | + uint32_t h0 = | |
858 | + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
859 | + uint32_t l1, h1; | |
860 | + pixels += line_size; | |
861 | + for (i = 0; i < h; i += 2) { | |
862 | + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
863 | + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
864 | + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
865 | + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
866 | + *((uint32_t *) block) = | |
867 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
868 | + pixels += line_size; | |
869 | + block += line_size; | |
870 | + a = (((const struct unaligned_32 *) (pixels))->l); | |
871 | + b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
872 | + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
873 | + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
874 | + *((uint32_t *) block) = | |
875 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
876 | + pixels += line_size; | |
877 | + block += line_size; | |
878 | + } pixels += 4 - line_size * (h + 1); | |
879 | + block += 4 - line_size * h; | |
880 | + } | |
881 | + | |
882 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); | |
883 | + | |
884 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
885 | + register int i; | |
886 | + register vector unsigned char | |
887 | + pixelsv1, pixelsv2, | |
888 | + pixelsavg; | |
889 | + register vector unsigned char | |
890 | + blockv, temp1, temp2; | |
891 | + register vector unsigned short | |
892 | + pixelssum1, pixelssum2, temp3; | |
893 | + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
894 | + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
895 | + | |
896 | + temp1 = vec_ld(0, pixels); | |
897 | + temp2 = vec_ld(16, pixels); | |
898 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
899 | + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
900 | + { | |
901 | + pixelsv2 = temp2; | |
902 | + } | |
903 | + else | |
904 | + { | |
905 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
906 | + } | |
907 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
908 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
909 | + pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
910 | + (vector unsigned short)pixelsv2); | |
911 | + pixelssum1 = vec_add(pixelssum1, vctwo); | |
912 | + | |
913 | +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); | |
914 | + for (i = 0; i < h ; i++) { | |
915 | + int rightside = ((unsigned long)block & 0x0000000F); | |
916 | + blockv = vec_ld(0, block); | |
917 | + | |
918 | + temp1 = vec_ld(line_size, pixels); | |
919 | + temp2 = vec_ld(line_size + 16, pixels); | |
920 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
921 | + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
922 | + { | |
923 | + pixelsv2 = temp2; | |
924 | + } | |
925 | + else | |
926 | + { | |
927 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
928 | + } | |
929 | + | |
930 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
931 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
932 | + pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
933 | + (vector unsigned short)pixelsv2); | |
934 | + temp3 = vec_add(pixelssum1, pixelssum2); | |
935 | + temp3 = vec_sra(temp3, vctwo); | |
936 | + pixelssum1 = vec_add(pixelssum2, vctwo); | |
937 | + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
938 | + | |
939 | + if (rightside) | |
940 | + { | |
941 | + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
942 | + } | |
943 | + else | |
944 | + { | |
945 | + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
946 | + } | |
947 | + | |
948 | + vec_st(blockv, 0, block); | |
949 | + | |
950 | + block += line_size; | |
951 | + pixels += line_size; | |
952 | + } | |
953 | + | |
954 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); | |
955 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
956 | +} | |
957 | + | |
958 | +/* next one assumes that ((line_size % 8) == 0) */ | |
959 | +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) | |
960 | +{ | |
961 | +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
962 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
963 | + int j; | |
964 | +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
965 | + for (j = 0; j < 2; j++) { | |
966 | + int i; | |
967 | + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
968 | + const uint32_t b = | |
969 | + (((const struct unaligned_32 *) (pixels + 1))->l); | |
970 | + uint32_t l0 = | |
971 | + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
972 | + uint32_t h0 = | |
973 | + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
974 | + uint32_t l1, h1; | |
975 | + pixels += line_size; | |
976 | + for (i = 0; i < h; i += 2) { | |
977 | + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
978 | + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
979 | + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
980 | + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
981 | + *((uint32_t *) block) = | |
982 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
983 | + pixels += line_size; | |
984 | + block += line_size; | |
985 | + a = (((const struct unaligned_32 *) (pixels))->l); | |
986 | + b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
987 | + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
988 | + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
989 | + *((uint32_t *) block) = | |
990 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
991 | + pixels += line_size; | |
992 | + block += line_size; | |
993 | + } pixels += 4 - line_size * (h + 1); | |
994 | + block += 4 - line_size * h; | |
995 | + } | |
996 | + | |
997 | +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
998 | + | |
999 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1000 | + register int i; | |
1001 | + register vector unsigned char | |
1002 | + pixelsv1, pixelsv2, | |
1003 | + pixelsavg; | |
1004 | + register vector unsigned char | |
1005 | + blockv, temp1, temp2; | |
1006 | + register vector unsigned short | |
1007 | + pixelssum1, pixelssum2, temp3; | |
1008 | + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
1009 | + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | |
1010 | + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
1011 | + | |
1012 | + temp1 = vec_ld(0, pixels); | |
1013 | + temp2 = vec_ld(16, pixels); | |
1014 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
1015 | + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
1016 | + { | |
1017 | + pixelsv2 = temp2; | |
1018 | + } | |
1019 | + else | |
1020 | + { | |
1021 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
1022 | + } | |
1023 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1024 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1025 | + pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
1026 | + (vector unsigned short)pixelsv2); | |
1027 | + pixelssum1 = vec_add(pixelssum1, vcone); | |
1028 | + | |
1029 | +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
1030 | + for (i = 0; i < h ; i++) { | |
1031 | + int rightside = ((unsigned long)block & 0x0000000F); | |
1032 | + blockv = vec_ld(0, block); | |
1033 | + | |
1034 | + temp1 = vec_ld(line_size, pixels); | |
1035 | + temp2 = vec_ld(line_size + 16, pixels); | |
1036 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
1037 | + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
1038 | + { | |
1039 | + pixelsv2 = temp2; | |
1040 | + } | |
1041 | + else | |
1042 | + { | |
1043 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
1044 | + } | |
1045 | + | |
1046 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1047 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1048 | + pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
1049 | + (vector unsigned short)pixelsv2); | |
1050 | + temp3 = vec_add(pixelssum1, pixelssum2); | |
1051 | + temp3 = vec_sra(temp3, vctwo); | |
1052 | + pixelssum1 = vec_add(pixelssum2, vcone); | |
1053 | + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); | |
1054 | + | |
1055 | + if (rightside) | |
1056 | + { | |
1057 | + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); | |
1058 | + } | |
1059 | + else | |
1060 | + { | |
1061 | + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); | |
1062 | + } | |
1063 | + | |
1064 | + vec_st(blockv, 0, block); | |
1065 | + | |
1066 | + block += line_size; | |
1067 | + pixels += line_size; | |
1068 | + } | |
1069 | + | |
1070 | +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); | |
1071 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1072 | +} | |
1073 | + | |
1074 | +/* next one assumes that ((line_size % 16) == 0) */ | |
1075 | +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
1076 | +{ | |
1077 | +POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); | |
1078 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
1079 | + int j; | |
1080 | +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1081 | + for (j = 0; j < 4; j++) { | |
1082 | + int i; | |
1083 | + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1084 | + const uint32_t b = | |
1085 | + (((const struct unaligned_32 *) (pixels + 1))->l); | |
1086 | + uint32_t l0 = | |
1087 | + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
1088 | + uint32_t h0 = | |
1089 | + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1090 | + uint32_t l1, h1; | |
1091 | + pixels += line_size; | |
1092 | + for (i = 0; i < h; i += 2) { | |
1093 | + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1094 | + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1095 | + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
1096 | + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1097 | + *((uint32_t *) block) = | |
1098 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1099 | + pixels += line_size; | |
1100 | + block += line_size; | |
1101 | + a = (((const struct unaligned_32 *) (pixels))->l); | |
1102 | + b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1103 | + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; | |
1104 | + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1105 | + *((uint32_t *) block) = | |
1106 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1107 | + pixels += line_size; | |
1108 | + block += line_size; | |
1109 | + } pixels += 4 - line_size * (h + 1); | |
1110 | + block += 4 - line_size * h; | |
1111 | + } | |
1112 | + | |
1113 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1114 | + | |
1115 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1116 | + register int i; | |
1117 | + register vector unsigned char | |
1118 | + pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
1119 | + register vector unsigned char | |
1120 | + blockv, temp1, temp2; | |
1121 | + register vector unsigned short | |
1122 | + pixelssum1, pixelssum2, temp3, | |
1123 | + pixelssum3, pixelssum4, temp4; | |
1124 | + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
1125 | + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
1126 | + | |
1127 | +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1128 | + | |
1129 | + temp1 = vec_ld(0, pixels); | |
1130 | + temp2 = vec_ld(16, pixels); | |
1131 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
1132 | + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
1133 | + { | |
1134 | + pixelsv2 = temp2; | |
1135 | + } | |
1136 | + else | |
1137 | + { | |
1138 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
1139 | + } | |
1140 | + pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1141 | + pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1142 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1143 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1144 | + pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
1145 | + (vector unsigned short)pixelsv4); | |
1146 | + pixelssum3 = vec_add(pixelssum3, vctwo); | |
1147 | + pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
1148 | + (vector unsigned short)pixelsv2); | |
1149 | + pixelssum1 = vec_add(pixelssum1, vctwo); | |
1150 | + | |
1151 | + for (i = 0; i < h ; i++) { | |
1152 | + blockv = vec_ld(0, block); | |
1153 | + | |
1154 | + temp1 = vec_ld(line_size, pixels); | |
1155 | + temp2 = vec_ld(line_size + 16, pixels); | |
1156 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
1157 | + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
1158 | + { | |
1159 | + pixelsv2 = temp2; | |
1160 | + } | |
1161 | + else | |
1162 | + { | |
1163 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
1164 | + } | |
1165 | + | |
1166 | + pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1167 | + pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1168 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1169 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1170 | + | |
1171 | + pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
1172 | + (vector unsigned short)pixelsv4); | |
1173 | + pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
1174 | + (vector unsigned short)pixelsv2); | |
1175 | + temp4 = vec_add(pixelssum3, pixelssum4); | |
1176 | + temp4 = vec_sra(temp4, vctwo); | |
1177 | + temp3 = vec_add(pixelssum1, pixelssum2); | |
1178 | + temp3 = vec_sra(temp3, vctwo); | |
1179 | + | |
1180 | + pixelssum3 = vec_add(pixelssum4, vctwo); | |
1181 | + pixelssum1 = vec_add(pixelssum2, vctwo); | |
1182 | + | |
1183 | + blockv = vec_packsu(temp3, temp4); | |
1184 | + | |
1185 | + vec_st(blockv, 0, block); | |
1186 | + | |
1187 | + block += line_size; | |
1188 | + pixels += line_size; | |
1189 | + } | |
1190 | + | |
1191 | +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); | |
1192 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1193 | +} | |
1194 | + | |
1195 | +/* next one assumes that ((line_size % 16) == 0) */ | |
1196 | +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) | |
1197 | +{ | |
1198 | +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1199 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
1200 | + int j; | |
1201 | +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1202 | + for (j = 0; j < 4; j++) { | |
1203 | + int i; | |
1204 | + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1205 | + const uint32_t b = | |
1206 | + (((const struct unaligned_32 *) (pixels + 1))->l); | |
1207 | + uint32_t l0 = | |
1208 | + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
1209 | + uint32_t h0 = | |
1210 | + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1211 | + uint32_t l1, h1; | |
1212 | + pixels += line_size; | |
1213 | + for (i = 0; i < h; i += 2) { | |
1214 | + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); | |
1215 | + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1216 | + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); | |
1217 | + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1218 | + *((uint32_t *) block) = | |
1219 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1220 | + pixels += line_size; | |
1221 | + block += line_size; | |
1222 | + a = (((const struct unaligned_32 *) (pixels))->l); | |
1223 | + b = (((const struct unaligned_32 *) (pixels + 1))->l); | |
1224 | + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; | |
1225 | + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); | |
1226 | + *((uint32_t *) block) = | |
1227 | + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); | |
1228 | + pixels += line_size; | |
1229 | + block += line_size; | |
1230 | + } pixels += 4 - line_size * (h + 1); | |
1231 | + block += 4 - line_size * h; | |
1232 | + } | |
1233 | + | |
1234 | +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1235 | + | |
1236 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1237 | + register int i; | |
1238 | + register vector unsigned char | |
1239 | + pixelsv1, pixelsv2, pixelsv3, pixelsv4; | |
1240 | + register vector unsigned char | |
1241 | + blockv, temp1, temp2; | |
1242 | + register vector unsigned short | |
1243 | + pixelssum1, pixelssum2, temp3, | |
1244 | + pixelssum3, pixelssum4, temp4; | |
1245 | + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
1246 | + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); | |
1247 | + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); | |
1248 | + | |
1249 | +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1250 | + | |
1251 | + temp1 = vec_ld(0, pixels); | |
1252 | + temp2 = vec_ld(16, pixels); | |
1253 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); | |
1254 | + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) | |
1255 | + { | |
1256 | + pixelsv2 = temp2; | |
1257 | + } | |
1258 | + else | |
1259 | + { | |
1260 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); | |
1261 | + } | |
1262 | + pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1263 | + pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1264 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1265 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1266 | + pixelssum3 = vec_add((vector unsigned short)pixelsv3, | |
1267 | + (vector unsigned short)pixelsv4); | |
1268 | + pixelssum3 = vec_add(pixelssum3, vcone); | |
1269 | + pixelssum1 = vec_add((vector unsigned short)pixelsv1, | |
1270 | + (vector unsigned short)pixelsv2); | |
1271 | + pixelssum1 = vec_add(pixelssum1, vcone); | |
1272 | + | |
1273 | + for (i = 0; i < h ; i++) { | |
1274 | + blockv = vec_ld(0, block); | |
1275 | + | |
1276 | + temp1 = vec_ld(line_size, pixels); | |
1277 | + temp2 = vec_ld(line_size + 16, pixels); | |
1278 | + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); | |
1279 | + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) | |
1280 | + { | |
1281 | + pixelsv2 = temp2; | |
1282 | + } | |
1283 | + else | |
1284 | + { | |
1285 | + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); | |
1286 | + } | |
1287 | + | |
1288 | + pixelsv3 = vec_mergel(vczero, pixelsv1); | |
1289 | + pixelsv4 = vec_mergel(vczero, pixelsv2); | |
1290 | + pixelsv1 = vec_mergeh(vczero, pixelsv1); | |
1291 | + pixelsv2 = vec_mergeh(vczero, pixelsv2); | |
1292 | + | |
1293 | + pixelssum4 = vec_add((vector unsigned short)pixelsv3, | |
1294 | + (vector unsigned short)pixelsv4); | |
1295 | + pixelssum2 = vec_add((vector unsigned short)pixelsv1, | |
1296 | + (vector unsigned short)pixelsv2); | |
1297 | + temp4 = vec_add(pixelssum3, pixelssum4); | |
1298 | + temp4 = vec_sra(temp4, vctwo); | |
1299 | + temp3 = vec_add(pixelssum1, pixelssum2); | |
1300 | + temp3 = vec_sra(temp3, vctwo); | |
1301 | + | |
1302 | + pixelssum3 = vec_add(pixelssum4, vcone); | |
1303 | + pixelssum1 = vec_add(pixelssum2, vcone); | |
1304 | + | |
1305 | + blockv = vec_packsu(temp3, temp4); | |
1306 | + | |
1307 | + vec_st(blockv, 0, block); | |
1308 | + | |
1309 | + block += line_size; | |
1310 | + pixels += line_size; | |
1311 | + } | |
1312 | + | |
1313 | +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); | |
1314 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1315 | +} | |
1316 | + | |
1317 | +int has_altivec(void) | |
1318 | +{ | |
1319 | +#ifdef CONFIG_DARWIN | |
1320 | + int sels[2] = {CTL_HW, HW_VECTORUNIT}; | |
1321 | + int has_vu = 0; | |
1322 | + size_t len = sizeof(has_vu); | |
1323 | + int err; | |
1324 | + | |
1325 | + err = sysctl(sels, 2, &has_vu, &len, NULL, 0); | |
1326 | + | |
1327 | + if (err == 0) return (has_vu != 0); | |
1328 | +#else /* CONFIG_DARWIN */ | |
1329 | +/* not Darwin, so do it the brute-force way */ | |
1330 | +/* this is borrowed from the libmpeg2 library */ | |
1331 | + { | |
1332 | + signal (SIGILL, sigill_handler); | |
1333 | + if (sigsetjmp (jmpbuf, 1)) { | |
1334 | + signal (SIGILL, SIG_DFL); | |
1335 | + } else { | |
1336 | + canjump = 1; | |
1337 | + | |
1338 | + asm volatile ("mtspr 256, %0\n\t" | |
1339 | + "vand %%v0, %%v0, %%v0" | |
1340 | + : | |
1341 | + : "r" (-1)); | |
1342 | + | |
1343 | + signal (SIGILL, SIG_DFL); | |
1344 | + return 1; | |
1345 | + } | |
1346 | + } | |
1347 | +#endif /* CONFIG_DARWIN */ | |
1348 | + return 0; | |
1349 | +} | |
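On Linux the SIGILL probe can be avoided entirely; a minimal sketch, assuming glibc's getauxval() and the powerpc asm/cputable.h hwcap definitions are available (illustrative, not part of the patch):

    #include <sys/auxv.h>
    #include <asm/cputable.h> /* PPC_FEATURE_HAS_ALTIVEC */

    static int has_altivec_hwcap(void)
    {
        return (getauxval(AT_HWCAP) & PPC_FEATURE_HAS_ALTIVEC) != 0;
    }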
1350 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c | |
1351 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c 1970-01-01 01:00:00.000000000 +0100 | |
1352 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c 2003-09-28 17:26:40.000000000 +0200 | |
1353 | @@ -0,0 +1,307 @@ | |
1354 | +/* | |
1355 | + * Copyright (c) 2002 Brian Foley | |
1356 | + * Copyright (c) 2002 Dieter Shirley | |
1357 | + * | |
1358 | + * This library is free software; you can redistribute it and/or | |
1359 | + * modify it under the terms of the GNU Lesser General Public | |
1360 | + * License as published by the Free Software Foundation; either | |
1361 | + * version 2 of the License, or (at your option) any later version. | |
1362 | + * | |
1363 | + * This library is distributed in the hope that it will be useful, | |
1364 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1365 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
1366 | + * Lesser General Public License for more details. | |
1367 | + * | |
1368 | + * You should have received a copy of the GNU Lesser General Public | |
1369 | + * License along with this library; if not, write to the Free Software | |
1370 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1371 | + */ | |
1372 | + | |
1373 | +#include "../dsputil.h" | |
1374 | + | |
1375 | +#include "dsputil_ppc.h" | |
1376 | + | |
1377 | +#ifdef HAVE_ALTIVEC | |
1378 | +#include "dsputil_altivec.h" | |
1379 | +#endif | |
1380 | + | |
1381 | +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); | |
1382 | +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); | |
1383 | + | |
1384 | +int mm_flags = 0; | |
1385 | + | |
1386 | +int mm_support(void) | |
1387 | +{ | |
1388 | + int result = 0; | |
1389 | +#if HAVE_ALTIVEC | |
1390 | + if (has_altivec()) { | |
1391 | + result |= MM_ALTIVEC; | |
1392 | + } | |
1393 | +#endif /* HAVE_ALTIVEC */ | |
1394 | + return result; | |
1395 | +} | |
1396 | + | |
1397 | +#ifdef POWERPC_PERFORMANCE_REPORT | |
1398 | +unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; | |
1399 | +/* list below must match enum in dsputil_ppc.h */ | |
1400 | +static unsigned char* perfname[] = { | |
1401 | + "fft_calc_altivec", | |
1402 | + "gmc1_altivec", | |
1403 | + "dct_unquantize_h263_altivec", | |
1404 | + "idct_add_altivec", | |
1405 | + "idct_put_altivec", | |
1406 | + "put_pixels16_altivec", | |
1407 | + "avg_pixels16_altivec", | |
1408 | + "avg_pixels8_altivec", | |
1409 | + "put_pixels8_xy2_altivec", | |
1410 | + "put_no_rnd_pixels8_xy2_altivec", | |
1411 | + "put_pixels16_xy2_altivec", | |
1412 | + "put_no_rnd_pixels16_xy2_altivec", | |
1413 | + "clear_blocks_dcbz32_ppc", | |
1414 | + "clear_blocks_dcbz128_ppc" | |
1415 | +}; | |
1416 | +#include <stdio.h> | |
1417 | +#endif | |
1418 | + | |
1419 | +#ifdef POWERPC_PERFORMANCE_REPORT | |
1420 | +void powerpc_display_perf_report(void) | |
1421 | +{ | |
1422 | + int i, j; | |
1423 | + fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); | |
1424 | + for(i = 0 ; i < powerpc_perf_total ; i++) | |
1425 | + { | |
1426 | + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) | |
1427 | + { | |
1428 | + if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) | |
1429 | + fprintf(stderr, | |
1430 | + " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", | |
1431 | + perfname[i], | |
1432 | + j+1, | |
1433 | + perfdata[j][i][powerpc_data_min], | |
1434 | + perfdata[j][i][powerpc_data_max], | |
1435 | + (double)perfdata[j][i][powerpc_data_sum] / | |
1436 | + (double)perfdata[j][i][powerpc_data_num], | |
1437 | + perfdata[j][i][powerpc_data_num]); | |
1438 | + } | |
1439 | + } | |
1440 | +} | |
1441 | +#endif /* POWERPC_PERFORMANCE_REPORT */ | |
1442 | + | |
1443 | +/* ***** WARNING ***** WARNING ***** WARNING ***** */ | |
1444 | +/* | |
1445 | + clear_blocks_dcbz32_ppc will not work properly | |
1446 | + on PowerPC processors with a cache line size | |
1447 | + not equal to 32 bytes. | |
1448 | + Fortunately, all processors used by Apple up to | |
1449 | + at least the 7450 (aka second-generation G4) | |
1450 | + use 32-byte cache lines. | |
1451 | + This is due to the use of the 'dcbz' instruction. | |
1452 | + It simply clears a single cache line to zero, | |
1453 | + so you need to know the cache line size to use it! | |
1454 | + It's absurd, but it's fast... | |
1455 | + | |
1456 | + update 24/06/2003: Apple released the G5 yesterday, | |
1457 | + with a PPC970. Cache line size: 128 bytes. Oops. | |
1458 | + The semantics of dcbz were changed; it now always clears | |
1459 | + 32 bytes, so the function below will still work, but will | |
1460 | + be slow. So I fixed check_dcbz_effect to use dcbzl, | |
1461 | + which is defined to clear one whole cache line (as dcbz did before). | |
1462 | + So we can still distinguish the two, and use dcbz (32 bytes) | |
1463 | + or dcbzl (one cache line) as required. | |
1464 | + | |
1465 | + see <http://developer.apple.com/technotes/tn/tn2087.html> | |
1466 | + and <http://developer.apple.com/technotes/tn/tn2086.html> | |
1467 | +*/ | |
1468 | +void clear_blocks_dcbz32_ppc(DCTELEM *blocks) | |
1469 | +{ | |
1470 | +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); | |
1471 | + register int misal = ((unsigned long)blocks & 0x00000010); | |
1472 | + register int i = 0; | |
1473 | +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); | |
1474 | +#if 1 | |
1475 | + if (misal) { | |
1476 | + ((unsigned long*)blocks)[0] = 0L; | |
1477 | + ((unsigned long*)blocks)[1] = 0L; | |
1478 | + ((unsigned long*)blocks)[2] = 0L; | |
1479 | + ((unsigned long*)blocks)[3] = 0L; | |
1480 | + i += 16; | |
1481 | + } | |
1482 | + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { | |
1483 | + asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |
1484 | + } | |
1485 | + if (misal) { | |
1486 | + ((unsigned long*)blocks)[188] = 0L; | |
1487 | + ((unsigned long*)blocks)[189] = 0L; | |
1488 | + ((unsigned long*)blocks)[190] = 0L; | |
1489 | + ((unsigned long*)blocks)[191] = 0L; | |
1490 | + i += 16; | |
1491 | + } | |
1492 | +#else | |
1493 | + memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
1494 | +#endif | |
1495 | +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); | |
1496 | +} | |
1497 | + | |
1498 | +/* same as above, for when dcbzl clears a whole 128-byte cache line, | |
1499 | + i.e. on the PPC970 aka G5 */ | |
1500 | +#ifndef NO_DCBZL | |
1501 | +void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
1502 | +{ | |
1503 | +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); | |
1504 | + register int misal = ((unsigned long)blocks & 0x0000007f); | |
1505 | + register int i = 0; | |
1506 | +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); | |
1507 | +#if 1 | |
1508 | + if (misal) { | |
1509 | + // we could probably also optimize this case, | |
1510 | + // but there's not much point as the machines | |
1511 | + // aren't available yet (2003-06-26) | |
1512 | + memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
1513 | + } | |
1514 | + else | |
1515 | + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { | |
1516 | + asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); | |
1517 | + } | |
1518 | +#else | |
1519 | + memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
1520 | +#endif | |
1521 | +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); | |
1522 | +} | |
1523 | +#else | |
1524 | +void clear_blocks_dcbz128_ppc(DCTELEM *blocks) | |
1525 | +{ | |
1526 | + memset(blocks, 0, sizeof(DCTELEM)*6*64); | |
1527 | +} | |
1528 | +#endif | |
1529 | + | |
1530 | +#ifndef NO_DCBZL | |
1531 | +/* check how many bytes are set to 0 by one dcbzl */ | |
1532 | +/* update 24/06/2003: replaced dcbz with dcbzl to get | |
1533 | + the intended effect (Apple "fixed" dcbz); | |
1534 | + unfortunately this cannot be used unless the assembler | |
1535 | + knows about dcbzl ... */ | |
1536 | +long check_dcbzl_effect(void) | |
1537 | +{ | |
1538 | + register char *fakedata = (char*)av_malloc(1024); | |
1539 | + register char *fakedata_middle; | |
1540 | + register long zero = 0; | |
1541 | + register long i = 0; | |
1542 | + long count = 0; | |
1543 | + | |
1544 | + if (!fakedata) | |
1545 | + { | |
1546 | + return 0L; | |
1547 | + } | |
1548 | + | |
1549 | + fakedata_middle = (fakedata + 512); | |
1550 | + | |
1551 | + memset(fakedata, 0xFF, 1024); | |
1552 | + | |
1553 | + /* below, the constraint "b" seems to mean "address base register" | |
1554 | + in gcc-3.3 / RS/6000 speak; it seems to avoid using r0, so.... */ | |
1555 | + asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); | |
1556 | + | |
1557 | + for (i = 0; i < 1024 ; i ++) | |
1558 | + { | |
1559 | + if (fakedata[i] == (char)0) | |
1560 | + count++; | |
1561 | + } | |
1562 | + | |
1563 | + av_free(fakedata); | |
1564 | + | |
1565 | + return count; | |
1566 | +} | |
1567 | +#else | |
1568 | +long check_dcbzl_effect(void) | |
1569 | +{ | |
1570 | + return 0; | |
1571 | +} | |
1572 | +#endif | |
1573 | + | |
1574 | +void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) | |
1575 | +{ | |
1576 | + // Common optimizations whether AltiVec is available or not | |
1577 | + | |
1578 | + switch (check_dcbzl_effect()) { | |
1579 | + case 32: | |
1580 | + c->clear_blocks = clear_blocks_dcbz32_ppc; | |
1581 | + break; | |
1582 | + case 128: | |
1583 | + c->clear_blocks = clear_blocks_dcbz128_ppc; | |
1584 | + break; | |
1585 | + default: | |
1586 | + break; | |
1587 | + } | |
1588 | + | |
1589 | +#if HAVE_ALTIVEC | |
1590 | + if (has_altivec()) { | |
1591 | + mm_flags |= MM_ALTIVEC; | |
1592 | + | |
1593 | + // AltiVec-specific optimisations | |
1594 | + c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec; | |
1595 | + c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec; | |
1596 | + c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec; | |
1597 | + c->pix_abs16x16 = pix_abs16x16_altivec; | |
1598 | + c->pix_abs8x8 = pix_abs8x8_altivec; | |
1599 | + c->sad[0]= sad16x16_altivec; | |
1600 | + c->sad[1]= sad8x8_altivec; | |
1601 | + c->pix_norm1 = pix_norm1_altivec; | |
1602 | + c->sse[1]= sse8_altivec; | |
1603 | + c->sse[0]= sse16_altivec; | |
1604 | + c->pix_sum = pix_sum_altivec; | |
1605 | + c->diff_pixels = diff_pixels_altivec; | |
1606 | + c->get_pixels = get_pixels_altivec; | |
1607 | +// next one disabled as it's untested. | |
1608 | +#if 0 | |
1609 | + c->add_bytes= add_bytes_altivec; | |
1610 | +#endif /* 0 */ | |
1611 | + c->put_pixels_tab[0][0] = put_pixels16_altivec; | |
1612 | + /* the two functions do the same thing, so use the same code */ | |
1613 | + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; | |
1614 | + c->avg_pixels_tab[0][0] = avg_pixels16_altivec; | |
1615 | +// next one disabled as it's untested. | |
1616 | +#if 0 | |
1617 | + c->avg_pixels_tab[1][0] = avg_pixels8_altivec; | |
1618 | +#endif /* 0 */ | |
1619 | + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; | |
1620 | + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; | |
1621 | + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; | |
1622 | + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; | |
1623 | + | |
1624 | + c->gmc1 = gmc1_altivec; | |
1625 | + | |
1626 | + if ((avctx->idct_algo == FF_IDCT_AUTO) || | |
1627 | + (avctx->idct_algo == FF_IDCT_ALTIVEC)) | |
1628 | + { | |
1629 | + c->idct_put = idct_put_altivec; | |
1630 | + c->idct_add = idct_add_altivec; | |
1631 | +#ifndef ALTIVEC_USE_REFERENCE_C_CODE | |
1632 | + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |
1633 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1634 | + c->idct_permutation_type = FF_NO_IDCT_PERM; | |
1635 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1636 | + } | |
1637 | + | |
1638 | +#ifdef POWERPC_PERFORMANCE_REPORT | |
1639 | + { | |
1640 | + int i, j; | |
1641 | + for (i = 0 ; i < powerpc_perf_total ; i++) | |
1642 | + { | |
1643 | + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) | |
1644 | + { | |
1645 | + perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF; | |
1646 | + perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000; | |
1647 | + perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000; | |
1648 | + perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000; | |
1649 | + } | |
1650 | + } | |
1651 | + } | |
1652 | +#endif /* POWERPC_PERFORMANCE_REPORT */ | |
1653 | + } else | |
1654 | +#endif /* HAVE_ALTIVEC */ | |
1655 | + { | |
1656 | + // Non-AltiVec PPC optimisations | |
1657 | + | |
1658 | + // ... pending ... | |
1659 | + } | |
1660 | +} | |
1661 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c | |
1662 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c 1970-01-01 01:00:00.000000000 +0100 | |
1663 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c 2003-09-28 17:26:40.000000000 +0200 | |
1664 | @@ -0,0 +1,247 @@ | |
1665 | +/* | |
1666 | + * FFT/IFFT transforms | |
1667 | + * AltiVec-enabled | |
1668 | + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> | |
1669 | + * Based on code Copyright (c) 2002 Fabrice Bellard. | |
1670 | + * | |
1671 | + * This library is free software; you can redistribute it and/or | |
1672 | + * modify it under the terms of the GNU Lesser General Public | |
1673 | + * License as published by the Free Software Foundation; either | |
1674 | + * version 2 of the License, or (at your option) any later version. | |
1675 | + * | |
1676 | + * This library is distributed in the hope that it will be useful, | |
1677 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1678 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
1679 | + * Lesser General Public License for more details. | |
1680 | + * | |
1681 | + * You should have received a copy of the GNU Lesser General Public | |
1682 | + * License along with this library; if not, write to the Free Software | |
1683 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
1684 | + */ | |
1685 | +#include "../dsputil.h" | |
1686 | + | |
1687 | +#include "gcc_fixes.h" | |
1688 | + | |
1689 | +#include "dsputil_altivec.h" | |
1690 | + | |
1691 | +/* | |
1692 | + those three macros are from libavcodec/fft.c | |
1693 | + and are required for the reference C code | |
1694 | +*/ | |
1695 | +/* butterfly op */ | |
1696 | +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ | |
1697 | +{\ | |
1698 | + FFTSample ax, ay, bx, by;\ | |
1699 | + bx=pre1;\ | |
1700 | + by=pim1;\ | |
1701 | + ax=qre1;\ | |
1702 | + ay=qim1;\ | |
1703 | + pre = (bx + ax);\ | |
1704 | + pim = (by + ay);\ | |
1705 | + qre = (bx - ax);\ | |
1706 | + qim = (by - ay);\ | |
1707 | +} | |
1708 | +#define MUL16(a,b) ((a) * (b)) | |
1709 | +#define CMUL(pre, pim, are, aim, bre, bim) \ | |
1710 | +{\ | |
1711 | + pre = (MUL16(are, bre) - MUL16(aim, bim));\ | |
1712 | + pim = (MUL16(are, bim) + MUL16(bre, aim));\ | |
1713 | +} | |
1714 | + | |
1715 | + | |
1716 | +/** | |
1717 | + * Do a complex FFT with the parameters defined in fft_init(). The | |
1718 | + * input data must be permuted beforehand with the s->revtab table. No | |
1719 | + * 1.0/sqrt(n) normalization is done. | |
1720 | + * AltiVec-enabled | |
1721 | + * This code assumes that the 'z' pointer is 16-byte aligned. | |
1722 | + * It also assumes every FFTComplex is an 8-byte-aligned pair of floats. | |
1723 | + * The code is exactly the same as the SSE version, except | |
1724 | + * that successive MUL + ADD/SUB pairs have been merged into | |
1725 | + * fused multiply-adds ('vec_madd' in AltiVec). | |
1726 | + */ | |
1727 | +void fft_calc_altivec(FFTContext *s, FFTComplex *z) | |
1728 | +{ | |
1729 | +POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); | |
1730 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
1731 | + int ln = s->nbits; | |
1732 | + int j, np, np2; | |
1733 | + int nblocks, nloops; | |
1734 | + register FFTComplex *p, *q; | |
1735 | + FFTComplex *exptab = s->exptab; | |
1736 | + int l; | |
1737 | + FFTSample tmp_re, tmp_im; | |
1738 | + | |
1739 | +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); | |
1740 | + | |
1741 | + np = 1 << ln; | |
1742 | + | |
1743 | + /* pass 0 */ | |
1744 | + | |
1745 | + p=&z[0]; | |
1746 | + j=(np >> 1); | |
1747 | + do { | |
1748 | + BF(p[0].re, p[0].im, p[1].re, p[1].im, | |
1749 | + p[0].re, p[0].im, p[1].re, p[1].im); | |
1750 | + p+=2; | |
1751 | + } while (--j != 0); | |
1752 | + | |
1753 | + /* pass 1 */ | |
1754 | + | |
1755 | + | |
1756 | + p=&z[0]; | |
1757 | + j=np >> 2; | |
1758 | + if (s->inverse) { | |
1759 | + do { | |
1760 | + BF(p[0].re, p[0].im, p[2].re, p[2].im, | |
1761 | + p[0].re, p[0].im, p[2].re, p[2].im); | |
1762 | + BF(p[1].re, p[1].im, p[3].re, p[3].im, | |
1763 | + p[1].re, p[1].im, -p[3].im, p[3].re); | |
1764 | + p+=4; | |
1765 | + } while (--j != 0); | |
1766 | + } else { | |
1767 | + do { | |
1768 | + BF(p[0].re, p[0].im, p[2].re, p[2].im, | |
1769 | + p[0].re, p[0].im, p[2].re, p[2].im); | |
1770 | + BF(p[1].re, p[1].im, p[3].re, p[3].im, | |
1771 | + p[1].re, p[1].im, p[3].im, -p[3].re); | |
1772 | + p+=4; | |
1773 | + } while (--j != 0); | |
1774 | + } | |
1775 | + /* pass 2 .. ln-1 */ | |
1776 | + | |
1777 | + nblocks = np >> 3; | |
1778 | + nloops = 1 << 2; | |
1779 | + np2 = np >> 1; | |
1780 | + do { | |
1781 | + p = z; | |
1782 | + q = z + nloops; | |
1783 | + for (j = 0; j < nblocks; ++j) { | |
1784 | + BF(p->re, p->im, q->re, q->im, | |
1785 | + p->re, p->im, q->re, q->im); | |
1786 | + | |
1787 | + p++; | |
1788 | + q++; | |
1789 | + for(l = nblocks; l < np2; l += nblocks) { | |
1790 | + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); | |
1791 | + BF(p->re, p->im, q->re, q->im, | |
1792 | + p->re, p->im, tmp_re, tmp_im); | |
1793 | + p++; | |
1794 | + q++; | |
1795 | + } | |
1796 | + | |
1797 | + p += nloops; | |
1798 | + q += nloops; | |
1799 | + } | |
1800 | + nblocks = nblocks >> 1; | |
1801 | + nloops = nloops << 1; | |
1802 | + } while (nblocks != 0); | |
1803 | + | |
1804 | +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); | |
1805 | + | |
1806 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1807 | +#ifdef CONFIG_DARWIN | |
1808 | + register const vector float vczero = (const vector float)(0.); | |
1809 | +#else | |
1810 | + register const vector float vczero = (const vector float){0.,0.,0.,0.}; | |
1811 | +#endif | |
1812 | + | |
1813 | + int ln = s->nbits; | |
1814 | + int j, np, np2; | |
1815 | + int nblocks, nloops; | |
1816 | + register FFTComplex *p, *q; | |
1817 | + FFTComplex *cptr, *cptr1; | |
1818 | + int k; | |
1819 | + | |
1820 | +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); | |
1821 | + | |
1822 | + np = 1 << ln; | |
1823 | + | |
1824 | + { | |
1825 | + vector float *r, a, b, a1, c1, c2; | |
1826 | + | |
1827 | + r = (vector float *)&z[0]; | |
1828 | + | |
1829 | + c1 = vcii(p,p,n,n); | |
1830 | + | |
1831 | + if (s->inverse) | |
1832 | + { | |
1833 | + c2 = vcii(p,p,n,p); | |
1834 | + } | |
1835 | + else | |
1836 | + { | |
1837 | + c2 = vcii(p,p,p,n); | |
1838 | + } | |
1839 | + | |
1840 | + j = (np >> 2); | |
1841 | + do { | |
1842 | + a = vec_ld(0, r); | |
1843 | + a1 = vec_ld(sizeof(vector float), r); | |
1844 | + | |
1845 | + b = vec_perm(a,a,vcprmle(1,0,3,2)); | |
1846 | + a = vec_madd(a,c1,b); | |
1847 | + /* do the pass 0 butterfly */ | |
1848 | + | |
1849 | + b = vec_perm(a1,a1,vcprmle(1,0,3,2)); | |
1850 | + b = vec_madd(a1,c1,b); | |
1851 | + /* do the pass 0 butterfly */ | |
1852 | + | |
1853 | + /* multiply third by -i */ | |
1854 | + b = vec_perm(b,b,vcprmle(2,3,1,0)); | |
1855 | + | |
1856 | + /* do the pass 1 butterfly */ | |
1857 | + vec_st(vec_madd(b,c2,a), 0, r); | |
1858 | + vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r); | |
1859 | + | |
1860 | + r += 2; | |
1861 | + } while (--j != 0); | |
1862 | + } | |
1863 | + /* pass 2 .. ln-1 */ | |
1864 | + | |
1865 | + nblocks = np >> 3; | |
1866 | + nloops = 1 << 2; | |
1867 | + np2 = np >> 1; | |
1868 | + | |
1869 | + cptr1 = s->exptab1; | |
1870 | + do { | |
1871 | + p = z; | |
1872 | + q = z + nloops; | |
1873 | + j = nblocks; | |
1874 | + do { | |
1875 | + cptr = cptr1; | |
1876 | + k = nloops >> 1; | |
1877 | + do { | |
1878 | + vector float a,b,c,t1; | |
1879 | + | |
1880 | + a = vec_ld(0, (float*)p); | |
1881 | + b = vec_ld(0, (float*)q); | |
1882 | + | |
1883 | + /* complex mul */ | |
1884 | + c = vec_ld(0, (float*)cptr); | |
1885 | + /* cre*re cim*re */ | |
1886 | + t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero); | |
1887 | + c = vec_ld(sizeof(vector float), (float*)cptr); | |
1888 | + /* -cim*im cre*im */ | |
1889 | + b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1); | |
1890 | + | |
1891 | + /* butterfly */ | |
1892 | + vec_st(vec_add(a,b), 0, (float*)p); | |
1893 | + vec_st(vec_sub(a,b), 0, (float*)q); | |
1894 | + | |
1895 | + p += 2; | |
1896 | + q += 2; | |
1897 | + cptr += 4; | |
1898 | + } while (--k); | |
1899 | + | |
1900 | + p += nloops; | |
1901 | + q += nloops; | |
1902 | + } while (--j); | |
1903 | + cptr1 += nloops * 2; | |
1904 | + nblocks = nblocks >> 1; | |
1905 | + nloops = nloops << 1; | |
1906 | + } while (nblocks != 0); | |
1907 | + | |
1908 | +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); | |
1909 | + | |
1910 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
1911 | +} | |
1912 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h | |
1913 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h 2003-07-04 15:40:29.000000000 +0200 | |
1914 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h 2003-09-28 17:26:40.000000000 +0200 | |
1915 | @@ -25,7 +25,7 @@ | |
1916 | * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html | |
1917 | */ | |
1918 | ||
1919 | -static inline vector signed char my_vmrglb (vector signed char const A, | |
1920 | +static inline vector signed char ff_vmrglb (vector signed char const A, | |
1921 | vector signed char const B) | |
1922 | { | |
1923 | static const vector unsigned char lowbyte = { | |
1924 | @@ -35,7 +35,7 @@ | |
1925 | return vec_perm (A, B, lowbyte); | |
1926 | } | |
1927 | ||
1928 | -static inline vector signed short my_vmrglh (vector signed short const A, | |
1929 | +static inline vector signed short ff_vmrglh (vector signed short const A, | |
1930 | vector signed short const B) | |
1931 | { | |
1932 | static const vector unsigned char lowhalf = { | |
1933 | @@ -45,7 +45,7 @@ | |
1934 | return vec_perm (A, B, lowhalf); | |
1935 | } | |
1936 | ||
1937 | -static inline vector signed int my_vmrglw (vector signed int const A, | |
1938 | +static inline vector signed int ff_vmrglw (vector signed int const A, | |
1939 | vector signed int const B) | |
1940 | { | |
1941 | static const vector unsigned char lowword = { | |
1942 | @@ -54,27 +54,27 @@ | |
1943 | }; | |
1944 | return vec_perm (A, B, lowword); | |
1945 | } | |
1946 | -/*#define my_vmrglb my_vmrglb | |
1947 | -#define my_vmrglh my_vmrglh | |
1948 | -#define my_vmrglw my_vmrglw | |
1949 | +/*#define ff_vmrglb ff_vmrglb | |
1950 | +#define ff_vmrglh ff_vmrglh | |
1951 | +#define ff_vmrglw ff_vmrglw | |
1952 | */ | |
1953 | #undef vec_mergel | |
1954 | ||
1955 | #define vec_mergel(a1, a2) \ | |
1956 | __ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \ | |
1957 | - ((vector signed char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ | |
1958 | + ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ | |
1959 | __ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \ | |
1960 | - ((vector unsigned char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ | |
1961 | + ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ | |
1962 | __ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \ | |
1963 | - ((vector signed short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ | |
1964 | + ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ | |
1965 | __ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \ | |
1966 | - ((vector unsigned short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ | |
1967 | + ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ | |
1968 | __ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \ | |
1969 | - ((vector float) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1970 | + ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1971 | __ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \ | |
1972 | - ((vector signed int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1973 | + ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1974 | __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \ | |
1975 | - ((vector unsigned int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1976 | + ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ | |
1977 | __altivec_link_error_invalid_argument ()))))))) | |
1978 | ||
1979 | #endif | |
1980 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c | |
1981 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c 1970-01-01 01:00:00.000000000 +0100 | |
1982 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c 2003-09-28 17:26:40.000000000 +0200 | |
1983 | @@ -0,0 +1,172 @@ | |
1984 | +/* | |
1985 | + * GMC (Global Motion Compensation) | |
1986 | + * AltiVec-enabled | |
1987 | + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> | |
1988 | + * | |
1989 | + * This library is free software; you can redistribute it and/or | |
1990 | + * modify it under the terms of the GNU Lesser General Public | |
1991 | + * License as published by the Free Software Foundation; either | |
1992 | + * version 2 of the License, or (at your option) any later version. | |
1993 | + * | |
1994 | + * This library is distributed in the hope that it will be useful, | |
1995 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1996 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
1997 | + * Lesser General Public License for more details. | |
1998 | + * | |
1999 | + * You should have received a copy of the GNU Lesser General Public | |
2000 | + * License along with this library; if not, write to the Free Software | |
2001 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
2002 | + */ | |
2003 | + | |
2004 | +#include "../dsputil.h" | |
2005 | + | |
2006 | +#include "gcc_fixes.h" | |
2007 | + | |
2008 | +#include "dsputil_altivec.h" | |
2009 | + | |
2010 | +/* | |
2011 | + AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8, | |
2012 | + to preserve proper dst alignment. | |
2013 | +*/ | |
2014 | +#define GMC1_PERF_COND (h==8) | |
2015 | +void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) | |
2016 | +{ | |
2017 | +POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); | |
2018 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
2019 | + const int A=(16-x16)*(16-y16); | |
2020 | + const int B=( x16)*(16-y16); | |
2021 | + const int C=(16-x16)*( y16); | |
2022 | + const int D=( x16)*( y16); | |
2023 | + int i; | |
2024 | + | |
2025 | +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |
2026 | + | |
2027 | + for(i=0; i<h; i++) | |
2028 | + { | |
2029 | + dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8; | |
2030 | + dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8; | |
2031 | + dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8; | |
2032 | + dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8; | |
2033 | + dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8; | |
2034 | + dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8; | |
2035 | + dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8; | |
2036 | + dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8; | |
2037 | + dst+= stride; | |
2038 | + src+= stride; | |
2039 | + } | |
2040 | + | |
2041 | +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |
2042 | + | |
2043 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2044 | + const unsigned short __attribute__ ((aligned(16))) rounder_a[8] = | |
2045 | + {rounder, rounder, rounder, rounder, | |
2046 | + rounder, rounder, rounder, rounder}; | |
2047 | + const unsigned short __attribute__ ((aligned(16))) ABCD[8] = | |
2048 | + { | |
2049 | + (16-x16)*(16-y16), /* A */ | |
2050 | + ( x16)*(16-y16), /* B */ | |
2051 | + (16-x16)*( y16), /* C */ | |
2052 | + ( x16)*( y16), /* D */ | |
2053 | + 0, 0, 0, 0 /* padding */ | |
2054 | + }; | |
2055 | + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); | |
2056 | + register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); | |
2057 | + register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; | |
2058 | + register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; | |
2059 | + int i; | |
2060 | + unsigned long dst_odd = (unsigned long)dst & 0x0000000F; | |
2061 | + unsigned long src_really_odd = (unsigned long)src & 0x0000000F; | |
2062 | + | |
2063 | + | |
2064 | +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |
2065 | + | |
2066 | + tempA = vec_ld(0, (unsigned short*)ABCD); | |
2067 | + Av = vec_splat(tempA, 0); | |
2068 | + Bv = vec_splat(tempA, 1); | |
2069 | + Cv = vec_splat(tempA, 2); | |
2070 | + Dv = vec_splat(tempA, 3); | |
2071 | + | |
2072 | + rounderV = vec_ld(0, (unsigned short*)rounder_a); | |
2073 | + | |
2074 | + // we'll be able to pick up our 9 char elements | |
2075 | + // at src from those 32 bytes | |
2076 | + // we load the first batch here, as inside the loop | |
2077 | + // we can re-use 'src+stride' from one iteration | |
2078 | + // as the 'src' of the next. | |
2079 | + src_0 = vec_ld(0, src); | |
2080 | + src_1 = vec_ld(16, src); | |
2081 | + srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); | |
2082 | + | |
2083 | + if (src_really_odd != 0x0000000F) | |
2084 | + { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. | |
2085 | + srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); | |
2086 | + } | |
2087 | + else | |
2088 | + { | |
2089 | + srcvB = src_1; | |
2090 | + } | |
2091 | + srcvA = vec_mergeh(vczero, srcvA); | |
2092 | + srcvB = vec_mergeh(vczero, srcvB); | |
2093 | + | |
2094 | + for(i=0; i<h; i++) | |
2095 | + { | |
2096 | + dst_odd = (unsigned long)dst & 0x0000000F; | |
2097 | + src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; | |
2098 | + | |
2099 | + dstv = vec_ld(0, dst); | |
2100 | + | |
2101 | + // we'll be able to pick up our 9 char elements | |
2102 | + // at src + stride from those 32 bytes, | |
2103 | + // then reuse the resulting 2 vectors srcvC and srcvD | |
2104 | + // as the next srcvA and srcvB | |
2105 | + src_0 = vec_ld(stride + 0, src); | |
2106 | + src_1 = vec_ld(stride + 16, src); | |
2107 | + srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); | |
2108 | + | |
2109 | + if (src_really_odd != 0x0000000F) | |
2110 | + { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. | |
2111 | + srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); | |
2112 | + } | |
2113 | + else | |
2114 | + { | |
2115 | + srcvD = src_1; | |
2116 | + } | |
2117 | + | |
2118 | + srcvC = vec_mergeh(vczero, srcvC); | |
2119 | + srcvD = vec_mergeh(vczero, srcvD); | |
2120 | + | |
2121 | + | |
2122 | + // OK, now we (finally) do the math :-) | |
2123 | + // those four instructions replace 32 int muls & 32 int adds. | |
2124 | + // isn't AltiVec nice? | |
2125 | + tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); | |
2126 | + tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); | |
2127 | + tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); | |
2128 | + tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); | |
2129 | + | |
2130 | + srcvA = srcvC; | |
2131 | + srcvB = srcvD; | |
2132 | + | |
2133 | + tempD = vec_sr(tempD, vcsr8); | |
2134 | + | |
2135 | + dstv2 = vec_pack(tempD, (vector unsigned short)vczero); | |
2136 | + | |
2137 | + if (dst_odd) | |
2138 | + { | |
2139 | + dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); | |
2140 | + } | |
2141 | + else | |
2142 | + { | |
2143 | + dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); | |
2144 | + } | |
2145 | + | |
2146 | + vec_st(dstv2, 0, dst); | |
2147 | + | |
2148 | + dst += stride; | |
2149 | + src += stride; | |
2150 | + } | |
2151 | + | |
2152 | +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); | |
2153 | + | |
2154 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2155 | +} | |
2156 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c | |
2157 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c 1970-01-01 01:00:00.000000000 +0100 | |
2158 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c 2003-09-28 17:26:40.000000000 +0200 | |
2159 | @@ -0,0 +1,245 @@ | |
2160 | +/* | |
2161 | + * Copyright (c) 2001 Michel Lespinasse | |
2162 | + * | |
2163 | + * This library is free software; you can redistribute it and/or | |
2164 | + * modify it under the terms of the GNU Lesser General Public | |
2165 | + * License as published by the Free Software Foundation; either | |
2166 | + * version 2 of the License, or (at your option) any later version. | |
2167 | + * | |
2168 | + * This library is distributed in the hope that it will be useful, | |
2169 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
2170 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
2171 | + * Lesser General Public License for more details. | |
2172 | + * | |
2173 | + * You should have received a copy of the GNU Lesser General Public | |
2174 | + * License along with this library; if not, write to the Free Software | |
2175 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
2176 | + * | |
2177 | + */ | |
2178 | + | |
2179 | +/* | |
2180 | + * NOTE: This code is based on GPL code from the libmpeg2 project. The | |
2181 | + * author, Michel Lespinasse, has given explicit permission to release | |
2182 | + * under LGPL as part of ffmpeg. | |
2183 | + * | |
2184 | + */ | |
2185 | + | |
2186 | +/* | |
2187 | + * FFmpeg integration by Dieter Shirley | |
2188 | + * | |
2189 | + * This file is a direct copy of the AltiVec IDCT module from the libmpeg2 | |
2190 | + * project. I've deleted all of the libmpeg2-specific code, renamed the functions and | |
2191 | + * re-ordered the function parameters. The only change to the IDCT function | |
2192 | + * itself was to factor out the partial transposition, and to perform a full | |
2193 | + * transpose at the end of the function. | |
2194 | + */ | |
2195 | + | |
2196 | + | |
2197 | +#include <stdlib.h> /* malloc(), free() */ | |
2198 | +#include <string.h> | |
2199 | +#include "../dsputil.h" | |
2200 | + | |
2201 | +#include "gcc_fixes.h" | |
2202 | + | |
2203 | +#include "dsputil_altivec.h" | |
2204 | + | |
2205 | +#define vector_s16_t vector signed short | |
2206 | +#define vector_u16_t vector unsigned short | |
2207 | +#define vector_s8_t vector signed char | |
2208 | +#define vector_u8_t vector unsigned char | |
2209 | +#define vector_s32_t vector signed int | |
2210 | +#define vector_u32_t vector unsigned int | |
2211 | + | |
2212 | +#define IDCT_HALF \ | |
2213 | + /* 1st stage */ \ | |
2214 | + t1 = vec_mradds (a1, vx7, vx1 ); \ | |
2215 | + t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ | |
2216 | + t7 = vec_mradds (a2, vx5, vx3); \ | |
2217 | + t3 = vec_mradds (ma2, vx3, vx5); \ | |
2218 | + \ | |
2219 | + /* 2nd stage */ \ | |
2220 | + t5 = vec_adds (vx0, vx4); \ | |
2221 | + t0 = vec_subs (vx0, vx4); \ | |
2222 | + t2 = vec_mradds (a0, vx6, vx2); \ | |
2223 | + t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ | |
2224 | + t6 = vec_adds (t8, t3); \ | |
2225 | + t3 = vec_subs (t8, t3); \ | |
2226 | + t8 = vec_subs (t1, t7); \ | |
2227 | + t1 = vec_adds (t1, t7); \ | |
2228 | + \ | |
2229 | + /* 3rd stage */ \ | |
2230 | + t7 = vec_adds (t5, t2); \ | |
2231 | + t2 = vec_subs (t5, t2); \ | |
2232 | + t5 = vec_adds (t0, t4); \ | |
2233 | + t0 = vec_subs (t0, t4); \ | |
2234 | + t4 = vec_subs (t8, t3); \ | |
2235 | + t3 = vec_adds (t8, t3); \ | |
2236 | + \ | |
2237 | + /* 4th stage */ \ | |
2238 | + vy0 = vec_adds (t7, t1); \ | |
2239 | + vy7 = vec_subs (t7, t1); \ | |
2240 | + vy1 = vec_mradds (c4, t3, t5); \ | |
2241 | + vy6 = vec_mradds (mc4, t3, t5); \ | |
2242 | + vy2 = vec_mradds (c4, t4, t0); \ | |
2243 | + vy5 = vec_mradds (mc4, t4, t0); \ | |
2244 | + vy3 = vec_adds (t2, t6); \ | |
2245 | + vy4 = vec_subs (t2, t6); | |
2246 | + | |
2247 | + | |
2248 | +#define IDCT \ | |
2249 | + vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ | |
2250 | + vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ | |
2251 | + vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ | |
2252 | + vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ | |
2253 | + vector_u16_t shift; \ | |
2254 | + \ | |
2255 | + c4 = vec_splat (constants[0], 0); \ | |
2256 | + a0 = vec_splat (constants[0], 1); \ | |
2257 | + a1 = vec_splat (constants[0], 2); \ | |
2258 | + a2 = vec_splat (constants[0], 3); \ | |
2259 | + mc4 = vec_splat (constants[0], 4); \ | |
2260 | + ma2 = vec_splat (constants[0], 5); \ | |
2261 | + bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ | |
2262 | + \ | |
2263 | + zero = vec_splat_s16 (0); \ | |
2264 | + shift = vec_splat_u16 (4); \ | |
2265 | + \ | |
2266 | + vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ | |
2267 | + vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ | |
2268 | + vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ | |
2269 | + vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ | |
2270 | + vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ | |
2271 | + vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ | |
2272 | + vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ | |
2273 | + vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ | |
2274 | + \ | |
2275 | + IDCT_HALF \ | |
2276 | + \ | |
2277 | + vx0 = vec_mergeh (vy0, vy4); \ | |
2278 | + vx1 = vec_mergel (vy0, vy4); \ | |
2279 | + vx2 = vec_mergeh (vy1, vy5); \ | |
2280 | + vx3 = vec_mergel (vy1, vy5); \ | |
2281 | + vx4 = vec_mergeh (vy2, vy6); \ | |
2282 | + vx5 = vec_mergel (vy2, vy6); \ | |
2283 | + vx6 = vec_mergeh (vy3, vy7); \ | |
2284 | + vx7 = vec_mergel (vy3, vy7); \ | |
2285 | + \ | |
2286 | + vy0 = vec_mergeh (vx0, vx4); \ | |
2287 | + vy1 = vec_mergel (vx0, vx4); \ | |
2288 | + vy2 = vec_mergeh (vx1, vx5); \ | |
2289 | + vy3 = vec_mergel (vx1, vx5); \ | |
2290 | + vy4 = vec_mergeh (vx2, vx6); \ | |
2291 | + vy5 = vec_mergel (vx2, vx6); \ | |
2292 | + vy6 = vec_mergeh (vx3, vx7); \ | |
2293 | + vy7 = vec_mergel (vx3, vx7); \ | |
2294 | + \ | |
2295 | + vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ | |
2296 | + vx1 = vec_mergel (vy0, vy4); \ | |
2297 | + vx2 = vec_mergeh (vy1, vy5); \ | |
2298 | + vx3 = vec_mergel (vy1, vy5); \ | |
2299 | + vx4 = vec_mergeh (vy2, vy6); \ | |
2300 | + vx5 = vec_mergel (vy2, vy6); \ | |
2301 | + vx6 = vec_mergeh (vy3, vy7); \ | |
2302 | + vx7 = vec_mergel (vy3, vy7); \ | |
2303 | + \ | |
2304 | + IDCT_HALF \ | |
2305 | + \ | |
2306 | + shift = vec_splat_u16 (6); \ | |
2307 | + vx0 = vec_sra (vy0, shift); \ | |
2308 | + vx1 = vec_sra (vy1, shift); \ | |
2309 | + vx2 = vec_sra (vy2, shift); \ | |
2310 | + vx3 = vec_sra (vy3, shift); \ | |
2311 | + vx4 = vec_sra (vy4, shift); \ | |
2312 | + vx5 = vec_sra (vy5, shift); \ | |
2313 | + vx6 = vec_sra (vy6, shift); \ | |
2314 | + vx7 = vec_sra (vy7, shift); | |
2315 | + | |
2316 | + | |
2317 | +static const vector_s16_t constants[5] = { | |
2318 | + (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), | |
2319 | + (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), | |
2320 | + (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), | |
2321 | + (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), | |
2322 | + (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) | |
2323 | +}; | |
2324 | + | |
2325 | +void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) | |
2326 | +{ | |
2327 | +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); | |
2328 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
2329 | +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); | |
2330 | + void simple_idct_put(uint8_t *dest, int line_size, int16_t *block); | |
2331 | + simple_idct_put(dest, stride, (int16_t*)block); | |
2332 | +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); | |
2333 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2334 | + vector_u8_t tmp; | |
2335 | + | |
2336 | +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); | |
2337 | + | |
2338 | + IDCT | |
2339 | + | |
2340 | +#define COPY(dest,src) \ | |
2341 | + tmp = vec_packsu (src, src); \ | |
2342 | + vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ | |
2343 | + vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); | |
2344 | + | |
2345 | + COPY (dest, vx0) dest += stride; | |
2346 | + COPY (dest, vx1) dest += stride; | |
2347 | + COPY (dest, vx2) dest += stride; | |
2348 | + COPY (dest, vx3) dest += stride; | |
2349 | + COPY (dest, vx4) dest += stride; | |
2350 | + COPY (dest, vx5) dest += stride; | |
2351 | + COPY (dest, vx6) dest += stride; | |
2352 | + COPY (dest, vx7) | |
2353 | + | |
2354 | +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); | |
2355 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2356 | +} | |
2357 | + | |
2358 | +void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) | |
2359 | +{ | |
2360 | +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); | |
2361 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
2362 | +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); | |
2363 | + void simple_idct_add(uint8_t *dest, int line_size, int16_t *block); | |
2364 | + simple_idct_add(dest, stride, (int16_t*)block); | |
2365 | +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); | |
2366 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2367 | + vector_u8_t tmp; | |
2368 | + vector_s16_t tmp2, tmp3; | |
2369 | + vector_u8_t perm0; | |
2370 | + vector_u8_t perm1; | |
2371 | + vector_u8_t p0, p1, p; | |
2372 | + | |
2373 | +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); | |
2374 | + | |
2375 | + IDCT | |
2376 | + | |
2377 | + p0 = vec_lvsl (0, dest); | |
2378 | + p1 = vec_lvsl (stride, dest); | |
2379 | + p = vec_splat_u8 (-1); | |
2380 | + perm0 = vec_mergeh (p, p0); | |
2381 | + perm1 = vec_mergeh (p, p1); | |
2382 | + | |
2383 | +#define ADD(dest,src,perm) \ | |
2384 | + /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ | |
2385 | + tmp = vec_ld (0, dest); \ | |
2386 | + tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \ | |
2387 | + tmp3 = vec_adds (tmp2, src); \ | |
2388 | + tmp = vec_packsu (tmp3, tmp3); \ | |
2389 | + vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ | |
2390 | + vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); | |
2391 | + | |
2392 | + ADD (dest, vx0, perm0) dest += stride; | |
2393 | + ADD (dest, vx1, perm1) dest += stride; | |
2394 | + ADD (dest, vx2, perm0) dest += stride; | |
2395 | + ADD (dest, vx3, perm1) dest += stride; | |
2396 | + ADD (dest, vx4, perm0) dest += stride; | |
2397 | + ADD (dest, vx5, perm1) dest += stride; | |
2398 | + ADD (dest, vx6, perm0) dest += stride; | |
2399 | + ADD (dest, vx7, perm1) | |
2400 | + | |
2401 | +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); | |
2402 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2403 | +} | |
2404 | + | |
2405 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c | |
2406 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c 1970-01-01 01:00:00.000000000 +0100 | |
2407 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c 2003-09-28 17:26:40.000000000 +0200 | |
2408 | @@ -0,0 +1,645 @@ | |
2409 | +/* | |
2410 | + * Copyright (c) 2002 Dieter Shirley | |
2411 | + * | |
2412 | + * This library is free software; you can redistribute it and/or | |
2413 | + * modify it under the terms of the GNU Lesser General Public | |
2414 | + * License as published by the Free Software Foundation; either | |
2415 | + * version 2 of the License, or (at your option) any later version. | |
2416 | + * | |
2417 | + * This library is distributed in the hope that it will be useful, | |
2418 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
2419 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
2420 | + * Lesser General Public License for more details. | |
2421 | + * | |
2422 | + * You should have received a copy of the GNU Lesser General Public | |
2423 | + * License along with this library; if not, write to the Free Software | |
2424 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
2425 | + */ | |
2426 | + | |
2427 | +#include <stdlib.h> | |
2428 | +#include <stdio.h> | |
2429 | +#include "../dsputil.h" | |
2430 | +#include "../mpegvideo.h" | |
2431 | + | |
2432 | +#include "gcc_fixes.h" | |
2433 | + | |
2434 | +#include "dsputil_altivec.h" | |
2435 | + | |
2436 | +// Swaps two variables (used for altivec registers) | |
2437 | +#define SWAP(a,b) \ | |
2438 | +do { \ | |
2439 | + __typeof__(a) swap_temp=a; \ | |
2440 | + a=b; \ | |
2441 | + b=swap_temp; \ | |
2442 | +} while (0) | |
2443 | + | |
2444 | +// transposes a matrix consisting of four vectors with four elements each | |
2445 | +#define TRANSPOSE4(a,b,c,d) \ | |
2446 | +do { \ | |
2447 | + __typeof__(a) _trans_ach = vec_mergeh(a, c); \ | |
2448 | + __typeof__(a) _trans_acl = vec_mergel(a, c); \ | |
2449 | + __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ | |
2450 | + __typeof__(a) _trans_bdl = vec_mergel(b, d); \ | |
2451 | + \ | |
2452 | + a = vec_mergeh(_trans_ach, _trans_bdh); \ | |
2453 | + b = vec_mergel(_trans_ach, _trans_bdh); \ | |
2454 | + c = vec_mergeh(_trans_acl, _trans_bdl); \ | |
2455 | + d = vec_mergel(_trans_acl, _trans_bdl); \ | |
2456 | +} while (0) | |
2457 | + | |
2458 | +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ | |
2459 | +do { \ | |
2460 | + __typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \ | |
2461 | + __typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \ | |
2462 | + \ | |
2463 | + _A1 = vec_mergeh (a, e); \ | |
2464 | + _B1 = vec_mergel (a, e); \ | |
2465 | + _C1 = vec_mergeh (b, f); \ | |
2466 | + _D1 = vec_mergel (b, f); \ | |
2467 | + _E1 = vec_mergeh (c, g); \ | |
2468 | + _F1 = vec_mergel (c, g); \ | |
2469 | + _G1 = vec_mergeh (d, h); \ | |
2470 | + _H1 = vec_mergel (d, h); \ | |
2471 | + \ | |
2472 | + _A2 = vec_mergeh (_A1, _E1); \ | |
2473 | + _B2 = vec_mergel (_A1, _E1); \ | |
2474 | + _C2 = vec_mergeh (_B1, _F1); \ | |
2475 | + _D2 = vec_mergel (_B1, _F1); \ | |
2476 | + _E2 = vec_mergeh (_C1, _G1); \ | |
2477 | + _F2 = vec_mergel (_C1, _G1); \ | |
2478 | + _G2 = vec_mergeh (_D1, _H1); \ | |
2479 | + _H2 = vec_mergel (_D1, _H1); \ | |
2480 | + \ | |
2481 | + a = vec_mergeh (_A2, _E2); \ | |
2482 | + b = vec_mergel (_A2, _E2); \ | |
2483 | + c = vec_mergeh (_B2, _F2); \ | |
2484 | + d = vec_mergel (_B2, _F2); \ | |
2485 | + e = vec_mergeh (_C2, _G2); \ | |
2486 | + f = vec_mergel (_C2, _G2); \ | |
2487 | + g = vec_mergeh (_D2, _H2); \ | |
2488 | + h = vec_mergel (_D2, _H2); \ | |
2489 | +} while (0) | |
2490 | + | |
2491 | + | |
2492 | +// Loads a four-byte value (int or float) from the target address | |
2493 | +// into every element in the target vector. Only works if the | |
2494 | +// target address is four-byte aligned (which it always should be). | |
2495 | +#define LOAD4(vec, address) \ | |
2496 | +{ \ | |
2497 | + __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ | |
2498 | + vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ | |
2499 | + vec = vec_ld(0, _load_addr); \ | |
2500 | + vec = vec_perm(vec, vec, _perm_vec); \ | |
2501 | + vec = vec_splat(vec, 0); \ | |
2502 | +} | |
2503 | + | |
2504 | + | |
2505 | +#ifdef CONFIG_DARWIN | |
2506 | +#define FOUROF(a) (a) | |
2507 | +#else | |
2508 | +// slower, for dumb non-apple GCC | |
2509 | +#define FOUROF(a) {a,a,a,a} | |
2510 | +#endif | |
2511 | +int dct_quantize_altivec(MpegEncContext* s, | |
2512 | + DCTELEM* data, int n, | |
2513 | + int qscale, int* overflow) | |
2514 | +{ | |
2515 | + int lastNonZero; | |
2516 | + vector float row0, row1, row2, row3, row4, row5, row6, row7; | |
2517 | + vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; | |
2518 | + const vector float zero = (const vector float)FOUROF(0.); | |
2519 | + | |
2520 | + // Load the data into the row/alt vectors | |
2521 | + { | |
2522 | + vector signed short data0, data1, data2, data3, data4, data5, data6, data7; | |
2523 | + | |
2524 | + data0 = vec_ld(0, data); | |
2525 | + data1 = vec_ld(16, data); | |
2526 | + data2 = vec_ld(32, data); | |
2527 | + data3 = vec_ld(48, data); | |
2528 | + data4 = vec_ld(64, data); | |
2529 | + data5 = vec_ld(80, data); | |
2530 | + data6 = vec_ld(96, data); | |
2531 | + data7 = vec_ld(112, data); | |
2532 | + | |
2533 | + // Transpose the data before we start | |
2534 | + TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); | |
2535 | + | |
2536 | + // load the data into floating point vectors. We load | |
2537 | + // the high half of each row into the main row vectors | |
2538 | + // and the low half into the alt vectors. | |
2539 | + row0 = vec_ctf(vec_unpackh(data0), 0); | |
2540 | + alt0 = vec_ctf(vec_unpackl(data0), 0); | |
2541 | + row1 = vec_ctf(vec_unpackh(data1), 0); | |
2542 | + alt1 = vec_ctf(vec_unpackl(data1), 0); | |
2543 | + row2 = vec_ctf(vec_unpackh(data2), 0); | |
2544 | + alt2 = vec_ctf(vec_unpackl(data2), 0); | |
2545 | + row3 = vec_ctf(vec_unpackh(data3), 0); | |
2546 | + alt3 = vec_ctf(vec_unpackl(data3), 0); | |
2547 | + row4 = vec_ctf(vec_unpackh(data4), 0); | |
2548 | + alt4 = vec_ctf(vec_unpackl(data4), 0); | |
2549 | + row5 = vec_ctf(vec_unpackh(data5), 0); | |
2550 | + alt5 = vec_ctf(vec_unpackl(data5), 0); | |
2551 | + row6 = vec_ctf(vec_unpackh(data6), 0); | |
2552 | + alt6 = vec_ctf(vec_unpackl(data6), 0); | |
2553 | + row7 = vec_ctf(vec_unpackh(data7), 0); | |
2554 | + alt7 = vec_ctf(vec_unpackl(data7), 0); | |
2555 | + } | |
2556 | + | |
2557 | + // The following block could exist as a separate AltiVec DCT | |
2558 | + // function. However, if we put it inline, the DCT data can remain | |
2559 | + // in the vector local variables, as floats, which we'll use during the | |
2560 | + // quantize step... | |
2561 | + { | |
2562 | + const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); | |
2563 | + const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); | |
2564 | + const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); | |
2565 | + const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); | |
2566 | + const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); | |
2567 | + const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); | |
2568 | + const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); | |
2569 | + const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); | |
2570 | + const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); | |
2571 | + const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); | |
2572 | + const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); | |
2573 | + const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); | |
2574 | + | |
2575 | + | |
2576 | + int whichPass, whichHalf; | |
2577 | + | |
2578 | + for(whichPass = 1; whichPass<=2; whichPass++) | |
2579 | + { | |
2580 | + for(whichHalf = 1; whichHalf<=2; whichHalf++) | |
2581 | + { | |
2582 | + vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
2583 | + vector float tmp10, tmp11, tmp12, tmp13; | |
2584 | + vector float z1, z2, z3, z4, z5; | |
2585 | + | |
2586 | + tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; | |
2587 | + tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; | |
2588 | + tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; | |
2589 | + tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; | |
2590 | + tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; | |
2591 | + tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; | |
2592 | + tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; | |
2593 | + tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; | |
2594 | + | |
2595 | + tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; | |
2596 | + tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; | |
2597 | + tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; | |
2598 | + tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; | |
2599 | + | |
2600 | + | |
2601 | + // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); | |
2602 | + row0 = vec_add(tmp10, tmp11); | |
2603 | + | |
2604 | + // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); | |
2605 | + row4 = vec_sub(tmp10, tmp11); | |
2606 | + | |
2607 | + | |
2608 | + // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); | |
2609 | + z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); | |
2610 | + | |
2611 | + // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), | |
2612 | + // CONST_BITS-PASS1_BITS); | |
2613 | + row2 = vec_madd(tmp13, vec_0_765366865, z1); | |
2614 | + | |
2615 | + // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), | |
2616 | + // CONST_BITS-PASS1_BITS); | |
2617 | + row6 = vec_madd(tmp12, vec_1_847759065, z1); | |
2618 | + | |
2619 | + z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; | |
2620 | + z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; | |
2621 | + z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; | |
2622 | + z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; | |
2623 | + | |
2624 | + // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ | |
2625 | + z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); | |
2626 | + | |
2627 | + // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ | |
2628 | + z3 = vec_madd(z3, vec_1_961570560, z5); | |
2629 | + | |
2630 | + // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ | |
2631 | + z4 = vec_madd(z4, vec_0_390180644, z5); | |
2632 | + | |
2633 | + // The following adds are rolled into the multiplies above | |
2634 | + // z3 = vec_add(z3, z5); // z3 += z5; | |
2635 | + // z4 = vec_add(z4, z5); // z4 += z5; | |
2636 | + | |
2637 | + // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ | |
2638 | + // Wow! It's actually more efficient to roll this multiply | |
2639 | + // into the adds below, even though the multiply gets done twice! | |
2640 | + // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); | |
2641 | + | |
2642 | + // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ | |
2643 | + // Same with this one... | |
2644 | + // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); | |
2645 | + | |
2646 | + // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ | |
2647 | + // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); | |
2648 | + row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); | |
2649 | + | |
2650 | + // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ | |
2651 | + // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); | |
2652 | + row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); | |
2653 | + | |
2654 | + // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ | |
2655 | + // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); | |
2656 | + row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); | |
2657 | + | |
2658 | + // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ | |
2659 | + // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); | |
2660 | + row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); | |
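| + // Note: vec_0_899976223, vec_1_847759065, vec_1_961570560, vec_0_390180644 and | |
| + // vec_2_562915447 presumably hold the negated FIX_* constants (they are defined | |
| + // earlier in this file), so a single vec_madd covers each "multiply by -FIX_x | |
| + // and accumulate" step of the reference code. | |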
2661 | + | |
2662 | + // Swap the row values with the alts. If this is the first half, | |
2663 | + // this sets up the low values to be acted on in the second half. | |
2664 | + // If this is the second half, it puts the high values back in | |
2665 | + // the row values where they are expected to be when we're done. | |
2666 | + SWAP(row0, alt0); | |
2667 | + SWAP(row1, alt1); | |
2668 | + SWAP(row2, alt2); | |
2669 | + SWAP(row3, alt3); | |
2670 | + SWAP(row4, alt4); | |
2671 | + SWAP(row5, alt5); | |
2672 | + SWAP(row6, alt6); | |
2673 | + SWAP(row7, alt7); | |
2674 | + } | |
2675 | + | |
2676 | + if (whichPass == 1) | |
2677 | + { | |
2678 | + // transpose the data for the second pass | |
2679 | + | |
2680 | + // First, block transpose the upper right with lower left. | |
2681 | + SWAP(row4, alt0); | |
2682 | + SWAP(row5, alt1); | |
2683 | + SWAP(row6, alt2); | |
2684 | + SWAP(row7, alt3); | |
2685 | + | |
2686 | + // Now, transpose each block of four | |
2687 | + TRANSPOSE4(row0, row1, row2, row3); | |
2688 | + TRANSPOSE4(row4, row5, row6, row7); | |
2689 | + TRANSPOSE4(alt0, alt1, alt2, alt3); | |
2690 | + TRANSPOSE4(alt4, alt5, alt6, alt7); | |
2691 | + } | |
2692 | + } | |
2693 | + } | |
2694 | + | |
2695 | + // used after quantise step | |
2696 | + int oldBaseValue = 0; | |
2697 | + | |
2698 | + // perform the quantise step, using the floating point data | |
2699 | + // still in the row/alt registers | |
2700 | + { | |
2701 | + const int* biasAddr; | |
2702 | + const vector signed int* qmat; | |
2703 | + vector float bias, negBias; | |
2704 | + | |
2705 | + if (s->mb_intra) | |
2706 | + { | |
2707 | + vector signed int baseVector; | |
2708 | + | |
2709 | + // We must cache element 0 in the intra case | |
2710 | + // (it needs special handling). | |
2711 | + baseVector = vec_cts(vec_splat(row0, 0), 0); | |
2712 | + vec_ste(baseVector, 0, &oldBaseValue); | |
2713 | + | |
2714 | + qmat = (vector signed int*)s->q_intra_matrix[qscale]; | |
2715 | + biasAddr = &(s->intra_quant_bias); | |
2716 | + } | |
2717 | + else | |
2718 | + { | |
2719 | + qmat = (vector signed int*)s->q_inter_matrix[qscale]; | |
2720 | + biasAddr = &(s->inter_quant_bias); | |
2721 | + } | |
2722 | + | |
2723 | + // Load the bias vector (We add 0.5 to the bias so that we're | |
2724 | + // rounding when we convert to int, instead of flooring.) | |
2725 | + { | |
2726 | + vector signed int biasInt; | |
2727 | + const vector float negOneFloat = (vector float)FOUROF(-1.0f); | |
2728 | + LOAD4(biasInt, biasAddr); | |
2729 | + bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); | |
2730 | + negBias = vec_madd(bias, negOneFloat, zero); | |
2731 | + } | |
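| + // (negBias mirrors the rounding bias for negative coefficients, so that the | |
| + // truncation toward zero done later by vec_cts() rounds magnitudes symmetrically) | |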
2732 | + | |
2733 | + { | |
2734 | + vector float q0, q1, q2, q3, q4, q5, q6, q7; | |
2735 | + | |
2736 | + q0 = vec_ctf(qmat[0], QMAT_SHIFT); | |
2737 | + q1 = vec_ctf(qmat[2], QMAT_SHIFT); | |
2738 | + q2 = vec_ctf(qmat[4], QMAT_SHIFT); | |
2739 | + q3 = vec_ctf(qmat[6], QMAT_SHIFT); | |
2740 | + q4 = vec_ctf(qmat[8], QMAT_SHIFT); | |
2741 | + q5 = vec_ctf(qmat[10], QMAT_SHIFT); | |
2742 | + q6 = vec_ctf(qmat[12], QMAT_SHIFT); | |
2743 | + q7 = vec_ctf(qmat[14], QMAT_SHIFT); | |
2744 | + | |
2745 | + row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), | |
2746 | + vec_cmpgt(row0, zero)); | |
2747 | + row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), | |
2748 | + vec_cmpgt(row1, zero)); | |
2749 | + row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), | |
2750 | + vec_cmpgt(row2, zero)); | |
2751 | + row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), | |
2752 | + vec_cmpgt(row3, zero)); | |
2753 | + row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), | |
2754 | + vec_cmpgt(row4, zero)); | |
2755 | + row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), | |
2756 | + vec_cmpgt(row5, zero)); | |
2757 | + row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), | |
2758 | + vec_cmpgt(row6, zero)); | |
2759 | + row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), | |
2760 | + vec_cmpgt(row7, zero)); | |
2761 | + | |
2762 | + q0 = vec_ctf(qmat[1], QMAT_SHIFT); | |
2763 | + q1 = vec_ctf(qmat[3], QMAT_SHIFT); | |
2764 | + q2 = vec_ctf(qmat[5], QMAT_SHIFT); | |
2765 | + q3 = vec_ctf(qmat[7], QMAT_SHIFT); | |
2766 | + q4 = vec_ctf(qmat[9], QMAT_SHIFT); | |
2767 | + q5 = vec_ctf(qmat[11], QMAT_SHIFT); | |
2768 | + q6 = vec_ctf(qmat[13], QMAT_SHIFT); | |
2769 | + q7 = vec_ctf(qmat[15], QMAT_SHIFT); | |
2770 | + | |
2771 | + alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), | |
2772 | + vec_cmpgt(alt0, zero)); | |
2773 | + alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), | |
2774 | + vec_cmpgt(alt1, zero)); | |
2775 | + alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), | |
2776 | + vec_cmpgt(alt2, zero)); | |
2777 | + alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), | |
2778 | + vec_cmpgt(alt3, zero)); | |
2779 | + alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), | |
2780 | + vec_cmpgt(alt4, zero)); | |
2781 | + alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), | |
2782 | + vec_cmpgt(alt5, zero)); | |
2783 | + alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), | |
2784 | + vec_cmpgt(alt6, zero)); | |
2785 | + alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), | |
2786 | + vec_cmpgt(alt7, zero)); | |
2787 | + } | |
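| + // Per lane, the selects above compute roughly the scalar equivalent of | |
| + //   level = coef * qmat[i] / 2^QMAT_SHIFT + (coef > 0 ? bias : -bias) | |
| + // once for the row vectors (even-indexed qmat vectors) and once for the alt | |
| + // vectors (odd-indexed ones); the conversion to integers happens in the store | |
| + // step below. | |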
2788 | + | |
2789 | + | |
2790 | + } | |
2791 | + | |
2792 | + // Store the data back into the original block | |
2793 | + { | |
2794 | + vector signed short data0, data1, data2, data3, data4, data5, data6, data7; | |
2795 | + | |
2796 | + data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); | |
2797 | + data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); | |
2798 | + data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); | |
2799 | + data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); | |
2800 | + data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); | |
2801 | + data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); | |
2802 | + data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); | |
2803 | + data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); | |
2804 | + | |
2805 | + { | |
2806 | + // Clamp for overflow | |
2807 | + vector signed int max_q_int, min_q_int; | |
2808 | + vector signed short max_q, min_q; | |
2809 | + | |
2810 | + LOAD4(max_q_int, &(s->max_qcoeff)); | |
2811 | + LOAD4(min_q_int, &(s->min_qcoeff)); | |
2812 | + | |
2813 | + max_q = vec_pack(max_q_int, max_q_int); | |
2814 | + min_q = vec_pack(min_q_int, min_q_int); | |
2815 | + | |
2816 | + data0 = vec_max(vec_min(data0, max_q), min_q); | |
2817 | + data1 = vec_max(vec_min(data1, max_q), min_q); | |
2818 | + data2 = vec_max(vec_min(data2, max_q), min_q); | |
| + data3 = vec_max(vec_min(data3, max_q), min_q); | |
2819 | + data4 = vec_max(vec_min(data4, max_q), min_q); | |
2820 | + data5 = vec_max(vec_min(data5, max_q), min_q); | |
2821 | + data6 = vec_max(vec_min(data6, max_q), min_q); | |
2822 | + data7 = vec_max(vec_min(data7, max_q), min_q); | |
2823 | + } | |
2824 | + | |
2825 | + vector bool char zero_01, zero_23, zero_45, zero_67; | |
2826 | + vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67; | |
2827 | + vector signed char negOne = vec_splat_s8(-1); | |
2828 | + vector signed char* scanPtr = | |
2829 | + (vector signed char*)(s->intra_scantable.inverse); | |
2830 | + | |
2831 | + // Determine the largest non-zero index. | |
2832 | + zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero), | |
2833 | + vec_cmpeq(data1, (vector short)zero)); | |
2834 | + zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero), | |
2835 | + vec_cmpeq(data3, (vector short)zero)); | |
2836 | + zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero), | |
2837 | + vec_cmpeq(data5, (vector short)zero)); | |
2838 | + zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero), | |
2839 | + vec_cmpeq(data7, (vector short)zero)); | |
2840 | + | |
2841 | + // 64 biggest values | |
2842 | + scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01); | |
2843 | + scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23); | |
2844 | + scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45); | |
2845 | + scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67); | |
2846 | + | |
2847 | + // 32 largest values | |
2848 | + scanIndices_01 = vec_max(scanIndices_01, scanIndices_23); | |
2849 | + scanIndices_45 = vec_max(scanIndices_45, scanIndices_67); | |
2850 | + | |
2851 | + // 16 largest values | |
2852 | + scanIndices_01 = vec_max(scanIndices_01, scanIndices_45); | |
2853 | + | |
2854 | + // 8 largest values | |
2855 | + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), | |
2856 | + vec_mergel(scanIndices_01, negOne)); | |
2857 | + | |
2858 | + // 4 largest values | |
2859 | + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), | |
2860 | + vec_mergel(scanIndices_01, negOne)); | |
2861 | + | |
2862 | + // 2 largest values | |
2863 | + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), | |
2864 | + vec_mergel(scanIndices_01, negOne)); | |
2865 | + | |
2866 | + // largest value | |
2867 | + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), | |
2868 | + vec_mergel(scanIndices_01, negOne)); | |
2869 | + | |
2870 | + scanIndices_01 = vec_splat(scanIndices_01, 0); | |
2871 | + | |
2872 | + signed char lastNonZeroChar; | |
2873 | + | |
2874 | + vec_ste(scanIndices_01, 0, &lastNonZeroChar); | |
2875 | + | |
2876 | + lastNonZero = lastNonZeroChar; | |
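| + // Each byte lane started out holding the inverse-scan position of its coefficient | |
| + // (-1 where it quantised to zero), so the max-reduction above leaves the highest | |
| + // occupied scan position, i.e. the index of the last non-zero coefficient. | |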
2877 | + | |
2878 | + // While the data is still in vectors we check for the transpose IDCT permute | |
2879 | + // and handle it using the vector unit if we can. This is the permute used | |
2880 | + // by the altivec idct, so it is common when using the altivec dct. | |
2881 | + | |
2882 | + if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) | |
2883 | + { | |
2884 | + TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); | |
2885 | + } | |
2886 | + | |
2887 | + vec_st(data0, 0, data); | |
2888 | + vec_st(data1, 16, data); | |
2889 | + vec_st(data2, 32, data); | |
2890 | + vec_st(data3, 48, data); | |
2891 | + vec_st(data4, 64, data); | |
2892 | + vec_st(data5, 80, data); | |
2893 | + vec_st(data6, 96, data); | |
2894 | + vec_st(data7, 112, data); | |
2895 | + } | |
2896 | + | |
2897 | + // special handling of block[0] | |
2898 | + if (s->mb_intra) | |
2899 | + { | |
2900 | + if (!s->h263_aic) | |
2901 | + { | |
2902 | + if (n < 4) | |
2903 | + oldBaseValue /= s->y_dc_scale; | |
2904 | + else | |
2905 | + oldBaseValue /= s->c_dc_scale; | |
2906 | + } | |
2907 | + | |
2908 | + // Divide by 8, rounding the result | |
2909 | + data[0] = (oldBaseValue + 4) >> 3; | |
2910 | + } | |
2911 | + | |
2912 | + // We handled the transpose permutation above and we don't | |
2913 | + // need to permute the "no" permutation case. | |
2914 | + if ((lastNonZero > 0) && | |
2915 | + (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && | |
2916 | + (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) | |
2917 | + { | |
2918 | + ff_block_permute(data, s->dsp.idct_permutation, | |
2919 | + s->intra_scantable.scantable, lastNonZero); | |
2920 | + } | |
2921 | + | |
2922 | + return lastNonZero; | |
2923 | +} | |
2924 | +#undef FOUROF | |
2925 | + | |
2926 | +/* | |
2927 | + AltiVec version of dct_unquantize_h263 | |
2928 | + this code assumes `block' is 16-byte aligned | |
2929 | +*/ | |
2930 | +void dct_unquantize_h263_altivec(MpegEncContext *s, | |
2931 | + DCTELEM *block, int n, int qscale) | |
2932 | +{ | |
2933 | +POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); | |
2934 | + int i, level, qmul, qadd; | |
2935 | + int nCoeffs; | |
2936 | + | |
2937 | + assert(s->block_last_index[n]>=0); | |
2938 | + | |
2939 | +POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); | |
2940 | + | |
2941 | + qadd = (qscale - 1) | 1; | |
2942 | + qmul = qscale << 1; | |
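| + // H.263 inverse quantisation: level' = qmul*level + qadd for positive levels and | |
| + // qmul*level - qadd for negative ones, with qmul = 2*qscale and qadd forced odd. | |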
2943 | + | |
2944 | + if (s->mb_intra) { | |
2945 | + if (!s->h263_aic) { | |
2946 | + if (n < 4) | |
2947 | + block[0] = block[0] * s->y_dc_scale; | |
2948 | + else | |
2949 | + block[0] = block[0] * s->c_dc_scale; | |
2950 | + }else | |
2951 | + qadd = 0; | |
2952 | + i = 1; | |
2953 | + nCoeffs= 63; // does not always use the zigzag table | |
2954 | + } else { | |
2955 | + i = 0; | |
2956 | + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; | |
2957 | + } | |
2958 | + | |
2959 | +#ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
2960 | + for(;i<=nCoeffs;i++) { | |
2961 | + level = block[i]; | |
2962 | + if (level) { | |
2963 | + if (level < 0) { | |
2964 | + level = level * qmul - qadd; | |
2965 | + } else { | |
2966 | + level = level * qmul + qadd; | |
2967 | + } | |
2968 | + block[i] = level; | |
2969 | + } | |
2970 | + } | |
2971 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
2972 | + { | |
2973 | + register const vector short vczero = (const vector short)vec_splat_s16(0); | |
2974 | + short __attribute__ ((aligned(16))) qmul8[] = | |
2975 | + { | |
2976 | + qmul, qmul, qmul, qmul, | |
2977 | + qmul, qmul, qmul, qmul | |
2978 | + }; | |
2979 | + short __attribute__ ((aligned(16))) qadd8[] = | |
2980 | + { | |
2981 | + qadd, qadd, qadd, qadd, | |
2982 | + qadd, qadd, qadd, qadd | |
2983 | + }; | |
2984 | + short __attribute__ ((aligned(16))) nqadd8[] = | |
2985 | + { | |
2986 | + -qadd, -qadd, -qadd, -qadd, | |
2987 | + -qadd, -qadd, -qadd, -qadd | |
2988 | + }; | |
2989 | + register vector short blockv, qmulv, qaddv, nqaddv, temp1; | |
2990 | + register vector bool short blockv_null, blockv_neg; | |
2991 | + register short backup_0 = block[0]; | |
2992 | + register int j = 0; | |
2993 | + | |
2994 | + qmulv = vec_ld(0, qmul8); | |
2995 | + qaddv = vec_ld(0, qadd8); | |
2996 | + nqaddv = vec_ld(0, nqadd8); | |
2997 | + | |
2998 | +#if 0 // block *is* 16-byte aligned, it seems. | |
2999 | + // first make sure block[j] is 16-byte aligned | |
3000 | + for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { | |
3001 | + level = block[j]; | |
3002 | + if (level) { | |
3003 | + if (level < 0) { | |
3004 | + level = level * qmul - qadd; | |
3005 | + } else { | |
3006 | + level = level * qmul + qadd; | |
3007 | + } | |
3008 | + block[j] = level; | |
3009 | + } | |
3010 | + } | |
3011 | +#endif | |
3012 | + | |
3013 | + // vectorize all the 16-byte aligned blocks | |
3014 | + // of 8 elements | |
3015 | + for(; (j + 7) <= nCoeffs ; j+=8) | |
3016 | + { | |
3017 | + blockv = vec_ld(j << 1, block); | |
3018 | + blockv_neg = vec_cmplt(blockv, vczero); | |
3019 | + blockv_null = vec_cmpeq(blockv, vczero); | |
3020 | + // choose between +qadd or -qadd as the third operand | |
3021 | + temp1 = vec_sel(qaddv, nqaddv, blockv_neg); | |
3022 | + // multiply & add (block[i..i+7] * qmul [+-] qadd) | |
3023 | + temp1 = vec_mladd(blockv, qmulv, temp1); | |
3024 | + // put 0 where block[i..i+7] used to have 0 | |
3025 | + blockv = vec_sel(temp1, blockv, blockv_null); | |
3026 | + vec_st(blockv, j << 1, block); | |
3027 | + } | |
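| + // (the select against blockv_null is needed because a zero coefficient would | |
| + // otherwise come out as 0*qmul [+-] qadd = [+-]qadd instead of staying zero) | |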
3028 | + | |
3029 | + // if nCoeffs isn't a multiple of 8, finish the job | |
3030 | + // using good old scalar units. | |
3031 | + // (we could do it using a truncated vector, | |
3032 | + // but I'm not sure it's worth the hassle) | |
3033 | + for(; j <= nCoeffs ; j++) { | |
3034 | + level = block[j]; | |
3035 | + if (level) { | |
3036 | + if (level < 0) { | |
3037 | + level = level * qmul - qadd; | |
3038 | + } else { | |
3039 | + level = level * qmul + qadd; | |
3040 | + } | |
3041 | + block[j] = level; | |
3042 | + } | |
3043 | + } | |
3044 | + | |
3045 | + if (i == 1) | |
3046 | + { // cheat: the vector loop started at j = 0 and modified block[0], so restore it instead of special-casing the first iteration | |
3047 | + block[0] = backup_0; | |
3048 | + } | |
3049 | + } | |
3050 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
3051 | + | |
3052 | +POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); | |
3053 | +} | |
3054 | diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c | |
3055 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c 1970-01-01 01:00:00.000000000 +0100 | |
3056 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c 2003-09-28 17:26:40.000000000 +0200 | |
3057 | @@ -0,0 +1,83 @@ | |
3058 | +/*\r | |
3059 | + * Copyright (c) 2002 Dieter Shirley\r | |
3060 | + *\r | |
3061 | + * This library is free software; you can redistribute it and/or\r | |
3062 | + * modify it under the terms of the GNU Lesser General Public\r | |
3063 | + * License as published by the Free Software Foundation; either\r | |
3064 | + * version 2 of the License, or (at your option) any later version.\r | |
3065 | + *\r | |
3066 | + * This library is distributed in the hope that it will be useful,\r | |
3067 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of\r | |
3068 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r | |
3069 | + * Lesser General Public License for more details.\r | |
3070 | + *\r | |
3071 | + * You should have received a copy of the GNU Lesser General Public\r | |
3072 | + * License along with this library; if not, write to the Free Software\r | |
3073 | + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA\r | |
3074 | + */\r | |
3075 | + \r | |
3076 | +#include "../dsputil.h"\r | |
3077 | +#include "../mpegvideo.h"\r | |
3078 | +#include <time.h>\r | |
3079 | +\r | |
3080 | +#ifdef HAVE_ALTIVEC\r | |
3081 | +#include "dsputil_altivec.h"\r | |
3082 | +#endif\r | |
3083 | +\r | |
3084 | +extern int dct_quantize_altivec(MpegEncContext *s, \r | |
3085 | + DCTELEM *block, int n,\r | |
3086 | + int qscale, int *overflow);\r | |
3087 | +extern void dct_unquantize_h263_altivec(MpegEncContext *s, | |
3088 | + DCTELEM *block, int n, int qscale); | |
3089 | +\r | |
3090 | +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);\r | |
3091 | +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);\r | |
3092 | +\r | |
3093 | +\r | |
3094 | +void MPV_common_init_ppc(MpegEncContext *s)\r | |
3095 | +{\r | |
3096 | +#if HAVE_ALTIVEC\r | |
3097 | + if (has_altivec())\r | |
3098 | + {\r | |
3099 | + if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||\r | |
3100 | + (s->avctx->idct_algo == FF_IDCT_ALTIVEC))\r | |
3101 | + {\r | |
3102 | + s->dsp.idct_put = idct_put_altivec;\r | |
3103 | + s->dsp.idct_add = idct_add_altivec;\r | |
3104 | +#ifndef ALTIVEC_USE_REFERENCE_C_CODE | |
3105 | + s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;\r | |
3106 | +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
3107 | + s->dsp.idct_permutation_type = FF_NO_IDCT_PERM; | |
3108 | +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
3109 | + }\r | |
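| + // FF_TRANSPOSE_IDCT_PERM advertises that the AltiVec IDCT expects its coefficients | |
| + // in transposed order; dct_quantize_altivec (added above) checks this permutation | |
| + // type and transposes its output while still in vector registers, so no extra | |
| + // scalar permutation is needed. | |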
3110 | +\r | |
3111 | + // Test to make sure that the DCT's required alignments are met.\r | |
3112 | + if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||\r | |
3113 | + (((long)(s->q_inter_matrix) & 0x0f) != 0))\r | |
3114 | + {\r | |
3115 | + fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned "\r | |
3116 | + "to use Altivec DCT. Reverting to non-altivec version.\n");\r | |
3117 | + return;\r | |
3118 | + }\r | |
3119 | +\r | |
3120 | + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)\r | |
3121 | + {\r | |
3122 | + fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned "\r | |
3123 | + "to use Altivec DCT. Reverting to non-altivec version.\n");\r | |
3124 | + return;\r | |
3125 | + }\r | |
3126 | +\r | |
3127 | +\r | |
3128 | + if ((s->avctx->dct_algo == FF_DCT_AUTO) ||\r | |
3129 | + (s->avctx->dct_algo == FF_DCT_ALTIVEC))\r | |
3130 | + {\r | |
3131 | + s->dct_quantize = dct_quantize_altivec;\r | |
3132 | + s->dct_unquantize_h263 = dct_unquantize_h263_altivec; | |
3133 | + }\r | |
3134 | + } else\r | |
3135 | +#endif\r | |
3136 | + {\r | |
3137 | + /* Non-AltiVec PPC optimisations here */\r | |
3138 | + }\r | |
3139 | +}\r | |
3140 | +\r | |
f497b632 JB |
3141 | --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am.orig 2003-05-25 23:11:57.000000000 +0200 |
3142 | +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am 2003-11-14 01:06:03.904622008 +0100 | |
3143 | @@ -20,6 +20,6 @@ | |
3144 | ||
3145 | libavcodecppc_la_SOURCES = $(PPC_SRC) | |
3146 | ||
3147 | -AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -I$(srcdir)/../.. | |
c333e025 | 3148 | +AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -DHAVE_ALTIVEC_H -DHAVE_ALTIVEC -maltivec -mabi=altivec -I$(srcdir)/../.. |
f497b632 JB |
3149 | |
3150 | MAINTAINERCLEANFILES = Makefile.in |
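| (note: -maltivec and -mabi=altivec enable AltiVec code generation and the AltiVec | |
| ABI in gcc, -DHAVE_ALTIVEC enables the AltiVec paths in the sources added above, | |
| and -DHAVE_ALTIVEC_H indicates that <altivec.h> is available) | |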