1 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c
2 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c 1970-01-01 01:00:00.000000000 +0100
3 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c 2003-09-28 17:26:40.000000000 +0200
6 + * Copyright (c) 2002 Brian Foley
7 + * Copyright (c) 2002 Dieter Shirley
8 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
10 + * This library is free software; you can redistribute it and/or
11 + * modify it under the terms of the GNU Lesser General Public
12 + * License as published by the Free Software Foundation; either
13 + * version 2 of the License, or (at your option) any later version.
15 + * This library is distributed in the hope that it will be useful,
16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 + * Lesser General Public License for more details.
20 + * You should have received a copy of the GNU Lesser General Public
21 + * License along with this library; if not, write to the Free Software
22 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 +#include "../dsputil.h"
27 +#include "gcc_fixes.h"
29 +#include "dsputil_altivec.h"
32 +#include <sys/sysctl.h>
33 +#else /* CONFIG_DARWIN */
37 +static sigjmp_buf jmpbuf;
38 +static volatile sig_atomic_t canjump = 0;
40 +static void sigill_handler (int sig)
43 + signal (sig, SIG_DFL);
48 + siglongjmp (jmpbuf, 1);
50 +#endif /* CONFIG_DARWIN */
52 +int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
55 + int s __attribute__((aligned(16)));
56 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
57 + vector unsigned char *tv;
58 + vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
59 + vector unsigned int sad;
60 + vector signed int sumdiffs;
63 + sad = (vector unsigned int)vec_splat_u32(0);
66 + Read unaligned pixels into our vectors. The vectors are as follows:
67 + pix1v: pix1[0]-pix1[15]
68 + pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
70 + tv = (vector unsigned char *) pix1;
71 + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
73 + tv = (vector unsigned char *) &pix2[0];
74 + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
76 + tv = (vector unsigned char *) &pix2[1];
77 + pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
79 + /* Calculate the average vector */
80 + avgv = vec_avg(pix2v, pix2iv);
82 + /* Calculate a sum of abs differences vector */
83 + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
85 + /* Add each 4 pixel group together and put 4 results into sad */
86 + sad = vec_sum4s(t5, sad);
91 + /* Sum up the four partial sums, and put the result into s */
92 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
93 + sumdiffs = vec_splat(sumdiffs, 3);
94 + vec_ste(sumdiffs, 0, &s);
99 +int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
102 + int s __attribute__((aligned(16)));
103 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
104 + vector unsigned char *tv;
105 + vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
106 + vector unsigned int sad;
107 + vector signed int sumdiffs;
108 + uint8_t *pix3 = pix2 + line_size;
111 + sad = (vector unsigned int)vec_splat_u32(0);
114 + Due to the fact that pix3 = pix2 + line_size, the pix3 of one
115 + iteration becomes pix2 in the next iteration. We can use this
116 + fact to avoid a potentially expensive unaligned read each
117 + time around the loop.
118 + Read unaligned pixels into our vectors. The vectors are as follows:
119 + pix2v: pix2[0]-pix2[15]
122 + tv = (vector unsigned char *) &pix2[0];
123 + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
125 + for(i=0;i<16;i++) {
127 + Read unaligned pixels into our vectors. The vectors are as follows:
128 + pix1v: pix1[0]-pix1[15]
129 + pix3v: pix3[0]-pix3[15]
131 + tv = (vector unsigned char *) pix1;
132 + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
134 + tv = (vector unsigned char *) &pix3[0];
135 + pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
137 + /* Calculate the average vector */
138 + avgv = vec_avg(pix2v, pix3v);
140 + /* Calculate a sum of abs differences vector */
141 + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
143 + /* Add each 4 pixel group together and put 4 results into sad */
144 + sad = vec_sum4s(t5, sad);
152 + /* Sum up the four partial sums, and put the result into s */
153 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
154 + sumdiffs = vec_splat(sumdiffs, 3);
155 + vec_ste(sumdiffs, 0, &s);
159 +int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
162 + int s __attribute__((aligned(16)));
163 + uint8_t *pix3 = pix2 + line_size;
164 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
165 + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
166 + vector unsigned char *tv, avgv, t5;
167 + vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
168 + vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
169 + vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
170 + vector unsigned short avghv, avglv;
171 + vector unsigned short t1, t2, t3, t4;
172 + vector unsigned int sad;
173 + vector signed int sumdiffs;
175 + sad = (vector unsigned int)vec_splat_u32(0);
180 + Due to the fact that pix3 = pix2 + line_size, the pix3 of one
181 + iteration becomes pix2 in the next iteration. We can use this
182 + fact to avoid a potentially expensive unaligned read, as well
183 + as some splitting and vector addition, each time around the loop.
184 + Read unaligned pixels into our vectors. The vectors are as follows:
185 + pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
186 + Split the pixel vectors into shorts
188 + tv = (vector unsigned char *) &pix2[0];
189 + pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
191 + tv = (vector unsigned char *) &pix2[1];
192 + pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
194 + pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
195 + pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
196 + pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
197 + pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
198 + t1 = vec_add(pix2hv, pix2ihv);
199 + t2 = vec_add(pix2lv, pix2ilv);
201 + for(i=0;i<16;i++) {
203 + Read unaligned pixels into our vectors. The vectors are as follows:
204 + pix1v: pix1[0]-pix1[15]
205 + pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
207 + tv = (vector unsigned char *) pix1;
208 + pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
210 + tv = (vector unsigned char *) &pix3[0];
211 + pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
213 + tv = (vector unsigned char *) &pix3[1];
214 + pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
217 + Note that AltiVec does have vec_avg, but it works on vector pairs
218 + and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
219 + would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
220 + Instead, we have to split the pixel vectors into vectors of shorts
221 + and do the averaging by hand (see the scalar rounding sketch after this function).
224 + /* Split the pixel vectors into shorts */
225 + pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
226 + pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
227 + pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
228 + pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
230 + /* Do the averaging on them */
231 + t3 = vec_add(pix3hv, pix3ihv);
232 + t4 = vec_add(pix3lv, pix3ilv);
234 + avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
235 + avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
237 + /* Pack the shorts back into a result */
238 + avgv = vec_pack(avghv, avglv);
240 + /* Calculate a sum of abs differences vector */
241 + t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
243 + /* Add each 4 pixel group together and put 4 results into sad */
244 + sad = vec_sum4s(t5, sad);
248 + /* Transfer the calculated values for pix3 into pix2 */
252 + /* Sum up the four partial sums, and put the result into s */
253 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
254 + sumdiffs = vec_splat(sumdiffs, 3);
255 + vec_ste(sumdiffs, 0, &s);
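To make the rounding caveat from the comment in this function concrete, here is the same arithmetic in scalar form (a minimal sketch, illustrative only, not part of the patch):

    /* avg as AltiVec's vec_avg does it: round to nearest, ties rounded up */
    static unsigned avg2(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

    /* nested averaging: avg2(avg2(3,0), avg2(0,1)) = avg2(2,1) = 2       */
    /* widened form:     (3 + 0 + 0 + 1 + 2) >> 2               = 1       */
    /* so the (t1 + t3 + 2) >> 2 computed on shorts above matches the     */
    /* exact (a+b+c+d+2)>>2 definition, while nested vec_avg would not.   */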
260 +int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
263 + int s __attribute__((aligned(16)));
264 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
265 + vector unsigned char perm1, perm2, *pix1v, *pix2v;
266 + vector unsigned char t1, t2, t3,t4, t5;
267 + vector unsigned int sad;
268 + vector signed int sumdiffs;
270 + sad = (vector unsigned int)vec_splat_u32(0);
273 + for(i=0;i<16;i++) {
274 + /* Read potentially unaligned pixels into t1 and t2 */
275 + perm1 = vec_lvsl(0, pix1);
276 + pix1v = (vector unsigned char *) pix1;
277 + perm2 = vec_lvsl(0, pix2);
278 + pix2v = (vector unsigned char *) pix2;
279 + t1 = vec_perm(pix1v[0], pix1v[1], perm1);
280 + t2 = vec_perm(pix2v[0], pix2v[1], perm2);
282 + /* Calculate a sum of abs differences vector */
283 + t3 = vec_max(t1, t2);
284 + t4 = vec_min(t1, t2);
285 + t5 = vec_sub(t3, t4);
287 + /* Add each 4 pixel group together and put 4 results into sad */
288 + sad = vec_sum4s(t5, sad);
294 + /* Sum up the four partial sums, and put the result into s */
295 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
296 + sumdiffs = vec_splat(sumdiffs, 3);
297 + vec_ste(sumdiffs, 0, &s);
302 +int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
305 + int s __attribute__((aligned(16)));
306 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
307 + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
308 + vector unsigned char t1, t2, t3,t4, t5;
309 + vector unsigned int sad;
310 + vector signed int sumdiffs;
312 + sad = (vector unsigned int)vec_splat_u32(0);
314 + permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
317 + /* Read potentially unaligned pixels into t1 and t2
318 + Since we're reading 16 pixels, and actually only want 8,
319 + mask out the last 8 pixels. The 0s don't change the sum. */
320 + perm1 = vec_lvsl(0, pix1);
321 + pix1v = (vector unsigned char *) pix1;
322 + perm2 = vec_lvsl(0, pix2);
323 + pix2v = (vector unsigned char *) pix2;
324 + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
325 + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
327 + /* Calculate a sum of abs differences vector */
328 + t3 = vec_max(t1, t2);
329 + t4 = vec_min(t1, t2);
330 + t5 = vec_sub(t3, t4);
332 + /* Add each 4 pixel group together and put 4 results into sad */
333 + sad = vec_sum4s(t5, sad);
339 + /* Sum up the four partial sums, and put the result into s */
340 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
341 + sumdiffs = vec_splat(sumdiffs, 3);
342 + vec_ste(sumdiffs, 0, &s);
347 +int pix_norm1_altivec(uint8_t *pix, int line_size)
350 + int s __attribute__((aligned(16)));
351 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
352 + vector unsigned char *tv;
353 + vector unsigned char pixv;
354 + vector unsigned int sv;
355 + vector signed int sum;
357 + sv = (vector unsigned int)vec_splat_u32(0);
360 + for (i = 0; i < 16; i++) {
361 + /* Read in the potentially unaligned pixels */
362 + tv = (vector unsigned char *) pix;
363 + pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
365 + /* Square the values, and add them to our sum */
366 + sv = vec_msum(pixv, pixv, sv);
370 + /* Sum up the four partial sums, and put the result into s */
371 + sum = vec_sums((vector signed int) sv, (vector signed int) zero);
372 + sum = vec_splat(sum, 3);
373 + vec_ste(sum, 0, &s);
379 + * Sum of Squared Errors for an 8x8 block.
380 + * AltiVec-enhanced.
381 + * It's the pix_abs8x8_altivec code above w/ squaring added.
383 +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
386 + int s __attribute__((aligned(16)));
387 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
388 + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
389 + vector unsigned char t1, t2, t3,t4, t5;
390 + vector unsigned int sum;
391 + vector signed int sumsqr;
393 + sum = (vector unsigned int)vec_splat_u32(0);
395 + permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
399 + /* Read potentially unaligned pixels into t1 and t2
400 + Since we're reading 16 pixels, and actually only want 8,
401 + mask out the last 8 pixels. The 0s don't change the sum. */
402 + perm1 = vec_lvsl(0, pix1);
403 + pix1v = (vector unsigned char *) pix1;
404 + perm2 = vec_lvsl(0, pix2);
405 + pix2v = (vector unsigned char *) pix2;
406 + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
407 + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
410 + Since we want to use unsigned chars, we can take advantage
411 + of the fact that abs(a-b)^2 = (a-b)^2.
414 + /* Calculate abs differences vector */
415 + t3 = vec_max(t1, t2);
416 + t4 = vec_min(t1, t2);
417 + t5 = vec_sub(t3, t4);
419 + /* Square the values and add them to our sum */
420 + sum = vec_msum(t5, t5, sum);
426 + /* Sum up the four partial sums, and put the result into s */
427 + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
428 + sumsqr = vec_splat(sumsqr, 3);
429 + vec_ste(sumsqr, 0, &s);
435 + * Sum of Squared Errors for a 16x16 block.
436 + * AltiVec-enhanced.
437 + * It's the pix_abs16x16_altivec code above w/ squaring added.
439 +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
442 + int s __attribute__((aligned(16)));
443 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
444 + vector unsigned char perm1, perm2, *pix1v, *pix2v;
445 + vector unsigned char t1, t2, t3,t4, t5;
446 + vector unsigned int sum;
447 + vector signed int sumsqr;
449 + sum = (vector unsigned int)vec_splat_u32(0);
451 + for(i=0;i<16;i++) {
452 + /* Read potentially unaligned pixels into t1 and t2 */
453 + perm1 = vec_lvsl(0, pix1);
454 + pix1v = (vector unsigned char *) pix1;
455 + perm2 = vec_lvsl(0, pix2);
456 + pix2v = (vector unsigned char *) pix2;
457 + t1 = vec_perm(pix1v[0], pix1v[1], perm1);
458 + t2 = vec_perm(pix2v[0], pix2v[1], perm2);
461 + Since we want to use unsigned chars, we can take advantage
462 + of the fact that abs(a-b)^2 = (a-b)^2.
465 + /* Calculate abs differences vector */
466 + t3 = vec_max(t1, t2);
467 + t4 = vec_min(t1, t2);
468 + t5 = vec_sub(t3, t4);
470 + /* Square the values and add them to our sum */
471 + sum = vec_msum(t5, t5, sum);
477 + /* Sum up the four partial sums, and put the result into s */
478 + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
479 + sumsqr = vec_splat(sumsqr, 3);
480 + vec_ste(sumsqr, 0, &s);
485 +int pix_sum_altivec(uint8_t * pix, int line_size)
487 + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
488 + vector unsigned char perm, *pixv;
489 + vector unsigned char t1;
490 + vector unsigned int sad;
491 + vector signed int sumdiffs;
494 + int s __attribute__((aligned(16)));
496 + sad = (vector unsigned int)vec_splat_u32(0);
498 + for (i = 0; i < 16; i++) {
499 + /* Read the potentially unaligned 16 pixels into t1 */
500 + perm = vec_lvsl(0, pix);
501 + pixv = (vector unsigned char *) pix;
502 + t1 = vec_perm(pixv[0], pixv[1], perm);
504 + /* Add each 4 pixel group together and put 4 results into sad */
505 + sad = vec_sum4s(t1, sad);
510 + /* Sum up the four partial sums, and put the result into s */
511 + sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
512 + sumdiffs = vec_splat(sumdiffs, 3);
513 + vec_ste(sumdiffs, 0, &s);
518 +void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
521 + vector unsigned char perm, bytes, *pixv;
522 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
523 + vector signed short shorts;
527 + // Read potentially unaligned pixels.
528 + // We're reading 16 pixels, and actually only want 8,
529 + // but we simply ignore the extras.
530 + perm = vec_lvsl(0, pixels);
531 + pixv = (vector unsigned char *) pixels;
532 + bytes = vec_perm(pixv[0], pixv[1], perm);
534 + // convert the bytes into shorts
535 + shorts = (vector signed short)vec_mergeh(zero, bytes);
537 + // save the data to the block; we assume the block is 16-byte aligned
538 + vec_st(shorts, i*16, (vector signed short*)block);
540 + pixels += line_size;
544 +void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
545 + const uint8_t *s2, int stride)
548 + vector unsigned char perm, bytes, *pixv;
549 + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
550 + vector signed short shorts1, shorts2;
554 + // Read potentially unaligned pixels
555 + // We're reading 16 pixels, and actually only want 8,
556 + // but we simply ignore the extras.
557 + perm = vec_lvsl(0, s1);
558 + pixv = (vector unsigned char *) s1;
559 + bytes = vec_perm(pixv[0], pixv[1], perm);
561 + // convert the bytes into shorts
562 + shorts1 = (vector signed short)vec_mergeh(zero, bytes);
564 + // Do the same for the second block of pixels
565 + perm = vec_lvsl(0, s2);
566 + pixv = (vector unsigned char *) s2;
567 + bytes = vec_perm(pixv[0], pixv[1], perm);
569 + // convert the bytes into shorts
570 + shorts2 = (vector signed short)vec_mergeh(zero, bytes);
572 + // Do the subtraction
573 + shorts1 = vec_sub(shorts1, shorts2);
575 + // save the data to the block; we assume the block is 16-byte aligned
576 + vec_st(shorts1, 0, (vector signed short*)block);
583 + // The code below is a copy of the code above... This is a manual unroll.
586 + // Read potentially unaligned pixels
587 + // We're reading 16 pixels, and actually only want 8,
588 + // but we simply ignore the extras.
589 + perm = vec_lvsl(0, s1);
590 + pixv = (vector unsigned char *) s1;
591 + bytes = vec_perm(pixv[0], pixv[1], perm);
593 + // convert the bytes into shorts
594 + shorts1 = (vector signed short)vec_mergeh(zero, bytes);
596 + // Do the same for the second block of pixels
597 + perm = vec_lvsl(0, s2);
598 + pixv = (vector unsigned char *) s2;
599 + bytes = vec_perm(pixv[0], pixv[1], perm);
601 + // convert the bytes into shorts
602 + shorts2 = (vector signed short)vec_mergeh(zero, bytes);
604 + // Do the subtraction
605 + shorts1 = vec_sub(shorts1, shorts2);
607 + // save the data to the block; we assume the block is 16-byte aligned
608 + vec_st(shorts1, 0, (vector signed short*)block);
616 +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
617 + return pix_abs16x16_altivec(a,b,stride);
620 +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
621 + return pix_abs8x8_altivec(a,b,stride);
624 +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
625 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
627 + for(i=0; i+7<w; i++){
628 + dst[i+0] += src[i+0];
629 + dst[i+1] += src[i+1];
630 + dst[i+2] += src[i+2];
631 + dst[i+3] += src[i+3];
632 + dst[i+4] += src[i+4];
633 + dst[i+5] += src[i+5];
634 + dst[i+6] += src[i+6];
635 + dst[i+7] += src[i+7];
638 + dst[i+0] += src[i+0];
639 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
641 + register vector unsigned char vdst, vsrc;
643 + /* dst and src are 16-byte aligned (guaranteed) */
644 + for(i = 0 ; (i + 15) < w ; i++)
646 + vdst = vec_ld(i << 4, (unsigned char*)dst);
647 + vsrc = vec_ld(i << 4, (unsigned char*)src);
648 + vdst = vec_add(vsrc, vdst);
649 + vec_st(vdst, i << 4, (unsigned char*)dst);
651 + /* if w is not a multiple of 16 */
652 + for (; (i < w) ; i++)
656 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
659 +/* next one assumes that ((line_size % 16) == 0) */
660 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
662 +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
663 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
666 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
668 + for(i=0; i<h; i++) {
669 + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
670 + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
671 + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
672 + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
677 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
679 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
680 + register vector unsigned char pixelsv1, pixelsv2;
681 + register vector unsigned char pixelsv1B, pixelsv2B;
682 + register vector unsigned char pixelsv1C, pixelsv2C;
683 + register vector unsigned char pixelsv1D, pixelsv2D;
685 + register vector unsigned char perm = vec_lvsl(0, pixels);
687 + register int line_size_2 = line_size << 1;
688 + register int line_size_3 = line_size + line_size_2;
689 + register int line_size_4 = line_size << 2;
691 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
692 +// hand-unrolling the loop by 4 gains about 15%
693 +// minimum execution time goes from 74 to 60 cycles
694 +// it's faster than -funroll-loops, but using
695 +// -funroll-loops w/ this is bad - 74 cycles again.
696 +// all this is on a 7450, tuning for the 7450
698 + for(i=0; i<h; i++) {
699 + pixelsv1 = vec_ld(0, (unsigned char*)pixels);
700 + pixelsv2 = vec_ld(16, (unsigned char*)pixels);
701 + vec_st(vec_perm(pixelsv1, pixelsv2, perm),
702 + 0, (unsigned char*)block);
707 + for(i=0; i<h; i+=4) {
708 + pixelsv1 = vec_ld(0, (unsigned char*)pixels);
709 + pixelsv2 = vec_ld(16, (unsigned char*)pixels);
710 + pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
711 + pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
712 + pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
713 + pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
714 + pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
715 + pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
716 + vec_st(vec_perm(pixelsv1, pixelsv2, perm),
717 + 0, (unsigned char*)block);
718 + vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
719 + line_size, (unsigned char*)block);
720 + vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
721 + line_size_2, (unsigned char*)block);
722 + vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
723 + line_size_3, (unsigned char*)block);
724 + pixels+=line_size_4;
725 + block +=line_size_4;
728 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
730 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
733 +/* next one assumes that ((line_size % 16) == 0) */
734 +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
735 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
737 +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
738 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
741 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
743 + for(i=0; i<h; i++) {
744 + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
745 + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
746 + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
747 + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
752 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
754 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
755 + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
756 + register vector unsigned char perm = vec_lvsl(0, pixels);
759 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
761 + for(i=0; i<h; i++) {
762 + pixelsv1 = vec_ld(0, (unsigned char*)pixels);
763 + pixelsv2 = vec_ld(16, (unsigned char*)pixels);
764 + blockv = vec_ld(0, block);
765 + pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
766 + blockv = vec_avg(blockv,pixelsv);
767 + vec_st(blockv, 0, (unsigned char*)block);
772 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
774 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
777 +/* next one assumes that ((line_size % 8) == 0) */
778 +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
780 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
781 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
783 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
784 + for (i = 0; i < h; i++) {
785 + *((uint32_t *) (block)) =
786 + (((*((uint32_t *) (block))) |
787 + ((((const struct unaligned_32 *) (pixels))->l))) -
788 + ((((*((uint32_t *) (block))) ^
789 + ((((const struct unaligned_32 *) (pixels))->
790 + l))) & 0xFEFEFEFEUL) >> 1));
791 + *((uint32_t *) (block + 4)) =
792 + (((*((uint32_t *) (block + 4))) |
793 + ((((const struct unaligned_32 *) (pixels + 4))->l))) -
794 + ((((*((uint32_t *) (block + 4))) ^
795 + ((((const struct unaligned_32 *) (pixels +
797 + l))) & 0xFEFEFEFEUL) >> 1));
798 + pixels += line_size;
799 + block += line_size;
801 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
803 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
804 + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
807 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
809 + for (i = 0; i < h; i++) {
811 + block is 8-byte aligned, so we're either in the
812 + left block (16-byte aligned) or in the right block (not)
814 + int rightside = ((unsigned long)block & 0x0000000F);
816 + blockv = vec_ld(0, block);
817 + pixelsv1 = vec_ld(0, (unsigned char*)pixels);
818 + pixelsv2 = vec_ld(16, (unsigned char*)pixels);
819 + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
823 + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
827 + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
830 + blockv = vec_avg(blockv, pixelsv);
832 + vec_st(blockv, 0, block);
834 + pixels += line_size;
835 + block += line_size;
838 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
840 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
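The rightside test above works because an 8-byte-aligned destination always occupies exactly one half of an aligned 16-byte line, so the code loads the whole line, patches the correct half, and stores the line back. A scalar sketch of the same idea (illustrative only, not part of the patch):

    #include <string.h>
    #include <stdint.h>

    /* Write 8 result bytes into an aligned 16-byte line without touching
       the other half of that line. Since block8 is 8-byte aligned, the
       offset inside the line is either 0 (left half) or 8 (right half). */
    static void store8_via_line(uint8_t *block8, const uint8_t result[8])
    {
        uint8_t *line = (uint8_t *)((uintptr_t)block8 & ~(uintptr_t)15);
        uint8_t tmp[16];
        memcpy(tmp, line, 16);                              /* blockv = vec_ld(0, block) */
        memcpy(tmp + ((uintptr_t)block8 & 15), result, 8);  /* the vec_perm merge step   */
        memcpy(line, tmp, 16);                              /* vec_st(blockv, 0, block)  */
    }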
843 +/* next one assumes that ((line_size % 8) == 0) */
844 +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
846 +POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
847 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
849 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
850 + for (j = 0; j < 2; j++) {
852 + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
854 + (((const struct unaligned_32 *) (pixels + 1))->l);
856 + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
858 + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
860 + pixels += line_size;
861 + for (i = 0; i < h; i += 2) {
862 + uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
863 + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
864 + l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
865 + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
866 + *((uint32_t *) block) =
867 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
868 + pixels += line_size;
869 + block += line_size;
870 + a = (((const struct unaligned_32 *) (pixels))->l);
871 + b = (((const struct unaligned_32 *) (pixels + 1))->l);
872 + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
873 + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
874 + *((uint32_t *) block) =
875 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
876 + pixels += line_size;
877 + block += line_size;
878 + } pixels += 4 - line_size * (h + 1);
879 + block += 4 - line_size * h;
882 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
884 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
886 + register vector unsigned char
887 + pixelsv1, pixelsv2,
889 + register vector unsigned char
890 + blockv, temp1, temp2;
891 + register vector unsigned short
892 + pixelssum1, pixelssum2, temp3;
893 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
894 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
896 + temp1 = vec_ld(0, pixels);
897 + temp2 = vec_ld(16, pixels);
898 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
899 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
905 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
907 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
908 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
909 + pixelssum1 = vec_add((vector unsigned short)pixelsv1,
910 + (vector unsigned short)pixelsv2);
911 + pixelssum1 = vec_add(pixelssum1, vctwo);
913 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
914 + for (i = 0; i < h ; i++) {
915 + int rightside = ((unsigned long)block & 0x0000000F);
916 + blockv = vec_ld(0, block);
918 + temp1 = vec_ld(line_size, pixels);
919 + temp2 = vec_ld(line_size + 16, pixels);
920 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
921 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
927 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
930 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
931 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
932 + pixelssum2 = vec_add((vector unsigned short)pixelsv1,
933 + (vector unsigned short)pixelsv2);
934 + temp3 = vec_add(pixelssum1, pixelssum2);
935 + temp3 = vec_sra(temp3, vctwo);
936 + pixelssum1 = vec_add(pixelssum2, vctwo);
937 + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
941 + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
945 + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
948 + vec_st(blockv, 0, block);
950 + block += line_size;
951 + pixels += line_size;
954 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
955 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
958 +/* next one assumes that ((line_size % 8) == 0) */
959 +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
961 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
962 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
964 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
965 + for (j = 0; j < 2; j++) {
967 + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
969 + (((const struct unaligned_32 *) (pixels + 1))->l);
971 + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
973 + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
975 + pixels += line_size;
976 + for (i = 0; i < h; i += 2) {
977 + uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
978 + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
979 + l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
980 + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
981 + *((uint32_t *) block) =
982 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
983 + pixels += line_size;
984 + block += line_size;
985 + a = (((const struct unaligned_32 *) (pixels))->l);
986 + b = (((const struct unaligned_32 *) (pixels + 1))->l);
987 + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
988 + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
989 + *((uint32_t *) block) =
990 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
991 + pixels += line_size;
992 + block += line_size;
993 + } pixels += 4 - line_size * (h + 1);
994 + block += 4 - line_size * h;
997 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
999 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1001 + register vector unsigned char
1002 + pixelsv1, pixelsv2,
1004 + register vector unsigned char
1005 + blockv, temp1, temp2;
1006 + register vector unsigned short
1007 + pixelssum1, pixelssum2, temp3;
1008 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1009 + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1010 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1012 + temp1 = vec_ld(0, pixels);
1013 + temp2 = vec_ld(16, pixels);
1014 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1015 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1021 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1023 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1024 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1025 + pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1026 + (vector unsigned short)pixelsv2);
1027 + pixelssum1 = vec_add(pixelssum1, vcone);
1029 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1030 + for (i = 0; i < h ; i++) {
1031 + int rightside = ((unsigned long)block & 0x0000000F);
1032 + blockv = vec_ld(0, block);
1034 + temp1 = vec_ld(line_size, pixels);
1035 + temp2 = vec_ld(line_size + 16, pixels);
1036 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1037 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1043 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1046 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1047 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1048 + pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1049 + (vector unsigned short)pixelsv2);
1050 + temp3 = vec_add(pixelssum1, pixelssum2);
1051 + temp3 = vec_sra(temp3, vctwo);
1052 + pixelssum1 = vec_add(pixelssum2, vcone);
1053 + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1057 + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1061 + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1064 + vec_st(blockv, 0, block);
1066 + block += line_size;
1067 + pixels += line_size;
1070 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1071 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1074 +/* next one assumes that ((line_size % 16) == 0) */
1075 +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1077 +POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1078 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1080 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1081 + for (j = 0; j < 4; j++) {
1083 + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1084 + const uint32_t b =
1085 + (((const struct unaligned_32 *) (pixels + 1))->l);
1087 + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1089 + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1091 + pixels += line_size;
1092 + for (i = 0; i < h; i += 2) {
1093 + uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1094 + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1095 + l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1096 + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1097 + *((uint32_t *) block) =
1098 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1099 + pixels += line_size;
1100 + block += line_size;
1101 + a = (((const struct unaligned_32 *) (pixels))->l);
1102 + b = (((const struct unaligned_32 *) (pixels + 1))->l);
1103 + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1104 + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1105 + *((uint32_t *) block) =
1106 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1107 + pixels += line_size;
1108 + block += line_size;
1109 + } pixels += 4 - line_size * (h + 1);
1110 + block += 4 - line_size * h;
1113 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1115 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1117 + register vector unsigned char
1118 + pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1119 + register vector unsigned char
1120 + blockv, temp1, temp2;
1121 + register vector unsigned short
1122 + pixelssum1, pixelssum2, temp3,
1123 + pixelssum3, pixelssum4, temp4;
1124 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1125 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1127 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1129 + temp1 = vec_ld(0, pixels);
1130 + temp2 = vec_ld(16, pixels);
1131 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1132 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1138 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1140 + pixelsv3 = vec_mergel(vczero, pixelsv1);
1141 + pixelsv4 = vec_mergel(vczero, pixelsv2);
1142 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1143 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1144 + pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1145 + (vector unsigned short)pixelsv4);
1146 + pixelssum3 = vec_add(pixelssum3, vctwo);
1147 + pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1148 + (vector unsigned short)pixelsv2);
1149 + pixelssum1 = vec_add(pixelssum1, vctwo);
1151 + for (i = 0; i < h ; i++) {
1152 + blockv = vec_ld(0, block);
1154 + temp1 = vec_ld(line_size, pixels);
1155 + temp2 = vec_ld(line_size + 16, pixels);
1156 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1157 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1163 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1166 + pixelsv3 = vec_mergel(vczero, pixelsv1);
1167 + pixelsv4 = vec_mergel(vczero, pixelsv2);
1168 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1169 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1171 + pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1172 + (vector unsigned short)pixelsv4);
1173 + pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1174 + (vector unsigned short)pixelsv2);
1175 + temp4 = vec_add(pixelssum3, pixelssum4);
1176 + temp4 = vec_sra(temp4, vctwo);
1177 + temp3 = vec_add(pixelssum1, pixelssum2);
1178 + temp3 = vec_sra(temp3, vctwo);
1180 + pixelssum3 = vec_add(pixelssum4, vctwo);
1181 + pixelssum1 = vec_add(pixelssum2, vctwo);
1183 + blockv = vec_packsu(temp3, temp4);
1185 + vec_st(blockv, 0, block);
1187 + block += line_size;
1188 + pixels += line_size;
1191 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1192 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1195 +/* next one assumes that ((line_size % 16) == 0) */
1196 +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1198 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1199 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1201 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1202 + for (j = 0; j < 4; j++) {
1204 + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1205 + const uint32_t b =
1206 + (((const struct unaligned_32 *) (pixels + 1))->l);
1208 + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1210 + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1212 + pixels += line_size;
1213 + for (i = 0; i < h; i += 2) {
1214 + uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1215 + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1216 + l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1217 + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1218 + *((uint32_t *) block) =
1219 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1220 + pixels += line_size;
1221 + block += line_size;
1222 + a = (((const struct unaligned_32 *) (pixels))->l);
1223 + b = (((const struct unaligned_32 *) (pixels + 1))->l);
1224 + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1225 + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1226 + *((uint32_t *) block) =
1227 + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1228 + pixels += line_size;
1229 + block += line_size;
1230 + } pixels += 4 - line_size * (h + 1);
1231 + block += 4 - line_size * h;
1234 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1236 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1238 + register vector unsigned char
1239 + pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1240 + register vector unsigned char
1241 + blockv, temp1, temp2;
1242 + register vector unsigned short
1243 + pixelssum1, pixelssum2, temp3,
1244 + pixelssum3, pixelssum4, temp4;
1245 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1246 + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1247 + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1249 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1251 + temp1 = vec_ld(0, pixels);
1252 + temp2 = vec_ld(16, pixels);
1253 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1254 + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1260 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1262 + pixelsv3 = vec_mergel(vczero, pixelsv1);
1263 + pixelsv4 = vec_mergel(vczero, pixelsv2);
1264 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1265 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1266 + pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1267 + (vector unsigned short)pixelsv4);
1268 + pixelssum3 = vec_add(pixelssum3, vcone);
1269 + pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1270 + (vector unsigned short)pixelsv2);
1271 + pixelssum1 = vec_add(pixelssum1, vcone);
1273 + for (i = 0; i < h ; i++) {
1274 + blockv = vec_ld(0, block);
1276 + temp1 = vec_ld(line_size, pixels);
1277 + temp2 = vec_ld(line_size + 16, pixels);
1278 + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1279 + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1285 + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1288 + pixelsv3 = vec_mergel(vczero, pixelsv1);
1289 + pixelsv4 = vec_mergel(vczero, pixelsv2);
1290 + pixelsv1 = vec_mergeh(vczero, pixelsv1);
1291 + pixelsv2 = vec_mergeh(vczero, pixelsv2);
1293 + pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1294 + (vector unsigned short)pixelsv4);
1295 + pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1296 + (vector unsigned short)pixelsv2);
1297 + temp4 = vec_add(pixelssum3, pixelssum4);
1298 + temp4 = vec_sra(temp4, vctwo);
1299 + temp3 = vec_add(pixelssum1, pixelssum2);
1300 + temp3 = vec_sra(temp3, vctwo);
1302 + pixelssum3 = vec_add(pixelssum4, vcone);
1303 + pixelssum1 = vec_add(pixelssum2, vcone);
1305 + blockv = vec_packsu(temp3, temp4);
1307 + vec_st(blockv, 0, block);
1309 + block += line_size;
1310 + pixels += line_size;
1313 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1314 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1317 +int has_altivec(void)
1319 +#ifdef CONFIG_DARWIN
1320 + int sels[2] = {CTL_HW, HW_VECTORUNIT};
1322 + size_t len = sizeof(has_vu);
1325 + err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1327 + if (err == 0) return (has_vu != 0);
1328 +#else /* CONFIG_DARWIN */
1329 +/* not Darwin, do it the brute-force way */
1330 +/* this is borrowed from the libmpeg2 library */
1332 + signal (SIGILL, sigill_handler);
1333 + if (sigsetjmp (jmpbuf, 1)) {
1334 + signal (SIGILL, SIG_DFL);
1338 + asm volatile ("mtspr 256, %0\n\t"
1339 + "vand %%v0, %%v0, %%v0"
1343 + signal (SIGILL, SIG_DFL);
1347 +#endif /* CONFIG_DARWIN */
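All of the SAD/SSE routines in this file lean on the same two AltiVec idioms: a misaligned 16-byte read built from aligned loads plus a vec_lvsl/vec_perm shuffle, and a horizontal reduction of four partial 32-bit sums via vec_sums, vec_splat and vec_ste. A minimal standalone sketch of both, assuming <altivec.h> and GCC vector syntax (illustrative only, not part of the patch):

    #include <altivec.h>
    #include <stdint.h>

    /* Load 16 bytes from a possibly unaligned address. */
    static inline vector unsigned char load_unaligned(const uint8_t *p)
    {
        vector unsigned char hi = vec_ld(0, p);    /* aligned load covering p      */
        vector unsigned char lo = vec_ld(15, p);   /* aligned load covering p + 15 */
        return vec_perm(hi, lo, vec_lvsl(0, p));   /* rotate the bytes into place  */
    }

    /* Collapse four partial 32-bit sums (e.g. from vec_sum4s) into an int. */
    static inline int reduce_sums(vector unsigned int acc)
    {
        int s __attribute__((aligned(16)));
        vector signed int v = vec_sums((vector signed int)acc, vec_splat_s32(0));
        v = vec_splat(v, 3);   /* vec_sums leaves the total in element 3; broadcast it */
        vec_ste(v, 0, &s);     /* store one element to the aligned scalar              */
        return s;
    }

The loads here use offsets 0 and 15 so that an already-aligned pointer never touches the following line; the patch instead dereferences tv[0] and tv[1], relying on AltiVec loads ignoring the low four address bits.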
1350 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c
1351 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c 1970-01-01 01:00:00.000000000 +0100
1352 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c 2003-09-28 17:26:40.000000000 +0200
1355 + * Copyright (c) 2002 Brian Foley
1356 + * Copyright (c) 2002 Dieter Shirley
1358 + * This library is free software; you can redistribute it and/or
1359 + * modify it under the terms of the GNU Lesser General Public
1360 + * License as published by the Free Software Foundation; either
1361 + * version 2 of the License, or (at your option) any later version.
1363 + * This library is distributed in the hope that it will be useful,
1364 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1365 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1366 + * Lesser General Public License for more details.
1368 + * You should have received a copy of the GNU Lesser General Public
1369 + * License along with this library; if not, write to the Free Software
1370 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1373 +#include "../dsputil.h"
1375 +#include "dsputil_ppc.h"
1377 +#ifdef HAVE_ALTIVEC
1378 +#include "dsputil_altivec.h"
1381 +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
1382 +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
1386 +int mm_support(void)
1390 + if (has_altivec()) {
1391 + result |= MM_ALTIVEC;
1393 +#endif /* result */
1397 +#ifdef POWERPC_PERFORMANCE_REPORT
1398 +unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
1399 +/* list below must match enum in dsputil_ppc.h */
1400 +static unsigned char* perfname[] = {
1401 + "fft_calc_altivec",
1403 + "dct_unquantize_h263_altivec",
1404 + "idct_add_altivec",
1405 + "idct_put_altivec",
1406 + "put_pixels16_altivec",
1407 + "avg_pixels16_altivec",
1408 + "avg_pixels8_altivec",
1409 + "put_pixels8_xy2_altivec",
1410 + "put_no_rnd_pixels8_xy2_altivec",
1411 + "put_pixels16_xy2_altivec",
1412 + "put_no_rnd_pixels16_xy2_altivec",
1413 + "clear_blocks_dcbz32_ppc",
1414 + "clear_blocks_dcbz128_ppc"
1419 +#ifdef POWERPC_PERFORMANCE_REPORT
1420 +void powerpc_display_perf_report(void)
1423 + fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
1424 + for(i = 0 ; i < powerpc_perf_total ; i++)
1426 + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
1428 + if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
1430 + " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
1433 + perfdata[j][i][powerpc_data_min],
1434 + perfdata[j][i][powerpc_data_max],
1435 + (double)perfdata[j][i][powerpc_data_sum] /
1436 + (double)perfdata[j][i][powerpc_data_num],
1437 + perfdata[j][i][powerpc_data_num]);
1441 +#endif /* POWERPC_PERFORMANCE_REPORT */
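perfdata itself is filled in by the POWERPC_PERF_START_COUNT/POWERPC_PERF_STOP_COUNT macros from dsputil_ppc.h, which this patch neither shows nor modifies. Purely as an assumption consistent with the report above and with the initialisation in dsputil_init_ppc below, one recorded sample would update the four slots roughly like this (hypothetical sketch; only the names are taken from the report code):

    /* Hypothetical per-sample update; 'cycles' would come from reading a PMC. */
    static void record_sample(int pmc, int func, unsigned long long cycles)
    {
        if (cycles < perfdata[pmc][func][powerpc_data_min])
            perfdata[pmc][func][powerpc_data_min] = cycles;
        if (cycles > perfdata[pmc][func][powerpc_data_max])
            perfdata[pmc][func][powerpc_data_max] = cycles;
        perfdata[pmc][func][powerpc_data_sum] += cycles;
        perfdata[pmc][func][powerpc_data_num] += 1;
    }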
1443 +/* ***** WARNING ***** WARNING ***** WARNING ***** */
1445 + clear_blocks_dcbz32_ppc will not work properly
1446 + on PowerPC processors with a cache line size
1447 + not equal to 32 bytes.
1448 + Fortunately, all processors used by Apple up to
1449 + at least the 7450 (aka second-generation G4)
1450 + use a 32-byte cache line.
1451 + This is due to the use of the 'dcbz' instruction.
1452 + It simply clears a single cache line to zero,
1453 + so you need to know the cache line size to use it!
1454 + It's absurd, but it's fast...
1456 + update 24/06/2003: Apple released the G5 yesterday,
1457 + with a PPC970. Cache line size: 128 bytes. Oops.
1458 + The semantics of dcbz were changed: it now always clears
1459 + 32 bytes. So the function below will work, but will
1460 + be slow. So I fixed check_dcbz_effect to use dcbzl,
1461 + which is defined to clear one cache line (as dcbz was before).
1462 + So we can still distinguish, and use dcbz (32 bytes)
1463 + or dcbzl (one cache line) as required.
1465 + see <http://developer.apple.com/technotes/tn/tn2087.html>
1466 + and <http://developer.apple.com/technotes/tn/tn2086.html>
1468 +void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
1470 +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
1471 + register int misal = ((unsigned long)blocks & 0x00000010);
1472 + register int i = 0;
1473 +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
1476 + ((unsigned long*)blocks)[0] = 0L;
1477 + ((unsigned long*)blocks)[1] = 0L;
1478 + ((unsigned long*)blocks)[2] = 0L;
1479 + ((unsigned long*)blocks)[3] = 0L;
1482 + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
1483 + asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
1486 + ((unsigned long*)blocks)[188] = 0L;
1487 + ((unsigned long*)blocks)[189] = 0L;
1488 + ((unsigned long*)blocks)[190] = 0L;
1489 + ((unsigned long*)blocks)[191] = 0L;
1493 + memset(blocks, 0, sizeof(DCTELEM)*6*64);
1495 +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
1498 +/* same as above, when dcbzl clears a whole 128-byte cache line,
1499 + i.e. on the PPC970 aka G5 */
1501 +void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
1503 +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
1504 + register int misal = ((unsigned long)blocks & 0x0000007f);
1505 + register int i = 0;
1506 +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
1509 + // we could probably also optimize this case,
1510 + // but there's not much point as the machines
1511 + // aren't available yet (2003-06-26)
1512 + memset(blocks, 0, sizeof(DCTELEM)*6*64);
1515 + for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
1516 + asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
1519 + memset(blocks, 0, sizeof(DCTELEM)*6*64);
1521 +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
1524 +void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
1526 + memset(blocks, 0, sizeof(DCTELEM)*6*64);
1531 +/* check dcbzl: report how many bytes are set to 0 by dcbzl */
1532 +/* update 24/06/2003: replaced dcbz with dcbzl to get
1533 + the intended effect (Apple "fixed" dcbz);
1534 + unfortunately this cannot be used unless the assembler
1535 + knows about dcbzl ... */
1536 +long check_dcbzl_effect(void)
1538 + register char *fakedata = (char*)av_malloc(1024);
1539 + register char *fakedata_middle;
1540 + register long zero = 0;
1541 + register long i = 0;
1549 + fakedata_middle = (fakedata + 512);
1551 + memset(fakedata, 0xFF, 1024);
1553 + /* below, the constraint "b" seems to mean "address base register"
1554 + in gcc-3.3 / RS/6000 speak; it seems to avoid using r0, so.... */
1555 + asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
1557 + for (i = 0; i < 1024 ; i ++)
1559 + if (fakedata[i] == (char)0)
1563 + av_free(fakedata);
1568 +long check_dcbzl_effect(void)
1574 +void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
1576 + // Common optimizations whether AltiVec is available or not
1578 + switch (check_dcbzl_effect()) {
1580 + c->clear_blocks = clear_blocks_dcbz32_ppc;
1583 + c->clear_blocks = clear_blocks_dcbz128_ppc;
1590 + if (has_altivec()) {
1591 + mm_flags |= MM_ALTIVEC;
1593 + // AltiVec-specific optimisations
1594 + c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
1595 + c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
1596 + c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
1597 + c->pix_abs16x16 = pix_abs16x16_altivec;
1598 + c->pix_abs8x8 = pix_abs8x8_altivec;
1599 + c->sad[0]= sad16x16_altivec;
1600 + c->sad[1]= sad8x8_altivec;
1601 + c->pix_norm1 = pix_norm1_altivec;
1602 + c->sse[1]= sse8_altivec;
1603 + c->sse[0]= sse16_altivec;
1604 + c->pix_sum = pix_sum_altivec;
1605 + c->diff_pixels = diff_pixels_altivec;
1606 + c->get_pixels = get_pixels_altivec;
1607 +// next one disabled as it's untested.
1609 + c->add_bytes= add_bytes_altivec;
1611 + c->put_pixels_tab[0][0] = put_pixels16_altivec;
1612 + /* the two functions do the same thing, so use the same code */
1613 + c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
1614 + c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
1615 +// next one disabled as it's untested.
1617 + c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
1619 + c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
1620 + c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
1621 + c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
1622 + c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
1624 + c->gmc1 = gmc1_altivec;
1626 + if ((avctx->idct_algo == FF_IDCT_AUTO) ||
1627 + (avctx->idct_algo == FF_IDCT_ALTIVEC))
1629 + c->idct_put = idct_put_altivec;
1630 + c->idct_add = idct_add_altivec;
1631 +#ifndef ALTIVEC_USE_REFERENCE_C_CODE
1632 + c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
1633 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1634 + c->idct_permutation_type = FF_NO_IDCT_PERM;
1635 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1638 +#ifdef POWERPC_PERFORMANCE_REPORT
1641 + for (i = 0 ; i < powerpc_perf_total ; i++)
1643 + for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
1645 + perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF;
1646 + perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000;
1647 + perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000;
1648 + perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000;
1652 +#endif /* POWERPC_PERFORMANCE_REPORT */
1654 +#endif /* HAVE_ALTIVEC */
1656 + // Non-AltiVec PPC optimisations
1658 + // ... pending ...
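The switch on check_dcbzl_effect() above boils down to a simple probe: fill a buffer with 0xFF, issue one cache-block-zero instruction in the middle, and count how many bytes came back as zero. A standalone sketch of that idea using plain dcbz, which any assembler accepts (the patch uses dcbzl precisely so a G5 reports its real 128-byte line; illustrative only, not part of the patch):

    #include <stdlib.h>
    #include <string.h>

    /* Return how many bytes a single dcbz zeroes, or -1 if malloc fails. */
    static long probe_dcbz_size(void)
    {
        char *buf = malloc(1024);
        long count = 0, i;
        if (!buf)
            return -1;
        memset(buf, 0xFF, 1024);
        /* zero the cache line containing the middle of the buffer */
        asm volatile("dcbz 0, %0" : : "r" (buf + 512) : "memory");
        for (i = 0; i < 1024; i++)
            if (buf[i] == 0)
                count++;
        free(buf);
        return count;
    }

On a G5 this plain-dcbz version would report 32 (per the dcbz note in dsputil_ppc.c above), which is exactly why check_dcbzl_effect uses dcbzl instead.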
1661 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c
1662 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c 1970-01-01 01:00:00.000000000 +0100
1663 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c 2003-09-28 17:26:40.000000000 +0200
1666 + * FFT/IFFT transforms
1668 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1669 + * Based on code Copyright (c) 2002 Fabrice Bellard.
1671 + * This library is free software; you can redistribute it and/or
1672 + * modify it under the terms of the GNU Lesser General Public
1673 + * License as published by the Free Software Foundation; either
1674 + * version 2 of the License, or (at your option) any later version.
1676 + * This library is distributed in the hope that it will be useful,
1677 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1678 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1679 + * Lesser General Public License for more details.
1681 + * You should have received a copy of the GNU Lesser General Public
1682 + * License along with this library; if not, write to the Free Software
1683 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1685 +#include "../dsputil.h"
1687 +#include "gcc_fixes.h"
1689 +#include "dsputil_altivec.h"
1692 + those three macros are from libavcodec/fft.c
1693 + and are required for the reference C code
1695 +/* butterfly op */
1696 +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
1698 + FFTSample ax, ay, bx, by;\
1708 +#define MUL16(a,b) ((a) * (b))
1709 +#define CMUL(pre, pim, are, aim, bre, bim) \
1711 + pre = (MUL16(are, bre) - MUL16(aim, bim));\
1712 + pim = (MUL16(are, bim) + MUL16(bre, aim));\
1717 + * Do a complex FFT with the parameters defined in fft_init(). The
1718 + * input data must be permuted beforehand with the s->revtab table. No
1719 + * 1.0/sqrt(n) normalization is done.
1721 + * This code assumes that the 'z' pointer is 16-byte aligned.
1722 + * It also assumes each FFTComplex is an 8-byte-aligned pair of floats.
1723 + * The code is exactly the same as the SSE version, except
1724 + * that successive MUL + ADD/SUB pairs have been merged into
1725 + * fused multiply-adds ('vec_madd' in AltiVec).
1727 +void fft_calc_altivec(FFTContext *s, FFTComplex *z)
1729 +POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
1730 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1731 + int ln = s->nbits;
1733 + int nblocks, nloops;
1734 + register FFTComplex *p, *q;
1735 + FFTComplex *exptab = s->exptab;
1737 + FFTSample tmp_re, tmp_im;
1739 +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1748 + BF(p[0].re, p[0].im, p[1].re, p[1].im,
1749 + p[0].re, p[0].im, p[1].re, p[1].im);
1751 + } while (--j != 0);
1760 + BF(p[0].re, p[0].im, p[2].re, p[2].im,
1761 + p[0].re, p[0].im, p[2].re, p[2].im);
1762 + BF(p[1].re, p[1].im, p[3].re, p[3].im,
1763 + p[1].re, p[1].im, -p[3].im, p[3].re);
1765 + } while (--j != 0);
1768 + BF(p[0].re, p[0].im, p[2].re, p[2].im,
1769 + p[0].re, p[0].im, p[2].re, p[2].im);
1770 + BF(p[1].re, p[1].im, p[3].re, p[3].im,
1771 + p[1].re, p[1].im, p[3].im, -p[3].re);
1773 + } while (--j != 0);
1775 + /* pass 2 .. ln-1 */
1777 + nblocks = np >> 3;
1783 + for (j = 0; j < nblocks; ++j) {
1784 + BF(p->re, p->im, q->re, q->im,
1785 + p->re, p->im, q->re, q->im);
1789 + for(l = nblocks; l < np2; l += nblocks) {
1790 + CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
1791 + BF(p->re, p->im, q->re, q->im,
1792 + p->re, p->im, tmp_re, tmp_im);
1800 + nblocks = nblocks >> 1;
1801 + nloops = nloops << 1;
1802 + } while (nblocks != 0);
1804 +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1806 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1807 +#ifdef CONFIG_DARWIN
1808 + register const vector float vczero = (const vector float)(0.);
1810 + register const vector float vczero = (const vector float){0.,0.,0.,0.};
1813 + int ln = s->nbits;
1815 + int nblocks, nloops;
1816 + register FFTComplex *p, *q;
1817 + FFTComplex *cptr, *cptr1;
1820 +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1825 + vector float *r, a, b, a1, c1, c2;
1827 + r = (vector float *)&z[0];
1829 + c1 = vcii(p,p,n,n);
1833 + c2 = vcii(p,p,n,p);
1837 + c2 = vcii(p,p,p,n);
1843 + a1 = vec_ld(sizeof(vector float), r);
1845 + b = vec_perm(a,a,vcprmle(1,0,3,2));
1846 + a = vec_madd(a,c1,b);
1847 + /* do the pass 0 butterfly */
1849 + b = vec_perm(a1,a1,vcprmle(1,0,3,2));
1850 + b = vec_madd(a1,c1,b);
1851 + /* do the pass 0 butterfly */
1853 + /* multiply third by -i */
1854 + b = vec_perm(b,b,vcprmle(2,3,1,0));
1856 + /* do the pass 1 butterfly */
1857 + vec_st(vec_madd(b,c2,a), 0, r);
1858 + vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
1861 + } while (--j != 0);
1863 + /* pass 2 .. ln-1 */
1865 + nblocks = np >> 3;
1869 + cptr1 = s->exptab1;
1878 + vector float a,b,c,t1;
1880 + a = vec_ld(0, (float*)p);
1881 + b = vec_ld(0, (float*)q);
1884 + c = vec_ld(0, (float*)cptr);
1885 + /* cre*re cim*re */
1886 + t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
1887 + c = vec_ld(sizeof(vector float), (float*)cptr);
1888 + /* -cim*im cre*im */
1889 + b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
1892 + vec_st(vec_add(a,b), 0, (float*)p);
1893 + vec_st(vec_sub(a,b), 0, (float*)q);
1903 + cptr1 += nloops * 2;
1904 + nblocks = nblocks >> 1;
1905 + nloops = nloops << 1;
1906 + } while (nblocks != 0);
1908 +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1910 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
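As background for the comment above fft_calc_altivec: the sketch below is not part of the patch (the function names are invented for illustration) and only shows how a scalar multiply followed by an add or subtract collapses into a single AltiVec fused multiply-add, which is the transformation the vector path applies to the SSE-style butterflies.

#include <altivec.h>

/* Illustration only: one fused multiply-add replaces a MUL + ADD pair,
 * and vec_nmsub covers the MUL + SUB case (z - x*y), element-wise. */
static vector float fma_sketch(vector float x, vector float y, vector float z)
{
    return vec_madd(x, y, z);            /* r[i] = x[i]*y[i] + z[i] */
}

static vector float fms_sketch(vector float x, vector float y, vector float z)
{
    return vec_nmsub(x, y, z);           /* r[i] = z[i] - x[i]*y[i] */
}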
1912 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h
1913 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h 2003-07-04 15:40:29.000000000 +0200
1914 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h 2003-09-28 17:26:40.000000000 +0200
1916 * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html
1919 -static inline vector signed char my_vmrglb (vector signed char const A,
1920 +static inline vector signed char ff_vmrglb (vector signed char const A,
1921 vector signed char const B)
1923 static const vector unsigned char lowbyte = {
1925 return vec_perm (A, B, lowbyte);
1928 -static inline vector signed short my_vmrglh (vector signed short const A,
1929 +static inline vector signed short ff_vmrglh (vector signed short const A,
1930 vector signed short const B)
1932 static const vector unsigned char lowhalf = {
1934 return vec_perm (A, B, lowhalf);
1937 -static inline vector signed int my_vmrglw (vector signed int const A,
1938 +static inline vector signed int ff_vmrglw (vector signed int const A,
1939 vector signed int const B)
1941 static const vector unsigned char lowword = {
1944 return vec_perm (A, B, lowword);
1946 -/*#define my_vmrglb my_vmrglb
1947 -#define my_vmrglh my_vmrglh
1948 -#define my_vmrglw my_vmrglw
1949 +/*#define ff_vmrglb ff_vmrglb
1950 +#define ff_vmrglh ff_vmrglh
1951 +#define ff_vmrglw ff_vmrglw
1955 #define vec_mergel(a1, a2) \
1956 __ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \
1957 - ((vector signed char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1958 + ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1959 __ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \
1960 - ((vector unsigned char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1961 + ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1962 __ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \
1963 - ((vector signed short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1964 + ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1965 __ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \
1966 - ((vector unsigned short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1967 + ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1968 __ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \
1969 - ((vector float) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1970 + ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1971 __ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \
1972 - ((vector signed int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1973 + ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1974 __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
1975 - ((vector unsigned int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1976 + ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1977 __altivec_link_error_invalid_argument ())))))))
1980 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c
1981 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c 1970-01-01 01:00:00.000000000 +0100
1982 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c 2003-09-28 17:26:40.000000000 +0200
1985 + * GMC (Global Motion Compensation)
1987 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1989 + * This library is free software; you can redistribute it and/or
1990 + * modify it under the terms of the GNU Lesser General Public
1991 + * License as published by the Free Software Foundation; either
1992 + * version 2 of the License, or (at your option) any later version.
1994 + * This library is distributed in the hope that it will be useful,
1995 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1996 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1997 + * Lesser General Public License for more details.
1999 + * You should have received a copy of the GNU Lesser General Public
2000 + * License along with this library; if not, write to the Free Software
2001 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2004 +#include "../dsputil.h"
2006 +#include "gcc_fixes.h"
2008 +#include "dsputil_altivec.h"
2011 + altivec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
2012 + to preserve proper dst alignment.
2014 +#define GMC1_PERF_COND (h==8)
2015 +void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
2017 +POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
2018 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2019 + const int A=(16-x16)*(16-y16);
2020 + const int B=( x16)*(16-y16);
2021 + const int C=(16-x16)*( y16);
2022 + const int D=( x16)*( y16);
2025 +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2027 + for(i=0; i<h; i++)
2029 + dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
2030 + dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
2031 + dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
2032 + dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
2033 + dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
2034 + dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
2035 + dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
2036 + dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
2041 +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2043 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2044 + const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
2045 + {rounder, rounder, rounder, rounder,
2046 + rounder, rounder, rounder, rounder};
2047 + const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
2049 + (16-x16)*(16-y16), /* A */
2050 + ( x16)*(16-y16), /* B */
2051 + (16-x16)*( y16), /* C */
2052 + ( x16)*( y16), /* D */
2053 + 0, 0, 0, 0 /* padding */
2055 + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
2056 + register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
2057 + register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
2058 + register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
2060 + unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
2061 + unsigned long src_really_odd = (unsigned long)src & 0x0000000F;
2064 +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2066 + tempA = vec_ld(0, (unsigned short*)ABCD);
2067 + Av = vec_splat(tempA, 0);
2068 + Bv = vec_splat(tempA, 1);
2069 + Cv = vec_splat(tempA, 2);
2070 + Dv = vec_splat(tempA, 3);
2072 + rounderV = vec_ld(0, (unsigned short*)rounder_a);
2074 + // we'll be able to pick up our 9 char elements
2075 + // at src from those 32 bytes
2076 + // we load the first batch here, as inside the loop
2077 + // we can re-use 'src+stride' from one iteration
2078 + // as the 'src' of the next.
2079 + src_0 = vec_ld(0, src);
2080 + src_1 = vec_ld(16, src);
2081 + srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
2083 + if (src_really_odd != 0x0000000F)
2084 + { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
2085 + srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
2091 + srcvA = vec_mergeh(vczero, srcvA);
2092 + srcvB = vec_mergeh(vczero, srcvB);
2094 + for(i=0; i<h; i++)
2096 + dst_odd = (unsigned long)dst & 0x0000000F;
2097 + src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
2099 + dstv = vec_ld(0, dst);
2101 + // we'll be able to pick up our 9 char elements
2102 + // at src + stride from those 32 bytes
2103 + // then reuse the resulting 2 vectors srcvC and srcvD
2104 + // as the next srcvA and srcvB
2105 + src_0 = vec_ld(stride + 0, src);
2106 + src_1 = vec_ld(stride + 16, src);
2107 + srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
2109 + if (src_really_odd != 0x0000000F)
2110 + { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
2111 + srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
2118 + srcvC = vec_mergeh(vczero, srcvC);
2119 + srcvD = vec_mergeh(vczero, srcvD);
2122 + // OK, now we (finally) do the math :-)
2123 + // those four instructions replace 32 int muls & 32 int adds.
2124 + // isn't AltiVec nice?
2125 + tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
2126 + tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
2127 + tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
2128 + tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
2133 + tempD = vec_sr(tempD, vcsr8);
2135 + dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
2139 + dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
2143 + dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
2146 + vec_st(dstv2, 0, dst);
2152 +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2154 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
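A quick sanity check on the arithmetic used by gmc1_altivec above (sketch only, not part of the patch): with x16 and y16 in [0,16], the four bilinear weights always sum to 16*16 = 256, so the weighted sum of 8-bit pixels plus a small rounder stays below 2^16; that is why the 16-bit vec_mladd accumulation cannot wrap and the final vec_sr by 8 renormalizes the result.

#include <assert.h>

/* Illustration only: the gmc1 weights A+B+C+D always equal 256. */
static void check_gmc1_weights(int x16, int y16)   /* both in [0,16] */
{
    const int A = (16 - x16) * (16 - y16);
    const int B = ( x16)     * (16 - y16);
    const int C = (16 - x16) * ( y16);
    const int D = ( x16)     * ( y16);
    assert(A + B + C + D == 256);
    /* worst case accumulated: 256*255 + rounder, below 65536 for the
     * small rounder values used here (assumption for illustration) */
}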
2156 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c
2157 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c 1970-01-01 01:00:00.000000000 +0100
2158 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c 2003-09-28 17:26:40.000000000 +0200
2161 + * Copyright (c) 2001 Michel Lespinasse
2163 + * This library is free software; you can redistribute it and/or
2164 + * modify it under the terms of the GNU Lesser General Public
2165 + * License as published by the Free Software Foundation; either
2166 + * version 2 of the License, or (at your option) any later version.
2168 + * This library is distributed in the hope that it will be useful,
2169 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2170 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2171 + * Lesser General Public License for more details.
2173 + * You should have received a copy of the GNU Lesser General Public
2174 + * License along with this library; if not, write to the Free Software
2175 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2180 + * NOTE: This code is based on GPL code from the libmpeg2 project. The
2181 + * author, Michel Lespinasse, has given explicit permission to release
2182 + * under LGPL as part of ffmpeg.
2187 + * FFMpeg integration by Dieter Shirley
2189 + * This file is a direct copy of the altivec idct module from the libmpeg2
2190 + * project. I've deleted all of the libmpeg2 specific code, renamed the functions and
2191 + * re-ordered the function parameters. The only change to the IDCT function
2192 + * itself was to factor out the partial transposition, and to perform a full
2193 + * transpose at the end of the function.
2197 +#include <stdlib.h> /* malloc(), free() */
2198 +#include <string.h>
2199 +#include "../dsputil.h"
2201 +#include "gcc_fixes.h"
2203 +#include "dsputil_altivec.h"
2205 +#define vector_s16_t vector signed short
2206 +#define vector_u16_t vector unsigned short
2207 +#define vector_s8_t vector signed char
2208 +#define vector_u8_t vector unsigned char
2209 +#define vector_s32_t vector signed int
2210 +#define vector_u32_t vector unsigned int
2212 +#define IDCT_HALF \
2214 + t1 = vec_mradds (a1, vx7, vx1 ); \
2215 + t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
2216 + t7 = vec_mradds (a2, vx5, vx3); \
2217 + t3 = vec_mradds (ma2, vx3, vx5); \
2220 + t5 = vec_adds (vx0, vx4); \
2221 + t0 = vec_subs (vx0, vx4); \
2222 + t2 = vec_mradds (a0, vx6, vx2); \
2223 + t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
2224 + t6 = vec_adds (t8, t3); \
2225 + t3 = vec_subs (t8, t3); \
2226 + t8 = vec_subs (t1, t7); \
2227 + t1 = vec_adds (t1, t7); \
2230 + t7 = vec_adds (t5, t2); \
2231 + t2 = vec_subs (t5, t2); \
2232 + t5 = vec_adds (t0, t4); \
2233 + t0 = vec_subs (t0, t4); \
2234 + t4 = vec_subs (t8, t3); \
2235 + t3 = vec_adds (t8, t3); \
2238 + vy0 = vec_adds (t7, t1); \
2239 + vy7 = vec_subs (t7, t1); \
2240 + vy1 = vec_mradds (c4, t3, t5); \
2241 + vy6 = vec_mradds (mc4, t3, t5); \
2242 + vy2 = vec_mradds (c4, t4, t0); \
2243 + vy5 = vec_mradds (mc4, t4, t0); \
2244 + vy3 = vec_adds (t2, t6); \
2245 + vy4 = vec_subs (t2, t6);
2249 + vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
2250 + vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
2251 + vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
2252 + vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
2253 + vector_u16_t shift; \
2255 + c4 = vec_splat (constants[0], 0); \
2256 + a0 = vec_splat (constants[0], 1); \
2257 + a1 = vec_splat (constants[0], 2); \
2258 + a2 = vec_splat (constants[0], 3); \
2259 + mc4 = vec_splat (constants[0], 4); \
2260 + ma2 = vec_splat (constants[0], 5); \
2261 + bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
2263 + zero = vec_splat_s16 (0); \
2264 + shift = vec_splat_u16 (4); \
2266 + vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
2267 + vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
2268 + vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
2269 + vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
2270 + vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
2271 + vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
2272 + vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
2273 + vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
2277 + vx0 = vec_mergeh (vy0, vy4); \
2278 + vx1 = vec_mergel (vy0, vy4); \
2279 + vx2 = vec_mergeh (vy1, vy5); \
2280 + vx3 = vec_mergel (vy1, vy5); \
2281 + vx4 = vec_mergeh (vy2, vy6); \
2282 + vx5 = vec_mergel (vy2, vy6); \
2283 + vx6 = vec_mergeh (vy3, vy7); \
2284 + vx7 = vec_mergel (vy3, vy7); \
2286 + vy0 = vec_mergeh (vx0, vx4); \
2287 + vy1 = vec_mergel (vx0, vx4); \
2288 + vy2 = vec_mergeh (vx1, vx5); \
2289 + vy3 = vec_mergel (vx1, vx5); \
2290 + vy4 = vec_mergeh (vx2, vx6); \
2291 + vy5 = vec_mergel (vx2, vx6); \
2292 + vy6 = vec_mergeh (vx3, vx7); \
2293 + vy7 = vec_mergel (vx3, vx7); \
2295 + vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
2296 + vx1 = vec_mergel (vy0, vy4); \
2297 + vx2 = vec_mergeh (vy1, vy5); \
2298 + vx3 = vec_mergel (vy1, vy5); \
2299 + vx4 = vec_mergeh (vy2, vy6); \
2300 + vx5 = vec_mergel (vy2, vy6); \
2301 + vx6 = vec_mergeh (vy3, vy7); \
2302 + vx7 = vec_mergel (vy3, vy7); \
2306 + shift = vec_splat_u16 (6); \
2307 + vx0 = vec_sra (vy0, shift); \
2308 + vx1 = vec_sra (vy1, shift); \
2309 + vx2 = vec_sra (vy2, shift); \
2310 + vx3 = vec_sra (vy3, shift); \
2311 + vx4 = vec_sra (vy4, shift); \
2312 + vx5 = vec_sra (vy5, shift); \
2313 + vx6 = vec_sra (vy6, shift); \
2314 + vx7 = vec_sra (vy7, shift);
2317 +static const vector_s16_t constants[5] = {
2318 + (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
2319 + (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
2320 + (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
2321 + (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
2322 + (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
2325 +void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2327 +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
2328 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2329 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2330 + void simple_idct_put(uint8_t *dest, int line_size, int16_t *block);
2331 + simple_idct_put(dest, stride, (int16_t*)block);
2332 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2333 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2336 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2340 +#define COPY(dest,src) \
2341 + tmp = vec_packsu (src, src); \
2342 + vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
2343 + vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2345 + COPY (dest, vx0) dest += stride;
2346 + COPY (dest, vx1) dest += stride;
2347 + COPY (dest, vx2) dest += stride;
2348 + COPY (dest, vx3) dest += stride;
2349 + COPY (dest, vx4) dest += stride;
2350 + COPY (dest, vx5) dest += stride;
2351 + COPY (dest, vx6) dest += stride;
2354 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2355 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2358 +void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2360 +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
2361 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2362 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2363 + void simple_idct_add(uint8_t *dest, int line_size, int16_t *block);
2364 + simple_idct_add(dest, stride, (int16_t*)block);
2365 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2366 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2368 + vector_s16_t tmp2, tmp3;
2369 + vector_u8_t perm0;
2370 + vector_u8_t perm1;
2371 + vector_u8_t p0, p1, p;
2373 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2377 + p0 = vec_lvsl (0, dest);
2378 + p1 = vec_lvsl (stride, dest);
2379 + p = vec_splat_u8 (-1);
2380 + perm0 = vec_mergeh (p, p0);
2381 + perm1 = vec_mergeh (p, p1);
2383 +#define ADD(dest,src,perm) \
2384 + /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
2385 + tmp = vec_ld (0, dest); \
2386 + tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
2387 + tmp3 = vec_adds (tmp2, src); \
2388 + tmp = vec_packsu (tmp3, tmp3); \
2389 + vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
2390 + vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2392 + ADD (dest, vx0, perm0) dest += stride;
2393 + ADD (dest, vx1, perm1) dest += stride;
2394 + ADD (dest, vx2, perm0) dest += stride;
2395 + ADD (dest, vx3, perm1) dest += stride;
2396 + ADD (dest, vx4, perm0) dest += stride;
2397 + ADD (dest, vx5, perm1) dest += stride;
2398 + ADD (dest, vx6, perm0) dest += stride;
2399 + ADD (dest, vx7, perm1)
2401 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2402 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
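The IDCT above leans heavily on vec_mradds. As a rough per-element model (sketch only, not from the patch), vec_mradds(a, b, c) multiplies two signed 16-bit values, shifts the 32-bit product right by 15 with rounding, adds c, and saturates to 16 bits:

/* Scalar sketch of one element of vec_mradds(a, b, c), as used in
 * IDCT_HALF above. Illustration only. */
static short mradds_sketch(short a, short b, short c)
{
    int r = ((a * b + 0x4000) >> 15) + c;   /* rounded high-half multiply */
    if (r >  32767) r =  32767;             /* saturate to 16 bits */
    if (r < -32768) r = -32768;
    return (short)r;
}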
2405 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
2406 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c 1970-01-01 01:00:00.000000000 +0100
2407 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c 2003-09-28 17:26:40.000000000 +0200
2410 + * Copyright (c) 2002 Dieter Shirley
2412 + * This library is free software; you can redistribute it and/or
2413 + * modify it under the terms of the GNU Lesser General Public
2414 + * License as published by the Free Software Foundation; either
2415 + * version 2 of the License, or (at your option) any later version.
2417 + * This library is distributed in the hope that it will be useful,
2418 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2419 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2420 + * Lesser General Public License for more details.
2422 + * You should have received a copy of the GNU Lesser General Public
2423 + * License along with this library; if not, write to the Free Software
2424 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2427 +#include <stdlib.h>
2429 +#include "../dsputil.h"
2430 +#include "../mpegvideo.h"
2432 +#include "gcc_fixes.h"
2434 +#include "dsputil_altivec.h"
2436 +// Swaps two variables (used for altivec registers)
2437 +#define SWAP(a,b) \
2439 + __typeof__(a) swap_temp=a; \
2444 +// transposes a matrix consisting of four vectors with four elements each
2445 +#define TRANSPOSE4(a,b,c,d) \
2447 + __typeof__(a) _trans_ach = vec_mergeh(a, c); \
2448 + __typeof__(a) _trans_acl = vec_mergel(a, c); \
2449 + __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
2450 + __typeof__(a) _trans_bdl = vec_mergel(b, d); \
2452 + a = vec_mergeh(_trans_ach, _trans_bdh); \
2453 + b = vec_mergel(_trans_ach, _trans_bdh); \
2454 + c = vec_mergeh(_trans_acl, _trans_bdl); \
2455 + d = vec_mergel(_trans_acl, _trans_bdl); \
2458 +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
2460 + __typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
2461 + __typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
2463 + _A1 = vec_mergeh (a, e); \
2464 + _B1 = vec_mergel (a, e); \
2465 + _C1 = vec_mergeh (b, f); \
2466 + _D1 = vec_mergel (b, f); \
2467 + _E1 = vec_mergeh (c, g); \
2468 + _F1 = vec_mergel (c, g); \
2469 + _G1 = vec_mergeh (d, h); \
2470 + _H1 = vec_mergel (d, h); \
2472 + _A2 = vec_mergeh (_A1, _E1); \
2473 + _B2 = vec_mergel (_A1, _E1); \
2474 + _C2 = vec_mergeh (_B1, _F1); \
2475 + _D2 = vec_mergel (_B1, _F1); \
2476 + _E2 = vec_mergeh (_C1, _G1); \
2477 + _F2 = vec_mergel (_C1, _G1); \
2478 + _G2 = vec_mergeh (_D1, _H1); \
2479 + _H2 = vec_mergel (_D1, _H1); \
2481 + a = vec_mergeh (_A2, _E2); \
2482 + b = vec_mergel (_A2, _E2); \
2483 + c = vec_mergeh (_B2, _F2); \
2484 + d = vec_mergel (_B2, _F2); \
2485 + e = vec_mergeh (_C2, _G2); \
2486 + f = vec_mergel (_C2, _G2); \
2487 + g = vec_mergeh (_D2, _H2); \
2488 + h = vec_mergel (_D2, _H2); \
2492 +// Loads a four-byte value (int or float) from the target address
2493 +// into every element in the target vector. Only works if the
2494 +// target address is four-byte aligned (which should always be the case).
2495 +#define LOAD4(vec, address) \
2497 + __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
2498 + vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
2499 + vec = vec_ld(0, _load_addr); \
2500 + vec = vec_perm(vec, vec, _perm_vec); \
2501 + vec = vec_splat(vec, 0); \
2505 +#ifdef CONFIG_DARWIN
2506 +#define FOUROF(a) (a)
2508 +// slower, for dumb non-apple GCC
2509 +#define FOUROF(a) {a,a,a,a}
2511 +int dct_quantize_altivec(MpegEncContext* s,
2512 + DCTELEM* data, int n,
2513 + int qscale, int* overflow)
2516 + vector float row0, row1, row2, row3, row4, row5, row6, row7;
2517 + vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
2518 + const vector float zero = (const vector float)FOUROF(0.);
2520 + // Load the data into the row/alt vectors
2522 + vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2524 + data0 = vec_ld(0, data);
2525 + data1 = vec_ld(16, data);
2526 + data2 = vec_ld(32, data);
2527 + data3 = vec_ld(48, data);
2528 + data4 = vec_ld(64, data);
2529 + data5 = vec_ld(80, data);
2530 + data6 = vec_ld(96, data);
2531 + data7 = vec_ld(112, data);
2533 + // Transpose the data before we start
2534 + TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2536 + // load the data into floating point vectors. We load
2537 + // the high half of each row into the main row vectors
2538 + // and the low half into the alt vectors.
2539 + row0 = vec_ctf(vec_unpackh(data0), 0);
2540 + alt0 = vec_ctf(vec_unpackl(data0), 0);
2541 + row1 = vec_ctf(vec_unpackh(data1), 0);
2542 + alt1 = vec_ctf(vec_unpackl(data1), 0);
2543 + row2 = vec_ctf(vec_unpackh(data2), 0);
2544 + alt2 = vec_ctf(vec_unpackl(data2), 0);
2545 + row3 = vec_ctf(vec_unpackh(data3), 0);
2546 + alt3 = vec_ctf(vec_unpackl(data3), 0);
2547 + row4 = vec_ctf(vec_unpackh(data4), 0);
2548 + alt4 = vec_ctf(vec_unpackl(data4), 0);
2549 + row5 = vec_ctf(vec_unpackh(data5), 0);
2550 + alt5 = vec_ctf(vec_unpackl(data5), 0);
2551 + row6 = vec_ctf(vec_unpackh(data6), 0);
2552 + alt6 = vec_ctf(vec_unpackl(data6), 0);
2553 + row7 = vec_ctf(vec_unpackh(data7), 0);
2554 + alt7 = vec_ctf(vec_unpackl(data7), 0);
2557 + // The following block could exist as a separate altivec dct
2558 + // function. However, if we put it inline, the DCT data can remain
2559 + // in the vector local variables, as floats, which we'll use during the
2560 + // quantize step...
2562 + const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
2563 + const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
2564 + const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
2565 + const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
2566 + const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
2567 + const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
2568 + const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
2569 + const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
2570 + const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
2571 + const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
2572 + const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
2573 + const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);
2576 + int whichPass, whichHalf;
2578 + for(whichPass = 1; whichPass<=2; whichPass++)
2580 + for(whichHalf = 1; whichHalf<=2; whichHalf++)
2582 + vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2583 + vector float tmp10, tmp11, tmp12, tmp13;
2584 + vector float z1, z2, z3, z4, z5;
2586 + tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
2587 + tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
2588 + tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
2589 + tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
2590 + tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
2591 + tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
2592 + tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
2593 + tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
2595 + tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
2596 + tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
2597 + tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
2598 + tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
2601 + // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
2602 + row0 = vec_add(tmp10, tmp11);
2604 + // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
2605 + row4 = vec_sub(tmp10, tmp11);
2608 + // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2609 + z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
2611 + // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
2612 + // CONST_BITS-PASS1_BITS);
2613 + row2 = vec_madd(tmp13, vec_0_765366865, z1);
2615 + // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
2616 + // CONST_BITS-PASS1_BITS);
2617 + row6 = vec_madd(tmp12, vec_1_847759065, z1);
2619 + z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
2620 + z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
2621 + z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
2622 + z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
2624 + // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
2625 + z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
2627 + // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2628 + z3 = vec_madd(z3, vec_1_961570560, z5);
2630 + // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2631 + z4 = vec_madd(z4, vec_0_390180644, z5);
2633 + // The following adds are rolled into the multiplies above
2634 + // z3 = vec_add(z3, z5); // z3 += z5;
2635 + // z4 = vec_add(z4, z5); // z4 += z5;
2637 + // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2638 + // Wow! It's actually more efficient to roll this multiply
2639 + // into the adds below, even though the multiply gets done twice!
2640 + // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
2642 + // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2643 + // Same with this one...
2644 + // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
2646 + // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2647 + // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
2648 + row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
2650 + // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2651 + // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
2652 + row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
2654 + // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2655 + // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
2656 + row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
2658 + // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2659 + // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
2660 + row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));
2662 + // Swap the row values with the alts. If this is the first half,
2663 + // this sets up the low values to be acted on in the second half.
2664 + // If this is the second half, it puts the high values back in
2665 + // the row values where they are expected to be when we're done.
2676 + if (whichPass == 1)
2678 + // transpose the data for the second pass
2680 + // First, block transpose the upper right with lower left.
2686 + // Now, transpose each block of four
2687 + TRANSPOSE4(row0, row1, row2, row3);
2688 + TRANSPOSE4(row4, row5, row6, row7);
2689 + TRANSPOSE4(alt0, alt1, alt2, alt3);
2690 + TRANSPOSE4(alt4, alt5, alt6, alt7);
2695 + // used after quantise step
2696 + int oldBaseValue = 0;
2698 + // perform the quantise step, using the floating point data
2699 + // still in the row/alt registers
2701 + const int* biasAddr;
2702 + const vector signed int* qmat;
2703 + vector float bias, negBias;
2707 + vector signed int baseVector;
2709 + // We must cache element 0 in the intra case
2710 + // (it needs special handling).
2711 + baseVector = vec_cts(vec_splat(row0, 0), 0);
2712 + vec_ste(baseVector, 0, &oldBaseValue);
2714 + qmat = (vector signed int*)s->q_intra_matrix[qscale];
2715 + biasAddr = &(s->intra_quant_bias);
2719 + qmat = (vector signed int*)s->q_inter_matrix[qscale];
2720 + biasAddr = &(s->inter_quant_bias);
2723 + // Load the bias vector (We add 0.5 to the bias so that we're
2724 + // rounding when we convert to int, instead of flooring.)
2726 + vector signed int biasInt;
2727 + const vector float negOneFloat = (vector float)FOUROF(-1.0f);
2728 + LOAD4(biasInt, biasAddr);
2729 + bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
2730 + negBias = vec_madd(bias, negOneFloat, zero);
2734 + vector float q0, q1, q2, q3, q4, q5, q6, q7;
2736 + q0 = vec_ctf(qmat[0], QMAT_SHIFT);
2737 + q1 = vec_ctf(qmat[2], QMAT_SHIFT);
2738 + q2 = vec_ctf(qmat[4], QMAT_SHIFT);
2739 + q3 = vec_ctf(qmat[6], QMAT_SHIFT);
2740 + q4 = vec_ctf(qmat[8], QMAT_SHIFT);
2741 + q5 = vec_ctf(qmat[10], QMAT_SHIFT);
2742 + q6 = vec_ctf(qmat[12], QMAT_SHIFT);
2743 + q7 = vec_ctf(qmat[14], QMAT_SHIFT);
2745 + row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
2746 + vec_cmpgt(row0, zero));
2747 + row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
2748 + vec_cmpgt(row1, zero));
2749 + row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
2750 + vec_cmpgt(row2, zero));
2751 + row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
2752 + vec_cmpgt(row3, zero));
2753 + row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
2754 + vec_cmpgt(row4, zero));
2755 + row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
2756 + vec_cmpgt(row5, zero));
2757 + row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
2758 + vec_cmpgt(row6, zero));
2759 + row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
2760 + vec_cmpgt(row7, zero));
2762 + q0 = vec_ctf(qmat[1], QMAT_SHIFT);
2763 + q1 = vec_ctf(qmat[3], QMAT_SHIFT);
2764 + q2 = vec_ctf(qmat[5], QMAT_SHIFT);
2765 + q3 = vec_ctf(qmat[7], QMAT_SHIFT);
2766 + q4 = vec_ctf(qmat[9], QMAT_SHIFT);
2767 + q5 = vec_ctf(qmat[11], QMAT_SHIFT);
2768 + q6 = vec_ctf(qmat[13], QMAT_SHIFT);
2769 + q7 = vec_ctf(qmat[15], QMAT_SHIFT);
2771 + alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
2772 + vec_cmpgt(alt0, zero));
2773 + alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
2774 + vec_cmpgt(alt1, zero));
2775 + alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
2776 + vec_cmpgt(alt2, zero));
2777 + alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
2778 + vec_cmpgt(alt3, zero));
2779 + alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
2780 + vec_cmpgt(alt4, zero));
2781 + alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
2782 + vec_cmpgt(alt5, zero));
2783 + alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
2784 + vec_cmpgt(alt6, zero));
2785 + alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
2786 + vec_cmpgt(alt7, zero));
2792 + // Store the data back into the original block
2794 + vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2796 + data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
2797 + data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
2798 + data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
2799 + data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
2800 + data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
2801 + data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
2802 + data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
2803 + data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));
2806 + // Clamp for overflow
2807 + vector signed int max_q_int, min_q_int;
2808 + vector signed short max_q, min_q;
2810 + LOAD4(max_q_int, &(s->max_qcoeff));
2811 + LOAD4(min_q_int, &(s->min_qcoeff));
2813 + max_q = vec_pack(max_q_int, max_q_int);
2814 + min_q = vec_pack(min_q_int, min_q_int);
2816 + data0 = vec_max(vec_min(data0, max_q), min_q);
2817 + data1 = vec_max(vec_min(data1, max_q), min_q);
2818 + data2 = vec_max(vec_min(data2, max_q), min_q);
2819 + data4 = vec_max(vec_min(data4, max_q), min_q);
2820 + data5 = vec_max(vec_min(data5, max_q), min_q);
2821 + data6 = vec_max(vec_min(data6, max_q), min_q);
2822 + data7 = vec_max(vec_min(data7, max_q), min_q);
2825 + vector bool char zero_01, zero_23, zero_45, zero_67;
2826 + vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
2827 + vector signed char negOne = vec_splat_s8(-1);
2828 + vector signed char* scanPtr =
2829 + (vector signed char*)(s->intra_scantable.inverse);
2831 + // Determine the largest non-zero index.
2832 + zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero),
2833 + vec_cmpeq(data1, (vector short)zero));
2834 + zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero),
2835 + vec_cmpeq(data3, (vector short)zero));
2836 + zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero),
2837 + vec_cmpeq(data5, (vector short)zero));
2838 + zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero),
2839 + vec_cmpeq(data7, (vector short)zero));
2841 + // 64 biggest values
2842 + scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
2843 + scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
2844 + scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
2845 + scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
2847 + // 32 largest values
2848 + scanIndices_01 = vec_max(scanIndices_01, scanIndices_23);
2849 + scanIndices_45 = vec_max(scanIndices_45, scanIndices_67);
2851 + // 16 largest values
2852 + scanIndices_01 = vec_max(scanIndices_01, scanIndices_45);
2854 + // 8 largest values
2855 + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2856 + vec_mergel(scanIndices_01, negOne));
2858 + // 4 largest values
2859 + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2860 + vec_mergel(scanIndices_01, negOne));
2862 + // 2 largest values
2863 + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2864 + vec_mergel(scanIndices_01, negOne));
2867 + scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2868 + vec_mergel(scanIndices_01, negOne));
2870 + scanIndices_01 = vec_splat(scanIndices_01, 0);
2872 + signed char lastNonZeroChar;
2874 + vec_ste(scanIndices_01, 0, &lastNonZeroChar);
2876 + lastNonZero = lastNonZeroChar;
2878 + // While the data is still in vectors we check for the transpose IDCT permute
2879 + // and handle it using the vector unit if we can. This is the permute used
2880 + // by the altivec idct, so it is common when using the altivec dct.
2882 + if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
2884 + TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2887 + vec_st(data0, 0, data);
2888 + vec_st(data1, 16, data);
2889 + vec_st(data2, 32, data);
2890 + vec_st(data3, 48, data);
2891 + vec_st(data4, 64, data);
2892 + vec_st(data5, 80, data);
2893 + vec_st(data6, 96, data);
2894 + vec_st(data7, 112, data);
2897 + // special handling of block[0]
2903 + oldBaseValue /= s->y_dc_scale;
2905 + oldBaseValue /= s->c_dc_scale;
2908 + // Divide by 8, rounding the result
2909 + data[0] = (oldBaseValue + 4) >> 3;
2912 + // We handled the transpose permutation above and we don't
2913 + // need to permute the "no" permutation case.
2914 + if ((lastNonZero > 0) &&
2915 + (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
2916 + (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
2918 + ff_block_permute(data, s->dsp.idct_permutation,
2919 + s->intra_scantable.scantable, lastNonZero);
2922 + return lastNonZero;
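To make the quantise step above easier to follow, here is a scalar sketch (not part of the patch; the function name is invented) of what the vec_cmpgt / vec_sel / vec_madd combination does for a single coefficient: the bias is applied with the sign of the coefficient so that the truncating vec_cts conversion rounds the magnitude instead of always flooring it.

/* Illustration only: scalar equivalent of one element of
 * row = vec_sel(vec_madd(row,q,negBias), vec_madd(row,q,bias),
 *               vec_cmpgt(row, zero));  followed by vec_cts(..., 0). */
static int quantise_sketch(float coeff, float q, float bias)
{
    float v = (coeff > 0.0f) ? coeff * q + bias
                             : coeff * q - bias;
    return (int)v;      /* truncation toward zero, like vec_cts(..., 0) */
}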
2927 + AltiVec version of dct_unquantize_h263
2928 + this code assumes `block' is 16-byte aligned
2930 +void dct_unquantize_h263_altivec(MpegEncContext *s,
2931 + DCTELEM *block, int n, int qscale)
2933 +POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
2934 + int i, level, qmul, qadd;
2937 + assert(s->block_last_index[n]>=0);
2939 +POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
2941 + qadd = (qscale - 1) | 1;
2942 + qmul = qscale << 1;
2944 + if (s->mb_intra) {
2945 + if (!s->h263_aic) {
2947 + block[0] = block[0] * s->y_dc_scale;
2949 + block[0] = block[0] * s->c_dc_scale;
2953 + nCoeffs= 63; //does not always use zigzag table
2956 + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
2959 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2960 + for(;i<=nCoeffs;i++) {
2964 + level = level * qmul - qadd;
2966 + level = level * qmul + qadd;
2971 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2973 + register const vector short vczero = (const vector short)vec_splat_s16(0);
2974 + short __attribute__ ((aligned(16))) qmul8[] =
2976 + qmul, qmul, qmul, qmul,
2977 + qmul, qmul, qmul, qmul
2979 + short __attribute__ ((aligned(16))) qadd8[] =
2981 + qadd, qadd, qadd, qadd,
2982 + qadd, qadd, qadd, qadd
2984 + short __attribute__ ((aligned(16))) nqadd8[] =
2986 + -qadd, -qadd, -qadd, -qadd,
2987 + -qadd, -qadd, -qadd, -qadd
2989 + register vector short blockv, qmulv, qaddv, nqaddv, temp1;
2990 + register vector bool short blockv_null, blockv_neg;
2991 + register short backup_0 = block[0];
2992 + register int j = 0;
2994 + qmulv = vec_ld(0, qmul8);
2995 + qaddv = vec_ld(0, qadd8);
2996 + nqaddv = vec_ld(0, nqadd8);
2998 +#if 0 // block *is* 16-byte aligned, it seems.
2999 + // first make sure block[j] is 16-byte aligned
3000 + for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
3004 + level = level * qmul - qadd;
3006 + level = level * qmul + qadd;
3013 + // vectorize all the 16-byte-aligned blocks
3015 + for(; (j + 7) <= nCoeffs ; j+=8)
3017 + blockv = vec_ld(j << 1, block);
3018 + blockv_neg = vec_cmplt(blockv, vczero);
3019 + blockv_null = vec_cmpeq(blockv, vczero);
3020 + // choose between +qadd or -qadd as the third operand
3021 + temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
3022 + // multiply & add (block[i..i+7] * qmul [+-] qadd)
3023 + temp1 = vec_mladd(blockv, qmulv, temp1);
3024 + // put 0 where block[i..i+7] used to have 0
3025 + blockv = vec_sel(temp1, blockv, blockv_null);
3026 + vec_st(blockv, j << 1, block);
3029 + // if nCoeffs isn't a multiple of 8, finish the job
3030 + // using good old scalar units.
3031 + // (we could do it using a truncated vector,
3032 + // but I'm not sure it's worth the hassle)
3033 + for(; j <= nCoeffs ; j++) {
3037 + level = level * qmul - qadd;
3039 + level = level * qmul + qadd;
3046 + { // cheat. this avoids special-casing the first iteration
3047 + block[0] = backup_0;
3050 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3052 +POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
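The vectorised loop in dct_unquantize_h263_altivec combines vec_cmplt, vec_cmpeq, vec_sel and vec_mladd; per element it is equivalent to the scalar sketch below (illustration only, not part of the patch), which also makes explicit that zero coefficients are left untouched:

/* Scalar model of one element of the vectorised H.263 dequantise loop:
 * zeros stay zero, other levels get qmul applied and qadd added with
 * the sign of the level. */
static short unquantize_sketch(short level, short qmul, short qadd)
{
    if (level == 0)
        return 0;
    return (short)(level < 0 ? level * qmul - qadd
                             : level * qmul + qadd);
}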
3054 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c
3055 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c 1970-01-01 01:00:00.000000000 +0100
3056 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c 2003-09-28 17:26:40.000000000 +0200
3059 + * Copyright (c) 2002 Dieter Shirley
3061 + * This library is free software; you can redistribute it and/or
3062 + * modify it under the terms of the GNU Lesser General Public
3063 + * License as published by the Free Software Foundation; either
3064 + * version 2 of the License, or (at your option) any later version.
3066 + * This library is distributed in the hope that it will be useful,
3067 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3068 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
3069 + * Lesser General Public License for more details.
3071 + * You should have received a copy of the GNU Lesser General Public
3072 + * License along with this library; if not, write to the Free Software
3073 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3076 +#include "../dsputil.h"
3077 +#include "../mpegvideo.h"
3078 +#include <time.h>
3080 +#ifdef HAVE_ALTIVEC
3081 +#include "dsputil_altivec.h"
3084 +extern int dct_quantize_altivec(MpegEncContext *s,
3085 + DCTELEM *block, int n,
3086 + int qscale, int *overflow);
3087 +extern void dct_unquantize_h263_altivec(MpegEncContext *s,
3088 + DCTELEM *block, int n, int qscale);
3090 +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
3091 +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
3094 +void MPV_common_init_ppc(MpegEncContext *s)
3097 + if (has_altivec())
3099 + if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||
3100 + (s->avctx->idct_algo == FF_IDCT_ALTIVEC))
3102 + s->dsp.idct_put = idct_put_altivec;
3103 + s->dsp.idct_add = idct_add_altivec;
3104 +#ifndef ALTIVEC_USE_REFERENCE_C_CODE
3105 + s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
3106 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
3107 + s->dsp.idct_permutation_type = FF_NO_IDCT_PERM;
3108 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3111 + // Test to make sure that the dct required alignments are met.
3112 + if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||
3113 + (((long)(s->q_inter_matrix) & 0x0f) != 0))
3115 + fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned "
3116 + "to use Altivec DCT. Reverting to non-altivec version.\n");
3120 + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)
3122 + fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned "
3123 + "to use Altivec DCT. Reverting to non-altivec version.\n");
3128 + if ((s->avctx->dct_algo == FF_DCT_AUTO) ||
3129 + (s->avctx->dct_algo == FF_DCT_ALTIVEC))
3131 + s->dct_quantize = dct_quantize_altivec;
3132 + s->dct_unquantize_h263 = dct_unquantize_h263_altivec;
3137 + /* Non-AltiVec PPC optimisations here */
3141 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am.orig 2003-05-25 23:11:57.000000000 +0200
3142 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am 2003-11-14 01:06:03.904622008 +0100
3145 libavcodecppc_la_SOURCES = $(PPC_SRC)
3147 -AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -I$(srcdir)/../..
3148 +AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -DHAVE_ALTIVEC_H -DHAVE_ALTIVEC -maltivec -mabi=altivec -I$(srcdir)/../..
3150 MAINTAINERCLEANFILES = Makefile.in
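The Makefile.am change above adds -maltivec/-mabi=altivec and defines HAVE_ALTIVEC and HAVE_ALTIVEC_H for the whole ppc subdirectory. A minimal sketch of how such a define is typically consumed (assumption for illustration; the actual guard lives in the patched headers):

/* Sketch only: with -DHAVE_ALTIVEC_H and -maltivec in AM_CPPFLAGS,
 * sources can pull in GCC's intrinsics header conditionally. */
#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif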