avifile-ffmpeg-ppc.patch - workaround for struct name change in Linux 2.6 headers
1diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c
2--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c 1970-01-01 01:00:00.000000000 +0100
3+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c 2003-09-28 17:26:40.000000000 +0200
4@@ -0,0 +1,1345 @@
5+/*
6+ * Copyright (c) 2002 Brian Foley
7+ * Copyright (c) 2002 Dieter Shirley
8+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
9+ *
10+ * This library is free software; you can redistribute it and/or
11+ * modify it under the terms of the GNU Lesser General Public
12+ * License as published by the Free Software Foundation; either
13+ * version 2 of the License, or (at your option) any later version.
14+ *
15+ * This library is distributed in the hope that it will be useful,
16+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
17+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18+ * Lesser General Public License for more details.
19+ *
20+ * You should have received a copy of the GNU Lesser General Public
21+ * License along with this library; if not, write to the Free Software
22+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23+ */
24+
25+#include "../dsputil.h"
26+
27+#include "gcc_fixes.h"
28+
29+#include "dsputil_altivec.h"
30+
31+#ifdef CONFIG_DARWIN
32+#include <sys/sysctl.h>
33+#else /* CONFIG_DARWIN */
34+#include <signal.h>
35+#include <setjmp.h>
36+
37+static sigjmp_buf jmpbuf;
38+static volatile sig_atomic_t canjump = 0;
39+
40+static void sigill_handler (int sig)
41+{
42+ if (!canjump) {
43+ signal (sig, SIG_DFL);
44+ raise (sig);
45+ }
46+
47+ canjump = 0;
48+ siglongjmp (jmpbuf, 1);
49+}
50+#endif /* CONFIG_DARWIN */
51+
52+int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
53+{
54+ int i;
55+ int s __attribute__((aligned(16)));
56+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
57+ vector unsigned char *tv;
58+ vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
59+ vector unsigned int sad;
60+ vector signed int sumdiffs;
61+
62+ s = 0;
63+ sad = (vector unsigned int)vec_splat_u32(0);
64+ for(i=0;i<16;i++) {
65+ /*
66+ Read unaligned pixels into our vectors. The vectors are as follows:
67+ pix1v: pix1[0]-pix1[15]
68+ pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
69+ */
70+ tv = (vector unsigned char *) pix1;
71+ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
72+
73+ tv = (vector unsigned char *) &pix2[0];
74+ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
75+
76+ tv = (vector unsigned char *) &pix2[1];
77+ pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
78+
79+ /* Calculate the average vector */
80+ avgv = vec_avg(pix2v, pix2iv);
81+
82+ /* Calculate a sum of abs differences vector */
83+ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
84+
85+ /* Add each 4 pixel group together and put 4 results into sad */
86+ sad = vec_sum4s(t5, sad);
87+
88+ pix1 += line_size;
89+ pix2 += line_size;
90+ }
91+ /* Sum up the four partial sums, and put the result into s */
92+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
93+ sumdiffs = vec_splat(sumdiffs, 3);
94+ vec_ste(sumdiffs, 0, &s);
95+
96+ return s;
97+}
98+
99+int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
100+{
101+ int i;
102+ int s __attribute__((aligned(16)));
103+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
104+ vector unsigned char *tv;
105+ vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
106+ vector unsigned int sad;
107+ vector signed int sumdiffs;
108+ uint8_t *pix3 = pix2 + line_size;
109+
110+ s = 0;
111+ sad = (vector unsigned int)vec_splat_u32(0);
112+
113+ /*
114+      Because pix3 = pix2 + line_size, the pix3 of one iteration
115+      becomes pix2 in the next iteration. We can use this fact
116+      to avoid a potentially expensive unaligned read each
117+      time around the loop.
118+ Read unaligned pixels into our vectors. The vectors are as follows:
119+ pix2v: pix2[0]-pix2[15]
120+ Split the pixel vectors into shorts
121+ */
122+ tv = (vector unsigned char *) &pix2[0];
123+ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
124+
125+ for(i=0;i<16;i++) {
126+ /*
127+ Read unaligned pixels into our vectors. The vectors are as follows:
128+ pix1v: pix1[0]-pix1[15]
129+ pix3v: pix3[0]-pix3[15]
130+ */
131+ tv = (vector unsigned char *) pix1;
132+ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
133+
134+ tv = (vector unsigned char *) &pix3[0];
135+ pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
136+
137+ /* Calculate the average vector */
138+ avgv = vec_avg(pix2v, pix3v);
139+
140+ /* Calculate a sum of abs differences vector */
141+ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
142+
143+ /* Add each 4 pixel group together and put 4 results into sad */
144+ sad = vec_sum4s(t5, sad);
145+
146+ pix1 += line_size;
147+ pix2v = pix3v;
148+ pix3 += line_size;
149+
150+ }
151+
152+ /* Sum up the four partial sums, and put the result into s */
153+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
154+ sumdiffs = vec_splat(sumdiffs, 3);
155+ vec_ste(sumdiffs, 0, &s);
156+ return s;
157+}
158+
159+int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
160+{
161+ int i;
162+ int s __attribute__((aligned(16)));
163+ uint8_t *pix3 = pix2 + line_size;
164+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
165+ const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
166+ vector unsigned char *tv, avgv, t5;
167+ vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
168+ vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
169+ vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
170+ vector unsigned short avghv, avglv;
171+ vector unsigned short t1, t2, t3, t4;
172+ vector unsigned int sad;
173+ vector signed int sumdiffs;
174+
175+ sad = (vector unsigned int)vec_splat_u32(0);
176+
177+ s = 0;
178+
179+ /*
180+      Because pix3 = pix2 + line_size, the pix3 of one iteration
181+      becomes pix2 in the next iteration. We can use this fact
182+      to avoid a potentially expensive unaligned read, as well
183+      as some splitting and vector addition, each time around the loop.
184+ Read unaligned pixels into our vectors. The vectors are as follows:
185+ pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
186+ Split the pixel vectors into shorts
187+ */
188+ tv = (vector unsigned char *) &pix2[0];
189+ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
190+
191+ tv = (vector unsigned char *) &pix2[1];
192+ pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
193+
194+ pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
195+ pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
196+ pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
197+ pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
198+ t1 = vec_add(pix2hv, pix2ihv);
199+ t2 = vec_add(pix2lv, pix2ilv);
200+
201+ for(i=0;i<16;i++) {
202+ /*
203+ Read unaligned pixels into our vectors. The vectors are as follows:
204+ pix1v: pix1[0]-pix1[15]
205+ pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
206+ */
207+ tv = (vector unsigned char *) pix1;
208+ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
209+
210+ tv = (vector unsigned char *) &pix3[0];
211+ pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
212+
213+ tv = (vector unsigned char *) &pix3[1];
214+ pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
215+
216+ /*
217+ Note that Altivec does have vec_avg, but this works on vector pairs
218+ and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
219+ would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
220+ Instead, we have to split the pixel vectors into vectors of shorts,
221+ and do the averaging by hand.
222+ */
223+
224+ /* Split the pixel vectors into shorts */
225+ pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
226+ pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
227+ pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
228+ pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
229+
230+ /* Do the averaging on them */
231+ t3 = vec_add(pix3hv, pix3ihv);
232+ t4 = vec_add(pix3lv, pix3ilv);
233+
234+ avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
235+ avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
236+
237+ /* Pack the shorts back into a result */
238+ avgv = vec_pack(avghv, avglv);
239+
240+ /* Calculate a sum of abs differences vector */
241+ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
242+
243+ /* Add each 4 pixel group together and put 4 results into sad */
244+ sad = vec_sum4s(t5, sad);
245+
246+ pix1 += line_size;
247+ pix3 += line_size;
248+ /* Transfer the calculated values for pix3 into pix2 */
249+ t1 = t3;
250+ t2 = t4;
251+ }
252+ /* Sum up the four partial sums, and put the result into s */
253+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
254+ sumdiffs = vec_splat(sumdiffs, 3);
255+ vec_ste(sumdiffs, 0, &s);
256+
257+ return s;
258+}
259+
260+int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
261+{
262+ int i;
263+ int s __attribute__((aligned(16)));
264+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
265+ vector unsigned char perm1, perm2, *pix1v, *pix2v;
266+ vector unsigned char t1, t2, t3,t4, t5;
267+ vector unsigned int sad;
268+ vector signed int sumdiffs;
269+
270+ sad = (vector unsigned int)vec_splat_u32(0);
271+
272+
273+ for(i=0;i<16;i++) {
274+ /* Read potentially unaligned pixels into t1 and t2 */
275+ perm1 = vec_lvsl(0, pix1);
276+ pix1v = (vector unsigned char *) pix1;
277+ perm2 = vec_lvsl(0, pix2);
278+ pix2v = (vector unsigned char *) pix2;
279+ t1 = vec_perm(pix1v[0], pix1v[1], perm1);
280+ t2 = vec_perm(pix2v[0], pix2v[1], perm2);
281+
282+ /* Calculate a sum of abs differences vector */
283+ t3 = vec_max(t1, t2);
284+ t4 = vec_min(t1, t2);
285+ t5 = vec_sub(t3, t4);
286+
287+ /* Add each 4 pixel group together and put 4 results into sad */
288+ sad = vec_sum4s(t5, sad);
289+
290+ pix1 += line_size;
291+ pix2 += line_size;
292+ }
293+
294+ /* Sum up the four partial sums, and put the result into s */
295+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
296+ sumdiffs = vec_splat(sumdiffs, 3);
297+ vec_ste(sumdiffs, 0, &s);
298+
299+ return s;
300+}
301+
302+int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
303+{
304+ int i;
305+ int s __attribute__((aligned(16)));
306+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
307+ vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
308+ vector unsigned char t1, t2, t3,t4, t5;
309+ vector unsigned int sad;
310+ vector signed int sumdiffs;
311+
312+ sad = (vector unsigned int)vec_splat_u32(0);
313+
314+ permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
315+
316+ for(i=0;i<8;i++) {
317+ /* Read potentially unaligned pixels into t1 and t2
318+ Since we're reading 16 pixels, and actually only want 8,
319+ mask out the last 8 pixels. The 0s don't change the sum. */
320+ perm1 = vec_lvsl(0, pix1);
321+ pix1v = (vector unsigned char *) pix1;
322+ perm2 = vec_lvsl(0, pix2);
323+ pix2v = (vector unsigned char *) pix2;
324+ t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
325+ t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
326+
327+ /* Calculate a sum of abs differences vector */
328+ t3 = vec_max(t1, t2);
329+ t4 = vec_min(t1, t2);
330+ t5 = vec_sub(t3, t4);
331+
332+ /* Add each 4 pixel group together and put 4 results into sad */
333+ sad = vec_sum4s(t5, sad);
334+
335+ pix1 += line_size;
336+ pix2 += line_size;
337+ }
338+
339+ /* Sum up the four partial sums, and put the result into s */
340+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
341+ sumdiffs = vec_splat(sumdiffs, 3);
342+ vec_ste(sumdiffs, 0, &s);
343+
344+ return s;
345+}
346+
347+int pix_norm1_altivec(uint8_t *pix, int line_size)
348+{
349+ int i;
350+ int s __attribute__((aligned(16)));
351+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
352+ vector unsigned char *tv;
353+ vector unsigned char pixv;
354+ vector unsigned int sv;
355+ vector signed int sum;
356+
357+ sv = (vector unsigned int)vec_splat_u32(0);
358+
359+ s = 0;
360+ for (i = 0; i < 16; i++) {
361+ /* Read in the potentially unaligned pixels */
362+ tv = (vector unsigned char *) pix;
363+ pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
364+
365+ /* Square the values, and add them to our sum */
366+ sv = vec_msum(pixv, pixv, sv);
367+
368+ pix += line_size;
369+ }
370+ /* Sum up the four partial sums, and put the result into s */
371+ sum = vec_sums((vector signed int) sv, (vector signed int) zero);
372+ sum = vec_splat(sum, 3);
373+ vec_ste(sum, 0, &s);
374+
375+ return s;
376+}
377+
378+/**
379+ * Sum of Squared Errors for an 8x8 block.
380+ * AltiVec-enhanced.
381+ * It's the pix_abs8x8_altivec code above w/ squaring added.
382+ */
383+int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
384+{
385+ int i;
386+ int s __attribute__((aligned(16)));
387+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
388+ vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
389+ vector unsigned char t1, t2, t3,t4, t5;
390+ vector unsigned int sum;
391+ vector signed int sumsqr;
392+
393+ sum = (vector unsigned int)vec_splat_u32(0);
394+
395+ permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
396+
397+
398+ for(i=0;i<8;i++) {
399+ /* Read potentially unaligned pixels into t1 and t2
400+ Since we're reading 16 pixels, and actually only want 8,
401+ mask out the last 8 pixels. The 0s don't change the sum. */
402+ perm1 = vec_lvsl(0, pix1);
403+ pix1v = (vector unsigned char *) pix1;
404+ perm2 = vec_lvsl(0, pix2);
405+ pix2v = (vector unsigned char *) pix2;
406+ t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
407+ t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
408+
409+ /*
410+ Since we want to use unsigned chars, we can take advantage
411+ of the fact that abs(a-b)^2 = (a-b)^2.
412+ */
413+
414+ /* Calculate abs differences vector */
415+ t3 = vec_max(t1, t2);
416+ t4 = vec_min(t1, t2);
417+ t5 = vec_sub(t3, t4);
418+
419+ /* Square the values and add them to our sum */
420+ sum = vec_msum(t5, t5, sum);
421+
422+ pix1 += line_size;
423+ pix2 += line_size;
424+ }
425+
426+ /* Sum up the four partial sums, and put the result into s */
427+ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
428+ sumsqr = vec_splat(sumsqr, 3);
429+ vec_ste(sumsqr, 0, &s);
430+
431+ return s;
432+}
433+
434+/**
435+ * Sum of Squared Errors for a 16x16 block.
436+ * AltiVec-enhanced.
437+ * It's the pix_abs16x16_altivec code above w/ squaring added.
438+ */
439+int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
440+{
441+ int i;
442+ int s __attribute__((aligned(16)));
443+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
444+ vector unsigned char perm1, perm2, *pix1v, *pix2v;
445+ vector unsigned char t1, t2, t3,t4, t5;
446+ vector unsigned int sum;
447+ vector signed int sumsqr;
448+
449+ sum = (vector unsigned int)vec_splat_u32(0);
450+
451+ for(i=0;i<16;i++) {
452+ /* Read potentially unaligned pixels into t1 and t2 */
453+ perm1 = vec_lvsl(0, pix1);
454+ pix1v = (vector unsigned char *) pix1;
455+ perm2 = vec_lvsl(0, pix2);
456+ pix2v = (vector unsigned char *) pix2;
457+ t1 = vec_perm(pix1v[0], pix1v[1], perm1);
458+ t2 = vec_perm(pix2v[0], pix2v[1], perm2);
459+
460+ /*
461+ Since we want to use unsigned chars, we can take advantage
462+ of the fact that abs(a-b)^2 = (a-b)^2.
463+ */
464+
465+ /* Calculate abs differences vector */
466+ t3 = vec_max(t1, t2);
467+ t4 = vec_min(t1, t2);
468+ t5 = vec_sub(t3, t4);
469+
470+ /* Square the values and add them to our sum */
471+ sum = vec_msum(t5, t5, sum);
472+
473+ pix1 += line_size;
474+ pix2 += line_size;
475+ }
476+
477+ /* Sum up the four partial sums, and put the result into s */
478+ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
479+ sumsqr = vec_splat(sumsqr, 3);
480+ vec_ste(sumsqr, 0, &s);
481+
482+ return s;
483+}
484+
485+int pix_sum_altivec(uint8_t * pix, int line_size)
486+{
487+ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
488+ vector unsigned char perm, *pixv;
489+ vector unsigned char t1;
490+ vector unsigned int sad;
491+ vector signed int sumdiffs;
492+
493+ int i;
494+ int s __attribute__((aligned(16)));
495+
496+ sad = (vector unsigned int)vec_splat_u32(0);
497+
498+ for (i = 0; i < 16; i++) {
499+ /* Read the potentially unaligned 16 pixels into t1 */
500+ perm = vec_lvsl(0, pix);
501+ pixv = (vector unsigned char *) pix;
502+ t1 = vec_perm(pixv[0], pixv[1], perm);
503+
504+ /* Add each 4 pixel group together and put 4 results into sad */
505+ sad = vec_sum4s(t1, sad);
506+
507+ pix += line_size;
508+ }
509+
510+ /* Sum up the four partial sums, and put the result into s */
511+ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
512+ sumdiffs = vec_splat(sumdiffs, 3);
513+ vec_ste(sumdiffs, 0, &s);
514+
515+ return s;
516+}
517+
518+void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
519+{
520+ int i;
521+ vector unsigned char perm, bytes, *pixv;
522+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
523+ vector signed short shorts;
524+
525+ for(i=0;i<8;i++)
526+ {
527+ // Read potentially unaligned pixels.
528+ // We're reading 16 pixels, and actually only want 8,
529+ // but we simply ignore the extras.
530+ perm = vec_lvsl(0, pixels);
531+ pixv = (vector unsigned char *) pixels;
532+ bytes = vec_perm(pixv[0], pixv[1], perm);
533+
534+ // convert the bytes into shorts
535+ shorts = (vector signed short)vec_mergeh(zero, bytes);
536+
537+ // save the data to the block, we assume the block is 16-byte aligned
538+ vec_st(shorts, i*16, (vector signed short*)block);
539+
540+ pixels += line_size;
541+ }
542+}
543+
544+void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
545+ const uint8_t *s2, int stride)
546+{
547+ int i;
548+ vector unsigned char perm, bytes, *pixv;
549+ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
550+ vector signed short shorts1, shorts2;
551+
552+ for(i=0;i<4;i++)
553+ {
554+ // Read potentially unaligned pixels
555+ // We're reading 16 pixels, and actually only want 8,
556+ // but we simply ignore the extras.
557+ perm = vec_lvsl(0, s1);
558+ pixv = (vector unsigned char *) s1;
559+ bytes = vec_perm(pixv[0], pixv[1], perm);
560+
561+ // convert the bytes into shorts
562+ shorts1 = (vector signed short)vec_mergeh(zero, bytes);
563+
564+ // Do the same for the second block of pixels
565+ perm = vec_lvsl(0, s2);
566+ pixv = (vector unsigned char *) s2;
567+ bytes = vec_perm(pixv[0], pixv[1], perm);
568+
569+ // convert the bytes into shorts
570+ shorts2 = (vector signed short)vec_mergeh(zero, bytes);
571+
572+ // Do the subtraction
573+ shorts1 = vec_sub(shorts1, shorts2);
574+
575+ // save the data to the block, we assume the block is 16-byte aligned
576+ vec_st(shorts1, 0, (vector signed short*)block);
577+
578+ s1 += stride;
579+ s2 += stride;
580+ block += 8;
581+
582+
583+ // The code below is a copy of the code above... This is a manual
584+ // unroll.
585+
586+ // Read potentially unaligned pixels
587+ // We're reading 16 pixels, and actually only want 8,
588+ // but we simply ignore the extras.
589+ perm = vec_lvsl(0, s1);
590+ pixv = (vector unsigned char *) s1;
591+ bytes = vec_perm(pixv[0], pixv[1], perm);
592+
593+ // convert the bytes into shorts
594+ shorts1 = (vector signed short)vec_mergeh(zero, bytes);
595+
596+ // Do the same for the second block of pixels
597+ perm = vec_lvsl(0, s2);
598+ pixv = (vector unsigned char *) s2;
599+ bytes = vec_perm(pixv[0], pixv[1], perm);
600+
601+ // convert the bytes into shorts
602+ shorts2 = (vector signed short)vec_mergeh(zero, bytes);
603+
604+ // Do the subtraction
605+ shorts1 = vec_sub(shorts1, shorts2);
606+
607+ // save the data to the block, we assume the block is 16-byte aligned
608+ vec_st(shorts1, 0, (vector signed short*)block);
609+
610+ s1 += stride;
611+ s2 += stride;
612+ block += 8;
613+ }
614+}
615+
616+int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
617+ return pix_abs16x16_altivec(a,b,stride);
618+}
619+
620+int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
621+ return pix_abs8x8_altivec(a,b,stride);
622+}
623+
624+void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
625+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
626+ int i;
627+ for(i=0; i+7<w; i++){
628+ dst[i+0] += src[i+0];
629+ dst[i+1] += src[i+1];
630+ dst[i+2] += src[i+2];
631+ dst[i+3] += src[i+3];
632+ dst[i+4] += src[i+4];
633+ dst[i+5] += src[i+5];
634+ dst[i+6] += src[i+6];
635+ dst[i+7] += src[i+7];
636+ }
637+ for(; i<w; i++)
638+ dst[i+0] += src[i+0];
639+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
640+ register int i;
641+ register vector unsigned char vdst, vsrc;
642+
643+    /* dst and src are 16-byte aligned (guaranteed) */
644+    for(i = 0 ; (i + 15) < w ; i += 16)
645+    {
646+      vdst = vec_ld(i, (unsigned char*)dst);
647+      vsrc = vec_ld(i, (unsigned char*)src);
648+      vdst = vec_add(vsrc, vdst);
649+      vec_st(vdst, i, (unsigned char*)dst);
650+    }
651+    /* if w is not a multiple of 16 */
652+    for (; (i < w) ; i++)
653+    {
654+      dst[i] += src[i];
655+ }
656+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
657+}
658+
659+/* next one assumes that ((line_size % 16) == 0) */
660+void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
661+{
662+POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
663+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
664+ int i;
665+
666+POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
667+
668+ for(i=0; i<h; i++) {
669+ *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
670+ *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
671+ *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
672+ *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
673+ pixels+=line_size;
674+ block +=line_size;
675+ }
676+
677+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
678+
679+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
680+ register vector unsigned char pixelsv1, pixelsv2;
681+ register vector unsigned char pixelsv1B, pixelsv2B;
682+ register vector unsigned char pixelsv1C, pixelsv2C;
683+ register vector unsigned char pixelsv1D, pixelsv2D;
684+
685+ register vector unsigned char perm = vec_lvsl(0, pixels);
686+ int i;
687+ register int line_size_2 = line_size << 1;
688+ register int line_size_3 = line_size + line_size_2;
689+ register int line_size_4 = line_size << 2;
690+
691+POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
692+// hand-unrolling the loop by 4 gains about 15%
693+// minimum execution time goes from 74 to 60 cycles
694+// it's faster than -funroll-loops, but using
695+// -funroll-loops w/ this is bad - 74 cycles again.
696+// all this is on a 7450, tuning for the 7450
697+#if 0
698+ for(i=0; i<h; i++) {
699+ pixelsv1 = vec_ld(0, (unsigned char*)pixels);
700+ pixelsv2 = vec_ld(16, (unsigned char*)pixels);
701+ vec_st(vec_perm(pixelsv1, pixelsv2, perm),
702+ 0, (unsigned char*)block);
703+ pixels+=line_size;
704+ block +=line_size;
705+ }
706+#else
707+ for(i=0; i<h; i+=4) {
708+ pixelsv1 = vec_ld(0, (unsigned char*)pixels);
709+ pixelsv2 = vec_ld(16, (unsigned char*)pixels);
710+ pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
711+ pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
712+ pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
713+ pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
714+ pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
715+ pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
716+ vec_st(vec_perm(pixelsv1, pixelsv2, perm),
717+ 0, (unsigned char*)block);
718+ vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
719+ line_size, (unsigned char*)block);
720+ vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
721+ line_size_2, (unsigned char*)block);
722+ vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
723+ line_size_3, (unsigned char*)block);
724+ pixels+=line_size_4;
725+ block +=line_size_4;
726+ }
727+#endif
728+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
729+
730+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
731+}
732+
733+/* next one assumes that ((line_size % 16) == 0) */
734+#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
735+void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
736+{
737+POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
738+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
739+ int i;
740+
741+POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
742+
743+ for(i=0; i<h; i++) {
744+ op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
745+ op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
746+ op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
747+ op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
748+ pixels+=line_size;
749+ block +=line_size;
750+ }
751+
752+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
753+
754+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
755+ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
756+ register vector unsigned char perm = vec_lvsl(0, pixels);
757+ int i;
758+
759+POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
760+
761+ for(i=0; i<h; i++) {
762+ pixelsv1 = vec_ld(0, (unsigned char*)pixels);
763+ pixelsv2 = vec_ld(16, (unsigned char*)pixels);
764+ blockv = vec_ld(0, block);
765+ pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
766+ blockv = vec_avg(blockv,pixelsv);
767+ vec_st(blockv, 0, (unsigned char*)block);
768+ pixels+=line_size;
769+ block +=line_size;
770+ }
771+
772+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
773+
774+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
775+}
776+
777+/* next one assumes that ((line_size % 8) == 0) */
778+void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
779+{
780+POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
781+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
782+ int i;
783+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
784+ for (i = 0; i < h; i++) {
785+ *((uint32_t *) (block)) =
786+ (((*((uint32_t *) (block))) |
787+ ((((const struct unaligned_32 *) (pixels))->l))) -
788+ ((((*((uint32_t *) (block))) ^
789+ ((((const struct unaligned_32 *) (pixels))->
790+ l))) & 0xFEFEFEFEUL) >> 1));
791+ *((uint32_t *) (block + 4)) =
792+ (((*((uint32_t *) (block + 4))) |
793+ ((((const struct unaligned_32 *) (pixels + 4))->l))) -
794+ ((((*((uint32_t *) (block + 4))) ^
795+ ((((const struct unaligned_32 *) (pixels +
796+ 4))->
797+ l))) & 0xFEFEFEFEUL) >> 1));
798+ pixels += line_size;
799+ block += line_size;
800+ }
801+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
802+
803+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
804+ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
805+ int i;
806+
807+POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
808+
809+ for (i = 0; i < h; i++) {
810+ /*
811+       block is 8-byte aligned, so we're either in the
812+       left block (16-byte aligned) or in the right block (not)
813+ */
814+ int rightside = ((unsigned long)block & 0x0000000F);
815+
816+ blockv = vec_ld(0, block);
817+ pixelsv1 = vec_ld(0, (unsigned char*)pixels);
818+ pixelsv2 = vec_ld(16, (unsigned char*)pixels);
819+ pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
820+
821+ if (rightside)
822+ {
823+ pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
824+ }
825+ else
826+ {
827+ pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
828+ }
829+
830+ blockv = vec_avg(blockv, pixelsv);
831+
832+ vec_st(blockv, 0, block);
833+
834+ pixels += line_size;
835+ block += line_size;
836+ }
837+
838+POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
839+
840+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
841+}
842+
843+/* next one assumes that ((line_size % 8) == 0) */
844+void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
845+{
846+POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
847+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
848+ int j;
849+POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
850+ for (j = 0; j < 2; j++) {
851+ int i;
852+ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
853+ const uint32_t b =
854+ (((const struct unaligned_32 *) (pixels + 1))->l);
855+ uint32_t l0 =
856+ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
857+ uint32_t h0 =
858+ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
859+ uint32_t l1, h1;
860+ pixels += line_size;
861+ for (i = 0; i < h; i += 2) {
862+ uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
863+ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
864+ l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
865+ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
866+ *((uint32_t *) block) =
867+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
868+ pixels += line_size;
869+ block += line_size;
870+ a = (((const struct unaligned_32 *) (pixels))->l);
871+ b = (((const struct unaligned_32 *) (pixels + 1))->l);
872+ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
873+ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
874+ *((uint32_t *) block) =
875+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
876+ pixels += line_size;
877+ block += line_size;
878+ } pixels += 4 - line_size * (h + 1);
879+ block += 4 - line_size * h;
880+ }
881+
882+POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
883+
884+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
885+ register int i;
886+ register vector unsigned char
887+ pixelsv1, pixelsv2,
888+ pixelsavg;
889+ register vector unsigned char
890+ blockv, temp1, temp2;
891+ register vector unsigned short
892+ pixelssum1, pixelssum2, temp3;
893+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
894+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
895+
896+ temp1 = vec_ld(0, pixels);
897+ temp2 = vec_ld(16, pixels);
898+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
899+ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
900+ {
901+ pixelsv2 = temp2;
902+ }
903+ else
904+ {
905+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
906+ }
907+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
908+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
909+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
910+ (vector unsigned short)pixelsv2);
911+ pixelssum1 = vec_add(pixelssum1, vctwo);
912+
913+POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
914+ for (i = 0; i < h ; i++) {
915+ int rightside = ((unsigned long)block & 0x0000000F);
916+ blockv = vec_ld(0, block);
917+
918+ temp1 = vec_ld(line_size, pixels);
919+ temp2 = vec_ld(line_size + 16, pixels);
920+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
921+ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
922+ {
923+ pixelsv2 = temp2;
924+ }
925+ else
926+ {
927+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
928+ }
929+
930+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
931+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
932+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
933+ (vector unsigned short)pixelsv2);
934+ temp3 = vec_add(pixelssum1, pixelssum2);
935+ temp3 = vec_sra(temp3, vctwo);
936+ pixelssum1 = vec_add(pixelssum2, vctwo);
937+ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
938+
939+ if (rightside)
940+ {
941+ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
942+ }
943+ else
944+ {
945+ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
946+ }
947+
948+ vec_st(blockv, 0, block);
949+
950+ block += line_size;
951+ pixels += line_size;
952+ }
953+
954+POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
955+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
956+}
957+
958+/* next one assumes that ((line_size % 8) == 0) */
959+void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
960+{
961+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
962+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
963+ int j;
964+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
965+ for (j = 0; j < 2; j++) {
966+ int i;
967+ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
968+ const uint32_t b =
969+ (((const struct unaligned_32 *) (pixels + 1))->l);
970+ uint32_t l0 =
971+ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
972+ uint32_t h0 =
973+ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
974+ uint32_t l1, h1;
975+ pixels += line_size;
976+ for (i = 0; i < h; i += 2) {
977+ uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
978+ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
979+ l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
980+ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
981+ *((uint32_t *) block) =
982+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
983+ pixels += line_size;
984+ block += line_size;
985+ a = (((const struct unaligned_32 *) (pixels))->l);
986+ b = (((const struct unaligned_32 *) (pixels + 1))->l);
987+ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
988+ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
989+ *((uint32_t *) block) =
990+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
991+ pixels += line_size;
992+ block += line_size;
993+ } pixels += 4 - line_size * (h + 1);
994+ block += 4 - line_size * h;
995+ }
996+
997+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
998+
999+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1000+ register int i;
1001+ register vector unsigned char
1002+ pixelsv1, pixelsv2,
1003+ pixelsavg;
1004+ register vector unsigned char
1005+ blockv, temp1, temp2;
1006+ register vector unsigned short
1007+ pixelssum1, pixelssum2, temp3;
1008+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1009+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1010+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1011+
1012+ temp1 = vec_ld(0, pixels);
1013+ temp2 = vec_ld(16, pixels);
1014+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1015+ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1016+ {
1017+ pixelsv2 = temp2;
1018+ }
1019+ else
1020+ {
1021+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1022+ }
1023+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1024+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1025+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1026+ (vector unsigned short)pixelsv2);
1027+ pixelssum1 = vec_add(pixelssum1, vcone);
1028+
1029+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1030+ for (i = 0; i < h ; i++) {
1031+ int rightside = ((unsigned long)block & 0x0000000F);
1032+ blockv = vec_ld(0, block);
1033+
1034+ temp1 = vec_ld(line_size, pixels);
1035+ temp2 = vec_ld(line_size + 16, pixels);
1036+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1037+ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1038+ {
1039+ pixelsv2 = temp2;
1040+ }
1041+ else
1042+ {
1043+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1044+ }
1045+
1046+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1047+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1048+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1049+ (vector unsigned short)pixelsv2);
1050+ temp3 = vec_add(pixelssum1, pixelssum2);
1051+ temp3 = vec_sra(temp3, vctwo);
1052+ pixelssum1 = vec_add(pixelssum2, vcone);
1053+ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
1054+
1055+ if (rightside)
1056+ {
1057+ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
1058+ }
1059+ else
1060+ {
1061+ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
1062+ }
1063+
1064+ vec_st(blockv, 0, block);
1065+
1066+ block += line_size;
1067+ pixels += line_size;
1068+ }
1069+
1070+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1071+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1072+}
1073+
1074+/* next one assumes that ((line_size % 16) == 0) */
1075+void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1076+{
1077+POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1078+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1079+ int j;
1080+POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1081+ for (j = 0; j < 4; j++) {
1082+ int i;
1083+ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1084+ const uint32_t b =
1085+ (((const struct unaligned_32 *) (pixels + 1))->l);
1086+ uint32_t l0 =
1087+ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1088+ uint32_t h0 =
1089+ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1090+ uint32_t l1, h1;
1091+ pixels += line_size;
1092+ for (i = 0; i < h; i += 2) {
1093+ uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1094+ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1095+ l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1096+ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1097+ *((uint32_t *) block) =
1098+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1099+ pixels += line_size;
1100+ block += line_size;
1101+ a = (((const struct unaligned_32 *) (pixels))->l);
1102+ b = (((const struct unaligned_32 *) (pixels + 1))->l);
1103+ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1104+ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1105+ *((uint32_t *) block) =
1106+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1107+ pixels += line_size;
1108+ block += line_size;
1109+ } pixels += 4 - line_size * (h + 1);
1110+ block += 4 - line_size * h;
1111+ }
1112+
1113+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1114+
1115+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1116+ register int i;
1117+ register vector unsigned char
1118+ pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1119+ register vector unsigned char
1120+ blockv, temp1, temp2;
1121+ register vector unsigned short
1122+ pixelssum1, pixelssum2, temp3,
1123+ pixelssum3, pixelssum4, temp4;
1124+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1125+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1126+
1127+POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1128+
1129+ temp1 = vec_ld(0, pixels);
1130+ temp2 = vec_ld(16, pixels);
1131+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1132+ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1133+ {
1134+ pixelsv2 = temp2;
1135+ }
1136+ else
1137+ {
1138+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1139+ }
1140+ pixelsv3 = vec_mergel(vczero, pixelsv1);
1141+ pixelsv4 = vec_mergel(vczero, pixelsv2);
1142+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1143+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1144+ pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1145+ (vector unsigned short)pixelsv4);
1146+ pixelssum3 = vec_add(pixelssum3, vctwo);
1147+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1148+ (vector unsigned short)pixelsv2);
1149+ pixelssum1 = vec_add(pixelssum1, vctwo);
1150+
1151+ for (i = 0; i < h ; i++) {
1152+ blockv = vec_ld(0, block);
1153+
1154+ temp1 = vec_ld(line_size, pixels);
1155+ temp2 = vec_ld(line_size + 16, pixels);
1156+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1157+ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1158+ {
1159+ pixelsv2 = temp2;
1160+ }
1161+ else
1162+ {
1163+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1164+ }
1165+
1166+ pixelsv3 = vec_mergel(vczero, pixelsv1);
1167+ pixelsv4 = vec_mergel(vczero, pixelsv2);
1168+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1169+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1170+
1171+ pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1172+ (vector unsigned short)pixelsv4);
1173+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1174+ (vector unsigned short)pixelsv2);
1175+ temp4 = vec_add(pixelssum3, pixelssum4);
1176+ temp4 = vec_sra(temp4, vctwo);
1177+ temp3 = vec_add(pixelssum1, pixelssum2);
1178+ temp3 = vec_sra(temp3, vctwo);
1179+
1180+ pixelssum3 = vec_add(pixelssum4, vctwo);
1181+ pixelssum1 = vec_add(pixelssum2, vctwo);
1182+
1183+ blockv = vec_packsu(temp3, temp4);
1184+
1185+ vec_st(blockv, 0, block);
1186+
1187+ block += line_size;
1188+ pixels += line_size;
1189+ }
1190+
1191+POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1192+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1193+}
1194+
1195+/* next one assumes that ((line_size % 16) == 0) */
1196+void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1197+{
1198+POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1199+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1200+ int j;
1201+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1202+ for (j = 0; j < 4; j++) {
1203+ int i;
1204+ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1205+ const uint32_t b =
1206+ (((const struct unaligned_32 *) (pixels + 1))->l);
1207+ uint32_t l0 =
1208+ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1209+ uint32_t h0 =
1210+ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1211+ uint32_t l1, h1;
1212+ pixels += line_size;
1213+ for (i = 0; i < h; i += 2) {
1214+ uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1215+ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1216+ l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1217+ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1218+ *((uint32_t *) block) =
1219+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1220+ pixels += line_size;
1221+ block += line_size;
1222+ a = (((const struct unaligned_32 *) (pixels))->l);
1223+ b = (((const struct unaligned_32 *) (pixels + 1))->l);
1224+ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1225+ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1226+ *((uint32_t *) block) =
1227+ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1228+ pixels += line_size;
1229+ block += line_size;
1230+ } pixels += 4 - line_size * (h + 1);
1231+ block += 4 - line_size * h;
1232+ }
1233+
1234+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1235+
1236+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1237+ register int i;
1238+ register vector unsigned char
1239+ pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1240+ register vector unsigned char
1241+ blockv, temp1, temp2;
1242+ register vector unsigned short
1243+ pixelssum1, pixelssum2, temp3,
1244+ pixelssum3, pixelssum4, temp4;
1245+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1246+ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
1247+ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1248+
1249+POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1250+
1251+ temp1 = vec_ld(0, pixels);
1252+ temp2 = vec_ld(16, pixels);
1253+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1254+ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
1255+ {
1256+ pixelsv2 = temp2;
1257+ }
1258+ else
1259+ {
1260+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1261+ }
1262+ pixelsv3 = vec_mergel(vczero, pixelsv1);
1263+ pixelsv4 = vec_mergel(vczero, pixelsv2);
1264+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1265+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1266+ pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1267+ (vector unsigned short)pixelsv4);
1268+ pixelssum3 = vec_add(pixelssum3, vcone);
1269+ pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1270+ (vector unsigned short)pixelsv2);
1271+ pixelssum1 = vec_add(pixelssum1, vcone);
1272+
1273+ for (i = 0; i < h ; i++) {
1274+ blockv = vec_ld(0, block);
1275+
1276+ temp1 = vec_ld(line_size, pixels);
1277+ temp2 = vec_ld(line_size + 16, pixels);
1278+ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1279+ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
1280+ {
1281+ pixelsv2 = temp2;
1282+ }
1283+ else
1284+ {
1285+ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1286+ }
1287+
1288+ pixelsv3 = vec_mergel(vczero, pixelsv1);
1289+ pixelsv4 = vec_mergel(vczero, pixelsv2);
1290+ pixelsv1 = vec_mergeh(vczero, pixelsv1);
1291+ pixelsv2 = vec_mergeh(vczero, pixelsv2);
1292+
1293+ pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1294+ (vector unsigned short)pixelsv4);
1295+ pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1296+ (vector unsigned short)pixelsv2);
1297+ temp4 = vec_add(pixelssum3, pixelssum4);
1298+ temp4 = vec_sra(temp4, vctwo);
1299+ temp3 = vec_add(pixelssum1, pixelssum2);
1300+ temp3 = vec_sra(temp3, vctwo);
1301+
1302+ pixelssum3 = vec_add(pixelssum4, vcone);
1303+ pixelssum1 = vec_add(pixelssum2, vcone);
1304+
1305+ blockv = vec_packsu(temp3, temp4);
1306+
1307+ vec_st(blockv, 0, block);
1308+
1309+ block += line_size;
1310+ pixels += line_size;
1311+ }
1312+
1313+POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1314+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1315+}
1316+
1317+int has_altivec(void)
1318+{
1319+#ifdef CONFIG_DARWIN
1320+ int sels[2] = {CTL_HW, HW_VECTORUNIT};
1321+ int has_vu = 0;
1322+ size_t len = sizeof(has_vu);
1323+ int err;
1324+
1325+ err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1326+
1327+ if (err == 0) return (has_vu != 0);
1328+#else /* CONFIG_DARWIN */
1329+/* not Darwin, so do it the brute-force way */
1330+/* this is borrowed from the libmpeg2 library */
1331+ {
1332+ signal (SIGILL, sigill_handler);
1333+ if (sigsetjmp (jmpbuf, 1)) {
1334+ signal (SIGILL, SIG_DFL);
1335+ } else {
1336+ canjump = 1;
1337+
1338+ asm volatile ("mtspr 256, %0\n\t"
1339+ "vand %%v0, %%v0, %%v0"
1340+ :
1341+ : "r" (-1));
1342+
1343+ signal (SIGILL, SIG_DFL);
1344+ return 1;
1345+ }
1346+ }
1347+#endif /* CONFIG_DARWIN */
1348+ return 0;
1349+}
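
All of the pix_abs*/sse*/pix_sum routines above share one reduction pattern: load 16 possibly unaligned bytes per row with vec_lvsl/vec_perm, form |a-b| as vec_sub(vec_max, vec_min), accumulate four partial sums with vec_sum4s (or vec_msum for the squared variants), and collapse them with vec_sums at the end. For reference, the following is a minimal scalar sketch of what pix_abs16x16_altivec computes; the name sad16x16_ref is hypothetical and shown only to clarify the semantics, it is not part of this patch.

    #include <stdint.h>
    #include <stdlib.h>

    /* scalar reference: sum of absolute differences over a 16x16 block,
       both planes walked with the same line_size stride */
    static int sad16x16_ref(const uint8_t *pix1, const uint8_t *pix2, int line_size)
    {
        int s = 0, i, j;
        for (i = 0; i < 16; i++) {
            for (j = 0; j < 16; j++)
                s += abs(pix1[j] - pix2[j]);
            pix1 += line_size;
            pix2 += line_size;
        }
        return s;
    }
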
1350diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c
1351--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c 1970-01-01 01:00:00.000000000 +0100
1352+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c 2003-09-28 17:26:40.000000000 +0200
1353@@ -0,0 +1,307 @@
1354+/*
1355+ * Copyright (c) 2002 Brian Foley
1356+ * Copyright (c) 2002 Dieter Shirley
1357+ *
1358+ * This library is free software; you can redistribute it and/or
1359+ * modify it under the terms of the GNU Lesser General Public
1360+ * License as published by the Free Software Foundation; either
1361+ * version 2 of the License, or (at your option) any later version.
1362+ *
1363+ * This library is distributed in the hope that it will be useful,
1364+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1365+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1366+ * Lesser General Public License for more details.
1367+ *
1368+ * You should have received a copy of the GNU Lesser General Public
1369+ * License along with this library; if not, write to the Free Software
1370+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1371+ */
1372+
1373+#include "../dsputil.h"
1374+
1375+#include "dsputil_ppc.h"
1376+
1377+#ifdef HAVE_ALTIVEC
1378+#include "dsputil_altivec.h"
1379+#endif
1380+
1381+extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
1382+extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
1383+
1384+int mm_flags = 0;
1385+
1386+int mm_support(void)
1387+{
1388+ int result = 0;
1389+#if HAVE_ALTIVEC
1390+ if (has_altivec()) {
1391+ result |= MM_ALTIVEC;
1392+ }
1393+#endif /* HAVE_ALTIVEC */
1394+ return result;
1395+}
1396+
1397+#ifdef POWERPC_PERFORMANCE_REPORT
1398+unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
1399+/* list below must match enum in dsputil_ppc.h */
1400+static unsigned char* perfname[] = {
1401+ "fft_calc_altivec",
1402+ "gmc1_altivec",
1403+ "dct_unquantize_h263_altivec",
1404+ "idct_add_altivec",
1405+ "idct_put_altivec",
1406+ "put_pixels16_altivec",
1407+ "avg_pixels16_altivec",
1408+ "avg_pixels8_altivec",
1409+ "put_pixels8_xy2_altivec",
1410+ "put_no_rnd_pixels8_xy2_altivec",
1411+ "put_pixels16_xy2_altivec",
1412+ "put_no_rnd_pixels16_xy2_altivec",
1413+ "clear_blocks_dcbz32_ppc",
1414+ "clear_blocks_dcbz128_ppc"
1415+};
1416+#include <stdio.h>
1417+#endif
1418+
1419+#ifdef POWERPC_PERFORMANCE_REPORT
1420+void powerpc_display_perf_report(void)
1421+{
1422+ int i, j;
1423+ fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
1424+ for(i = 0 ; i < powerpc_perf_total ; i++)
1425+ {
1426+ for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
1427+ {
1428+ if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0)
1429+ fprintf(stderr,
1430+ " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
1431+ perfname[i],
1432+ j+1,
1433+ perfdata[j][i][powerpc_data_min],
1434+ perfdata[j][i][powerpc_data_max],
1435+ (double)perfdata[j][i][powerpc_data_sum] /
1436+ (double)perfdata[j][i][powerpc_data_num],
1437+ perfdata[j][i][powerpc_data_num]);
1438+ }
1439+ }
1440+}
1441+#endif /* POWERPC_PERFORMANCE_REPORT */
1442+
1443+/* ***** WARNING ***** WARNING ***** WARNING ***** */
1444+/*
1445+ clear_blocks_dcbz32_ppc will not work properly
1446+ on PowerPC processors with a cache line size
1447+ not equal to 32 bytes.
1448+  Fortunately, all processors used by Apple up to
1449+  at least the 7450 (aka second-generation G4)
1450+  use 32-byte cache lines.
1451+  This is due to the use of the 'dcbz' instruction.
1452+  It simply clears a single cache line to zero,
1453+  so you need to know the cache line size to use it!
1454+  It's absurd, but it's fast...
1455+
1456+  update 24/06/2003: Apple released the G5 yesterday,
1457+  with a PPC970. Cache line size: 128 bytes. Oops.
1458+  The semantics of dcbz were changed; it now always clears
1459+  32 bytes, so the function below still works, but is
1460+  slow. So I fixed check_dcbz_effect to use dcbzl,
1461+  which is defined to clear one whole cache line (as dcbz did before).
1462+  That way we can still distinguish the two, and use dcbz (32 bytes)
1463+  or dcbzl (one cache line) as required.
1464+
1465+ see <http://developer.apple.com/technotes/tn/tn2087.html>
1466+ and <http://developer.apple.com/technotes/tn/tn2086.html>
1467+*/
1468+void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
1469+{
1470+POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
1471+ register int misal = ((unsigned long)blocks & 0x00000010);
1472+ register int i = 0;
1473+POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
1474+#if 1
1475+ if (misal) {
1476+ ((unsigned long*)blocks)[0] = 0L;
1477+ ((unsigned long*)blocks)[1] = 0L;
1478+ ((unsigned long*)blocks)[2] = 0L;
1479+ ((unsigned long*)blocks)[3] = 0L;
1480+ i += 16;
1481+ }
1482+ for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) {
1483+ asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
1484+ }
1485+ if (misal) {
1486+ ((unsigned long*)blocks)[188] = 0L;
1487+ ((unsigned long*)blocks)[189] = 0L;
1488+ ((unsigned long*)blocks)[190] = 0L;
1489+ ((unsigned long*)blocks)[191] = 0L;
1490+ i += 16;
1491+ }
1492+#else
1493+ memset(blocks, 0, sizeof(DCTELEM)*6*64);
1494+#endif
1495+POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
1496+}
1497+
1498+/* same as above, for when dcbzl clears a whole 128-byte cache line,
1499+   i.e. on the PPC970 aka G5 */
1500+#ifndef NO_DCBZL
1501+void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
1502+{
1503+POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
1504+ register int misal = ((unsigned long)blocks & 0x0000007f);
1505+ register int i = 0;
1506+POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
1507+#if 1
1508+ if (misal) {
1509+ // we could probably also optimize this case,
1510+ // but there's not much point as the machines
1511+ // aren't available yet (2003-06-26)
1512+ memset(blocks, 0, sizeof(DCTELEM)*6*64);
1513+ }
1514+ else
1515+ for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) {
1516+ asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
1517+ }
1518+#else
1519+ memset(blocks, 0, sizeof(DCTELEM)*6*64);
1520+#endif
1521+POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
1522+}
1523+#else
1524+void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
1525+{
1526+ memset(blocks, 0, sizeof(DCTELEM)*6*64);
1527+}
1528+#endif
1529+
1530+#ifndef NO_DCBZL
1531+/* check_dcbzl_effect: report how many bytes are set to 0 by one dcbzl */
1532+/* update 24/06/2003 : replaced dcbz by dcbzl to get
1533+   the intended effect (Apple "fixed" dcbz);
1534+   unfortunately this cannot be used unless the assembler
1535+   knows about dcbzl ... */
1536+long check_dcbzl_effect(void)
1537+{
1538+ register char *fakedata = (char*)av_malloc(1024);
1539+ register char *fakedata_middle;
1540+ register long zero = 0;
1541+ register long i = 0;
1542+ long count = 0;
1543+
1544+ if (!fakedata)
1545+ {
1546+ return 0L;
1547+ }
1548+
1549+ fakedata_middle = (fakedata + 512);
1550+
1551+ memset(fakedata, 0xFF, 1024);
1552+
1553+    /* below, the constraint "b" seems to mean "address base register"
1554+       in gcc-3.3 / RS/6000 speak; it seems to avoid using r0, so.... */
1555+ asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero));
1556+
1557+ for (i = 0; i < 1024 ; i ++)
1558+ {
1559+ if (fakedata[i] == (char)0)
1560+ count++;
1561+ }
1562+
1563+ av_free(fakedata);
1564+
1565+ return count;
1566+}
1567+#else
1568+long check_dcbzl_effect(void)
1569+{
1570+ return 0;
1571+}
1572+#endif
1573+
1574+void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
1575+{
1576+ // Common optimizations whether Altivec is available or not
1577+
1578+ switch (check_dcbzl_effect()) {
1579+ case 32:
1580+ c->clear_blocks = clear_blocks_dcbz32_ppc;
1581+ break;
1582+ case 128:
1583+ c->clear_blocks = clear_blocks_dcbz128_ppc;
1584+ break;
1585+ default:
1586+ break;
1587+ }
1588+
1589+#if HAVE_ALTIVEC
1590+ if (has_altivec()) {
1591+ mm_flags |= MM_ALTIVEC;
1592+
1593+ // Altivec specific optimisations
1594+ c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
1595+ c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
1596+ c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
1597+ c->pix_abs16x16 = pix_abs16x16_altivec;
1598+ c->pix_abs8x8 = pix_abs8x8_altivec;
1599+ c->sad[0]= sad16x16_altivec;
1600+ c->sad[1]= sad8x8_altivec;
1601+ c->pix_norm1 = pix_norm1_altivec;
1602+ c->sse[1]= sse8_altivec;
1603+ c->sse[0]= sse16_altivec;
1604+ c->pix_sum = pix_sum_altivec;
1605+ c->diff_pixels = diff_pixels_altivec;
1606+ c->get_pixels = get_pixels_altivec;
1607+// next one disabled as it's untested.
1608+#if 0
1609+ c->add_bytes= add_bytes_altivec;
1610+#endif /* 0 */
1611+ c->put_pixels_tab[0][0] = put_pixels16_altivec;
1612+        /* the two functions do the same thing, so use the same code */
1613+ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
1614+ c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
1615+// next one disabled as it's untested.
1616+#if 0
1617+ c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
1618+#endif /* 0 */
1619+ c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
1620+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
1621+ c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
1622+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
1623+
1624+ c->gmc1 = gmc1_altivec;
1625+
1626+ if ((avctx->idct_algo == FF_IDCT_AUTO) ||
1627+ (avctx->idct_algo == FF_IDCT_ALTIVEC))
1628+ {
1629+ c->idct_put = idct_put_altivec;
1630+ c->idct_add = idct_add_altivec;
1631+#ifndef ALTIVEC_USE_REFERENCE_C_CODE
1632+ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
1633+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1634+ c->idct_permutation_type = FF_NO_IDCT_PERM;
1635+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1636+ }
1637+
1638+#ifdef POWERPC_PERFORMANCE_REPORT
1639+ {
1640+ int i, j;
1641+ for (i = 0 ; i < powerpc_perf_total ; i++)
1642+ {
1643+ for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
1644+ {
1645+ perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF;
1646+ perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000;
1647+ perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000;
1648+ perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000;
1649+ }
1650+ }
1651+ }
1652+#endif /* POWERPC_PERFORMANCE_REPORT */
1653+ } else
1654+#endif /* HAVE_ALTIVEC */
1655+ {
1656+ // Non-AltiVec PPC optimisations
1657+
1658+ // ... pending ...
1659+ }
1660+}
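
For reference, the runtime dispatch in dsputil_init_ppc() above can be pictured with a small, architecture-neutral sketch (plain C, no AltiVec or dcbz needed; the detection is stubbed out, so the *_sketch names are illustrative only, not part of the patch):

/* how a detected cache-line size selects a clear_blocks implementation,
   in the spirit of check_dcbzl_effect() + dsputil_init_ppc() */
#include <stdio.h>
#include <string.h>

typedef short DCTELEM;

static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM) * 6 * 64);
}

/* stand-ins for the dcbz32/dcbz128 versions; a real build would zero
   the buffer one cache line at a time with dcbz/dcbzl instead */
static void clear_blocks_dcbz32_sketch(DCTELEM *blocks)  { clear_blocks_c(blocks); }
static void clear_blocks_dcbz128_sketch(DCTELEM *blocks) { clear_blocks_c(blocks); }

/* pretend detection: would return 0, 32 or 128 like check_dcbzl_effect() */
static long check_dcbzl_effect_sketch(void) { return 32; }

int main(void)
{
    void (*clear_blocks)(DCTELEM *) = clear_blocks_c;
    DCTELEM blocks[6 * 64];

    switch (check_dcbzl_effect_sketch()) {
    case 32:  clear_blocks = clear_blocks_dcbz32_sketch;  break;
    case 128: clear_blocks = clear_blocks_dcbz128_sketch; break;
    default:  break; /* unknown line size: keep the portable version */
    }

    clear_blocks(blocks);
    printf("blocks[0] = %d\n", blocks[0]);
    return 0;
}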
1661diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c
1662--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c 1970-01-01 01:00:00.000000000 +0100
1663+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c 2003-09-28 17:26:40.000000000 +0200
1664@@ -0,0 +1,247 @@
1665+/*
1666+ * FFT/IFFT transforms
1667+ * AltiVec-enabled
1668+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1669+ * Based on code Copyright (c) 2002 Fabrice Bellard.
1670+ *
1671+ * This library is free software; you can redistribute it and/or
1672+ * modify it under the terms of the GNU Lesser General Public
1673+ * License as published by the Free Software Foundation; either
1674+ * version 2 of the License, or (at your option) any later version.
1675+ *
1676+ * This library is distributed in the hope that it will be useful,
1677+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1678+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1679+ * Lesser General Public License for more details.
1680+ *
1681+ * You should have received a copy of the GNU Lesser General Public
1682+ * License along with this library; if not, write to the Free Software
1683+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1684+ */
1685+#include "../dsputil.h"
1686+
1687+#include "gcc_fixes.h"
1688+
1689+#include "dsputil_altivec.h"
1690+
1691+/*
1692+ those three macros are from libavcodec/fft.c
1693+ and are required for the reference C code
1694+*/
1695+/* butterfly op */
1696+#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
1697+{\
1698+ FFTSample ax, ay, bx, by;\
1699+ bx=pre1;\
1700+ by=pim1;\
1701+ ax=qre1;\
1702+ ay=qim1;\
1703+ pre = (bx + ax);\
1704+ pim = (by + ay);\
1705+ qre = (bx - ax);\
1706+ qim = (by - ay);\
1707+}
1708+#define MUL16(a,b) ((a) * (b))
1709+#define CMUL(pre, pim, are, aim, bre, bim) \
1710+{\
1711+ pre = (MUL16(are, bre) - MUL16(aim, bim));\
1712+ pim = (MUL16(are, bim) + MUL16(bre, aim));\
1713+}
1714+
1715+
1716+/**
1717+ * Do a complex FFT with the parameters defined in fft_init(). The
1718+ * input data must be permuted before with s->revtab table. No
1719+ * 1.0/sqrt(n) normalization is done.
1720+ * AltiVec-enabled
1721+ * This code assumes that the 'z' pointer is 16-byte aligned.
1722+ * It also assumes each FFTComplex is an 8-byte aligned pair of floats.
1723+ * The code is exactly the same as the SSE version, except
1724+ * that successive MUL + ADD/SUB have been merged into
1725+ * fused multiply-add ('vec_madd' in altivec)
1726+ */
1727+void fft_calc_altivec(FFTContext *s, FFTComplex *z)
1728+{
1729+POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
1730+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1731+ int ln = s->nbits;
1732+ int j, np, np2;
1733+ int nblocks, nloops;
1734+ register FFTComplex *p, *q;
1735+ FFTComplex *exptab = s->exptab;
1736+ int l;
1737+ FFTSample tmp_re, tmp_im;
1738+
1739+POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1740+
1741+ np = 1 << ln;
1742+
1743+ /* pass 0 */
1744+
1745+ p=&z[0];
1746+ j=(np >> 1);
1747+ do {
1748+ BF(p[0].re, p[0].im, p[1].re, p[1].im,
1749+ p[0].re, p[0].im, p[1].re, p[1].im);
1750+ p+=2;
1751+ } while (--j != 0);
1752+
1753+ /* pass 1 */
1754+
1755+
1756+ p=&z[0];
1757+ j=np >> 2;
1758+ if (s->inverse) {
1759+ do {
1760+ BF(p[0].re, p[0].im, p[2].re, p[2].im,
1761+ p[0].re, p[0].im, p[2].re, p[2].im);
1762+ BF(p[1].re, p[1].im, p[3].re, p[3].im,
1763+ p[1].re, p[1].im, -p[3].im, p[3].re);
1764+ p+=4;
1765+ } while (--j != 0);
1766+ } else {
1767+ do {
1768+ BF(p[0].re, p[0].im, p[2].re, p[2].im,
1769+ p[0].re, p[0].im, p[2].re, p[2].im);
1770+ BF(p[1].re, p[1].im, p[3].re, p[3].im,
1771+ p[1].re, p[1].im, p[3].im, -p[3].re);
1772+ p+=4;
1773+ } while (--j != 0);
1774+ }
1775+ /* pass 2 .. ln-1 */
1776+
1777+ nblocks = np >> 3;
1778+ nloops = 1 << 2;
1779+ np2 = np >> 1;
1780+ do {
1781+ p = z;
1782+ q = z + nloops;
1783+ for (j = 0; j < nblocks; ++j) {
1784+ BF(p->re, p->im, q->re, q->im,
1785+ p->re, p->im, q->re, q->im);
1786+
1787+ p++;
1788+ q++;
1789+ for(l = nblocks; l < np2; l += nblocks) {
1790+ CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
1791+ BF(p->re, p->im, q->re, q->im,
1792+ p->re, p->im, tmp_re, tmp_im);
1793+ p++;
1794+ q++;
1795+ }
1796+
1797+ p += nloops;
1798+ q += nloops;
1799+ }
1800+ nblocks = nblocks >> 1;
1801+ nloops = nloops << 1;
1802+ } while (nblocks != 0);
1803+
1804+POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1805+
1806+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1807+#ifdef CONFIG_DARWIN
1808+ register const vector float vczero = (const vector float)(0.);
1809+#else
1810+ register const vector float vczero = (const vector float){0.,0.,0.,0.};
1811+#endif
1812+
1813+ int ln = s->nbits;
1814+ int j, np, np2;
1815+ int nblocks, nloops;
1816+ register FFTComplex *p, *q;
1817+ FFTComplex *cptr, *cptr1;
1818+ int k;
1819+
1820+POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1821+
1822+ np = 1 << ln;
1823+
1824+ {
1825+ vector float *r, a, b, a1, c1, c2;
1826+
1827+ r = (vector float *)&z[0];
1828+
1829+ c1 = vcii(p,p,n,n);
1830+
1831+ if (s->inverse)
1832+ {
1833+ c2 = vcii(p,p,n,p);
1834+ }
1835+ else
1836+ {
1837+ c2 = vcii(p,p,p,n);
1838+ }
1839+
1840+ j = (np >> 2);
1841+ do {
1842+ a = vec_ld(0, r);
1843+ a1 = vec_ld(sizeof(vector float), r);
1844+
1845+ b = vec_perm(a,a,vcprmle(1,0,3,2));
1846+ a = vec_madd(a,c1,b);
1847+ /* do the pass 0 butterfly */
1848+
1849+ b = vec_perm(a1,a1,vcprmle(1,0,3,2));
1850+ b = vec_madd(a1,c1,b);
1851+ /* do the pass 0 butterfly */
1852+
1853+ /* multiply third by -i */
1854+ b = vec_perm(b,b,vcprmle(2,3,1,0));
1855+
1856+ /* do the pass 1 butterfly */
1857+ vec_st(vec_madd(b,c2,a), 0, r);
1858+ vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
1859+
1860+ r += 2;
1861+ } while (--j != 0);
1862+ }
1863+ /* pass 2 .. ln-1 */
1864+
1865+ nblocks = np >> 3;
1866+ nloops = 1 << 2;
1867+ np2 = np >> 1;
1868+
1869+ cptr1 = s->exptab1;
1870+ do {
1871+ p = z;
1872+ q = z + nloops;
1873+ j = nblocks;
1874+ do {
1875+ cptr = cptr1;
1876+ k = nloops >> 1;
1877+ do {
1878+ vector float a,b,c,t1;
1879+
1880+ a = vec_ld(0, (float*)p);
1881+ b = vec_ld(0, (float*)q);
1882+
1883+ /* complex mul */
1884+ c = vec_ld(0, (float*)cptr);
1885+ /* cre*re cim*re */
1886+ t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
1887+ c = vec_ld(sizeof(vector float), (float*)cptr);
1888+ /* -cim*im cre*im */
1889+ b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
1890+
1891+ /* butterfly */
1892+ vec_st(vec_add(a,b), 0, (float*)p);
1893+ vec_st(vec_sub(a,b), 0, (float*)q);
1894+
1895+ p += 2;
1896+ q += 2;
1897+ cptr += 4;
1898+ } while (--k);
1899+
1900+ p += nloops;
1901+ q += nloops;
1902+ } while (--j);
1903+ cptr1 += nloops * 2;
1904+ nblocks = nblocks >> 1;
1905+ nloops = nloops << 1;
1906+ } while (nblocks != 0);
1907+
1908+POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1909+
1910+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1911+}
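
As a reading aid for the reference C path above, here is a tiny stand-alone program (not part of the patch) that exercises the BF and CMUL macros on concrete values, showing the add/sub butterfly and the twiddle-factor complex multiply they implement:

#include <stdio.h>

typedef float FFTSample;

#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
{\
    FFTSample ax, ay, bx, by;\
    bx = pre1; by = pim1; ax = qre1; ay = qim1;\
    pre = (bx + ax); pim = (by + ay);\
    qre = (bx - ax); qim = (by - ay);\
}
#define MUL16(a,b) ((a) * (b))
#define CMUL(pre, pim, are, aim, bre, bim) \
{\
    pre = (MUL16(are, bre) - MUL16(aim, bim));\
    pim = (MUL16(are, bim) + MUL16(bre, aim));\
}

int main(void)
{
    FFTSample pre, pim, qre, qim, tre, tim;

    /* butterfly of p = 1+2i and q = 3+4i: p' = p+q, q' = p-q */
    BF(pre, pim, qre, qim, 1.0f, 2.0f, 3.0f, 4.0f);
    printf("p' = %g%+gi, q' = %g%+gi\n", pre, pim, qre, qim);

    /* complex multiply (1+2i)*(3+4i) = -5+10i, the twiddle step */
    CMUL(tre, tim, 1.0f, 2.0f, 3.0f, 4.0f);
    printf("t  = %g%+gi\n", tre, tim);
    return 0;
}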
1912diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h
1913--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h 2003-07-04 15:40:29.000000000 +0200
1914+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h 2003-09-28 17:26:40.000000000 +0200
1915@@ -25,7 +25,7 @@
1916 * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html
1917 */
1918
1919-static inline vector signed char my_vmrglb (vector signed char const A,
1920+static inline vector signed char ff_vmrglb (vector signed char const A,
1921 vector signed char const B)
1922 {
1923 static const vector unsigned char lowbyte = {
1924@@ -35,7 +35,7 @@
1925 return vec_perm (A, B, lowbyte);
1926 }
1927
1928-static inline vector signed short my_vmrglh (vector signed short const A,
1929+static inline vector signed short ff_vmrglh (vector signed short const A,
1930 vector signed short const B)
1931 {
1932 static const vector unsigned char lowhalf = {
1933@@ -45,7 +45,7 @@
1934 return vec_perm (A, B, lowhalf);
1935 }
1936
1937-static inline vector signed int my_vmrglw (vector signed int const A,
1938+static inline vector signed int ff_vmrglw (vector signed int const A,
1939 vector signed int const B)
1940 {
1941 static const vector unsigned char lowword = {
1942@@ -54,27 +54,27 @@
1943 };
1944 return vec_perm (A, B, lowword);
1945 }
1946-/*#define my_vmrglb my_vmrglb
1947-#define my_vmrglh my_vmrglh
1948-#define my_vmrglw my_vmrglw
1949+/*#define ff_vmrglb ff_vmrglb
1950+#define ff_vmrglh ff_vmrglh
1951+#define ff_vmrglw ff_vmrglw
1952 */
1953 #undef vec_mergel
1954
1955 #define vec_mergel(a1, a2) \
1956 __ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \
1957- ((vector signed char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1958+ ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1959 __ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \
1960- ((vector unsigned char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1961+ ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1962 __ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \
1963- ((vector signed short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1964+ ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1965 __ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \
1966- ((vector unsigned short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1967+ ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1968 __ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \
1969- ((vector float) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1970+ ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1971 __ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \
1972- ((vector signed int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1973+ ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1974 __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
1975- ((vector unsigned int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1976+ ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1977 __altivec_link_error_invalid_argument ())))))))
1978
1979 #endif
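
The rename above does not change behaviour: vec_mergel still interleaves the low halves of its two operands. A scalar picture of that operation (illustrative only, assuming 16-element char vectors):

#include <stdio.h>

int main(void)
{
    unsigned char a[16], b[16], out[16];
    int i;

    for (i = 0; i < 16; i++) { a[i] = i; b[i] = 100 + i; }

    /* out = { a[8], b[8], a[9], b[9], ..., a[15], b[15] } */
    for (i = 0; i < 8; i++) {
        out[2 * i]     = a[8 + i];
        out[2 * i + 1] = b[8 + i];
    }

    for (i = 0; i < 16; i++)
        printf("%d ", out[i]);
    printf("\n");
    return 0;
}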
1980diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c
1981--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c 1970-01-01 01:00:00.000000000 +0100
1982+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c 2003-09-28 17:26:40.000000000 +0200
1983@@ -0,0 +1,172 @@
1984+/*
1985+ * GMC (Global Motion Compensation)
1986+ * AltiVec-enabled
1987+ * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1988+ *
1989+ * This library is free software; you can redistribute it and/or
1990+ * modify it under the terms of the GNU Lesser General Public
1991+ * License as published by the Free Software Foundation; either
1992+ * version 2 of the License, or (at your option) any later version.
1993+ *
1994+ * This library is distributed in the hope that it will be useful,
1995+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1996+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1997+ * Lesser General Public License for more details.
1998+ *
1999+ * You should have received a copy of the GNU Lesser General Public
2000+ * License along with this library; if not, write to the Free Software
2001+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2002+ */
2003+
2004+#include "../dsputil.h"
2005+
2006+#include "gcc_fixes.h"
2007+
2008+#include "dsputil_altivec.h"
2009+
2010+/*
2011+ altivec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
2012+ to preserve proper dst alignment.
2013+*/
2014+#define GMC1_PERF_COND (h==8)
2015+void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
2016+{
2017+POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
2018+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2019+ const int A=(16-x16)*(16-y16);
2020+ const int B=( x16)*(16-y16);
2021+ const int C=(16-x16)*( y16);
2022+ const int D=( x16)*( y16);
2023+ int i;
2024+
2025+POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2026+
2027+ for(i=0; i<h; i++)
2028+ {
2029+ dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
2030+ dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
2031+ dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
2032+ dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
2033+ dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
2034+ dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
2035+ dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
2036+ dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
2037+ dst+= stride;
2038+ src+= stride;
2039+ }
2040+
2041+POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2042+
2043+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2044+ const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
2045+ {rounder, rounder, rounder, rounder,
2046+ rounder, rounder, rounder, rounder};
2047+ const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
2048+ {
2049+ (16-x16)*(16-y16), /* A */
2050+ ( x16)*(16-y16), /* B */
2051+ (16-x16)*( y16), /* C */
2052+ ( x16)*( y16), /* D */
2053+ 0, 0, 0, 0 /* padding */
2054+ };
2055+ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
2056+ register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
2057+ register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
2058+ register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
2059+ int i;
2060+ unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
2061+ unsigned long src_really_odd = (unsigned long)src & 0x0000000F;
2062+
2063+
2064+POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2065+
2066+ tempA = vec_ld(0, (unsigned short*)ABCD);
2067+ Av = vec_splat(tempA, 0);
2068+ Bv = vec_splat(tempA, 1);
2069+ Cv = vec_splat(tempA, 2);
2070+ Dv = vec_splat(tempA, 3);
2071+
2072+ rounderV = vec_ld(0, (unsigned short*)rounder_a);
2073+
2074+ // we'll be able to pick up our 9 char elements
2075+ // at src from those 32 bytes
2076+ // we load the first batch here, as inside the loop
2077+ // we can re-use 'src+stride' from one iteration
2078+ // as the 'src' of the next.
2079+ src_0 = vec_ld(0, src);
2080+ src_1 = vec_ld(16, src);
2081+ srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
2082+
2083+ if (src_really_odd != 0x0000000F)
2084+ { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
2085+ srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
2086+ }
2087+ else
2088+ {
2089+ srcvB = src_1;
2090+ }
2091+ srcvA = vec_mergeh(vczero, srcvA);
2092+ srcvB = vec_mergeh(vczero, srcvB);
2093+
2094+ for(i=0; i<h; i++)
2095+ {
2096+ dst_odd = (unsigned long)dst & 0x0000000F;
2097+ src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
2098+
2099+ dstv = vec_ld(0, dst);
2100+
2101+ // we'll be able to pick up our 9 char elements
2102+ // at src + stride from those 32 bytes
2103+ // then reuse the resulting 2 vectors srcvC and srcvD
2104+ // as the next srcvA and srcvB
2105+ src_0 = vec_ld(stride + 0, src);
2106+ src_1 = vec_ld(stride + 16, src);
2107+ srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
2108+
2109+ if (src_really_odd != 0x0000000F)
2110+ { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector.
2111+ srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
2112+ }
2113+ else
2114+ {
2115+ srcvD = src_1;
2116+ }
2117+
2118+ srcvC = vec_mergeh(vczero, srcvC);
2119+ srcvD = vec_mergeh(vczero, srcvD);
2120+
2121+
2122+ // OK, now we (finally) do the math :-)
2123+ // those four instructions replace 32 int muls & 32 int adds.
2124+ // isn't AltiVec nice?
2125+ tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
2126+ tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
2127+ tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
2128+ tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
2129+
2130+ srcvA = srcvC;
2131+ srcvB = srcvD;
2132+
2133+ tempD = vec_sr(tempD, vcsr8);
2134+
2135+ dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
2136+
2137+ if (dst_odd)
2138+ {
2139+ dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
2140+ }
2141+ else
2142+ {
2143+ dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
2144+ }
2145+
2146+ vec_st(dstv2, 0, dst);
2147+
2148+ dst += stride;
2149+ src += stride;
2150+ }
2151+
2152+POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2153+
2154+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2155+}
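
A quick scalar check (not part of the patch) of why gmc1 can shift right by 8 after adding rounder: the four bilinear weights always sum to 16*16 = 256, so the weighted sum is already scaled by 256:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    int x16, y16;
    for (x16 = 0; x16 < 16; x16++) {
        for (y16 = 0; y16 < 16; y16++) {
            int A = (16 - x16) * (16 - y16);
            int B = x16 * (16 - y16);
            int C = (16 - x16) * y16;
            int D = x16 * y16;
            assert(A + B + C + D == 256);
        }
    }
    printf("A+B+C+D == 256 for all x16, y16 in [0,16)\n");
    return 0;
}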
2156diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c
2157--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c 1970-01-01 01:00:00.000000000 +0100
2158+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c 2003-09-28 17:26:40.000000000 +0200
2159@@ -0,0 +1,245 @@
2160+/*
2161+ * Copyright (c) 2001 Michel Lespinasse
2162+ *
2163+ * This library is free software; you can redistribute it and/or
2164+ * modify it under the terms of the GNU Lesser General Public
2165+ * License as published by the Free Software Foundation; either
2166+ * version 2 of the License, or (at your option) any later version.
2167+ *
2168+ * This library is distributed in the hope that it will be useful,
2169+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2170+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2171+ * Lesser General Public License for more details.
2172+ *
2173+ * You should have received a copy of the GNU Lesser General Public
2174+ * License along with this library; if not, write to the Free Software
2175+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2176+ *
2177+ */
2178+
2179+/*
2180+ * NOTE: This code is based on GPL code from the libmpeg2 project. The
2181+ * author, Michel Lespinasse, has given explicit permission to release
2182+ * under LGPL as part of ffmpeg.
2183+ *
2184+ */
2185+
2186+/*
2187+ * FFMpeg integration by Dieter Shirley
2188+ *
2189+ * This file is a direct copy of the altivec idct module from the libmpeg2
2190+ * project. I've deleted all of the libmpeg2 specific code, renamed the functions and
2191+ * re-ordered the function parameters. The only change to the IDCT function
2192+ * itself was to factor out the partial transposition, and to perform a full
2193+ * transpose at the end of the function.
2194+ */
2195+
2196+
2197+#include <stdlib.h> /* malloc(), free() */
2198+#include <string.h>
2199+#include "../dsputil.h"
2200+
2201+#include "gcc_fixes.h"
2202+
2203+#include "dsputil_altivec.h"
2204+
2205+#define vector_s16_t vector signed short
2206+#define vector_u16_t vector unsigned short
2207+#define vector_s8_t vector signed char
2208+#define vector_u8_t vector unsigned char
2209+#define vector_s32_t vector signed int
2210+#define vector_u32_t vector unsigned int
2211+
2212+#define IDCT_HALF \
2213+ /* 1st stage */ \
2214+ t1 = vec_mradds (a1, vx7, vx1 ); \
2215+ t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
2216+ t7 = vec_mradds (a2, vx5, vx3); \
2217+ t3 = vec_mradds (ma2, vx3, vx5); \
2218+ \
2219+ /* 2nd stage */ \
2220+ t5 = vec_adds (vx0, vx4); \
2221+ t0 = vec_subs (vx0, vx4); \
2222+ t2 = vec_mradds (a0, vx6, vx2); \
2223+ t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
2224+ t6 = vec_adds (t8, t3); \
2225+ t3 = vec_subs (t8, t3); \
2226+ t8 = vec_subs (t1, t7); \
2227+ t1 = vec_adds (t1, t7); \
2228+ \
2229+ /* 3rd stage */ \
2230+ t7 = vec_adds (t5, t2); \
2231+ t2 = vec_subs (t5, t2); \
2232+ t5 = vec_adds (t0, t4); \
2233+ t0 = vec_subs (t0, t4); \
2234+ t4 = vec_subs (t8, t3); \
2235+ t3 = vec_adds (t8, t3); \
2236+ \
2237+ /* 4th stage */ \
2238+ vy0 = vec_adds (t7, t1); \
2239+ vy7 = vec_subs (t7, t1); \
2240+ vy1 = vec_mradds (c4, t3, t5); \
2241+ vy6 = vec_mradds (mc4, t3, t5); \
2242+ vy2 = vec_mradds (c4, t4, t0); \
2243+ vy5 = vec_mradds (mc4, t4, t0); \
2244+ vy3 = vec_adds (t2, t6); \
2245+ vy4 = vec_subs (t2, t6);
2246+
2247+
2248+#define IDCT \
2249+ vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
2250+ vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
2251+ vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \
2252+ vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \
2253+ vector_u16_t shift; \
2254+ \
2255+ c4 = vec_splat (constants[0], 0); \
2256+ a0 = vec_splat (constants[0], 1); \
2257+ a1 = vec_splat (constants[0], 2); \
2258+ a2 = vec_splat (constants[0], 3); \
2259+ mc4 = vec_splat (constants[0], 4); \
2260+ ma2 = vec_splat (constants[0], 5); \
2261+ bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \
2262+ \
2263+ zero = vec_splat_s16 (0); \
2264+ shift = vec_splat_u16 (4); \
2265+ \
2266+ vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
2267+ vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
2268+ vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
2269+ vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
2270+ vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
2271+ vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
2272+ vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
2273+ vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
2274+ \
2275+ IDCT_HALF \
2276+ \
2277+ vx0 = vec_mergeh (vy0, vy4); \
2278+ vx1 = vec_mergel (vy0, vy4); \
2279+ vx2 = vec_mergeh (vy1, vy5); \
2280+ vx3 = vec_mergel (vy1, vy5); \
2281+ vx4 = vec_mergeh (vy2, vy6); \
2282+ vx5 = vec_mergel (vy2, vy6); \
2283+ vx6 = vec_mergeh (vy3, vy7); \
2284+ vx7 = vec_mergel (vy3, vy7); \
2285+ \
2286+ vy0 = vec_mergeh (vx0, vx4); \
2287+ vy1 = vec_mergel (vx0, vx4); \
2288+ vy2 = vec_mergeh (vx1, vx5); \
2289+ vy3 = vec_mergel (vx1, vx5); \
2290+ vy4 = vec_mergeh (vx2, vx6); \
2291+ vy5 = vec_mergel (vx2, vx6); \
2292+ vy6 = vec_mergeh (vx3, vx7); \
2293+ vy7 = vec_mergel (vx3, vx7); \
2294+ \
2295+ vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
2296+ vx1 = vec_mergel (vy0, vy4); \
2297+ vx2 = vec_mergeh (vy1, vy5); \
2298+ vx3 = vec_mergel (vy1, vy5); \
2299+ vx4 = vec_mergeh (vy2, vy6); \
2300+ vx5 = vec_mergel (vy2, vy6); \
2301+ vx6 = vec_mergeh (vy3, vy7); \
2302+ vx7 = vec_mergel (vy3, vy7); \
2303+ \
2304+ IDCT_HALF \
2305+ \
2306+ shift = vec_splat_u16 (6); \
2307+ vx0 = vec_sra (vy0, shift); \
2308+ vx1 = vec_sra (vy1, shift); \
2309+ vx2 = vec_sra (vy2, shift); \
2310+ vx3 = vec_sra (vy3, shift); \
2311+ vx4 = vec_sra (vy4, shift); \
2312+ vx5 = vec_sra (vy5, shift); \
2313+ vx6 = vec_sra (vy6, shift); \
2314+ vx7 = vec_sra (vy7, shift);
2315+
2316+
2317+static const vector_s16_t constants[5] = {
2318+ (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31),
2319+ (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725),
2320+ (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521),
2321+ (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692),
2322+ (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722)
2323+};
2324+
2325+void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2326+{
2327+POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
2328+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2329+POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2330+ void simple_idct_put(uint8_t *dest, int line_size, int16_t *block);
2331+ simple_idct_put(dest, stride, (int16_t*)block);
2332+POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2333+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2334+ vector_u8_t tmp;
2335+
2336+POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2337+
2338+ IDCT
2339+
2340+#define COPY(dest,src) \
2341+ tmp = vec_packsu (src, src); \
2342+ vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
2343+ vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2344+
2345+ COPY (dest, vx0) dest += stride;
2346+ COPY (dest, vx1) dest += stride;
2347+ COPY (dest, vx2) dest += stride;
2348+ COPY (dest, vx3) dest += stride;
2349+ COPY (dest, vx4) dest += stride;
2350+ COPY (dest, vx5) dest += stride;
2351+ COPY (dest, vx6) dest += stride;
2352+ COPY (dest, vx7)
2353+
2354+POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2355+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2356+}
2357+
2358+void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2359+{
2360+POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
2361+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2362+POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2363+ void simple_idct_add(uint8_t *dest, int line_size, int16_t *block);
2364+ simple_idct_add(dest, stride, (int16_t*)block);
2365+POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2366+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2367+ vector_u8_t tmp;
2368+ vector_s16_t tmp2, tmp3;
2369+ vector_u8_t perm0;
2370+ vector_u8_t perm1;
2371+ vector_u8_t p0, p1, p;
2372+
2373+POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2374+
2375+ IDCT
2376+
2377+ p0 = vec_lvsl (0, dest);
2378+ p1 = vec_lvsl (stride, dest);
2379+ p = vec_splat_u8 (-1);
2380+ perm0 = vec_mergeh (p, p0);
2381+ perm1 = vec_mergeh (p, p1);
2382+
2383+#define ADD(dest,src,perm) \
2384+ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
2385+ tmp = vec_ld (0, dest); \
2386+ tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \
2387+ tmp3 = vec_adds (tmp2, src); \
2388+ tmp = vec_packsu (tmp3, tmp3); \
2389+ vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \
2390+ vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2391+
2392+ ADD (dest, vx0, perm0) dest += stride;
2393+ ADD (dest, vx1, perm1) dest += stride;
2394+ ADD (dest, vx2, perm0) dest += stride;
2395+ ADD (dest, vx3, perm1) dest += stride;
2396+ ADD (dest, vx4, perm0) dest += stride;
2397+ ADD (dest, vx5, perm1) dest += stride;
2398+ ADD (dest, vx6, perm0) dest += stride;
2399+ ADD (dest, vx7, perm1)
2400+
2401+POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2402+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2403+}
2404+
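
The COPY and ADD macros above lean on vec_packsu to clamp the signed 16-bit IDCT output into the 0..255 pixel range; for reference, a scalar equivalent of that saturating pack (sketch, not part of the patch):

#include <stdio.h>
#include <stdint.h>

static uint8_t sat_u8(int16_t v)
{
    if (v < 0)   return 0;
    if (v > 255) return 255;
    return (uint8_t)v;
}

int main(void)
{
    int16_t samples[4] = { -7, 42, 200, 300 };
    int i;
    for (i = 0; i < 4; i++)
        printf("%d -> %u\n", samples[i], sat_u8(samples[i]));
    /* prints 0, 42, 200, 255 */
    return 0;
}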
2405diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
2406--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c 1970-01-01 01:00:00.000000000 +0100
2407+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c 2003-09-28 17:26:40.000000000 +0200
2408@@ -0,0 +1,645 @@
2409+/*
2410+ * Copyright (c) 2002 Dieter Shirley
2411+ *
2412+ * This library is free software; you can redistribute it and/or
2413+ * modify it under the terms of the GNU Lesser General Public
2414+ * License as published by the Free Software Foundation; either
2415+ * version 2 of the License, or (at your option) any later version.
2416+ *
2417+ * This library is distributed in the hope that it will be useful,
2418+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
2419+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
2420+ * Lesser General Public License for more details.
2421+ *
2422+ * You should have received a copy of the GNU Lesser General Public
2423+ * License along with this library; if not, write to the Free Software
2424+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2425+ */
2426+
2427+#include <stdlib.h>
2428+#include <stdio.h>
2429+#include "../dsputil.h"
2430+#include "../mpegvideo.h"
2431+
2432+#include "gcc_fixes.h"
2433+
2434+#include "dsputil_altivec.h"
2435+
2436+// Swaps two variables (used for altivec registers)
2437+#define SWAP(a,b) \
2438+do { \
2439+ __typeof__(a) swap_temp=a; \
2440+ a=b; \
2441+ b=swap_temp; \
2442+} while (0)
2443+
2444+// transposes a matrix consisting of four vectors with four elements each
2445+#define TRANSPOSE4(a,b,c,d) \
2446+do { \
2447+ __typeof__(a) _trans_ach = vec_mergeh(a, c); \
2448+ __typeof__(a) _trans_acl = vec_mergel(a, c); \
2449+ __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
2450+ __typeof__(a) _trans_bdl = vec_mergel(b, d); \
2451+ \
2452+ a = vec_mergeh(_trans_ach, _trans_bdh); \
2453+ b = vec_mergel(_trans_ach, _trans_bdh); \
2454+ c = vec_mergeh(_trans_acl, _trans_bdl); \
2455+ d = vec_mergel(_trans_acl, _trans_bdl); \
2456+} while (0)
2457+
2458+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
2459+do { \
2460+ __typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
2461+ __typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
2462+ \
2463+ _A1 = vec_mergeh (a, e); \
2464+ _B1 = vec_mergel (a, e); \
2465+ _C1 = vec_mergeh (b, f); \
2466+ _D1 = vec_mergel (b, f); \
2467+ _E1 = vec_mergeh (c, g); \
2468+ _F1 = vec_mergel (c, g); \
2469+ _G1 = vec_mergeh (d, h); \
2470+ _H1 = vec_mergel (d, h); \
2471+ \
2472+ _A2 = vec_mergeh (_A1, _E1); \
2473+ _B2 = vec_mergel (_A1, _E1); \
2474+ _C2 = vec_mergeh (_B1, _F1); \
2475+ _D2 = vec_mergel (_B1, _F1); \
2476+ _E2 = vec_mergeh (_C1, _G1); \
2477+ _F2 = vec_mergel (_C1, _G1); \
2478+ _G2 = vec_mergeh (_D1, _H1); \
2479+ _H2 = vec_mergel (_D1, _H1); \
2480+ \
2481+ a = vec_mergeh (_A2, _E2); \
2482+ b = vec_mergel (_A2, _E2); \
2483+ c = vec_mergeh (_B2, _F2); \
2484+ d = vec_mergel (_B2, _F2); \
2485+ e = vec_mergeh (_C2, _G2); \
2486+ f = vec_mergel (_C2, _G2); \
2487+ g = vec_mergeh (_D2, _H2); \
2488+ h = vec_mergel (_D2, _H2); \
2489+} while (0)
2490+
2491+
2492+// Loads a four-byte value (int or float) from the target address
2493+// into every element in the target vector. Only works if the
2494+// target address is four-byte aligned (which should always be the case).
2495+#define LOAD4(vec, address) \
2496+{ \
2497+ __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
2498+ vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
2499+ vec = vec_ld(0, _load_addr); \
2500+ vec = vec_perm(vec, vec, _perm_vec); \
2501+ vec = vec_splat(vec, 0); \
2502+}
2503+
2504+
2505+#ifdef CONFIG_DARWIN
2506+#define FOUROF(a) (a)
2507+#else
2508+// slower, for dumb non-apple GCC
2509+#define FOUROF(a) {a,a,a,a}
2510+#endif
2511+int dct_quantize_altivec(MpegEncContext* s,
2512+ DCTELEM* data, int n,
2513+ int qscale, int* overflow)
2514+{
2515+ int lastNonZero;
2516+ vector float row0, row1, row2, row3, row4, row5, row6, row7;
2517+ vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
2518+ const vector float zero = (const vector float)FOUROF(0.);
2519+
2520+ // Load the data into the row/alt vectors
2521+ {
2522+ vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2523+
2524+ data0 = vec_ld(0, data);
2525+ data1 = vec_ld(16, data);
2526+ data2 = vec_ld(32, data);
2527+ data3 = vec_ld(48, data);
2528+ data4 = vec_ld(64, data);
2529+ data5 = vec_ld(80, data);
2530+ data6 = vec_ld(96, data);
2531+ data7 = vec_ld(112, data);
2532+
2533+ // Transpose the data before we start
2534+ TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2535+
2536+ // load the data into floating point vectors. We load
2537+ // the high half of each row into the main row vectors
2538+ // and the low half into the alt vectors.
2539+ row0 = vec_ctf(vec_unpackh(data0), 0);
2540+ alt0 = vec_ctf(vec_unpackl(data0), 0);
2541+ row1 = vec_ctf(vec_unpackh(data1), 0);
2542+ alt1 = vec_ctf(vec_unpackl(data1), 0);
2543+ row2 = vec_ctf(vec_unpackh(data2), 0);
2544+ alt2 = vec_ctf(vec_unpackl(data2), 0);
2545+ row3 = vec_ctf(vec_unpackh(data3), 0);
2546+ alt3 = vec_ctf(vec_unpackl(data3), 0);
2547+ row4 = vec_ctf(vec_unpackh(data4), 0);
2548+ alt4 = vec_ctf(vec_unpackl(data4), 0);
2549+ row5 = vec_ctf(vec_unpackh(data5), 0);
2550+ alt5 = vec_ctf(vec_unpackl(data5), 0);
2551+ row6 = vec_ctf(vec_unpackh(data6), 0);
2552+ alt6 = vec_ctf(vec_unpackl(data6), 0);
2553+ row7 = vec_ctf(vec_unpackh(data7), 0);
2554+ alt7 = vec_ctf(vec_unpackl(data7), 0);
2555+ }
2556+
2557+ // The following block could exist as a separate altivec dct
2558+ // function. However, if we put it inline, the DCT data can remain
2559+ // in the vector local variables, as floats, which we'll use during the
2560+ // quantize step...
2561+ {
2562+ const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
2563+ const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
2564+ const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
2565+ const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
2566+ const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
2567+ const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
2568+ const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
2569+ const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
2570+ const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
2571+ const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
2572+ const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
2573+ const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);
2574+
2575+
2576+ int whichPass, whichHalf;
2577+
2578+ for(whichPass = 1; whichPass<=2; whichPass++)
2579+ {
2580+ for(whichHalf = 1; whichHalf<=2; whichHalf++)
2581+ {
2582+ vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2583+ vector float tmp10, tmp11, tmp12, tmp13;
2584+ vector float z1, z2, z3, z4, z5;
2585+
2586+ tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
2587+ tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
2588+ tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
2589+ tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
2590+ tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
2591+ tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
2592+ tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
2593+ tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
2594+
2595+ tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
2596+ tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
2597+ tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
2598+ tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
2599+
2600+
2601+ // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
2602+ row0 = vec_add(tmp10, tmp11);
2603+
2604+ // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
2605+ row4 = vec_sub(tmp10, tmp11);
2606+
2607+
2608+ // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2609+ z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
2610+
2611+ // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
2612+ // CONST_BITS-PASS1_BITS);
2613+ row2 = vec_madd(tmp13, vec_0_765366865, z1);
2614+
2615+ // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
2616+ // CONST_BITS-PASS1_BITS);
2617+ row6 = vec_madd(tmp12, vec_1_847759065, z1);
2618+
2619+ z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
2620+ z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
2621+ z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
2622+ z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
2623+
2624+ // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
2625+ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
2626+
2627+ // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2628+ z3 = vec_madd(z3, vec_1_961570560, z5);
2629+
2630+ // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2631+ z4 = vec_madd(z4, vec_0_390180644, z5);
2632+
2633+ // The following adds are rolled into the multiplies above
2634+ // z3 = vec_add(z3, z5); // z3 += z5;
2635+ // z4 = vec_add(z4, z5); // z4 += z5;
2636+
2637+ // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2638+ // Wow! It's actually more efficient to roll this multiply
2639+ // into the adds below, even though the multiply gets done twice!
2640+ // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
2641+
2642+ // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2643+ // Same with this one...
2644+ // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
2645+
2646+ // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2647+ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
2648+ row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
2649+
2650+ // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2651+ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
2652+ row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
2653+
2654+ // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2655+ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
2656+ row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
2657+
2658+ // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2659+ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
2660+ row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));
2661+
2662+ // Swap the row values with the alts. If this is the first half,
2663+ // this sets up the low values to be acted on in the second half.
2664+ // If this is the second half, it puts the high values back in
2665+ // the row values where they are expected to be when we're done.
2666+ SWAP(row0, alt0);
2667+ SWAP(row1, alt1);
2668+ SWAP(row2, alt2);
2669+ SWAP(row3, alt3);
2670+ SWAP(row4, alt4);
2671+ SWAP(row5, alt5);
2672+ SWAP(row6, alt6);
2673+ SWAP(row7, alt7);
2674+ }
2675+
2676+ if (whichPass == 1)
2677+ {
2678+ // transpose the data for the second pass
2679+
2680+ // First, block transpose the upper right with lower left.
2681+ SWAP(row4, alt0);
2682+ SWAP(row5, alt1);
2683+ SWAP(row6, alt2);
2684+ SWAP(row7, alt3);
2685+
2686+ // Now, transpose each block of four
2687+ TRANSPOSE4(row0, row1, row2, row3);
2688+ TRANSPOSE4(row4, row5, row6, row7);
2689+ TRANSPOSE4(alt0, alt1, alt2, alt3);
2690+ TRANSPOSE4(alt4, alt5, alt6, alt7);
2691+ }
2692+ }
2693+ }
2694+
2695+ // used after quantise step
2696+ int oldBaseValue = 0;
2697+
2698+ // perform the quantise step, using the floating point data
2699+ // still in the row/alt registers
2700+ {
2701+ const int* biasAddr;
2702+ const vector signed int* qmat;
2703+ vector float bias, negBias;
2704+
2705+ if (s->mb_intra)
2706+ {
2707+ vector signed int baseVector;
2708+
2709+ // We must cache element 0 in the intra case
2710+ // (it needs special handling).
2711+ baseVector = vec_cts(vec_splat(row0, 0), 0);
2712+ vec_ste(baseVector, 0, &oldBaseValue);
2713+
2714+ qmat = (vector signed int*)s->q_intra_matrix[qscale];
2715+ biasAddr = &(s->intra_quant_bias);
2716+ }
2717+ else
2718+ {
2719+ qmat = (vector signed int*)s->q_inter_matrix[qscale];
2720+ biasAddr = &(s->inter_quant_bias);
2721+ }
2722+
2723+ // Load the bias vector (We add 0.5 to the bias so that we're
2724+ // rounding when we convert to int, instead of flooring.)
2725+ {
2726+ vector signed int biasInt;
2727+ const vector float negOneFloat = (vector float)FOUROF(-1.0f);
2728+ LOAD4(biasInt, biasAddr);
2729+ bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
2730+ negBias = vec_madd(bias, negOneFloat, zero);
2731+ }
2732+
2733+ {
2734+ vector float q0, q1, q2, q3, q4, q5, q6, q7;
2735+
2736+ q0 = vec_ctf(qmat[0], QMAT_SHIFT);
2737+ q1 = vec_ctf(qmat[2], QMAT_SHIFT);
2738+ q2 = vec_ctf(qmat[4], QMAT_SHIFT);
2739+ q3 = vec_ctf(qmat[6], QMAT_SHIFT);
2740+ q4 = vec_ctf(qmat[8], QMAT_SHIFT);
2741+ q5 = vec_ctf(qmat[10], QMAT_SHIFT);
2742+ q6 = vec_ctf(qmat[12], QMAT_SHIFT);
2743+ q7 = vec_ctf(qmat[14], QMAT_SHIFT);
2744+
2745+ row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
2746+ vec_cmpgt(row0, zero));
2747+ row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
2748+ vec_cmpgt(row1, zero));
2749+ row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
2750+ vec_cmpgt(row2, zero));
2751+ row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
2752+ vec_cmpgt(row3, zero));
2753+ row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
2754+ vec_cmpgt(row4, zero));
2755+ row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
2756+ vec_cmpgt(row5, zero));
2757+ row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
2758+ vec_cmpgt(row6, zero));
2759+ row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
2760+ vec_cmpgt(row7, zero));
2761+
2762+ q0 = vec_ctf(qmat[1], QMAT_SHIFT);
2763+ q1 = vec_ctf(qmat[3], QMAT_SHIFT);
2764+ q2 = vec_ctf(qmat[5], QMAT_SHIFT);
2765+ q3 = vec_ctf(qmat[7], QMAT_SHIFT);
2766+ q4 = vec_ctf(qmat[9], QMAT_SHIFT);
2767+ q5 = vec_ctf(qmat[11], QMAT_SHIFT);
2768+ q6 = vec_ctf(qmat[13], QMAT_SHIFT);
2769+ q7 = vec_ctf(qmat[15], QMAT_SHIFT);
2770+
2771+ alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
2772+ vec_cmpgt(alt0, zero));
2773+ alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
2774+ vec_cmpgt(alt1, zero));
2775+ alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
2776+ vec_cmpgt(alt2, zero));
2777+ alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
2778+ vec_cmpgt(alt3, zero));
2779+ alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
2780+ vec_cmpgt(alt4, zero));
2781+ alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
2782+ vec_cmpgt(alt5, zero));
2783+ alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
2784+ vec_cmpgt(alt6, zero));
2785+ alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
2786+ vec_cmpgt(alt7, zero));
2787+ }
2788+
2789+
2790+ }
2791+
2792+ // Store the data back into the original block
2793+ {
2794+ vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2795+
2796+ data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
2797+ data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
2798+ data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
2799+ data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
2800+ data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
2801+ data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
2802+ data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
2803+ data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));
2804+
2805+ {
2806+ // Clamp for overflow
2807+ vector signed int max_q_int, min_q_int;
2808+ vector signed short max_q, min_q;
2809+
2810+ LOAD4(max_q_int, &(s->max_qcoeff));
2811+ LOAD4(min_q_int, &(s->min_qcoeff));
2812+
2813+ max_q = vec_pack(max_q_int, max_q_int);
2814+ min_q = vec_pack(min_q_int, min_q_int);
2815+
2816+ data0 = vec_max(vec_min(data0, max_q), min_q);
2817+ data1 = vec_max(vec_min(data1, max_q), min_q);
2818+ data2 = vec_max(vec_min(data2, max_q), min_q);
2818+ data3 = vec_max(vec_min(data3, max_q), min_q);
2819+ data4 = vec_max(vec_min(data4, max_q), min_q);
2820+ data5 = vec_max(vec_min(data5, max_q), min_q);
2821+ data6 = vec_max(vec_min(data6, max_q), min_q);
2822+ data7 = vec_max(vec_min(data7, max_q), min_q);
2823+ }
2824+
2825+ vector bool char zero_01, zero_23, zero_45, zero_67;
2826+ vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
2827+ vector signed char negOne = vec_splat_s8(-1);
2828+ vector signed char* scanPtr =
2829+ (vector signed char*)(s->intra_scantable.inverse);
2830+
2831+ // Determine the largest non-zero index.
2832+ zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero),
2833+ vec_cmpeq(data1, (vector short)zero));
2834+ zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero),
2835+ vec_cmpeq(data3, (vector short)zero));
2836+ zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero),
2837+ vec_cmpeq(data5, (vector short)zero));
2838+ zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero),
2839+ vec_cmpeq(data7, (vector short)zero));
2840+
2841+ // 64 largest values
2842+ scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
2843+ scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
2844+ scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
2845+ scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
2846+
2847+ // 32 largest values
2848+ scanIndices_01 = vec_max(scanIndices_01, scanIndices_23);
2849+ scanIndices_45 = vec_max(scanIndices_45, scanIndices_67);
2850+
2851+ // 16 largest values
2852+ scanIndices_01 = vec_max(scanIndices_01, scanIndices_45);
2853+
2854+ // 8 largest values
2855+ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2856+ vec_mergel(scanIndices_01, negOne));
2857+
2858+ // 4 largest values
2859+ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2860+ vec_mergel(scanIndices_01, negOne));
2861+
2862+ // 2 largest values
2863+ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2864+ vec_mergel(scanIndices_01, negOne));
2865+
2866+ // largest value
2867+ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2868+ vec_mergel(scanIndices_01, negOne));
2869+
2870+ scanIndices_01 = vec_splat(scanIndices_01, 0);
2871+
2872+ signed char lastNonZeroChar;
2873+
2874+ vec_ste(scanIndices_01, 0, &lastNonZeroChar);
2875+
2876+ lastNonZero = lastNonZeroChar;
2877+
2878+ // While the data is still in vectors we check for the transpose IDCT permute
2879+ // and handle it using the vector unit if we can. This is the permute used
2880+ // by the altivec idct, so it is common when using the altivec dct.
2881+
2882+ if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
2883+ {
2884+ TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2885+ }
2886+
2887+ vec_st(data0, 0, data);
2888+ vec_st(data1, 16, data);
2889+ vec_st(data2, 32, data);
2890+ vec_st(data3, 48, data);
2891+ vec_st(data4, 64, data);
2892+ vec_st(data5, 80, data);
2893+ vec_st(data6, 96, data);
2894+ vec_st(data7, 112, data);
2895+ }
2896+
2897+ // special handling of block[0]
2898+ if (s->mb_intra)
2899+ {
2900+ if (!s->h263_aic)
2901+ {
2902+ if (n < 4)
2903+ oldBaseValue /= s->y_dc_scale;
2904+ else
2905+ oldBaseValue /= s->c_dc_scale;
2906+ }
2907+
2908+ // Divide by 8, rounding the result
2909+ data[0] = (oldBaseValue + 4) >> 3;
2910+ }
2911+
2912+ // We handled the transpose permutation above and we don't
2913+ // need to permute the "no" permutation case.
2914+ if ((lastNonZero > 0) &&
2915+ (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
2916+ (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
2917+ {
2918+ ff_block_permute(data, s->dsp.idct_permutation,
2919+ s->intra_scantable.scantable, lastNonZero);
2920+ }
2921+
2922+ return lastNonZero;
2923+}
2924+#undef FOUROF
2925+
2926+/*
2927+ AltiVec version of dct_unquantize_h263
2928+ this code assumes `block' is 16-byte aligned
2929+*/
2930+void dct_unquantize_h263_altivec(MpegEncContext *s,
2931+ DCTELEM *block, int n, int qscale)
2932+{
2933+POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
2934+ int i, level, qmul, qadd;
2935+ int nCoeffs;
2936+
2937+ assert(s->block_last_index[n]>=0);
2938+
2939+POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
2940+
2941+ qadd = (qscale - 1) | 1;
2942+ qmul = qscale << 1;
2943+
2944+ if (s->mb_intra) {
2945+ if (!s->h263_aic) {
2946+ if (n < 4)
2947+ block[0] = block[0] * s->y_dc_scale;
2948+ else
2949+ block[0] = block[0] * s->c_dc_scale;
2950+ }else
2951+ qadd = 0;
2952+ i = 1;
2953+ nCoeffs= 63; //does not always use the zigzag table
2954+ } else {
2955+ i = 0;
2956+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
2957+ }
2958+
2959+#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2960+ for(;i<=nCoeffs;i++) {
2961+ level = block[i];
2962+ if (level) {
2963+ if (level < 0) {
2964+ level = level * qmul - qadd;
2965+ } else {
2966+ level = level * qmul + qadd;
2967+ }
2968+ block[i] = level;
2969+ }
2970+ }
2971+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2972+ {
2973+ register const vector short vczero = (const vector short)vec_splat_s16(0);
2974+ short __attribute__ ((aligned(16))) qmul8[] =
2975+ {
2976+ qmul, qmul, qmul, qmul,
2977+ qmul, qmul, qmul, qmul
2978+ };
2979+ short __attribute__ ((aligned(16))) qadd8[] =
2980+ {
2981+ qadd, qadd, qadd, qadd,
2982+ qadd, qadd, qadd, qadd
2983+ };
2984+ short __attribute__ ((aligned(16))) nqadd8[] =
2985+ {
2986+ -qadd, -qadd, -qadd, -qadd,
2987+ -qadd, -qadd, -qadd, -qadd
2988+ };
2989+ register vector short blockv, qmulv, qaddv, nqaddv, temp1;
2990+ register vector bool short blockv_null, blockv_neg;
2991+ register short backup_0 = block[0];
2992+ register int j = 0;
2993+
2994+ qmulv = vec_ld(0, qmul8);
2995+ qaddv = vec_ld(0, qadd8);
2996+ nqaddv = vec_ld(0, nqadd8);
2997+
2998+#if 0 // block *is* 16-byte aligned, it seems.
2999+ // first make sure block[j] is 16-byte aligned
3000+ for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
3001+ level = block[j];
3002+ if (level) {
3003+ if (level < 0) {
3004+ level = level * qmul - qadd;
3005+ } else {
3006+ level = level * qmul + qadd;
3007+ }
3008+ block[j] = level;
3009+ }
3010+ }
3011+#endif
3012+
3013+ // vectorize all the 16-byte aligned blocks
3014+ // of 8 elements
3015+ for(; (j + 7) <= nCoeffs ; j+=8)
3016+ {
3017+ blockv = vec_ld(j << 1, block);
3018+ blockv_neg = vec_cmplt(blockv, vczero);
3019+ blockv_null = vec_cmpeq(blockv, vczero);
3020+ // choose between +qadd or -qadd as the third operand
3021+ temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
3022+ // multiply & add (block[i..i+7] * qmul [+-] qadd)
3023+ temp1 = vec_mladd(blockv, qmulv, temp1);
3024+ // put 0 where block[i..i+7] used to have 0
3025+ blockv = vec_sel(temp1, blockv, blockv_null);
3026+ vec_st(blockv, j << 1, block);
3027+ }
3028+
3029+ // if nCoeffs isn't a multiple of 8, finish the job
3030+ // using good old scalar units.
3031+ // (we could do it using a truncated vector,
3032+ // but I'm not sure it's worth the hassle)
3033+ for(; j <= nCoeffs ; j++) {
3034+ level = block[j];
3035+ if (level) {
3036+ if (level < 0) {
3037+ level = level * qmul - qadd;
3038+ } else {
3039+ level = level * qmul + qadd;
3040+ }
3041+ block[j] = level;
3042+ }
3043+ }
3044+
3045+ if (i == 1)
3046+ { // cheat: this avoids special-casing the first iteration
3047+ block[0] = backup_0;
3048+ }
3049+ }
3050+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3051+
3052+POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
3053+}
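
The vec_sel/vec_max cascade in dct_quantize_altivec() computes the largest scan position that still holds a non-zero coefficient. The same result, written as a plain loop (illustrative sketch; the identity scan table below is only an example):

#include <stdio.h>
#include <stdint.h>

/* inverse[] maps a raster position to its position in scan order,
   like s->intra_scantable.inverse in the code above */
static int last_non_zero(const int16_t data[64], const uint8_t inverse[64])
{
    int i, last = -1;
    for (i = 0; i < 64; i++)
        if (data[i] != 0 && inverse[i] > last)
            last = inverse[i];
    return last; /* -1 when the whole block is zero */
}

int main(void)
{
    uint8_t inverse[64];
    int16_t data[64] = { 0 };
    int i;

    for (i = 0; i < 64; i++) inverse[i] = (uint8_t)i; /* identity scan */
    data[5]  = 3;
    data[20] = -1;

    printf("lastNonZero = %d\n", last_non_zero(data, inverse)); /* 20 */
    return 0;
}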
3054diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c
3055--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c 1970-01-01 01:00:00.000000000 +0100
3056+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c 2003-09-28 17:26:40.000000000 +0200
3057@@ -0,0 +1,83 @@
3058+/*\r
3059+ * Copyright (c) 2002 Dieter Shirley\r
3060+ *\r
3061+ * This library is free software; you can redistribute it and/or\r
3062+ * modify it under the terms of the GNU Lesser General Public\r
3063+ * License as published by the Free Software Foundation; either\r
3064+ * version 2 of the License, or (at your option) any later version.\r
3065+ *\r
3066+ * This library is distributed in the hope that it will be useful,\r
3067+ * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
3068+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\r
3069+ * Lesser General Public License for more details.\r
3070+ *\r
3071+ * You should have received a copy of the GNU Lesser General Public\r
3072+ * License along with this library; if not, write to the Free Software\r
3073+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA\r
3074+ */\r
3075+ \r
3076+#include "../dsputil.h"\r
3077+#include "../mpegvideo.h"\r
3078+#include <time.h>\r
3079+\r
3080+#ifdef HAVE_ALTIVEC\r
3081+#include "dsputil_altivec.h"\r
3082+#endif\r
3083+\r
3084+extern int dct_quantize_altivec(MpegEncContext *s, \r
3085+ DCTELEM *block, int n,\r
3086+ int qscale, int *overflow);\r
3087+extern void dct_unquantize_h263_altivec(MpegEncContext *s,
3088+ DCTELEM *block, int n, int qscale);
3089+\r
3090+extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);\r
3091+extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);\r
3092+\r
3093+\r
3094+void MPV_common_init_ppc(MpegEncContext *s)\r
3095+{\r
3096+#if HAVE_ALTIVEC\r
3097+ if (has_altivec())\r
3098+ {\r
3099+ if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||\r
3100+ (s->avctx->idct_algo == FF_IDCT_ALTIVEC))\r
3101+ {\r
3102+ s->dsp.idct_put = idct_put_altivec;\r
3103+ s->dsp.idct_add = idct_add_altivec;\r
3104+#ifndef ALTIVEC_USE_REFERENCE_C_CODE
3105+ s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;\r
3106+#else /* ALTIVEC_USE_REFERENCE_C_CODE */
3107+ s->dsp.idct_permutation_type = FF_NO_IDCT_PERM;
3108+#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3109+ }\r
3110+\r
3111+ // Test to make sure that the dct required alignments are met.\r
3112+ if ((((long)(s->q_intra_matrix) & 0x0f) != 0) ||\r
3113+ (((long)(s->q_inter_matrix) & 0x0f) != 0))\r
3114+ {\r
3115+ fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned "\r
3116+ "to use Altivec DCT. Reverting to non-altivec version.\n");\r
3117+ return;\r
3118+ }\r
3119+\r
3120+ if (((long)(s->intra_scantable.inverse) & 0x0f) != 0)\r
3121+ {\r
3122+ fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned "\r
3123+ "to use Altivec DCT. Reverting to non-altivec version.\n");\r
3124+ return;\r
3125+ }\r
3126+\r
3127+\r
3128+ if ((s->avctx->dct_algo == FF_DCT_AUTO) ||\r
3129+ (s->avctx->dct_algo == FF_DCT_ALTIVEC))\r
3130+ {\r
3131+ s->dct_quantize = dct_quantize_altivec;\r
3132+ s->dct_unquantize_h263 = dct_unquantize_h263_altivec;
3133+ }\r
3134+ } else\r
3135+#endif\r
3136+ {\r
3137+ /* Non-AltiVec PPC optimisations here */\r
3138+ }\r
3139+}\r
3140+\r
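
The two alignment guards in MPV_common_init_ppc() reduce to a "& 0x0f" test on the pointer value; a minimal sketch of that test (not part of the patch):

#include <stdio.h>
#include <stdint.h>

static int is_aligned16(const void *p)
{
    return ((uintptr_t)p & 0x0f) == 0;
}

int main(void)
{
    /* over-allocate so both an aligned and a misaligned pointer exist */
    static unsigned char buf[32] __attribute__ ((aligned(16)));

    printf("buf     aligned: %d\n", is_aligned16(buf));     /* 1 */
    printf("buf + 1 aligned: %d\n", is_aligned16(buf + 1)); /* 0 */
    return 0;
}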
f497b632
JB
3141--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am.orig 2003-05-25 23:11:57.000000000 +0200
3142+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am 2003-11-14 01:06:03.904622008 +0100
3143@@ -20,6 +20,6 @@
3144
3145 libavcodecppc_la_SOURCES = $(PPC_SRC)
3146
3147-AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -I$(srcdir)/../..
c333e025 3148+AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -DHAVE_ALTIVEC_H -DHAVE_ALTIVEC -maltivec -mabi=altivec -I$(srcdir)/../..
f497b632
JB
3149
3150 MAINTAINERCLEANFILES = Makefile.in