From 9912bce9035fa14f6face36d14de0a48baac7463 Mon Sep 17 00:00:00 2001
From: Jakub Bogusz
Date: Thu, 13 Nov 2003 23:44:44 +0000
Subject: [PATCH 1/1] - missing files+update from ffmpeg 0.4.8

Changed files:
    avifile-ffmpeg-ppc.patch -> 1.1
---
 avifile-ffmpeg-ppc.patch | 3140 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 3140 insertions(+)
 create mode 100644 avifile-ffmpeg-ppc.patch

diff --git a/avifile-ffmpeg-ppc.patch b/avifile-ffmpeg-ppc.patch
new file mode 100644
index 0000000..356b511
--- /dev/null
+++ b/avifile-ffmpeg-ppc.patch
@@ -0,0 +1,3140 @@
+diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c
+--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c	1970-01-01 01:00:00.000000000 +0100
++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c	2003-09-28 17:26:40.000000000 +0200
+@@ -0,0 +1,1345 @@
++/*
++ * Copyright (c) 2002 Brian Foley
++ * Copyright (c) 2002 Dieter Shirley
++ * Copyright (c) 2003 Romain Dolbeau
++ *
++ * This library is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2 of the License, or (at your option) any later version.
++ *
++ * This library is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with this library; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++ */
++
++#include "../dsputil.h"
++
++#include "gcc_fixes.h"
++
++#include "dsputil_altivec.h"
++
++#ifdef CONFIG_DARWIN
++#include <sys/sysctl.h>
++#else /* CONFIG_DARWIN */
++#include <signal.h>
++#include <setjmp.h>
++
++static sigjmp_buf jmpbuf;
++static volatile sig_atomic_t canjump = 0;
++
++static void sigill_handler (int sig)
++{
++    if (!canjump) {
++        signal (sig, SIG_DFL);
++        raise (sig);
++    }
++
++    canjump = 0;
++    siglongjmp (jmpbuf, 1);
++}
++#endif /* CONFIG_DARWIN */
++
++int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
++{
++    int i;
++    int s __attribute__((aligned(16)));
++    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
++    vector unsigned char *tv;
++    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
++    vector unsigned int sad;
++    vector signed int sumdiffs;
++
++    s = 0;
++    sad = (vector unsigned int)vec_splat_u32(0);
++    for(i=0;i<16;i++) {
++        /*
++           Read unaligned pixels into our vectors.
The vectors are as follows: ++ pix1v: pix1[0]-pix1[15] ++ pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] ++ */ ++ tv = (vector unsigned char *) pix1; ++ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); ++ ++ tv = (vector unsigned char *) &pix2[0]; ++ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); ++ ++ tv = (vector unsigned char *) &pix2[1]; ++ pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); ++ ++ /* Calculate the average vector */ ++ avgv = vec_avg(pix2v, pix2iv); ++ ++ /* Calculate a sum of abs differences vector */ ++ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t5, sad); ++ ++ pix1 += line_size; ++ pix2 += line_size; ++ } ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ ++ return s; ++} ++ ++int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); ++ vector unsigned char *tv; ++ vector unsigned char pix1v, pix2v, pix3v, avgv, t5; ++ vector unsigned int sad; ++ vector signed int sumdiffs; ++ uint8_t *pix3 = pix2 + line_size; ++ ++ s = 0; ++ sad = (vector unsigned int)vec_splat_u32(0); ++ ++ /* ++ Due to the fact that pix3 = pix2 + line_size, the pix3 of one ++ iteration becomes pix2 in the next iteration. We can use this ++ fact to avoid a potentially expensive unaligned read, each ++ time around the loop. ++ Read unaligned pixels into our vectors. The vectors are as follows: ++ pix2v: pix2[0]-pix2[15] ++ Split the pixel vectors into shorts ++ */ ++ tv = (vector unsigned char *) &pix2[0]; ++ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); ++ ++ for(i=0;i<16;i++) { ++ /* ++ Read unaligned pixels into our vectors. 
The vectors are as follows: ++ pix1v: pix1[0]-pix1[15] ++ pix3v: pix3[0]-pix3[15] ++ */ ++ tv = (vector unsigned char *) pix1; ++ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); ++ ++ tv = (vector unsigned char *) &pix3[0]; ++ pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); ++ ++ /* Calculate the average vector */ ++ avgv = vec_avg(pix2v, pix3v); ++ ++ /* Calculate a sum of abs differences vector */ ++ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t5, sad); ++ ++ pix1 += line_size; ++ pix2v = pix3v; ++ pix3 += line_size; ++ ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ return s; ++} ++ ++int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ uint8_t *pix3 = pix2 + line_size; ++ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); ++ const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); ++ vector unsigned char *tv, avgv, t5; ++ vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; ++ vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; ++ vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; ++ vector unsigned short avghv, avglv; ++ vector unsigned short t1, t2, t3, t4; ++ vector unsigned int sad; ++ vector signed int sumdiffs; ++ ++ sad = (vector unsigned int)vec_splat_u32(0); ++ ++ s = 0; ++ ++ /* ++ Due to the fact that pix3 = pix2 + line_size, the pix3 of one ++ iteration becomes pix2 in the next iteration. We can use this ++ fact to avoid a potentially expensive unaligned read, as well ++ as some splitting, and vector addition each time around the loop. ++ Read unaligned pixels into our vectors. The vectors are as follows: ++ pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] ++ Split the pixel vectors into shorts ++ */ ++ tv = (vector unsigned char *) &pix2[0]; ++ pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); ++ ++ tv = (vector unsigned char *) &pix2[1]; ++ pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); ++ ++ pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); ++ pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); ++ pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); ++ pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); ++ t1 = vec_add(pix2hv, pix2ihv); ++ t2 = vec_add(pix2lv, pix2ilv); ++ ++ for(i=0;i<16;i++) { ++ /* ++ Read unaligned pixels into our vectors. The vectors are as follows: ++ pix1v: pix1[0]-pix1[15] ++ pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] ++ */ ++ tv = (vector unsigned char *) pix1; ++ pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); ++ ++ tv = (vector unsigned char *) &pix3[0]; ++ pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); ++ ++ tv = (vector unsigned char *) &pix3[1]; ++ pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); ++ ++ /* ++ Note that Altivec does have vec_avg, but this works on vector pairs ++ and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding ++ would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. ++ Instead, we have to split the pixel vectors into vectors of shorts, ++ and do the averaging by hand. 
++ */ ++ ++ /* Split the pixel vectors into shorts */ ++ pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); ++ pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); ++ pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); ++ pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); ++ ++ /* Do the averaging on them */ ++ t3 = vec_add(pix3hv, pix3ihv); ++ t4 = vec_add(pix3lv, pix3ilv); ++ ++ avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); ++ avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); ++ ++ /* Pack the shorts back into a result */ ++ avgv = vec_pack(avghv, avglv); ++ ++ /* Calculate a sum of abs differences vector */ ++ t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t5, sad); ++ ++ pix1 += line_size; ++ pix3 += line_size; ++ /* Transfer the calculated values for pix3 into pix2 */ ++ t1 = t3; ++ t2 = t4; ++ } ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ ++ return s; ++} ++ ++int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char perm1, perm2, *pix1v, *pix2v; ++ vector unsigned char t1, t2, t3,t4, t5; ++ vector unsigned int sad; ++ vector signed int sumdiffs; ++ ++ sad = (vector unsigned int)vec_splat_u32(0); ++ ++ ++ for(i=0;i<16;i++) { ++ /* Read potentially unaligned pixels into t1 and t2 */ ++ perm1 = vec_lvsl(0, pix1); ++ pix1v = (vector unsigned char *) pix1; ++ perm2 = vec_lvsl(0, pix2); ++ pix2v = (vector unsigned char *) pix2; ++ t1 = vec_perm(pix1v[0], pix1v[1], perm1); ++ t2 = vec_perm(pix2v[0], pix2v[1], perm2); ++ ++ /* Calculate a sum of abs differences vector */ ++ t3 = vec_max(t1, t2); ++ t4 = vec_min(t1, t2); ++ t5 = vec_sub(t3, t4); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t5, sad); ++ ++ pix1 += line_size; ++ pix2 += line_size; ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ ++ return s; ++} ++ ++int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; ++ vector unsigned char t1, t2, t3,t4, t5; ++ vector unsigned int sad; ++ vector signed int sumdiffs; ++ ++ sad = (vector unsigned int)vec_splat_u32(0); ++ ++ permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); ++ ++ for(i=0;i<8;i++) { ++ /* Read potentially unaligned pixels into t1 and t2 ++ Since we're reading 16 pixels, and actually only want 8, ++ mask out the last 8 pixels. The 0s don't change the sum. 
*/ ++ perm1 = vec_lvsl(0, pix1); ++ pix1v = (vector unsigned char *) pix1; ++ perm2 = vec_lvsl(0, pix2); ++ pix2v = (vector unsigned char *) pix2; ++ t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); ++ t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); ++ ++ /* Calculate a sum of abs differences vector */ ++ t3 = vec_max(t1, t2); ++ t4 = vec_min(t1, t2); ++ t5 = vec_sub(t3, t4); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t5, sad); ++ ++ pix1 += line_size; ++ pix2 += line_size; ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ ++ return s; ++} ++ ++int pix_norm1_altivec(uint8_t *pix, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char *tv; ++ vector unsigned char pixv; ++ vector unsigned int sv; ++ vector signed int sum; ++ ++ sv = (vector unsigned int)vec_splat_u32(0); ++ ++ s = 0; ++ for (i = 0; i < 16; i++) { ++ /* Read in the potentially unaligned pixels */ ++ tv = (vector unsigned char *) pix; ++ pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); ++ ++ /* Square the values, and add them to our sum */ ++ sv = vec_msum(pixv, pixv, sv); ++ ++ pix += line_size; ++ } ++ /* Sum up the four partial sums, and put the result into s */ ++ sum = vec_sums((vector signed int) sv, (vector signed int) zero); ++ sum = vec_splat(sum, 3); ++ vec_ste(sum, 0, &s); ++ ++ return s; ++} ++ ++/** ++ * Sum of Squared Errors for a 8x8 block. ++ * AltiVec-enhanced. ++ * It's the pix_abs8x8_altivec code above w/ squaring added. ++ */ ++int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; ++ vector unsigned char t1, t2, t3,t4, t5; ++ vector unsigned int sum; ++ vector signed int sumsqr; ++ ++ sum = (vector unsigned int)vec_splat_u32(0); ++ ++ permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); ++ ++ ++ for(i=0;i<8;i++) { ++ /* Read potentially unaligned pixels into t1 and t2 ++ Since we're reading 16 pixels, and actually only want 8, ++ mask out the last 8 pixels. The 0s don't change the sum. */ ++ perm1 = vec_lvsl(0, pix1); ++ pix1v = (vector unsigned char *) pix1; ++ perm2 = vec_lvsl(0, pix2); ++ pix2v = (vector unsigned char *) pix2; ++ t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); ++ t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); ++ ++ /* ++ Since we want to use unsigned chars, we can take advantage ++ of the fact that abs(a-b)^2 = (a-b)^2. ++ */ ++ ++ /* Calculate abs differences vector */ ++ t3 = vec_max(t1, t2); ++ t4 = vec_min(t1, t2); ++ t5 = vec_sub(t3, t4); ++ ++ /* Square the values and add them to our sum */ ++ sum = vec_msum(t5, t5, sum); ++ ++ pix1 += line_size; ++ pix2 += line_size; ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); ++ sumsqr = vec_splat(sumsqr, 3); ++ vec_ste(sumsqr, 0, &s); ++ ++ return s; ++} ++ ++/** ++ * Sum of Squared Errors for a 16x16 block. ++ * AltiVec-enhanced. ++ * It's the pix_abs16x16_altivec code above w/ squaring added. 
++ */ ++int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) ++{ ++ int i; ++ int s __attribute__((aligned(16))); ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char perm1, perm2, *pix1v, *pix2v; ++ vector unsigned char t1, t2, t3,t4, t5; ++ vector unsigned int sum; ++ vector signed int sumsqr; ++ ++ sum = (vector unsigned int)vec_splat_u32(0); ++ ++ for(i=0;i<16;i++) { ++ /* Read potentially unaligned pixels into t1 and t2 */ ++ perm1 = vec_lvsl(0, pix1); ++ pix1v = (vector unsigned char *) pix1; ++ perm2 = vec_lvsl(0, pix2); ++ pix2v = (vector unsigned char *) pix2; ++ t1 = vec_perm(pix1v[0], pix1v[1], perm1); ++ t2 = vec_perm(pix2v[0], pix2v[1], perm2); ++ ++ /* ++ Since we want to use unsigned chars, we can take advantage ++ of the fact that abs(a-b)^2 = (a-b)^2. ++ */ ++ ++ /* Calculate abs differences vector */ ++ t3 = vec_max(t1, t2); ++ t4 = vec_min(t1, t2); ++ t5 = vec_sub(t3, t4); ++ ++ /* Square the values and add them to our sum */ ++ sum = vec_msum(t5, t5, sum); ++ ++ pix1 += line_size; ++ pix2 += line_size; ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); ++ sumsqr = vec_splat(sumsqr, 3); ++ vec_ste(sumsqr, 0, &s); ++ ++ return s; ++} ++ ++int pix_sum_altivec(uint8_t * pix, int line_size) ++{ ++ const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); ++ vector unsigned char perm, *pixv; ++ vector unsigned char t1; ++ vector unsigned int sad; ++ vector signed int sumdiffs; ++ ++ int i; ++ int s __attribute__((aligned(16))); ++ ++ sad = (vector unsigned int)vec_splat_u32(0); ++ ++ for (i = 0; i < 16; i++) { ++ /* Read the potentially unaligned 16 pixels into t1 */ ++ perm = vec_lvsl(0, pix); ++ pixv = (vector unsigned char *) pix; ++ t1 = vec_perm(pixv[0], pixv[1], perm); ++ ++ /* Add each 4 pixel group together and put 4 results into sad */ ++ sad = vec_sum4s(t1, sad); ++ ++ pix += line_size; ++ } ++ ++ /* Sum up the four partial sums, and put the result into s */ ++ sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); ++ sumdiffs = vec_splat(sumdiffs, 3); ++ vec_ste(sumdiffs, 0, &s); ++ ++ return s; ++} ++ ++void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) ++{ ++ int i; ++ vector unsigned char perm, bytes, *pixv; ++ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); ++ vector signed short shorts; ++ ++ for(i=0;i<8;i++) ++ { ++ // Read potentially unaligned pixels. ++ // We're reading 16 pixels, and actually only want 8, ++ // but we simply ignore the extras. ++ perm = vec_lvsl(0, pixels); ++ pixv = (vector unsigned char *) pixels; ++ bytes = vec_perm(pixv[0], pixv[1], perm); ++ ++ // convert the bytes into shorts ++ shorts = (vector signed short)vec_mergeh(zero, bytes); ++ ++ // save the data to the block, we assume the block is 16-byte aligned ++ vec_st(shorts, i*16, (vector signed short*)block); ++ ++ pixels += line_size; ++ } ++} ++ ++void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, ++ const uint8_t *s2, int stride) ++{ ++ int i; ++ vector unsigned char perm, bytes, *pixv; ++ const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); ++ vector signed short shorts1, shorts2; ++ ++ for(i=0;i<4;i++) ++ { ++ // Read potentially unaligned pixels ++ // We're reading 16 pixels, and actually only want 8, ++ // but we simply ignore the extras. 
++ perm = vec_lvsl(0, s1); ++ pixv = (vector unsigned char *) s1; ++ bytes = vec_perm(pixv[0], pixv[1], perm); ++ ++ // convert the bytes into shorts ++ shorts1 = (vector signed short)vec_mergeh(zero, bytes); ++ ++ // Do the same for the second block of pixels ++ perm = vec_lvsl(0, s2); ++ pixv = (vector unsigned char *) s2; ++ bytes = vec_perm(pixv[0], pixv[1], perm); ++ ++ // convert the bytes into shorts ++ shorts2 = (vector signed short)vec_mergeh(zero, bytes); ++ ++ // Do the subtraction ++ shorts1 = vec_sub(shorts1, shorts2); ++ ++ // save the data to the block, we assume the block is 16-byte aligned ++ vec_st(shorts1, 0, (vector signed short*)block); ++ ++ s1 += stride; ++ s2 += stride; ++ block += 8; ++ ++ ++ // The code below is a copy of the code above... This is a manual ++ // unroll. ++ ++ // Read potentially unaligned pixels ++ // We're reading 16 pixels, and actually only want 8, ++ // but we simply ignore the extras. ++ perm = vec_lvsl(0, s1); ++ pixv = (vector unsigned char *) s1; ++ bytes = vec_perm(pixv[0], pixv[1], perm); ++ ++ // convert the bytes into shorts ++ shorts1 = (vector signed short)vec_mergeh(zero, bytes); ++ ++ // Do the same for the second block of pixels ++ perm = vec_lvsl(0, s2); ++ pixv = (vector unsigned char *) s2; ++ bytes = vec_perm(pixv[0], pixv[1], perm); ++ ++ // convert the bytes into shorts ++ shorts2 = (vector signed short)vec_mergeh(zero, bytes); ++ ++ // Do the subtraction ++ shorts1 = vec_sub(shorts1, shorts2); ++ ++ // save the data to the block, we assume the block is 16-byte aligned ++ vec_st(shorts1, 0, (vector signed short*)block); ++ ++ s1 += stride; ++ s2 += stride; ++ block += 8; ++ } ++} ++ ++int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { ++ return pix_abs16x16_altivec(a,b,stride); ++} ++ ++int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { ++ return pix_abs8x8_altivec(a,b,stride); ++} ++ ++void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int i; ++ for(i=0; i+7l); ++ *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); ++ *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); ++ *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); ++ pixels+=line_size; ++ block +=line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register vector unsigned char pixelsv1, pixelsv2; ++ register vector unsigned char pixelsv1B, pixelsv2B; ++ register vector unsigned char pixelsv1C, pixelsv2C; ++ register vector unsigned char pixelsv1D, pixelsv2D; ++ ++ register vector unsigned char perm = vec_lvsl(0, pixels); ++ int i; ++ register int line_size_2 = line_size << 1; ++ register int line_size_3 = line_size + line_size_2; ++ register int line_size_4 = line_size << 2; ++ ++POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); ++// hand-unrolling the loop by 4 gains about 15% ++// mininum execution time goes from 74 to 60 cycles ++// it's faster than -funroll-loops, but using ++// -funroll-loops w/ this is bad - 74 cycles again. 
++// all this is on a 7450, tuning for the 7450 ++#if 0 ++ for(i=0; i>1) ) ++void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) ++{ ++POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int i; ++ ++POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); ++ ++ for(i=0; il)); ++ op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); ++ op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); ++ op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); ++ pixels+=line_size; ++ block +=line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; ++ register vector unsigned char perm = vec_lvsl(0, pixels); ++ int i; ++ ++POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); ++ ++ for(i=0; il))) - ++ ((((*((uint32_t *) (block))) ^ ++ ((((const struct unaligned_32 *) (pixels))-> ++ l))) & 0xFEFEFEFEUL) >> 1)); ++ *((uint32_t *) (block + 4)) = ++ (((*((uint32_t *) (block + 4))) | ++ ((((const struct unaligned_32 *) (pixels + 4))->l))) - ++ ((((*((uint32_t *) (block + 4))) ^ ++ ((((const struct unaligned_32 *) (pixels + ++ 4))-> ++ l))) & 0xFEFEFEFEUL) >> 1)); ++ pixels += line_size; ++ block += line_size; ++ } ++POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; ++ int i; ++ ++POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); ++ ++ for (i = 0; i < h; i++) { ++ /* ++ block is 8 bytes-aligned, so we're either in the ++ left block (16 bytes-aligned) or in the right block (not) ++ */ ++ int rightside = ((unsigned long)block & 0x0000000F); ++ ++ blockv = vec_ld(0, block); ++ pixelsv1 = vec_ld(0, (unsigned char*)pixels); ++ pixelsv2 = vec_ld(16, (unsigned char*)pixels); ++ pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); ++ ++ if (rightside) ++ { ++ pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); ++ } ++ else ++ { ++ pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); ++ } ++ ++ blockv = vec_avg(blockv, pixelsv); ++ ++ vec_st(blockv, 0, block); ++ ++ pixels += line_size; ++ block += line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); ++ ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++/* next one assumes that ((line_size % 8) == 0) */ ++void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) ++{ ++POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int j; ++POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); ++ for (j = 0; j < 2; j++) { ++ int i; ++ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ const uint32_t b = ++ (((const struct unaligned_32 *) (pixels + 1))->l); ++ uint32_t l0 = ++ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; ++ uint32_t h0 = ++ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ uint32_t l1, h1; ++ pixels += line_size; ++ for (i = 0; i < h; i += 2) { ++ uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l1 = (a & 0x03030303UL) + (b & 0x03030303UL); ++ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ a = 
(((const struct unaligned_32 *) (pixels))->l); ++ b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; ++ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ } pixels += 4 - line_size * (h + 1); ++ block += 4 - line_size * h; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register int i; ++ register vector unsigned char ++ pixelsv1, pixelsv2, ++ pixelsavg; ++ register vector unsigned char ++ blockv, temp1, temp2; ++ register vector unsigned short ++ pixelssum1, pixelssum2, temp3; ++ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); ++ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); ++ ++ temp1 = vec_ld(0, pixels); ++ temp2 = vec_ld(16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); ++ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); ++ } ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum1 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ pixelssum1 = vec_add(pixelssum1, vctwo); ++ ++POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); ++ for (i = 0; i < h ; i++) { ++ int rightside = ((unsigned long)block & 0x0000000F); ++ blockv = vec_ld(0, block); ++ ++ temp1 = vec_ld(line_size, pixels); ++ temp2 = vec_ld(line_size + 16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); ++ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); ++ } ++ ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum2 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ temp3 = vec_add(pixelssum1, pixelssum2); ++ temp3 = vec_sra(temp3, vctwo); ++ pixelssum1 = vec_add(pixelssum2, vctwo); ++ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); ++ ++ if (rightside) ++ { ++ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); ++ } ++ else ++ { ++ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); ++ } ++ ++ vec_st(blockv, 0, block); ++ ++ block += line_size; ++ pixels += line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++/* next one assumes that ((line_size % 8) == 0) */ ++void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) ++{ ++POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int j; ++POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); ++ for (j = 0; j < 2; j++) { ++ int i; ++ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ const uint32_t b = ++ (((const struct unaligned_32 *) (pixels + 1))->l); ++ uint32_t l0 = ++ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; ++ uint32_t h0 = ++ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ uint32_t l1, h1; ++ pixels += line_size; ++ for (i = 0; i < h; i += 2) { ++ uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ uint32_t b = 
(((const struct unaligned_32 *) (pixels + 1))->l); ++ l1 = (a & 0x03030303UL) + (b & 0x03030303UL); ++ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ a = (((const struct unaligned_32 *) (pixels))->l); ++ b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; ++ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ } pixels += 4 - line_size * (h + 1); ++ block += 4 - line_size * h; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register int i; ++ register vector unsigned char ++ pixelsv1, pixelsv2, ++ pixelsavg; ++ register vector unsigned char ++ blockv, temp1, temp2; ++ register vector unsigned short ++ pixelssum1, pixelssum2, temp3; ++ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); ++ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); ++ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); ++ ++ temp1 = vec_ld(0, pixels); ++ temp2 = vec_ld(16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); ++ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); ++ } ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum1 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ pixelssum1 = vec_add(pixelssum1, vcone); ++ ++POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); ++ for (i = 0; i < h ; i++) { ++ int rightside = ((unsigned long)block & 0x0000000F); ++ blockv = vec_ld(0, block); ++ ++ temp1 = vec_ld(line_size, pixels); ++ temp2 = vec_ld(line_size + 16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); ++ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); ++ } ++ ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum2 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ temp3 = vec_add(pixelssum1, pixelssum2); ++ temp3 = vec_sra(temp3, vctwo); ++ pixelssum1 = vec_add(pixelssum2, vcone); ++ pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); ++ ++ if (rightside) ++ { ++ blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); ++ } ++ else ++ { ++ blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); ++ } ++ ++ vec_st(blockv, 0, block); ++ ++ block += line_size; ++ pixels += line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++/* next one assumes that ((line_size % 16) == 0) */ ++void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) ++{ ++POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int j; ++POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); ++ for (j = 0; j < 4; j++) { ++ int i; ++ const uint32_t a = (((const struct unaligned_32 *) 
(pixels))->l); ++ const uint32_t b = ++ (((const struct unaligned_32 *) (pixels + 1))->l); ++ uint32_t l0 = ++ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; ++ uint32_t h0 = ++ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ uint32_t l1, h1; ++ pixels += line_size; ++ for (i = 0; i < h; i += 2) { ++ uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l1 = (a & 0x03030303UL) + (b & 0x03030303UL); ++ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ a = (((const struct unaligned_32 *) (pixels))->l); ++ b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; ++ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ } pixels += 4 - line_size * (h + 1); ++ block += 4 - line_size * h; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register int i; ++ register vector unsigned char ++ pixelsv1, pixelsv2, pixelsv3, pixelsv4; ++ register vector unsigned char ++ blockv, temp1, temp2; ++ register vector unsigned short ++ pixelssum1, pixelssum2, temp3, ++ pixelssum3, pixelssum4, temp4; ++ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); ++ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); ++ ++POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); ++ ++ temp1 = vec_ld(0, pixels); ++ temp2 = vec_ld(16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); ++ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); ++ } ++ pixelsv3 = vec_mergel(vczero, pixelsv1); ++ pixelsv4 = vec_mergel(vczero, pixelsv2); ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum3 = vec_add((vector unsigned short)pixelsv3, ++ (vector unsigned short)pixelsv4); ++ pixelssum3 = vec_add(pixelssum3, vctwo); ++ pixelssum1 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ pixelssum1 = vec_add(pixelssum1, vctwo); ++ ++ for (i = 0; i < h ; i++) { ++ blockv = vec_ld(0, block); ++ ++ temp1 = vec_ld(line_size, pixels); ++ temp2 = vec_ld(line_size + 16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); ++ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); ++ } ++ ++ pixelsv3 = vec_mergel(vczero, pixelsv1); ++ pixelsv4 = vec_mergel(vczero, pixelsv2); ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ ++ pixelssum4 = vec_add((vector unsigned short)pixelsv3, ++ (vector unsigned short)pixelsv4); ++ pixelssum2 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ temp4 = vec_add(pixelssum3, pixelssum4); ++ temp4 = vec_sra(temp4, vctwo); ++ temp3 = vec_add(pixelssum1, pixelssum2); ++ temp3 = vec_sra(temp3, vctwo); ++ ++ pixelssum3 = vec_add(pixelssum4, vctwo); ++ pixelssum1 = vec_add(pixelssum2, vctwo); ++ ++ blockv = vec_packsu(temp3, temp4); ++ ++ 
vec_st(blockv, 0, block); ++ ++ block += line_size; ++ pixels += line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++/* next one assumes that ((line_size % 16) == 0) */ ++void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) ++{ ++POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int j; ++POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); ++ for (j = 0; j < 4; j++) { ++ int i; ++ const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ const uint32_t b = ++ (((const struct unaligned_32 *) (pixels + 1))->l); ++ uint32_t l0 = ++ (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; ++ uint32_t h0 = ++ ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ uint32_t l1, h1; ++ pixels += line_size; ++ for (i = 0; i < h; i += 2) { ++ uint32_t a = (((const struct unaligned_32 *) (pixels))->l); ++ uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l1 = (a & 0x03030303UL) + (b & 0x03030303UL); ++ h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ a = (((const struct unaligned_32 *) (pixels))->l); ++ b = (((const struct unaligned_32 *) (pixels + 1))->l); ++ l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; ++ h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); ++ *((uint32_t *) block) = ++ h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); ++ pixels += line_size; ++ block += line_size; ++ } pixels += 4 - line_size * (h + 1); ++ block += 4 - line_size * h; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ register int i; ++ register vector unsigned char ++ pixelsv1, pixelsv2, pixelsv3, pixelsv4; ++ register vector unsigned char ++ blockv, temp1, temp2; ++ register vector unsigned short ++ pixelssum1, pixelssum2, temp3, ++ pixelssum3, pixelssum4, temp4; ++ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); ++ register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); ++ register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); ++ ++POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); ++ ++ temp1 = vec_ld(0, pixels); ++ temp2 = vec_ld(16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); ++ if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); ++ } ++ pixelsv3 = vec_mergel(vczero, pixelsv1); ++ pixelsv4 = vec_mergel(vczero, pixelsv2); ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ pixelssum3 = vec_add((vector unsigned short)pixelsv3, ++ (vector unsigned short)pixelsv4); ++ pixelssum3 = vec_add(pixelssum3, vcone); ++ pixelssum1 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ pixelssum1 = vec_add(pixelssum1, vcone); ++ ++ for (i = 0; i < h ; i++) { ++ blockv = vec_ld(0, block); ++ ++ temp1 = vec_ld(line_size, pixels); ++ temp2 = vec_ld(line_size + 16, pixels); ++ pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); ++ if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) ++ { ++ pixelsv2 = temp2; ++ } ++ else ++ { ++ 
pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); ++ } ++ ++ pixelsv3 = vec_mergel(vczero, pixelsv1); ++ pixelsv4 = vec_mergel(vczero, pixelsv2); ++ pixelsv1 = vec_mergeh(vczero, pixelsv1); ++ pixelsv2 = vec_mergeh(vczero, pixelsv2); ++ ++ pixelssum4 = vec_add((vector unsigned short)pixelsv3, ++ (vector unsigned short)pixelsv4); ++ pixelssum2 = vec_add((vector unsigned short)pixelsv1, ++ (vector unsigned short)pixelsv2); ++ temp4 = vec_add(pixelssum3, pixelssum4); ++ temp4 = vec_sra(temp4, vctwo); ++ temp3 = vec_add(pixelssum1, pixelssum2); ++ temp3 = vec_sra(temp3, vctwo); ++ ++ pixelssum3 = vec_add(pixelssum4, vcone); ++ pixelssum1 = vec_add(pixelssum2, vcone); ++ ++ blockv = vec_packsu(temp3, temp4); ++ ++ vec_st(blockv, 0, block); ++ ++ block += line_size; ++ pixels += line_size; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++int has_altivec(void) ++{ ++#ifdef CONFIG_DARWIN ++ int sels[2] = {CTL_HW, HW_VECTORUNIT}; ++ int has_vu = 0; ++ size_t len = sizeof(has_vu); ++ int err; ++ ++ err = sysctl(sels, 2, &has_vu, &len, NULL, 0); ++ ++ if (err == 0) return (has_vu != 0); ++#else /* CONFIG_DARWIN */ ++/* no Darwin, do it the brute-force way */ ++/* this is borrowed from the libmpeg2 library */ ++ { ++ signal (SIGILL, sigill_handler); ++ if (sigsetjmp (jmpbuf, 1)) { ++ signal (SIGILL, SIG_DFL); ++ } else { ++ canjump = 1; ++ ++ asm volatile ("mtspr 256, %0\n\t" ++ "vand %%v0, %%v0, %%v0" ++ : ++ : "r" (-1)); ++ ++ signal (SIGILL, SIG_DFL); ++ return 1; ++ } ++ } ++#endif /* CONFIG_DARWIN */ ++ return 0; ++} +diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c 1970-01-01 01:00:00.000000000 +0100 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c 2003-09-28 17:26:40.000000000 +0200 +@@ -0,0 +1,307 @@ ++/* ++ * Copyright (c) 2002 Brian Foley ++ * Copyright (c) 2002 Dieter Shirley ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include "../dsputil.h" ++ ++#include "dsputil_ppc.h" ++ ++#ifdef HAVE_ALTIVEC ++#include "dsputil_altivec.h" ++#endif ++ ++extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); ++extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); ++ ++int mm_flags = 0; ++ ++int mm_support(void) ++{ ++ int result = 0; ++#if HAVE_ALTIVEC ++ if (has_altivec()) { ++ result |= MM_ALTIVEC; ++ } ++#endif /* result */ ++ return result; ++} ++ ++#ifdef POWERPC_PERFORMANCE_REPORT ++unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; ++/* list below must match enum in dsputil_ppc.h */ ++static unsigned char* perfname[] = { ++ "fft_calc_altivec", ++ "gmc1_altivec", ++ "dct_unquantize_h263_altivec", ++ "idct_add_altivec", ++ "idct_put_altivec", ++ "put_pixels16_altivec", ++ "avg_pixels16_altivec", ++ "avg_pixels8_altivec", ++ "put_pixels8_xy2_altivec", ++ "put_no_rnd_pixels8_xy2_altivec", ++ "put_pixels16_xy2_altivec", ++ "put_no_rnd_pixels16_xy2_altivec", ++ "clear_blocks_dcbz32_ppc", ++ "clear_blocks_dcbz128_ppc" ++}; ++#include ++#endif ++ ++#ifdef POWERPC_PERFORMANCE_REPORT ++void powerpc_display_perf_report(void) ++{ ++ int i, j; ++ fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); ++ for(i = 0 ; i < powerpc_perf_total ; i++) ++ { ++ for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) ++ { ++ if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) ++ fprintf(stderr, ++ " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n", ++ perfname[i], ++ j+1, ++ perfdata[j][i][powerpc_data_min], ++ perfdata[j][i][powerpc_data_max], ++ (double)perfdata[j][i][powerpc_data_sum] / ++ (double)perfdata[j][i][powerpc_data_num], ++ perfdata[j][i][powerpc_data_num]); ++ } ++ } ++} ++#endif /* POWERPC_PERFORMANCE_REPORT */ ++ ++/* ***** WARNING ***** WARNING ***** WARNING ***** */ ++/* ++ clear_blocks_dcbz32_ppc will not work properly ++ on PowerPC processors with a cache line size ++ not equal to 32 bytes. ++ Fortunately all processor used by Apple up to ++ at least the 7450 (aka second generation G4) ++ use 32 bytes cache line. ++ This is due to the use of the 'dcbz' instruction. ++ It simply clear to zero a single cache line, ++ so you need to know the cache line size to use it ! ++ It's absurd, but it's fast... ++ ++ update 24/06/2003 : Apple released yesterday the G5, ++ with a PPC970. cache line size : 128 bytes. Oups. ++ The semantic of dcbz was changed, it always clear ++ 32 bytes. so the function below will work, but will ++ be slow. So I fixed check_dcbz_effect to use dcbzl, ++ which is defined to clear a cache line (as dcbz before). ++ So we still can distinguish, and use dcbz (32 bytes) ++ or dcbzl (one cache line) as required. 
++ ++ see ++ and ++*/ ++void clear_blocks_dcbz32_ppc(DCTELEM *blocks) ++{ ++POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); ++ register int misal = ((unsigned long)blocks & 0x00000010); ++ register int i = 0; ++POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); ++#if 1 ++ if (misal) { ++ ((unsigned long*)blocks)[0] = 0L; ++ ((unsigned long*)blocks)[1] = 0L; ++ ((unsigned long*)blocks)[2] = 0L; ++ ((unsigned long*)blocks)[3] = 0L; ++ i += 16; ++ } ++ for ( ; i < sizeof(DCTELEM)*6*64 ; i += 32) { ++ asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); ++ } ++ if (misal) { ++ ((unsigned long*)blocks)[188] = 0L; ++ ((unsigned long*)blocks)[189] = 0L; ++ ((unsigned long*)blocks)[190] = 0L; ++ ((unsigned long*)blocks)[191] = 0L; ++ i += 16; ++ } ++#else ++ memset(blocks, 0, sizeof(DCTELEM)*6*64); ++#endif ++POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); ++} ++ ++/* same as above, when dcbzl clear a whole 128B cache line ++ i.e. the PPC970 aka G5 */ ++#ifndef NO_DCBZL ++void clear_blocks_dcbz128_ppc(DCTELEM *blocks) ++{ ++POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); ++ register int misal = ((unsigned long)blocks & 0x0000007f); ++ register int i = 0; ++POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); ++#if 1 ++ if (misal) { ++ // we could probably also optimize this case, ++ // but there's not much point as the machines ++ // aren't available yet (2003-06-26) ++ memset(blocks, 0, sizeof(DCTELEM)*6*64); ++ } ++ else ++ for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { ++ asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); ++ } ++#else ++ memset(blocks, 0, sizeof(DCTELEM)*6*64); ++#endif ++POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); ++} ++#else ++void clear_blocks_dcbz128_ppc(DCTELEM *blocks) ++{ ++ memset(blocks, 0, sizeof(DCTELEM)*6*64); ++} ++#endif ++ ++#ifndef NO_DCBZL ++/* check dcbz report how many bytes are set to 0 by dcbz */ ++/* update 24/06/2003 : replace dcbz by dcbzl to get ++ the intended effect (Apple "fixed" dcbz) ++ unfortunately this cannot be used unless the assembler ++ knows about dcbzl ... */ ++long check_dcbzl_effect(void) ++{ ++ register char *fakedata = (char*)av_malloc(1024); ++ register char *fakedata_middle; ++ register long zero = 0; ++ register long i = 0; ++ long count = 0; ++ ++ if (!fakedata) ++ { ++ return 0L; ++ } ++ ++ fakedata_middle = (fakedata + 512); ++ ++ memset(fakedata, 0xFF, 1024); ++ ++ /* below the constraint "b" seems to mean "Address base register" ++ in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... 
*/ ++ asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); ++ ++ for (i = 0; i < 1024 ; i ++) ++ { ++ if (fakedata[i] == (char)0) ++ count++; ++ } ++ ++ av_free(fakedata); ++ ++ return count; ++} ++#else ++long check_dcbzl_effect(void) ++{ ++ return 0; ++} ++#endif ++ ++void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) ++{ ++ // Common optimizations whether Altivec is available or not ++ ++ switch (check_dcbzl_effect()) { ++ case 32: ++ c->clear_blocks = clear_blocks_dcbz32_ppc; ++ break; ++ case 128: ++ c->clear_blocks = clear_blocks_dcbz128_ppc; ++ break; ++ default: ++ break; ++ } ++ ++#if HAVE_ALTIVEC ++ if (has_altivec()) { ++ mm_flags |= MM_ALTIVEC; ++ ++ // Altivec specific optimisations ++ c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec; ++ c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec; ++ c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec; ++ c->pix_abs16x16 = pix_abs16x16_altivec; ++ c->pix_abs8x8 = pix_abs8x8_altivec; ++ c->sad[0]= sad16x16_altivec; ++ c->sad[1]= sad8x8_altivec; ++ c->pix_norm1 = pix_norm1_altivec; ++ c->sse[1]= sse8_altivec; ++ c->sse[0]= sse16_altivec; ++ c->pix_sum = pix_sum_altivec; ++ c->diff_pixels = diff_pixels_altivec; ++ c->get_pixels = get_pixels_altivec; ++// next one disabled as it's untested. ++#if 0 ++ c->add_bytes= add_bytes_altivec; ++#endif /* 0 */ ++ c->put_pixels_tab[0][0] = put_pixels16_altivec; ++ /* the tow functions do the same thing, so use the same code */ ++ c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; ++ c->avg_pixels_tab[0][0] = avg_pixels16_altivec; ++// next one disabled as it's untested. ++#if 0 ++ c->avg_pixels_tab[1][0] = avg_pixels8_altivec; ++#endif /* 0 */ ++ c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; ++ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; ++ c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; ++ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; ++ ++ c->gmc1 = gmc1_altivec; ++ ++ if ((avctx->idct_algo == FF_IDCT_AUTO) || ++ (avctx->idct_algo == FF_IDCT_ALTIVEC)) ++ { ++ c->idct_put = idct_put_altivec; ++ c->idct_add = idct_add_altivec; ++#ifndef ALTIVEC_USE_REFERENCE_C_CODE ++ c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ c->idct_permutation_type = FF_NO_IDCT_PERM; ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ } ++ ++#ifdef POWERPC_PERFORMANCE_REPORT ++ { ++ int i, j; ++ for (i = 0 ; i < powerpc_perf_total ; i++) ++ { ++ for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) ++ { ++ perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF; ++ perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000; ++ perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000; ++ perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000; ++ } ++ } ++ } ++#endif /* POWERPC_PERFORMANCE_REPORT */ ++ } else ++#endif /* HAVE_ALTIVEC */ ++ { ++ // Non-AltiVec PPC optimisations ++ ++ // ... pending ... ++ } ++} +diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c 1970-01-01 01:00:00.000000000 +0100 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c 2003-09-28 17:26:40.000000000 +0200 +@@ -0,0 +1,247 @@ ++/* ++ * FFT/IFFT transforms ++ * AltiVec-enabled ++ * Copyright (c) 2003 Romain Dolbeau ++ * Based on code Copyright (c) 2002 Fabrice Bellard. 
++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++#include "../dsputil.h" ++ ++#include "gcc_fixes.h" ++ ++#include "dsputil_altivec.h" ++ ++/* ++ those three macros are from libavcodec/fft.c ++ and are required for the reference C code ++*/ ++/* butter fly op */ ++#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ ++{\ ++ FFTSample ax, ay, bx, by;\ ++ bx=pre1;\ ++ by=pim1;\ ++ ax=qre1;\ ++ ay=qim1;\ ++ pre = (bx + ax);\ ++ pim = (by + ay);\ ++ qre = (bx - ax);\ ++ qim = (by - ay);\ ++} ++#define MUL16(a,b) ((a) * (b)) ++#define CMUL(pre, pim, are, aim, bre, bim) \ ++{\ ++ pre = (MUL16(are, bre) - MUL16(aim, bim));\ ++ pim = (MUL16(are, bim) + MUL16(bre, aim));\ ++} ++ ++ ++/** ++ * Do a complex FFT with the parameters defined in fft_init(). The ++ * input data must be permuted before with s->revtab table. No ++ * 1.0/sqrt(n) normalization is done. ++ * AltiVec-enabled ++ * This code assumes that the 'z' pointer is 16 bytes-aligned ++ * It also assumes all FFTComplex are 8 bytes-aligned pair of float ++ * The code is exactly the same as the SSE version, except ++ * that successive MUL + ADD/SUB have been merged into ++ * fused multiply-add ('vec_madd' in altivec) ++ */ ++void fft_calc_altivec(FFTContext *s, FFTComplex *z) ++{ ++POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ int ln = s->nbits; ++ int j, np, np2; ++ int nblocks, nloops; ++ register FFTComplex *p, *q; ++ FFTComplex *exptab = s->exptab; ++ int l; ++ FFTSample tmp_re, tmp_im; ++ ++POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); ++ ++ np = 1 << ln; ++ ++ /* pass 0 */ ++ ++ p=&z[0]; ++ j=(np >> 1); ++ do { ++ BF(p[0].re, p[0].im, p[1].re, p[1].im, ++ p[0].re, p[0].im, p[1].re, p[1].im); ++ p+=2; ++ } while (--j != 0); ++ ++ /* pass 1 */ ++ ++ ++ p=&z[0]; ++ j=np >> 2; ++ if (s->inverse) { ++ do { ++ BF(p[0].re, p[0].im, p[2].re, p[2].im, ++ p[0].re, p[0].im, p[2].re, p[2].im); ++ BF(p[1].re, p[1].im, p[3].re, p[3].im, ++ p[1].re, p[1].im, -p[3].im, p[3].re); ++ p+=4; ++ } while (--j != 0); ++ } else { ++ do { ++ BF(p[0].re, p[0].im, p[2].re, p[2].im, ++ p[0].re, p[0].im, p[2].re, p[2].im); ++ BF(p[1].re, p[1].im, p[3].re, p[3].im, ++ p[1].re, p[1].im, p[3].im, -p[3].re); ++ p+=4; ++ } while (--j != 0); ++ } ++ /* pass 2 .. 
ln-1 */ ++ ++ nblocks = np >> 3; ++ nloops = 1 << 2; ++ np2 = np >> 1; ++ do { ++ p = z; ++ q = z + nloops; ++ for (j = 0; j < nblocks; ++j) { ++ BF(p->re, p->im, q->re, q->im, ++ p->re, p->im, q->re, q->im); ++ ++ p++; ++ q++; ++ for(l = nblocks; l < np2; l += nblocks) { ++ CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); ++ BF(p->re, p->im, q->re, q->im, ++ p->re, p->im, tmp_re, tmp_im); ++ p++; ++ q++; ++ } ++ ++ p += nloops; ++ q += nloops; ++ } ++ nblocks = nblocks >> 1; ++ nloops = nloops << 1; ++ } while (nblocks != 0); ++ ++POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++#ifdef CONFIG_DARWIN ++ register const vector float vczero = (const vector float)(0.); ++#else ++ register const vector float vczero = (const vector float){0.,0.,0.,0.}; ++#endif ++ ++ int ln = s->nbits; ++ int j, np, np2; ++ int nblocks, nloops; ++ register FFTComplex *p, *q; ++ FFTComplex *cptr, *cptr1; ++ int k; ++ ++POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); ++ ++ np = 1 << ln; ++ ++ { ++ vector float *r, a, b, a1, c1, c2; ++ ++ r = (vector float *)&z[0]; ++ ++ c1 = vcii(p,p,n,n); ++ ++ if (s->inverse) ++ { ++ c2 = vcii(p,p,n,p); ++ } ++ else ++ { ++ c2 = vcii(p,p,p,n); ++ } ++ ++ j = (np >> 2); ++ do { ++ a = vec_ld(0, r); ++ a1 = vec_ld(sizeof(vector float), r); ++ ++ b = vec_perm(a,a,vcprmle(1,0,3,2)); ++ a = vec_madd(a,c1,b); ++ /* do the pass 0 butterfly */ ++ ++ b = vec_perm(a1,a1,vcprmle(1,0,3,2)); ++ b = vec_madd(a1,c1,b); ++ /* do the pass 0 butterfly */ ++ ++ /* multiply third by -i */ ++ b = vec_perm(b,b,vcprmle(2,3,1,0)); ++ ++ /* do the pass 1 butterfly */ ++ vec_st(vec_madd(b,c2,a), 0, r); ++ vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r); ++ ++ r += 2; ++ } while (--j != 0); ++ } ++ /* pass 2 .. 
ln-1 */ ++ ++ nblocks = np >> 3; ++ nloops = 1 << 2; ++ np2 = np >> 1; ++ ++ cptr1 = s->exptab1; ++ do { ++ p = z; ++ q = z + nloops; ++ j = nblocks; ++ do { ++ cptr = cptr1; ++ k = nloops >> 1; ++ do { ++ vector float a,b,c,t1; ++ ++ a = vec_ld(0, (float*)p); ++ b = vec_ld(0, (float*)q); ++ ++ /* complex mul */ ++ c = vec_ld(0, (float*)cptr); ++ /* cre*re cim*re */ ++ t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero); ++ c = vec_ld(sizeof(vector float), (float*)cptr); ++ /* -cim*im cre*im */ ++ b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1); ++ ++ /* butterfly */ ++ vec_st(vec_add(a,b), 0, (float*)p); ++ vec_st(vec_sub(a,b), 0, (float*)q); ++ ++ p += 2; ++ q += 2; ++ cptr += 4; ++ } while (--k); ++ ++ p += nloops; ++ q += nloops; ++ } while (--j); ++ cptr1 += nloops * 2; ++ nblocks = nblocks >> 1; ++ nloops = nloops << 1; ++ } while (nblocks != 0); ++ ++POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); ++ ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} +diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h 2003-07-04 15:40:29.000000000 +0200 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h 2003-09-28 17:26:40.000000000 +0200 +@@ -25,7 +25,7 @@ + * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html + */ + +-static inline vector signed char my_vmrglb (vector signed char const A, ++static inline vector signed char ff_vmrglb (vector signed char const A, + vector signed char const B) + { + static const vector unsigned char lowbyte = { +@@ -35,7 +35,7 @@ + return vec_perm (A, B, lowbyte); + } + +-static inline vector signed short my_vmrglh (vector signed short const A, ++static inline vector signed short ff_vmrglh (vector signed short const A, + vector signed short const B) + { + static const vector unsigned char lowhalf = { +@@ -45,7 +45,7 @@ + return vec_perm (A, B, lowhalf); + } + +-static inline vector signed int my_vmrglw (vector signed int const A, ++static inline vector signed int ff_vmrglw (vector signed int const A, + vector signed int const B) + { + static const vector unsigned char lowword = { +@@ -54,27 +54,27 @@ + }; + return vec_perm (A, B, lowword); + } +-/*#define my_vmrglb my_vmrglb +-#define my_vmrglh my_vmrglh +-#define my_vmrglw my_vmrglw ++/*#define ff_vmrglb ff_vmrglb ++#define ff_vmrglh ff_vmrglh ++#define ff_vmrglw ff_vmrglw + */ + #undef vec_mergel + + #define vec_mergel(a1, a2) \ + __ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \ +- ((vector signed char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ ++ ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ + __ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \ +- ((vector unsigned char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ ++ ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ + __ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \ +- ((vector signed short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ ++ ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ + __ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \ +- ((vector unsigned short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ ++ ((vector unsigned short) ff_vmrglh ((vector signed short) 
(a1), (vector signed short) (a2))), \ + __ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \ +- ((vector float) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ ++ ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ + __ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \ +- ((vector signed int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ ++ ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ + __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \ +- ((vector unsigned int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ ++ ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ + __altivec_link_error_invalid_argument ()))))))) + + #endif +diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c 1970-01-01 01:00:00.000000000 +0100 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c 2003-09-28 17:26:40.000000000 +0200 +@@ -0,0 +1,172 @@ ++/* ++ * GMC (Global Motion Compensation) ++ * AltiVec-enabled ++ * Copyright (c) 2003 Romain Dolbeau ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include "../dsputil.h" ++ ++#include "gcc_fixes.h" ++ ++#include "dsputil_altivec.h" ++ ++/* ++ altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, ++ to preserve proper dst alignement. 
++*/ ++#define GMC1_PERF_COND (h==8) ++void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) ++{ ++POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ const int A=(16-x16)*(16-y16); ++ const int B=( x16)*(16-y16); ++ const int C=(16-x16)*( y16); ++ const int D=( x16)*( y16); ++ int i; ++ ++POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); ++ ++ for(i=0; i>8; ++ dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8; ++ dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8; ++ dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8; ++ dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8; ++ dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8; ++ dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8; ++ dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8; ++ dst+= stride; ++ src+= stride; ++ } ++ ++POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); ++ ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ const unsigned short __attribute__ ((aligned(16))) rounder_a[8] = ++ {rounder, rounder, rounder, rounder, ++ rounder, rounder, rounder, rounder}; ++ const unsigned short __attribute__ ((aligned(16))) ABCD[8] = ++ { ++ (16-x16)*(16-y16), /* A */ ++ ( x16)*(16-y16), /* B */ ++ (16-x16)*( y16), /* C */ ++ ( x16)*( y16), /* D */ ++ 0, 0, 0, 0 /* padding */ ++ }; ++ register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); ++ register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); ++ register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; ++ register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; ++ int i; ++ unsigned long dst_odd = (unsigned long)dst & 0x0000000F; ++ unsigned long src_really_odd = (unsigned long)src & 0x0000000F; ++ ++ ++POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); ++ ++ tempA = vec_ld(0, (unsigned short*)ABCD); ++ Av = vec_splat(tempA, 0); ++ Bv = vec_splat(tempA, 1); ++ Cv = vec_splat(tempA, 2); ++ Dv = vec_splat(tempA, 3); ++ ++ rounderV = vec_ld(0, (unsigned short*)rounder_a); ++ ++ // we'll be able to pick-up our 9 char elements ++ // at src from those 32 bytes ++ // we load the first batch here, as inside the loop ++ // we can re-use 'src+stride' from one iteration ++ // as the 'src' of the next. ++ src_0 = vec_ld(0, src); ++ src_1 = vec_ld(16, src); ++ srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); ++ ++ if (src_really_odd != 0x0000000F) ++ { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. 
++ srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); ++ } ++ else ++ { ++ srcvB = src_1; ++ } ++ srcvA = vec_mergeh(vczero, srcvA); ++ srcvB = vec_mergeh(vczero, srcvB); ++ ++ for(i=0; i /* malloc(), free() */ ++#include ++#include "../dsputil.h" ++ ++#include "gcc_fixes.h" ++ ++#include "dsputil_altivec.h" ++ ++#define vector_s16_t vector signed short ++#define vector_u16_t vector unsigned short ++#define vector_s8_t vector signed char ++#define vector_u8_t vector unsigned char ++#define vector_s32_t vector signed int ++#define vector_u32_t vector unsigned int ++ ++#define IDCT_HALF \ ++ /* 1st stage */ \ ++ t1 = vec_mradds (a1, vx7, vx1 ); \ ++ t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ ++ t7 = vec_mradds (a2, vx5, vx3); \ ++ t3 = vec_mradds (ma2, vx3, vx5); \ ++ \ ++ /* 2nd stage */ \ ++ t5 = vec_adds (vx0, vx4); \ ++ t0 = vec_subs (vx0, vx4); \ ++ t2 = vec_mradds (a0, vx6, vx2); \ ++ t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ ++ t6 = vec_adds (t8, t3); \ ++ t3 = vec_subs (t8, t3); \ ++ t8 = vec_subs (t1, t7); \ ++ t1 = vec_adds (t1, t7); \ ++ \ ++ /* 3rd stage */ \ ++ t7 = vec_adds (t5, t2); \ ++ t2 = vec_subs (t5, t2); \ ++ t5 = vec_adds (t0, t4); \ ++ t0 = vec_subs (t0, t4); \ ++ t4 = vec_subs (t8, t3); \ ++ t3 = vec_adds (t8, t3); \ ++ \ ++ /* 4th stage */ \ ++ vy0 = vec_adds (t7, t1); \ ++ vy7 = vec_subs (t7, t1); \ ++ vy1 = vec_mradds (c4, t3, t5); \ ++ vy6 = vec_mradds (mc4, t3, t5); \ ++ vy2 = vec_mradds (c4, t4, t0); \ ++ vy5 = vec_mradds (mc4, t4, t0); \ ++ vy3 = vec_adds (t2, t6); \ ++ vy4 = vec_subs (t2, t6); ++ ++ ++#define IDCT \ ++ vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ ++ vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ ++ vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ ++ vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ ++ vector_u16_t shift; \ ++ \ ++ c4 = vec_splat (constants[0], 0); \ ++ a0 = vec_splat (constants[0], 1); \ ++ a1 = vec_splat (constants[0], 2); \ ++ a2 = vec_splat (constants[0], 3); \ ++ mc4 = vec_splat (constants[0], 4); \ ++ ma2 = vec_splat (constants[0], 5); \ ++ bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ ++ \ ++ zero = vec_splat_s16 (0); \ ++ shift = vec_splat_u16 (4); \ ++ \ ++ vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ ++ vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ ++ vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ ++ vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ ++ vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ ++ vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ ++ vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ ++ vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ ++ \ ++ IDCT_HALF \ ++ \ ++ vx0 = vec_mergeh (vy0, vy4); \ ++ vx1 = vec_mergel (vy0, vy4); \ ++ vx2 = vec_mergeh (vy1, vy5); \ ++ vx3 = vec_mergel (vy1, vy5); \ ++ vx4 = vec_mergeh (vy2, vy6); \ ++ vx5 = vec_mergel (vy2, vy6); \ ++ vx6 = vec_mergeh (vy3, vy7); \ ++ vx7 = vec_mergel (vy3, vy7); \ ++ \ ++ vy0 = vec_mergeh (vx0, vx4); \ ++ vy1 = vec_mergel (vx0, vx4); \ ++ vy2 = vec_mergeh (vx1, vx5); \ ++ vy3 = vec_mergel (vx1, vx5); \ ++ vy4 = vec_mergeh (vx2, vx6); \ ++ vy5 = vec_mergel (vx2, vx6); \ ++ vy6 = vec_mergeh (vx3, vx7); \ ++ vy7 = vec_mergel (vx3, vx7); \ ++ \ ++ vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ ++ vx1 = vec_mergel (vy0, vy4); \ ++ vx2 = vec_mergeh (vy1, vy5); \ ++ vx3 = vec_mergel (vy1, vy5); \ ++ vx4 = vec_mergeh (vy2, vy6); \ ++ 
vx5 = vec_mergel (vy2, vy6); \ ++ vx6 = vec_mergeh (vy3, vy7); \ ++ vx7 = vec_mergel (vy3, vy7); \ ++ \ ++ IDCT_HALF \ ++ \ ++ shift = vec_splat_u16 (6); \ ++ vx0 = vec_sra (vy0, shift); \ ++ vx1 = vec_sra (vy1, shift); \ ++ vx2 = vec_sra (vy2, shift); \ ++ vx3 = vec_sra (vy3, shift); \ ++ vx4 = vec_sra (vy4, shift); \ ++ vx5 = vec_sra (vy5, shift); \ ++ vx6 = vec_sra (vy6, shift); \ ++ vx7 = vec_sra (vy7, shift); ++ ++ ++static const vector_s16_t constants[5] = { ++ (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), ++ (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), ++ (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), ++ (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), ++ (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) ++}; ++ ++void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) ++{ ++POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); ++ void simple_idct_put(uint8_t *dest, int line_size, int16_t *block); ++ simple_idct_put(dest, stride, (int16_t*)block); ++POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ vector_u8_t tmp; ++ ++POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); ++ ++ IDCT ++ ++#define COPY(dest,src) \ ++ tmp = vec_packsu (src, src); \ ++ vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ ++ vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); ++ ++ COPY (dest, vx0) dest += stride; ++ COPY (dest, vx1) dest += stride; ++ COPY (dest, vx2) dest += stride; ++ COPY (dest, vx3) dest += stride; ++ COPY (dest, vx4) dest += stride; ++ COPY (dest, vx5) dest += stride; ++ COPY (dest, vx6) dest += stride; ++ COPY (dest, vx7) ++ ++POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ ++void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) ++{ ++POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); ++ void simple_idct_add(uint8_t *dest, int line_size, int16_t *block); ++ simple_idct_add(dest, stride, (int16_t*)block); ++POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ vector_u8_t tmp; ++ vector_s16_t tmp2, tmp3; ++ vector_u8_t perm0; ++ vector_u8_t perm1; ++ vector_u8_t p0, p1, p; ++ ++POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); ++ ++ IDCT ++ ++ p0 = vec_lvsl (0, dest); ++ p1 = vec_lvsl (stride, dest); ++ p = vec_splat_u8 (-1); ++ perm0 = vec_mergeh (p, p0); ++ perm1 = vec_mergeh (p, p1); ++ ++#define ADD(dest,src,perm) \ ++ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ ++ tmp = vec_ld (0, dest); \ ++ tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \ ++ tmp3 = vec_adds (tmp2, src); \ ++ tmp = vec_packsu (tmp3, tmp3); \ ++ vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ ++ vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); ++ ++ ADD (dest, vx0, perm0) dest += stride; ++ ADD (dest, vx1, perm1) dest += stride; ++ ADD (dest, vx2, perm0) dest += stride; ++ ADD (dest, vx3, perm1) dest += stride; ++ ADD (dest, vx4, perm0) dest += stride; ++ ADD (dest, vx5, perm1) dest += stride; ++ ADD (dest, vx6, perm0) dest += stride; ++ ADD (dest, vx7, perm1) ++ ++POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++} ++ +diff -Nur 
avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c 1970-01-01 01:00:00.000000000 +0100 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c 2003-09-28 17:26:40.000000000 +0200 +@@ -0,0 +1,645 @@ ++/* ++ * Copyright (c) 2002 Dieter Shirley ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include "../dsputil.h" ++#include "../mpegvideo.h" ++ ++#include "gcc_fixes.h" ++ ++#include "dsputil_altivec.h" ++ ++// Swaps two variables (used for altivec registers) ++#define SWAP(a,b) \ ++do { \ ++ __typeof__(a) swap_temp=a; \ ++ a=b; \ ++ b=swap_temp; \ ++} while (0) ++ ++// transposes a matrix consisting of four vectors with four elements each ++#define TRANSPOSE4(a,b,c,d) \ ++do { \ ++ __typeof__(a) _trans_ach = vec_mergeh(a, c); \ ++ __typeof__(a) _trans_acl = vec_mergel(a, c); \ ++ __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ ++ __typeof__(a) _trans_bdl = vec_mergel(b, d); \ ++ \ ++ a = vec_mergeh(_trans_ach, _trans_bdh); \ ++ b = vec_mergel(_trans_ach, _trans_bdh); \ ++ c = vec_mergeh(_trans_acl, _trans_bdl); \ ++ d = vec_mergel(_trans_acl, _trans_bdl); \ ++} while (0) ++ ++#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ ++do { \ ++ __typeof__(a) _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \ ++ __typeof__(a) _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \ ++ \ ++ _A1 = vec_mergeh (a, e); \ ++ _B1 = vec_mergel (a, e); \ ++ _C1 = vec_mergeh (b, f); \ ++ _D1 = vec_mergel (b, f); \ ++ _E1 = vec_mergeh (c, g); \ ++ _F1 = vec_mergel (c, g); \ ++ _G1 = vec_mergeh (d, h); \ ++ _H1 = vec_mergel (d, h); \ ++ \ ++ _A2 = vec_mergeh (_A1, _E1); \ ++ _B2 = vec_mergel (_A1, _E1); \ ++ _C2 = vec_mergeh (_B1, _F1); \ ++ _D2 = vec_mergel (_B1, _F1); \ ++ _E2 = vec_mergeh (_C1, _G1); \ ++ _F2 = vec_mergel (_C1, _G1); \ ++ _G2 = vec_mergeh (_D1, _H1); \ ++ _H2 = vec_mergel (_D1, _H1); \ ++ \ ++ a = vec_mergeh (_A2, _E2); \ ++ b = vec_mergel (_A2, _E2); \ ++ c = vec_mergeh (_B2, _F2); \ ++ d = vec_mergel (_B2, _F2); \ ++ e = vec_mergeh (_C2, _G2); \ ++ f = vec_mergel (_C2, _G2); \ ++ g = vec_mergeh (_D2, _H2); \ ++ h = vec_mergel (_D2, _H2); \ ++} while (0) ++ ++ ++// Loads a four-byte value (int or float) from the target address ++// into every element in the target vector. Only works if the ++// target address is four-byte aligned (which should be always). 
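++// In scalar terms the macro below behaves like (illustration only, not part
++// of the macro):
++//     int32_t x = *(const int32_t *)(address);
++//     vec     = { x, x, x, x };
++// vec_ld() ignores the low address bits, the vec_lvsl()/vec_perm() pair
++// rotates the addressed word into element 0, and vec_splat() broadcasts it
++// to every lane -- hence the four-byte alignment requirement noted above.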
++#define LOAD4(vec, address) \ ++{ \ ++ __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ ++ vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ ++ vec = vec_ld(0, _load_addr); \ ++ vec = vec_perm(vec, vec, _perm_vec); \ ++ vec = vec_splat(vec, 0); \ ++} ++ ++ ++#ifdef CONFIG_DARWIN ++#define FOUROF(a) (a) ++#else ++// slower, for dumb non-apple GCC ++#define FOUROF(a) {a,a,a,a} ++#endif ++int dct_quantize_altivec(MpegEncContext* s, ++ DCTELEM* data, int n, ++ int qscale, int* overflow) ++{ ++ int lastNonZero; ++ vector float row0, row1, row2, row3, row4, row5, row6, row7; ++ vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; ++ const vector float zero = (const vector float)FOUROF(0.); ++ ++ // Load the data into the row/alt vectors ++ { ++ vector signed short data0, data1, data2, data3, data4, data5, data6, data7; ++ ++ data0 = vec_ld(0, data); ++ data1 = vec_ld(16, data); ++ data2 = vec_ld(32, data); ++ data3 = vec_ld(48, data); ++ data4 = vec_ld(64, data); ++ data5 = vec_ld(80, data); ++ data6 = vec_ld(96, data); ++ data7 = vec_ld(112, data); ++ ++ // Transpose the data before we start ++ TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); ++ ++ // load the data into floating point vectors. We load ++ // the high half of each row into the main row vectors ++ // and the low half into the alt vectors. ++ row0 = vec_ctf(vec_unpackh(data0), 0); ++ alt0 = vec_ctf(vec_unpackl(data0), 0); ++ row1 = vec_ctf(vec_unpackh(data1), 0); ++ alt1 = vec_ctf(vec_unpackl(data1), 0); ++ row2 = vec_ctf(vec_unpackh(data2), 0); ++ alt2 = vec_ctf(vec_unpackl(data2), 0); ++ row3 = vec_ctf(vec_unpackh(data3), 0); ++ alt3 = vec_ctf(vec_unpackl(data3), 0); ++ row4 = vec_ctf(vec_unpackh(data4), 0); ++ alt4 = vec_ctf(vec_unpackl(data4), 0); ++ row5 = vec_ctf(vec_unpackh(data5), 0); ++ alt5 = vec_ctf(vec_unpackl(data5), 0); ++ row6 = vec_ctf(vec_unpackh(data6), 0); ++ alt6 = vec_ctf(vec_unpackl(data6), 0); ++ row7 = vec_ctf(vec_unpackh(data7), 0); ++ alt7 = vec_ctf(vec_unpackl(data7), 0); ++ } ++ ++ // The following block could exist as a separate an altivec dct ++ // function. However, if we put it inline, the DCT data can remain ++ // in the vector local variables, as floats, which we'll use during the ++ // quantize step... 
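++ // Illustration (not part of the original code): the vec_madd() chains in
++ // this block are a float transcription of one 1-D pass of the integer
++ // forward DCT that the FIX_* comments quote.  On a plain float row d[0..7]
++ // the even half of a pass reads:
++ //     tmp0 = d[0]+d[7];  tmp3 = d[3]+d[4];  tmp1 = d[1]+d[6];  tmp2 = d[2]+d[5];
++ //     tmp10 = tmp0+tmp3; tmp13 = tmp0-tmp3; tmp11 = tmp1+tmp2; tmp12 = tmp1-tmp2;
++ //     out[0] = tmp10 + tmp11;               out[4] = tmp10 - tmp11;
++ //     z1     = (tmp12 + tmp13) * 0.541196100;
++ //     out[2] = z1 + tmp13 * 0.765366865;    out[6] = z1 - tmp12 * 1.847759065;
++ // The odd half builds out[1,3,5,7] the same way from the differences and
++ // z1..z5, with the negated constants folded directly into the multiply-adds.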
++ { ++ const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); ++ const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); ++ const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); ++ const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); ++ const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); ++ const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); ++ const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); ++ const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); ++ const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); ++ const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); ++ const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); ++ const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); ++ ++ ++ int whichPass, whichHalf; ++ ++ for(whichPass = 1; whichPass<=2; whichPass++) ++ { ++ for(whichHalf = 1; whichHalf<=2; whichHalf++) ++ { ++ vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; ++ vector float tmp10, tmp11, tmp12, tmp13; ++ vector float z1, z2, z3, z4, z5; ++ ++ tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; ++ tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; ++ tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; ++ tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; ++ tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; ++ tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; ++ tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; ++ tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; ++ ++ tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; ++ tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; ++ tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; ++ tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; ++ ++ ++ // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); ++ row0 = vec_add(tmp10, tmp11); ++ ++ // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); ++ row4 = vec_sub(tmp10, tmp11); ++ ++ ++ // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); ++ z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); ++ ++ // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), ++ // CONST_BITS-PASS1_BITS); ++ row2 = vec_madd(tmp13, vec_0_765366865, z1); ++ ++ // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), ++ // CONST_BITS-PASS1_BITS); ++ row6 = vec_madd(tmp12, vec_1_847759065, z1); ++ ++ z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; ++ z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; ++ z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; ++ z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; ++ ++ // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ ++ z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); ++ ++ // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ ++ z3 = vec_madd(z3, vec_1_961570560, z5); ++ ++ // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ ++ z4 = vec_madd(z4, vec_0_390180644, z5); ++ ++ // The following adds are rolled into the multiplies above ++ // z3 = vec_add(z3, z5); // z3 += z5; ++ // z4 = vec_add(z4, z5); // z4 += z5; ++ ++ // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ ++ // Wow! It's actually more effecient to roll this multiply ++ // into the adds below, even thought the multiply gets done twice! 
++ // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); ++ ++ // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ ++ // Same with this one... ++ // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); ++ ++ // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ ++ // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); ++ row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); ++ ++ // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ ++ // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); ++ row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); ++ ++ // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ ++ // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); ++ row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); ++ ++ // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ ++ // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); ++ row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); ++ ++ // Swap the row values with the alts. If this is the first half, ++ // this sets up the low values to be acted on in the second half. ++ // If this is the second half, it puts the high values back in ++ // the row values where they are expected to be when we're done. ++ SWAP(row0, alt0); ++ SWAP(row1, alt1); ++ SWAP(row2, alt2); ++ SWAP(row3, alt3); ++ SWAP(row4, alt4); ++ SWAP(row5, alt5); ++ SWAP(row6, alt6); ++ SWAP(row7, alt7); ++ } ++ ++ if (whichPass == 1) ++ { ++ // transpose the data for the second pass ++ ++ // First, block transpose the upper right with lower left. ++ SWAP(row4, alt0); ++ SWAP(row5, alt1); ++ SWAP(row6, alt2); ++ SWAP(row7, alt3); ++ ++ // Now, transpose each block of four ++ TRANSPOSE4(row0, row1, row2, row3); ++ TRANSPOSE4(row4, row5, row6, row7); ++ TRANSPOSE4(alt0, alt1, alt2, alt3); ++ TRANSPOSE4(alt4, alt5, alt6, alt7); ++ } ++ } ++ } ++ ++ // used after quantise step ++ int oldBaseValue = 0; ++ ++ // perform the quantise step, using the floating point data ++ // still in the row/alt registers ++ { ++ const int* biasAddr; ++ const vector signed int* qmat; ++ vector float bias, negBias; ++ ++ if (s->mb_intra) ++ { ++ vector signed int baseVector; ++ ++ // We must cache element 0 in the intra case ++ // (it needs special handling). ++ baseVector = vec_cts(vec_splat(row0, 0), 0); ++ vec_ste(baseVector, 0, &oldBaseValue); ++ ++ qmat = (vector signed int*)s->q_intra_matrix[qscale]; ++ biasAddr = &(s->intra_quant_bias); ++ } ++ else ++ { ++ qmat = (vector signed int*)s->q_inter_matrix[qscale]; ++ biasAddr = &(s->inter_quant_bias); ++ } ++ ++ // Load the bias vector (We add 0.5 to the bias so that we're ++ // rounding when we convert to int, instead of flooring.) 
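++ // Illustration (not part of the original code): with q the float-converted
++ // quantiser entry and bias as loaded below, each coefficient ends up as
++ //     level = (coeff > 0) ? coeff*q + bias : coeff*q - bias;
++ //     quant = (int)level;        // vec_cts() truncates toward zero
++ // which is why the extra 0.5 mentioned above turns the truncation into a
++ // rounding of the magnitude.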
++ { ++ vector signed int biasInt; ++ const vector float negOneFloat = (vector float)FOUROF(-1.0f); ++ LOAD4(biasInt, biasAddr); ++ bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); ++ negBias = vec_madd(bias, negOneFloat, zero); ++ } ++ ++ { ++ vector float q0, q1, q2, q3, q4, q5, q6, q7; ++ ++ q0 = vec_ctf(qmat[0], QMAT_SHIFT); ++ q1 = vec_ctf(qmat[2], QMAT_SHIFT); ++ q2 = vec_ctf(qmat[4], QMAT_SHIFT); ++ q3 = vec_ctf(qmat[6], QMAT_SHIFT); ++ q4 = vec_ctf(qmat[8], QMAT_SHIFT); ++ q5 = vec_ctf(qmat[10], QMAT_SHIFT); ++ q6 = vec_ctf(qmat[12], QMAT_SHIFT); ++ q7 = vec_ctf(qmat[14], QMAT_SHIFT); ++ ++ row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), ++ vec_cmpgt(row0, zero)); ++ row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), ++ vec_cmpgt(row1, zero)); ++ row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), ++ vec_cmpgt(row2, zero)); ++ row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), ++ vec_cmpgt(row3, zero)); ++ row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), ++ vec_cmpgt(row4, zero)); ++ row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), ++ vec_cmpgt(row5, zero)); ++ row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), ++ vec_cmpgt(row6, zero)); ++ row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), ++ vec_cmpgt(row7, zero)); ++ ++ q0 = vec_ctf(qmat[1], QMAT_SHIFT); ++ q1 = vec_ctf(qmat[3], QMAT_SHIFT); ++ q2 = vec_ctf(qmat[5], QMAT_SHIFT); ++ q3 = vec_ctf(qmat[7], QMAT_SHIFT); ++ q4 = vec_ctf(qmat[9], QMAT_SHIFT); ++ q5 = vec_ctf(qmat[11], QMAT_SHIFT); ++ q6 = vec_ctf(qmat[13], QMAT_SHIFT); ++ q7 = vec_ctf(qmat[15], QMAT_SHIFT); ++ ++ alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), ++ vec_cmpgt(alt0, zero)); ++ alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), ++ vec_cmpgt(alt1, zero)); ++ alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), ++ vec_cmpgt(alt2, zero)); ++ alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), ++ vec_cmpgt(alt3, zero)); ++ alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), ++ vec_cmpgt(alt4, zero)); ++ alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), ++ vec_cmpgt(alt5, zero)); ++ alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), ++ vec_cmpgt(alt6, zero)); ++ alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), ++ vec_cmpgt(alt7, zero)); ++ } ++ ++ ++ } ++ ++ // Store the data back into the original block ++ { ++ vector signed short data0, data1, data2, data3, data4, data5, data6, data7; ++ ++ data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); ++ data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); ++ data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); ++ data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); ++ data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); ++ data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); ++ data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); ++ data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); ++ ++ { ++ // Clamp for overflow ++ vector signed int max_q_int, min_q_int; ++ vector signed short max_q, min_q; ++ ++ LOAD4(max_q_int, &(s->max_qcoeff)); ++ LOAD4(min_q_int, &(s->min_qcoeff)); ++ ++ max_q = vec_pack(max_q_int, max_q_int); ++ min_q = vec_pack(min_q_int, min_q_int); ++ ++ data0 = vec_max(vec_min(data0, max_q), min_q); ++ data1 = vec_max(vec_min(data1, max_q), min_q); ++ data2 = vec_max(vec_min(data2, max_q), 
min_q); ++ data4 = vec_max(vec_min(data4, max_q), min_q); ++ data5 = vec_max(vec_min(data5, max_q), min_q); ++ data6 = vec_max(vec_min(data6, max_q), min_q); ++ data7 = vec_max(vec_min(data7, max_q), min_q); ++ } ++ ++ vector bool char zero_01, zero_23, zero_45, zero_67; ++ vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67; ++ vector signed char negOne = vec_splat_s8(-1); ++ vector signed char* scanPtr = ++ (vector signed char*)(s->intra_scantable.inverse); ++ ++ // Determine the largest non-zero index. ++ zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero), ++ vec_cmpeq(data1, (vector short)zero)); ++ zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero), ++ vec_cmpeq(data3, (vector short)zero)); ++ zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero), ++ vec_cmpeq(data5, (vector short)zero)); ++ zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero), ++ vec_cmpeq(data7, (vector short)zero)); ++ ++ // 64 biggest values ++ scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01); ++ scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23); ++ scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45); ++ scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67); ++ ++ // 32 largest values ++ scanIndices_01 = vec_max(scanIndices_01, scanIndices_23); ++ scanIndices_45 = vec_max(scanIndices_45, scanIndices_67); ++ ++ // 16 largest values ++ scanIndices_01 = vec_max(scanIndices_01, scanIndices_45); ++ ++ // 8 largest values ++ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), ++ vec_mergel(scanIndices_01, negOne)); ++ ++ // 4 largest values ++ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), ++ vec_mergel(scanIndices_01, negOne)); ++ ++ // 2 largest values ++ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), ++ vec_mergel(scanIndices_01, negOne)); ++ ++ // largest value ++ scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), ++ vec_mergel(scanIndices_01, negOne)); ++ ++ scanIndices_01 = vec_splat(scanIndices_01, 0); ++ ++ signed char lastNonZeroChar; ++ ++ vec_ste(scanIndices_01, 0, &lastNonZeroChar); ++ ++ lastNonZero = lastNonZeroChar; ++ ++ // While the data is still in vectors we check for the transpose IDCT permute ++ // and handle it using the vector unit if we can. This is the permute used ++ // by the altivec idct, so it is common when using the altivec dct. ++ ++ if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) ++ { ++ TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); ++ } ++ ++ vec_st(data0, 0, data); ++ vec_st(data1, 16, data); ++ vec_st(data2, 32, data); ++ vec_st(data3, 48, data); ++ vec_st(data4, 64, data); ++ vec_st(data5, 80, data); ++ vec_st(data6, 96, data); ++ vec_st(data7, 112, data); ++ } ++ ++ // special handling of block[0] ++ if (s->mb_intra) ++ { ++ if (!s->h263_aic) ++ { ++ if (n < 4) ++ oldBaseValue /= s->y_dc_scale; ++ else ++ oldBaseValue /= s->c_dc_scale; ++ } ++ ++ // Divide by 8, rounding the result ++ data[0] = (oldBaseValue + 4) >> 3; ++ } ++ ++ // We handled the tranpose permutation above and we don't ++ // need to permute the "no" permutation case. 
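++ // Illustration (not part of the original code): the vec_sel()/vec_max()
++ // cascade further up computes, in scalar terms,
++ //     last = -1;
++ //     for (i = 0; i < 64; i++)
++ //         if (coeff[i] != 0 && inverse_scan[i] > last)
++ //             last = inverse_scan[i];
++ // i.e. lastNonZero is the highest scan-order position still holding a
++ // non-zero coefficient, or -1 when the whole block quantised to zero.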
++ if ((lastNonZero > 0) && ++ (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && ++ (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) ++ { ++ ff_block_permute(data, s->dsp.idct_permutation, ++ s->intra_scantable.scantable, lastNonZero); ++ } ++ ++ return lastNonZero; ++} ++#undef FOUROF ++ ++/* ++ AltiVec version of dct_unquantize_h263 ++ this code assumes `block' is 16 bytes-aligned ++*/ ++void dct_unquantize_h263_altivec(MpegEncContext *s, ++ DCTELEM *block, int n, int qscale) ++{ ++POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); ++ int i, level, qmul, qadd; ++ int nCoeffs; ++ ++ assert(s->block_last_index[n]>=0); ++ ++POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); ++ ++ qadd = (qscale - 1) | 1; ++ qmul = qscale << 1; ++ ++ if (s->mb_intra) { ++ if (!s->h263_aic) { ++ if (n < 4) ++ block[0] = block[0] * s->y_dc_scale; ++ else ++ block[0] = block[0] * s->c_dc_scale; ++ }else ++ qadd = 0; ++ i = 1; ++ nCoeffs= 63; //does not allways use zigzag table ++ } else { ++ i = 0; ++ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; ++ } ++ ++#ifdef ALTIVEC_USE_REFERENCE_C_CODE ++ for(;i<=nCoeffs;i++) { ++ level = block[i]; ++ if (level) { ++ if (level < 0) { ++ level = level * qmul - qadd; ++ } else { ++ level = level * qmul + qadd; ++ } ++ block[i] = level; ++ } ++ } ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ { ++ register const vector short vczero = (const vector short)vec_splat_s16(0); ++ short __attribute__ ((aligned(16))) qmul8[] = ++ { ++ qmul, qmul, qmul, qmul, ++ qmul, qmul, qmul, qmul ++ }; ++ short __attribute__ ((aligned(16))) qadd8[] = ++ { ++ qadd, qadd, qadd, qadd, ++ qadd, qadd, qadd, qadd ++ }; ++ short __attribute__ ((aligned(16))) nqadd8[] = ++ { ++ -qadd, -qadd, -qadd, -qadd, ++ -qadd, -qadd, -qadd, -qadd ++ }; ++ register vector short blockv, qmulv, qaddv, nqaddv, temp1; ++ register vector bool short blockv_null, blockv_neg; ++ register short backup_0 = block[0]; ++ register int j = 0; ++ ++ qmulv = vec_ld(0, qmul8); ++ qaddv = vec_ld(0, qadd8); ++ nqaddv = vec_ld(0, nqadd8); ++ ++#if 0 // block *is* 16 bytes-aligned, it seems. ++ // first make sure block[j] is 16 bytes-aligned ++ for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { ++ level = block[j]; ++ if (level) { ++ if (level < 0) { ++ level = level * qmul - qadd; ++ } else { ++ level = level * qmul + qadd; ++ } ++ block[j] = level; ++ } ++ } ++#endif ++ ++ // vectorize all the 16 bytes-aligned blocks ++ // of 8 elements ++ for(; (j + 7) <= nCoeffs ; j+=8) ++ { ++ blockv = vec_ld(j << 1, block); ++ blockv_neg = vec_cmplt(blockv, vczero); ++ blockv_null = vec_cmpeq(blockv, vczero); ++ // choose between +qadd or -qadd as the third operand ++ temp1 = vec_sel(qaddv, nqaddv, blockv_neg); ++ // multiply & add (block{i,i+7} * qmul [+-] qadd) ++ temp1 = vec_mladd(blockv, qmulv, temp1); ++ // put 0 where block[{i,i+7} used to have 0 ++ blockv = vec_sel(temp1, blockv, blockv_null); ++ vec_st(blockv, j << 1, block); ++ } ++ ++ // if nCoeffs isn't a multiple of 8, finish the job ++ // using good old scalar units. ++ // (we could do it using a truncated vector, ++ // but I'm not sure it's worth the hassle) ++ for(; j <= nCoeffs ; j++) { ++ level = block[j]; ++ if (level) { ++ if (level < 0) { ++ level = level * qmul - qadd; ++ } else { ++ level = level * qmul + qadd; ++ } ++ block[j] = level; ++ } ++ } ++ ++ if (i == 1) ++ { // cheat. 
this avoid special-casing the first iteration ++ block[0] = backup_0; ++ } ++ } ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ ++POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); ++} +diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c +--- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c 1970-01-01 01:00:00.000000000 +0100 ++++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c 2003-09-28 17:26:40.000000000 +0200 +@@ -0,0 +1,83 @@ ++/* ++ * Copyright (c) 2002 Dieter Shirley ++ * ++ * This library is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2 of the License, or (at your option) any later version. ++ * ++ * This library is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with this library; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include "../dsputil.h" ++#include "../mpegvideo.h" ++#include ++ ++#ifdef HAVE_ALTIVEC ++#include "dsputil_altivec.h" ++#endif ++ ++extern int dct_quantize_altivec(MpegEncContext *s, ++ DCTELEM *block, int n, ++ int qscale, int *overflow); ++extern void dct_unquantize_h263_altivec(MpegEncContext *s, ++ DCTELEM *block, int n, int qscale); ++ ++extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); ++extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); ++ ++ ++void MPV_common_init_ppc(MpegEncContext *s) ++{ ++#if HAVE_ALTIVEC ++ if (has_altivec()) ++ { ++ if ((s->avctx->idct_algo == FF_IDCT_AUTO) || ++ (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) ++ { ++ s->dsp.idct_put = idct_put_altivec; ++ s->dsp.idct_add = idct_add_altivec; ++#ifndef ALTIVEC_USE_REFERENCE_C_CODE ++ s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; ++#else /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ s->dsp.idct_permutation_type = FF_NO_IDCT_PERM; ++#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ ++ } ++ ++ // Test to make sure that the dct required alignments are met. ++ if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || ++ (((long)(s->q_inter_matrix) & 0x0f) != 0)) ++ { ++ fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned " ++ "to use Altivec DCT. Reverting to non-altivec version.\n"); ++ return; ++ } ++ ++ if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) ++ { ++ fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned " ++ "to use Altivec DCT. Reverting to non-altivec version.\n"); ++ return; ++ } ++ ++ ++ if ((s->avctx->dct_algo == FF_DCT_AUTO) || ++ (s->avctx->dct_algo == FF_DCT_ALTIVEC)) ++ { ++ s->dct_quantize = dct_quantize_altivec; ++ s->dct_unquantize_h263 = dct_unquantize_h263_altivec; ++ } ++ } else ++#endif ++ { ++ /* Non-AltiVec PPC optimisations here */ ++ } ++} ++ -- 2.44.0