avifile-ffmpeg-ppc.patch

   1 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c
   2 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_altivec.c     1970-01-01 01:00:00.000000000 +0100
   3 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_altivec.c  2003-09-28 17:26:40.000000000 +0200
   4 @@ -0,0 +1,1345 @@
   5 +/*
   6 + * Copyright (c) 2002 Brian Foley
   7 + * Copyright (c) 2002 Dieter Shirley
   8 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
   9 + *
  10 + * This library is free software; you can redistribute it and/or
  11 + * modify it under the terms of the GNU Lesser General Public
  12 + * License as published by the Free Software Foundation; either
  13 + * version 2 of the License, or (at your option) any later version.
  14 + *
  15 + * This library is distributed in the hope that it will be useful,
  16 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18 + * Lesser General Public License for more details.
  19 + *
  20 + * You should have received a copy of the GNU Lesser General Public
  21 + * License along with this library; if not, write to the Free Software
  22 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  23 + */
  24 +
  25 +#include "../dsputil.h"
  26 +
  27 +#include "gcc_fixes.h"
  28 +
  29 +#include "dsputil_altivec.h"
  30 +
  31 +#ifdef CONFIG_DARWIN
  32 +#include <sys/sysctl.h>
  33 +#else /* CONFIG_DARWIN */
  34 +#include <signal.h>
  35 +#include <setjmp.h>
  36 +
  37 +static sigjmp_buf jmpbuf;                 /* jump target used by sigill_handler */
  38 +static volatile sig_atomic_t canjump = 0; /* checked by sigill_handler before it jumps */
  39 +/* Signal handler: re-raises with the default action when not armed, then jumps to jmpbuf. */
  40 +static void sigill_handler (int sig)
  41 +{
  42 +    if (!canjump) {
  43 +        signal (sig, SIG_DFL);  /* restore the default disposition ... */
  44 +        raise (sig);            /* ... and deliver the signal again */
  45 +    }
  46 +    /* NOTE(review): this point is also reached when !canjump if raise() returns -- confirm intended */
  47 +    canjump = 0;
  48 +    siglongjmp (jmpbuf, 1);
  49 +}
  50 +#endif /* CONFIG_DARWIN */
  51 +/* SAD over 16 rows of 16 pixels: pix1 against vec_avg(pix2[x], pix2[x+1]). Returns the sum. */
  52 +int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  53 +{
  54 +    int i;
  55 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
  56 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
  57 +    vector unsigned char *tv;
  58 +    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  59 +    vector unsigned int sad; /* four running partial sums */
  60 +    vector signed int sumdiffs;
  61 +
  62 +    s = 0;
  63 +    sad = (vector unsigned int)vec_splat_u32(0);
  64 +    for(i=0;i<16;i++) {
  65 +        /*
  66 +           Read unaligned pixels into our vectors. The vectors are as follows:
  67 +           pix1v: pix1[0]-pix1[15]
  68 +           pix2v: pix2[0]-pix2[15]     pix2iv: pix2[1]-pix2[16]
  69 +        */
  70 +        tv = (vector unsigned char *) pix1;
  71 +        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  72 +
  73 +        tv = (vector unsigned char *) &pix2[0];
  74 +        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  75 +
  76 +        tv = (vector unsigned char *) &pix2[1];
  77 +        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  78 +
  79 +        /* Calculate the average vector (per byte, of each pixel and its right neighbour) */
  80 +        avgv = vec_avg(pix2v, pix2iv);
  81 +
  82 +        /* Per-byte absolute difference, computed as max - min on unsigned bytes */
  83 +        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  84 +
  85 +        /* Add each 4 pixel group together and put 4 results into sad */
  86 +        sad = vec_sum4s(t5, sad);
  87 +
  88 +        pix1 += line_size;
  89 +        pix2 += line_size;
  90 +    }
  91 +    /* Sum up the four partial sums, and put the result into s */
  92 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  93 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
  94 +    vec_ste(sumdiffs, 0, &s);
  95 +
  96 +    return s;
  97 +}
  98 +/* SAD over 16 rows of 16 pixels: pix1 against vec_avg(pix2 row, row below it). Returns the sum. */
  99 +int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 100 +{
 101 +    int i;
 102 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 103 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 104 +    vector unsigned char *tv;
 105 +    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
 106 +    vector unsigned int sad; /* four running partial sums */
 107 +    vector signed int sumdiffs;
 108 +    uint8_t *pix3 = pix2 + line_size; /* the row below pix2 */
 109 +
 110 +    s = 0;
 111 +    sad = (vector unsigned int)vec_splat_u32(0);
 112 +
 113 +    /*
 114 +       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 115 +       iteration becomes pix2 in the next iteration. We can use this
 116 +       fact to avoid a potentially expensive unaligned read, each
 117 +       time around the loop.
 118 +       Read unaligned pixels into our vectors. The vectors are as follows:
 119 +       pix2v: pix2[0]-pix2[15]
 120 +       (The bytes are not widened to shorts here; vec_avg is applied to bytes.)
 121 +    */
 122 +    tv = (vector unsigned char *) &pix2[0];
 123 +    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 124 +
 125 +    for(i=0;i<16;i++) {
 126 +        /*
 127 +           Read unaligned pixels into our vectors. The vectors are as follows:
 128 +           pix1v: pix1[0]-pix1[15]
 129 +           pix3v: pix3[0]-pix3[15]
 130 +        */
 131 +        tv = (vector unsigned char *) pix1;
 132 +        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 133 +
 134 +        tv = (vector unsigned char *) &pix3[0];
 135 +        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 136 +
 137 +        /* Calculate the average vector (per byte, of the two vertically adjacent rows) */
 138 +        avgv = vec_avg(pix2v, pix3v);
 139 +
 140 +        /* Per-byte absolute difference, computed as max - min on unsigned bytes */
 141 +        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 142 +
 143 +        /* Add each 4 pixel group together and put 4 results into sad */
 144 +        sad = vec_sum4s(t5, sad);
 145 +
 146 +        pix1 += line_size;
 147 +        pix2v = pix3v; /* reuse the row just loaded as the upper row of the next iteration */
 148 +        pix3 += line_size;
 149 +
 150 +    }
 151 +
 152 +    /* Sum up the four partial sums, and put the result into s */
 153 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 154 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
 155 +    vec_ste(sumdiffs, 0, &s);
 156 +    return s;
 157 +}
 158 +/* SAD over 16 rows of 16 pixels: pix1 against the 4-pixel average (a+b+c+d+2)>>2 of pix2. */
 159 +int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 160 +{
 161 +    int i;
 162 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 163 +    uint8_t *pix3 = pix2 + line_size; /* the row below pix2 */
 164 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 165 +    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); /* used as both rounding term and shift count */
 166 +    vector unsigned char *tv, avgv, t5;
 167 +    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
 168 +    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
 169 +    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
 170 +    vector unsigned short avghv, avglv;
 171 +    vector unsigned short t1, t2, t3, t4;
 172 +    vector unsigned int sad; /* four running partial sums */
 173 +    vector signed int sumdiffs;
 174 +
 175 +    sad = (vector unsigned int)vec_splat_u32(0);
 176 +
 177 +    s = 0;
 178 +
 179 +    /*
 180 +       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 181 +       iteration becomes pix2 in the next iteration. We can use this
 182 +       fact to avoid a potentially expensive unaligned read, as well
 183 +       as some splitting, and vector addition each time around the loop.
 184 +       Read unaligned pixels into our vectors. The vectors are as follows:
 185 +       pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
 186 +       Split the pixel vectors into shorts
 187 +    */
 188 +    tv = (vector unsigned char *) &pix2[0];
 189 +    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 190 +
 191 +    tv = (vector unsigned char *) &pix2[1];
 192 +    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 193 +
 194 +    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
 195 +    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
 196 +    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
 197 +    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
 198 +    t1 = vec_add(pix2hv, pix2ihv); /* upper row, high half: pix2[x] + pix2[x+1] */
 199 +    t2 = vec_add(pix2lv, pix2ilv); /* upper row, low half */
 200 +
 201 +    for(i=0;i<16;i++) {
 202 +        /*
 203 +           Read unaligned pixels into our vectors. The vectors are as follows:
 204 +           pix1v: pix1[0]-pix1[15]
 205 +           pix3v: pix3[0]-pix3[15]     pix3iv: pix3[1]-pix3[16]
 206 +        */
 207 +        tv = (vector unsigned char *) pix1;
 208 +        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 209 +
 210 +        tv = (vector unsigned char *) &pix3[0];
 211 +        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 212 +
 213 +        tv = (vector unsigned char *) &pix3[1];
 214 +        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 215 +
 216 +        /*
 217 +          Note that Altivec does have vec_avg, but this works on vector pairs
 218 +          and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
 219 +          would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
 220 +          Instead, we have to split the pixel vectors into vectors of shorts,
 221 +          and do the averaging by hand.
 222 +        */
 223 +
 224 +        /* Split the pixel vectors into shorts */
 225 +        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
 226 +        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
 227 +        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
 228 +        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
 229 +
 230 +        /* Do the averaging on them */
 231 +        t3 = vec_add(pix3hv, pix3ihv); /* lower row, high half */
 232 +        t4 = vec_add(pix3lv, pix3ilv); /* lower row, low half */
 233 +
 234 +        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); /* (sum of 4 + 2) >> 2 */
 235 +        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
 236 +
 237 +        /* Pack the shorts back into a result */
 238 +        avgv = vec_pack(avghv, avglv);
 239 +
 240 +        /* Per-byte absolute difference, computed as max - min on unsigned bytes */
 241 +        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 242 +
 243 +        /* Add each 4 pixel group together and put 4 results into sad */
 244 +        sad = vec_sum4s(t5, sad);
 245 +
 246 +        pix1 += line_size;
 247 +        pix3 += line_size;
 248 +        /* Transfer the calculated values for pix3 into pix2 */
 249 +        t1 = t3;
 250 +        t2 = t4;
 251 +    }
 252 +    /* Sum up the four partial sums, and put the result into s */
 253 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 254 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
 255 +    vec_ste(sumdiffs, 0, &s);
 256 +
 257 +    return s;
 258 +}
 259 +/* Sum of absolute differences between pix1 and pix2 over 16 rows of 16 pixels. */
 260 +int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 261 +{
 262 +    int i;
 263 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 264 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 265 +    vector unsigned char perm1, perm2, *pix1v, *pix2v;
 266 +    vector unsigned char t1, t2, t3,t4, t5;
 267 +    vector unsigned int sad; /* four running partial sums */
 268 +    vector signed int sumdiffs;
 269 +
 270 +    sad = (vector unsigned int)vec_splat_u32(0);
 271 +
 272 +
 273 +    for(i=0;i<16;i++) {
 274 +       /* Read potentially unaligned pixels into t1 and t2 */
 275 +        perm1 = vec_lvsl(0, pix1);
 276 +        pix1v = (vector unsigned char *) pix1;
 277 +        perm2 = vec_lvsl(0, pix2);
 278 +        pix2v = (vector unsigned char *) pix2;
 279 +        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 280 +        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 281 +
 282 +       /* Per-byte absolute difference, computed as max - min on unsigned bytes */
 283 +        t3 = vec_max(t1, t2);
 284 +        t4 = vec_min(t1, t2);
 285 +        t5 = vec_sub(t3, t4);
 286 +
 287 +       /* Add each 4 pixel group together and put 4 results into sad */
 288 +        sad = vec_sum4s(t5, sad);
 289 +
 290 +        pix1 += line_size;
 291 +        pix2 += line_size;
 292 +    }
 293 +
 294 +    /* Sum up the four partial sums, and put the result into s */
 295 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 296 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
 297 +    vec_ste(sumdiffs, 0, &s);
 298 +
 299 +    return s;
 300 +}
 301 +/* Sum of absolute differences between pix1 and pix2 over 8 rows of 8 pixels. */
 302 +int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 303 +{
 304 +    int i;
 305 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 306 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 307 +    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 308 +    vector unsigned char t1, t2, t3,t4, t5;
 309 +    vector unsigned int sad; /* four running partial sums */
 310 +    vector signed int sumdiffs;
 311 +
 312 +    sad = (vector unsigned int)vec_splat_u32(0);
 313 +    /* Despite its name, permclear is used as an AND mask: first 8 bytes kept, last 8 cleared */
 314 +    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 315 +
 316 +    for(i=0;i<8;i++) {
 317 +       /* Read potentially unaligned pixels into t1 and t2
 318 +          Since we're reading 16 pixels, and actually only want 8,
 319 +          mask out the last 8 pixels. The 0s don't change the sum. */
 320 +        perm1 = vec_lvsl(0, pix1);
 321 +        pix1v = (vector unsigned char *) pix1;
 322 +        perm2 = vec_lvsl(0, pix2);
 323 +        pix2v = (vector unsigned char *) pix2;
 324 +        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 325 +        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 326 +
 327 +       /* Per-byte absolute difference, computed as max - min on unsigned bytes */
 328 +        t3 = vec_max(t1, t2);
 329 +        t4 = vec_min(t1, t2);
 330 +        t5 = vec_sub(t3, t4);
 331 +
 332 +       /* Add each 4 pixel group together and put 4 results into sad */
 333 +        sad = vec_sum4s(t5, sad);
 334 +
 335 +        pix1 += line_size;
 336 +        pix2 += line_size;
 337 +    }
 338 +
 339 +    /* Sum up the four partial sums, and put the result into s */
 340 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 341 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
 342 +    vec_ste(sumdiffs, 0, &s);
 343 +
 344 +    return s;
 345 +}
 346 +/* Sum of the squared pixel values over 16 rows of 16 pixels starting at pix. */
 347 +int pix_norm1_altivec(uint8_t *pix, int line_size)
 348 +{
 349 +    int i;
 350 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 351 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 352 +    vector unsigned char *tv;
 353 +    vector unsigned char pixv;
 354 +    vector unsigned int sv; /* four running partial sums of squares */
 355 +    vector signed int sum;
 356 +
 357 +    sv = (vector unsigned int)vec_splat_u32(0);
 358 +
 359 +    s = 0;
 360 +    for (i = 0; i < 16; i++) {
 361 +        /* Read in the potentially unaligned pixels */
 362 +        tv = (vector unsigned char *) pix;
 363 +        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
 364 +
 365 +        /* Square the values, and add them to our sum */
 366 +        sv = vec_msum(pixv, pixv, sv);
 367 +
 368 +        pix += line_size;
 369 +    }
 370 +    /* Sum up the four partial sums, and put the result into s */
 371 +    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 372 +    sum = vec_splat(sum, 3); /* replicate element 3 into every element */
 373 +    vec_ste(sum, 0, &s);
 374 +
 375 +    return s;
 376 +}
 377 +
 378 +/**
 379 + * Sum of Squared Errors between pix1 and pix2 for a 8x8 block.
 380 + * AltiVec-enhanced. The first argument v is not used by this function.
 381 + * It's the pix_abs8x8_altivec code above w/ squaring added.
 382 + */
 383 +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 384 +{
 385 +    int i;
 386 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 387 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 388 +    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 389 +    vector unsigned char t1, t2, t3,t4, t5;
 390 +    vector unsigned int sum; /* four running partial sums of squares */
 391 +    vector signed int sumsqr;
 392 +
 393 +    sum = (vector unsigned int)vec_splat_u32(0);
 394 +    /* Despite its name, permclear is used as an AND mask: first 8 bytes kept, last 8 cleared */
 395 +    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 396 +
 397 +
 398 +    for(i=0;i<8;i++) {
 399 +       /* Read potentially unaligned pixels into t1 and t2
 400 +          Since we're reading 16 pixels, and actually only want 8,
 401 +          mask out the last 8 pixels. The 0s don't change the sum. */
 402 +        perm1 = vec_lvsl(0, pix1);
 403 +        pix1v = (vector unsigned char *) pix1;
 404 +        perm2 = vec_lvsl(0, pix2);
 405 +        pix2v = (vector unsigned char *) pix2;
 406 +        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 407 +        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 408 +
 409 +        /*
 410 +          Since we want to use unsigned chars, we can take advantage
 411 +          of the fact that abs(a-b)^2 = (a-b)^2.
 412 +        */
 413 +
 414 +       /* Calculate abs differences vector */
 415 +        t3 = vec_max(t1, t2);
 416 +        t4 = vec_min(t1, t2);
 417 +        t5 = vec_sub(t3, t4);
 418 +
 419 +        /* Square the values and add them to our sum */
 420 +        sum = vec_msum(t5, t5, sum);
 421 +
 422 +        pix1 += line_size;
 423 +        pix2 += line_size;
 424 +    }
 425 +
 426 +    /* Sum up the four partial sums, and put the result into s */
 427 +    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 428 +    sumsqr = vec_splat(sumsqr, 3); /* replicate element 3 into every element */
 429 +    vec_ste(sumsqr, 0, &s);
 430 +
 431 +    return s;
 432 +}
 433 +
 434 +/**
 435 + * Sum of Squared Errors between pix1 and pix2 for a 16x16 block.
 436 + * AltiVec-enhanced. The first argument v is not used by this function.
 437 + * It's the pix_abs16x16_altivec code above w/ squaring added.
 438 + */
 439 +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
 440 +{
 441 +    int i;
 442 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 443 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 444 +    vector unsigned char perm1, perm2, *pix1v, *pix2v;
 445 +    vector unsigned char t1, t2, t3,t4, t5;
 446 +    vector unsigned int sum; /* four running partial sums of squares */
 447 +    vector signed int sumsqr;
 448 +
 449 +    sum = (vector unsigned int)vec_splat_u32(0);
 450 +
 451 +    for(i=0;i<16;i++) {
 452 +       /* Read potentially unaligned pixels into t1 and t2 */
 453 +        perm1 = vec_lvsl(0, pix1);
 454 +        pix1v = (vector unsigned char *) pix1;
 455 +        perm2 = vec_lvsl(0, pix2);
 456 +        pix2v = (vector unsigned char *) pix2;
 457 +        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 458 +        t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 459 +
 460 +        /*
 461 +          Since we want to use unsigned chars, we can take advantage
 462 +          of the fact that abs(a-b)^2 = (a-b)^2.
 463 +        */
 464 +
 465 +       /* Calculate abs differences vector */
 466 +        t3 = vec_max(t1, t2);
 467 +        t4 = vec_min(t1, t2);
 468 +        t5 = vec_sub(t3, t4);
 469 +
 470 +        /* Square the values and add them to our sum */
 471 +        sum = vec_msum(t5, t5, sum);
 472 +
 473 +        pix1 += line_size;
 474 +        pix2 += line_size;
 475 +    }
 476 +
 477 +    /* Sum up the four partial sums, and put the result into s */
 478 +    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
 479 +    sumsqr = vec_splat(sumsqr, 3); /* replicate element 3 into every element */
 480 +    vec_ste(sumsqr, 0, &s);
 481 +
 482 +    return s;
 483 +}
 484 +/* Sum of the pixel values over 16 rows of 16 pixels starting at pix. */
 485 +int pix_sum_altivec(uint8_t * pix, int line_size)
 486 +{
 487 +    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
 488 +    vector unsigned char perm, *pixv;
 489 +    vector unsigned char t1;
 490 +    vector unsigned int sad; /* four running partial sums (plain sums here, not differences) */
 491 +    vector signed int sumdiffs;
 492 +
 493 +    int i;
 494 +    int s __attribute__((aligned(16))); /* scalar result, written by vec_ste below */
 495 +
 496 +    sad = (vector unsigned int)vec_splat_u32(0);
 497 +
 498 +    for (i = 0; i < 16; i++) {
 499 +       /* Read the potentially unaligned 16 pixels into t1 */
 500 +        perm = vec_lvsl(0, pix);
 501 +        pixv = (vector unsigned char *) pix;
 502 +        t1 = vec_perm(pixv[0], pixv[1], perm);
 503 +
 504 +       /* Add each 4 pixel group together and put 4 results into sad */
 505 +        sad = vec_sum4s(t1, sad);
 506 +
 507 +        pix += line_size;
 508 +    }
 509 +
 510 +    /* Sum up the four partial sums, and put the result into s */
 511 +    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 512 +    sumdiffs = vec_splat(sumdiffs, 3); /* replicate element 3 into every element */
 513 +    vec_ste(sumdiffs, 0, &s);
 514 +
 515 +    return s;
 516 +}
 517 +/* Widen 8 rows of 8 pixels to shorts and store them contiguously, 16 bytes per row, in block. */
 518 +void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 519 +{
 520 +    int i;
 521 +    vector unsigned char perm, bytes, *pixv;
 522 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 523 +    vector signed short shorts;
 524 +
 525 +    for(i=0;i<8;i++)
 526 +    {
 527 +        // Read potentially unaligned pixels.
 528 +        // We're reading 16 pixels, and actually only want 8,
 529 +        // but we simply ignore the extras.
 530 +        perm = vec_lvsl(0, pixels);
 531 +        pixv = (vector unsigned char *) pixels;
 532 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 533 +
 534 +        // convert the first 8 bytes into shorts by interleaving them with zero bytes
 535 +        shorts = (vector signed short)vec_mergeh(zero, bytes);
 536 +
 537 +        // save the data to the block, we assume the block is 16-byte aligned
 538 +        vec_st(shorts, i*16, (vector signed short*)block); /* row i at byte offset 16*i */
 539 +
 540 +        pixels += line_size;
 541 +    }
 542 +}
 543 +/* Store s1 - s2, widened to shorts, for 8 rows of 8 pixels into block (8 shorts per row). */
 544 +void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
 545 +        const uint8_t *s2, int stride)
 546 +{
 547 +    int i;
 548 +    vector unsigned char perm, bytes, *pixv;
 549 +    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
 550 +    vector signed short shorts1, shorts2;
 551 +
 552 +    for(i=0;i<4;i++) /* 4 iterations, two rows each */
 553 +    {
 554 +        // Read potentially unaligned pixels
 555 +        // We're reading 16 pixels, and actually only want 8,
 556 +        // but we simply ignore the extras.
 557 +        perm = vec_lvsl(0, s1);
 558 +        pixv = (vector unsigned char *) s1;
 559 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 560 +
 561 +        // convert the bytes into shorts
 562 +        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 563 +
 564 +        // Do the same for the second block of pixels
 565 +        perm = vec_lvsl(0, s2);
 566 +        pixv = (vector unsigned char *) s2;
 567 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 568 +
 569 +        // convert the bytes into shorts
 570 +        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 571 +
 572 +        // Do the subtraction
 573 +        shorts1 = vec_sub(shorts1, shorts2);
 574 +
 575 +        // save the data to the block, we assume the block is 16-byte aligned
 576 +        vec_st(shorts1, 0, (vector signed short*)block);
 577 +
 578 +        s1 += stride;
 579 +        s2 += stride;
 580 +        block += 8; /* advance by one output row of 8 elements */
 581 +
 582 +
 583 +        // The code below is a copy of the code above... This is a manual
 584 +        // unroll.
 585 +
 586 +        // Read potentially unaligned pixels
 587 +        // We're reading 16 pixels, and actually only want 8,
 588 +        // but we simply ignore the extras.
 589 +        perm = vec_lvsl(0, s1);
 590 +        pixv = (vector unsigned char *) s1;
 591 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 592 +
 593 +        // convert the bytes into shorts
 594 +        shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 595 +
 596 +        // Do the same for the second block of pixels
 597 +        perm = vec_lvsl(0, s2);
 598 +        pixv = (vector unsigned char *) s2;
 599 +        bytes = vec_perm(pixv[0], pixv[1], perm);
 600 +
 601 +        // convert the bytes into shorts
 602 +        shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 603 +
 604 +        // Do the subtraction
 605 +        shorts1 = vec_sub(shorts1, shorts2);
 606 +
 607 +        // save the data to the block, we assume the block is 16-byte aligned
 608 +        vec_st(shorts1, 0, (vector signed short*)block);
 609 +
 610 +        s1 += stride;
 611 +        s2 += stride;
 612 +        block += 8; /* advance by one output row of 8 elements */
 613 +    }
 614 +}
 615 +/* 16x16 SAD wrapper: ignores its first argument s and forwards to pix_abs16x16_altivec. */
 616 +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 617 +  return pix_abs16x16_altivec(a,b,stride);
 618 +}
 619 +/* 8x8 SAD wrapper: ignores its first argument s and forwards to pix_abs8x8_altivec. */
 620 +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
 621 +  return pix_abs8x8_altivec(a,b,stride);
 622 +}
 623 +/* Adds src to dst byte by byte: dst[i] += src[i] (wrapping modulo 256) for every i in [0, w). */
 624 +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
 625 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 626 +    int i;
 627 +    for(i=0; i+7<w; i+=8){ /* the body handles 8 bytes, so step by 8 to add each byte once */
 628 +        dst[i+0] += src[i+0];
 629 +        dst[i+1] += src[i+1];
 630 +        dst[i+2] += src[i+2];
 631 +        dst[i+3] += src[i+3];
 632 +        dst[i+4] += src[i+4];
 633 +        dst[i+5] += src[i+5];
 634 +        dst[i+6] += src[i+6];
 635 +        dst[i+7] += src[i+7];
 636 +    }
 637 +    for(; i<w; i++) /* remaining (w % 8) bytes */
 638 +        dst[i+0] += src[i+0];
 639 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
 640 +    register int i;
 641 +    register vector unsigned char vdst, vsrc;
 642 +
 643 +    /* dst and src are 16 bytes-aligned (guaranteed) */
 644 +    for(i = 0 ; (i + 15) < w ; i += 16) /* i is a byte offset; each pass handles one 16-byte vector */
 645 +    {
 646 +      vdst = vec_ld(i, (unsigned char*)dst);
 647 +      vsrc = vec_ld(i, (unsigned char*)src);
 648 +      vdst = vec_add(vsrc, vdst);
 649 +      vec_st(vdst, i, (unsigned char*)dst);
 650 +    }
 651 +    /* if w is not a multiple of 16 */
 652 +    for (; (i < w) ; i++)
 653 +    {
 654 +      dst[i] += src[i]; /* the tail must accumulate like the vector loop, not overwrite dst */
 655 +    }
 656 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 657 +}
 658 +
 659 +/* Copies h rows of 16 pixels from pixels to block; assumes that ((line_size % 16) == 0) */
 660 +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 661 +{
 662 +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
 663 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 664 +    int i;
 665 +
 666 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
 667 +
 668 +    for(i=0; i<h; i++) {
 669 +      *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l);
 670 +      *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
 671 +      *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
 672 +      *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
 673 +      pixels+=line_size;
 674 +      block +=line_size;
 675 +    }
 676 +
 677 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
 678 +
 679 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
 680 +    register vector unsigned char pixelsv1, pixelsv2;
 681 +    register vector unsigned char pixelsv1B, pixelsv2B;
 682 +    register vector unsigned char pixelsv1C, pixelsv2C;
 683 +    register vector unsigned char pixelsv1D, pixelsv2D;
 684 +    /* perm is computed once from the first row and reused for every row */
 685 +    register vector unsigned char perm = vec_lvsl(0, pixels);
 686 +    int i;
 687 +    register int line_size_2 = line_size << 1;
 688 +    register int line_size_3 = line_size + line_size_2;
 689 +    register int line_size_4 = line_size << 2;
 690 +
 691 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
 692 +// hand-unrolling the loop by 4 gains about 15%
 693 +// minimum execution time goes from 74 to 60 cycles
 694 +// it's faster than -funroll-loops, but using
 695 +// -funroll-loops w/ this is bad - 74 cycles again.
 696 +// all this is on a 7450, tuning for the 7450
 697 +#if 0
 698 +    for(i=0; i<h; i++) {
 699 +      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 700 +      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 701 +      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 702 +             0, (unsigned char*)block);
 703 +      pixels+=line_size;
 704 +      block +=line_size;
 705 +    }
 706 +#else
 707 +    for(i=0; i<h; i+=4) { /* NOTE(review): always copies 4 rows per pass; presumably h is a multiple of 4 -- confirm */
 708 +      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 709 +      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 710 +      pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
 711 +      pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
 712 +      pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
 713 +      pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
 714 +      pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
 715 +      pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
 716 +      vec_st(vec_perm(pixelsv1, pixelsv2, perm),
 717 +             0, (unsigned char*)block);
 718 +      vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
 719 +             line_size, (unsigned char*)block);
 720 +      vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
 721 +             line_size_2, (unsigned char*)block);
 722 +      vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
 723 +             line_size_3, (unsigned char*)block);
 724 +      pixels+=line_size_4;
 725 +      block +=line_size_4;
 726 +    }
 727 +#endif
 728 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
 729 +
 730 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 731 +}
 732 +
 733 +/* Averages h rows of 16 pixels of block with pixels, in place; assumes that ((line_size % 16) == 0) */
 734 +#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
 735 +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 736 +{
 737 +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
 738 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 739 +    int i;
 740 +
 741 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
 742 +
 743 +    for(i=0; i<h; i++) {
 744 +      op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
 745 +      op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
 746 +      op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
 747 +      op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
 748 +      pixels+=line_size;
 749 +      block +=line_size;
 750 +    }
 751 +
 752 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
 753 +
 754 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
 755 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 756 +    register vector unsigned char perm = vec_lvsl(0, pixels); /* computed once, reused for every row */
 757 +    int i;
 758 +
 759 +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
 760 +
 761 +    for(i=0; i<h; i++) {
 762 +      pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 763 +      pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 764 +      blockv = vec_ld(0, block);
 765 +      pixelsv = vec_perm(pixelsv1, pixelsv2, perm); /* the 16 source pixels of this row */
 766 +      blockv = vec_avg(blockv,pixelsv);
 767 +      vec_st(blockv, 0, (unsigned char*)block);
 768 +      pixels+=line_size;
 769 +      block +=line_size;
 770 +    }
 771 +
 772 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
 773 +
 774 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 775 +}
 776 +
 777 +/* Averages h rows of 8 pixels of block with pixels, in place; assumes that ((line_size % 8) == 0) */
 778 +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
 779 +{
 780 +POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
 781 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 782 +    int i;
 783 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
 784 +    for (i = 0; i < h; i++) {
 785 +        *((uint32_t *) (block)) =
 786 +            (((*((uint32_t *) (block))) |
 787 +              ((((const struct unaligned_32 *) (pixels))->l))) -
 788 +             ((((*((uint32_t *) (block))) ^
 789 +                ((((const struct unaligned_32 *) (pixels))->
 790 +                  l))) & 0xFEFEFEFEUL) >> 1));
 791 +        *((uint32_t *) (block + 4)) =
 792 +            (((*((uint32_t *) (block + 4))) |
 793 +              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
 794 +             ((((*((uint32_t *) (block + 4))) ^
 795 +                ((((const struct unaligned_32 *) (pixels +
 796 +                                                  4))->
 797 +                  l))) & 0xFEFEFEFEUL) >> 1));
 798 +        pixels += line_size;
 799 +        block += line_size;
 800 +    }
 801 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 802 +
 803 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
 804 +    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
 805 +    int i;
 806 +
 807 +POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
 808 +
 809 +   for (i = 0; i < h; i++) {
 810 +     /*
 811 +       block is 8 bytes-aligned, so we're either in the
 812 +       left block (16 bytes-aligned) or in the right block (not)
 813 +     */
 814 +     int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is not 16-byte aligned */
 815 +
 816 +     blockv = vec_ld(0, block);
 817 +     pixelsv1 = vec_ld(0, (unsigned char*)pixels);
 818 +     pixelsv2 = vec_ld(16, (unsigned char*)pixels);
 819 +     pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
 820 +     /* NOTE(review): vcprm is defined elsewhere; presumably it selects 32-bit words, sN meaning word N of pixelsv -- confirm */
 821 +     if (rightside)
 822 +     {
 823 +       pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
 824 +     }
 825 +     else
 826 +     {
 827 +       pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
 828 +     }
 829 +     /* bytes of pixelsv that were copied from blockv average with themselves and stay unchanged */
 830 +     blockv = vec_avg(blockv, pixelsv);
 831 +
 832 +     vec_st(blockv, 0, block);
 833 +
 834 +     pixels += line_size;
 835 +     block += line_size;
 836 +   }
 837 +
 838 +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
 839 +
 840 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 841 +}
 842 +
 843 +/* put_pixels8_xy2: block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2, 8 pixels x h rows; assumes ((line_size % 8) == 0) */
 844 +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 845 +{
 846 +POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
 847 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 848 +    int j;
 849 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 850 +    for (j = 0; j < 2; j++) { /* reference path: two passes, each 4 columns wide */
 851 +      int i;
 852 +      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 853 +      const uint32_t b =
 854 +        (((const struct unaligned_32 *) (pixels + 1))->l);
 855 +      uint32_t l0 = /* low 2 bits of each byte of a and b, plus the rounding term 2 */
 856 +        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 857 +      uint32_t h0 = /* high 6 bits of each byte, pre-shifted so the packed sums cannot carry into the next byte */
 858 +        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 859 +      uint32_t l1, h1;
 860 +      pixels += line_size;
 861 +      for (i = 0; i < h; i += 2) { /* two output rows per iteration */
 862 +        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 863 +        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 864 +        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 865 +        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 866 +        *((uint32_t *) block) =
 867 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 868 +        pixels += line_size;
 869 +        block += line_size;
 870 +        a = (((const struct unaligned_32 *) (pixels))->l);
 871 +        b = (((const struct unaligned_32 *) (pixels + 1))->l);
 872 +        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
 873 +        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 874 +        *((uint32_t *) block) =
 875 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 876 +        pixels += line_size;
 877 +        block += line_size;
 878 +      } pixels += 4 - line_size * (h + 1); /* rewind to the top row, 4 columns further right */
 879 +      block += 4 - line_size * h;
 880 +    }
 881 +
 882 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 883 +
 884 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
 885 +   register int i;
 886 +   register vector unsigned char
 887 +     pixelsv1, pixelsv2,
 888 +     pixelsavg;
 889 +   register vector unsigned char
 890 +     blockv, temp1, temp2;
 891 +   register vector unsigned short
 892 +     pixelssum1, pixelssum2, temp3;
 893 +   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
 894 +   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
 895 +   /* prologue: horizontal pair sums of the first source row */
 896 +   temp1 = vec_ld(0, pixels);
 897 +   temp2 = vec_ld(16, pixels);
 898 +   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* row starting at pixels[0] */
 899 +   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap to offset 0 */
 900 +   {
 901 +     pixelsv2 = temp2;
 902 +   }
 903 +   else
 904 +   {
 905 +     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* row starting at pixels[1] */
 906 +   }
 907 +   pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes: first 8 pixels widened to 16-bit lanes */
 908 +   pixelsv2 = vec_mergeh(vczero, pixelsv2);
 909 +   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
 910 +                        (vector unsigned short)pixelsv2);
 911 +   pixelssum1 = vec_add(pixelssum1, vctwo); /* rounding term, added once per output row */
 912 +
 913 +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
 914 +   for (i = 0; i < h ; i++) {
 915 +     int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is the right half of its 16-byte vector */
 916 +     blockv = vec_ld(0, block);
 917 +
 918 +     temp1 = vec_ld(line_size, pixels); /* next source row, same unaligned-load scheme as the prologue */
 919 +     temp2 = vec_ld(line_size + 16, pixels);
 920 +     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
 921 +     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
 922 +     {
 923 +       pixelsv2 = temp2;
 924 +     }
 925 +     else
 926 +     {
 927 +       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
 928 +     }
 929 +
 930 +     pixelsv1 = vec_mergeh(vczero, pixelsv1);
 931 +     pixelsv2 = vec_mergeh(vczero, pixelsv2);
 932 +     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
 933 +                          (vector unsigned short)pixelsv2);
 934 +     temp3 = vec_add(pixelssum1, pixelssum2); /* four-pixel sum + 2 */
 935 +     temp3 = vec_sra(temp3, vctwo); /* divide by 4 */
 936 +     pixelssum1 = vec_add(pixelssum2, vctwo); /* this row's pair sums become the upper row of the next iteration */
 937 +     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes: results in bytes 0..7, zeros in 8..15 */
 938 +
 939 +     if (rightside)
 940 +     {
 941 +       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); /* presumably: keep block's left 8 bytes, results on the right (vcprm defined outside this chunk) */
 942 +     }
 943 +     else
 944 +     {
 945 +       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); /* presumably: results on the left, keep block's right 8 bytes */
 946 +     }
 947 +
 948 +     vec_st(blockv, 0, block);
 949 +
 950 +     block += line_size;
 951 +     pixels += line_size;
 952 +   }
 953 +
 954 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
 955 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
 956 +}
 957 +
 958 +/* put_no_rnd_pixels8_xy2: like put_pixels8_xy2 but with bias 1 instead of 2: (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2; assumes ((line_size % 8) == 0) */
 959 +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 960 +{
 961 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
 962 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
 963 +    int j;
 964 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 965 +    for (j = 0; j < 2; j++) { /* reference path: two passes, each 4 columns wide */
 966 +      int i;
 967 +      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 968 +      const uint32_t b =
 969 +        (((const struct unaligned_32 *) (pixels + 1))->l);
 970 +      uint32_t l0 = /* low 2 bits of each byte of a and b, plus the bias 1 */
 971 +        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 972 +      uint32_t h0 = /* high 6 bits of each byte, pre-shifted so the packed sums cannot carry into the next byte */
 973 +        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 974 +      uint32_t l1, h1;
 975 +      pixels += line_size;
 976 +      for (i = 0; i < h; i += 2) { /* two output rows per iteration */
 977 +        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
 978 +        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
 979 +        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
 980 +        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 981 +        *((uint32_t *) block) =
 982 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 983 +        pixels += line_size;
 984 +        block += line_size;
 985 +        a = (((const struct unaligned_32 *) (pixels))->l);
 986 +        b = (((const struct unaligned_32 *) (pixels + 1))->l);
 987 +        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
 988 +        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
 989 +        *((uint32_t *) block) =
 990 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
 991 +        pixels += line_size;
 992 +        block += line_size;
 993 +      } pixels += 4 - line_size * (h + 1); /* rewind to the top row, 4 columns further right */
 994 +      block += 4 - line_size * h;
 995 +    }
 996 +
 997 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
 998 +
 999 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1000 +   register int i;
1001 +   register vector unsigned char
1002 +     pixelsv1, pixelsv2,
1003 +     pixelsavg;
1004 +   register vector unsigned char
1005 +     blockv, temp1, temp2;
1006 +   register vector unsigned short
1007 +     pixelssum1, pixelssum2, temp3;
1008 +   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1009 +   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); /* bias added to each sum */
1010 +   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* shift count for the divide by 4 */
1011 +   /* prologue: horizontal pair sums of the first source row */
1012 +   temp1 = vec_ld(0, pixels);
1013 +   temp2 = vec_ld(16, pixels);
1014 +   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* row starting at pixels[0] */
1015 +   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap to offset 0 */
1016 +   {
1017 +     pixelsv2 = temp2;
1018 +   }
1019 +   else
1020 +   {
1021 +     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* row starting at pixels[1] */
1022 +   }
1023 +   pixelsv1 = vec_mergeh(vczero, pixelsv1); /* interleave with zero bytes: first 8 pixels widened to 16-bit lanes */
1024 +   pixelsv2 = vec_mergeh(vczero, pixelsv2);
1025 +   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1026 +                        (vector unsigned short)pixelsv2);
1027 +   pixelssum1 = vec_add(pixelssum1, vcone);
1028 +
1029 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1030 +   for (i = 0; i < h ; i++) {
1031 +     int rightside = ((unsigned long)block & 0x0000000F); /* non-zero when block is the right half of its 16-byte vector */
1032 +     blockv = vec_ld(0, block);
1033 +
1034 +     temp1 = vec_ld(line_size, pixels); /* next source row, same unaligned-load scheme as the prologue */
1035 +     temp2 = vec_ld(line_size + 16, pixels);
1036 +     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1037 +     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1038 +     {
1039 +       pixelsv2 = temp2;
1040 +     }
1041 +     else
1042 +     {
1043 +       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1044 +     }
1045 +
1046 +     pixelsv1 = vec_mergeh(vczero, pixelsv1);
1047 +     pixelsv2 = vec_mergeh(vczero, pixelsv2);
1048 +     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1049 +                          (vector unsigned short)pixelsv2);
1050 +     temp3 = vec_add(pixelssum1, pixelssum2); /* four-pixel sum + 1 */
1051 +     temp3 = vec_sra(temp3, vctwo); /* divide by 4 */
1052 +     pixelssum1 = vec_add(pixelssum2, vcone); /* this row's pair sums become the upper row of the next iteration */
1053 +     pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); /* back to bytes: results in bytes 0..7, zeros in 8..15 */
1054 +
1055 +     if (rightside)
1056 +     {
1057 +       blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); /* presumably: keep block's left 8 bytes, results on the right (vcprm defined outside this chunk) */
1058 +     }
1059 +     else
1060 +     {
1061 +       blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); /* presumably: results on the left, keep block's right 8 bytes */
1062 +     }
1063 +
1064 +     vec_st(blockv, 0, block);
1065 +
1066 +     block += line_size;
1067 +     pixels += line_size;
1068 +   }
1069 +
1070 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
1071 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1072 +}
1073 +
1074 +/* put_pixels16_xy2: block[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2, 16 pixels x h rows; assumes ((line_size % 16) == 0) */
1075 +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1076 +{
1077 +POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
1078 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1079 +    int j;
1080 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1081 +      for (j = 0; j < 4; j++) { /* reference path: four passes, each 4 columns wide */
1082 +      int i;
1083 +      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1084 +      const uint32_t b =
1085 +        (((const struct unaligned_32 *) (pixels + 1))->l);
1086 +      uint32_t l0 = /* low 2 bits of each byte of a and b, plus the rounding term 2 */
1087 +        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1088 +      uint32_t h0 = /* high 6 bits of each byte, pre-shifted so the packed sums cannot carry into the next byte */
1089 +        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1090 +      uint32_t l1, h1;
1091 +      pixels += line_size;
1092 +      for (i = 0; i < h; i += 2) { /* two output rows per iteration */
1093 +        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1094 +        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1095 +        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1096 +        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1097 +        *((uint32_t *) block) =
1098 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1099 +        pixels += line_size;
1100 +        block += line_size;
1101 +        a = (((const struct unaligned_32 *) (pixels))->l);
1102 +        b = (((const struct unaligned_32 *) (pixels + 1))->l);
1103 +        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
1104 +        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1105 +        *((uint32_t *) block) =
1106 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1107 +        pixels += line_size;
1108 +        block += line_size;
1109 +      } pixels += 4 - line_size * (h + 1); /* rewind to the top row, 4 columns further right */
1110 +      block += 4 - line_size * h;
1111 +    }
1112 +
1113 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1114 +
1115 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1116 +   register int i;
1117 +   register vector unsigned char
1118 +     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1119 +   register vector unsigned char
1120 +     blockv, temp1, temp2;
1121 +   register vector unsigned short
1122 +     pixelssum1, pixelssum2, temp3,
1123 +     pixelssum3, pixelssum4, temp4;
1124 +   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1125 +   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
1126 +
1127 +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
1128 +   /* prologue: horizontal pair sums of the first source row */
1129 +   temp1 = vec_ld(0, pixels);
1130 +   temp2 = vec_ld(16, pixels);
1131 +   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* row starting at pixels[0] */
1132 +   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap to offset 0 */
1133 +   {
1134 +     pixelsv2 = temp2;
1135 +   }
1136 +   else
1137 +   {
1138 +     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* row starting at pixels[1] */
1139 +   }
1140 +   pixelsv3 = vec_mergel(vczero, pixelsv1); /* pixels 8..15 widened to 16-bit lanes */
1141 +   pixelsv4 = vec_mergel(vczero, pixelsv2);
1142 +   pixelsv1 = vec_mergeh(vczero, pixelsv1); /* pixels 0..7 widened to 16-bit lanes */
1143 +   pixelsv2 = vec_mergeh(vczero, pixelsv2);
1144 +   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1145 +                        (vector unsigned short)pixelsv4);
1146 +   pixelssum3 = vec_add(pixelssum3, vctwo); /* rounding term, added once per output row */
1147 +   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1148 +                        (vector unsigned short)pixelsv2);
1149 +   pixelssum1 = vec_add(pixelssum1, vctwo);
1150 +
1151 +   for (i = 0; i < h ; i++) {
1152 +     blockv = vec_ld(0, block); /* NOTE(review): this value is never read; blockv is overwritten by vec_packsu below */
1153 +
1154 +     temp1 = vec_ld(line_size, pixels); /* next source row, same unaligned-load scheme as the prologue */
1155 +     temp2 = vec_ld(line_size + 16, pixels);
1156 +     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1157 +     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1158 +     {
1159 +       pixelsv2 = temp2;
1160 +     }
1161 +     else
1162 +     {
1163 +       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1164 +     }
1165 +
1166 +     pixelsv3 = vec_mergel(vczero, pixelsv1);
1167 +     pixelsv4 = vec_mergel(vczero, pixelsv2);
1168 +     pixelsv1 = vec_mergeh(vczero, pixelsv1);
1169 +     pixelsv2 = vec_mergeh(vczero, pixelsv2);
1170 +
1171 +     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1172 +                          (vector unsigned short)pixelsv4);
1173 +     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1174 +                          (vector unsigned short)pixelsv2);
1175 +     temp4 = vec_add(pixelssum3, pixelssum4); /* pixels 8..15: four-pixel sum + 2 */
1176 +     temp4 = vec_sra(temp4, vctwo); /* divide by 4 */
1177 +     temp3 = vec_add(pixelssum1, pixelssum2); /* pixels 0..7: four-pixel sum + 2 */
1178 +     temp3 = vec_sra(temp3, vctwo);
1179 +
1180 +     pixelssum3 = vec_add(pixelssum4, vctwo); /* this row's pair sums become the upper row of the next iteration */
1181 +     pixelssum1 = vec_add(pixelssum2, vctwo);
1182 +
1183 +     blockv = vec_packsu(temp3, temp4); /* 2 x 8 shorts packed back into 16 bytes */
1184 +
1185 +     vec_st(blockv, 0, block);
1186 +
1187 +     block += line_size;
1188 +     pixels += line_size;
1189 +   }
1190 +
1191 +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
1192 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1193 +}
1194 +
1195 +/* put_no_rnd_pixels16_xy2: like put_pixels16_xy2 but with bias 1 instead of 2: (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 1) >> 2; assumes ((line_size % 16) == 0) */
1196 +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
1197 +{
1198 +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
1199 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1200 +    int j;
1201 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1202 +      for (j = 0; j < 4; j++) { /* reference path: four passes, each 4 columns wide */
1203 +      int i;
1204 +      const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1205 +      const uint32_t b =
1206 +        (((const struct unaligned_32 *) (pixels + 1))->l);
1207 +      uint32_t l0 = /* low 2 bits of each byte of a and b, plus the bias 1 */
1208 +        (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1209 +      uint32_t h0 = /* high 6 bits of each byte, pre-shifted so the packed sums cannot carry into the next byte */
1210 +        ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1211 +      uint32_t l1, h1;
1212 +      pixels += line_size;
1213 +      for (i = 0; i < h; i += 2) { /* two output rows per iteration */
1214 +        uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
1215 +        uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
1216 +        l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
1217 +        h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1218 +        *((uint32_t *) block) =
1219 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1220 +        pixels += line_size;
1221 +        block += line_size;
1222 +        a = (((const struct unaligned_32 *) (pixels))->l);
1223 +        b = (((const struct unaligned_32 *) (pixels + 1))->l);
1224 +        l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
1225 +        h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
1226 +        *((uint32_t *) block) =
1227 +          h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
1228 +        pixels += line_size;
1229 +        block += line_size;
1230 +      } pixels += 4 - line_size * (h + 1); /* rewind to the top row, 4 columns further right */
1231 +      block += 4 - line_size * h;
1232 +    }
1233 +
1234 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1235 +
1236 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1237 +   register int i;
1238 +   register vector unsigned char
1239 +     pixelsv1, pixelsv2, pixelsv3, pixelsv4;
1240 +   register vector unsigned char
1241 +     blockv, temp1, temp2;
1242 +   register vector unsigned short
1243 +     pixelssum1, pixelssum2, temp3,
1244 +     pixelssum3, pixelssum4, temp4;
1245 +   register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
1246 +   register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); /* bias added to each sum */
1247 +   register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); /* shift count for the divide by 4 */
1248 +
1249 +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1250 +   /* prologue: horizontal pair sums of the first source row */
1251 +   temp1 = vec_ld(0, pixels);
1252 +   temp2 = vec_ld(16, pixels);
1253 +   pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); /* row starting at pixels[0] */
1254 +   if ((((unsigned long)pixels) & 0x0000000F) ==  0x0000000F) /* pixels+1 is 16-byte aligned: vec_lvsl(1, pixels) would wrap to offset 0 */
1255 +   {
1256 +     pixelsv2 = temp2;
1257 +   }
1258 +   else
1259 +   {
1260 +     pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); /* row starting at pixels[1] */
1261 +   }
1262 +   pixelsv3 = vec_mergel(vczero, pixelsv1); /* pixels 8..15 widened to 16-bit lanes */
1263 +   pixelsv4 = vec_mergel(vczero, pixelsv2);
1264 +   pixelsv1 = vec_mergeh(vczero, pixelsv1); /* pixels 0..7 widened to 16-bit lanes */
1265 +   pixelsv2 = vec_mergeh(vczero, pixelsv2);
1266 +   pixelssum3 = vec_add((vector unsigned short)pixelsv3,
1267 +                        (vector unsigned short)pixelsv4);
1268 +   pixelssum3 = vec_add(pixelssum3, vcone);
1269 +   pixelssum1 = vec_add((vector unsigned short)pixelsv1,
1270 +                        (vector unsigned short)pixelsv2);
1271 +   pixelssum1 = vec_add(pixelssum1, vcone);
1272 +
1273 +   for (i = 0; i < h ; i++) {
1274 +     blockv = vec_ld(0, block); /* NOTE(review): this value is never read; blockv is overwritten by vec_packsu below */
1275 +
1276 +     temp1 = vec_ld(line_size, pixels); /* next source row, same unaligned-load scheme as the prologue */
1277 +     temp2 = vec_ld(line_size + 16, pixels);
1278 +     pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1279 +     if (((((unsigned long)pixels) + line_size) & 0x0000000F) ==  0x0000000F)
1280 +     {
1281 +       pixelsv2 = temp2;
1282 +     }
1283 +     else
1284 +     {
1285 +       pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1286 +     }
1287 +
1288 +     pixelsv3 = vec_mergel(vczero, pixelsv1);
1289 +     pixelsv4 = vec_mergel(vczero, pixelsv2);
1290 +     pixelsv1 = vec_mergeh(vczero, pixelsv1);
1291 +     pixelsv2 = vec_mergeh(vczero, pixelsv2);
1292 +
1293 +     pixelssum4 = vec_add((vector unsigned short)pixelsv3,
1294 +                          (vector unsigned short)pixelsv4);
1295 +     pixelssum2 = vec_add((vector unsigned short)pixelsv1,
1296 +                          (vector unsigned short)pixelsv2);
1297 +     temp4 = vec_add(pixelssum3, pixelssum4); /* pixels 8..15: four-pixel sum + 1 */
1298 +     temp4 = vec_sra(temp4, vctwo); /* divide by 4 */
1299 +     temp3 = vec_add(pixelssum1, pixelssum2); /* pixels 0..7: four-pixel sum + 1 */
1300 +     temp3 = vec_sra(temp3, vctwo);
1301 +
1302 +     pixelssum3 = vec_add(pixelssum4, vcone); /* this row's pair sums become the upper row of the next iteration */
1303 +     pixelssum1 = vec_add(pixelssum2, vcone);
1304 +
1305 +     blockv = vec_packsu(temp3, temp4); /* 2 x 8 shorts packed back into 16 bytes */
1306 +
1307 +     vec_st(blockv, 0, block);
1308 +
1309 +     block += line_size;
1310 +     pixels += line_size;
1311 +   }
1312 +
1313 +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
1314 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1315 +}
1316 +/* has_altivec: returns 1 when the running CPU can execute AltiVec instructions, 0 otherwise */
1317 +int has_altivec(void)
1318 +{
1319 +#ifdef CONFIG_DARWIN
1320 +    int sels[2] = {CTL_HW, HW_VECTORUNIT};
1321 +    int has_vu = 0;
1322 +    size_t len = sizeof(has_vu);
1323 +    int err;
1324 +    /* Darwin: ask the kernel through sysctl {CTL_HW, HW_VECTORUNIT} */
1325 +    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
1326 +    /* when sysctl fails we fall through to the final "return 0" */
1327 +    if (err == 0) return (has_vu != 0);
1328 +#else /* CONFIG_DARWIN */
1329 +/* no Darwin, do it the brute-force way */
1330 +/* this is borrowed from the libmpeg2 library */
1331 +    {
1332 +      signal (SIGILL, sigill_handler); /* NOTE(review): any SIGILL handler installed earlier is not restored; SIG_DFL is set afterwards */
1333 +      if (sigsetjmp (jmpbuf, 1)) { /* non-zero: we came back through siglongjmp in sigill_handler, the probe trapped */
1334 +        signal (SIGILL, SIG_DFL);
1335 +      } else {
1336 +        canjump = 1; /* tells sigill_handler that jmpbuf is valid */
1337 +        /* probe: write SPR 256, then execute one vector instruction (vand) */
1338 +        asm volatile ("mtspr 256, %0\n\t"
1339 +                      "vand %%v0, %%v0, %%v0"
1340 +                      :
1341 +                      : "r" (-1));
1342 +        /* still here: no SIGILL was raised by the probe */
1343 +        signal (SIGILL, SIG_DFL);
1344 +        return 1;
1345 +      }
1346 +    }
1347 +#endif /* CONFIG_DARWIN */
1348 +    return 0;
1349 +}
1350 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c
1351 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/dsputil_ppc.c 1970-01-01 01:00:00.000000000 +0100
1352 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/dsputil_ppc.c      2003-09-28 17:26:40.000000000 +0200
1353 @@ -0,0 +1,307 @@
1354 +/*
1355 + * Copyright (c) 2002 Brian Foley
1356 + * Copyright (c) 2002 Dieter Shirley
1357 + *
1358 + * This library is free software; you can redistribute it and/or
1359 + * modify it under the terms of the GNU Lesser General Public
1360 + * License as published by the Free Software Foundation; either
1361 + * version 2 of the License, or (at your option) any later version.
1362 + *
1363 + * This library is distributed in the hope that it will be useful,
1364 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1365 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1366 + * Lesser General Public License for more details.
1367 + *
1368 + * You should have received a copy of the GNU Lesser General Public
1369 + * License along with this library; if not, write to the Free Software
1370 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
1371 + */
1372 +
1373 +#include "../dsputil.h"
1374 +
1375 +#include "dsputil_ppc.h"
1376 +
1377 +#ifdef HAVE_ALTIVEC
1378 +#include "dsputil_altivec.h"
1379 +#endif
1380 +
1381 +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);
1382 +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);
1383 +
1384 +int mm_flags = 0;
1385 +/* mm_support: returns MM_ALTIVEC when built with HAVE_ALTIVEC and has_altivec() is non-zero, otherwise 0 */
1386 +int mm_support(void)
1387 +{
1388 +    int result = 0;
1389 +#if HAVE_ALTIVEC
1390 +    if (has_altivec()) {
1391 +        result |= MM_ALTIVEC;
1392 +    }
1393 +#endif /* HAVE_ALTIVEC */
1394 +    return result;
1395 +}
1396 +
1397 +#ifdef POWERPC_PERFORMANCE_REPORT
1398 +unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total];
1399 +/* printable names for the perf report; list below must match enum in dsputil_ppc.h (same order) */
1400 +static const char* const perfname[] = {
1401 +  "fft_calc_altivec",
1402 +  "gmc1_altivec",
1403 +  "dct_unquantize_h263_altivec",
1404 +  "idct_add_altivec",
1405 +  "idct_put_altivec",
1406 +  "put_pixels16_altivec",
1407 +  "avg_pixels16_altivec",
1408 +  "avg_pixels8_altivec",
1409 +  "put_pixels8_xy2_altivec",
1410 +  "put_no_rnd_pixels8_xy2_altivec",
1411 +  "put_pixels16_xy2_altivec",
1412 +  "put_no_rnd_pixels16_xy2_altivec",
1413 +  "clear_blocks_dcbz32_ppc",
1414 +  "clear_blocks_dcbz128_ppc"
1415 +};
1416 +#include <stdio.h>
1417 +#endif
1418 +
1419 +#ifdef POWERPC_PERFORMANCE_REPORT
1420 +void powerpc_display_perf_report(void) /* prints min/max/avg/count of every recorded counter to stderr */
1421 +{
1422 +  int i, j;
1423 +  fprintf(stderr, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n");
1424 +  for(i = 0 ; i < powerpc_perf_total ; i++) /* i: instrumented function, also the index into perfname[] */
1425 +  {
1426 +    for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) /* j: counter index, printed as pmc<j+1> */
1427 +      {
1428 +       if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) /* skip entries with no samples (also avoids dividing by 0) */
1429 +         fprintf(stderr,
1430 +                 " Function \"%s\" (pmc%d):\n\tmin: %llu\n\tmax: %llu\n\tavg: %1.2lf (%llu)\n",
1431 +                 perfname[i],
1432 +                 j+1,
1433 +                 perfdata[j][i][powerpc_data_min],
1434 +                 perfdata[j][i][powerpc_data_max],
1435 +                 (double)perfdata[j][i][powerpc_data_sum] / /* average = sum / number of samples */
1436 +                 (double)perfdata[j][i][powerpc_data_num],
1437 +                 perfdata[j][i][powerpc_data_num]);
1438 +      }
1439 +  }
1440 +}
1441 +#endif /* POWERPC_PERFORMANCE_REPORT */
1442 +
1443 +/* ***** WARNING ***** WARNING ***** WARNING ***** */
1444 +/*
1445 +  clear_blocks_dcbz32_ppc will not work properly
1446 +  on PowerPC processors with a cache line size
1447 +  not equal to 32 bytes.
1448 +  Fortunately all processors used by Apple up to
1449 +  at least the 7450 (aka second generation G4)
1450 +  use a 32-byte cache line.
1451 +  This is due to the use of the 'dcbz' instruction.
1452 +  It simply clears a single cache line to zero,
1453 +  so you need to know the cache line size to use it!
1454 +  It's absurd, but it's fast...
1455 +
1456 +  Update 24/06/2003: Apple released the G5 yesterday,
1457 +  with a PPC970. Cache line size: 128 bytes. Oops.
1458 +  The semantics of dcbz were changed: it always clears
1459 +  32 bytes, so the function below will work, but will
1460 +  be slow. So I fixed check_dcbzl_effect to use dcbzl,
1461 +  which is defined to clear a cache line (as dcbz did before).
1462 +  So we can still distinguish, and use dcbz (32 bytes)
1463 +  or dcbzl (one cache line) as required.
1464 +
1465 +  see <http://developer.apple.com/technotes/tn/tn2087.html>
1466 +  and <http://developer.apple.com/technotes/tn/tn2086.html>
1467 +*/
1468 +void clear_blocks_dcbz32_ppc(DCTELEM *blocks)
1469 +{
1470 +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1);
1471 +    /* Zeroes the sizeof(DCTELEM)*6*64 bytes at blocks, using dcbz for every
1472 +       whole 32-byte cache line.  Only address bit 4 is tested, so blocks is
1473 +       assumed to be at least 16-byte aligned.  misal != 0: blocks starts in
1474 +       the upper 16-byte half of a cache line. */
1475 +    register int misal = ((unsigned long)blocks & 0x00000010);
1476 +    register int i = 0;
1477 +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1);
1478 +#if 1
1479 +    if (misal) {
1480 +      /* head: 16 bytes in front of the first whole cache line; memset, as
1481 +         unsigned long stores would cover 32 bytes on 64-bit PowerPC */
1482 +      memset(blocks, 0, 16);
1483 +      i += 16;
1484 +    }
1485 +    /* "- 31": stop before a cache line that would reach past the buffer */
1486 +    for ( ; i < sizeof(DCTELEM)*6*64 - 31 ; i += 32) {
1487 +      asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory");
1488 +    }
1489 +    if (misal) {
1490 +      /* tail: the last 16 bytes, which only half-fill their cache line */
1491 +      memset((char*)blocks + sizeof(DCTELEM)*6*64 - 16, 0, 16);
1492 +    }
1493 +#else
1494 +    memset(blocks, 0, sizeof(DCTELEM)*6*64);
1495 +#endif
1496 +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1);
1497 +}
1497 +
1498 +/* same as above, for CPUs where dcbzl clears a whole 128-byte
1499 +   cache line, i.e. the PPC970 aka G5 */
1500 +#ifndef NO_DCBZL
1501 +void clear_blocks_dcbz128_ppc(DCTELEM *blocks) /* zeroes sizeof(DCTELEM)*6*64 bytes, one dcbzl per 128 bytes when possible */
1502 +{
1503 +POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1);
1504 +    register int misal = ((unsigned long)blocks & 0x0000007f); /* non-zero unless blocks is 128-byte aligned */
1505 +    register int i = 0;
1506 +POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1);
1507 +#if 1
1508 + if (misal) {
1509 +   // unaligned buffer: plain memset. We could probably also
1510 +   // optimize this case, but there's not much point as the
1511 +   // machines aren't available yet (2003-06-26)
1512 +      memset(blocks, 0, sizeof(DCTELEM)*6*64);
1513 +    }
1514 +    else
1515 +      for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { /* aligned: step through the buffer 128 bytes at a time */
1516 +       asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory");
1517 +      }
1518 +#else
1519 +    memset(blocks, 0, sizeof(DCTELEM)*6*64);
1520 +#endif
1521 +POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1);
1522 +}
1523 +#else
1524 +void clear_blocks_dcbz128_ppc(DCTELEM *blocks)
1525 +{
1526 +  memset(blocks, 0, sizeof(DCTELEM)*6*64);
1527 +}
1528 +#endif
1529 +
1530 +#ifndef NO_DCBZL
1531 +/* check_dcbzl_effect: returns how many bytes a single dcbzl sets to 0
1532 +   (0 when the scratch buffer cannot be allocated).
1533 +   update 24/06/2003 : dcbz replaced by dcbzl to get the intended
1534 +   effect (Apple "fixed" dcbz); unfortunately this cannot be used
1535 +   unless the assembler knows about dcbzl ... */
1536 +long check_dcbzl_effect(void)
1537 +{
1538 +  register char *fakedata = (char*)av_malloc(1024);
1539 +  register char *fakedata_middle;
1540 +  register long zero = 0;
1541 +  register long i = 0;
1542 +  long count = 0;
1543 +
1544 +  if (!fakedata)
1545 +  {
1546 +    return 0L;
1547 +  }
1548 +  /* aim at the middle so the cleared line lies inside the buffer */
1549 +  fakedata_middle = (fakedata + 512);
1550 +  /* non-zero fill: every byte found at 0 afterwards was cleared by dcbzl */
1551 +  memset(fakedata, 0xFF, 1024);
1552 +  /* "memory" clobber: dcbzl rewrites fakedata behind the compiler's back */
1553 +  /* below the constraint "b" seems to mean "Address base register"
1554 +     in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */
1555 +  asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero) : "memory");
1556 +
1557 +  for (i = 0; i < 1024 ; i ++)
1558 +  {
1559 +    if (fakedata[i] == (char)0)
1560 +      count++;
1561 +  }
1562 +
1563 +  av_free(fakedata);
1564 +
1565 +  return count;
1566 +}
1567 +#else
1568 +long check_dcbzl_effect(void)
1569 +{
1570 +  return 0;
1571 +}
1572 +#endif
1573 +
1574 +void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx)
1575 +{
1576 +    // Fill c with the PowerPC routines; this first part runs whether AltiVec is available or not
1577 +    // check_dcbzl_effect() returns a zeroed-byte count; 32 and 128 each select a clear_blocks variant
1578 +  switch (check_dcbzl_effect()) {
1579 +  case 32:
1580 +    c->clear_blocks = clear_blocks_dcbz32_ppc;
1581 +    break;
1582 +  case 128:
1583 +    c->clear_blocks = clear_blocks_dcbz128_ppc;
1584 +    break;
1585 +  default: // any other count: c->clear_blocks is left untouched
1586 +    break;
1587 +  }
1588 +
1589 +#if HAVE_ALTIVEC
1590 +    if (has_altivec()) {
1591 +        mm_flags |= MM_ALTIVEC;
1592 +
1593 +        // AltiVec-specific optimisations
1594 +        c->pix_abs16x16_x2 = pix_abs16x16_x2_altivec;
1595 +        c->pix_abs16x16_y2 = pix_abs16x16_y2_altivec;
1596 +        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_altivec;
1597 +        c->pix_abs16x16 = pix_abs16x16_altivec;
1598 +        c->pix_abs8x8 = pix_abs8x8_altivec;
1599 +        c->sad[0]= sad16x16_altivec;
1600 +        c->sad[1]= sad8x8_altivec;
1601 +        c->pix_norm1 = pix_norm1_altivec;
1602 +        c->sse[1]= sse8_altivec;
1603 +        c->sse[0]= sse16_altivec;
1604 +        c->pix_sum = pix_sum_altivec;
1605 +        c->diff_pixels = diff_pixels_altivec;
1606 +        c->get_pixels = get_pixels_altivec;
1607 +// next one disabled as it's untested.
1608 +#if 0
1609 +        c->add_bytes= add_bytes_altivec;
1610 +#endif /* 0 */
1611 +        c->put_pixels_tab[0][0] = put_pixels16_altivec;
1612 +        /* the two functions do the same thing, so use the same code */
1613 +        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
1614 +        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
1615 +// next one disabled as it's untested.
1616 +#if 0
1617 +        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
1618 +#endif /* 0 */
1619 +        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
1620 +        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
1621 +        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
1622 +        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
1623 +
1624 +       c->gmc1 = gmc1_altivec;
1625 +        // the IDCT entries are replaced only for the two idct_algo values below; any other value leaves them alone
1626 +        if ((avctx->idct_algo == FF_IDCT_AUTO) ||
1627 +                (avctx->idct_algo == FF_IDCT_ALTIVEC))
1628 +        {
1629 +            c->idct_put = idct_put_altivec;
1630 +            c->idct_add = idct_add_altivec;
1631 +#ifndef ALTIVEC_USE_REFERENCE_C_CODE
1632 +            c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
1633 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1634 +            c->idct_permutation_type = FF_NO_IDCT_PERM;
1635 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1636 +        }
1637 +        // reset every perfdata slot: min starts at all-ones, max/sum/num start at zero
1638 +#ifdef POWERPC_PERFORMANCE_REPORT
1639 +        {
1640 +          int i, j;
1641 +          for (i = 0 ; i < powerpc_perf_total ; i++)
1642 +          {
1643 +           for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++)
1644 +             {
1645 +               perfdata[j][i][powerpc_data_min] = (unsigned long long)0xFFFFFFFFFFFFFFFF;
1646 +               perfdata[j][i][powerpc_data_max] = (unsigned long long)0x0000000000000000;
1647 +               perfdata[j][i][powerpc_data_sum] = (unsigned long long)0x0000000000000000;
1648 +               perfdata[j][i][powerpc_data_num] = (unsigned long long)0x0000000000000000;
1649 +             }
1650 +         }
1651 +        }
1652 +#endif /* POWERPC_PERFORMANCE_REPORT */
1653 +    } else
1654 +#endif /* HAVE_ALTIVEC */
1655 +    {
1656 +        // Non-AltiVec PPC optimisations
1657 +
1658 +        // ... pending ...
1659 +    }
1660 +}
1661 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c
1662 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/fft_altivec.c 1970-01-01 01:00:00.000000000 +0100
1663 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/fft_altivec.c      2003-09-28 17:26:40.000000000 +0200
1664 @@ -0,0 +1,247 @@
1665 +/*
1666 + * FFT/IFFT transforms
1667 + * AltiVec-enabled
1668 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1669 + * Based on code Copyright (c) 2002 Fabrice Bellard.
1670 + *
1671 + * This library is free software; you can redistribute it and/or
1672 + * modify it under the terms of the GNU Lesser General Public
1673 + * License as published by the Free Software Foundation; either
1674 + * version 2 of the License, or (at your option) any later version.
1675 + *
1676 + * This library is distributed in the hope that it will be useful,
1677 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1678 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1679 + * Lesser General Public License for more details.
1680 + *
1681 + * You should have received a copy of the GNU Lesser General Public
1682 + * License along with this library; if not, write to the Free Software
1683 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
1684 + */
1685 +#include "../dsputil.h"
1686 +
1687 +#include "gcc_fixes.h"
1688 +
1689 +#include "dsputil_altivec.h"
1690 +
1691 +/*
1692 +  These three macros are from libavcodec/fft.c
1693 +  and are required for the reference C code
1694 +*/
1695 +/* butterfly op: p = p1 + q1 and q = p1 - q1; inputs are copied to locals first, so outputs may alias them */
1696 +#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
1697 +{\
1698 +  FFTSample ax, ay, bx, by;\
1699 +  bx=pre1;\
1700 +  by=pim1;\
1701 +  ax=qre1;\
1702 +  ay=qim1;\
1703 +  pre = (bx + ax);\
1704 +  pim = (by + ay);\
1705 +  qre = (bx - ax);\
1706 +  qim = (by - ay);\
1707 +}
1708 +#define MUL16(a,b) ((a) * (b))
1709 +#define CMUL(pre, pim, are, aim, bre, bim) \
1710 +{ /* complex product: (pre + i*pim) = (are + i*aim) * (bre + i*bim) */\
1711 +   pre = (MUL16(are, bre) - MUL16(aim, bim));\
1712 +   pim = (MUL16(are, bim) + MUL16(bre, aim));\
1713 +}
1714 +
1715 +
1716 +/**
1717 + * Do an in-place complex FFT on z with the parameters defined in fft_init().
1718 + * The input data must already be permuted with the s->revtab table. No
1719 + * 1.0/sqrt(n) normalization is done.
1720 + * AltiVec-enabled (plain C when ALTIVEC_USE_REFERENCE_C_CODE is defined).
1721 + * This code assumes that the 'z' pointer is 16-byte aligned.
1722 + * It also assumes every FFTComplex is an 8-byte-aligned pair of floats.
1723 + * The code is exactly the same as the SSE version, except
1724 + * that successive MUL + ADD/SUB have been merged into
1725 + * fused multiply-add ('vec_madd' in AltiVec).
1726 + */
1727 +void fft_calc_altivec(FFTContext *s, FFTComplex *z)
1728 +{
1729 +POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
1730 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
1731 +    int ln = s->nbits;
1732 +    int        j, np, np2;
1733 +    int        nblocks, nloops;
1734 +    register FFTComplex *p, *q;
1735 +    FFTComplex *exptab = s->exptab;
1736 +    int l;
1737 +    FFTSample tmp_re, tmp_im;
1738 +
1739 +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1740 +
1741 +    np = 1 << ln; /* 2^nbits entries of z are processed */
1742 +
1743 +    /* pass 0: butterfly each adjacent pair z[2k], z[2k+1] */
1744 +
1745 +    p=&z[0];
1746 +    j=(np >> 1);
1747 +    do {
1748 +        BF(p[0].re, p[0].im, p[1].re, p[1].im,
1749 +           p[0].re, p[0].im, p[1].re, p[1].im);
1750 +        p+=2;
1751 +    } while (--j != 0);
1752 +
1753 +    /* pass 1: butterflies at distance 2; p[3] is multiplied by +i (inverse) or -i (forward) on the way in */
1754 +
1755 +
1756 +    p=&z[0];
1757 +    j=np >> 2;
1758 +    if (s->inverse) {
1759 +        do {
1760 +            BF(p[0].re, p[0].im, p[2].re, p[2].im,
1761 +               p[0].re, p[0].im, p[2].re, p[2].im);
1762 +            BF(p[1].re, p[1].im, p[3].re, p[3].im,
1763 +               p[1].re, p[1].im, -p[3].im, p[3].re);
1764 +            p+=4;
1765 +        } while (--j != 0);
1766 +    } else {
1767 +        do {
1768 +            BF(p[0].re, p[0].im, p[2].re, p[2].im,
1769 +               p[0].re, p[0].im, p[2].re, p[2].im);
1770 +            BF(p[1].re, p[1].im, p[3].re, p[3].im,
1771 +               p[1].re, p[1].im, p[3].im, -p[3].re);
1772 +            p+=4;
1773 +        } while (--j != 0);
1774 +    }
1775 +    /* pass 2 .. ln-1: each pass halves nblocks and doubles nloops; q runs nloops entries ahead of p */
1776 +
1777 +    nblocks = np >> 3;
1778 +    nloops = 1 << 2;
1779 +    np2 = np >> 1;
1780 +    do {
1781 +        p = z;
1782 +        q = z + nloops;
1783 +        for (j = 0; j < nblocks; ++j) {
1784 +            BF(p->re, p->im, q->re, q->im,
1785 +               p->re, p->im, q->re, q->im);
1786 +            /* the first pair of a block is combined as is; the remaining pairs multiply q by exptab[l] first */
1787 +            p++;
1788 +            q++;
1789 +            for(l = nblocks; l < np2; l += nblocks) {
1790 +                CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
1791 +                BF(p->re, p->im, q->re, q->im,
1792 +                   p->re, p->im, tmp_re, tmp_im);
1793 +                p++;
1794 +                q++;
1795 +            }
1796 +
1797 +            p += nloops;
1798 +            q += nloops;
1799 +        }
1800 +        nblocks = nblocks >> 1;
1801 +        nloops = nloops << 1;
1802 +    } while (nblocks != 0);
1803 +
1804 +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1805 +
1806 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
1807 +#ifdef CONFIG_DARWIN
1808 +    register const vector float vczero = (const vector float)(0.);
1809 +#else
1810 +    register const vector float vczero = (const vector float){0.,0.,0.,0.};
1811 +#endif
1812 +
1813 +    int ln = s->nbits;
1814 +    int        j, np, np2;
1815 +    int        nblocks, nloops;
1816 +    register FFTComplex *p, *q;
1817 +    FFTComplex *cptr, *cptr1;
1818 +    int k;
1819 +
1820 +POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
1821 +
1822 +    np = 1 << ln;
1823 +
1824 +    {
1825 +        vector float *r, a, b, a1, c1, c2;
1826 +        /* passes 0 and 1 are fused: each iteration loads and stores two vectors at r, np/4 iterations in all */
1827 +        r = (vector float *)&z[0];
1828 +        /* c1 and c2 come from vcii(), defined elsewhere; presumably +1/-1 sign patterns -- TODO confirm */
1829 +        c1 = vcii(p,p,n,n);
1830 +
1831 +        if (s->inverse)
1832 +            {
1833 +                c2 = vcii(p,p,n,p);
1834 +            }
1835 +        else
1836 +            {
1837 +                c2 = vcii(p,p,p,n);
1838 +            }
1839 +
1840 +        j = (np >> 2);
1841 +        do {
1842 +            a = vec_ld(0, r);
1843 +            a1 = vec_ld(sizeof(vector float), r);
1844 +
1845 +            b = vec_perm(a,a,vcprmle(1,0,3,2));
1846 +            a = vec_madd(a,c1,b);
1847 +            /* do the pass 0 butterfly */
1848 +
1849 +            b = vec_perm(a1,a1,vcprmle(1,0,3,2));
1850 +            b = vec_madd(a1,c1,b);
1851 +            /* do the pass 0 butterfly */
1852 +
1853 +            /* multiply third by -i */
1854 +            b = vec_perm(b,b,vcprmle(2,3,1,0));
1855 +
1856 +            /* do the pass 1 butterfly */
1857 +            vec_st(vec_madd(b,c2,a), 0, r);
1858 +            vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
1859 +
1860 +            r += 2;
1861 +        } while (--j != 0);
1862 +    }
1863 +    /* pass 2 .. ln-1: each pass halves nblocks and doubles nloops; q runs nloops entries ahead of p */
1864 +
1865 +    nblocks = np >> 3;
1866 +    nloops = 1 << 2;
1867 +    np2 = np >> 1;
1868 +    /* s->exptab1 is walked linearly: cptr1 moves on by nloops*2 entries after each pass */
1869 +    cptr1 = s->exptab1;
1870 +    do {
1871 +        p = z;
1872 +        q = z + nloops;
1873 +        j = nblocks;
1874 +        do {
1875 +            cptr = cptr1;
1876 +            k = nloops >> 1;
1877 +            do {
1878 +                vector float a,b,c,t1;
1879 +                /* one vector (two FFTComplex) from p and from q per iteration; cptr supplies two coefficient vectors */
1880 +                a = vec_ld(0, (float*)p);
1881 +                b = vec_ld(0, (float*)q);
1882 +
1883 +                /* complex mul */
1884 +                c = vec_ld(0, (float*)cptr);
1885 +                /*  cre*re cim*re */
1886 +                t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
1887 +                c = vec_ld(sizeof(vector float), (float*)cptr);
1888 +                /*  -cim*im cre*im */
1889 +                b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
1890 +
1891 +                /* butterfly */
1892 +                vec_st(vec_add(a,b), 0, (float*)p);
1893 +                vec_st(vec_sub(a,b), 0, (float*)q);
1894 +
1895 +                p += 2;
1896 +                q += 2;
1897 +                cptr += 4;
1898 +            } while (--k);
1899 +
1900 +            p += nloops;
1901 +            q += nloops;
1902 +        } while (--j);
1903 +        cptr1 += nloops * 2;
1904 +        nblocks = nblocks >> 1;
1905 +        nloops = nloops << 1;
1906 +    } while (nblocks != 0);
1907 +
1908 +POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
1909 +
1910 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
1911 +}
1912 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h
1913 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gcc_fixes.h   2003-07-04 15:40:29.000000000 +0200
1914 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gcc_fixes.h        2003-09-28 17:26:40.000000000 +0200
1915 @@ -25,7 +25,7 @@
1916   * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html
1917   */
1918
1919 -static inline vector signed char my_vmrglb (vector signed char const A,
1920 +static inline vector signed char ff_vmrglb (vector signed char const A,
1921                                           vector signed char const B)
1922  {
1923      static const vector unsigned char lowbyte = {
1924 @@ -35,7 +35,7 @@
1925      return vec_perm (A, B, lowbyte);
1926  }
1927
1928 -static inline vector signed short my_vmrglh (vector signed short const A,
1929 +static inline vector signed short ff_vmrglh (vector signed short const A,
1930                                           vector signed short const B)
1931  {
1932      static const vector unsigned char lowhalf = {
1933 @@ -45,7 +45,7 @@
1934      return vec_perm (A, B, lowhalf);
1935  }
1936
1937 -static inline vector signed int my_vmrglw (vector signed int const A,
1938 +static inline vector signed int ff_vmrglw (vector signed int const A,
1939                                           vector signed int const B)
1940  {
1941      static const vector unsigned char lowword = {
1942 @@ -54,27 +54,27 @@
1943      };
1944      return vec_perm (A, B, lowword);
1945  }
1946 -/*#define my_vmrglb my_vmrglb
1947 -#define my_vmrglh my_vmrglh
1948 -#define my_vmrglw my_vmrglw
1949 +/*#define ff_vmrglb ff_vmrglb
1950 +#define ff_vmrglh ff_vmrglh
1951 +#define ff_vmrglw ff_vmrglw
1952  */
1953  #undef vec_mergel
1954
1955  #define vec_mergel(a1, a2) \
1956  __ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \
1957 -      ((vector signed char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1958 +      ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1959  __ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \
1960 -      ((vector unsigned char) my_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1961 +      ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \
1962  __ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \
1963 -      ((vector signed short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1964 +      ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1965  __ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \
1966 -      ((vector unsigned short) my_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1967 +      ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \
1968  __ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \
1969 -      ((vector float) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1970 +      ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1971  __ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \
1972 -      ((vector signed int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1973 +      ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1974  __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \
1975 -      ((vector unsigned int) my_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1976 +      ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \
1977      __altivec_link_error_invalid_argument ())))))))
1978
1979  #endif
1980 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c
1981 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/gmc_altivec.c 1970-01-01 01:00:00.000000000 +0100
1982 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/gmc_altivec.c      2003-09-28 17:26:40.000000000 +0200
1983 @@ -0,0 +1,172 @@
1984 +/*
1985 + * GMC (Global Motion Compensation)
1986 + * AltiVec-enabled
1987 + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
1988 + *
1989 + * This library is free software; you can redistribute it and/or
1990 + * modify it under the terms of the GNU Lesser General Public
1991 + * License as published by the Free Software Foundation; either
1992 + * version 2 of the License, or (at your option) any later version.
1993 + *
1994 + * This library is distributed in the hope that it will be useful,
1995 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1996 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
1997 + * Lesser General Public License for more details.
1998 + *
1999 + * You should have received a copy of the GNU Lesser General Public
2000 + * License along with this library; if not, write to the Free Software
2001 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
2002 + */
2003 +
2004 +#include "../dsputil.h"
2005 +
2006 +#include "gcc_fixes.h"
2007 +
2008 +#include "dsputil_altivec.h"
2009 +
2010 +/*
2011 +  AltiVec-enhanced gmc1. ATM this code assumes stride is a multiple of 8,
2012 +  to preserve proper dst alignment.
2013 +*/
2014 +#define GMC1_PERF_COND (h==8)
2015 +void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder)
2016 +{ /* h rows of 8 pixels: dst[k] = (A*src[k] + B*src[k+1] + C*src[stride+k] + D*src[stride+k+1] + rounder) >> 8 */
2017 +POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND);
2018 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2019 +    const int A=(16-x16)*(16-y16);
2020 +    const int B=(   x16)*(16-y16);
2021 +    const int C=(16-x16)*(   y16);
2022 +    const int D=(   x16)*(   y16);
2023 +    int i;
2024 +
2025 +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2026 +
2027 +    for(i=0; i<h; i++)
2028 +    {
2029 +        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
2030 +        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
2031 +        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
2032 +        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
2033 +        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
2034 +        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
2035 +        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
2036 +        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
2037 +        dst+= stride;
2038 +        src+= stride;
2039 +    }
2040 +
2041 +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2042 +
2043 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2044 +    const unsigned short __attribute__ ((aligned(16))) rounder_a[8] =
2045 +      {rounder, rounder, rounder, rounder,
2046 +       rounder, rounder, rounder, rounder};
2047 +    const unsigned short __attribute__ ((aligned(16))) ABCD[8] =
2048 +      {
2049 +        (16-x16)*(16-y16), /* A */
2050 +        (   x16)*(16-y16), /* B */
2051 +        (16-x16)*(   y16), /* C */
2052 +        (   x16)*(   y16), /* D */
2053 +        0, 0, 0, 0         /* padding */
2054 +      };
2055 +    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
2056 +    register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8);
2057 +    register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD;
2058 +    register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD;
2059 +    int i;
2060 +    unsigned long dst_odd = (unsigned long)dst & 0x0000000F;
2061 +    unsigned long src_really_odd = (unsigned long)src & 0x0000000F;
2062 +
2063 +
2064 +POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2065 +    // splat each of the four weights A..D across all eight 16-bit lanes
2066 +    tempA = vec_ld(0, (unsigned short*)ABCD);
2067 +    Av = vec_splat(tempA, 0);
2068 +    Bv = vec_splat(tempA, 1);
2069 +    Cv = vec_splat(tempA, 2);
2070 +    Dv = vec_splat(tempA, 3);
2071 +
2072 +    rounderV = vec_ld(0, (unsigned short*)rounder_a);
2073 +
2074 +    // we'll be able to pick-up our 9 char elements
2075 +    // at src from those 32 bytes
2076 +    // we load the first batch here, as inside the loop
2077 +    // we can re-use 'src+stride' from one iteration
2078 +    // as the 'src' of the next.
2079 +    src_0 = vec_ld(0, src);
2080 +    src_1 = vec_ld(16, src);
2081 +    srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src));
2082 +
2083 +    if (src_really_odd != 0x0000000F)
2084 +    { // if src & 0xF == 0xF, then (src+1) is 16-byte aligned and starts the second vector, so src_1 is used as is.
2085 +      srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src));
2086 +    }
2087 +    else
2088 +    {
2089 +      srcvB = src_1;
2090 +    }
2091 +    srcvA = vec_mergeh(vczero, srcvA); // interleave with zero bytes: the first 8 pixels become 16-bit lanes
2092 +    srcvB = vec_mergeh(vczero, srcvB);
2093 +
2094 +    for(i=0; i<h; i++)
2095 +    {
2096 +      dst_odd = (unsigned long)dst & 0x0000000F; // 0 or 8 when dst is 8-byte aligned as the signature notes
2097 +      src_really_odd = (((unsigned long)src) + stride) & 0x0000000F;
2098 +
2099 +      dstv = vec_ld(0, dst);
2100 +
2101 +      // we'll be able to pick-up our 9 char elements
2102 +      // at src + stride from those 32 bytes
2103 +      // then reuse the resulting 2 vectors srcvC and srcvD
2104 +      // as the next srcvA and srcvB
2105 +      src_0 = vec_ld(stride + 0, src);
2106 +      src_1 = vec_ld(stride + 16, src);
2107 +      srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src));
2108 +
2109 +      if (src_really_odd != 0x0000000F)
2110 +      { // if (src + stride) & 0xF == 0xF, then (src+stride+1) is 16-byte aligned and starts the second vector, so src_1 is used as is.
2111 +        srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src));
2112 +      }
2113 +      else
2114 +      {
2115 +        srcvD = src_1;
2116 +      }
2117 +
2118 +      srcvC = vec_mergeh(vczero, srcvC);
2119 +      srcvD = vec_mergeh(vczero, srcvD);
2120 +
2121 +
2122 +      // OK, now we (finally) do the math :-)
2123 +      // those four instructions replace 32 int muls & 32 int adds.
2124 +      // isn't AltiVec nice ?
2125 +      tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV);
2126 +      tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA);
2127 +      tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB);
2128 +      tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC);
2129 +
2130 +      srcvA = srcvC;
2131 +      srcvB = srcvD;
2132 +
2133 +      tempD = vec_sr(tempD, vcsr8);
2134 +      // pack back to bytes: the 8 results land in the first half of dstv2, zeros in the second
2135 +      dstv2 = vec_pack(tempD, (vector unsigned short)vczero);
2136 +      // keep the 8 bytes of dstv that are not ours; vcprm (defined elsewhere) presumably selects words from dstv (0-3) and dstv2 (s0-s3) -- TODO confirm
2137 +      if (dst_odd)
2138 +      {
2139 +        dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1));
2140 +      }
2141 +      else
2142 +      {
2143 +        dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3));
2144 +      }
2145 +
2146 +      vec_st(dstv2, 0, dst);
2147 +
2148 +      dst += stride;
2149 +      src += stride;
2150 +    }
2151 +
2152 +POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND);
2153 +
2154 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2155 +}
2156 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c
2157 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/idct_altivec.c        1970-01-01 01:00:00.000000000 +0100
2158 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/idct_altivec.c     2003-09-28 17:26:40.000000000 +0200
2159 @@ -0,0 +1,245 @@
2160 +/*
2161 + * Copyright (c) 2001 Michel Lespinasse
2162 + *
2163 + * This library is free software; you can redistribute it and/or
2164 + * modify it under the terms of the GNU Lesser General Public
2165 + * License as published by the Free Software Foundation; either
2166 + * version 2 of the License, or (at your option) any later version.
2167 + *
2168 + * This library is distributed in the hope that it will be useful,
2169 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2170 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2171 + * Lesser General Public License for more details.
2172 + *
2173 + * You should have received a copy of the GNU Lesser General Public
2174 + * License along with this library; if not, write to the Free Software
2175 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
2176 + *
2177 + */
2178 +
2179 +/*
2180 + * NOTE: This code is based on GPL code from the libmpeg2 project.  The
2181 + * author, Michel Lespinasse, has given explicit permission to release
2182 + * under LGPL as part of ffmpeg.
2183 + *
2184 + */
2185 +
2186 +/*
2187 + * FFMpeg integration by Dieter Shirley
2188 + *
2189 + * This file is a direct copy of the altivec idct module from the libmpeg2
2190 + * project.  I've deleted all of the libmpeg2 specific code, renamed the functions and
2191 + * re-ordered the function parameters.  The only change to the IDCT function
2192 + * itself was to factor out the partial transposition, and to perform a full
2193 + * transpose at the end of the function.
2194 + */
2195 +
2196 +
2197 +#include <stdlib.h>                                      /* malloc(), free() */
2198 +#include <string.h>
2199 +#include "../dsputil.h"
2200 +
2201 +#include "gcc_fixes.h"
2202 +
2203 +#include "dsputil_altivec.h"
2204 +
2205 +#define vector_s16_t vector signed short
2206 +#define vector_u16_t vector unsigned short
2207 +#define vector_s8_t vector signed char
2208 +#define vector_u8_t vector unsigned char
2209 +#define vector_s32_t vector signed int
2210 +#define vector_u32_t vector unsigned int
2211 +
2212 +#define IDCT_HALF /* vx0..vx7 -> vy0..vy7 (one pass) */ \
2213 +    /* 1st stage: odd inputs vx1, vx3, vx5, vx7 */     \
2214 +    t1 = vec_mradds (a1, vx7, vx1 );                   \
2215 +    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));   \
2216 +    t7 = vec_mradds (a2, vx5, vx3);                    \
2217 +    t3 = vec_mradds (ma2, vx3, vx5);                   \
2218 +                                                       \
2219 +    /* 2nd stage: even inputs vx0, vx2, vx4, vx6 */    \
2220 +    t5 = vec_adds (vx0, vx4);                          \
2221 +    t0 = vec_subs (vx0, vx4);                          \
2222 +    t2 = vec_mradds (a0, vx6, vx2);                    \
2223 +    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));   \
2224 +    t6 = vec_adds (t8, t3);                            \
2225 +    t3 = vec_subs (t8, t3);                            \
2226 +    t8 = vec_subs (t1, t7);                            \
2227 +    t1 = vec_adds (t1, t7);                            \
2228 +                                                       \
2229 +    /* 3rd stage */                                    \
2230 +    t7 = vec_adds (t5, t2);                            \
2231 +    t2 = vec_subs (t5, t2);                            \
2232 +    t5 = vec_adds (t0, t4);                            \
2233 +    t0 = vec_subs (t0, t4);                            \
2234 +    t4 = vec_subs (t8, t3);                            \
2235 +    t3 = vec_adds (t8, t3);                            \
2236 +                                                       \
2237 +    /* 4th stage: write the outputs vy0..vy7 */        \
2238 +    vy0 = vec_adds (t7, t1);                           \
2239 +    vy7 = vec_subs (t7, t1);                           \
2240 +    vy1 = vec_mradds (c4, t3, t5);                     \
2241 +    vy6 = vec_mradds (mc4, t3, t5);                    \
2242 +    vy2 = vec_mradds (c4, t4, t0);                     \
2243 +    vy5 = vec_mradds (mc4, t4, t0);                    \
2244 +    vy3 = vec_adds (t2, t6);                           \
2245 +    vy4 = vec_subs (t2, t6);
2246 +
2247 +
2248 +#define IDCT /* in: block[0..7], constants[]; out: vx0..vx7 */         \
2249 +    vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;               \
2250 +    vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;               \
2251 +    vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias;                 \
2252 +    vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8;                   \
2253 +    vector_u16_t shift;                                                        \
2254 +    /* splat the coefficients and bias out of constants[0] */          \
2255 +    c4 = vec_splat (constants[0], 0);                                  \
2256 +    a0 = vec_splat (constants[0], 1);                                  \
2257 +    a1 = vec_splat (constants[0], 2);                                  \
2258 +    a2 = vec_splat (constants[0], 3);                                  \
2259 +    mc4 = vec_splat (constants[0], 4);                                 \
2260 +    ma2 = vec_splat (constants[0], 5);                                 \
2261 +    bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3);    \
2262 +                                                                       \
2263 +    zero = vec_splat_s16 (0);                                          \
2264 +    shift = vec_splat_u16 (4);                                         \
2265 +    /* pre-scale the rows with constants[1..4] */                      \
2266 +    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);   \
2267 +    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);   \
2268 +    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);   \
2269 +    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);   \
2270 +    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);   \
2271 +    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);   \
2272 +    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);   \
2273 +    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);   \
2274 +                                                                       \
2275 +    IDCT_HALF                                                          \
2276 +    /* three merge rounds = 8x8 transpose of vy0..vy7 */               \
2277 +    vx0 = vec_mergeh (vy0, vy4);                                       \
2278 +    vx1 = vec_mergel (vy0, vy4);                                       \
2279 +    vx2 = vec_mergeh (vy1, vy5);                                       \
2280 +    vx3 = vec_mergel (vy1, vy5);                                       \
2281 +    vx4 = vec_mergeh (vy2, vy6);                                       \
2282 +    vx5 = vec_mergel (vy2, vy6);                                       \
2283 +    vx6 = vec_mergeh (vy3, vy7);                                       \
2284 +    vx7 = vec_mergel (vy3, vy7);                                       \
2285 +                                                                       \
2286 +    vy0 = vec_mergeh (vx0, vx4);                                       \
2287 +    vy1 = vec_mergel (vx0, vx4);                                       \
2288 +    vy2 = vec_mergeh (vx1, vx5);                                       \
2289 +    vy3 = vec_mergel (vx1, vx5);                                       \
2290 +    vy4 = vec_mergeh (vx2, vx6);                                       \
2291 +    vy5 = vec_mergel (vx2, vx6);                                       \
2292 +    vy6 = vec_mergeh (vx3, vx7);                                       \
2293 +    vy7 = vec_mergel (vx3, vx7);                                       \
2294 +                                                                       \
2295 +    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                      \
2296 +    vx1 = vec_mergel (vy0, vy4);                                       \
2297 +    vx2 = vec_mergeh (vy1, vy5);                                       \
2298 +    vx3 = vec_mergel (vy1, vy5);                                       \
2299 +    vx4 = vec_mergeh (vy2, vy6);                                       \
2300 +    vx5 = vec_mergel (vy2, vy6);                                       \
2301 +    vx6 = vec_mergeh (vy3, vy7);                                       \
2302 +    vx7 = vec_mergel (vy3, vy7);                                       \
2303 +                                                                       \
2304 +    IDCT_HALF                                                          \
2305 +    /* final descale: arithmetic shift right by 6 */                   \
2306 +    shift = vec_splat_u16 (6);                                         \
2307 +    vx0 = vec_sra (vy0, shift);                                                \
2308 +    vx1 = vec_sra (vy1, shift);                                                \
2309 +    vx2 = vec_sra (vy2, shift);                                                \
2310 +    vx3 = vec_sra (vy3, shift);                                                \
2311 +    vx4 = vec_sra (vy4, shift);                                                \
2312 +    vx5 = vec_sra (vy5, shift);                                                \
2313 +    vx6 = vec_sra (vy6, shift);                                                \
2314 +    vx7 = vec_sra (vy7, shift);
2315 +
2316 +
2317 +static const vector_s16_t constants[5] = {
2318 +    (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), /* c4, a0, a1, a2, mc4, ma2, then the pair IDCT splats as bias */
2319 +    (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), /* IDCT pre-scale for block[0] and block[4] */
2320 +    (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), /* IDCT pre-scale for block[1] and block[7] */
2321 +    (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), /* IDCT pre-scale for block[2] and block[6] */
2322 +    (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) /* IDCT pre-scale for block[3] and block[5] */
2323 +};
2324 +/* IDCT "put" variant: runs IDCT (or simple_idct_put in the reference build) and stores eight rows of 8 bytes to dest, stride bytes apart. */
2325 +void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2326 +{
2327 +POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
2328 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2329 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2330 +    void simple_idct_put(uint8_t *dest, int line_size, int16_t *block);
2331 +    simple_idct_put(dest, stride, (int16_t*)block);
2332 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2333 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2334 +    vector_u8_t tmp;
2335 +
2336 +POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
2337 +    /* The IDCT macro ends by leaving its eight result rows in vx0..vx7, each arithmetically shifted right by 6. */
2338 +    IDCT
2339 +    /* COPY packs one row to unsigned bytes with saturation (both halves of tmp hold the same 8 bytes) and stores words at dest+0 and dest+4. NOTE(review): looks like this needs dest 8-byte aligned -- confirm. */
2340 +#define COPY(dest,src)                                         \
2341 +    tmp = vec_packsu (src, src);                               \
2342 +    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);      \
2343 +    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2344 +
2345 +    COPY (dest, vx0)   dest += stride;
2346 +    COPY (dest, vx1)   dest += stride;
2347 +    COPY (dest, vx2)   dest += stride;
2348 +    COPY (dest, vx3)   dest += stride;
2349 +    COPY (dest, vx4)   dest += stride;
2350 +    COPY (dest, vx5)   dest += stride;
2351 +    COPY (dest, vx6)   dest += stride;
2352 +    COPY (dest, vx7)
2353 +
2354 +POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
2355 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2356 +}
2357 +/* IDCT "add" variant: adds the eight IDCT result rows to the 8 bytes already stored at each row of dest (saturating) and writes them back. */
2358 +void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block)
2359 +{
2360 +POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
2361 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2362 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2363 +    void simple_idct_add(uint8_t *dest, int line_size, int16_t *block);
2364 +    simple_idct_add(dest, stride, (int16_t*)block);
2365 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2366 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2367 +    vector_u8_t tmp;
2368 +    vector_s16_t tmp2, tmp3;
2369 +    vector_u8_t perm0;
2370 +    vector_u8_t perm1;
2371 +    vector_u8_t p0, p1, p;
2372 +
2373 +POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
2374 +
2375 +    IDCT
2376 +    /* perm0/perm1 interleave 0xFF index bytes with the lvsl alignment indices of dest and dest+stride, so the vec_perm in ADD pairs each loaded byte with a byte taken from zero. */
2377 +    p0 = vec_lvsl (0, dest);
2378 +    p1 = vec_lvsl (stride, dest);
2379 +    p = vec_splat_u8 (-1);
2380 +    perm0 = vec_mergeh (p, p0);
2381 +    perm1 = vec_mergeh (p, p1);
2382 +    /* ADD: load the 16-byte line containing dest, widen 8 of its bytes to 16-bit lanes, add src with saturation, repack to bytes and store 8 bytes. zero is not declared in this function; presumably the IDCT macro declares it -- confirm. */
2383 +#define ADD(dest,src,perm)                                             \
2384 +    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */                       \
2385 +    tmp = vec_ld (0, dest);                                            \
2386 +    tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm);      \
2387 +    tmp3 = vec_adds (tmp2, src);                                       \
2388 +    tmp = vec_packsu (tmp3, tmp3);                                     \
2389 +    vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest);              \
2390 +    vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest);
2391 +    /* perm0 is reused for even rows and perm1 for odd rows. NOTE(review): this assumes the alignment of dest repeats every 2*stride bytes -- confirm with callers. */
2392 +    ADD (dest, vx0, perm0)     dest += stride;
2393 +    ADD (dest, vx1, perm1)     dest += stride;
2394 +    ADD (dest, vx2, perm0)     dest += stride;
2395 +    ADD (dest, vx3, perm1)     dest += stride;
2396 +    ADD (dest, vx4, perm0)     dest += stride;
2397 +    ADD (dest, vx5, perm1)     dest += stride;
2398 +    ADD (dest, vx6, perm0)     dest += stride;
2399 +    ADD (dest, vx7, perm1)
2400 +
2401 +POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1);
2402 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
2403 +}
2404 +
2405 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c
2406 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_altivec.c   1970-01-01 01:00:00.000000000 +0100
2407 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c        2003-09-28 17:26:40.000000000 +0200
2408 @@ -0,0 +1,645 @@
2409 +/*
2410 + * Copyright (c) 2002 Dieter Shirley
2411 + *
2412 + * This library is free software; you can redistribute it and/or
2413 + * modify it under the terms of the GNU Lesser General Public
2414 + * License as published by the Free Software Foundation; either
2415 + * version 2 of the License, or (at your option) any later version.
2416 + *
2417 + * This library is distributed in the hope that it will be useful,
2418 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
2419 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
2420 + * Lesser General Public License for more details.
2421 + *
2422 + * You should have received a copy of the GNU Lesser General Public
2423 + * License along with this library; if not, write to the Free Software
2424 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
2425 + */
2426 +
2427 +#include <stdlib.h>
2428 +#include <stdio.h>
2429 +#include "../dsputil.h"
2430 +#include "../mpegvideo.h"
2431 +
2432 +#include "gcc_fixes.h"
2433 +
2434 +#include "dsputil_altivec.h"
2435 +
2436 +// Swaps two lvalues of the same type through a __typeof__ temporary (used for AltiVec vector variables); a and b are each evaluated twice.
2437 +#define SWAP(a,b) \
2438 +do { \
2439 +    __typeof__(a) swap_temp=a; \
2440 +    a=b; \
2441 +    b=swap_temp; \
2442 +} while (0)
2443 +
2444 +// Transposes, in place, the 4x4 matrix whose rows are the four-element vectors a, b, c, d (two rounds of vec_mergeh/vec_mergel).
2445 +#define TRANSPOSE4(a,b,c,d) \
2446 +do { \
2447 +  __typeof__(a) _trans_ach = vec_mergeh(a, c); \
2448 +  __typeof__(a) _trans_acl = vec_mergel(a, c); \
2449 +  __typeof__(a) _trans_bdh = vec_mergeh(b, d); \
2450 +  __typeof__(a) _trans_bdl = vec_mergel(b, d); \
2451 + \
2452 +  a = vec_mergeh(_trans_ach, _trans_bdh); \
2453 +  b = vec_mergel(_trans_ach, _trans_bdh); \
2454 +  c = vec_mergeh(_trans_acl, _trans_bdl); \
2455 +  d = vec_mergel(_trans_acl, _trans_bdl); \
2456 +} while (0)
2457 +// Three vec_mergeh/vec_mergel rounds over the vectors a..h, written back in place; applied below to eight vector signed short rows to transpose an 8x8 block.
2458 +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
2459 +do { \
2460 +    __typeof__(a)  _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
2461 +    __typeof__(a)  _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
2462 + \
2463 +    _A1 = vec_mergeh (a, e); \
2464 +    _B1 = vec_mergel (a, e); \
2465 +    _C1 = vec_mergeh (b, f); \
2466 +    _D1 = vec_mergel (b, f); \
2467 +    _E1 = vec_mergeh (c, g); \
2468 +    _F1 = vec_mergel (c, g); \
2469 +    _G1 = vec_mergeh (d, h); \
2470 +    _H1 = vec_mergel (d, h); \
2471 + \
2472 +    _A2 = vec_mergeh (_A1, _E1); \
2473 +    _B2 = vec_mergel (_A1, _E1); \
2474 +    _C2 = vec_mergeh (_B1, _F1); \
2475 +    _D2 = vec_mergel (_B1, _F1); \
2476 +    _E2 = vec_mergeh (_C1, _G1); \
2477 +    _F2 = vec_mergel (_C1, _G1); \
2478 +    _G2 = vec_mergeh (_D1, _H1); \
2479 +    _H2 = vec_mergel (_D1, _H1); \
2480 + \
2481 +    a = vec_mergeh (_A2, _E2); \
2482 +    b = vec_mergel (_A2, _E2); \
2483 +    c = vec_mergeh (_B2, _F2); \
2484 +    d = vec_mergel (_B2, _F2); \
2485 +    e = vec_mergeh (_C2, _G2); \
2486 +    f = vec_mergel (_C2, _G2); \
2487 +    g = vec_mergeh (_D2, _H2); \
2488 +    h = vec_mergel (_D2, _H2); \
2489 +} while (0)
2490 +
2491 +
2492 +// Loads the four-byte value (int or float) at the target address into every
2493 +// element of the target vector: vec_ld fetches the enclosing 16 bytes, vec_lvsl/vec_perm move the
2494 +// wanted word to element 0, vec_splat replicates it.  Only works if the address is four-byte aligned (which should be always).
2495 +#define LOAD4(vec, address) \
2496 +{ \
2497 +    __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \
2498 +    vector unsigned char _perm_vec = vec_lvsl(0,(address)); \
2499 +    vec = vec_ld(0, _load_addr); \
2500 +    vec = vec_perm(vec, vec, _perm_vec); \
2501 +    vec = vec_splat(vec, 0); \
2502 +}
2503 +
2504 +
2505 +#ifdef CONFIG_DARWIN
2506 +#define FOUROF(a) (a)
2507 +#else
2508 +// slower, for non-Apple GCC: the vector literal is spelled as a braced list of four copies of a
2509 +#define FOUROF(a) {a,a,a,a}
2510 +#endif
2511 +int dct_quantize_altivec(MpegEncContext* s,
2512 +                        DCTELEM* data, int n,
2513 +                        int qscale, int* overflow)
2514 +{
2515 +    int lastNonZero;
2516 +    vector float row0, row1, row2, row3, row4, row5, row6, row7;
2517 +    vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7;
2518 +    const vector float zero = (const vector float)FOUROF(0.);
2519 +
2520 +    // Load the data into the row/alt vectors
2521 +    {
2522 +        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2523 +
2524 +        data0 = vec_ld(0, data);
2525 +        data1 = vec_ld(16, data);
2526 +        data2 = vec_ld(32, data);
2527 +        data3 = vec_ld(48, data);
2528 +        data4 = vec_ld(64, data);
2529 +        data5 = vec_ld(80, data);
2530 +        data6 = vec_ld(96, data);
2531 +        data7 = vec_ld(112, data);
2532 +
2533 +        // Transpose the data before we start
2534 +        TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2535 +
2536 +        // load the data into floating point vectors.  We load
2537 +        // the high half of each row into the main row vectors
2538 +        // and the low half into the alt vectors.
2539 +        row0 = vec_ctf(vec_unpackh(data0), 0);
2540 +        alt0 = vec_ctf(vec_unpackl(data0), 0);
2541 +        row1 = vec_ctf(vec_unpackh(data1), 0);
2542 +        alt1 = vec_ctf(vec_unpackl(data1), 0);
2543 +        row2 = vec_ctf(vec_unpackh(data2), 0);
2544 +        alt2 = vec_ctf(vec_unpackl(data2), 0);
2545 +        row3 = vec_ctf(vec_unpackh(data3), 0);
2546 +        alt3 = vec_ctf(vec_unpackl(data3), 0);
2547 +        row4 = vec_ctf(vec_unpackh(data4), 0);
2548 +        alt4 = vec_ctf(vec_unpackl(data4), 0);
2549 +        row5 = vec_ctf(vec_unpackh(data5), 0);
2550 +        alt5 = vec_ctf(vec_unpackl(data5), 0);
2551 +        row6 = vec_ctf(vec_unpackh(data6), 0);
2552 +        alt6 = vec_ctf(vec_unpackl(data6), 0);
2553 +        row7 = vec_ctf(vec_unpackh(data7), 0);
2554 +        alt7 = vec_ctf(vec_unpackl(data7), 0);
2555 +    }
2556 +
2557 +    // The following block could exist as a separate an altivec dct
2558 +               // function.  However, if we put it inline, the DCT data can remain
2559 +               // in the vector local variables, as floats, which we'll use during the
2560 +               // quantize step...
2561 +    {
2562 +        const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f);
2563 +        const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f);
2564 +        const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f);
2565 +        const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f);
2566 +        const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f);
2567 +        const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f);
2568 +        const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f);
2569 +        const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f);
2570 +        const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f);
2571 +        const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f);
2572 +        const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f);
2573 +        const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f);
2574 +
2575 +
2576 +        int whichPass, whichHalf;
2577 +
2578 +        for(whichPass = 1; whichPass<=2; whichPass++)
2579 +        {
2580 +            for(whichHalf = 1; whichHalf<=2; whichHalf++)
2581 +            {
2582 +                vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2583 +                vector float tmp10, tmp11, tmp12, tmp13;
2584 +                vector float z1, z2, z3, z4, z5;
2585 +
2586 +                tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7];
2587 +                tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7];
2588 +                tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4];
2589 +                tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4];
2590 +                tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6];
2591 +                tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6];
2592 +                tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5];
2593 +                tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5];
2594 +
2595 +                tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3;
2596 +                tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3;
2597 +                tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2;
2598 +                tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2;
2599 +
2600 +
2601 +                // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS);
2602 +                row0 = vec_add(tmp10, tmp11);
2603 +
2604 +                // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS);
2605 +                row4 = vec_sub(tmp10, tmp11);
2606 +
2607 +
2608 +                // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
2609 +                z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero);
2610 +
2611 +                // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
2612 +                //                CONST_BITS-PASS1_BITS);
2613 +                row2 = vec_madd(tmp13, vec_0_765366865, z1);
2614 +
2615 +                // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
2616 +                //                CONST_BITS-PASS1_BITS);
2617 +                row6 = vec_madd(tmp12, vec_1_847759065, z1);
2618 +
2619 +                z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7;
2620 +                z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6;
2621 +                z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6;
2622 +                z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7;
2623 +
2624 +                // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
2625 +                z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero);
2626 +
2627 +                // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
2628 +                z3 = vec_madd(z3, vec_1_961570560, z5);
2629 +
2630 +                // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */
2631 +                z4 = vec_madd(z4, vec_0_390180644, z5);
2632 +
2633 +                // The following adds are rolled into the multiplies above
2634 +                // z3 = vec_add(z3, z5);  // z3 += z5;
2635 +                // z4 = vec_add(z4, z5);  // z4 += z5;
2636 +
2637 +                // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
2638 +                // Wow!  It's actually more effecient to roll this multiply
2639 +                // into the adds below, even thought the multiply gets done twice!
2640 +                // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero);
2641 +
2642 +                // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
2643 +                // Same with this one...
2644 +                // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero);
2645 +
2646 +                // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
2647 +                // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
2648 +                row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3));
2649 +
2650 +                // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
2651 +                // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
2652 +                row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4));
2653 +
2654 +                // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
2655 +                // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
2656 +                row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3));
2657 +
2658 +                // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
2659 +                // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);
2660 +                row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4));
2661 +
2662 +                // Swap the row values with the alts.  If this is the first half,
2663 +                // this sets up the low values to be acted on in the second half.
2664 +                // If this is the second half, it puts the high values back in
2665 +                // the row values where they are expected to be when we're done.
2666 +                SWAP(row0, alt0);
2667 +                SWAP(row1, alt1);
2668 +                SWAP(row2, alt2);
2669 +                SWAP(row3, alt3);
2670 +                SWAP(row4, alt4);
2671 +                SWAP(row5, alt5);
2672 +                SWAP(row6, alt6);
2673 +                SWAP(row7, alt7);
2674 +            }
2675 +
2676 +            if (whichPass == 1)
2677 +            {
2678 +                // transpose the data for the second pass
2679 +
2680 +                // First, block transpose the upper right with lower left.
2681 +                SWAP(row4, alt0);
2682 +                SWAP(row5, alt1);
2683 +                SWAP(row6, alt2);
2684 +                SWAP(row7, alt3);
2685 +
2686 +                // Now, transpose each block of four
2687 +                TRANSPOSE4(row0, row1, row2, row3);
2688 +                TRANSPOSE4(row4, row5, row6, row7);
2689 +                TRANSPOSE4(alt0, alt1, alt2, alt3);
2690 +                TRANSPOSE4(alt4, alt5, alt6, alt7);
2691 +            }
2692 +        }
2693 +    }
2694 +
2695 +    // used after quantise step
2696 +    int oldBaseValue = 0;
2697 +
2698 +    // perform the quantise step, using the floating point data
2699 +    // still in the row/alt registers
2700 +    {
2701 +        const int* biasAddr;
2702 +        const vector signed int* qmat;
2703 +        vector float bias, negBias;
2704 +
2705 +        if (s->mb_intra)
2706 +        {
2707 +            vector signed int baseVector;
2708 +
2709 +            // We must cache element 0 in the intra case
2710 +            // (it needs special handling).
2711 +            baseVector = vec_cts(vec_splat(row0, 0), 0);
2712 +            vec_ste(baseVector, 0, &oldBaseValue);
2713 +
2714 +            qmat = (vector signed int*)s->q_intra_matrix[qscale];
2715 +            biasAddr = &(s->intra_quant_bias);
2716 +        }
2717 +        else
2718 +        {
2719 +            qmat = (vector signed int*)s->q_inter_matrix[qscale];
2720 +            biasAddr = &(s->inter_quant_bias);
2721 +        }
2722 +
2723 +        // Load the bias vector (We add 0.5 to the bias so that we're
2724 +                               // rounding when we convert to int, instead of flooring.)
2725 +        {
2726 +            vector signed int biasInt;
2727 +            const vector float negOneFloat = (vector float)FOUROF(-1.0f);
2728 +            LOAD4(biasInt, biasAddr);
2729 +            bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT);
2730 +            negBias = vec_madd(bias, negOneFloat, zero);
2731 +        }
2732 +
2733 +        {
2734 +            vector float q0, q1, q2, q3, q4, q5, q6, q7;
2735 +
2736 +            q0 = vec_ctf(qmat[0], QMAT_SHIFT);
2737 +            q1 = vec_ctf(qmat[2], QMAT_SHIFT);
2738 +            q2 = vec_ctf(qmat[4], QMAT_SHIFT);
2739 +            q3 = vec_ctf(qmat[6], QMAT_SHIFT);
2740 +            q4 = vec_ctf(qmat[8], QMAT_SHIFT);
2741 +            q5 = vec_ctf(qmat[10], QMAT_SHIFT);
2742 +            q6 = vec_ctf(qmat[12], QMAT_SHIFT);
2743 +            q7 = vec_ctf(qmat[14], QMAT_SHIFT);
2744 +
2745 +            row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias),
2746 +                    vec_cmpgt(row0, zero));
2747 +            row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias),
2748 +                    vec_cmpgt(row1, zero));
2749 +            row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias),
2750 +                    vec_cmpgt(row2, zero));
2751 +            row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias),
2752 +                    vec_cmpgt(row3, zero));
2753 +            row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias),
2754 +                    vec_cmpgt(row4, zero));
2755 +            row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias),
2756 +                    vec_cmpgt(row5, zero));
2757 +            row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias),
2758 +                    vec_cmpgt(row6, zero));
2759 +            row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias),
2760 +                    vec_cmpgt(row7, zero));
2761 +
2762 +            q0 = vec_ctf(qmat[1], QMAT_SHIFT);
2763 +            q1 = vec_ctf(qmat[3], QMAT_SHIFT);
2764 +            q2 = vec_ctf(qmat[5], QMAT_SHIFT);
2765 +            q3 = vec_ctf(qmat[7], QMAT_SHIFT);
2766 +            q4 = vec_ctf(qmat[9], QMAT_SHIFT);
2767 +            q5 = vec_ctf(qmat[11], QMAT_SHIFT);
2768 +            q6 = vec_ctf(qmat[13], QMAT_SHIFT);
2769 +            q7 = vec_ctf(qmat[15], QMAT_SHIFT);
2770 +
2771 +            alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias),
2772 +                    vec_cmpgt(alt0, zero));
2773 +            alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias),
2774 +                    vec_cmpgt(alt1, zero));
2775 +            alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias),
2776 +                    vec_cmpgt(alt2, zero));
2777 +            alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias),
2778 +                    vec_cmpgt(alt3, zero));
2779 +            alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias),
2780 +                    vec_cmpgt(alt4, zero));
2781 +            alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias),
2782 +                    vec_cmpgt(alt5, zero));
2783 +            alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias),
2784 +                    vec_cmpgt(alt6, zero));
2785 +            alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias),
2786 +                    vec_cmpgt(alt7, zero));
2787 +        }
2788 +
2789 +
2790 +    }
2791 +
2792 +    // Store the data back into the original block
2793 +    {
2794 +        vector signed short data0, data1, data2, data3, data4, data5, data6, data7;
2795 +
2796 +        data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0));
2797 +        data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0));
2798 +        data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0));
2799 +        data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0));
2800 +        data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0));
2801 +        data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0));
2802 +        data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0));
2803 +        data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0));
2804 +
2805 +        {
2806 +            // Clamp for overflow
2807 +            vector signed int max_q_int, min_q_int;
2808 +            vector signed short max_q, min_q;
2809 +
2810 +            LOAD4(max_q_int, &(s->max_qcoeff));
2811 +            LOAD4(min_q_int, &(s->min_qcoeff));
2812 +
2813 +            max_q = vec_pack(max_q_int, max_q_int);
2814 +            min_q = vec_pack(min_q_int, min_q_int);
2815 +
2816 +            data0 = vec_max(vec_min(data0, max_q), min_q);
2817 +            data1 = vec_max(vec_min(data1, max_q), min_q);
2818 +            data2 = vec_max(vec_min(data2, max_q), min_q);
2819 +            data4 = vec_max(vec_min(data4, max_q), min_q);
2820 +            data5 = vec_max(vec_min(data5, max_q), min_q);
2821 +            data6 = vec_max(vec_min(data6, max_q), min_q);
2822 +            data7 = vec_max(vec_min(data7, max_q), min_q);
2823 +        }
2824 +
2825 +        vector bool char zero_01, zero_23, zero_45, zero_67;
2826 +        vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67;
2827 +        vector signed char negOne = vec_splat_s8(-1);
2828 +        vector signed char* scanPtr =
2829 +                (vector signed char*)(s->intra_scantable.inverse);
2830 +
2831 +        // Determine the largest non-zero index.
2832 +        zero_01 = vec_pack(vec_cmpeq(data0, (vector short)zero),
2833 +                vec_cmpeq(data1, (vector short)zero));
2834 +        zero_23 = vec_pack(vec_cmpeq(data2, (vector short)zero),
2835 +                vec_cmpeq(data3, (vector short)zero));
2836 +        zero_45 = vec_pack(vec_cmpeq(data4, (vector short)zero),
2837 +                vec_cmpeq(data5, (vector short)zero));
2838 +        zero_67 = vec_pack(vec_cmpeq(data6, (vector short)zero),
2839 +                vec_cmpeq(data7, (vector short)zero));
2840 +
2841 +        // 64 biggest values
2842 +        scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01);
2843 +        scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23);
2844 +        scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45);
2845 +        scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67);
2846 +
2847 +        // 32 largest values
2848 +        scanIndices_01 = vec_max(scanIndices_01, scanIndices_23);
2849 +        scanIndices_45 = vec_max(scanIndices_45, scanIndices_67);
2850 +
2851 +        // 16 largest values
2852 +        scanIndices_01 = vec_max(scanIndices_01, scanIndices_45);
2853 +
2854 +        // 8 largest values
2855 +        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2856 +                vec_mergel(scanIndices_01, negOne));
2857 +
2858 +        // 4 largest values
2859 +        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2860 +                vec_mergel(scanIndices_01, negOne));
2861 +
2862 +        // 2 largest values
2863 +        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2864 +                vec_mergel(scanIndices_01, negOne));
2865 +
2866 +        // largest value
2867 +        scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne),
2868 +                vec_mergel(scanIndices_01, negOne));
2869 +
2870 +        scanIndices_01 = vec_splat(scanIndices_01, 0);
2871 +
2872 +        signed char lastNonZeroChar;
2873 +
2874 +        vec_ste(scanIndices_01, 0, &lastNonZeroChar);
2875 +
2876 +        lastNonZero = lastNonZeroChar;
2877 +
2878 +        // While the data is still in vectors we check for the transpose IDCT permute
2879 +        // and handle it using the vector unit if we can.  This is the permute used
2880 +        // by the altivec idct, so it is common when using the altivec dct.
2881 +
2882 +        if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM))
2883 +        {
2884 +            TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7);
2885 +        }
2886 +
2887 +        vec_st(data0, 0, data);
2888 +        vec_st(data1, 16, data);
2889 +        vec_st(data2, 32, data);
2890 +        vec_st(data3, 48, data);
2891 +        vec_st(data4, 64, data);
2892 +        vec_st(data5, 80, data);
2893 +        vec_st(data6, 96, data);
2894 +        vec_st(data7, 112, data);
2895 +    }
2896 +
2897 +    // special handling of block[0]
2898 +    if (s->mb_intra)
2899 +    {
2900 +        if (!s->h263_aic)
2901 +        {
2902 +            if (n < 4)
2903 +                oldBaseValue /= s->y_dc_scale;
2904 +            else
2905 +                oldBaseValue /= s->c_dc_scale;
2906 +        }
2907 +
2908 +        // Divide by 8, rounding the result
2909 +        data[0] = (oldBaseValue + 4) >> 3;
2910 +    }
2911 +
2912 +    // We handled the tranpose permutation above and we don't
2913 +    // need to permute the "no" permutation case.
2914 +    if ((lastNonZero > 0) &&
2915 +        (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) &&
2916 +        (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM))
2917 +    {
2918 +        ff_block_permute(data, s->dsp.idct_permutation,
2919 +                s->intra_scantable.scantable, lastNonZero);
2920 +    }
2921 +
2922 +    return lastNonZero;
2923 +}
2924 +#undef FOUROF
2925 +
2926 +/*
2927 +  AltiVec version of dct_unquantize_h263: every non-zero level becomes level * qmul +/- qadd, in place.
2928 +  This code assumes block is 16-byte aligned (it is accessed with vec_ld / vec_st).
2929 +*/
2930 +void dct_unquantize_h263_altivec(MpegEncContext *s,
2931 +                                 DCTELEM *block, int n, int qscale)
2932 +{
2933 +POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1);
2934 +    int i, level, qmul, qadd;
2935 +    int nCoeffs;
2936 +
2937 +    assert(s->block_last_index[n]>=0);
2938 +
2939 +POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1);
2940 +
2941 +    qadd = (qscale - 1) | 1;
2942 +    qmul = qscale << 1;
2943 +
2944 +    if (s->mb_intra) {
2945 +        if (!s->h263_aic) {
2946 +            if (n < 4)
2947 +                block[0] = block[0] * s->y_dc_scale;
2948 +            else
2949 +                block[0] = block[0] * s->c_dc_scale;
2950 +        }else
2951 +            qadd = 0;
2952 +        i = 1;
2953 +        nCoeffs= 63; //does not always use zigzag table
2954 +    } else {
2955 +        i = 0;
2956 +        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
2957 +    }
2958 +
2959 +#ifdef ALTIVEC_USE_REFERENCE_C_CODE
2960 +    for(;i<=nCoeffs;i++) {
2961 +        level = block[i];
2962 +        if (level) {
2963 +            if (level < 0) {
2964 +                level = level * qmul - qadd;
2965 +            } else {
2966 +                level = level * qmul + qadd;
2967 +            }
2968 +            block[i] = level;
2969 +        }
2970 +    }
2971 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
2972 +    {
2973 +      register const vector short vczero = (const vector short)vec_splat_s16(0);
2974 +      short __attribute__ ((aligned(16))) qmul8[] =
2975 +          {
2976 +            qmul, qmul, qmul, qmul,
2977 +            qmul, qmul, qmul, qmul
2978 +          };
2979 +      short __attribute__ ((aligned(16))) qadd8[] =
2980 +          {
2981 +            qadd, qadd, qadd, qadd,
2982 +            qadd, qadd, qadd, qadd
2983 +          };
2984 +      short __attribute__ ((aligned(16))) nqadd8[] =
2985 +          {
2986 +            -qadd, -qadd, -qadd, -qadd,
2987 +            -qadd, -qadd, -qadd, -qadd
2988 +          };
2989 +      register vector short blockv, qmulv, qaddv, nqaddv, temp1;
2990 +      register vector bool short blockv_null, blockv_neg;
2991 +      register short backup_0 = block[0]; // saved after the intra DC scaling above; restored at the end when i == 1
2992 +      register int j = 0;
2993 +
2994 +      qmulv = vec_ld(0, qmul8);
2995 +      qaddv = vec_ld(0, qadd8);
2996 +      nqaddv = vec_ld(0, nqadd8);
2997 +
2998 +#if 0 // block *is* 16 bytes-aligned, it seems.
2999 +      // first make sure block[j] is 16 bytes-aligned
3000 +      for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) {
3001 +        level = block[j];
3002 +        if (level) {
3003 +          if (level < 0) {
3004 +                level = level * qmul - qadd;
3005 +            } else {
3006 +                level = level * qmul + qadd;
3007 +            }
3008 +            block[j] = level;
3009 +        }
3010 +      }
3011 +#endif
3012 +
3013 +      // vectorize all the 16 bytes-aligned blocks
3014 +      // of 8 elements (j always starts at 0 here, so block[0] is processed too)
3015 +      for(; (j + 7) <= nCoeffs ; j+=8)
3016 +      {
3017 +        blockv = vec_ld(j << 1, block);
3018 +        blockv_neg = vec_cmplt(blockv, vczero);
3019 +        blockv_null = vec_cmpeq(blockv, vczero);
3020 +        // choose between +qadd or -qadd as the third operand
3021 +        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
3022 +        // multiply & add (block[j..j+7] * qmul [+-] qadd)
3023 +        temp1 = vec_mladd(blockv, qmulv, temp1);
3024 +        // keep 0 where block[j..j+7] used to have 0
3025 +        blockv = vec_sel(temp1, blockv, blockv_null);
3026 +        vec_st(blockv, j << 1, block);
3027 +      }
3028 +
3029 +      // if nCoeffs + 1 is not a multiple of 8, finish the job
3030 +      // using good old scalar units.
3031 +      // (we could do it using a truncated vector,
3032 +      // but I am not sure it is worth the hassle)
3033 +      for(; j <= nCoeffs ; j++) {
3034 +        level = block[j];
3035 +        if (level) {
3036 +          if (level < 0) {
3037 +                level = level * qmul - qadd;
3038 +            } else {
3039 +                level = level * qmul + qadd;
3040 +            }
3041 +            block[j] = level;
3042 +        }
3043 +      }
3044 +
3045 +      if (i == 1)
3046 +      { // cheat: the loops above also rewrote block[0]; restoring the saved DC avoids special-casing the first iteration
3047 +        block[0] = backup_0;
3048 +      }
3049 +    }
3050 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3051 +
3052 +POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63);
3053 +}
3054 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c
3055 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc.orig/mpegvideo_ppc.c       1970-01-01 01:00:00.000000000 +0100
3056 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c    2003-09-28 17:26:40.000000000 +0200
3057 @@ -0,0 +1,83 @@
3058 +/*\r
3059 + * Copyright (c) 2002 Dieter Shirley\r
3060 + *\r
3061 + * This library is free software; you can redistribute it and/or\r
3062 + * modify it under the terms of the GNU Lesser General Public\r
3063 + * License as published by the Free Software Foundation; either\r
3064 + * version 2 of the License, or (at your option) any later version.\r
3065 + *\r
3066 + * This library is distributed in the hope that it will be useful,\r
3067 + * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
3068 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\r
3069 + * Lesser General Public License for more details.\r
3070 + *\r
3071 + * You should have received a copy of the GNU Lesser General Public\r
3072 + * License along with this library; if not, write to the Free Software\r
3073 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA\r
3074 + */\r
3075 + \r
3076 +#include "../dsputil.h"\r
3077 +#include "../mpegvideo.h"\r
3078 +#include <time.h>\r
3079 +\r
3080 +#ifdef HAVE_ALTIVEC\r
3081 +#include "dsputil_altivec.h"\r
3082 +#endif\r
3083 +/* Quantizer/dequantizer defined elsewhere; MPV_common_init_ppc() hooks them into s. */\r
3084 +extern int dct_quantize_altivec(MpegEncContext *s,  \r
3085 +        DCTELEM *block, int n,\r
3086 +        int qscale, int *overflow);\r
3087 +extern void dct_unquantize_h263_altivec(MpegEncContext *s,
3088 +                                        DCTELEM *block, int n, int qscale);
3089 +/* IDCT put/add routines defined elsewhere; MPV_common_init_ppc() hooks them into s->dsp. */\r
3090 +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block);\r
3091 +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block);\r
3092 +\r
3093 +/* Hook the AltiVec IDCT/DCT routines into s if the CPU, algo settings and alignment allow. */\r
3094 +void MPV_common_init_ppc(MpegEncContext *s)\r
3095 +{\r
3096 +#ifdef HAVE_ALTIVEC\r
3097 +    if (has_altivec())\r
3098 +    {\r
3099 +        if ((s->avctx->idct_algo == FF_IDCT_AUTO) ||\r
3100 +                (s->avctx->idct_algo == FF_IDCT_ALTIVEC))\r
3101 +        {\r
3102 +            s->dsp.idct_put = idct_put_altivec;\r
3103 +            s->dsp.idct_add = idct_add_altivec;\r
3104 +#ifndef ALTIVEC_USE_REFERENCE_C_CODE
3105 +            s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;\r
3106 +#else /* ALTIVEC_USE_REFERENCE_C_CODE */
3107 +            s->dsp.idct_permutation_type = FF_NO_IDCT_PERM;
3108 +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
3109 +        }\r
3110 +        // The DCT hooks need 16-byte aligned tables. A failed check returns\r
3111 +        // early; any IDCT hooks installed above stay in place.\r
3112 +        if ((((unsigned long)(s->q_intra_matrix) & 0x0f) != 0) ||\r
3113 +                (((unsigned long)(s->q_inter_matrix) & 0x0f) != 0))\r
3114 +        {\r
3115 +            fprintf(stderr, "Internal Error: q-matrix blocks must be 16-byte aligned "\r
3116 +                    "to use Altivec DCT. Reverting to non-altivec version.\n");\r
3117 +            return;\r
3118 +        }\r
3119 +\r
3120 +        if (((unsigned long)(s->intra_scantable.inverse) & 0x0f) != 0)\r
3121 +        {\r
3122 +            fprintf(stderr, "Internal Error: scan table blocks must be 16-byte aligned "\r
3123 +                    "to use Altivec DCT. Reverting to non-altivec version.\n");\r
3124 +            return;\r
3125 +        }\r
3126 +\r
3127 +        // Alignment checks passed: hook up the AltiVec (un)quantizers if requested.\r
3128 +        if ((s->avctx->dct_algo == FF_DCT_AUTO) ||\r
3129 +                (s->avctx->dct_algo == FF_DCT_ALTIVEC))\r
3130 +        {\r
3131 +            s->dct_quantize = dct_quantize_altivec;\r
3132 +            s->dct_unquantize_h263 = dct_unquantize_h263_altivec;
3133 +        }\r
3134 +    } else\r
3135 +#endif /* HAVE_ALTIVEC */\r
3136 +    {\r
3137 +        /* Placeholder: non-AltiVec PPC optimisations would go here */\r
3138 +    }\r
3139 +}\r
3140 +\r
3141 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am.orig   2003-05-25 23:11:57.000000000 +0200
3142 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/ppc/Makefile.am        2003-11-14 01:06:03.904622008 +0100
3143 @@ -20,6 +20,6 @@
3144
3145  libavcodecppc_la_SOURCES = $(PPC_SRC)
3146
3147 -AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -I$(srcdir)/../..
3148 +AM_CPPFLAGS = $(LTNOPIC) -DHAVE_AV_CONFIG_H -DHAVE_ALTIVEC_H -DHAVE_ALTIVEC -maltivec -mabi=altivec -I$(srcdir)/../..
3149
3150  MAINTAINERCLEANFILES = Makefile.in