avifile-ffmpeg-alpha.patch

   1 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/asm.h avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/asm.h
   2 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/asm.h       2002-10-16 09:26:12.000000000 +0200
   3 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/asm.h    2003-09-28 17:26:39.000000000 +0200
   4 @@ -42,14 +42,14 @@
   5  #define AMASK_CIX (1 << 2)
   6  #define AMASK_MVI (1 << 8)
   7
   8 -inline static uint64_t BYTE_VEC(uint64_t x)
   9 +static inline uint64_t BYTE_VEC(uint64_t x) /* for x < 0x100: returns x repeated in all eight bytes */
  10  {
  11      x |= x <<  8;
  12      x |= x << 16;
  13      x |= x << 32;
  14      return x;
  15  }
  16 -inline static uint64_t WORD_VEC(uint64_t x)
  17 +static inline uint64_t WORD_VEC(uint64_t x)
  18  {
  19      x |= x << 16;
  20      x |= x << 32;
  21 @@ -63,27 +63,15 @@
  22  #define sextw(x) ((int16_t) (x))
  23
  24  #ifdef __GNUC__
  25 -#define ASM_ACCEPT_MVI asm (".arch pca56")
  26  struct unaligned_long { uint64_t l; } __attribute__((packed));
  27  #define ldq_u(p)     (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
  28  #define uldq(a)             (((const struct unaligned_long *) (a))->l)
  29
  30 -#if GNUC_PREREQ(3,0)
  31 -/* Unfortunately, __builtin_prefetch is slightly buggy on Alpha. The
  32 -   defines here are kludged so we still get the right
  33 -   instruction. This needs to be adapted as soon as gcc is fixed.  */
  34 -# define prefetch(p)     __builtin_prefetch((p), 0, 1)
  35 -# define prefetch_en(p)  __builtin_prefetch((p), 1, 1)
  36 -# define prefetch_m(p)   __builtin_prefetch((p), 0, 0)
  37 -# define prefetch_men(p) __builtin_prefetch((p), 1, 0)
  38 -#else
  39 -# define prefetch(p)     asm volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
  40 -# define prefetch_en(p)  asm volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
  41 -# define prefetch_m(p)   asm volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
  42 -# define prefetch_men(p) asm volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
  43 -#endif
  44 -
  45  #if GNUC_PREREQ(3,3)
  46 +#define prefetch(p)     __builtin_prefetch((p), 0, 1) /* GCC builtin arguments: (addr, rw, locality); rw 0 = read, 1 = write */
  47 +#define prefetch_en(p)  __builtin_prefetch((p), 0, 0)
  48 +#define prefetch_m(p)   __builtin_prefetch((p), 1, 1)
  49 +#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
  50  #define cmpbge __builtin_alpha_cmpbge
  51  /* Avoid warnings.  */
  52  #define extql(a, b)    __builtin_alpha_extql(a, (uint64_t) (b))
  53 @@ -94,6 +82,24 @@
  54  #define amask  __builtin_alpha_amask
  55  #define implver        __builtin_alpha_implver
  56  #define rpcc   __builtin_alpha_rpcc
  57 +#else /* presumably the #else of the GNUC_PREREQ(3,3) test: same macro names, implemented with inline asm */
  58 +#define prefetch(p)     asm volatile("ldl $31,%0"  : : "m"(*(const char *) (p)) : "memory")
  59 +#define prefetch_en(p)  asm volatile("ldq $31,%0"  : : "m"(*(const char *) (p)) : "memory")
  60 +#define prefetch_m(p)   asm volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
  61 +#define prefetch_men(p) asm volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
  62 +#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  63 +#define extql(a, b)  ({ uint64_t __r; asm ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  64 +#define extwl(a, b)  ({ uint64_t __r; asm ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  65 +#define extqh(a, b)  ({ uint64_t __r; asm ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  66 +#define zap(a, b)    ({ uint64_t __r; asm ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  67 +#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  68 +#define amask(a)     ({ uint64_t __r; asm ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));          __r; })
  69 +#define implver()    ({ uint64_t __r; asm ("implver %0"         : "=r" (__r));                      __r; })
  70 +#define rpcc()      ({ uint64_t __r; asm volatile ("rpcc %0"   : "=r" (__r));                       __r; })
  71 +#endif /* end of the builtin-vs-inline-asm selection */
  72 +#define wh64(p) asm volatile("wh64 (%0)" : : "r"(p) : "memory") /* placed after the #endif, so defined for both branches; emits "wh64" on address p */
  73 +
  74 +#if GNUC_PREREQ(3,3) && defined(__alpha_max__)
  75  #define minub8 __builtin_alpha_minub8
  76  #define minsb8 __builtin_alpha_minsb8
  77  #define minuw4 __builtin_alpha_minuw4
  78 @@ -108,34 +114,24 @@
  79  #define unpkbl __builtin_alpha_unpkbl
  80  #define unpkbw __builtin_alpha_unpkbw
  81  #else
  82 -#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  83 -#define extql(a, b)  ({ uint64_t __r; asm ("extql   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  84 -#define extwl(a, b)  ({ uint64_t __r; asm ("extwl   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  85 -#define extqh(a, b)  ({ uint64_t __r; asm ("extqh   %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  86 -#define zap(a, b)    ({ uint64_t __r; asm ("zap     %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  87 -#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot  %r1,%2,%0"  : "=r" (__r) : "rJ"  (a), "rI" (b)); __r; })
  88 -#define amask(a)     ({ uint64_t __r; asm ("amask   %1,%0"      : "=r" (__r) : "rI"  (a));          __r; })
  89 -#define implver()    ({ uint64_t __r; asm ("implver %0"         : "=r" (__r));                      __r; })
  90 -#define rpcc()      ({ uint64_t __r; asm volatile ("rpcc %0"   : "=r" (__r));                       __r; })
  91 -#define minub8(a, b) ({ uint64_t __r; asm ("minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  92 -#define minsb8(a, b) ({ uint64_t __r; asm ("minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  93 -#define minuw4(a, b) ({ uint64_t __r; asm ("minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  94 -#define minsw4(a, b) ({ uint64_t __r; asm ("minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  95 -#define maxub8(a, b) ({ uint64_t __r; asm ("maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  96 -#define maxsb8(a, b) ({ uint64_t __r; asm ("maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  97 -#define maxuw4(a, b) ({ uint64_t __r; asm ("maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  98 -#define maxsw4(a, b) ({ uint64_t __r; asm ("maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
  99 -#define perr(a, b)   ({ uint64_t __r; asm ("perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
 100 -#define pklb(a)      ({ uint64_t __r; asm ("pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));          __r; })
 101 -#define pkwb(a)      ({ uint64_t __r; asm ("pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));          __r; })
 102 -#define unpkbl(a)    ({ uint64_t __r; asm ("unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));          __r; })
 103 -#define unpkbw(a)    ({ uint64_t __r; asm ("unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));          __r; })
 104 +#define minub8(a, b) ({ uint64_t __r; asm (".arch ev6; minub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; }) /* NOTE(review): ".arch ev6" presumably makes the assembler accept MVI opcodes when the build does not target an MVI CPU -- confirm */
 105 +#define minsb8(a, b) ({ uint64_t __r; asm (".arch ev6; minsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 106 +#define minuw4(a, b) ({ uint64_t __r; asm (".arch ev6; minuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 107 +#define minsw4(a, b) ({ uint64_t __r; asm (".arch ev6; minsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 108 +#define maxub8(a, b) ({ uint64_t __r; asm (".arch ev6; maxub8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 109 +#define maxsb8(a, b) ({ uint64_t __r; asm (".arch ev6; maxsb8  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 110 +#define maxuw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxuw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 111 +#define maxsw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxsw4  %r1,%2,%0"  : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
 112 +#define perr(a, b)   ({ uint64_t __r; asm (".arch ev6; perr    %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
 113 +#define pklb(a)      ({ uint64_t __r; asm (".arch ev6; pklb    %r1,%0"     : "=r" (__r) : "rJ"  (a));       __r; })
 114 +#define pkwb(a)      ({ uint64_t __r; asm (".arch ev6; pkwb    %r1,%0"     : "=r" (__r) : "rJ"  (a));       __r; })
 115 +#define unpkbl(a)    ({ uint64_t __r; asm (".arch ev6; unpkbl  %r1,%0"     : "=r" (__r) : "rJ"  (a));       __r; })
 116 +#define unpkbw(a)    ({ uint64_t __r; asm (".arch ev6; unpkbw  %r1,%0"     : "=r" (__r) : "rJ"  (a));       __r; })
 117  #endif
 118
 119  #elif defined(__DECC)          /* Digital/Compaq/hp "ccc" compiler */
 120
 121  #include <c_asm.h>
 122 -#define ASM_ACCEPT_MVI
 123  #define ldq_u(a)     asm ("ldq_u   %v0,0(%a0)", a)
 124  #define uldq(a)             (*(const __unaligned uint64_t *) (a))
 125  #define cmpbge(a, b) asm ("cmpbge  %a0,%a1,%v0", a, b)
 126 @@ -160,6 +156,7 @@
 127  #define pkwb(a)      asm ("pkwb    %a0,%v0", a)
 128  #define unpkbl(a)    asm ("unpkbl  %a0,%v0", a)
 129  #define unpkbw(a)    asm ("unpkbw  %a0,%v0", a)
 130 +#define wh64(a)      asm ("wh64    %a0", a) /* c_asm.h form of wh64 for the __DECC branch */
 131
 132  #else
 133  #error "Unknown compiler!"
 134 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/dsputil_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/dsputil_alpha.c
 135 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/dsputil_alpha.c     1970-01-01 01:00:00.000000000 +0100
 136 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/dsputil_alpha.c  2003-09-28 17:26:39.000000000 +0200
 137 @@ -0,0 +1,364 @@
 138 +/*
 139 + * Alpha optimized DSP utils
 140 + * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 141 + *
 142 + * This library is free software; you can redistribute it and/or
 143 + * modify it under the terms of the GNU Lesser General Public
 144 + * License as published by the Free Software Foundation; either
 145 + * version 2 of the License, or (at your option) any later version.
 146 + *
 147 + * This library is distributed in the hope that it will be useful,
 148 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 149 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 150 + * Lesser General Public License for more details.
 151 + *
 152 + * You should have received a copy of the GNU Lesser General Public
 153 + * License along with this library; if not, write to the Free Software
 154 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 155 + */
 156 +
 157 +#include "asm.h"
 158 +#include "../dsputil.h"
 159 +
 160 +extern void simple_idct_axp(DCTELEM *block); /* the three IDCT entry points are only declared here; dsputil_init_alpha() installs them */
 161 +extern void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
 162 +extern void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
 163 +
 164 +void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, /* prototypes below have no body in this file */
 165 +                        int line_size, int h);
 166 +void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 167 +                                int line_size);
 168 +void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
 169 +                                int line_size);
 170 +void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, /* global pointer, assigned c->put_pixels_clamped in dsputil_init_alpha() */
 171 +                                 int line_size);
 172 +void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels, /* global pointer, assigned c->add_pixels_clamped in dsputil_init_alpha() */
 173 +                                 int line_size);
 174 +
 175 +void get_pixels_mvi(DCTELEM *restrict block, /* the *_mvi routines are installed only when amask() reports MVI */
 176 +                    const uint8_t *restrict pixels, int line_size);
 177 +void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
 178 +                     int stride);
 179 +int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
 180 +int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size);
 181 +int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
 182 +int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
 183 +int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
 184 +
 185 +#if 0
 186 +/* These functions were the base for the optimized assembler routines,
 187 +   and remain here for documentation purposes.  */
 188 +static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
 189 +                                   int line_size)
 190 +{ /* compiled out by the enclosing #if 0; writes 8 rows of 8 clamped coefficients as bytes */
 191 +    int i = 8; /* rows left */
 192 +    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
 193 +
 194 +    do {
 195 +        uint64_t shorts0, shorts1;
 196 +
 197 +        shorts0 = ldq(block); /* first four 16-bit coefficients of the row */
 198 +        shorts0 = maxsw4(shorts0, 0); /* lower bound 0 per 16-bit lane */
 199 +        shorts0 = minsw4(shorts0, clampmask); /* upper bound 0x00ff per 16-bit lane */
 200 +        stl(pkwb(shorts0), pixels); /* pack the four words to bytes and store them */
 201 +
 202 +        shorts1 = ldq(block + 4); /* remaining four coefficients, same treatment */
 203 +        shorts1 = maxsw4(shorts1, 0);
 204 +        shorts1 = minsw4(shorts1, clampmask);
 205 +        stl(pkwb(shorts1), pixels + 4);
 206 +
 207 +        pixels += line_size;
 208 +        block += 8;
 209 +    } while (--i);
 210 +}
 211 +
 212 +void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
 213 +                            int line_size)
 214 +{ /* compiled out by the enclosing #if 0; adds 8x8 coefficients to the pixels already stored, clamps, writes back */
 215 +    int h = 8; /* rows left */
 216 +    /* Keep this function a leaf function by generating the constants
 217 +       manually (mainly for the hack value ;-).  */
 218 +    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
 219 +    uint64_t signmask  = zap(-1, 0x33);
 220 +    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
 221 +
 222 +    do {
 223 +        uint64_t shorts0, pix0, signs0;
 224 +        uint64_t shorts1, pix1, signs1;
 225 +
 226 +        shorts0 = ldq(block);
 227 +        shorts1 = ldq(block + 4);
 228 +
 229 +        pix0    = unpkbw(ldl(pixels)); /* four destination bytes widened to 16-bit lanes */
 230 +        /* Signed subword add (MMX paddw).  */
 231 +        signs0  = shorts0 & signmask; /* remember each lane's top bit */
 232 +        shorts0 &= ~signmask; /* clear it so the 64-bit add cannot carry into the next lane */
 233 +        shorts0 += pix0;
 234 +        shorts0 ^= signs0; /* fold the saved top bits back in */
 235 +        /* Clamp. */
 236 +        shorts0 = maxsw4(shorts0, 0);
 237 +        shorts0 = minsw4(shorts0, clampmask);
 238 +
 239 +        /* Next 4.  */
 240 +        pix1    = unpkbw(ldl(pixels + 4));
 241 +        signs1  = shorts1 & signmask;
 242 +        shorts1 &= ~signmask;
 243 +        shorts1 += pix1;
 244 +        shorts1 ^= signs1;
 245 +        shorts1 = maxsw4(shorts1, 0);
 246 +        shorts1 = minsw4(shorts1, clampmask);
 247 +
 248 +        stl(pkwb(shorts0), pixels);
 249 +        stl(pkwb(shorts1), pixels + 4);
 250 +
 251 +        pixels += line_size;
 252 +        block += 8;
 253 +    } while (--h);
 254 +}
 255 +#endif
 256 +
 257 +static void clear_blocks_axp(DCTELEM *blocks) { /* zeroes 6 * 64 DCTELEM entries starting at blocks */
 258 +    uint64_t *p = (uint64_t *) blocks;
 259 +    int n = sizeof(DCTELEM) * 6 * 64; /* bytes still to clear */
 260 +
 261 +    do {
 262 +        p[0] = 0;
 263 +        p[1] = 0;
 264 +        p[2] = 0;
 265 +        p[3] = 0;
 266 +        p[4] = 0;
 267 +        p[5] = 0;
 268 +        p[6] = 0;
 269 +        p[7] = 0;
 270 +        p += 8;
 271 +        n -= 8 * 8; /* eight 8-byte stores per pass */
 272 +    } while (n); /* ends only because the byte count is a multiple of 64 */
 273 +}
 274 +
 275 +static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b) /* per byte: (a + b) >> 1, i.e. average rounded down */
 276 +{
 277 +    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1); /* the 0xfe mask drops each byte's low bit so >> 1 cannot leak into the byte below */
 278 +}
 279 +
 280 +static inline uint64_t avg2(uint64_t a, uint64_t b) /* per byte: (a + b + 1) >> 1, i.e. average rounded up */
 281 +{
 282 +    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); /* the 0xfe mask drops each byte's low bit so >> 1 cannot leak into the byte below */
 283 +}
 284 +
 285 +#if 0
 286 +/* The XY2 routines basically utilize this scheme, but reuse parts in
 287 +   each iteration.  */
 288 +static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) /* compiled out by the enclosing #if 0 */
 289 +{
 290 +    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) /* r1: sum of the four inputs' upper six bits per byte, each already divided by 4 */
 291 +                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
 292 +                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
 293 +                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
 294 +    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03)) /* r2: (sum of the low two bits + 2) >> 2 per byte */
 295 +                    + (l2 & BYTE_VEC(0x03))
 296 +                    + (l3 & BYTE_VEC(0x03))
 297 +                    + (l4 & BYTE_VEC(0x03))
 298 +                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); /* final mask removes bits shifted in from the byte above */
 299 +    return r1 + r2;
 300 +}
 301 +#endif
 302 +
 303 +#define OP(LOAD, STORE)                         \
 304 +    do { /* h rows, 8 bytes per LOAD/STORE */   \
 305 +        STORE(LOAD(pixels), block);             \
 306 +        pixels += line_size;                    \
 307 +        block += line_size;                     \
 308 +    } while (--h)
 309 +
 310 +#define OP_X2(LOAD, STORE)                                      \
 311 +    do { /* pix2: pix1 >> 8 with pixels[8] as top byte */       \
 312 +        uint64_t pix1, pix2;                                    \
 313 +                                                                \
 314 +        pix1 = LOAD(pixels);                                    \
 315 +        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
 316 +        STORE(AVG2(pix1, pix2), block);                         \
 317 +        pixels += line_size;                                    \
 318 +        block += line_size;                                     \
 319 +    } while (--h)
 320 +
 321 +#define OP_Y2(LOAD, STORE)                      \
 322 +    do { /* AVG2 of each row and the next */    \
 323 +        uint64_t pix = LOAD(pixels);            \
 324 +        do {                                    \
 325 +            uint64_t next_pix;                  \
 326 +                                                \
 327 +            pixels += line_size;                \
 328 +            next_pix = LOAD(pixels);            \
 329 +            STORE(AVG2(pix, next_pix), block);  \
 330 +            block += line_size;                 \
 331 +            pix = next_pix; /* reuse the load */ \
 332 +        } while (--h);                          \
 333 +    } while (0)
 334 +
 335 +#define OP_XY2(LOAD, STORE)                                                 \
 336 +    do { /* *_l: sum of low 2 bits, *_h: sum of high 6 bits >> 2 */         \
 337 +        uint64_t pix1 = LOAD(pixels);                                       \
 338 +        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
 339 +        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
 340 +                       + (pix2 & BYTE_VEC(0x03));                           \
 341 +        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
 342 +                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
 343 +                                                                            \
 344 +        do {                                                                \
 345 +            uint64_t npix1, npix2;                                          \
 346 +            uint64_t npix_l, npix_h;                                        \
 347 +            uint64_t avg;                                                   \
 348 +                                                                            \
 349 +            pixels += line_size;                                            \
 350 +            npix1 = LOAD(pixels);                                           \
 351 +            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
 352 +            npix_l = (npix1 & BYTE_VEC(0x03))                               \
 353 +                   + (npix2 & BYTE_VEC(0x03));                              \
 354 +            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
 355 +                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
 356 +            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
 357 +                + pix_h + npix_h;                                           \
 358 +            STORE(avg, block);                                              \
 359 +                                                                            \
 360 +            block += line_size;                                             \
 361 +            pix_l = npix_l; /* this row's sums feed the next pass */        \
 362 +            pix_h = npix_h;                                                 \
 363 +        } while (--h);                                                      \
 364 +    } while (0)
 365 +
 366 +#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
 367 +static void OPNAME ## _pixels ## SUFF ## _axp                               \
 368 +        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
 369 +         int line_size, int h)                                              \
 370 +{                                                                           \
 371 +    if ((size_t) pixels & 0x7) { /* pixels not 8-byte aligned */            \
 372 +        OPKIND(uldq, STORE);                                                \
 373 +    } else {                                                                \
 374 +        OPKIND(ldq, STORE);                                                 \
 375 +    }                                                                       \
 376 +}                                                                           \
 377 +                                                                            \
 378 +static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
 379 +        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
 380 +         int line_size, int h)                                              \
 381 +{ /* 16 wide: left and right 8-wide halves */                               \
 382 +    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
 383 +    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
 384 +}
 385 +
 386 +#define PIXOP(OPNAME, STORE) /* 8 functions */  \
 387 +    MAKE_OP(OPNAME, ,     OP,     STORE)        \
 388 +    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
 389 +    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
 390 +    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
 391 +
 392 +/* Rounding primitives: put_* stores the result, avg_* stores AVG2(result, destination).  */
 393 +#define AVG2 avg2
 394 +#define AVG4 avg4 /* kept for symmetry; no macro in this file expands AVG4 */
 395 +#define AVG4_ROUNDER BYTE_VEC(0x02)
 396 +#define STORE(l, b) stq(l, b)
 397 +PIXOP(put, STORE) /* no ';': the expansion ends in a function body, and ISO C forbids a stray ';' at file scope */
 398 +
 399 +#undef STORE
 400 +#define STORE(l, b) stq(AVG2(l, ldq(b)), b) /* no trailing ';': every use site already supplies one */
 401 +PIXOP(avg, STORE)
 402 +
 403 +/* Non-rounding primitives.  */
 404 +#undef AVG2
 405 +#undef AVG4
 406 +#undef AVG4_ROUNDER
 407 +#undef STORE
 408 +#define AVG2 avg2_no_rnd
 409 +#define AVG4 avg4_no_rnd /* never expanded; no avg4_no_rnd is defined in this file */
 410 +#define AVG4_ROUNDER BYTE_VEC(0x01)
 411 +#define STORE(l, b) stq(l, b)
 412 +PIXOP(put_no_rnd, STORE)
 413 +
 414 +#undef STORE
 415 +#define STORE(l, b) stq(AVG2(l, ldq(b)), b)
 416 +PIXOP(avg_no_rnd, STORE)
 417 +
 418 +void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
 419 +                          int line_size, int h)
 420 +{ /* 16-wide variant: calls the 8-wide put_pixels_axp_asm on the left and right halves */
 421 +    put_pixels_axp_asm(block,     pixels,     line_size, h);
 422 +    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
 423 +}
 424 +
 425 +static int sad16x16_mvi(void *s, uint8_t *a, uint8_t *b, int stride)
 426 +{ /* adapter for c->sad[0]: s is unused, the rest is forwarded unchanged */
 427 +    return pix_abs16x16_mvi_asm(a, b, stride);
 428 +}
 429 +
 430 +static int sad8x8_mvi(void *s, uint8_t *a, uint8_t *b, int stride)
 431 +{ /* adapter for c->sad[1]: s is unused, the rest is forwarded unchanged */
 432 +    return pix_abs8x8_mvi(a, b, stride);
 433 +}
 434 +
 435 +void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
 436 +{ /* fills the function tables of c with the Alpha routines; avctx is not read here */
 437 +    c->put_pixels_tab[0][0] = put_pixels16_axp_asm; /* row [0]: 16-wide; columns 0..3 = plain, x2, y2, xy2 */
 438 +    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
 439 +    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
 440 +    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
 441 +
 442 +    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm; /* same routine as the rounding table; presumably fine because a plain copy averages nothing */
 443 +    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
 444 +    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
 445 +    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
 446 +
 447 +    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
 448 +    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
 449 +    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
 450 +    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
 451 +
 452 +    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
 453 +    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
 454 +    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
 455 +    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;
 456 +
 457 +    c->put_pixels_tab[1][0] = put_pixels_axp_asm; /* row [1]: 8-wide versions of the same four variants */
 458 +    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
 459 +    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
 460 +    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
 461 +
 462 +    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
 463 +    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
 464 +    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
 465 +    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
 466 +
 467 +    c->avg_pixels_tab[1][0] = avg_pixels_axp;
 468 +    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
 469 +    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
 470 +    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
 471 +
 472 +    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
 473 +    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
 474 +    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
 475 +    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;
 476 +
 477 +    c->clear_blocks = clear_blocks_axp;
 478 +
 479 +    /* amask clears all bits that correspond to present features.  */
 480 +    if (amask(AMASK_MVI) == 0) { /* result 0 means the MVI bit was cleared, i.e. MVI is available */
 481 +        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
 482 +        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
 483 +
 484 +        c->get_pixels       = get_pixels_mvi;
 485 +        c->diff_pixels      = diff_pixels_mvi;
 486 +        c->sad[0]           = sad16x16_mvi;
 487 +        c->sad[1]           = sad8x8_mvi;
 488 +        c->pix_abs8x8       = pix_abs8x8_mvi;
 489 +        c->pix_abs16x16     = pix_abs16x16_mvi_asm;
 490 +        c->pix_abs16x16_x2  = pix_abs16x16_x2_mvi;
 491 +        c->pix_abs16x16_y2  = pix_abs16x16_y2_mvi;
 492 +        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mvi;
 493 +    }
 494 +
 495 +    put_pixels_clamped_axp_p = c->put_pixels_clamped; /* the MVI routine if installed above, otherwise whatever c held on entry */
 496 +    add_pixels_clamped_axp_p = c->add_pixels_clamped;
 497 +
 498 +    c->idct_put = simple_idct_put_axp;
 499 +    c->idct_add = simple_idct_add_axp;
 500 +    c->idct = simple_idct_axp;
 501 +}
 502 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/Makefile.am avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/Makefile.am
 503 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/Makefile.am 2003-05-25 23:07:42.000000000 +0200
 504 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/Makefile.am      2003-11-13 23:51:25.426454176 +0100
 505 @@ -7,10 +7,12 @@
 506   dsputil_alpha.c \
 507   motion_est_alpha.c \
 508   mpegvideo_alpha.c \
 509 - simple_idct_alpha.c
 510 + simple_idct_alpha.c \
 511 + dsputil_alpha_asm.S \
 512 + motion_est_mvi_asm.S
 513  endif
 514
 515 -noinst_HEADERS = asm.h dsputil_alpha_asm.S regdef.h motion_est_mvi_asm.S
 516 +noinst_HEADERS = asm.h regdef.h
 517
 518  libavcodecalpha_la_SOURCES = $(ALPHA_SRC)
 519
 520 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/motion_est_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/motion_est_alpha.c
 521 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/motion_est_alpha.c  1970-01-01 01:00:00.000000000 +0100
 522 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/motion_est_alpha.c       2003-09-28 17:26:39.000000000 +0200
 523 @@ -0,0 +1,347 @@
 524 +/*
 525 + * Alpha optimized DSP utils
 526 + * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 527 + *
 528 + * This library is free software; you can redistribute it and/or
 529 + * modify it under the terms of the GNU Lesser General Public
 530 + * License as published by the Free Software Foundation; either
 531 + * version 2 of the License, or (at your option) any later version.
 532 + *
 533 + * This library is distributed in the hope that it will be useful,
 534 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 535 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 536 + * Lesser General Public License for more details.
 537 + *
 538 + * You should have received a copy of the GNU Lesser General Public
 539 + * License along with this library; if not, write to the Free Software
 540 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 541 + */
 542 +
 543 +#include "asm.h"
 544 +#include "../dsputil.h"
 545 +
 546 +void get_pixels_mvi(DCTELEM *restrict block,
 547 +                    const uint8_t *restrict pixels, int line_size)
 548 +{ /* widens 8 rows of 8 bytes from pixels into 64 consecutive entries of block */
 549 +    int h = 8; /* rows left */
 550 +
 551 +    do {
 552 +        uint64_t p;
 553 +
 554 +        p = ldq(pixels); /* NOTE(review): only ldq is used, so pixels is presumably 8-byte aligned -- confirm with callers */
 555 +        stq(unpkbw(p),       block); /* low four bytes -> block[0..3] */
 556 +        stq(unpkbw(p >> 32), block + 4); /* high four bytes -> block[4..7] */
 557 +
 558 +        pixels += line_size;
 559 +        block += 8;
 560 +    } while (--h);
 561 +}
 562 +
 563 +void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
 564 +                     int stride) { /* stores s1 - s2 for an 8x8 area as 64 signed 16-bit entries of block */
 565 +    int h = 8; /* rows left */
 566 +    uint64_t mask = 0x4040;
 567 +
 568 +    mask |= mask << 16;
 569 +    mask |= mask << 32; /* mask is now 0x40 in every byte */
 570 +    do {
 571 +        uint64_t x, y, c, d, a;
 572 +        uint64_t signs;
 573 +
 574 +        x = ldq(s1);
 575 +        y = ldq(s2);
 576 +        c = cmpbge(x, y); /* bit i set when byte i of x >= byte i of y */
 577 +        d = x - y; /* one 64-bit subtract: lanes with x < y borrow from the byte above */
 578 +        a = zap(mask, c);       /* We use 0x4040404040404040 here...  */
 579 +        d += 4 * a;             /* ...so we can use s4addq here.      */
 580 +        signs = zap(-1, c); /* 0xff in every byte lane where x < y */
 581 +
 582 +        stq(unpkbw(d)       | (unpkbw(signs)       << 8), block); /* low byte = difference, high byte = sign extension */
 583 +        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
 584 +
 585 +        s1 += stride;
 586 +        s2 += stride;
 587 +        block += 8;
 588 +    } while (--h);
 589 +}
 590 +
 591 +static inline uint64_t avg2(uint64_t a, uint64_t b) /* per byte: (a + b + 1) >> 1, i.e. average rounded up */
 592 +{
 593 +    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1); /* the 0xfe mask drops each byte's low bit so >> 1 cannot leak into the byte below */
 594 +}
 595 +
 596 +static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4) /* per byte: (l1 + l2 + l3 + l4 + 2) >> 2 */
 597 +{
 598 +    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2) /* r1: sum of the four inputs' upper six bits per byte, each already divided by 4 */
 599 +                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
 600 +                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
 601 +                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
 602 +    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03)) /* r2: (sum of the low two bits + 2) >> 2 per byte */
 603 +                    + (l2 & BYTE_VEC(0x03))
 604 +                    + (l3 & BYTE_VEC(0x03))
 605 +                    + (l4 & BYTE_VEC(0x03))
 606 +                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03); /* final mask removes bits shifted in from the byte above */
 607 +    return r1 + r2;
 608 +}
 609 +
 610 +int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
 611 +{ /* returns the sum over 8 rows of perr() applied to 8 bytes of pix1 and pix2 */
 612 +    int result = 0;
 613 +    int h = 8; /* rows left */
 614 +
 615 +    if ((size_t) pix2 & 0x7) {
 616 +        /* works only when pix2 is actually unaligned -- NOTE(review): wording matches the ldq_u/extql variant; confirm it applies to uldq */
 617 +        do {                    /* 8 pixels at a time */
 618 +            uint64_t p1, p2;
 619 +
 620 +            p1  = ldq(pix1); /* pix1 is read with ldq in both branches */
 621 +            p2  = uldq(pix2);
 622 +            result += perr(p1, p2);
 623 +
 624 +            pix1 += line_size;
 625 +            pix2 += line_size;
 626 +        } while (--h);
 627 +    } else { /* pix2 is 8-byte aligned: plain ldq for both */
 628 +        do {
 629 +            uint64_t p1, p2;
 630 +
 631 +            p1 = ldq(pix1);
 632 +            p2 = ldq(pix2);
 633 +            result += perr(p1, p2);
 634 +
 635 +            pix1 += line_size;
 636 +            pix2 += line_size;
 637 +        } while (--h);
 638 +    }
 639 +
 640 +    return result;
 641 +}
 642 +
 643 +#if 0                          /* now done in assembly */
 644 +int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
 645 +{ /* compiled out by the enclosing #if 0; sums perr() over 16 rows of 16 bytes */
 646 +    int result = 0;
 647 +    int h = 16; /* rows left */
 648 +
 649 +    if ((size_t) pix2 & 0x7) {
 650 +        /* works only when pix2 is actually unaligned */
 651 +        do {                    /* 16 pixels at a time */
 652 +            uint64_t p1_l, p1_r, p2_l, p2_r;
 653 +            uint64_t t;
 654 +
 655 +            p1_l  = ldq(pix1);
 656 +            p1_r  = ldq(pix1 + 8);
 657 +            t     = ldq_u(pix2 + 8); /* middle aligned quadword, used by both halves */
 658 +            p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2); /* left 8 bytes assembled from two aligned loads */
 659 +            p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2); /* right 8 bytes, same scheme */
 660 +            pix1 += line_size;
 661 +            pix2 += line_size;
 662 +
 663 +            result += perr(p1_l, p2_l)
 664 +                    + perr(p1_r, p2_r);
 665 +        } while (--h);
 666 +    } else { /* pix2 is 8-byte aligned: plain ldq for all four loads */
 667 +        do {
 668 +            uint64_t p1_l, p1_r, p2_l, p2_r;
 669 +
 670 +            p1_l = ldq(pix1);
 671 +            p1_r = ldq(pix1 + 8);
 672 +            p2_l = ldq(pix2);
 673 +            p2_r = ldq(pix2 + 8);
 674 +            pix1 += line_size;
 675 +            pix2 += line_size;
 676 +
 677 +            result += perr(p1_l, p2_l)
 678 +                    + perr(p1_r, p2_r);
 679 +        } while (--h);
 680 +    }
 681 +
 682 +    return result;
 683 +}
 684 +#endif
 685 +
 686 +int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
 687 +{   /* Accumulates perr() of 16 rows x 16 bytes of pix1 against avg2() of pix2 and pix2 shifted by one byte. */
 688 +    int result = 0;
 689 +    int h = 16;
 690 +    uint64_t disalign = (size_t) pix2 & 0x7;    /* byte offset of pix2 inside its aligned quadword; computed once for all rows */
 691 +
 692 +    switch (disalign) {
 693 +    case 0:
 694 +        do {
 695 +            uint64_t p1_l, p1_r, p2_l, p2_r;
 696 +            uint64_t l, r;
 697 +
 698 +            p1_l = ldq(pix1);
 699 +            p1_r = ldq(pix1 + 8);
 700 +            l    = ldq(pix2);
 701 +            r    = ldq(pix2 + 8);
 702 +            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));    /* l moved down one byte, top byte refilled from the low byte of r */
 703 +            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56)); /* the 17th byte of the row supplies the last neighbour */
 704 +            pix1 += line_size;
 705 +            pix2 += line_size;
 706 +
 707 +            result += perr(p1_l, p2_l)
 708 +                    + perr(p1_r, p2_r);
 709 +        } while (--h);
 710 +        break;
 711 +    case 7:
 712 +        /* |.......l|lllllllr|rrrrrrr*|
 713 +           This case is special because disalign1 would be 8, which
 714 +           gets treated as 0 by extqh.  At least it is a bit faster
 715 +           that way :)  */
 716 +        do {
 717 +            uint64_t p1_l, p1_r, p2_l, p2_r;
 718 +            uint64_t l, m, r;
 719 +
 720 +            p1_l = ldq(pix1);
 721 +            p1_r = ldq(pix1 + 8);
 722 +            l     = ldq_u(pix2);
 723 +            m     = ldq_u(pix2 + 8);
 724 +            r     = ldq_u(pix2 + 16);
 725 +            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign), m);   /* m itself is already the one-byte-shifted left half */
 726 +            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign), r);   /* likewise r for the right half */
 727 +            pix1 += line_size;
 728 +            pix2 += line_size;
 729 +
 730 +            result += perr(p1_l, p2_l)
 731 +                    + perr(p1_r, p2_r);
 732 +        } while (--h);
 733 +        break;
 734 +    default:
 735 +        do {
 736 +            uint64_t disalign1 = disalign + 1;  /* extraction offset for the one-byte-shifted copy */
 737 +            uint64_t p1_l, p1_r, p2_l, p2_r;
 738 +            uint64_t l, m, r;
 739 +
 740 +            p1_l  = ldq(pix1);
 741 +            p1_r  = ldq(pix1 + 8);
 742 +            l     = ldq_u(pix2);
 743 +            m     = ldq_u(pix2 + 8);
 744 +            r     = ldq_u(pix2 + 16);
 745 +            p2_l  = avg2(extql(l, disalign) | extqh(m, disalign),
 746 +                         extql(l, disalign1) | extqh(m, disalign1));
 747 +            p2_r  = avg2(extql(m, disalign) | extqh(r, disalign),
 748 +                         extql(m, disalign1) | extqh(r, disalign1));
 749 +            pix1 += line_size;
 750 +            pix2 += line_size;
 751 +
 752 +            result += perr(p1_l, p2_l)
 753 +                    + perr(p1_r, p2_r);
 754 +        } while (--h);
 755 +        break;
 756 +    }
 757 +    return result;
 758 +}
 759 +
 760 +int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
 761 +{   /* Accumulates perr() of 16 rows x 16 bytes of pix1 against avg2() of each pix2 row and the row below it. */
 762 +    int result = 0;
 763 +    int h = 16;
 764 +
 765 +    if ((size_t) pix2 & 0x7) {
 766 +        uint64_t t, p2_l, p2_r;
 767 +        t     = ldq_u(pix2 + 8);
 768 +        p2_l  = extql(ldq_u(pix2), pix2) | extqh(t, pix2);      /* first row of pix2, loaded before the loop */
 769 +        p2_r  = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
 770 +
 771 +        do {
 772 +            uint64_t p1_l, p1_r, np2_l, np2_r;
 773 +            uint64_t t;         /* shadows the outer t, which is not used again */
 774 +
 775 +            p1_l  = ldq(pix1);
 776 +            p1_r  = ldq(pix1 + 8);
 777 +            pix2 += line_size;  /* step to the next row before loading it */
 778 +            t     = ldq_u(pix2 + 8);
 779 +            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
 780 +            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
 781 +
 782 +            result += perr(p1_l, avg2(p2_l, np2_l))
 783 +                    + perr(p1_r, avg2(p2_r, np2_r));
 784 +
 785 +            pix1 += line_size;
 786 +            p2_l  = np2_l;      /* the next row becomes the current row, so each pix2 row is loaded once */
 787 +            p2_r  = np2_r;
 788 +
 789 +        } while (--h);
 790 +    } else {
 791 +        uint64_t p2_l, p2_r;
 792 +        p2_l = ldq(pix2);
 793 +        p2_r = ldq(pix2 + 8);
 794 +        do {
 795 +            uint64_t p1_l, p1_r, np2_l, np2_r;
 796 +
 797 +            p1_l = ldq(pix1);
 798 +            p1_r = ldq(pix1 + 8);
 799 +            pix2 += line_size;
 800 +            np2_l = ldq(pix2);
 801 +            np2_r = ldq(pix2 + 8);
 802 +
 803 +            result += perr(p1_l, avg2(p2_l, np2_l))
 804 +                    + perr(p1_r, avg2(p2_r, np2_r));
 805 +
 806 +            pix1 += line_size;
 807 +            p2_l  = np2_l;
 808 +            p2_r  = np2_r;
 809 +        } while (--h);
 810 +    }
 811 +    return result;
 812 +}
 813 +
 814 +int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
 815 +{   /* Accumulates perr() of 16 rows x 16 bytes of pix1 against avg4() of each pix2 byte, its right neighbour and both bytes one row below. */
 816 +    int result = 0;
 817 +    int h = 16;
 818 +
 819 +    uint64_t p1_l, p1_r;
 820 +    uint64_t p2_l, p2_r, p2_x;  /* p2_x: 17th byte of the pix2 row, already placed in the top byte */
 821 +
 822 +    p1_l = ldq(pix1);
 823 +    p1_r = ldq(pix1 + 8);
 824 +
 825 +    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
 826 +        p2_l = uldq(pix2);
 827 +        p2_r = uldq(pix2 + 8);
 828 +        p2_x = (uint64_t) pix2[16] << 56;
 829 +    } else {
 830 +        p2_l = ldq(pix2);
 831 +        p2_r = ldq(pix2 + 8);
 832 +        p2_x = ldq(pix2 + 16) << 56;    /* keeps only the low byte of the next quadword, moved to the top byte */
 833 +    }
 834 +
 835 +    do {
 836 +        uint64_t np1_l, np1_r;
 837 +        uint64_t np2_l, np2_r, np2_x;
 838 +
 839 +        pix1 += line_size;
 840 +        pix2 += line_size;
 841 +        /* NOTE(review): on the last iteration np1_* is loaded from the row after the 16th pix1 row and never used. */
 842 +        np1_l = ldq(pix1);
 843 +        np1_r = ldq(pix1 + 8);
 844 +
 845 +        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
 846 +            np2_l = uldq(pix2);
 847 +            np2_r = uldq(pix2 + 8);
 848 +            np2_x = (uint64_t) pix2[16] << 56;
 849 +        } else {
 850 +            np2_l = ldq(pix2);
 851 +            np2_r = ldq(pix2 + 8);
 852 +            np2_x = ldq(pix2 + 16) << 56;
 853 +        }
 854 +
 855 +        result += perr(p1_l,
 856 +                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
 857 +                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
 858 +                + perr(p1_r,
 859 +                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
 860 +                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
 861 +
 862 +        p1_l = np1_l;           /* rotate the look-ahead rows into the current-row variables */
 863 +        p1_r = np1_r;
 864 +        p2_l = np2_l;
 865 +        p2_r = np2_r;
 866 +        p2_x = np2_x;
 867 +    } while (--h);
 868 +
 869 +    return result;
 870 +}
 871 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/mpegvideo_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/mpegvideo_alpha.c
 872 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/mpegvideo_alpha.c   1970-01-01 01:00:00.000000000 +0100
 873 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/mpegvideo_alpha.c        2003-09-28 17:26:39.000000000 +0200
 874 @@ -0,0 +1,96 @@
 875 +/*
 876 + * Alpha optimized DSP utils
 877 + * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 878 + *
 879 + * This library is free software; you can redistribute it and/or
 880 + * modify it under the terms of the GNU Lesser General Public
 881 + * License as published by the Free Software Foundation; either
 882 + * version 2 of the License, or (at your option) any later version.
 883 + *
 884 + * This library is distributed in the hope that it will be useful,
 885 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 886 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 887 + * Lesser General Public License for more details.
 888 + *
 889 + * You should have received a copy of the GNU Lesser General Public
 890 + * License along with this library; if not, write to the Free Software
 891 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 892 + */
 893 +
 894 +#include "asm.h"
 895 +#include "../dsputil.h"
 896 +#include "../mpegvideo.h"
 897 +
 898 +static void dct_unquantize_h263_axp(MpegEncContext *s, DCTELEM *block,
 899 +                                    int n, int qscale)
 900 +{
 901 +    int i, n_coeffs;
 902 +    uint64_t qmul, qadd;
 903 +    uint64_t correction;
 904 +    DCTELEM *orig_block = block;
 905 +    DCTELEM block0;
 906 +
 907 +    qadd = WORD_VEC((qscale - 1) | 1);
 908 +    qmul = qscale << 1;
 909 +    /* This mask kills spill from negative subwords to the next subword.  */
 910 +    correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */
 911 +
 912 +    if (s->mb_intra) {
 913 +        if (!s->h263_aic) {
 914 +            if (n < 4)
 915 +                block0 = block[0] * s->y_dc_scale;
 916 +            else
 917 +                block0 = block[0] * s->c_dc_scale;
 918 +        } else {
 919 +           qadd = 0;
 920 +       }
 921 +        n_coeffs = 63; // does not always use zigzag table
 922 +    } else {
 923 +        n_coeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
 924 +    }
 925 +
 926 +    for(i = 0; i <= n_coeffs; block += 4, i += 4) {
 927 +        uint64_t levels, negmask, zeros, add;
 928 +
 929 +        levels = ldq(block);
 930 +        if (levels == 0)
 931 +            continue;
 932 +
 933 +#ifdef __alpha_max__
 934 +        /* I don't think the speed difference justifies runtime
 935 +           detection.  */
 936 +        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
 937 +        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
 938 +#else
 939 +        negmask = cmpbge(WORD_VEC(0x7fff), levels);
 940 +        negmask &= (negmask >> 1) | (1 << 7);
 941 +        negmask = zap(-1, negmask);
 942 +#endif
 943 +
 944 +        zeros = cmpbge(0, levels);
 945 +        zeros &= zeros >> 1;
 946 +        /* zeros |= zeros << 1 is not needed since qadd <= 255, so
 947 +           zapping the lower byte suffices.  */
 948 +
 949 +        levels *= qmul;
 950 +        levels -= correction & (negmask << 16);
 951 +
 952 +        /* Negate qadd for negative levels.  */
 953 +        add = qadd ^ negmask;
 954 +        add += WORD_VEC(0x0001) & negmask;
 955 +        /* Set qadd to 0 for levels == 0.  */
 956 +        add = zap(add, zeros);
 957 +
 958 +        levels += add;
 959 +
 960 +        stq(levels, block);
 961 +    }
 962 +
 963 +    if (s->mb_intra && !s->h263_aic)
 964 +        orig_block[0] = block0;
 965 +}
 966 +
 967 +void MPV_common_init_axp(MpegEncContext *s)
 968 +{   /* Installs the Alpha dequantizer defined in this file into the context's dct_unquantize_h263 hook. */
 969 +    s->dct_unquantize_h263 = dct_unquantize_h263_axp;
 970 +}
 971 diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/simple_idct_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/simple_idct_alpha.c
 972 --- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/simple_idct_alpha.c 1970-01-01 01:00:00.000000000 +0100
 973 +++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/simple_idct_alpha.c      2003-09-28 17:26:39.000000000 +0200
 974 @@ -0,0 +1,311 @@
 975 +/*
 976 + * Simple IDCT (Alpha optimized)
 977 + *
 978 + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 979 + *
 980 + * This library is free software; you can redistribute it and/or
 981 + * modify it under the terms of the GNU Lesser General Public
 982 + * License as published by the Free Software Foundation; either
 983 + * version 2 of the License, or (at your option) any later version.
 984 + *
 985 + * This library is distributed in the hope that it will be useful,
 986 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 987 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 988 + * Lesser General Public License for more details.
 989 + *
 990 + * You should have received a copy of the GNU Lesser General Public
 991 + * License along with this library; if not, write to the Free Software
 992 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 993 + *
 994 + * based upon some commented-out C code from mpeg2dec (idct_mmx.c
 995 + * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)
 996 + *
 997 + * Alpha optimizations by Måns Rullgård <mru@users.sourceforge.net>
 998 + *                     and Falk Hueffner <falk@debian.org>
 999 + */
1000 +
1001 +#include "asm.h"
1002 +#include "../dsputil.h"
1003 +
1004 +extern void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
1005 +                                        int line_size);
1006 +extern void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
1007 +                                        int line_size);
1008 +
1009 +// cos(i * M_PI / 16) * sqrt(2) * (1 << 14)
1010 +// W4 is actually exactly 16384, but using 16383 works around
1011 +// accumulating rounding errors for some encoders
1012 +#define W1 ((int_fast32_t) 22725)
1013 +#define W2 ((int_fast32_t) 21407)
1014 +#define W3 ((int_fast32_t) 19266)
1015 +#define W4 ((int_fast32_t) 16383)
1016 +#define W5 ((int_fast32_t) 12873)
1017 +#define W6 ((int_fast32_t)  8867)
1018 +#define W7 ((int_fast32_t)  4520)
1019 +#define ROW_SHIFT 11
1020 +#define COL_SHIFT 20
1021 +
1022 +/* In-place row pass over 8 coefficients.  Returns 0: all entries 0, 1: only first entry nonzero, 2: otherwise  */
1023 +static inline int idct_row(DCTELEM *row)
1024 +{
1025 +    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3, t;
1026 +    uint64_t l, r, t2;
1027 +    l = ldq(row);               /* row[0..3] as one quadword */
1028 +    r = ldq(row + 4);           /* row[4..7] as one quadword */
1029 +
1030 +    if (l == 0 && r == 0)
1031 +        return 0;
1032 +
1033 +    a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1));    /* row[0] term plus the rounding bias for the final shift */
1034 +
1035 +    if (((l & ~0xffffUL) | r) == 0) {   /* only row[0] is nonzero: every output equals the scaled DC */
1036 +        a0 >>= ROW_SHIFT;
1037 +        t2 = (uint16_t) a0;
1038 +        t2 |= t2 << 16;         /* replicate the 16-bit value into all four lanes */
1039 +        t2 |= t2 << 32;
1040 +
1041 +        stq(t2, row);
1042 +        stq(t2, row + 4);
1043 +        return 1;
1044 +    }
1045 +
1046 +    a1 = a0;
1047 +    a2 = a0;
1048 +    a3 = a0;
1049 +
1050 +    t = extwl(l, 4);            /* row[2] */
1051 +    if (t != 0) {
1052 +        t = sextw(t);           /* sign-extend the extracted 16-bit word */
1053 +        a0 += W2 * t;
1054 +        a1 += W6 * t;
1055 +        a2 -= W6 * t;
1056 +        a3 -= W2 * t;
1057 +    }
1058 +
1059 +    t = extwl(r, 0);            /* row[4] */
1060 +    if (t != 0) {
1061 +        t = sextw(t);
1062 +        a0 += W4 * t;
1063 +        a1 -= W4 * t;
1064 +        a2 -= W4 * t;
1065 +        a3 += W4 * t;
1066 +    }
1067 +
1068 +    t = extwl(r, 4);            /* row[6] */
1069 +    if (t != 0) {
1070 +        t = sextw(t);
1071 +        a0 += W6 * t;
1072 +        a1 -= W2 * t;
1073 +        a2 += W2 * t;
1074 +        a3 -= W6 * t;
1075 +    }
1076 +
1077 +    t = extwl(l, 2);            /* row[1] */
1078 +    if (t != 0) {
1079 +        t = sextw(t);
1080 +        b0 = W1 * t;
1081 +        b1 = W3 * t;
1082 +        b2 = W5 * t;
1083 +        b3 = W7 * t;
1084 +    } else {
1085 +        b0 = 0;
1086 +        b1 = 0;
1087 +        b2 = 0;
1088 +        b3 = 0;
1089 +    }
1090 +
1091 +    t = extwl(l, 6);            /* row[3] */
1092 +    if (t) {
1093 +        t = sextw(t);
1094 +        b0 += W3 * t;
1095 +        b1 -= W7 * t;
1096 +        b2 -= W1 * t;
1097 +        b3 -= W5 * t;
1098 +    }
1099 +
1100 +
1101 +    t = extwl(r, 2);            /* row[5] */
1102 +    if (t) {
1103 +        t = sextw(t);
1104 +        b0 += W5 * t;
1105 +        b1 -= W1 * t;
1106 +        b2 += W7 * t;
1107 +        b3 += W3 * t;
1108 +    }
1109 +
1110 +    t = extwl(r, 6);            /* row[7] */
1111 +    if (t) {
1112 +        t = sextw(t);
1113 +        b0 += W7 * t;
1114 +        b1 -= W5 * t;
1115 +        b2 += W3 * t;
1116 +        b3 -= W1 * t;
1117 +    }
1118 +    /* Butterfly: even-index part a* plus/minus odd-index part b*, scaled back by ROW_SHIFT. */
1119 +    row[0] = (a0 + b0) >> ROW_SHIFT;
1120 +    row[1] = (a1 + b1) >> ROW_SHIFT;
1121 +    row[2] = (a2 + b2) >> ROW_SHIFT;
1122 +    row[3] = (a3 + b3) >> ROW_SHIFT;
1123 +    row[4] = (a3 - b3) >> ROW_SHIFT;
1124 +    row[5] = (a2 - b2) >> ROW_SHIFT;
1125 +    row[6] = (a1 - b1) >> ROW_SHIFT;
1126 +    row[7] = (a0 - b0) >> ROW_SHIFT;
1127 +
1128 +    return 2;
1129 +}
1130 +
1131 +static inline void idct_col(DCTELEM *col)
1132 +{   /* In-place column pass over the 8 entries col[0], col[8], ..., col[56]. */
1133 +    int_fast32_t a0, a1, a2, a3, b0, b1, b2, b3;
1134 +
1135 +    col[0] += (1 << (COL_SHIFT - 1)) / W4;  /* rounding bias, pre-divided by W4 because col[0] is multiplied by W4 next */
1136 +
1137 +    a0 = W4 * col[8 * 0];
1138 +    a1 = W4 * col[8 * 0];
1139 +    a2 = W4 * col[8 * 0];
1140 +    a3 = W4 * col[8 * 0];
1141 +
1142 +    if (col[8 * 2]) {           /* zero inputs are skipped to save the multiplications */
1143 +        a0 += W2 * col[8 * 2];
1144 +        a1 += W6 * col[8 * 2];
1145 +        a2 -= W6 * col[8 * 2];
1146 +        a3 -= W2 * col[8 * 2];
1147 +    }
1148 +
1149 +    if (col[8 * 4]) {
1150 +        a0 += W4 * col[8 * 4];
1151 +        a1 -= W4 * col[8 * 4];
1152 +        a2 -= W4 * col[8 * 4];
1153 +        a3 += W4 * col[8 * 4];
1154 +    }
1155 +
1156 +    if (col[8 * 6]) {
1157 +        a0 += W6 * col[8 * 6];
1158 +        a1 -= W2 * col[8 * 6];
1159 +        a2 += W2 * col[8 * 6];
1160 +        a3 -= W6 * col[8 * 6];
1161 +    }
1162 +
1163 +    if (col[8 * 1]) {
1164 +        b0 = W1 * col[8 * 1];
1165 +        b1 = W3 * col[8 * 1];
1166 +        b2 = W5 * col[8 * 1];
1167 +        b3 = W7 * col[8 * 1];
1168 +    } else {
1169 +        b0 = 0;
1170 +        b1 = 0;
1171 +        b2 = 0;
1172 +        b3 = 0;
1173 +    }
1174 +
1175 +    if (col[8 * 3]) {
1176 +        b0 += W3 * col[8 * 3];
1177 +        b1 -= W7 * col[8 * 3];
1178 +        b2 -= W1 * col[8 * 3];
1179 +        b3 -= W5 * col[8 * 3];
1180 +    }
1181 +
1182 +    if (col[8 * 5]) {
1183 +        b0 += W5 * col[8 * 5];
1184 +        b1 -= W1 * col[8 * 5];
1185 +        b2 += W7 * col[8 * 5];
1186 +        b3 += W3 * col[8 * 5];
1187 +    }
1188 +
1189 +    if (col[8 * 7]) {
1190 +        b0 += W7 * col[8 * 7];
1191 +        b1 -= W5 * col[8 * 7];
1192 +        b2 += W3 * col[8 * 7];
1193 +        b3 -= W1 * col[8 * 7];
1194 +    }
1195 +    /* Butterfly: even-index part a* plus/minus odd-index part b*, scaled back by COL_SHIFT. */
1196 +    col[8 * 0] = (a0 + b0) >> COL_SHIFT;
1197 +    col[8 * 7] = (a0 - b0) >> COL_SHIFT;
1198 +    col[8 * 1] = (a1 + b1) >> COL_SHIFT;
1199 +    col[8 * 6] = (a1 - b1) >> COL_SHIFT;
1200 +    col[8 * 2] = (a2 + b2) >> COL_SHIFT;
1201 +    col[8 * 5] = (a2 - b2) >> COL_SHIFT;
1202 +    col[8 * 3] = (a3 + b3) >> COL_SHIFT;
1203 +    col[8 * 4] = (a3 - b3) >> COL_SHIFT;
1204 +}
1205 +
1206 +/* If all rows but the first one are zero after row transformation,
1207 +   all rows will be identical after column transformation.  */
1208 +static inline void idct_col2(DCTELEM *col)
1209 +{   /* Transforms the eight entries of row 0 in place, then copies row 0 over rows 1..7. */
1210 +    int i;
1211 +    uint64_t l, r;
1212 +    uint64_t *lcol = (uint64_t *) col;  /* quadword view of the block, taken before col is advanced below */
1213 +
1214 +    for (i = 0; i < 8; ++i) {
1215 +        int_fast32_t a0 = col[0] + (1 << (COL_SHIFT - 1)) / W4;    /* same pre-divided rounding bias as idct_col() */
1216 +
1217 +        a0 *= W4;
1218 +        col[0] = a0 >> COL_SHIFT;
1219 +        ++col;
1220 +    }
1221 +
1222 +    l = lcol[0];                /* row 0, entries 0..3 */
1223 +    r = lcol[1];                /* row 0, entries 4..7 */
1224 +    lcol[ 2] = l; lcol[ 3] = r;
1225 +    lcol[ 4] = l; lcol[ 5] = r;
1226 +    lcol[ 6] = l; lcol[ 7] = r;
1227 +    lcol[ 8] = l; lcol[ 9] = r;
1228 +    lcol[10] = l; lcol[11] = r;
1229 +    lcol[12] = l; lcol[13] = r;
1230 +    lcol[14] = l; lcol[15] = r;
1231 +}
1232 +
1233 +void simple_idct_axp(DCTELEM *block)
1234 +{   /* In-place IDCT of a 64-entry block: row pass on all 8 rows, then the cheapest column pass the row results allow. */
1235 +
1236 +    int i;
1237 +    int rowsZero = 1;           /* all rows except row 0 zero */
1238 +    int rowsConstant = 1;       /* all rows consist of a constant value */
1239 +
1240 +    for (i = 0; i < 8; i++) {
1241 +        int sparseness = idct_row(block + 8 * i);   /* 0, 1 or 2; see idct_row() */
1242 +
1243 +        if (i > 0 && sparseness > 0)
1244 +            rowsZero = 0;
1245 +        if (sparseness == 2)
1246 +            rowsConstant = 0;
1247 +    }
1248 +
1249 +    if (rowsZero) {
1250 +        idct_col2(block);
1251 +    } else if (rowsConstant) {
1252 +        uint64_t *lblock = (uint64_t *) block;
1253 +
1254 +        idct_col(block);        /* every row is constant, so column 0 alone determines the result */
1255 +        for (i = 0; i < 8; i += 2) {    /* two rows per iteration */
1256 +            uint64_t v = (uint16_t) block[i * 8];
1257 +            uint64_t w = (uint16_t) block[i * 8 + 8];
1258 +
1259 +            v |= v << 16;       /* replicate the first entry of each row across the row */
1260 +            w |= w << 16;
1261 +            v |= v << 32;
1262 +            w |= w << 32;
1263 +            lblock[0] = v;
1264 +            lblock[1] = v;
1265 +            lblock[2] = w;
1266 +            lblock[3] = w;
1267 +            lblock += 4;
1268 +        }
1269 +    } else {
1270 +        for (i = 0; i < 8; i++)
1271 +            idct_col(block + i);
1272 +    }
1273 +}
1274 +
1275 +void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block)
1276 +{   /* Runs simple_idct_axp() on block in place, then hands it to the put_pixels_clamped_axp_p hook with dest and line_size. */
1277 +    simple_idct_axp(block);
1278 +    put_pixels_clamped_axp_p(block, dest, line_size);
1279 +}
1280 +
1281 +void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block)
1282 +{   /* Runs simple_idct_axp() on block in place, then hands it to the add_pixels_clamped_axp_p hook with dest and line_size. */
1283 +    simple_idct_axp(block);
1284 +    add_pixels_clamped_axp_p(block, dest, line_size);
1285 +}
1286 --- avifile-0.7-0.7.38/configure.in.orig        2003-07-10 13:15:54.000000000 +0200
1287 +++ avifile-0.7-0.7.38/configure.in     2003-11-14 00:09:16.019699264 +0100
1288 @@ -57,6 +57,7 @@
1289  AC_PROG_MAKE_SET
1290  AC_PROG_RANLIB
1291  AC_PROG_AWK
1292 +AM_PROG_AS
1293
1294  AC_CC_VERSION
1295
1296 --- avifile-0.7-0.7.38/acinclude.m4.orig        2003-07-10 15:40:57.000000000 +0200
1297 +++ avifile-0.7-0.7.38/acinclude.m4     2003-11-14 00:17:33.678043696 +0100
1298 @@ -1,17 +1,3 @@
1299 -# as.m4
1300 -# Figure out how to run the assembler.
1301 -
1302 -# AM_PROG_AS
1303 -AC_DEFUN([AM_PROG_AS],
1304 -[# By default we simply use the C compiler to build assembly code.
1305 -AC_REQUIRE([AC_PROG_CC])
1306 -: ${AS='$(CC)'}
1307 -# Set ASFLAGS if not already set.
1308 -: ${ASFLAGS='$(CFLAGS)'}
1309 -AC_SUBST(AS)
1310 -AC_SUBST(ASFLAGS)
1311 -])
1312 -
1313  dnl AC_GCC_VERSION
1314  dnl check for compiler version
1315  dnl sets COMPILER_VERSION and GCC_VERSION