diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/asm.h avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/asm.h
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/asm.h	2002-10-16 09:26:12.000000000 +0200
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/asm.h	2003-09-28 17:26:39.000000000 +0200
@@ -42,14 +42,14 @@
 #define AMASK_CIX (1 << 2)
 #define AMASK_MVI (1 << 8)

-inline static uint64_t BYTE_VEC(uint64_t x)
+static inline uint64_t BYTE_VEC(uint64_t x)
 {
     x |= x << 8;
     x |= x << 16;
     x |= x << 32;
     return x;
 }
-inline static uint64_t WORD_VEC(uint64_t x)
+static inline uint64_t WORD_VEC(uint64_t x)
 {
     x |= x << 16;
     x |= x << 32;
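
BYTE_VEC() replicates one byte across all eight lanes of a quadword, WORD_VEC() does the same for 16-bit lanes; every per-lane mask used by the C code below (BYTE_VEC(0xfe), BYTE_VEC(0x03), WORD_VEC(0x7fff), ...) is built this way. A standalone illustration of the doubling shifts (a sketch, not part of the patch; the helper name is ours):

    #include <stdint.h>
    #include <stdio.h>

    /* Same doubling trick as BYTE_VEC above: each shift doubles the
       number of populated byte lanes (1 -> 2 -> 4 -> 8). */
    static uint64_t byte_vec(uint64_t x)
    {
        x |= x << 8;
        x |= x << 16;
        x |= x << 32;
        return x;
    }

    int main(void)
    {
        /* Prints fefefefefefefefe, the carry mask used by avg2() below. */
        printf("%016llx\n", (unsigned long long) byte_vec(0xfe));
        return 0;
    }
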
@@ -63,27 +63,15 @@
 #define sextw(x) ((int16_t) (x))

 #ifdef __GNUC__
-#define ASM_ACCEPT_MVI asm (".arch pca56")
 struct unaligned_long { uint64_t l; } __attribute__((packed));
 #define ldq_u(p) (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
 #define uldq(a) (((const struct unaligned_long *) (a))->l)

-#if GNUC_PREREQ(3,0)
-/* Unfortunately, __builtin_prefetch is slightly buggy on Alpha. The
-   defines here are kludged so we still get the right
-   instruction. This needs to be adapted as soon as gcc is fixed. */
-# define prefetch(p) __builtin_prefetch((p), 0, 1)
-# define prefetch_en(p) __builtin_prefetch((p), 1, 1)
-# define prefetch_m(p) __builtin_prefetch((p), 0, 0)
-# define prefetch_men(p) __builtin_prefetch((p), 1, 0)
-#else
-# define prefetch(p) asm volatile("ldl $31,%0" : : "m"(*(const char *) (p)) : "memory")
-# define prefetch_en(p) asm volatile("ldq $31,%0" : : "m"(*(const char *) (p)) : "memory")
-# define prefetch_m(p) asm volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
-# define prefetch_men(p) asm volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
-#endif
-
 #if GNUC_PREREQ(3,3)
+#define prefetch(p) __builtin_prefetch((p), 0, 1)
+#define prefetch_en(p) __builtin_prefetch((p), 0, 0)
+#define prefetch_m(p) __builtin_prefetch((p), 1, 1)
+#define prefetch_men(p) __builtin_prefetch((p), 1, 0)
 #define cmpbge __builtin_alpha_cmpbge
 /* Avoid warnings. */
 #define extql(a, b) __builtin_alpha_extql(a, (uint64_t) (b))
@@ -94,6 +82,24 @@
 #define amask __builtin_alpha_amask
 #define implver __builtin_alpha_implver
 #define rpcc __builtin_alpha_rpcc
+#else
+#define prefetch(p) asm volatile("ldl $31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_en(p) asm volatile("ldq $31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_m(p) asm volatile("lds $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define prefetch_men(p) asm volatile("ldt $f31,%0" : : "m"(*(const char *) (p)) : "memory")
+#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extql(a, b) ({ uint64_t __r; asm ("extql %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extwl(a, b) ({ uint64_t __r; asm ("extwl %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extqh(a, b) ({ uint64_t __r; asm ("extqh %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zap(a, b) ({ uint64_t __r; asm ("zap %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define amask(a) ({ uint64_t __r; asm ("amask %1,%0" : "=r" (__r) : "rI" (a)); __r; })
+#define implver() ({ uint64_t __r; asm ("implver %0" : "=r" (__r)); __r; })
+#define rpcc() ({ uint64_t __r; asm volatile ("rpcc %0" : "=r" (__r)); __r; })
+#endif
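
This reorganization makes GCC >= 3.3 use the portable builtin, whose signature is __builtin_prefetch(addr, rw, locality), with rw = 0 for read / 1 for write and locality ranging from 0 (no temporal reuse) to 3 (high reuse); older compilers get the same effect from loads into the always-zero registers $31/$f31, which the Alpha architecture treats as prefetch hints. The builtin's calling convention in isolation (a sketch; loop and names are ours, not from the patch):

    /* Hint the cache a few iterations ahead of a streaming read; this
       compiles to the ldl-style prefetch forms on Alpha and to nothing
       on targets without a prefetch instruction. */
    static long sum_with_prefetch(const int *a, int n)
    {
        long s = 0;
        for (int i = 0; i < n; i++) {
            if (i + 16 < n)
                __builtin_prefetch(&a[i + 16], 0, 1); /* read, some reuse */
            s += a[i];
        }
        return s;
    }
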
+#define wh64(p) asm volatile("wh64 (%0)" : : "r"(p) : "memory")
+
+#if GNUC_PREREQ(3,3) && defined(__alpha_max__)
 #define minub8 __builtin_alpha_minub8
 #define minsb8 __builtin_alpha_minsb8
 #define minuw4 __builtin_alpha_minuw4
@@ -108,34 +114,24 @@
 #define unpkbl __builtin_alpha_unpkbl
 #define unpkbw __builtin_alpha_unpkbw
 #else
-#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define extql(a, b) ({ uint64_t __r; asm ("extql %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define extwl(a, b) ({ uint64_t __r; asm ("extwl %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define extqh(a, b) ({ uint64_t __r; asm ("extqh %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define zap(a, b) ({ uint64_t __r; asm ("zap %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
-#define amask(a) ({ uint64_t __r; asm ("amask %1,%0" : "=r" (__r) : "rI" (a)); __r; })
-#define implver() ({ uint64_t __r; asm ("implver %0" : "=r" (__r)); __r; })
-#define rpcc() ({ uint64_t __r; asm volatile ("rpcc %0" : "=r" (__r)); __r; })
-#define minub8(a, b) ({ uint64_t __r; asm ("minub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define minsb8(a, b) ({ uint64_t __r; asm ("minsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define minuw4(a, b) ({ uint64_t __r; asm ("minuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define minsw4(a, b) ({ uint64_t __r; asm ("minsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define maxub8(a, b) ({ uint64_t __r; asm ("maxub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define maxsb8(a, b) ({ uint64_t __r; asm ("maxsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define maxuw4(a, b) ({ uint64_t __r; asm ("maxuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define maxsw4(a, b) ({ uint64_t __r; asm ("maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
-#define perr(a, b) ({ uint64_t __r; asm ("perr %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
-#define pklb(a) ({ uint64_t __r; asm ("pklb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
-#define pkwb(a) ({ uint64_t __r; asm ("pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
-#define unpkbl(a) ({ uint64_t __r; asm ("unpkbl %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
-#define unpkbw(a) ({ uint64_t __r; asm ("unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define minub8(a, b) ({ uint64_t __r; asm (".arch ev6; minub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; asm (".arch ev6; minsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; asm (".arch ev6; minuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; asm (".arch ev6; minsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; asm (".arch ev6; maxub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; asm (".arch ev6; maxsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b) ({ uint64_t __r; asm (".arch ev6; perr %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a) ({ uint64_t __r; asm (".arch ev6; pklb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define pkwb(a) ({ uint64_t __r; asm (".arch ev6; pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbl(a) ({ uint64_t __r; asm (".arch ev6; unpkbl %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbw(a) ({ uint64_t __r; asm (".arch ev6; unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
 #endif

 #elif defined(__DECC) /* Digital/Compaq/hp "ccc" compiler */

 #include <c_asm.h>
-#define ASM_ACCEPT_MVI
 #define ldq_u(a) asm ("ldq_u %v0,0(%a0)", a)
 #define uldq(a) (*(const __unaligned uint64_t *) (a))
 #define cmpbge(a, b) asm ("cmpbge %a0,%a1,%v0", a, b)
@@ -160,6 +156,7 @@
 #define pkwb(a) asm ("pkwb %a0,%v0", a)
 #define unpkbl(a) asm ("unpkbl %a0,%v0", a)
 #define unpkbw(a) asm ("unpkbw %a0,%v0", a)
+#define wh64(a) asm ("wh64 %a0", a)

 #else
 #error "Unknown compiler!"
diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/dsputil_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/dsputil_alpha.c
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/dsputil_alpha.c	1970-01-01 01:00:00.000000000 +0100
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/dsputil_alpha.c	2003-09-28 17:26:39.000000000 +0200
@@ -0,0 +1,364 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+
+extern void simple_idct_axp(DCTELEM *block);
+extern void simple_idct_put_axp(uint8_t *dest, int line_size, DCTELEM *block);
+extern void simple_idct_add_axp(uint8_t *dest, int line_size, DCTELEM *block);
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        int line_size, int h);
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void (*put_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
+                                 int line_size);
+void (*add_pixels_clamped_axp_p)(const DCTELEM *block, uint8_t *pixels,
+                                 int line_size);
+
+void get_pixels_mvi(DCTELEM *restrict block,
+                    const uint8_t *restrict pixels, int line_size);
+void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
+                     int stride);
+int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_abs16x16_mvi_asm(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
+int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
+{
+    int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    do {
+        uint64_t shorts0, shorts1;
+
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
+
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--i);
+}
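
put_pixels_clamped_mvi clamps four signed 16-bit coefficients per register into the unsigned byte range with just two MAX instructions, then pkwb packs the surviving low bytes. The per-lane effect, as a scalar sketch (helper name ours):

    #include <stdint.h>

    /* Per-lane effect of maxsw4(x, 0) followed by minsw4(x, 0x00ff):
       saturate a signed 16-bit value into 0..255 before pkwb packs it. */
    static uint8_t clamp_lane(int16_t v)
    {
        if (v < 0)   return 0;   /* maxsw4 against 0      */
        if (v > 255) return 255; /* minsw4 against 0x00ff */
        return (uint8_t) v;
    }
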
+
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
+{
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1;  /* 0x8000800080008000 */
+
+    do {
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
+#endif
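
The "signed subword add" above deserves a note: carries must not cross the 16-bit lane boundaries inside one 64-bit register, so the code saves each lane's sign bit, clears it, adds (the pixel lanes come from unpkbw and are at most 0xff, so no lane can overflow anymore), and xors the sign bits back in, which is per-lane addition mod 2^16, i.e. MMX's paddw. A two-lane scalar model (a sketch; name ours):

    #include <stdint.h>

    /* Two 16-bit lanes packed in 32 bits; b's lanes hold byte values
       (0..255, as unpkbw produces), so after clearing bit 15 of a the
       add cannot carry across the lane boundary, and the xor re-applies
       the saved sign bits (adding 0x8000 per lane mod 2^16). */
    static uint32_t paddw2(uint32_t a, uint32_t b)
    {
        const uint32_t signmask = 0x80008000u;
        uint32_t signs = a & signmask;
        a &= ~signmask;
        a += b;
        return a ^ signs;
    }
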
+
+static void clear_blocks_axp(DCTELEM *blocks) {
+    uint64_t *p = (uint64_t *) blocks;
+    int n = sizeof(DCTELEM) * 6 * 64;
+
+    do {
+        p[0] = 0;
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;
+    } while (n);
+}
+
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
+{
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
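avg2 and avg2_no_rnd average eight byte pairs at once without widening, using the identity a + b = (a ^ b) + 2*(a & b): the AND form rounds down, the OR form rounds up, and the BYTE_VEC(0xfe) mask drops each byte's lowest bit before the shift so nothing leaks into the lane below. A single-byte check of both identities (a sketch; name ours):

    #include <assert.h>
    #include <stdint.h>

    /* Since a + b == (a ^ b) + 2 * (a & b):
       (a & b) + ((a ^ b) >> 1) == floor((a + b) / 2)  -- avg2_no_rnd
       (a | b) - ((a ^ b) >> 1) == ceil((a + b) / 2)   -- avg2       */
    static void check_avg2(uint8_t a, uint8_t b)
    {
        unsigned half = ((a ^ b) & 0xfe) >> 1;
        assert((a & b) + half == ((unsigned) a + b) / 2);
        assert((a | b) - half == ((unsigned) a + b + 1) / 2);
    }
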
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+#endif
+
+#define OP(LOAD, STORE)                 \
+    do {                                \
+        STORE(LOAD(pixels), block);     \
+        pixels += line_size;            \
+        block += line_size;             \
+    } while (--h)
+
+#define OP_X2(LOAD, STORE)                                      \
+    do {                                                        \
+        uint64_t pix1, pix2;                                    \
+                                                                \
+        pix1 = LOAD(pixels);                                    \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
+        STORE(AVG2(pix1, pix2), block);                         \
+        pixels += line_size;                                    \
+        block += line_size;                                     \
+    } while (--h)
+
+#define OP_Y2(LOAD, STORE)                      \
+    do {                                        \
+        uint64_t pix = LOAD(pixels);            \
+        do {                                    \
+            uint64_t next_pix;                  \
+                                                \
+            pixels += line_size;                \
+            next_pix = LOAD(pixels);            \
+            STORE(AVG2(pix, next_pix), block);  \
+            block += line_size;                 \
+            pix = next_pix;                     \
+        } while (--h);                          \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE)                                                 \
+    do {                                                                    \
+        uint64_t pix1 = LOAD(pixels);                                       \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
+                       + (pix2 & BYTE_VEC(0x03));                           \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
+                                                                            \
+        do {                                                                \
+            uint64_t npix1, npix2;                                          \
+            uint64_t npix_l, npix_h;                                        \
+            uint64_t avg;                                                   \
+                                                                            \
+            pixels += line_size;                                            \
+            npix1 = LOAD(pixels);                                           \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
+            npix_l = (npix1 & BYTE_VEC(0x03))                               \
+                   + (npix2 & BYTE_VEC(0x03));                              \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h;                                           \
+            STORE(avg, block);                                              \
+                                                                            \
+            block += line_size;                                             \
+            pix_l = npix_l;                                                 \
+            pix_h = npix_h;                                                 \
+        } while (--h);                                                      \
+    } while (0)
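
OP_XY2 computes, per output byte, the average of four neighbouring input pixels. To keep eight such sums inside one 64-bit register, it splits every byte into its low two bits (summed exactly in pix_l/npix_l) and the remaining bits pre-shifted down by two (pix_h/npix_h), so no byte lane can overflow; AVG4_ROUNDER (2 for the rounding variants, 1 for no_rnd) is folded in before the final shift. One lane of that computation, scalar (a sketch; name ours):

    #include <stdint.h>

    /* One byte lane of OP_XY2: (a + b + c + d + r) >> 2, computed as an
       exact low-2-bit sum plus a pre-shifted high part -- the same split
       that keeps the packed version from overflowing its lanes. */
    static uint8_t avg4_lane(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                             unsigned r /* 2 = rounding, 1 = no_rnd */)
    {
        unsigned lo = (a & 3) + (b & 3) + (c & 3) + (d & 3);
        unsigned hi = (a >> 2) + (b >> 2) + (c >> 2) + (d >> 2);
        return (uint8_t) (((lo + r) >> 2) + hi);
    }
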
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         int line_size, int h)                                              \
+{                                                                           \
+    if ((size_t) pixels & 0x7) {                                            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+                                                                            \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         int line_size, int h)                                              \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h);         \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
+}
+
+#define PIXOP(OPNAME, STORE)                    \
+    MAKE_OP(OPNAME, ,     OP,     STORE)        \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
+#define AVG2 avg2
+#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
+#undef AVG2
+#undef AVG4
+#undef AVG4_ROUNDER
+#undef STORE
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
+
+void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h)
+{
+    put_pixels_axp_asm(block, pixels, line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
+static int sad16x16_mvi(void *s, uint8_t *a, uint8_t *b, int stride)
+{
+    return pix_abs16x16_mvi_asm(a, b, stride);
+}
+
+static int sad8x8_mvi(void *s, uint8_t *a, uint8_t *b, int stride)
+{
+    return pix_abs8x8_mvi(a, b, stride);
+}
+
+void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx)
+{
+    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+
+    c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_axp;
+    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_axp;
+    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_axp;
+    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_axp;
+
+    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
+    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
+    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    c->avg_pixels_tab[1][0] = avg_pixels_axp;
+    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+
+    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels_axp;
+    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels_x2_axp;
+    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels_y2_axp;
+    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels_xy2_axp;
+
+    c->clear_blocks = clear_blocks_axp;
+
+    /* amask clears all bits that correspond to present features. */
+    if (amask(AMASK_MVI) == 0) {
+        c->put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        c->add_pixels_clamped = add_pixels_clamped_mvi_asm;
+
+        c->get_pixels = get_pixels_mvi;
+        c->diff_pixels = diff_pixels_mvi;
+        c->sad[0] = sad16x16_mvi;
+        c->sad[1] = sad8x8_mvi;
+        c->pix_abs8x8 = pix_abs8x8_mvi;
+        c->pix_abs16x16 = pix_abs16x16_mvi_asm;
+        c->pix_abs16x16_x2 = pix_abs16x16_x2_mvi;
+        c->pix_abs16x16_y2 = pix_abs16x16_y2_mvi;
+        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mvi;
+    }
+
+    put_pixels_clamped_axp_p = c->put_pixels_clamped;
+    add_pixels_clamped_axp_p = c->add_pixels_clamped;
+
+    c->idct_put = simple_idct_put_axp;
+    c->idct_add = simple_idct_add_axp;
+    c->idct = simple_idct_axp;
+}
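
The MVI dispatch above relies on Alpha's amask instruction: it returns its operand with the bits for all implemented architecture extensions cleared, so a zero result means every queried feature is present. The probe in isolation, reusing AMASK_MVI and the amask macro from asm.h (a sketch; helper name ours):

    #include "asm.h"

    /* Nonzero when the CPU implements the motion-video instructions
       (perr, pkwb, unpkbw, ...); amask clears the bits of features
       that exist, so 0 means "MVI present". */
    static int have_mvi(void)
    {
        return amask(AMASK_MVI) == 0;
    }
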
diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/Makefile.am avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/Makefile.am
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/Makefile.am	2003-05-25 23:07:42.000000000 +0200
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/Makefile.am	2003-11-13 23:51:25.426454176 +0100
@@ -7,10 +7,12 @@
     dsputil_alpha.c \
     motion_est_alpha.c \
     mpegvideo_alpha.c \
-    simple_idct_alpha.c
+    simple_idct_alpha.c \
+    dsputil_alpha_asm.S \
+    motion_est_mvi_asm.S
 endif

-noinst_HEADERS = asm.h dsputil_alpha_asm.S regdef.h motion_est_mvi_asm.S
+noinst_HEADERS = asm.h regdef.h

 libavcodecalpha_la_SOURCES = $(ALPHA_SRC)

diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/motion_est_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/motion_est_alpha.c
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/motion_est_alpha.c	1970-01-01 01:00:00.000000000 +0100
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/motion_est_alpha.c	2003-09-28 17:26:39.000000000 +0200
@@ -0,0 +1,347 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+
+void get_pixels_mvi(DCTELEM *restrict block,
+                    const uint8_t *restrict pixels, int line_size)
+{
+    int h = 8;
+
+    do {
+        uint64_t p;
+
+        p = ldq(pixels);
+        stq(unpkbw(p), block);
+        stq(unpkbw(p >> 32), block + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
+}
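
get_pixels_mvi leans on unpkbw, which zero-extends the four low bytes of its operand into four 16-bit lanes; one 8-byte ldq plus two unpkbw calls therefore fill a row of eight DCTELEMs. A portable model of the instruction (a sketch; name ours):

    #include <stdint.h>

    /* Scalar model of Alpha MAX's unpkbw: widen the low 4 bytes of x
       into 4 zero-extended little-endian 16-bit lanes, e.g.
       0x0000000044332211 -> 0x0044003300220011. */
    static uint64_t unpkbw_model(uint64_t x)
    {
        uint64_t r = 0;
        for (int i = 0; i < 4; i++)
            r |= ((x >> (8 * i)) & 0xff) << (16 * i);
        return r;
    }
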
+
+void diff_pixels_mvi(DCTELEM *block, const uint8_t *s1, const uint8_t *s2,
+                     int stride) {
+    int h = 8;
+    uint64_t mask = 0x4040;
+
+    mask |= mask << 16;
+    mask |= mask << 32;
+    do {
+        uint64_t x, y, c, d, a;
+        uint64_t signs;
+
+        x = ldq(s1);
+        y = ldq(s2);
+        c = cmpbge(x, y);
+        d = x - y;
+        a = zap(mask, c);       /* We use 0x4040404040404040 here... */
+        d += 4 * a;             /* ...so we can use s4addq here. */
+        signs = zap(-1, c);
+
+        stq(unpkbw(d) | (unpkbw(signs) << 8), block);
+        stq(unpkbw(d >> 32) | (unpkbw(signs >> 32) << 8), block + 4);
+
+        s1 += stride;
+        s2 += stride;
+        block += 8;
+    } while (--h);
+}
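
The 0x40 trick is worth spelling out: a plain 64-bit x - y lets every byte where x < y borrow from the byte above it. cmpbge(x, y) records exactly those byte positions, zap(mask, c) leaves 0x40 in each of them, and d += 4 * a (a single s4addq) re-injects 0x100 at each borrowing byte, cancelling the borrow and leaving the per-byte difference mod 256; the signs value then supplies the sign-extension bytes that unpkbw interleaves into 16-bit results. A standalone check of the identity (a sketch; name ours):

    #include <assert.h>
    #include <stdint.h>

    /* Per-byte (x_i - y_i) mod 256 equals the full 64-bit subtract plus
       0x100 re-injected at every byte position that borrowed; the top
       byte's correction simply falls off the 64-bit end. */
    static void check_bytewise_sub(uint64_t x, uint64_t y)
    {
        uint64_t d = x - y;
        for (int i = 0; i < 8; i++)
            if (((x >> (8 * i)) & 0xff) < ((y >> (8 * i)) & 0xff))
                d += (uint64_t) 0x100 << (8 * i);
        for (int i = 0; i < 8; i++) {
            uint64_t xi = (x >> (8 * i)) & 0xff;
            uint64_t yi = (y >> (8 * i)) & 0xff;
            assert(((d >> (8 * i)) & 0xff) == ((xi - yi) & 0xff));
        }
    }
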
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+
+int pix_abs8x8_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 8;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 8 pixel a time */
+            uint64_t p1, p2;
+
+            p1 = ldq(pix1);
+            p2 = uldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1, p2;
+
+            p1 = ldq(pix1);
+            p2 = ldq(pix2);
+            result += perr(p1, p2);
+
+            pix1 += line_size;
+            pix2 += line_size;
+        } while (--h);
+    }
+
+    return result;
+}
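
Each iteration above accumulates one row's sum of absolute differences in a single instruction: perr sums |p1_i - p2_i| over the eight byte lanes. A portable reference for the instruction (a sketch; name ours):

    #include <stdint.h>

    /* Scalar reference for Alpha MAX's perr: the sum of absolute
       differences of the eight unsigned byte lanes of a and b. */
    static unsigned perr_model(uint64_t a, uint64_t b)
    {
        unsigned sum = 0;
        for (int i = 0; i < 8; i++) {
            int d = (int) ((a >> (8 * i)) & 0xff)
                  - (int) ((b >> (8 * i)) & 0xff);
            sum += d < 0 ? -d : d;
        }
        return sum;
    }
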
+
+#if 0 /* now done in assembly */
+int pix_abs16x16_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    if ((size_t) pix2 & 0x7) {
+        /* works only when pix2 is actually unaligned */
+        do {                    /* do 16 pixel a time */
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t t;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            t = ldq_u(pix2 + 8);
+            p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    } else {
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            p2_l = ldq(pix2);
+            p2_r = ldq(pix2 + 8);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+    }
+
+    return result;
+}
+#endif
+
+int pix_abs16x16_x2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+    uint64_t disalign = (size_t) pix2 & 0x7;
+
+    switch (disalign) {
+    case 0:
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l = ldq(pix2);
+            r = ldq(pix2 + 8);
+            p2_l = avg2(l, (l >> 8) | ((uint64_t) r << 56));
+            p2_r = avg2(r, (r >> 8) | ((uint64_t) pix2[16] << 56));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    case 7:
+        /* |.......l|lllllllr|rrrrrrr*|
+           This case is special because disalign1 would be 8, which
+           gets treated as 0 by extqh. At least it is a bit faster
+           that way :) */
+        do {
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l = ldq_u(pix2);
+            m = ldq_u(pix2 + 8);
+            r = ldq_u(pix2 + 16);
+            p2_l = avg2(extql(l, disalign) | extqh(m, disalign), m);
+            p2_r = avg2(extql(m, disalign) | extqh(r, disalign), r);
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    default:
+        do {
+            uint64_t disalign1 = disalign + 1;
+            uint64_t p1_l, p1_r, p2_l, p2_r;
+            uint64_t l, m, r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            l = ldq_u(pix2);
+            m = ldq_u(pix2 + 8);
+            r = ldq_u(pix2 + 16);
+            p2_l = avg2(extql(l, disalign) | extqh(m, disalign),
+                        extql(l, disalign1) | extqh(m, disalign1));
+            p2_r = avg2(extql(m, disalign) | extqh(r, disalign),
+                        extql(m, disalign1) | extqh(r, disalign1));
+            pix1 += line_size;
+            pix2 += line_size;
+
+            result += perr(p1_l, p2_l)
+                    + perr(p1_r, p2_r);
+        } while (--h);
+        break;
+    }
+    return result;
+}
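
All three switch cases build misaligned 16-byte loads from aligned ldq_u quadwords: extql shifts the low quadword right by the byte offset, extqh shifts the next quadword left by the complement, and their OR reconstructs the unaligned value; case 7 is split out because disalign1 would be 8, which extqh treats as a shift of 0. A scalar model of the merge (a sketch; name ours):

    #include <stdint.h>

    /* Reconstruct the 8 bytes starting at byte offset a (0..7) from two
       adjacent aligned little-endian quadwords lo and hi.  A shift of
       8 bytes degenerates to 0, which is why the code above handles
       disalign == 7 separately. */
    static uint64_t unaligned_merge(uint64_t lo, uint64_t hi, unsigned a)
    {
        uint64_t l = lo >> (8 * a);               /* extql(lo, a) */
        uint64_t h = a ? hi << (8 * (8 - a)) : 0; /* extqh(hi, a) */
        return l | h;
    }
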
+
+int pix_abs16x16_y2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    if ((size_t) pix2 & 0x7) {
+        uint64_t t, p2_l, p2_r;
+        t = ldq_u(pix2 + 8);
+        p2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+        p2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+            uint64_t t;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            pix2 += line_size;
+            t = ldq_u(pix2 + 8);
+            np2_l = extql(ldq_u(pix2), pix2) | extqh(t, pix2);
+            np2_r = extql(t, pix2) | extqh(ldq_u(pix2 + 16), pix2);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l = np2_l;
+            p2_r = np2_r;
+
+        } while (--h);
+    } else {
+        uint64_t p2_l, p2_r;
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        do {
+            uint64_t p1_l, p1_r, np2_l, np2_r;
+
+            p1_l = ldq(pix1);
+            p1_r = ldq(pix1 + 8);
+            pix2 += line_size;
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+
+            result += perr(p1_l, avg2(p2_l, np2_l))
+                    + perr(p1_r, avg2(p2_r, np2_r));
+
+            pix1 += line_size;
+            p2_l = np2_l;
+            p2_r = np2_r;
+        } while (--h);
+    }
+    return result;
+}
+
+int pix_abs16x16_xy2_mvi(uint8_t *pix1, uint8_t *pix2, int line_size)
+{
+    int result = 0;
+    int h = 16;
+
+    uint64_t p1_l, p1_r;
+    uint64_t p2_l, p2_r, p2_x;
+
+    p1_l = ldq(pix1);
+    p1_r = ldq(pix1 + 8);
+
+    if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+        p2_l = uldq(pix2);
+        p2_r = uldq(pix2 + 8);
+        p2_x = (uint64_t) pix2[16] << 56;
+    } else {
+        p2_l = ldq(pix2);
+        p2_r = ldq(pix2 + 8);
+        p2_x = ldq(pix2 + 16) << 56;
+    }
+
+    do {
+        uint64_t np1_l, np1_r;
+        uint64_t np2_l, np2_r, np2_x;
+
+        pix1 += line_size;
+        pix2 += line_size;
+
+        np1_l = ldq(pix1);
+        np1_r = ldq(pix1 + 8);
+
+        if ((size_t) pix2 & 0x7) { /* could be optimized a lot */
+            np2_l = uldq(pix2);
+            np2_r = uldq(pix2 + 8);
+            np2_x = (uint64_t) pix2[16] << 56;
+        } else {
+            np2_l = ldq(pix2);
+            np2_r = ldq(pix2 + 8);
+            np2_x = ldq(pix2 + 16) << 56;
+        }
+
+        result += perr(p1_l,
+                       avg4( p2_l, ( p2_l >> 8) | ((uint64_t)  p2_r << 56),
+                            np2_l, (np2_l >> 8) | ((uint64_t) np2_r << 56)))
+                + perr(p1_r,
+                       avg4( p2_r, ( p2_r >> 8) | ((uint64_t)  p2_x),
+                            np2_r, (np2_r >> 8) | ((uint64_t) np2_x)));
+
+        p1_l = np1_l;
+        p1_r = np1_r;
+        p2_l = np2_l;
+        p2_r = np2_r;
+        p2_x = np2_x;
+    } while (--h);
+
+    return result;
+}
diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/mpegvideo_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/mpegvideo_alpha.c
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/mpegvideo_alpha.c	1970-01-01 01:00:00.000000000 +0100
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/mpegvideo_alpha.c	2003-09-28 17:26:39.000000000 +0200
@@ -0,0 +1,96 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "asm.h"
+#include "../dsputil.h"
+#include "../mpegvideo.h"
+
+static void dct_unquantize_h263_axp(MpegEncContext *s, DCTELEM *block,
+                                    int n, int qscale)
+{
+    int i, n_coeffs;
+    uint64_t qmul, qadd;
+    uint64_t correction;
+    DCTELEM *orig_block = block;
+    DCTELEM block0;
+
+    qadd = WORD_VEC((qscale - 1) | 1);
+    qmul = qscale << 1;
+    /* This mask kills spill from negative subwords to the next subword. */
+    correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */
+
+    if (s->mb_intra) {
+        if (!s->h263_aic) {
+            if (n < 4)
+                block0 = block[0] * s->y_dc_scale;
+            else
+                block0 = block[0] * s->c_dc_scale;
+        } else {
+            qadd = 0;
+        }
+        n_coeffs = 63; // does not always use zigzag table
+    } else {
+        n_coeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
+    }
+
+    for(i = 0; i <= n_coeffs; block += 4, i += 4) {
+        uint64_t levels, negmask, zeros, add;
+
+        levels = ldq(block);
+        if (levels == 0)
+            continue;
+
+#ifdef __alpha_max__
+        /* I don't think the speed difference justifies runtime
+           detection. */
+        negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+        negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+        negmask = cmpbge(WORD_VEC(0x7fff), levels);
+        negmask &= (negmask >> 1) | (1 << 7);
+        negmask = zap(-1, negmask);
+#endif
+
+        zeros = cmpbge(0, levels);
+        zeros &= zeros >> 1;
+        /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+           zapping the lower byte suffices. */
+
+        levels *= qmul;
+        levels -= correction & (negmask << 16);
+
+        /* Negate qadd for negative levels. */
+        add = qadd ^ negmask;
+        add += WORD_VEC(0x0001) & negmask;
+        /* Set qadd to 0 for levels == 0. */
+        add = zap(add, zeros);
+
+        levels += add;
+
+        stq(levels, block);
+    }
+
+    if (s->mb_intra && !s->h263_aic)
+        orig_block[0] = block0;
+}
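
For compilers without the MAX builtins, the #else branch above derives the per-word sign mask from byte compares: in cmpbge(WORD_VEC(0x7fff), levels) the low-byte positions always compare true (0xff >= anything) while a high-byte position compares false exactly when that 16-bit lane is negative (its high byte is >= 0x80 > 0x7f); negmask &= (negmask >> 1) | (1 << 7) then copies each lane's verdict onto both of its byte positions, and zap(-1, negmask) turns that into 0xffff per negative lane. A one-lane scalar model (a sketch; name ours):

    #include <stdint.h>

    /* One-lane model of the cmpbge sign-mask trick: a 16-bit value is
       negative exactly when its high byte is >= 0x80, i.e. when the
       unsigned byte compare 0x7f >= hi fails; zap keeps the bytes
       whose cmpbge bit is clear. */
    static uint16_t negmask_lane(int16_t level)
    {
        uint8_t hi = (uint16_t) level >> 8;
        return (0x7f >= hi) ? 0x0000 : 0xffff;
    }
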
+
+void MPV_common_init_axp(MpegEncContext *s)
+{
+    s->dct_unquantize_h263 = dct_unquantize_h263_axp;
+}
diff -Nur avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/simple_idct_alpha.c avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/simple_idct_alpha.c
--- avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha.orig/simple_idct_alpha.c	1970-01-01 01:00:00.000000000 +0100
+++ avifile-0.7-0.7.38/ffmpeg/libavcodec/alpha/simple_idct_alpha.c	2003-09-28 17:26:39.000000000 +0200
@@ -0,0 +1,311 @@
+/*
+ * Simple IDCT (Alpha optimized)
+ *
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * based upon some outcommented c code from mpeg2dec (idct_mmx.c
+ * written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>)