--- ffmpeg/libavcodec/i386/dsputil_mmx.c.orig 2006-01-12 22:43:17.000000000 +0000
+++ ffmpeg/libavcodec/i386/dsputil_mmx.c 2006-01-30 01:42:21.087254880 +0000
@@ -20,6 +20,7 @@
  * MMX optimization by Nick Kurshev
  */
 
+#include <mmintrin.h>
 #include "../dsputil.h"
 #include "../simple_idct.h"
 #include "../mpegvideo.h"
@@ -617,6 +618,22 @@
     );
 }
 
+#if (__GNUC__ >= 4)
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, ptrdiff_t dst_stride, ptrdiff_t src_stride) {
+    __m64 row0 = _mm_cvtsi32_si64(*(unsigned*)(src + (0 * src_stride)));
+    __m64 row1 = _mm_cvtsi32_si64(*(unsigned*)(src + (1 * src_stride)));
+    __m64 row2 = _mm_cvtsi32_si64(*(unsigned*)(src + (2 * src_stride)));
+    __m64 row3 = _mm_cvtsi32_si64(*(unsigned*)(src + (3 * src_stride)));
+    __m64 tmp0 = _mm_unpacklo_pi8(row0, row1);
+    __m64 tmp1 = _mm_unpacklo_pi8(row2, row3);
+    __m64 row01 = _mm_unpacklo_pi16(tmp0, tmp1);
+    __m64 row23 = _mm_unpackhi_pi16(tmp0, tmp1);
+    *((unsigned*)(dst + (0 * dst_stride))) = _mm_cvtsi64_si32(row01);
+    *((unsigned*)(dst + (1 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row01, row01));
+    *((unsigned*)(dst + (2 * dst_stride))) = _mm_cvtsi64_si32(row23);
+    *((unsigned*)(dst + (3 * dst_stride))) = _mm_cvtsi64_si32(_mm_unpackhi_pi32(row23, row23));
+}
+#else
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
         "movd %4, %%mm0 \n\t"
@@ -645,6 +662,7 @@
            "m" (*(uint32_t*)(src + 3*src_stride))
     );
 }
+#endif
 
 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
     const int strength= ff_h263_loop_filter_strength[qscale];