---- SDL_gfx-2.0.8/SDL_imageFilter.c.orig 2002-02-10 03:51:21.000000000 +0100
-+++ SDL_gfx-2.0.8/SDL_imageFilter.c 2003-08-05 21:56:40.000000000 +0200
-@@ -79,13 +79,13 @@
+--- SDL_gfx-2.0.13/SDL_imageFilter.c.orig 2004-11-29 20:53:35.000000000 +0100
++++ SDL_gfx-2.0.13/SDL_imageFilter.c 2005-01-16 00:19:22.272596920 +0100
+@@ -81,13 +81,13 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -156,7 +156,7 @@
+@@ -158,7 +158,7 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L21011: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
+- ".L21011: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2
// --- Byte shift via Word shift ---
- "psrlw $1, %%mm1 \n\t" // shift 4 WORDS of mm1 1 bit to the right
-@@ -169,7 +169,7 @@
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
+@@ -174,7 +174,7 @@
"add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
+ "add $8, %%edi \n\t"
+ "dec %%ecx \n\t" // decrease loop counter
- "jnz .L21011 \n\t" // check loop termination, proceed if required
+ "jnz 1b \n\t" // check loop termination, proceed if required
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -236,13 +236,13 @@
+@@ -241,13 +241,13 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -308,7 +308,7 @@
+@@ -313,7 +313,7 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2
"psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation)
"psubusb (%%eax), %%mm2 \n\t" // mm2=Src2-Src1 (sub 8 bytes with saturation)
-@@ -317,7 +317,7 @@
+@@ -322,7 +322,7 @@
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest
"add $8, %%ebx \n\t" // register pointers by 8
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -383,7 +383,7 @@
+@@ -388,7 +388,7 @@
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register
".align 16 \n\t" // 16 byte allignment of the loop entry
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
-@@ -407,7 +407,7 @@
+@@ -412,7 +412,7 @@
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest
"add $8, %%ebx \n\t" // register pointers by 8
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -476,13 +476,13 @@
+@@ -481,13 +481,13 @@
"mov %0, %%edi \n\t" // load Dest address into edi
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
".align 16 \n\t" // 16 byte allignment of the loop entry
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
"m"(Src1), // %2
-@@ -544,7 +544,7 @@
+@@ -549,7 +549,7 @@
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register
".align 16 \n\t" // 16 byte allignment of the loop entry
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
-@@ -561,7 +561,7 @@
+@@ -566,7 +566,7 @@
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest
"add $8, %%ebx \n\t" // register pointers by 8
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -629,7 +629,7 @@
+@@ -634,7 +634,7 @@
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register
".align 16 \n\t" // 16 byte allignment of the loop entry
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
-@@ -648,7 +648,7 @@
+@@ -653,7 +653,7 @@
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest
"add $8, %%ebx \n\t" // register pointers by 8
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -715,13 +715,13 @@
+@@ -720,13 +720,13 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -787,13 +787,13 @@
+@@ -792,13 +792,13 @@
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
-@@ -855,17 +855,17 @@
+@@ -860,17 +860,17 @@
"mov %0, %%edi \n\t" // load Dest address into edi
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
".align 16 \n\t" // 16 byte allignment of the loop entry
"popa \n\t":"=m" (Dest) // %0
:"m"(Src2), // %1
"m"(Src1), // %2
-@@ -902,12 +902,12 @@
+@@ -907,12 +907,12 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -975,13 +975,13 @@
+@@ -980,14 +980,14 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1021: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
+- ".L1021: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
"movq %%mm0, (%%edi) \n\t" // store result in Dest
"add $8, %%eax \n\t" // increase Dest register pointer by 8
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1059,7 +1059,7 @@
+@@ -1059,14 +1059,14 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1022: \n\t" "movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2
-+ "1: \n\t" "movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2
+- ".L11023: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+ "paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
+ "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
+ "add $8, %%eax \n\t" // increase Src1 register pointer by 8
+ "add $8, %%edi \n\t" // increase Dest register pointer by 8
+ "dec %%ecx \n\t" // decrease loop counter
+- "jnz .L11023 \n\t" // check loop termination, proceed if required
++ "jnz 1b \n\t" // check loop termination, proceed if required
+ "emms \n\t" // exit MMX state
+ "popa \n\t":"=m" (Dest) // %0
+ :"m"(Src1), // %1
+@@ -1154,7 +1154,7 @@
+ "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
+ "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
+ ".align 16 \n\t" // 16 byte allignment of the loop entry
+- ".L1022: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2
"psrlw $1, %%mm2 \n\t" // shift 4 WORDS of MM2 1 bit to the right
// "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2
- ".byte 0x0f, 0xdb, 0xd0 \n\t" "paddusb %%mm1, %%mm2 \n\t" // MM2=SrcDest+C (add 8 bytes with saturation)
-@@ -1067,7 +1067,7 @@
+@@ -1164,7 +1164,7 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1146,13 +1146,13 @@
+@@ -1243,13 +1243,13 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
- ".L1023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
+ "psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation)
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1221,25 +1221,25 @@
+@@ -1322,13 +1322,13 @@
+ "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
+ "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
+ ".align 16 \n\t" // 16 byte allignment of the loop entry
+- ".L11024: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+ "psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation)
+ "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
+ "add $8, %%eax \n\t" // increase Src1 register pointer by 8
+ "add $8, %%edi \n\t" // increase Dest register pointer by 8
+ "dec %%ecx \n\t" // decrease loop counter
+- "jnz .L11024 \n\t" // check loop termination, proceed if required
++ "jnz 1b \n\t" // check loop termination, proceed if required
+ "emms \n\t" // exit MMX state
+ "popa \n\t":"=m" (Dest) // %0
+ :"m"(Src1), // %1
+@@ -1405,19 +1405,19 @@
"mov %3, %%cl \n\t" // load loop counter (N) into CL
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1
+ "1: \n\t" // ** Prepare proper bit-Mask in MM1 **
"psrlw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the right
// "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1
- ".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" // decrease loop counter
+ ".byte 0x0f, 0xdb, 0xc8 \n\t"
+ "dec %%cl \n\t" // decrease loop counter
- "jnz .L10240 \n\t" // check loop termination, proceed if required
+ "jnz 1b \n\t" // check loop termination, proceed if required
// ** Shift all bytes of the image **
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10241: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "2: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+- ".L10241: \n\t"
++ "2: \n\t"
+ "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
"psrlw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the right
// "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0
- ".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
+@@ -1426,7 +1426,7 @@
+ "add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
- "jnz .L10241 \n\t" // check loop termination, proceed if required
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1318,8 +1318,8 @@
+@@ -1495,13 +1495,13 @@
+ "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
+ "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
+ ".align 16 \n\t" // 16 byte allignment of the loop entry
+- ".L13023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+ "psrld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
+ "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
+ "add $8, %%eax \n\t" // increase Src1 register pointer by 8
+ "add $8, %%edi \n\t" // increase Dest register pointer by 8
+ "dec %%ecx \n\t" // decrease loop counter
+- "jnz .L13023 \n\t" // check loop termination, proceed if required
++ "jnz 1b \n\t" // check loop termination, proceed if required
+ "emms \n\t" // exit MMX state
+ "popa \n\t":"=m" (Dest) // %0
+ :"m"(Src1), // %1
+@@ -1581,8 +1581,8 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
"cmp $128, %%al \n\t" // if (C <= 128) execute more efficient code
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1330,9 +1330,9 @@
+@@ -1593,9 +1593,9 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1352,8 +1352,8 @@
+@@ -1615,8 +1615,8 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
"m"(length), // %2
-@@ -1433,7 +1433,7 @@
+@@ -1696,7 +1696,7 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1446,7 +1446,7 @@
+@@ -1709,7 +1709,7 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1521,25 +1521,25 @@
+@@ -1784,25 +1784,25 @@
"mov %3, %%cl \n\t" // load loop counter (N) into CL
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1610,8 +1610,8 @@
+@@ -1870,13 +1870,13 @@
+ "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
+ "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
+ ".align 16 \n\t" // 16 byte allignment of the loop entry
+- ".L12023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+ "pslld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
+ "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
+ "add $8, %%eax \n\t" // increase Src1 register pointer by 8
+ "add $8, %%edi \n\t" // increase Dest register pointer by 8
+ "dec %%ecx \n\t" // decrease loop counter
+- "jnz .L12023 \n\t" // check loop termination, proceed if required
++ "jnz 1b \n\t" // check loop termination, proceed if required
+ "emms \n\t" // exit MMX state
+ "popa \n\t":"=m" (Dest) // %0
+ :"m"(Src1), // %1
+@@ -1949,8 +1949,8 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
"cmp $7, %%al \n\t" // if (N <= 7) execute more efficient code
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1622,9 +1622,9 @@
+@@ -1961,9 +1961,9 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1644,8 +1644,8 @@
+@@ -1983,8 +1983,8 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
"m"(length), // %2
-@@ -1724,14 +1724,14 @@
+@@ -2063,7 +2063,7 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte alignment of the loop entry
-- ".L1029: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
+- ".L1029: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
"paddusb %%mm2, %%mm0 \n\t" // MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation)
"pcmpeqb %%mm1, %%mm0 \n\t" // binarize 255:0, comparing to 255
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
+@@ -2071,7 +2071,7 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1814,7 +1814,7 @@
+@@ -2154,7 +2154,7 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1030: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
+- ".L1030: \n\t"
++ "1: \n\t"
+ "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+(0xFF-Tmax)
"psubusb %%mm7, %%mm0 \n\t" // MM0=MM0-(0xFF-Tmax+Tmin)
- "paddusb %%mm5, %%mm0 \n\t" // MM0=MM0+Tmin
-@@ -1822,7 +1822,7 @@
+@@ -2163,7 +2163,7 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -1890,11 +1890,11 @@
+@@ -2231,11 +2231,11 @@
"mov %4, %%bx \n\t" // load Cmax in BX
"sub %5, %%ax \n\t" // AX = Nmax - Nmin
"sub %3, %%bx \n\t" // BX = Cmax - Cmin
"mov %%ax, %%bx \n\t" // copy AX into BX
"shl $16, %%eax \n\t" // shift 2 bytes of EAX left
"mov %%bx, %%ax \n\t" // copy BX into AX
-@@ -1923,7 +1923,7 @@
+@@ -2264,7 +2264,7 @@
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1031: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "3: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
+- ".L1031: \n\t"
++ "3: \n\t"
+ "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
"punpcklbw %%mm7, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm7, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1947,7 +1947,7 @@
+@@ -2289,7 +2289,7 @@
"add $8, %%eax \n\t" // increase Src1 register pointer by 8
"add $8, %%edi \n\t" // increase Dest register pointer by 8
"dec %%ecx \n\t" // decrease loop counter
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
:"m"(Src1), // %1
-@@ -2041,10 +2041,10 @@
+@@ -2383,10 +2383,10 @@
"mov %2, %%edx \n\t" // initialize ROWS counter
"sub $2, %%edx \n\t" // do not use first and last row
// ---
// ---
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the image first row
"add %%eax, %%esi \n\t" // move one row below
-@@ -2085,11 +2085,11 @@
+@@ -2427,11 +2427,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2132,10 +2132,10 @@
+@@ -2474,10 +2474,10 @@
"mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $4, %%ebx \n\t" // do not use first 2 and last 2 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2235,11 +2235,11 @@
+@@ -2577,11 +2577,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2280,10 +2280,10 @@
+@@ -2622,10 +2622,10 @@
"add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $6, %%ebx \n\t" // do not use first 3 and last 3 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2411,11 +2411,11 @@
+@@ -2753,11 +2753,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2456,10 +2456,10 @@
+@@ -2798,10 +2798,10 @@
"add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $8, %%ebx \n\t" // do not use first 4 and last 4 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2678,11 +2678,11 @@
+@@ -3020,11 +3020,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2729,10 +2729,10 @@
+@@ -3071,10 +3071,10 @@
"mov %2, %%edx \n\t" // initialize ROWS counter
"sub $2, %%edx \n\t" // do not use first and last row
// ---
// ---
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the image first row
"add %%eax, %%esi \n\t" // move one row below
-@@ -2765,11 +2765,11 @@
+@@ -3107,11 +3107,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2812,10 +2812,10 @@
+@@ -3154,10 +3154,10 @@
"mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $4, %%ebx \n\t" // do not use first 2 and last 2 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2914,11 +2914,11 @@
+@@ -3256,11 +3256,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -2959,10 +2959,10 @@
+@@ -3301,10 +3301,10 @@
"add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $6, %%ebx \n\t" // do not use first 3 and last 3 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3093,11 +3093,11 @@
+@@ -3435,11 +3435,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -3138,10 +3138,10 @@
+@@ -3480,10 +3480,10 @@
"add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
"sub $8, %%ebx \n\t" // do not use first 4 and last 4 rows
// ---
"movd %%esi, %%mm6 \n\t" // save ESI in MM6
// --- 1
"movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3376,11 +3376,11 @@
+@@ -3718,11 +3718,11 @@
"inc %%edi \n\t" // move Dest pointer to the next pixel
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -3419,12 +3419,12 @@
+@@ -3761,12 +3761,12 @@
"mov %2, %%edx \n\t" // initialize ROWS counter
"sub $2, %%edx \n\t" // do not use first and last rows
// ---
// ---
"movq (%%esi), %%mm4 \n\t" // load 8 bytes from Src
"movq %%mm4, %%mm5 \n\t" // save MM4 in MM5
-@@ -3502,13 +3502,13 @@
+@@ -3844,13 +3844,13 @@
"add $8, %%edi \n\t" // move Dest pointer to the next 8 pixels
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS
// ---
"emms \n\t" // exit MMX state
"popa \n\t":"=m" (Dest) // %0
-@@ -3547,12 +3547,12 @@
+@@ -3889,12 +3889,12 @@
// initialize ROWS counter
"subl $2, %2 \n\t" // do not use first and last rows
// ---
// ---
"movq (%%esi), %%mm4 \n\t" // load 8 bytes from Src
"movq %%mm4, %%mm5 \n\t" // save MM4 in MM5
-@@ -3642,13 +3642,13 @@
+@@ -3984,13 +3984,13 @@
"add $8, %%edi \n\t" // move Dest pointer to the next 8 pixels
// ---
"dec %%ecx \n\t" // decrease loop counter COLUMNS