---- SDL_gfx-2.0.13/SDL_imageFilter.c.orig 2004-11-29 20:53:35.000000000 +0100
-+++ SDL_gfx-2.0.13/SDL_imageFilter.c 2005-01-16 00:19:22.272596920 +0100
+--- SDL_gfx-2.0.18/SDL_imageFilter.c~ 2008-12-22 09:18:04.000000000 +0100
++++ SDL_gfx-2.0.18/SDL_imageFilter.c 2008-12-22 09:55:22.000000000 +0100
@@ -81,13 +81,13 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1010: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "paddusb (%%ebx), %%mm1 \n\t" // mm1=Src1+Src2 (add 8 bytes with saturation)
- "movq %%mm1, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1010 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1010: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "paddusb (%%ebx), %%mm1 \n\t" /* mm1=Src1+Src2 (add 8 bytes with saturation) */
+ "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1010 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
@@ -158,7 +158,7 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L21011: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2
- // --- Byte shift via Word shift ---
+ "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */
+ /* --- Byte shift via Word shift --- */
@@ -174,7 +174,7 @@
- "add $8, %%ebx \n\t" // register pointers by 8
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
"add $8, %%edi \n\t"
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L21011 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L21011 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
@@ -241,13 +241,13 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1012: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation)
- "movq %%mm1, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1012 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1012: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+ "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1012 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
@@ -313,7 +313,7 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1013: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2
- "psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation)
- "psubusb (%%eax), %%mm2 \n\t" // mm2=Src2-Src1 (sub 8 bytes with saturation)
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1013: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */
+ "psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */
+ "psubusb (%%eax), %%mm2 \n\t" /* mm2=Src2-Src1 (sub 8 bytes with saturation) */
@@ -322,7 +322,7 @@
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1013 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1013 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
@@ -388,7 +388,7 @@
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- "pxor %%mm0, %%mm0 \n\t" // zero mm0 register
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1014: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
- "movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
- "movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1014: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
+ "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
+ "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
@@ -412,7 +412,7 @@
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1014 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1014 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
@@ -481,13 +481,13 @@
- "mov %0, %%edi \n\t" // load Dest address into edi
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10141: \n\t" "mov (%%edx), %%al \n\t" // load a byte from Src1
-+ "1: \n\t" "mov (%%edx), %%al \n\t" // load a byte from Src1
- "mulb (%%esi) \n\t" // mul with a byte from Src2
-- ".L10142: \n\t" "mov %%al, (%%edi) \n\t" // move a byte result to Dest
-+ "mov %%al, (%%edi) \n\t" // move a byte result to Dest
- "inc %%edx \n\t" // increment Src1, Src2, Dest
- "inc %%esi \n\t" // pointer registers by one
- "inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10141 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
- "m"(Src1), // %2
-@@ -549,7 +549,7 @@
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- "pxor %%mm0, %%mm0 \n\t" // zero mm0 register
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1015: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
- "movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
- "movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
-@@ -566,7 +566,7 @@
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1015 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
-@@ -634,7 +634,7 @@
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- "pxor %%mm0, %%mm0 \n\t" // zero mm0 register
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1016: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3
- "movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2
- "movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4
-@@ -653,7 +653,7 @@
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1016 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
-@@ -720,13 +720,13 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1017: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "pand (%%ebx), %%mm1 \n\t" // mm1=Src1&Src2
- "movq %%mm1, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1017 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
-@@ -792,13 +792,13 @@
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L91017: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1
- "por (%%ebx), %%mm1 \n\t" // mm1=Src1|Src2
- "movq %%mm1, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%ebx \n\t" // register pointers by 8
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L91017 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
-@@ -860,17 +860,17 @@
- "mov %0, %%edi \n\t" // load Dest address into edi
- "mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10191: \n\t" "mov (%%esi), %%bl \n\t" // load a byte from Src2
-+ "1: \n\t" "mov (%%esi), %%bl \n\t" // load a byte from Src2
- "cmp $0, %%bl \n\t" // check if it zero
-- "jnz .L10192 \n\t" "movb $255, (%%edi) \n\t" // division by zero = 255 !!!
-- "jmp .L10193 \n\t" ".L10192: \n\t" "xor %%ah, %%ah \n\t" // prepare AX, zero AH register
-+ "jnz 2f \n\t" "movb $255, (%%edi) \n\t" // division by zero = 255 !!!
-+ "jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" // prepare AX, zero AH register
- "mov (%%edx), %%al \n\t" // load a byte from Src1 into AL
- "div %%bl \n\t" // divide AL by BL
- "mov %%al, (%%edi) \n\t" // move a byte result to Dest
-- ".L10193: \n\t" "inc %%edx \n\t" // increment Src1, Src2, Dest
-+ "3: \n\t" "inc %%edx \n\t" // increment Src1, Src2, Dest
- "inc %%esi \n\t" // pointer registers by one
- "inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10191 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src2), // %1
- "m"(Src1), // %2
-@@ -907,12 +907,12 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L91117: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into mm1
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into mm1
- "pxor %%mm1, %%mm0 \n\t" // negate mm0 by xoring with mm1
- "movq %%mm0, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1, Src2 and Dest
- "add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L91117 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -980,14 +980,14 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "mov %0, %%edi \n\t" /* load Dest address into edi */
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10141: \n\t" "mov (%%edx), %%al \n\t" /* load a byte from Src1 */
++ "1: \n\t" "mov (%%edx), %%al \n\t" /* load a byte from Src1 */
+ "mulb (%%esi) \n\t" /* mul with a byte from Src2 */
+- ".L10142: \n\t" "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
++ "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
+ "inc %%edx \n\t" /* increment Src1, Src2, Dest */
+ "inc %%esi \n\t" /* pointer registers by one */
+ "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10141 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+ "m"(Src1), /* %2 */
+@@ -557,7 +557,7 @@
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1015: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
+ "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
+ "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
+@@ -574,7 +574,7 @@
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1015 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+@@ -642,7 +642,7 @@
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ "pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1016: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */
+ "movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */
+ "movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */
+@@ -661,7 +661,7 @@
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1016 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+@@ -728,13 +728,13 @@
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1017: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */
+ "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1017 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+@@ -800,13 +800,13 @@
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L91017: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */
+ "movq %%mm1, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%ebx \n\t" /* register pointers by 8 */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L91017 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+@@ -868,17 +868,17 @@
+ "mov %0, %%edi \n\t" /* load Dest address into edi */
+ "mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10191: \n\t" "mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
++ "1: \n\t" "mov (%%esi), %%bl \n\t" /* load a byte from Src2 */
+ "cmp $0, %%bl \n\t" /* check if it zero */
+- "jnz .L10192 \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
+- "jmp .L10193 \n\t" ".L10192: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
++ "jnz 2f \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */
++ "jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */
+ "mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */
+ "div %%bl \n\t" /* divide AL by BL */
+ "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */
+- ".L10193: \n\t" "inc %%edx \n\t" /* increment Src1, Src2, Dest */
++ "3: \n\t" "inc %%edx \n\t" /* increment Src1, Src2, Dest */
+ "inc %%esi \n\t" /* pointer registers by one */
+ "inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10191 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src2), /* %1 */
+ "m"(Src1), /* %2 */
+@@ -939,12 +939,12 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L91117: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm1 */
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm1 */
+ "pxor %%mm1, %%mm0 \n\t" /* negate mm0 by xoring with mm1 */
+ "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */
+ "add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L91117 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1012,14 +1012,14 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L1021: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
- "paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Dest register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1021 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1059,14 +1059,14 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
+ "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Dest register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1021 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1091,14 +1091,14 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L11023: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L11023 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1154,7 +1154,7 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L11023 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1186,7 +1186,7 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L1022: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2
- "psrlw $1, %%mm2 \n\t" // shift 4 WORDS of MM2 1 bit to the right
- // "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2
-@@ -1164,7 +1164,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1022 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1243,13 +1243,13 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1023 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1322,13 +1322,13 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L11024: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L11024 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1405,19 +1405,19 @@
- "mov %3, %%cl \n\t" // load loop counter (N) into CL
- "movd %%ecx, %%mm3 \n\t" // copy (N) into MM3
- "pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1
-- ".L10240: \n\t" // ** Prepare proper bit-Mask in MM1 **
-+ "1: \n\t" // ** Prepare proper bit-Mask in MM1 **
- "psrlw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the right
- // "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1
+ "movq (%%eax), %%mm2 \n\t" /* load 8 bytes from Src1 into MM2 */
+ "psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of MM2 1 bit to the right */
+ /* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 */
+@@ -1196,7 +1196,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1022 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1275,13 +1275,13 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1023 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1354,13 +1354,13 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L11024: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L11024 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1437,19 +1437,19 @@
+ "mov %3, %%cl \n\t" /* load loop counter (N) into CL */
+ "movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */
+ "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
+- ".L10240: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
++ "1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
+ "psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the right */
+ /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */
".byte 0x0f, 0xdb, 0xc8 \n\t"
- "dec %%cl \n\t" // decrease loop counter
-- "jnz .L10240 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ** Shift all bytes of the image **
- "mov %1, %%eax \n\t" // load Src1 address into eax
- "mov %0, %%edi \n\t" // load Dest address into edi
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "dec %%cl \n\t" /* decrease loop counter */
+- "jnz .L10240 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* ** Shift all bytes of the image ** */
+ "mov %1, %%eax \n\t" /* load Src1 address into eax */
+ "mov %0, %%edi \n\t" /* load Dest address into edi */
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L10241: \n\t"
+ "2: \n\t"
- "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "psrlw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the right
- // "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0
-@@ -1426,7 +1426,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10241 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1495,13 +1495,13 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L13023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "psrld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L13023 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1581,8 +1581,8 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- "cmp $128, %%al \n\t" // if (C <= 128) execute more efficient code
-- "jg .L10251 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10250: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "jg 1f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-+ "2: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1593,9 +1593,9 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10250 \n\t" // check loop termination, proceed if required
-- "jmp .L10252 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10251: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "jnz 2b \n\t" // check loop termination, proceed if required
-+ "jmp 3f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-+ "1: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1615,8 +1615,8 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10251 \n\t" // check loop termination, proceed if required
-- ".L10252: \n\t" "emms \n\t" // exit MMX state
-+ "jnz 1b \n\t" // check loop termination, proceed if required
-+ "3: \n\t" "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
- "m"(length), // %2
-@@ -1696,7 +1696,7 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L1026: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "1: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1709,7 +1709,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1026 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1784,25 +1784,25 @@
- "mov %3, %%cl \n\t" // load loop counter (N) into CL
- "movd %%ecx, %%mm3 \n\t" // copy (N) into MM3
- "pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1
-- ".L10270: \n\t" // ** Prepare proper bit-Mask in MM1 **
-+ "1: \n\t" // ** Prepare proper bit-Mask in MM1 **
- "psllw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the left
- // "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1
- ".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" // decrease loop counter
-- "jnz .L10270 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ** Shift all bytes of the image **
- "mov %1, %%eax \n\t" // load Src1 address into eax
- "mov %0, %%edi \n\t" // load SrcDest address into edi
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10271: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
-+ "2: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
- "psllw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the left
- // "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0
- ".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" // store result in Dest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10271 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1870,13 +1870,13 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L12023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
-+ "1: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "pslld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation)
- "movq %%mm0, (%%edi) \n\t" // store result in SrcDest
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L12023 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -1949,8 +1949,8 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- "cmp $7, %%al \n\t" // if (N <= 7) execute more efficient code
-- "jg .L10281 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10280: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "jg 1f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-+ "2: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1961,9 +1961,9 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10280 \n\t" // check loop termination, proceed if required
-- "jmp .L10282 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10281: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
-+ "jnz 2b \n\t" // check loop termination, proceed if required
-+ "jmp 3f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry
-+ "1: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words
- "punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words
-@@ -1983,8 +1983,8 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L10281 \n\t" // check loop termination, proceed if required
-- ".L10282: \n\t" "emms \n\t" // exit MMX state
-+ "jnz 1b \n\t" // check loop termination, proceed if required
-+ "3: \n\t" "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
- "m"(length), // %2
-@@ -2063,7 +2063,7 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte alignment of the loop entry
+ "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "psrlw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the right */
+ /* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */
+@@ -1458,7 +1458,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10241 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1527,13 +1527,13 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L13023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "psrld %3, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L13023 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1613,8 +1613,8 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ "cmp $128, %%al \n\t" /* if (C <= 128) execute more efficient code */
+- "jg .L10251 \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10250: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
++ "jg 1f \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
++ "2: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+ "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
+@@ -1625,9 +1625,9 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10250 \n\t" /* check loop termination, proceed if required */
+- "jmp .L10252 \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10251: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
++ "jmp 3f \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
++ "1: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+ "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
+@@ -1647,8 +1647,8 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10251 \n\t" /* check loop termination, proceed if required */
+- ".L10252: \n\t" "emms \n\t" /* exit MMX state */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
++ "3: \n\t" "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+ "m"(length), /* %2 */
+@@ -1728,7 +1728,7 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L1026: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
++ "1: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+ "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
+@@ -1741,7 +1741,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1026 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1816,25 +1816,25 @@
+ "mov %3, %%cl \n\t" /* load loop counter (N) into CL */
+ "movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */
+ "pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */
+- ".L10270: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
++ "1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */
+ "psllw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the left */
+ /* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */
+ ".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" /* decrease loop counter */
+- "jnz .L10270 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* ** Shift all bytes of the image ** */
+ "mov %1, %%eax \n\t" /* load Src1 address into eax */
+ "mov %0, %%edi \n\t" /* load SrcDest address into edi */
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10271: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
++ "2: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
+ "psllw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the left */
+ /* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */
+ ".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" /* store result in Dest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10271 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1902,13 +1902,13 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L12023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
++ "1: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "pslld %3, %%mm0 \n\t" /* MM0=SrcDest+C (add 8 bytes with saturation) */
+ "movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L12023 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -1981,8 +1981,8 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ "cmp $7, %%al \n\t" /* if (N <= 7) execute more efficient code */
+- "jg .L10281 \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10280: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
++ "jg 1f \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
++ "2: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+ "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
+@@ -1993,9 +1993,9 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10280 \n\t" /* check loop termination, proceed if required */
+- "jmp .L10282 \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10281: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
++ "jmp 3f \n\t" ".align 16 \n\t" /* 16 byte allignment of the loop entry */
++ "1: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+ "punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */
+@@ -2015,8 +2015,8 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L10281 \n\t" /* check loop termination, proceed if required */
+- ".L10282: \n\t" "emms \n\t" /* exit MMX state */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
++ "3: \n\t" "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+ "m"(length), /* %2 */
+@@ -2095,7 +2095,7 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte alignment of the loop entry */
- ".L1029: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0
- "paddusb %%mm2, %%mm0 \n\t" // MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation)
- "pcmpeqb %%mm1, %%mm0 \n\t" // binarize 255:0, comparing to 255
-@@ -2071,7 +2071,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1029 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -2154,7 +2154,7 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */
+ "paddusb %%mm2, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */
+ "pcmpeqb %%mm1, %%mm0 \n\t" /* binarize 255:0, comparing to 255 */
+@@ -2103,7 +2103,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1029 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -2186,7 +2186,7 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L1030: \n\t"
+ "1: \n\t"
- "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0
- "paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+(0xFF-Tmax)
- "psubusb %%mm7, %%mm0 \n\t" // MM0=MM0-(0xFF-Tmax+Tmin)
-@@ -2163,7 +2163,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1030 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -2231,11 +2231,11 @@
- "mov %4, %%bx \n\t" // load Cmax in BX
- "sub %5, %%ax \n\t" // AX = Nmax - Nmin
- "sub %3, %%bx \n\t" // BX = Cmax - Cmin
-- "jz .L10311 \n\t" // check division by zero
-+ "jz 1f \n\t" // check division by zero
- "xor %%dx, %%dx \n\t" // prepare for division, zero DX
- "div %%bx \n\t" // AX = AX/BX
-- "jmp .L10312 \n\t" ".L10311: \n\t" "mov $255, %%ax \n\t" // if div by zero, assume result max. byte value
-- ".L10312: \n\t" // ** Duplicate AX in 4 words of MM0 **
-+ "jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" // if div by zero, assume result max. byte value
-+ "2: \n\t" // ** Duplicate AX in 4 words of MM0 **
- "mov %%ax, %%bx \n\t" // copy AX into BX
- "shl $16, %%eax \n\t" // shift 2 bytes of EAX left
- "mov %%bx, %%ax \n\t" // copy BX into AX
-@@ -2264,7 +2264,7 @@
- "mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx
- "shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time)
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */
+ "paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-Tmax) */
+ "psubusb %%mm7, %%mm0 \n\t" /* MM0=MM0-(0xFF-Tmax+Tmin) */
+@@ -2195,7 +2195,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1030 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -2263,11 +2263,11 @@
+ "mov %4, %%bx \n\t" /* load Cmax in BX */
+ "sub %5, %%ax \n\t" /* AX = Nmax - Nmin */
+ "sub %3, %%bx \n\t" /* BX = Cmax - Cmin */
+- "jz .L10311 \n\t" /* check division by zero */
++ "jz 1f \n\t" /* check division by zero */
+ "xor %%dx, %%dx \n\t" /* prepare for division, zero DX */
+ "div %%bx \n\t" /* AX = AX/BX */
+- "jmp .L10312 \n\t" ".L10311: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */
+- ".L10312: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */
++ "jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */
++ "2: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */
+ "mov %%ax, %%bx \n\t" /* copy AX into BX */
+ "shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */
+ "mov %%bx, %%ax \n\t" /* copy BX into AX */
+@@ -2296,7 +2296,7 @@
+ "mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */
+ "shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L1031: \n\t"
+ "3: \n\t"
- "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3
- "movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4
- "punpcklbw %%mm7, %%mm3 \n\t" // unpack low bytes of SrcDest into words
-@@ -2289,7 +2289,7 @@
- "add $8, %%eax \n\t" // increase Src1 register pointer by 8
- "add $8, %%edi \n\t" // increase Dest register pointer by 8
- "dec %%ecx \n\t" // decrease loop counter
-- "jnz .L1031 \n\t" // check loop termination, proceed if required
-+ "jnz 3b \n\t" // check loop termination, proceed if required
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
- :"m"(Src1), // %1
-@@ -2383,10 +2383,10 @@
- "mov %2, %%edx \n\t" // initialize ROWS counter
- "sub $2, %%edx \n\t" // do not use first and last row
- // ---
-- ".L10320: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
- "sub $2, %%ecx \n\t" // do not use first and last column
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */
+ "movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */
+ "punpcklbw %%mm7, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */
+@@ -2321,7 +2321,7 @@
+ "add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */
+ "add $8, %%edi \n\t" /* increase Dest register pointer by 8 */
+ "dec %%ecx \n\t" /* decrease loop counter */
+- "jnz .L1031 \n\t" /* check loop termination, proceed if required */
++ "jnz 3b \n\t" /* check loop termination, proceed if required */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+ :"m"(Src1), /* %1 */
+@@ -2415,10 +2415,10 @@
+ "mov %2, %%edx \n\t" /* initialize ROWS counter */
+ "sub $2, %%edx \n\t" /* do not use first and last row */
+ /* --- */
+- ".L10320: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
+ "sub $2, %%ecx \n\t" /* do not use first and last column */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L10322: \n\t"
+ "2: \n\t"
- // ---
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the image first row
- "add %%eax, %%esi \n\t" // move one row below
-@@ -2427,11 +2427,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10322 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $2, %%esi \n\t" // move to the next row in Src
- "add $2, %%edi \n\t" // move to the next row in Dest
- "dec %%edx \n\t" // decrease loop counter ROWS
-- "jnz .L10320 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -2474,10 +2474,10 @@
- "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $4, %%ebx \n\t" // do not use first 2 and last 2 rows
- // ---
-- ".L10330: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $4, %%ecx \n\t" // do not use first 2 and last 2 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2577,11 +2577,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10332 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $4, %%esi \n\t" // move to the next row in Src
- "add $4, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10330 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -2622,10 +2622,10 @@
- "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $6, %%ebx \n\t" // do not use first 3 and last 3 rows
- // ---
-- ".L10340: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $6, %%ecx \n\t" // do not use first 3 and last 3 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -2753,11 +2753,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10342 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $6, %%esi \n\t" // move to the next row in Src
- "add $6, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10340 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -2798,10 +2798,10 @@
- "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $8, %%ebx \n\t" // do not use first 4 and last 4 rows
- // ---
-- ".L10350: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $8, %%ecx \n\t" // do not use first 4 and last 4 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3020,11 +3020,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10352 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $8, %%esi \n\t" // move to the next row in Src
- "add $8, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10350 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3071,10 +3071,10 @@
- "mov %2, %%edx \n\t" // initialize ROWS counter
- "sub $2, %%edx \n\t" // do not use first and last row
- // ---
-- ".L10360: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
- "sub $2, %%ecx \n\t" // do not use first and last column
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ /* --- */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
+ "add %%eax, %%esi \n\t" /* move one row below */
+@@ -2459,11 +2459,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10322 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $2, %%esi \n\t" /* move to the next row in Src */
+ "add $2, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%edx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10320 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -2506,10 +2506,10 @@
+ "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
+ /* --- */
+- ".L10330: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10332: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -2609,11 +2609,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10332 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $4, %%esi \n\t" /* move to the next row in Src */
+ "add $4, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10330 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -2654,10 +2654,10 @@
+ "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
+ /* --- */
+- ".L10340: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10342: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -2785,11 +2785,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10342 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $6, %%esi \n\t" /* move to the next row in Src */
+ "add $6, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10340 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -2830,10 +2830,10 @@
+ "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
+ /* --- */
+- ".L10350: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10352: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -3052,11 +3052,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10352 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $8, %%esi \n\t" /* move to the next row in Src */
+ "add $8, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10350 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3103,10 +3103,10 @@
+ "mov %2, %%edx \n\t" /* initialize ROWS counter */
+ "sub $2, %%edx \n\t" /* do not use first and last row */
+ /* --- */
+- ".L10360: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
+ "sub $2, %%ecx \n\t" /* do not use first and last column */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L10362: \n\t"
+ "2: \n\t"
- // ---
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the image first row
- "add %%eax, %%esi \n\t" // move one row below
-@@ -3107,11 +3107,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10362 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $2, %%esi \n\t" // move to the next row in Src
- "add $2, %%edi \n\t" // move to the next row in Dest
- "dec %%edx \n\t" // decrease loop counter ROWS
-- "jnz .L10360 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3154,10 +3154,10 @@
- "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $4, %%ebx \n\t" // do not use first 2 and last 2 rows
- // ---
-- ".L10370: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $4, %%ecx \n\t" // do not use first 2 and last 2 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3256,11 +3256,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10372 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $4, %%esi \n\t" // move to the next row in Src
- "add $4, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10370 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3301,10 +3301,10 @@
- "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $6, %%ebx \n\t" // do not use first 3 and last 3 rows
- // ---
-- ".L10380: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $6, %%ecx \n\t" // do not use first 3 and last 3 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3435,11 +3435,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10382 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $6, %%esi \n\t" // move to the next row in Src
- "add $6, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10380 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3480,10 +3480,10 @@
- "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" // initialize ROWS counter
- "sub $8, %%ebx \n\t" // do not use first 4 and last 4 rows
- // ---
-- ".L10390: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMNS counter
- "sub $8, %%ecx \n\t" // do not use first 4 and last 4 columns
- ".align 16 \n\t" // 16 byte allignment of the loop entry
-- ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
-+ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" // zero MM7 (accumulator)
- "movd %%esi, %%mm6 \n\t" // save ESI in MM6
- // --- 1
- "movq (%%esi), %%mm1 \n\t" // load 8 bytes of the Src
-@@ -3718,11 +3718,11 @@
- "inc %%edi \n\t" // move Dest pointer to the next pixel
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10392 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "add $8, %%esi \n\t" // move to the next row in Src
- "add $8, %%edi \n\t" // move to the next row in Dest
- "dec %%ebx \n\t" // decrease loop counter ROWS
-- "jnz .L10390 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3761,12 +3761,12 @@
- "mov %2, %%edx \n\t" // initialize ROWS counter
- "sub $2, %%edx \n\t" // do not use first and last rows
- // ---
-- ".L10400: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
- "shr $3, %%ecx \n\t" // EBX/8 (MMX loads 8 bytes at a time)
- "mov %%esi, %%ebx \n\t" // save ESI in EBX
- "movd %%edi, %%mm1 \n\t" // save EDI in MM1
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ /* --- */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the image first row */
+ "add %%eax, %%esi \n\t" /* move one row below */
+@@ -3139,11 +3139,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10362 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $2, %%esi \n\t" /* move to the next row in Src */
+ "add $2, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%edx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10360 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3186,10 +3186,10 @@
+ "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $4, %%ebx \n\t" /* do not use first 2 and last 2 rows */
+ /* --- */
+- ".L10370: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $4, %%ecx \n\t" /* do not use first 2 and last 2 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10372: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -3288,11 +3288,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10372 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $4, %%esi \n\t" /* move to the next row in Src */
+ "add $4, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10370 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3333,10 +3333,10 @@
+ "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $6, %%ebx \n\t" /* do not use first 3 and last 3 rows */
+ /* --- */
+- ".L10380: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $6, %%ecx \n\t" /* do not use first 3 and last 3 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10382: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -3467,11 +3467,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10382 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $6, %%esi \n\t" /* move to the next row in Src */
+ "add $6, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10380 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3512,10 +3512,10 @@
+ "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "add %%eax, %%edi \n\t" "mov %2, %%ebx \n\t" /* initialize ROWS counter */
+ "sub $8, %%ebx \n\t" /* do not use first 4 and last 4 rows */
+ /* --- */
+- ".L10390: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMNS counter */
+ "sub $8, %%ecx \n\t" /* do not use first 4 and last 4 columns */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
+- ".L10392: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
++ "2: \n\t" "pxor %%mm7, %%mm7 \n\t" /* zero MM7 (accumulator) */
+ "movd %%esi, %%mm6 \n\t" /* save ESI in MM6 */
+ /* --- 1 */
+ "movq (%%esi), %%mm1 \n\t" /* load 8 bytes of the Src */
+@@ -3750,11 +3750,11 @@
+ "inc %%edi \n\t" /* move Dest pointer to the next pixel */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10392 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "add $8, %%esi \n\t" /* move to the next row in Src */
+ "add $8, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%ebx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10390 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3793,12 +3793,12 @@
+ "mov %2, %%edx \n\t" /* initialize ROWS counter */
+ "sub $2, %%edx \n\t" /* do not use first and last rows */
+ /* --- */
+- ".L10400: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
+ "shr $3, %%ecx \n\t" /* EBX/8 (MMX loads 8 bytes at a time) */
+ "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
+ "movd %%edi, %%mm1 \n\t" /* save EDI in MM1 */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L10402: \n\t"
+ "2: \n\t"
- // ---
- "movq (%%esi), %%mm4 \n\t" // load 8 bytes from Src
- "movq %%mm4, %%mm5 \n\t" // save MM4 in MM5
-@@ -3844,13 +3844,13 @@
- "add $8, %%edi \n\t" // move Dest pointer to the next 8 pixels
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10402 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "mov %%ebx, %%esi \n\t" // restore most left current row Src address
- "movd %%mm1, %%edi \n\t" // restore most left current row Dest address
- "add %%eax, %%esi \n\t" // move to the next row in Src
- "add %%eax, %%edi \n\t" // move to the next row in Dest
- "dec %%edx \n\t" // decrease loop counter ROWS
-- "jnz .L10400 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
-@@ -3889,12 +3889,12 @@
- // initialize ROWS counter
- "subl $2, %2 \n\t" // do not use first and last rows
- // ---
-- ".L10410: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
-+ "1: \n\t" "mov %%eax, %%ecx \n\t" // initialize COLUMS counter
- "shr $3, %%ecx \n\t" // EBX/8 (MMX loads 8 bytes at a time)
- "mov %%esi, %%ebx \n\t" // save ESI in EBX
- "mov %%edi, %%edx \n\t" // save EDI in EDX
- ".align 16 \n\t" // 16 byte allignment of the loop entry
+ /* --- */
+ "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
+ "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
+@@ -3876,13 +3876,13 @@
+ "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10402 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
+ "movd %%mm1, %%edi \n\t" /* restore most left current row Dest address */
+ "add %%eax, %%esi \n\t" /* move to the next row in Src */
+ "add %%eax, %%edi \n\t" /* move to the next row in Dest */
+ "dec %%edx \n\t" /* decrease loop counter ROWS */
+- "jnz .L10400 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */
+@@ -3921,12 +3921,12 @@
+ /* initialize ROWS counter */
+ "subl $2, %2 \n\t" /* do not use first and last rows */
+ /* --- */
+- ".L10410: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
++ "1: \n\t" "mov %%eax, %%ecx \n\t" /* initialize COLUMS counter */
+ "shr $3, %%ecx \n\t" /* EBX/8 (MMX loads 8 bytes at a time) */
+ "mov %%esi, %%ebx \n\t" /* save ESI in EBX */
+ "mov %%edi, %%edx \n\t" /* save EDI in EDX */
+ ".align 16 \n\t" /* 16 byte allignment of the loop entry */
- ".L10412: \n\t"
+ "2: \n\t"
- // ---
- "movq (%%esi), %%mm4 \n\t" // load 8 bytes from Src
- "movq %%mm4, %%mm5 \n\t" // save MM4 in MM5
-@@ -3984,13 +3984,13 @@
- "add $8, %%edi \n\t" // move Dest pointer to the next 8 pixels
- // ---
- "dec %%ecx \n\t" // decrease loop counter COLUMNS
-- "jnz .L10412 \n\t" // check loop termination, proceed if required
-+ "jnz 2b \n\t" // check loop termination, proceed if required
- "mov %%ebx, %%esi \n\t" // restore most left current row Src address
- "mov %%edx, %%edi \n\t" // restore most left current row Dest address
- "add %%eax, %%esi \n\t" // move to the next row in Src
- "add %%eax, %%edi \n\t" // move to the next row in Dest
- "decl %2 \n\t" // decrease loop counter ROWS
-- "jnz .L10410 \n\t" // check loop termination, proceed if required
-+ "jnz 1b \n\t" // check loop termination, proceed if required
- // ---
- "emms \n\t" // exit MMX state
- "popa \n\t":"=m" (Dest) // %0
+ /* --- */
+ "movq (%%esi), %%mm4 \n\t" /* load 8 bytes from Src */
+ "movq %%mm4, %%mm5 \n\t" /* save MM4 in MM5 */
+@@ -4016,13 +4016,13 @@
+ "add $8, %%edi \n\t" /* move Dest pointer to the next 8 pixels */
+ /* --- */
+ "dec %%ecx \n\t" /* decrease loop counter COLUMNS */
+- "jnz .L10412 \n\t" /* check loop termination, proceed if required */
++ "jnz 2b \n\t" /* check loop termination, proceed if required */
+ "mov %%ebx, %%esi \n\t" /* restore most left current row Src address */
+ "mov %%edx, %%edi \n\t" /* restore most left current row Dest address */
+ "add %%eax, %%esi \n\t" /* move to the next row in Src */
+ "add %%eax, %%edi \n\t" /* move to the next row in Dest */
+ "decl %2 \n\t" /* decrease loop counter ROWS */
+- "jnz .L10410 \n\t" /* check loop termination, proceed if required */
++ "jnz 1b \n\t" /* check loop termination, proceed if required */
+ /* --- */
+ "emms \n\t" /* exit MMX state */
+ "popa \n\t":"=m" (Dest) /* %0 */