New csum functions optimized for different processors.

Author: Denis Vlasenko

diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile
--- linux-2.4.20-pre11/arch/i386/lib/Makefile	Mon Sep 10 12:31:30 2001
+++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile	Fri Nov  1 23:55:58 2002
@@ -7,9 +7,17 @@
 L_TARGET = lib.a
-obj-y = checksum.o old-checksum.o delay.o \
+obj-y = old-checksum.o delay.o \
 	usercopy.o getuser.o \
-	memcpy.o strstr.o
+	memcpy.o strstr.o \
+	bench_csum.o \
+	bench_func.o \
+	csum.o \
+	csum_basic.o \
+	csum_naive.o \
+	csum_3dnow.o \
+	csum_ssemmxplus.o \
+	csumcpy.o
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c
--- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c	Wed Dec 31 22:00:00 1969
+++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c	Sat Nov  2 11:51:40 2002
@@ -0,0 +1,216 @@
+#include 		// for get_pages
+#include 		// for access_ok in asm/checksum.h
+#include 		// for in6_addr in asm/checksum.h
+#include 		// for ntoh in asm/checksum.h
+#include 		// for X86_FEATURE_xx
+#include 		// for ntohX in asm/checksum.h
+#include 		// for NULL in asm/checksum.h
+#include 		// for asmlinkage in asm/checksum.h
+#include 
+
+#include 
+#include "bench_func.h"
+
+//#define dprintk(a...) printk(a)
+#define dprintk(a...) ((void)0)
+
+/* Features usable for mem optimization:
+	Intel
+X86_FEATURE_FPU		Onboard FPU
+X86_FEATURE_MMX		Multimedia Extensions
+X86_FEATURE_XMM		Streaming SIMD Extensions
+X86_FEATURE_XMM2	Streaming SIMD Extensions-2
+	AMD
+X86_FEATURE_3DNOW	3DNow!
+X86_FEATURE_MMXEXT	AMD MMX extensions
+X86_FEATURE_3DNOWEXT	AMD 3DNow! extensions
+	Cyrix
+X86_FEATURE_CXMMX	Cyrix MMX extensions
+*/
+
+typedef typeof(jiffies) jiffies_t;
+
+typedef void asm_helper(void);
+
+extern asm_helper csum_basic;
+extern asm_helper csum_naive;
+extern asm_helper csum_3dnow;
+extern asm_helper csum_ssemmxplus;
+
+static struct candidate csum_runner[] = {
+	{ "basic"  , csum_basic     , 1, { -1 } },
+	{ "simple" , csum_naive     , 1, { -1 } },
+	{ "3Dnow!"
, csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } }, + { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } }, + { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } }, +}; + +extern asm_helper csumcpy_basic; +extern asm_helper csumcpy_naive; +extern asm_helper csumcpy_ssemmxplus; +extern asm_helper csumcpy_sse; + +static struct candidate csumcpy_runner[] = { + { "basic" , csumcpy_basic , 2, { -1 } }, + { "simple" , csumcpy_naive , 2, { -1 } }, + /* higher weight: we prefer these for less cache pollution: */ + { "AND MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } }, + { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } }, + { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } }, +}; + +//====== TODO: split here: above: arch, below:generic + +/* set this to value bigger than cache(s) */ +/* TODO: heuristic for buffer size */ +#define bufshift 20 /* 10=1kb, 20=1MB etc */ +/* typical size of a packet */ +#define chunksz (4*1024) + +#define bufsz (1<f); + + max = 0; + // In practice these are pretty repeatable + // so 3 runs is an overkill + for(i=0; i<3; i++) { + int count = 0; + jiffies_t limit; + wait_for_jiffy(); + limit = jiffies+duration; + while(time_before(jiffies, limit)) { + int i; + mb(); + // interleaved to avoid bias due to prefetch + for(i=0; imax) + max = count; + } + + if(report) { + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, + kb_sec / 1000, kb_sec % 1000); + } + + return max; +} + +static int +bench_csumcpy(struct candidate *cand, char *buf) +{ + int err; + int i, max; + best_csumcpy = (asm_helper*)(cand->f); + + max = 0; + for(i=0; i<3; i++) { + int count = 0; + jiffies_t limit; + wait_for_jiffy(); + limit = jiffies+duration; + while(time_before(jiffies, limit)) { + int i; + mb(); + // interleaved to avoid bias due to prefetch + for(i=0; imax) + max = count; + } + + if(report) { + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, + kb_sec / 1000, kb_sec % 1000); + } + + return max; +} + +static int +find_best_csum(void) +{ + struct candidate *best; + char *buffer = (char *) __get_free_pages(GFP_KERNEL, + (bufshift-PAGE_SHIFT)); + + printk(KERN_INFO "Measuring network checksumming speed\n"); + if(!buffer) { + printk("csum: cannot allocate %i pages\n", + 1<<(bufshift-PAGE_SHIFT) + ); + return -ENOMEM; + } + dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT)); + + // find # of jiffies suitable for reliable results + // (at least %5 accuracy) + while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) { + duration<<=1; + } + dprintk("test run will last %i ticks\n", duration); + report = 1; + + best = find_best(bench_csum, buffer, csum_runner, + VECTOR_SZ(csum_runner)); + printk("csum: using csum function: %s\n", best->name); + best_csum = (asm_helper*)(best->f); + + best = find_best(bench_csumcpy, buffer, csumcpy_runner, + VECTOR_SZ(csumcpy_runner)); + printk("csum: using csum_copy function: %s\n", best->name); + best_csumcpy = (asm_helper*)(best->f); + + free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT)); + dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT)); + return 0; +} + +MODULE_LICENSE("GPL"); + +module_init(find_best_csum); diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c --- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002 @@ -0,0 +1,53 @@ +#include // for 
KERN_DEBUG + +#include // for test_bit +#include // cpu caps +#include // cpu features constants +#include "bench_func.h" + +//#define dprintk(a...) printk(a) +#define dprintk(a...) ((void)0) + +// 2.4 only, already in 2.5 +extern inline int +boot_cpu_has(int cap) +{ + return test_bit(cap, boot_cpu_data.x86_capability); +} + +extern inline int +cpu_supports(int *cap) +{ + while(*cap != -1) { + if(!boot_cpu_has(*cap)) { + dprintk("unsupported caps: %i\n", *cap); + return 0; + } + cap++; + } + return 1; +} + +/* +** Call all the candidates which can be run on this CPU, +** find the best +*/ +struct candidate* +find_best(bench_func *bench, char *opaque, struct candidate runner[], int count) +{ + int score, max = 0; + struct candidate *best = 0; + while(count--) { + if(!cpu_supports(runner->cpu_caps_needed)) { + printk("func %s skipped: not supported by CPU\n", runner->name); + } else { + score = bench(runner,opaque) * runner->weight; + if(max < score) { + max = score; + best = runner; + } + } + runner++; + } + return best; +} diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h --- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002 @@ -0,0 +1,16 @@ +#ifndef _BENCH_FUNC_H +#define _BENCH_FUNC_H + +struct candidate { + const char *name; + void *f; // pointer to func + int weight; + int cpu_caps_needed[4]; +}; + +typedef int bench_func(struct candidate *cand, char *opaque); + +struct candidate* find_best(bench_func *bench, char *opaque, + struct candidate runner[], int count); + +#endif diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S --- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002 +++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969 @@ -1,496 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, - * Arnt Gulbrandsen, - * Tom May, - * Pentium Pro/II routines: - * Alexander Kjeldaas - * Finn Arne Gangstad - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception - * handling. - * Andi Kleen, add zeroing on error - * converted to pure assembler - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include - -/* - * computes a partial checksum, e.g. for TCP/UDP fragments - */ - -/* -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) - */ - -.text -.align 4 -.globl csum_partial - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - - /* - * Experiments with Ethernet and SLIP connections show that buff - * is aligned on either a 2-byte or 4-byte boundary. We get at - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. - * Fortunately, it is easy to convert 2-byte alignment to 4-byte - * alignment for the unrolled loop. 
- */ -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: unsigned char *buff - testl $3, %esi # Check alignment. - jz 2f # Jump if alignment is ok. - testl $1, %esi # Check alignment. - jz 10f # Jump if alignment is boundary of 2bytes. - - # buf is odd - dec %ecx - jl 8f - movzbl (%esi), %ebx - adcl %ebx, %eax - roll $8, %eax - inc %esi - testl $2, %esi - jz 2f -10: - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -1: movw (%esi), %bx - addl $2, %esi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, %edx - shrl $5, %ecx - jz 2f - testl %esi, %esi -1: movl (%esi), %ebx - adcl %ebx, %eax - movl 4(%esi), %ebx - adcl %ebx, %eax - movl 8(%esi), %ebx - adcl %ebx, %eax - movl 12(%esi), %ebx - adcl %ebx, %eax - movl 16(%esi), %ebx - adcl %ebx, %eax - movl 20(%esi), %ebx - adcl %ebx, %eax - movl 24(%esi), %ebx - adcl %ebx, %eax - movl 28(%esi), %ebx - adcl %ebx, %eax - lea 32(%esi), %esi - dec %ecx - jne 1b - adcl $0, %eax -2: movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -3: adcl (%esi), %eax - lea 4(%esi), %esi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f - movw (%esi),%cx - leal 2(%esi),%esi - je 6f - shll $16,%ecx -5: movb (%esi),%cl -6: addl %ecx,%eax - adcl $0, %eax -7: - testl $1, 12(%esp) - jz 8f - roll $8, %eax -8: - popl %ebx - popl %esi - ret - -#else - -/* Version for PentiumII/PPro */ - -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: const unsigned char *buf - - testl $3, %esi - jnz 25f -10: - movl %ecx, %edx - movl %ecx, %ebx - andl $0x7c, %ebx - shrl $7, %ecx - addl %ebx,%esi - shrl $2, %ebx - negl %ebx - lea 45f(%ebx,%ebx,2), %ebx - testl %esi, %esi - jmp *%ebx - - # Handle 2-byte-aligned regions -20: addw (%esi), %ax - lea 2(%esi), %esi - adcl $0, %eax - jmp 10b -25: - testl $1, %esi - jz 30f - # buf is odd - dec %ecx - jl 90f - movzbl (%esi), %ebx - addl %ebx, %eax - adcl $0, %eax - roll $8, %eax - inc %esi - testl $2, %esi - jz 10b - -30: subl $2, %ecx - ja 20b - je 32f - addl $2, %ecx - jz 80f - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned - addl %ebx, %eax - adcl $0, %eax - jmp 80f -32: - addw (%esi), %ax # csumming 2 bytes, 2-aligned - adcl $0, %eax - jmp 80f - -40: - addl -128(%esi), %eax - adcl -124(%esi), %eax - adcl -120(%esi), %eax - adcl -116(%esi), %eax - adcl -112(%esi), %eax - adcl -108(%esi), %eax - adcl -104(%esi), %eax - adcl -100(%esi), %eax - adcl -96(%esi), %eax - adcl -92(%esi), %eax - adcl -88(%esi), %eax - adcl -84(%esi), %eax - adcl -80(%esi), %eax - adcl -76(%esi), %eax - adcl -72(%esi), %eax - adcl -68(%esi), %eax - adcl -64(%esi), %eax - adcl -60(%esi), %eax - adcl -56(%esi), %eax - adcl -52(%esi), %eax - adcl -48(%esi), %eax - adcl -44(%esi), %eax - adcl -40(%esi), %eax - adcl -36(%esi), %eax - adcl -32(%esi), %eax - adcl -28(%esi), %eax - adcl -24(%esi), %eax - adcl -20(%esi), %eax - adcl -16(%esi), %eax - adcl -12(%esi), %eax - adcl -8(%esi), %eax - adcl -4(%esi), %eax -45: - lea 128(%esi), %esi - adcl $0, %eax - dec %ecx - jge 40b - movl %edx, %ecx -50: andl $3, %ecx - jz 80f - - # Handle the last 1-3 bytes without jumping - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked - movl $0xffffff,%ebx # by the shll and shrl instructions 
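The tail trick being set up here (it continues with the shll/shrl just below, and is carried over into the new csum.S and csumcpy.S later in this patch) reads the whole last dword and masks off the bytes past the end of the buffer instead of branching on the tail length. A rough C rendering of the idea; little-endian is assumed, the 4-aligned tail dword cannot cross a page so the over-read is safe, and the helper names are made up for illustration:

#include <stdint.h>

/* keep only the n = len%4 (1..3) valid low-order bytes of the last
 * 32-bit word: n=1 -> 0x000000ff, n=2 -> 0x0000ffff, n=3 -> 0x00ffffff */
static uint32_t tail_mask(uint32_t n)
{
	return 0xffffffu >> (8 * (~n & 3));	/* ~n & 3: 1->2, 2->1, 3->0 */
}

/* add the 1..3 trailing bytes without byte loads or branches */
static uint32_t add_tail(uint32_t sum, const uint32_t *aligned_tail, uint32_t len)
{
	uint64_t acc = (uint64_t)sum + (*aligned_tail & tail_mask(len & 3));

	return (uint32_t)(acc + (acc >> 32));	/* like the adcl $0, %eax */
}
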
- shll $3,%ecx - shrl %cl,%ebx - andl -128(%esi),%ebx # esi is 4-aligned so should be ok - addl %ebx,%eax - adcl $0,%eax -80: - testl $1, 12(%esp) - jz 90f - roll $8, %eax -90: - popl %ebx - popl %esi - ret - -#endif - -/* -unsigned int csum_partial_copy_generic (const char *src, char *dst, - int len, int sum, int *src_err_ptr, int *dst_err_ptr) - */ - -/* - * Copy from ds while checksumming, otherwise like csum_partial - * - * The macros SRC and DST specify the type of access for the instruction. - * thus we can call a custom exception handler for all access types. - * - * FIXME: could someone double-check whether I haven't mixed up some SRC and - * DST definitions? It's damn hard to trigger all cases. I hope I got - * them all but there's no guarantee. - */ - -#define SRC(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous - -#define DST(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous - -.align 4 -.globl csum_partial_copy_generic - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - -#define ARGBASE 16 -#define FP 12 - -csum_partial_copy_generic: - subl $4,%esp - pushl %edi - pushl %esi - pushl %ebx - movl ARGBASE+16(%esp),%eax # sum - movl ARGBASE+12(%esp),%ecx # len - movl ARGBASE+4(%esp),%esi # src - movl ARGBASE+8(%esp),%edi # dst - - testl $2, %edi # Check alignment. - jz 2f # Jump if alignment is ok. - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -SRC(1: movw (%esi), %bx ) - addl $2, %esi -DST( movw %bx, (%edi) ) - addl $2, %edi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, FP(%esp) - shrl $5, %ecx - jz 2f - testl %esi, %esi -SRC(1: movl (%esi), %ebx ) -SRC( movl 4(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - adcl %edx, %eax -DST( movl %edx, 4(%edi) ) - -SRC( movl 8(%esi), %ebx ) -SRC( movl 12(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 8(%edi) ) - adcl %edx, %eax -DST( movl %edx, 12(%edi) ) - -SRC( movl 16(%esi), %ebx ) -SRC( movl 20(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 16(%edi) ) - adcl %edx, %eax -DST( movl %edx, 20(%edi) ) - -SRC( movl 24(%esi), %ebx ) -SRC( movl 28(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 24(%edi) ) - adcl %edx, %eax -DST( movl %edx, 28(%edi) ) - - lea 32(%esi), %esi - lea 32(%edi), %edi - dec %ecx - jne 1b - adcl $0, %eax -2: movl FP(%esp), %edx - movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -SRC(3: movl (%esi), %ebx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f -SRC( movw (%esi), %cx ) - leal 2(%esi), %esi -DST( movw %cx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%ecx -SRC(5: movb (%esi), %cl ) -DST( movb %cl, (%edi) ) -6: addl %ecx, %eax - adcl $0, %eax -7: -5000: - -# Exception handler: -.section .fixup, "ax" - -6001: - movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - - # zero the complete destination - computing the rest - # is too much work - movl ARGBASE+8(%esp), %edi # dst - movl ARGBASE+12(%esp), %ecx # len - xorl %eax,%eax - rep ; stosb - - jmp 5000b - -6002: - movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT,(%ebx) - jmp 5000b - -.previous - - popl %ebx - popl %esi - popl %edi - popl %ecx # equivalent to addl $4,%esp - ret - -#else - -/* Version for PentiumII/PPro */ - -#define ROUND1(x) \ - SRC(movl x(%esi), %ebx ) ; \ - addl %ebx, %eax ; \ - DST(movl %ebx, 
x(%edi) ) ; - -#define ROUND(x) \ - SRC(movl x(%esi), %ebx ) ; \ - adcl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; - -#define ARGBASE 12 - -csum_partial_copy_generic: - pushl %ebx - pushl %edi - pushl %esi - movl ARGBASE+4(%esp),%esi #src - movl ARGBASE+8(%esp),%edi #dst - movl ARGBASE+12(%esp),%ecx #len - movl ARGBASE+16(%esp),%eax #sum -# movl %ecx, %edx - movl %ecx, %ebx - movl %esi, %edx - shrl $6, %ecx - andl $0x3c, %ebx - negl %ebx - subl %ebx, %esi - subl %ebx, %edi - lea -1(%esi),%edx - andl $-32,%edx - lea 3f(%ebx,%ebx), %ebx - testl %esi, %esi - jmp *%ebx -1: addl $64,%esi - addl $64,%edi - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) -3: adcl $0,%eax - addl $64, %edx - dec %ecx - jge 1b -4: movl ARGBASE+12(%esp),%edx #len - andl $3, %edx - jz 7f - cmpl $2, %edx - jb 5f -SRC( movw (%esi), %dx ) - leal 2(%esi), %esi -DST( movw %dx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%edx -5: -SRC( movb (%esi), %dl ) -DST( movb %dl, (%edi) ) -6: addl %edx, %eax - adcl $0, %eax -7: -.section .fixup, "ax" -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - # zero the complete destination (computing the rest is too much work) - movl ARGBASE+8(%esp),%edi # dst - movl ARGBASE+12(%esp),%ecx # len - xorl %eax,%eax - rep; stosb - jmp 7b -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) - jmp 7b -.previous - - popl %esi - popl %edi - popl %ebx - ret - -#undef ROUND -#undef ROUND1 - -#endif diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S --- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002 @@ -0,0 +1,97 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error converted to pure assembler + * 2002-10-30 Denis Vlasenko + * boot-time benchmarking, 3Dnow/MMX+/SSE versions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** +** unsigned int csum_partial(const unsigned char * buff, +** int len, unsigned int sum) +*/ + +.text +.align 4 +.globl csum_partial + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp), %eax # arg: sum + movl 16(%esp), %ecx # arg: len + movl 12(%esp), %esi # arg: buf + + testl $3, %esi + jz 40f +20: + # not 4-aligned: analyze and align... + testl $1, %esi + jz 30f + + # unaligned start addr + decl %ecx + js 90f # sz==0, exit + movzbl (%esi), %ebx # eat one byte... + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax # NB: need to be undone at exit! 
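The roll $8 here, undone by the matching roll at exit, leans on a classical property of the one's-complement sum (RFC 1071 calls it byte-order independence): byte-swapping every 16-bit word of the input only byte-swaps the resulting sum, so data summed one byte out of phase can be fixed up with a rotate. A small user-space demonstration of that property; an illustrative aside, not code from this patch:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static uint16_t csum16(const uint16_t *w, int nwords)
{
	uint32_t sum = 0;

	while (nwords--)
		sum += *w++;
	while (sum >> 16)			/* end-around carry fold */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t swab16(uint16_t x)
{
	return (uint16_t)((x << 8) | (x >> 8));
}

int main(void)
{
	uint16_t a[64], b[64];
	int i;

	srand(1);
	for (i = 0; i < 64; i++) {
		a[i] = (uint16_t)rand();
		b[i] = swab16(a[i]);		/* the data as seen one byte "late" */
	}
	/* swapping the data bytes only swaps the bytes of the sum */
	assert(csum16(b, 64) == swab16(csum16(a, 64)));
	return 0;
}
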
+ incl %esi + testl $2, %esi + jz 40f +30: + # Note: 2-aligned, but not 4-aligned + cmpl $3, %ecx + jbe 60f + addw (%esi), %ax # eat 2 bytes + leal 2(%esi), %esi + adcl $0, %eax + subl $2, %ecx +40: + # esi is 4-aligned here, call block routine + movl $csum_basic, %ebx # known ok even for ecx==0 etc + cmpl $128, %ecx # use optimized routine + jb 50f # only for large blocks + movl best_csum, %ebx +50: call *%ebx +60: + # handle the last 0-3 bytes without much jumping + jecxz 80f + notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff, %ebx # by the shll and shrl instructions + shll $3, %ecx + shrl %cl, %ebx + andl (%esi), %ebx # esi is 4-aligned so should be ok + addl %ebx, %eax + adcl $0, %eax +80: + # undo csum rotation if start addr was odd + testl $1, 12(%esp) + jz 90f + roll $8, %eax +90: + popl %ebx + popl %esi + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S --- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002 @@ -0,0 +1,4 @@ +#define PREFETCH(a) prefetch a +#define NAME csum_3dnow + +#include "csum_pf.inc" diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S --- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002 @@ -0,0 +1,63 @@ +.text +.align 4 +.globl csum_basic + +/* Experiments with Ethernet and SLIP connections show that buff +** is aligned on either a 2-byte or 4-byte boundary. We get at +** least a twofold speedup on 486 and Pentium if it is 4-byte aligned. +** Fortunately, it is easy to convert 2-byte alignment to 4-byte +** alignment for the unrolled loop. 
+*/ +csum_basic: + movl %ecx, %ebx + movl %ecx, %edx + shrl $7, %ecx + andl $0x7c, %ebx + addl %ebx, %esi + shrl $2, %ebx + negl %ebx + leal 50f(%ebx,%ebx,2), %ebx + clc + jmp *%ebx +40: + leal 128(%esi), %esi + adcl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +50: + decl %ecx + jge 40b + + adcl $0, %eax + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S --- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002 @@ -0,0 +1,17 @@ +.text +.align 4 +.globl csum_naive + +csum_naive: + mov %ecx, %edx + shrl $2, %ecx + clc +1: + adcl (%esi), %eax + leal 4(%esi), %esi + loop 1b + + adcl $0, %eax + mov %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc --- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002 @@ -0,0 +1,95 @@ +//#define PREFETCH(a) prefetchnta a +//#define PREFETCH(a) prefetch a +//#define PREFETCH(a) + +// How much unrolling do you want? +//vda: 5 is best on Duron 650 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes + // NB: tweak unrolled loop too... +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** int csum_partial(const char *buff, int len, int sum) +*/ + +#define ITER_SZ (1<=16 +10: + PREFETCH((%esi)) # Prefetch _each_ cacheline + PREFETCH(32(%esi)) # Note! Athlons have 64 bytes long ones, but + PREFETCH(64(%esi)) # PIIIs only 32! This gives ~20% speedup + PREFETCH(64+32(%esi)) # for PIII + PREFETCH(128(%esi)) + PREFETCH(128+32(%esi)) + PREFETCH(192(%esi)) + PREFETCH(192+32(%esi)) + movl %ecx, %ebx + movl %ecx, %edx + andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration + shrl $ITER_BITS, %ecx # = iterations to make + addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration + shrl $2, %ebx # = dwords to handle + negl %ebx + lea 50f(%ebx,%ebx,2), %ebx # = 45f - 3*dwords_to_handle + clc + jmp *%ebx # here we go! + +40: + PREFETCH(256(%esi)) +41: + lea ITER_SZ(%esi), %esi # does NOT change CF! 
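Both csum_basic and this prefetching loop enter the unrolled body through a computed jump (the leal 50f(%ebx,%ebx,2) / jmp *%ebx above), so the first, partial iteration covers len modulo the unroll size and every later iteration is full. In C the same idea is Duff's device; a rough sketch, with a 64-bit accumulator standing in for the adcl carry chain, not the kernel code itself:

#include <stddef.h>
#include <stdint.h>

static uint32_t sum_dwords(const uint32_t *p, size_t ndwords, uint32_t sum)
{
	uint64_t acc = sum;
	size_t n;

	if (!ndwords)
		return sum;
	n = (ndwords + 7) / 8;			/* iterations; the first one is partial */
	switch (ndwords % 8) {			/* jump into the unrolled body */
	case 0: do {	acc += *p++;
	case 7:		acc += *p++;
	case 6:		acc += *p++;
	case 5:		acc += *p++;
	case 4:		acc += *p++;
	case 3:		acc += *p++;
	case 2:		acc += *p++;
	case 1:		acc += *p++;
		} while (--n);
	}
	while (acc >> 32)			/* fold carries, which adcl does on the fly */
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}
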
+/* + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax +*/ + addl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +50: + adcl $0, %eax + dec %ecx # does NOT change CF! + # We can do just "jge 40b" here, but we can be a bit clever... + # This little twist gives surprisingly noticeable benefits! + # Seen 11% increase on random 1K blocks on Duron 650 + js 60f + cmp $256/ITER_SZ, %ecx + jae 40b # need prefetch + jmp 41b # do not need it +60: + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S --- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002 @@ -0,0 +1,4 @@ +#define PREFETCH(a) prefetchnta a +#define NAME csum_ssemmxplus + +#include "csum_pf.inc" diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S --- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002 @@ -0,0 +1,178 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error converted to pure assembler + * 2002-10-30 Denis Vlasenko + * boot-time benchmarking, 3Dnow/MMX+/SSE versions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** +** unsigned int csum_partial(const unsigned char * buff, +** int len, unsigned int sum) +*/ + +#ifdef __KERNEL__ +#define K(a...) a +#else +#define K(a...) +#endif + +#define SRC(y...) \ +9999: y ;\ + .section __ex_table, "a";\ + .long 9999b, 6001f ;\ + .previous + +#define DST(y...) 
\ +9999: y ;\ + .section __ex_table, "a";\ + .long 9999b, 6002f ;\ + .previous + +#define KERNEL_FPU_BEGIN \ + call kernel_fpu_begin + +#define KERNEL_FPU_END(r) \ +K( movl %cr0, r ;)\ +K( orl $8, r ;)\ +K( movl r, %cr0 ;) + +.text + +#include "csumcpy_naive.inc" +#include "csumcpy_basic.inc" +#include "csumcpy_ssemmxplus.inc" +#include "csumcpy_sse.inc" + +.align 4 +.globl csum_partial_copy_generic + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + pushl %ebp + movl %esp, %ebp + +#define STK_DERR 40(%ebp) +#define STK_SERR 36(%ebp) +#define STK_SUM 32(%ebp) +#define STK_LEN 28(%ebp) +#define STK_DST 24(%ebp) +#define STK_SRC 20(%ebp) +#define STK_EIP 16(%ebp) +#define STK_EBX 12(%ebp) +#define STK_EDI 8(%ebp) +#define STK_ESI 4(%ebp) +#define STK_EBP (%ebp) + + movl STK_SRC, %esi #src + movl STK_DST, %edi #dst + movl STK_LEN, %ecx #len + movl STK_SUM, %eax #sum + + testl $3, %edi # Check dst alignment + jz 40f + + # not 4-aligned: analyze and align... + testl $1, %edi + jz 30f + + # unaligned start addr + decl %ecx + js 90f # sz==0, exit + movzbl (%esi), %ebx # eat one byte... + movb %bl, (%edi) + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax # NB: need to be undone at exit! + incl %esi + incl %edi + testl $2, %edi + jz 40f +30: + # xxx 2-aligned, but not 4-aligned + cmpl $3, %ecx + jbe 60f + movw (%esi), %bx # eat 2 bytes + addw %bx, %ax + movw %bx, (%edi) + adcl $0, %eax + leal 2(%esi), %esi + leal 2(%edi), %edi + subl $2, %ecx +40: + # edi is 4-aligned now: call block routine + movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc + cmpl $128, %ecx # use optimized routine + jb 50f # only for large blocks + movl best_csumcpy, %ebx +50: call *%ebx +60: + # handle last 0-3 bytes + jecxz 80f + cmpl $2, %ecx + jb 70f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 75f + shll $16, %ecx +70: +SRC( movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +75: addl %ecx, %eax + adcl $0, %eax +80: + # undo csum rotation if dst was unaligned + testl $1, STK_DST + jz 90f + roll $8, %eax +90: + movl %esp, %ebp + popl %ebp + popl %esi + popl %edi + popl %ebx + ret + + +.section .fixup, "ax" +6001: movl STK_SERR, %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl STK_DST, %edi # dst + movl STK_LEN, %ecx # len + xorl %eax, %eax + cld + rep; stosb + jmp 90b +6002: movl STK_DERR, %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 90b +.previous diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002 @@ -0,0 +1,40 @@ +// Please somebody experiment with unroll length +// on a PII. Do _not_ optimize for PIII/Athlons/etc, +// they won't typically use this... 
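All the csumcpy_* routines below implement the same one-pass copy-with-checksum idea: each word is pulled through the CPU once and feeds both the store and the running sum. A rough user-space C equivalent that skips the SRC()/DST() fault fixups and the alignment games; the 32-bit partial it returns may differ bit-for-bit from the assembler's, but it folds to the same 16-bit checksum:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t copy_and_csum(void *dst, const void *src, size_t len, uint32_t sum)
{
	const unsigned char *s = src;
	unsigned char *d = dst;
	uint64_t acc = sum;
	uint32_t w;

	while (len >= 4) {			/* one load feeds both the store and the sum */
		memcpy(&w, s, 4);
		memcpy(d, &w, 4);
		acc += w;
		s += 4;
		d += 4;
		len -= 4;
	}
	if (len) {				/* 1..3 trailing bytes, zero-padded */
		w = 0;
		memcpy(&w, s, len);
		memcpy(d, s, len);
		acc += w;
	}
	while (acc >> 32)
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}
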
+ +.align 4 +.globl csumcpy_basic + +csumcpy_basic: + movl %ecx, %ebx + movl %ecx, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + leal 50f(%ebx,%ebx), %ebx + clc + jmp *%ebx +40: + leal 64(%esi), %esi + leal 64(%edi), %edi + +#undef ROUND +#define ROUND(x) \ +SRC( movl x(%esi), %ebx ); \ + adcl %ebx, %eax ; \ +DST( movl %ebx, x(%edi) ); + + ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4) +50: + decl %ecx + jge 40b + + adcl $0, %eax + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002 @@ -0,0 +1,21 @@ +// Heh... at least it's small ;) + +.align 4 +.globl csumcpy_naive + +csumcpy_naive: + mov %ecx, %edx + shrl $2, %ecx + clc +1: +SRC( movl (%esi), %ebx ) +DST( movl %ebx, (%edi) ) + adcl %ebx, %eax + leal 4(%esi), %esi + leal 4(%edi), %edi + loop 1b + + adcl $0, %eax + mov %edx, %ecx + and $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002 @@ -0,0 +1,147 @@ +// Huge routine, I don't like it's size and number +// of fixups... think of that when you want +// to unroll loop more +// TODO: benchmark and reduce size +// I won't stand 1K behemot just for 5% speedup + +#undef PREFETCH +#define PREFETCH(a) prefetchnta a + +// How much unrolling do you want? +// vda: celeron 1200: 5 with movaps, 4 with movups +#undef ITER_BITS +#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes + // NB: tweak unrolled loop too... 
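ITER_BITS here was tuned by hand (5 with movaps, 4 with movups on a Celeron 1200, per the comment above). The in-kernel benchmark already times whole routines at boot; if someone wants to pre-screen unroll factors or loop variants in user space first, a harness along these lines is enough. The function type, buffer sizes and the use of clock_gettime() are assumptions for illustration, not part of the patch:

#include <stdint.h>
#include <time.h>

#define BUF_SZ	(1 << 20)		/* bigger than the caches, as in bench_csum.c */
#define CHUNK	4096			/* roughly packet-sized blocks */

typedef uint32_t csum_fn(const unsigned char *buf, int len, uint32_t sum);

/* time one candidate: checksum the buffer chunk by chunk for ~half a second */
static double mb_per_sec(csum_fn *f, unsigned char *buf)
{
	struct timespec t0, t1;
	volatile uint32_t sink = 0;	/* keep the calls from being optimized away */
	double secs, bytes = 0;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	do {
		int off;

		for (off = 0; off + CHUNK <= BUF_SZ; off += CHUNK)
			sink += f(buf + off, CHUNK, 0);
		bytes += BUF_SZ;
		clock_gettime(CLOCK_MONOTONIC, &t1);
		secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	} while (secs < 0.5);
	(void)sink;
	return bytes / secs / 1e6;
}
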
+ +#undef ITER_SZ +#undef ITER_MSK +#define ITER_SZ (1<6 don't mix + adcl $0, %eax + jmp 19f +15: # esi is 16-aligned + PREFETCH(256(%esi)) + ROUND0(a,%xmm0) + ROUND(a,16,%xmm1) + PREFETCH(256+32(%esi)) + ROUND(a,32,%xmm0) + ROUND(a,48,%xmm1) + lea ITER_SZ(%esi), %esi + lea ITER_SZ(%edi), %edi + //dec %ecx + //jnz 15b + loop 15b // Beware: loop and ITER_BITS>6 don't mix + adcl $0, %eax +19: + sfence # clean up XMM + //KERNEL_FPU_END(%ebx) + movups (%esp), %xmm0 + movups 16(%esp), %xmm1 + addl $32, %esp +K( movl %ebx, %cr0 ) + +20: + # loop for dwords + movl %edx, %ecx + andl $ITER_MSK, %edx + jz 40f + shrl $2, %edx # this also clears CF +30: +SRC( movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jnz 30b + adcl $0, %eax +40: + # last 1, 2 or 3 bytes: handled by caller + andl $3, %ecx + ret + + +# xxx 16-align edi and get back +5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here + mov %ecx, %edx # edx needed at 20: + jb 20b # not worthy: too short + +5520: test $0xe, %edi # loop until we are 16-aligned + jz 1b +SRC( movl (%esi), %ebx ) + addl $4, %esi +DST( movl %ebx, (%edi) ) + addl $4, %edi + addl %ebx, %eax + adcl $0, %eax + subl $4, %ecx + jmp 5520b diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002 @@ -0,0 +1,103 @@ +#undef PREFETCH +#define PREFETCH(a) prefetchnta a + +// How much unrolling do you want? +#undef ITER_BITS +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes + // NB: tweak unrolled loop too... + +#undef ITER_SZ +#undef ITER_MSK +#define ITER_SZ (1<5 don't mix + adcl $0, %eax + + sfence + //KERNEL_FPU_END(%ebx) + frstor (%esp) + addl $108, %esp +K( movl %ebx, %cr0 ) + +20: + # loop for dwords + movl %edx, %ecx + andl $ITER_MSK, %edx + jz 40f + shrl $2, %edx # this also clears CF +30: +SRC( movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jnz 30b + adcl $0, %eax + +40: andl $3, %ecx + ret
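
For anyone adding further variants: each routine must return a 32-bit partial sum that folds to the same 16-bit one's-complement checksum as a trivial reference, for any length and alignment. Below is a user-space reference (RFC 1071 style) that a new candidate can be checked against before it is wired into the candidate tables; csum_partial_new is a placeholder name and the whole snippet is a sketch, not kernel code:

#include <stddef.h>
#include <stdint.h>

/* fold a 32-bit partial sum down to the final 16-bit checksum value */
static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* byte-at-a-time reference for csum_partial(), little-endian host assumed */
static uint32_t csum_ref(const unsigned char *buf, size_t len, uint32_t sum)
{
	uint64_t acc = sum;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		acc += buf[i] | (buf[i + 1] << 8);
	if (len & 1)
		acc += buf[len - 1];		/* odd tail byte, zero-padded */
	while (acc >> 32)
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}

/* usage: assert(fold(csum_ref(buf, len, 0)) == fold(csum_partial_new(buf, len, 0))); */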