New csum functions optimized for different processors.

Author: Denis Vlasenko

diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile
--- linux-2.4.20-pre11/arch/i386/lib/Makefile	Mon Sep 10 12:31:30 2001
+++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile	Fri Nov  1 23:55:58 2002
@@ -7,9 +7,17 @@
 L_TARGET = lib.a
-obj-y = checksum.o old-checksum.o delay.o \
+obj-y = old-checksum.o delay.o \
 	usercopy.o getuser.o \
-	memcpy.o strstr.o
+	memcpy.o strstr.o \
+	bench_csum.o \
+	bench_func.o \
+	csum.o \
+	csum_basic.o \
+	csum_naive.o \
+	csum_3dnow.o \
+	csum_ssemmxplus.o \
+	csumcpy.o
 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c
--- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c	Wed Dec 31 22:00:00 1969
+++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c	Sat Nov  2 11:51:40 2002
@@ -0,0 +1,216 @@
+#include 		// for get_pages
+#include 		// for access_ok in asm/checksum.h
+#include 		// for in6_addr in asm/checksum.h
+#include 		// for ntoh in asm/checksum.h
+#include 		// for X86_FEATURE_xx
+#include 		// for ntohX in asm/checksum.h
+#include 		// for NULL in asm/checksum.h
+#include 		// for asmlinkage in asm/checksum.h
+#include 
+
+#include 
+#include "bench_func.h"
+
+//#define dprintk(a...) printk(a)
+#define dprintk(a...) ((void)0)
+
+/* Features usable for mem optimization:
+	Intel
+X86_FEATURE_FPU		Onboard FPU
+X86_FEATURE_MMX		Multimedia Extensions
+X86_FEATURE_XMM		Streaming SIMD Extensions
+X86_FEATURE_XMM2	Streaming SIMD Extensions-2
+	AMD
+X86_FEATURE_3DNOW	3DNow!
+X86_FEATURE_MMXEXT	AMD MMX extensions
+X86_FEATURE_3DNOWEXT	AMD 3DNow! extensions
+	Cyrix
+X86_FEATURE_CXMMX	Cyrix MMX extensions
+*/
+
+typedef typeof(jiffies) jiffies_t;
+
+typedef void asm_helper(void);
+
+extern asm_helper csum_basic;
+extern asm_helper csum_naive;
+extern asm_helper csum_3dnow;
+extern asm_helper csum_ssemmxplus;
+
+static struct candidate csum_runner[] = {
+	{ "basic"  , csum_basic     , 1, { -1 } },
+	{ "simple" , csum_naive     , 1, { -1 } },
+	{ "3Dnow!"
, csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } }, + { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } }, + { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } }, +}; + +extern asm_helper csumcpy_basic; +extern asm_helper csumcpy_naive; +extern asm_helper csumcpy_ssemmxplus; +extern asm_helper csumcpy_sse; + +static struct candidate csumcpy_runner[] = { + { "basic" , csumcpy_basic , 2, { -1 } }, + { "simple" , csumcpy_naive , 2, { -1 } }, + /* higher weight: we prefer these for less cache pollution: */ + { "AND MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } }, + { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } }, + { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } }, +}; + +//====== TODO: split here: above: arch, below:generic + +/* set this to value bigger than cache(s) */ +/* TODO: heuristic for buffer size */ +#define bufshift 20 /* 10=1kb, 20=1MB etc */ +/* typical size of a packet */ +#define chunksz (4*1024) + +#define bufsz (1<f); + + max = 0; + // In practice these are pretty repeatable + // so 3 runs is an overkill + for(i=0; i<3; i++) { + int count = 0; + jiffies_t limit; + wait_for_jiffy(); + limit = jiffies+duration; + while(time_before(jiffies, limit)) { + int i; + mb(); + // interleaved to avoid bias due to prefetch + for(i=0; imax) + max = count; + } + + if(report) { + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, + kb_sec / 1000, kb_sec % 1000); + } + + return max; +} + +static int +bench_csumcpy(struct candidate *cand, char *buf) +{ + int err; + int i, max; + best_csumcpy = (asm_helper*)(cand->f); + + max = 0; + for(i=0; i<3; i++) { + int count = 0; + jiffies_t limit; + wait_for_jiffy(); + limit = jiffies+duration; + while(time_before(jiffies, limit)) { + int i; + mb(); + // interleaved to avoid bias due to prefetch + for(i=0; imax) + max = count; + } + + if(report) { + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, + kb_sec / 1000, kb_sec % 1000); + } + + return max; +} + +static int +find_best_csum(void) +{ + struct candidate *best; + char *buffer = (char *) __get_free_pages(GFP_KERNEL, + (bufshift-PAGE_SHIFT)); + + printk(KERN_INFO "Measuring network checksumming speed\n"); + if(!buffer) { + printk("csum: cannot allocate %i pages\n", + 1<<(bufshift-PAGE_SHIFT) + ); + return -ENOMEM; + } + dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT)); + + // find # of jiffies suitable for reliable results + // (at least %5 accuracy) + while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) { + duration<<=1; + } + dprintk("test run will last %i ticks\n", duration); + report = 1; + + best = find_best(bench_csum, buffer, csum_runner, + VECTOR_SZ(csum_runner)); + printk("csum: using csum function: %s\n", best->name); + best_csum = (asm_helper*)(best->f); + + best = find_best(bench_csumcpy, buffer, csumcpy_runner, + VECTOR_SZ(csumcpy_runner)); + printk("csum: using csum_copy function: %s\n", best->name); + best_csumcpy = (asm_helper*)(best->f); + + free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT)); + dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT)); + return 0; +} + +MODULE_LICENSE("GPL"); + +module_init(find_best_csum); diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c --- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002 @@ -0,0 +1,53 @@ +#include // for 
KERN_DEBUG + +#include // for test_bit +#include // cpu caps +#include // cpu features constants +#include "bench_func.h" + +//#define dprintk(a...) printk(a) +#define dprintk(a...) ((void)0) + +// 2.4 only, already in 2.5 +extern inline int +boot_cpu_has(int cap) +{ + return test_bit(cap, boot_cpu_data.x86_capability); +} + +extern inline int +cpu_supports(int *cap) +{ + while(*cap != -1) { + if(!boot_cpu_has(*cap)) { + dprintk("unsupported caps: %i\n", *cap); + return 0; + } + cap++; + } + return 1; +} + +/* +** Call all the candidates which can be run on this CPU, +** find the best +*/ +struct candidate* +find_best(bench_func *bench, char *opaque, struct candidate runner[], int count) +{ + int score, max = 0; + struct candidate *best = 0; + while(count--) { + if(!cpu_supports(runner->cpu_caps_needed)) { + printk("func %s skipped: not supported by CPU\n", runner->name); + } else { + score = bench(runner,opaque) * runner->weight; + if(max < score) { + max = score; + best = runner; + } + } + runner++; + } + return best; +} diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h --- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002 @@ -0,0 +1,16 @@ +#ifndef _BENCH_FUNC_H +#define _BENCH_FUNC_H + +struct candidate { + const char *name; + void *f; // pointer to func + int weight; + int cpu_caps_needed[4]; +}; + +typedef int bench_func(struct candidate *cand, char *opaque); + +struct candidate* find_best(bench_func *bench, char *opaque, + struct candidate runner[], int count); + +#endif diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S --- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002 +++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969 @@ -1,496 +0,0 @@ -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * IP/TCP/UDP checksumming routines - * - * Authors: Jorge Cwik, - * Arnt Gulbrandsen, - * Tom May, - * Pentium Pro/II routines: - * Alexander Kjeldaas - * Finn Arne Gangstad - * Lots of code moved from tcp.c and ip.c; see those files - * for more names. - * - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception - * handling. - * Andi Kleen, add zeroing on error - * converted to pure assembler - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include - -/* - * computes a partial checksum, e.g. for TCP/UDP fragments - */ - -/* -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) - */ - -.text -.align 4 -.globl csum_partial - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - - /* - * Experiments with Ethernet and SLIP connections show that buff - * is aligned on either a 2-byte or 4-byte boundary. We get at - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. - * Fortunately, it is easy to convert 2-byte alignment to 4-byte - * alignment for the unrolled loop. 
- */ -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: unsigned char *buff - testl $3, %esi # Check alignment. - jz 2f # Jump if alignment is ok. - testl $1, %esi # Check alignment. - jz 10f # Jump if alignment is boundary of 2bytes. - - # buf is odd - dec %ecx - jl 8f - movzbl (%esi), %ebx - adcl %ebx, %eax - roll $8, %eax - inc %esi - testl $2, %esi - jz 2f -10: - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -1: movw (%esi), %bx - addl $2, %esi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, %edx - shrl $5, %ecx - jz 2f - testl %esi, %esi -1: movl (%esi), %ebx - adcl %ebx, %eax - movl 4(%esi), %ebx - adcl %ebx, %eax - movl 8(%esi), %ebx - adcl %ebx, %eax - movl 12(%esi), %ebx - adcl %ebx, %eax - movl 16(%esi), %ebx - adcl %ebx, %eax - movl 20(%esi), %ebx - adcl %ebx, %eax - movl 24(%esi), %ebx - adcl %ebx, %eax - movl 28(%esi), %ebx - adcl %ebx, %eax - lea 32(%esi), %esi - dec %ecx - jne 1b - adcl $0, %eax -2: movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -3: adcl (%esi), %eax - lea 4(%esi), %esi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f - movw (%esi),%cx - leal 2(%esi),%esi - je 6f - shll $16,%ecx -5: movb (%esi),%cl -6: addl %ecx,%eax - adcl $0, %eax -7: - testl $1, 12(%esp) - jz 8f - roll $8, %eax -8: - popl %ebx - popl %esi - ret - -#else - -/* Version for PentiumII/PPro */ - -csum_partial: - pushl %esi - pushl %ebx - movl 20(%esp),%eax # Function arg: unsigned int sum - movl 16(%esp),%ecx # Function arg: int len - movl 12(%esp),%esi # Function arg: const unsigned char *buf - - testl $3, %esi - jnz 25f -10: - movl %ecx, %edx - movl %ecx, %ebx - andl $0x7c, %ebx - shrl $7, %ecx - addl %ebx,%esi - shrl $2, %ebx - negl %ebx - lea 45f(%ebx,%ebx,2), %ebx - testl %esi, %esi - jmp *%ebx - - # Handle 2-byte-aligned regions -20: addw (%esi), %ax - lea 2(%esi), %esi - adcl $0, %eax - jmp 10b -25: - testl $1, %esi - jz 30f - # buf is odd - dec %ecx - jl 90f - movzbl (%esi), %ebx - addl %ebx, %eax - adcl $0, %eax - roll $8, %eax - inc %esi - testl $2, %esi - jz 10b - -30: subl $2, %ecx - ja 20b - je 32f - addl $2, %ecx - jz 80f - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned - addl %ebx, %eax - adcl $0, %eax - jmp 80f -32: - addw (%esi), %ax # csumming 2 bytes, 2-aligned - adcl $0, %eax - jmp 80f - -40: - addl -128(%esi), %eax - adcl -124(%esi), %eax - adcl -120(%esi), %eax - adcl -116(%esi), %eax - adcl -112(%esi), %eax - adcl -108(%esi), %eax - adcl -104(%esi), %eax - adcl -100(%esi), %eax - adcl -96(%esi), %eax - adcl -92(%esi), %eax - adcl -88(%esi), %eax - adcl -84(%esi), %eax - adcl -80(%esi), %eax - adcl -76(%esi), %eax - adcl -72(%esi), %eax - adcl -68(%esi), %eax - adcl -64(%esi), %eax - adcl -60(%esi), %eax - adcl -56(%esi), %eax - adcl -52(%esi), %eax - adcl -48(%esi), %eax - adcl -44(%esi), %eax - adcl -40(%esi), %eax - adcl -36(%esi), %eax - adcl -32(%esi), %eax - adcl -28(%esi), %eax - adcl -24(%esi), %eax - adcl -20(%esi), %eax - adcl -16(%esi), %eax - adcl -12(%esi), %eax - adcl -8(%esi), %eax - adcl -4(%esi), %eax -45: - lea 128(%esi), %esi - adcl $0, %eax - dec %ecx - jge 40b - movl %edx, %ecx -50: andl $3, %ecx - jz 80f - - # Handle the last 1-3 bytes without jumping - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked - movl $0xffffff,%ebx # by the shll and shrl instructions 
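The tail trick being set up here (it continues with the shll/shrl just below, and is carried over into the new csum.S and csumcpy.S later in this patch) reads the whole last dword and masks off the bytes past the end of the buffer instead of branching on the tail length. A rough C rendering of the idea; little-endian is assumed, the 4-aligned tail dword cannot cross a page so the over-read is safe, and the helper names are made up for illustration:

#include <stdint.h>

/* keep only the n = len%4 (1..3) valid low-order bytes of the last
 * 32-bit word: n=1 -> 0x000000ff, n=2 -> 0x0000ffff, n=3 -> 0x00ffffff */
static uint32_t tail_mask(uint32_t n)
{
	return 0xffffffu >> (8 * (~n & 3));	/* ~n & 3: 1->2, 2->1, 3->0 */
}

/* add the 1..3 trailing bytes without byte loads or branches */
static uint32_t add_tail(uint32_t sum, const uint32_t *aligned_tail, uint32_t len)
{
	uint64_t acc = (uint64_t)sum + (*aligned_tail & tail_mask(len & 3));

	return (uint32_t)(acc + (acc >> 32));	/* like the adcl $0, %eax */
}
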
- shll $3,%ecx - shrl %cl,%ebx - andl -128(%esi),%ebx # esi is 4-aligned so should be ok - addl %ebx,%eax - adcl $0,%eax -80: - testl $1, 12(%esp) - jz 90f - roll $8, %eax -90: - popl %ebx - popl %esi - ret - -#endif - -/* -unsigned int csum_partial_copy_generic (const char *src, char *dst, - int len, int sum, int *src_err_ptr, int *dst_err_ptr) - */ - -/* - * Copy from ds while checksumming, otherwise like csum_partial - * - * The macros SRC and DST specify the type of access for the instruction. - * thus we can call a custom exception handler for all access types. - * - * FIXME: could someone double-check whether I haven't mixed up some SRC and - * DST definitions? It's damn hard to trigger all cases. I hope I got - * them all but there's no guarantee. - */ - -#define SRC(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6001f ; \ - .previous - -#define DST(y...) \ - 9999: y; \ - .section __ex_table, "a"; \ - .long 9999b, 6002f ; \ - .previous - -.align 4 -.globl csum_partial_copy_generic - -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM - -#define ARGBASE 16 -#define FP 12 - -csum_partial_copy_generic: - subl $4,%esp - pushl %edi - pushl %esi - pushl %ebx - movl ARGBASE+16(%esp),%eax # sum - movl ARGBASE+12(%esp),%ecx # len - movl ARGBASE+4(%esp),%esi # src - movl ARGBASE+8(%esp),%edi # dst - - testl $2, %edi # Check alignment. - jz 2f # Jump if alignment is ok. - subl $2, %ecx # Alignment uses up two bytes. - jae 1f # Jump if we had at least two bytes. - addl $2, %ecx # ecx was < 2. Deal with it. - jmp 4f -SRC(1: movw (%esi), %bx ) - addl $2, %esi -DST( movw %bx, (%edi) ) - addl $2, %edi - addw %bx, %ax - adcl $0, %eax -2: - movl %ecx, FP(%esp) - shrl $5, %ecx - jz 2f - testl %esi, %esi -SRC(1: movl (%esi), %ebx ) -SRC( movl 4(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - adcl %edx, %eax -DST( movl %edx, 4(%edi) ) - -SRC( movl 8(%esi), %ebx ) -SRC( movl 12(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 8(%edi) ) - adcl %edx, %eax -DST( movl %edx, 12(%edi) ) - -SRC( movl 16(%esi), %ebx ) -SRC( movl 20(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 16(%edi) ) - adcl %edx, %eax -DST( movl %edx, 20(%edi) ) - -SRC( movl 24(%esi), %ebx ) -SRC( movl 28(%esi), %edx ) - adcl %ebx, %eax -DST( movl %ebx, 24(%edi) ) - adcl %edx, %eax -DST( movl %edx, 28(%edi) ) - - lea 32(%esi), %esi - lea 32(%edi), %edi - dec %ecx - jne 1b - adcl $0, %eax -2: movl FP(%esp), %edx - movl %edx, %ecx - andl $0x1c, %edx - je 4f - shrl $2, %edx # This clears CF -SRC(3: movl (%esi), %ebx ) - adcl %ebx, %eax -DST( movl %ebx, (%edi) ) - lea 4(%esi), %esi - lea 4(%edi), %edi - dec %edx - jne 3b - adcl $0, %eax -4: andl $3, %ecx - jz 7f - cmpl $2, %ecx - jb 5f -SRC( movw (%esi), %cx ) - leal 2(%esi), %esi -DST( movw %cx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%ecx -SRC(5: movb (%esi), %cl ) -DST( movb %cl, (%edi) ) -6: addl %ecx, %eax - adcl $0, %eax -7: -5000: - -# Exception handler: -.section .fixup, "ax" - -6001: - movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - - # zero the complete destination - computing the rest - # is too much work - movl ARGBASE+8(%esp), %edi # dst - movl ARGBASE+12(%esp), %ecx # len - xorl %eax,%eax - rep ; stosb - - jmp 5000b - -6002: - movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT,(%ebx) - jmp 5000b - -.previous - - popl %ebx - popl %esi - popl %edi - popl %ecx # equivalent to addl $4,%esp - ret - -#else - -/* Version for PentiumII/PPro */ - -#define ROUND1(x) \ - SRC(movl x(%esi), %ebx ) ; \ - addl %ebx, %eax ; \ - DST(movl %ebx, 
x(%edi) ) ; - -#define ROUND(x) \ - SRC(movl x(%esi), %ebx ) ; \ - adcl %ebx, %eax ; \ - DST(movl %ebx, x(%edi) ) ; - -#define ARGBASE 12 - -csum_partial_copy_generic: - pushl %ebx - pushl %edi - pushl %esi - movl ARGBASE+4(%esp),%esi #src - movl ARGBASE+8(%esp),%edi #dst - movl ARGBASE+12(%esp),%ecx #len - movl ARGBASE+16(%esp),%eax #sum -# movl %ecx, %edx - movl %ecx, %ebx - movl %esi, %edx - shrl $6, %ecx - andl $0x3c, %ebx - negl %ebx - subl %ebx, %esi - subl %ebx, %edi - lea -1(%esi),%edx - andl $-32,%edx - lea 3f(%ebx,%ebx), %ebx - testl %esi, %esi - jmp *%ebx -1: addl $64,%esi - addl $64,%edi - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) -3: adcl $0,%eax - addl $64, %edx - dec %ecx - jge 1b -4: movl ARGBASE+12(%esp),%edx #len - andl $3, %edx - jz 7f - cmpl $2, %edx - jb 5f -SRC( movw (%esi), %dx ) - leal 2(%esi), %esi -DST( movw %dx, (%edi) ) - leal 2(%edi), %edi - je 6f - shll $16,%edx -5: -SRC( movb (%esi), %dl ) -DST( movb %dl, (%edi) ) -6: addl %edx, %eax - adcl $0, %eax -7: -.section .fixup, "ax" -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr - movl $-EFAULT, (%ebx) - # zero the complete destination (computing the rest is too much work) - movl ARGBASE+8(%esp),%edi # dst - movl ARGBASE+12(%esp),%ecx # len - xorl %eax,%eax - rep; stosb - jmp 7b -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr - movl $-EFAULT, (%ebx) - jmp 7b -.previous - - popl %esi - popl %edi - popl %ebx - ret - -#undef ROUND -#undef ROUND1 - -#endif diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S --- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002 @@ -0,0 +1,97 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error converted to pure assembler + * 2002-10-30 Denis Vlasenko + * boot-time benchmarking, 3Dnow/MMX+/SSE versions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** +** unsigned int csum_partial(const unsigned char * buff, +** int len, unsigned int sum) +*/ + +.text +.align 4 +.globl csum_partial + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp), %eax # arg: sum + movl 16(%esp), %ecx # arg: len + movl 12(%esp), %esi # arg: buf + + testl $3, %esi + jz 40f +20: + # not 4-aligned: analyze and align... + testl $1, %esi + jz 30f + + # unaligned start addr + decl %ecx + js 90f # sz==0, exit + movzbl (%esi), %ebx # eat one byte... + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax # NB: need to be undone at exit! 
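The roll $8 here, undone by the matching roll at exit, leans on a classical property of the one's-complement sum (RFC 1071 calls it byte-order independence): byte-swapping every 16-bit word of the input only byte-swaps the resulting sum, so data summed one byte out of phase can be fixed up with a rotate. A small user-space demonstration of that property; an illustrative aside, not code from this patch:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static uint16_t csum16(const uint16_t *w, int nwords)
{
	uint32_t sum = 0;

	while (nwords--)
		sum += *w++;
	while (sum >> 16)			/* end-around carry fold */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

static uint16_t swab16(uint16_t x)
{
	return (uint16_t)((x << 8) | (x >> 8));
}

int main(void)
{
	uint16_t a[64], b[64];
	int i;

	srand(1);
	for (i = 0; i < 64; i++) {
		a[i] = (uint16_t)rand();
		b[i] = swab16(a[i]);		/* the data as seen one byte "late" */
	}
	/* swapping the data bytes only swaps the bytes of the sum */
	assert(csum16(b, 64) == swab16(csum16(a, 64)));
	return 0;
}
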
+ incl %esi + testl $2, %esi + jz 40f +30: + # Note: 2-aligned, but not 4-aligned + cmpl $3, %ecx + jbe 60f + addw (%esi), %ax # eat 2 bytes + leal 2(%esi), %esi + adcl $0, %eax + subl $2, %ecx +40: + # esi is 4-aligned here, call block routine + movl $csum_basic, %ebx # known ok even for ecx==0 etc + cmpl $128, %ecx # use optimized routine + jb 50f # only for large blocks + movl best_csum, %ebx +50: call *%ebx +60: + # handle the last 0-3 bytes without much jumping + jecxz 80f + notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff, %ebx # by the shll and shrl instructions + shll $3, %ecx + shrl %cl, %ebx + andl (%esi), %ebx # esi is 4-aligned so should be ok + addl %ebx, %eax + adcl $0, %eax +80: + # undo csum rotation if start addr was odd + testl $1, 12(%esp) + jz 90f + roll $8, %eax +90: + popl %ebx + popl %esi + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S --- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002 @@ -0,0 +1,4 @@ +#define PREFETCH(a) prefetch a +#define NAME csum_3dnow + +#include "csum_pf.inc" diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S --- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002 @@ -0,0 +1,63 @@ +.text +.align 4 +.globl csum_basic + +/* Experiments with Ethernet and SLIP connections show that buff +** is aligned on either a 2-byte or 4-byte boundary. We get at +** least a twofold speedup on 486 and Pentium if it is 4-byte aligned. +** Fortunately, it is easy to convert 2-byte alignment to 4-byte +** alignment for the unrolled loop. 
+*/ +csum_basic: + movl %ecx, %ebx + movl %ecx, %edx + shrl $7, %ecx + andl $0x7c, %ebx + addl %ebx, %esi + shrl $2, %ebx + negl %ebx + leal 50f(%ebx,%ebx,2), %ebx + clc + jmp *%ebx +40: + leal 128(%esi), %esi + adcl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +50: + decl %ecx + jge 40b + + adcl $0, %eax + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S --- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002 @@ -0,0 +1,17 @@ +.text +.align 4 +.globl csum_naive + +csum_naive: + mov %ecx, %edx + shrl $2, %ecx + clc +1: + adcl (%esi), %eax + leal 4(%esi), %esi + loop 1b + + adcl $0, %eax + mov %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc --- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002 @@ -0,0 +1,95 @@ +//#define PREFETCH(a) prefetchnta a +//#define PREFETCH(a) prefetch a +//#define PREFETCH(a) + +// How much unrolling do you want? +//vda: 5 is best on Duron 650 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes + // NB: tweak unrolled loop too... +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** int csum_partial(const char *buff, int len, int sum) +*/ + +#define ITER_SZ (1<=16 +10: + PREFETCH((%esi)) # Prefetch _each_ cacheline + PREFETCH(32(%esi)) # Note! Athlons have 64 bytes long ones, but + PREFETCH(64(%esi)) # PIIIs only 32! This gives ~20% speedup + PREFETCH(64+32(%esi)) # for PIII + PREFETCH(128(%esi)) + PREFETCH(128+32(%esi)) + PREFETCH(192(%esi)) + PREFETCH(192+32(%esi)) + movl %ecx, %ebx + movl %ecx, %edx + andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration + shrl $ITER_BITS, %ecx # = iterations to make + addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration + shrl $2, %ebx # = dwords to handle + negl %ebx + lea 50f(%ebx,%ebx,2), %ebx # = 45f - 3*dwords_to_handle + clc + jmp *%ebx # here we go! + +40: + PREFETCH(256(%esi)) +41: + lea ITER_SZ(%esi), %esi # does NOT change CF! 
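Both csum_basic and this prefetching loop enter the unrolled body through a computed jump (the leal 50f(%ebx,%ebx,2) / jmp *%ebx above), so the first, partial iteration covers len modulo the unroll size and every later iteration is full. In C the same idea is Duff's device; a rough sketch, with a 64-bit accumulator standing in for the adcl carry chain, not the kernel code itself:

#include <stddef.h>
#include <stdint.h>

static uint32_t sum_dwords(const uint32_t *p, size_t ndwords, uint32_t sum)
{
	uint64_t acc = sum;
	size_t n;

	if (!ndwords)
		return sum;
	n = (ndwords + 7) / 8;			/* iterations; the first one is partial */
	switch (ndwords % 8) {			/* jump into the unrolled body */
	case 0: do {	acc += *p++;
	case 7:		acc += *p++;
	case 6:		acc += *p++;
	case 5:		acc += *p++;
	case 4:		acc += *p++;
	case 3:		acc += *p++;
	case 2:		acc += *p++;
	case 1:		acc += *p++;
		} while (--n);
	}
	while (acc >> 32)			/* fold carries, which adcl does on the fly */
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}
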
+/* + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax +*/ + addl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +50: + adcl $0, %eax + dec %ecx # does NOT change CF! + # We can do just "jge 40b" here, but we can be a bit clever... + # This little twist gives surprisingly noticeable benefits! + # Seen 11% increase on random 1K blocks on Duron 650 + js 60f + cmp $256/ITER_SZ, %ecx + jae 40b # need prefetch + jmp 41b # do not need it +60: + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S --- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002 @@ -0,0 +1,4 @@ +#define PREFETCH(a) prefetchnta a +#define NAME csum_ssemmxplus + +#include "csum_pf.inc" diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S --- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002 @@ -0,0 +1,178 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, + * Arnt Gulbrandsen, + * Tom May, + * Pentium Pro/II routines: + * Alexander Kjeldaas + * Finn Arne Gangstad + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error converted to pure assembler + * 2002-10-30 Denis Vlasenko + * boot-time benchmarking, 3Dnow/MMX+/SSE versions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +/* +** computes a partial checksum, e.g. for TCP/UDP fragments +** +** unsigned int csum_partial(const unsigned char * buff, +** int len, unsigned int sum) +*/ + +#ifdef __KERNEL__ +#define K(a...) a +#else +#define K(a...) +#endif + +#define SRC(y...) \ +9999: y ;\ + .section __ex_table, "a";\ + .long 9999b, 6001f ;\ + .previous + +#define DST(y...) 
\ +9999: y ;\ + .section __ex_table, "a";\ + .long 9999b, 6002f ;\ + .previous + +#define KERNEL_FPU_BEGIN \ + call kernel_fpu_begin + +#define KERNEL_FPU_END(r) \ +K( movl %cr0, r ;)\ +K( orl $8, r ;)\ +K( movl r, %cr0 ;) + +.text + +#include "csumcpy_naive.inc" +#include "csumcpy_basic.inc" +#include "csumcpy_ssemmxplus.inc" +#include "csumcpy_sse.inc" + +.align 4 +.globl csum_partial_copy_generic + +csum_partial_copy_generic: + pushl %ebx + pushl %edi + pushl %esi + pushl %ebp + movl %esp, %ebp + +#define STK_DERR 40(%ebp) +#define STK_SERR 36(%ebp) +#define STK_SUM 32(%ebp) +#define STK_LEN 28(%ebp) +#define STK_DST 24(%ebp) +#define STK_SRC 20(%ebp) +#define STK_EIP 16(%ebp) +#define STK_EBX 12(%ebp) +#define STK_EDI 8(%ebp) +#define STK_ESI 4(%ebp) +#define STK_EBP (%ebp) + + movl STK_SRC, %esi #src + movl STK_DST, %edi #dst + movl STK_LEN, %ecx #len + movl STK_SUM, %eax #sum + + testl $3, %edi # Check dst alignment + jz 40f + + # not 4-aligned: analyze and align... + testl $1, %edi + jz 30f + + # unaligned start addr + decl %ecx + js 90f # sz==0, exit + movzbl (%esi), %ebx # eat one byte... + movb %bl, (%edi) + addl %ebx, %eax + adcl $0, %eax + roll $8, %eax # NB: need to be undone at exit! + incl %esi + incl %edi + testl $2, %edi + jz 40f +30: + # xxx 2-aligned, but not 4-aligned + cmpl $3, %ecx + jbe 60f + movw (%esi), %bx # eat 2 bytes + addw %bx, %ax + movw %bx, (%edi) + adcl $0, %eax + leal 2(%esi), %esi + leal 2(%edi), %edi + subl $2, %ecx +40: + # edi is 4-aligned now: call block routine + movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc + cmpl $128, %ecx # use optimized routine + jb 50f # only for large blocks + movl best_csumcpy, %ebx +50: call *%ebx +60: + # handle last 0-3 bytes + jecxz 80f + cmpl $2, %ecx + jb 70f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 75f + shll $16, %ecx +70: +SRC( movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +75: addl %ecx, %eax + adcl $0, %eax +80: + # undo csum rotation if dst was unaligned + testl $1, STK_DST + jz 90f + roll $8, %eax +90: + movl %esp, %ebp + popl %ebp + popl %esi + popl %edi + popl %ebx + ret + + +.section .fixup, "ax" +6001: movl STK_SERR, %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl STK_DST, %edi # dst + movl STK_LEN, %ecx # len + xorl %eax, %eax + cld + rep; stosb + jmp 90b +6002: movl STK_DERR, %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 90b +.previous diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002 @@ -0,0 +1,40 @@ +// Please somebody experiment with unroll length +// on a PII. Do _not_ optimize for PIII/Athlons/etc, +// they won't typically use this... 
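All the csumcpy_* routines below implement the same one-pass copy-with-checksum idea: each word is pulled through the CPU once and feeds both the store and the running sum. A rough user-space C equivalent that skips the SRC()/DST() fault fixups and the alignment games; the 32-bit partial it returns may differ bit-for-bit from the assembler's, but it folds to the same 16-bit checksum:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t copy_and_csum(void *dst, const void *src, size_t len, uint32_t sum)
{
	const unsigned char *s = src;
	unsigned char *d = dst;
	uint64_t acc = sum;
	uint32_t w;

	while (len >= 4) {			/* one load feeds both the store and the sum */
		memcpy(&w, s, 4);
		memcpy(d, &w, 4);
		acc += w;
		s += 4;
		d += 4;
		len -= 4;
	}
	if (len) {				/* 1..3 trailing bytes, zero-padded */
		w = 0;
		memcpy(&w, s, len);
		memcpy(d, s, len);
		acc += w;
	}
	while (acc >> 32)
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}
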
+ +.align 4 +.globl csumcpy_basic + +csumcpy_basic: + movl %ecx, %ebx + movl %ecx, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + leal 50f(%ebx,%ebx), %ebx + clc + jmp *%ebx +40: + leal 64(%esi), %esi + leal 64(%edi), %edi + +#undef ROUND +#define ROUND(x) \ +SRC( movl x(%esi), %ebx ); \ + adcl %ebx, %eax ; \ +DST( movl %ebx, x(%edi) ); + + ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4) +50: + decl %ecx + jge 40b + + adcl $0, %eax + movl %edx, %ecx + andl $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002 @@ -0,0 +1,21 @@ +// Heh... at least it's small ;) + +.align 4 +.globl csumcpy_naive + +csumcpy_naive: + mov %ecx, %edx + shrl $2, %ecx + clc +1: +SRC( movl (%esi), %ebx ) +DST( movl %ebx, (%edi) ) + adcl %ebx, %eax + leal 4(%esi), %esi + leal 4(%edi), %edi + loop 1b + + adcl $0, %eax + mov %edx, %ecx + and $3, %ecx + ret diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002 @@ -0,0 +1,147 @@ +// Huge routine, I don't like it's size and number +// of fixups... think of that when you want +// to unroll loop more +// TODO: benchmark and reduce size +// I won't stand 1K behemot just for 5% speedup + +#undef PREFETCH +#define PREFETCH(a) prefetchnta a + +// How much unrolling do you want? +// vda: celeron 1200: 5 with movaps, 4 with movups +#undef ITER_BITS +#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes + // NB: tweak unrolled loop too... 
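ITER_BITS here was tuned by hand (5 with movaps, 4 with movups on a Celeron 1200, per the comment above). The in-kernel benchmark already times whole routines at boot; if someone wants to pre-screen unroll factors or loop variants in user space first, a harness along these lines is enough. The function type, buffer sizes and the use of clock_gettime() are assumptions for illustration, not part of the patch:

#include <stdint.h>
#include <time.h>

#define BUF_SZ	(1 << 20)		/* bigger than the caches, as in bench_csum.c */
#define CHUNK	4096			/* roughly packet-sized blocks */

typedef uint32_t csum_fn(const unsigned char *buf, int len, uint32_t sum);

/* time one candidate: checksum the buffer chunk by chunk for ~half a second */
static double mb_per_sec(csum_fn *f, unsigned char *buf)
{
	struct timespec t0, t1;
	volatile uint32_t sink = 0;	/* keep the calls from being optimized away */
	double secs, bytes = 0;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	do {
		int off;

		for (off = 0; off + CHUNK <= BUF_SZ; off += CHUNK)
			sink += f(buf + off, CHUNK, 0);
		bytes += BUF_SZ;
		clock_gettime(CLOCK_MONOTONIC, &t1);
		secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	} while (secs < 0.5);
	(void)sink;
	return bytes / secs / 1e6;
}
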
+ +#undef ITER_SZ +#undef ITER_MSK +#define ITER_SZ (1<6 don't mix + adcl $0, %eax + jmp 19f +15: # esi is 16-aligned + PREFETCH(256(%esi)) + ROUND0(a,%xmm0) + ROUND(a,16,%xmm1) + PREFETCH(256+32(%esi)) + ROUND(a,32,%xmm0) + ROUND(a,48,%xmm1) + lea ITER_SZ(%esi), %esi + lea ITER_SZ(%edi), %edi + //dec %ecx + //jnz 15b + loop 15b // Beware: loop and ITER_BITS>6 don't mix + adcl $0, %eax +19: + sfence # clean up XMM + //KERNEL_FPU_END(%ebx) + movups (%esp), %xmm0 + movups 16(%esp), %xmm1 + addl $32, %esp +K( movl %ebx, %cr0 ) + +20: + # loop for dwords + movl %edx, %ecx + andl $ITER_MSK, %edx + jz 40f + shrl $2, %edx # this also clears CF +30: +SRC( movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jnz 30b + adcl $0, %eax +40: + # last 1, 2 or 3 bytes: handled by caller + andl $3, %ecx + ret + + +# xxx 16-align edi and get back +5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here + mov %ecx, %edx # edx needed at 20: + jb 20b # not worthy: too short + +5520: test $0xe, %edi # loop until we are 16-aligned + jz 1b +SRC( movl (%esi), %ebx ) + addl $4, %esi +DST( movl %ebx, (%edi) ) + addl $4, %edi + addl %ebx, %eax + adcl $0, %eax + subl $4, %ecx + jmp 5520b diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002 @@ -0,0 +1,103 @@ +#undef PREFETCH +#define PREFETCH(a) prefetchnta a + +// How much unrolling do you want? +#undef ITER_BITS +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes + // NB: tweak unrolled loop too... + +#undef ITER_SZ +#undef ITER_MSK +#define ITER_SZ (1<5 don't mix + adcl $0, %eax + + sfence + //KERNEL_FPU_END(%ebx) + frstor (%esp) + addl $108, %esp +K( movl %ebx, %cr0 ) + +20: + # loop for dwords + movl %edx, %ecx + andl $ITER_MSK, %edx + jz 40f + shrl $2, %edx # this also clears CF +30: +SRC( movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jnz 30b + adcl $0, %eax + +40: andl $3, %ecx + ret
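
For anyone adding further variants: each routine must return a 32-bit partial sum that folds to the same 16-bit one's-complement checksum as a trivial reference, for any length and alignment. Below is a user-space reference (RFC 1071 style) that a new candidate can be checked against before it is wired into the candidate tables; csum_partial_new is a placeholder name and the whole snippet is a sketch, not kernel code:

#include <stddef.h>
#include <stdint.h>

/* fold a 32-bit partial sum down to the final 16-bit checksum value */
static uint16_t fold(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* byte-at-a-time reference for csum_partial(), little-endian host assumed */
static uint32_t csum_ref(const unsigned char *buf, size_t len, uint32_t sum)
{
	uint64_t acc = sum;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		acc += buf[i] | (buf[i + 1] << 8);
	if (len & 1)
		acc += buf[len - 1];		/* odd tail byte, zero-padded */
	while (acc >> 32)
		acc = (acc & 0xffffffffu) + (acc >> 32);
	return (uint32_t)acc;
}

/* usage: assert(fold(csum_ref(buf, len, 0)) == fold(csum_partial_new(buf, len, 0))); */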