1 New csum functions optimized for different processors.
2 Author: Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua>
4 diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile
5 --- linux-2.4.20-pre11/arch/i386/lib/Makefile Mon Sep 10 12:31:30 2001
6 +++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile Fri Nov 1 23:55:58 2002
11 -obj-y = checksum.o old-checksum.o delay.o \
12 +obj-y = old-checksum.o delay.o \
13 usercopy.o getuser.o \
25 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
26 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
27 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c
28 --- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c Wed Dec 31 22:00:00 1969
29 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c Sat Nov 2 11:51:40 2002
31 +#include <linux/mm.h>			// for __get_free_pages
32 +#include <asm/uaccess.h> // for access_ok in asm/checksum.h
33 +#include <linux/in6.h> // for in6_addr in asm/checksum.h
34 +#include <asm/byteorder.h> // for ntoh in asm/checksum.h
35 +#include <asm/cpufeature.h> // for X86_FEATURE_xx
36 +#include <linux/byteorder/generic.h> // for ntohX in asm/checksum.h
37 +#include <linux/stddef.h> // for NULL in asm/checksum.h
38 +#include <linux/linkage.h> // for asmlinkage in asm/checksum.h
39 +#include <linux/module.h>
41 +#include <asm/checksum.h>
42 +#include "bench_func.h"
44 +//#define dprintk(a...) printk(a)
45 +#define dprintk(a...) ((void)0)
47 +/* Features usable for mem optimization:
49 +X86_FEATURE_FPU Onboard FPU
50 +X86_FEATURE_MMX Multimedia Extensions
51 +X86_FEATURE_XMM Streaming SIMD Extensions
52 +X86_FEATURE_XMM2 Streaming SIMD Extensions-2
54 +X86_FEATURE_3DNOW 3DNow!
55 +X86_FEATURE_MMXEXT AMD MMX extensions
56 +X86_FEATURE_3DNOWEXT AMD 3DNow! extensions
58 +X86_FEATURE_CXMMX Cyrix MMX extensions
61 +typedef typeof(jiffies) jiffies_t;
63 +typedef void asm_helper(void);
65 +extern asm_helper csum_basic;
66 +extern asm_helper csum_naive;
67 +extern asm_helper csum_3dnow;
68 +extern asm_helper csum_ssemmxplus;
70 +static struct candidate csum_runner[] = {
71 + { "basic" , csum_basic , 1, { -1 } },
72 + { "simple" , csum_naive , 1, { -1 } },
73 + { "3Dnow!" , csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } },
74 + { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } },
75 + { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } },
78 +extern asm_helper csumcpy_basic;
79 +extern asm_helper csumcpy_naive;
80 +extern asm_helper csumcpy_ssemmxplus;
81 +extern asm_helper csumcpy_sse;
83 +static struct candidate csumcpy_runner[] = {
84 + { "basic" , csumcpy_basic , 2, { -1 } },
85 + { "simple" , csumcpy_naive , 2, { -1 } },
86 + /* higher weight: we prefer these for less cache pollution: */
87 + { "AND MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } },
88 + { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } },
89 + { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } },
92 +//====== TODO: split here: above: arch, below: generic
94 +/* set this to value bigger than cache(s) */
95 +/* TODO: heuristic for buffer size */
96 +#define bufshift 20 /* 10=1KB, 20=1MB, etc. */
97 +/* typical size of a packet */
98 +#define chunksz (4*1024)
100 +#define bufsz (1<<bufshift)
101 +#define chunkcnt (bufsz/chunksz)
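With the values above this works out to bufsz = 1 MB and chunkcnt = 256, i.e. one benchmark pass checksums 256 distinct 4 KB chunks; 1 MB is comfortably bigger than the caches of the CPUs this targets, so the loop measures memory-bound throughput, not cache-warm speed.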
103 +#define VECTOR_SZ(a) (sizeof(a)/sizeof((a)[0]))
105 +asm_helper *best_csum = csum_basic;
106 +asm_helper *best_csumcpy = csumcpy_basic;
109 +** Count the number of iterations done during a fixed period,
110 +** and use this to calculate throughput.
113 +static int duration = 1; // jiffies for each run
117 +wait_for_jiffy(void) {
118 + jiffies_t now = jiffies;
119 + while(now == jiffies) cpu_relax();
123 +bench_csum(struct candidate *cand, char *buf)
126 + best_csum = (asm_helper*)(cand->f);
129 +	// In practice these are pretty repeatable,
130 +	// so 3 runs is overkill
131 + for(i=0; i<3; i++) {
135 + limit = jiffies+duration;
136 + while(time_before(jiffies, limit)) {
139 + // interleaved to avoid bias due to prefetch
140 + for(i=0; i<chunkcnt; i+=2)
141 + csum_partial(buf+i*chunksz, chunksz, 0);
142 + for(i=1; i<chunkcnt; i+=2)
143 + csum_partial(buf+i*chunksz, chunksz, 0);
148 + dprintk(" count =%6i\n",count);
154 + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
155 + printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
156 + kb_sec / 1000, kb_sec % 1000);
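Worked example of the formula above: with HZ = 100, duration = 1 jiffy and the 1 MB buffer, kb_sec = max * 1024 * 100, so a best count of 25 passes prints as 2560.000 MB/sec. Note the mixed units: the KB here is 1024 bytes, while the printed "MB" is 1000 KB.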
163 +bench_csumcpy(struct candidate *cand, char *buf)
167 + best_csumcpy = (asm_helper*)(cand->f);
170 + for(i=0; i<3; i++) {
174 + limit = jiffies+duration;
175 + while(time_before(jiffies, limit)) {
178 + // interleaved to avoid bias due to prefetch
179 + for(i=0; i<chunkcnt; i+=2)
180 + csum_partial_copy_generic(buf+i*chunksz,
181 + buf+(chunkcnt-1-i)*chunksz,
182 + chunksz, 0, &err, &err);
183 + for(i=1; i<chunkcnt; i+=2)
184 + csum_partial_copy_generic(buf+i*chunksz,
185 + buf+(chunkcnt-1-i)*chunksz,
186 + chunksz, 0, &err, &err);
191 + dprintk(" count =%6i\n",count);
197 + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
198 + printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
199 + kb_sec / 1000, kb_sec % 1000);
206 +find_best_csum(void)
208 + struct candidate *best;
209 + char *buffer = (char *) __get_free_pages(GFP_KERNEL,
210 + (bufshift-PAGE_SHIFT));
212 + printk(KERN_INFO "Measuring network checksumming speed\n");
214 + printk("csum: cannot allocate %i pages\n",
215 + 1<<(bufshift-PAGE_SHIFT)
219 + dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT));
221 + // find # of jiffies suitable for reliable results
222 +	// (at least 5% accuracy: a count of 20+ keeps the +/-1 iteration error under 1/20)
223 + while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) {
226 + dprintk("test run will last %i ticks\n", duration);
229 + best = find_best(bench_csum, buffer, csum_runner,
230 + VECTOR_SZ(csum_runner));
231 + printk("csum: using csum function: %s\n", best->name);
232 + best_csum = (asm_helper*)(best->f);
234 + best = find_best(bench_csumcpy, buffer, csumcpy_runner,
235 + VECTOR_SZ(csumcpy_runner));
236 + printk("csum: using csum_copy function: %s\n", best->name);
237 + best_csumcpy = (asm_helper*)(best->f);
239 + free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT));
240 + dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT));
244 +MODULE_LICENSE("GPL");
246 +module_init(find_best_csum);
247 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c
248 --- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969
249 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002
251 +#include <linux/kernel.h> // for KERN_DEBUG
253 +#include <asm/bitops.h> // for test_bit
254 +#include <asm/processor.h> // cpu caps
255 +#include <asm/cpufeature.h> // cpu features constants
256 +#include "bench_func.h"
258 +//#define dprintk(a...) printk(a)
259 +#define dprintk(a...) ((void)0)
261 +// 2.4 only, already in 2.5
263 +boot_cpu_has(int cap)
265 + return test_bit(cap, boot_cpu_data.x86_capability);
269 +cpu_supports(int *cap)
271 + while(*cap != -1) {
272 + if(!boot_cpu_has(*cap)) {
273 + dprintk("unsupported caps: %i\n", *cap);
282 +** Call all the candidates which can be run on this CPU,
286 +find_best(bench_func *bench, char *opaque, struct candidate runner[], int count)
288 + int score, max = 0;
289 + struct candidate *best = 0;
291 + if(!cpu_supports(runner->cpu_caps_needed)) {
292 + printk("func %s skipped: not supported by CPU\n", runner->name);
294 + score = bench(runner,opaque) * runner->weight;
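Most of the loop body is elided by the hunk above; roughly, find_best() amounts to the sketch below (an illustration, not the exact patch code):

	/* Sketch only: bench each candidate the CPU supports and
	 * keep the one with the highest weighted score. */
	struct candidate *find_best(bench_func *bench, char *opaque,
				    struct candidate runner[], int count)
	{
		int score, max = 0;
		struct candidate *best = NULL;
		int i;

		for (i = 0; i < count; i++, runner++) {
			if (!cpu_supports(runner->cpu_caps_needed)) {
				printk("func %s skipped: not supported by CPU\n",
					runner->name);
				continue;
			}
			/* weight biases the choice, e.g. toward routines
			 * that pollute the cache less */
			score = bench(runner, opaque) * runner->weight;
			if (score > max) {
				max = score;
				best = runner;
			}
		}
		return best;
	}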
304 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h
305 --- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969
306 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002
308 +#ifndef _BENCH_FUNC_H
309 +#define _BENCH_FUNC_H
313 + void *f; // pointer to func
315 + int cpu_caps_needed[4];
318 +typedef int bench_func(struct candidate *cand, char *opaque);
320 +struct candidate* find_best(bench_func *bench, char *opaque,
321 + struct candidate runner[], int count);
324 diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S
325 --- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002
326 +++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969
329 - * INET An implementation of the TCP/IP protocol suite for the LINUX
330 - * operating system. INET is implemented using the BSD Socket
331 - * interface as the means of communication with the user level.
333 - * IP/TCP/UDP checksumming routines
335 - * Authors: Jorge Cwik, <jorge@laser.satlink.net>
336 - * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
337 - * Tom May, <ftom@netcom.com>
338 - * Pentium Pro/II routines:
339 - * Alexander Kjeldaas <astor@guardian.no>
340 - * Finn Arne Gangstad <finnag@guardian.no>
341 - * Lots of code moved from tcp.c and ip.c; see those files
344 - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
346 - * Andi Kleen, add zeroing on error
347 - * converted to pure assembler
349 - * This program is free software; you can redistribute it and/or
350 - * modify it under the terms of the GNU General Public License
351 - * as published by the Free Software Foundation; either version
352 - * 2 of the License, or (at your option) any later version.
355 -#include <linux/config.h>
356 -#include <asm/errno.h>
359 - * computes a partial checksum, e.g. for TCP/UDP fragments
363 -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
370 -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
373 - * Experiments with Ethernet and SLIP connections show that buff
374 - * is aligned on either a 2-byte or 4-byte boundary. We get at
375 - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
376 - * Fortunately, it is easy to convert 2-byte alignment to 4-byte
377 - * alignment for the unrolled loop.
382 - movl 20(%esp),%eax # Function arg: unsigned int sum
383 - movl 16(%esp),%ecx # Function arg: int len
384 - movl 12(%esp),%esi # Function arg: unsigned char *buff
385 - testl $3, %esi # Check alignment.
386 - jz 2f # Jump if alignment is ok.
387 - testl $1, %esi # Check alignment.
388 - jz 10f # Jump if alignment is boundary of 2bytes.
393 - movzbl (%esi), %ebx
400 - subl $2, %ecx # Alignment uses up two bytes.
401 - jae 1f # Jump if we had at least two bytes.
402 - addl $2, %ecx # ecx was < 2. Deal with it.
413 -1: movl (%esi), %ebx
419 - movl 12(%esi), %ebx
421 - movl 16(%esi), %ebx
423 - movl 20(%esi), %ebx
425 - movl 24(%esi), %ebx
427 - movl 28(%esi), %ebx
436 - shrl $2, %edx # This clears CF
437 -3: adcl (%esi), %eax
464 -/* Version for PentiumII/PPro */
469 - movl 20(%esp),%eax # Function arg: unsigned int sum
470 - movl 16(%esp),%ecx # Function arg: int len
471 - movl 12(%esp),%esi # Function arg: const unsigned char *buf
483 - lea 45f(%ebx,%ebx,2), %ebx
487 - # Handle 2-byte-aligned regions
488 -20: addw (%esi), %ax
498 - movzbl (%esi), %ebx
511 - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
516 - addw (%esi), %ax # csumming 2 bytes, 2-aligned
521 - addl -128(%esi), %eax
522 - adcl -124(%esi), %eax
523 - adcl -120(%esi), %eax
524 - adcl -116(%esi), %eax
525 - adcl -112(%esi), %eax
526 - adcl -108(%esi), %eax
527 - adcl -104(%esi), %eax
528 - adcl -100(%esi), %eax
529 - adcl -96(%esi), %eax
530 - adcl -92(%esi), %eax
531 - adcl -88(%esi), %eax
532 - adcl -84(%esi), %eax
533 - adcl -80(%esi), %eax
534 - adcl -76(%esi), %eax
535 - adcl -72(%esi), %eax
536 - adcl -68(%esi), %eax
537 - adcl -64(%esi), %eax
538 - adcl -60(%esi), %eax
539 - adcl -56(%esi), %eax
540 - adcl -52(%esi), %eax
541 - adcl -48(%esi), %eax
542 - adcl -44(%esi), %eax
543 - adcl -40(%esi), %eax
544 - adcl -36(%esi), %eax
545 - adcl -32(%esi), %eax
546 - adcl -28(%esi), %eax
547 - adcl -24(%esi), %eax
548 - adcl -20(%esi), %eax
549 - adcl -16(%esi), %eax
550 - adcl -12(%esi), %eax
551 - adcl -8(%esi), %eax
552 - adcl -4(%esi), %eax
554 - lea 128(%esi), %esi
562 - # Handle the last 1-3 bytes without jumping
563 - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
564 - movl $0xffffff,%ebx # by the shll and shrl instructions
567 - andl -128(%esi),%ebx # esi is 4-aligned so should be ok
582 -unsigned int csum_partial_copy_generic (const char *src, char *dst,
583 - int len, int sum, int *src_err_ptr, int *dst_err_ptr)
587 - * Copy from ds while checksumming, otherwise like csum_partial
589 - * The macros SRC and DST specify the type of access for the instruction.
590 - * thus we can call a custom exception handler for all access types.
592 - * FIXME: could someone double-check whether I haven't mixed up some SRC and
593 - * DST definitions? It's damn hard to trigger all cases. I hope I got
594 - * them all but there's no guarantee.
599 - .section __ex_table, "a"; \
600 - .long 9999b, 6001f ; \
605 - .section __ex_table, "a"; \
606 - .long 9999b, 6002f ; \
610 -.globl csum_partial_copy_generic
612 -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
617 -csum_partial_copy_generic:
622 - movl ARGBASE+16(%esp),%eax # sum
623 - movl ARGBASE+12(%esp),%ecx # len
624 - movl ARGBASE+4(%esp),%esi # src
625 - movl ARGBASE+8(%esp),%edi # dst
627 - testl $2, %edi # Check alignment.
628 - jz 2f # Jump if alignment is ok.
629 - subl $2, %ecx # Alignment uses up two bytes.
630 - jae 1f # Jump if we had at least two bytes.
631 - addl $2, %ecx # ecx was < 2. Deal with it.
633 -SRC(1: movw (%esi), %bx )
635 -DST( movw %bx, (%edi) )
640 - movl %ecx, FP(%esp)
644 -SRC(1: movl (%esi), %ebx )
645 -SRC( movl 4(%esi), %edx )
647 -DST( movl %ebx, (%edi) )
649 -DST( movl %edx, 4(%edi) )
651 -SRC( movl 8(%esi), %ebx )
652 -SRC( movl 12(%esi), %edx )
654 -DST( movl %ebx, 8(%edi) )
656 -DST( movl %edx, 12(%edi) )
658 -SRC( movl 16(%esi), %ebx )
659 -SRC( movl 20(%esi), %edx )
661 -DST( movl %ebx, 16(%edi) )
663 -DST( movl %edx, 20(%edi) )
665 -SRC( movl 24(%esi), %ebx )
666 -SRC( movl 28(%esi), %edx )
668 -DST( movl %ebx, 24(%edi) )
670 -DST( movl %edx, 28(%edi) )
677 -2: movl FP(%esp), %edx
681 - shrl $2, %edx # This clears CF
682 -SRC(3: movl (%esi), %ebx )
684 -DST( movl %ebx, (%edi) )
694 -SRC( movw (%esi), %cx )
696 -DST( movw %cx, (%edi) )
700 -SRC(5: movb (%esi), %cl )
701 -DST( movb %cl, (%edi) )
707 -# Exception handler:
708 -.section .fixup, "ax"
711 - movl ARGBASE+20(%esp), %ebx # src_err_ptr
712 - movl $-EFAULT, (%ebx)
714 - # zero the complete destination - computing the rest
716 - movl ARGBASE+8(%esp), %edi # dst
717 - movl ARGBASE+12(%esp), %ecx # len
724 - movl ARGBASE+24(%esp), %ebx # dst_err_ptr
725 - movl $-EFAULT,(%ebx)
733 - popl %ecx # equivalent to addl $4,%esp
738 -/* Version for PentiumII/PPro */
741 - SRC(movl x(%esi), %ebx ) ; \
742 - addl %ebx, %eax ; \
743 - DST(movl %ebx, x(%edi) ) ;
746 - SRC(movl x(%esi), %ebx ) ; \
747 - adcl %ebx, %eax ; \
748 - DST(movl %ebx, x(%edi) ) ;
752 -csum_partial_copy_generic:
756 - movl ARGBASE+4(%esp),%esi #src
757 - movl ARGBASE+8(%esp),%edi #dst
758 - movl ARGBASE+12(%esp),%ecx #len
759 - movl ARGBASE+16(%esp),%eax #sum
770 - lea 3f(%ebx,%ebx), %ebx
775 - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
776 - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
777 - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
778 - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
779 - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
784 -4: movl ARGBASE+12(%esp),%edx #len
789 -SRC( movw (%esi), %dx )
791 -DST( movw %dx, (%edi) )
796 -SRC( movb (%esi), %dl )
797 -DST( movb %dl, (%edi) )
801 -.section .fixup, "ax"
802 -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
803 - movl $-EFAULT, (%ebx)
804 - # zero the complete destination (computing the rest is too much work)
805 - movl ARGBASE+8(%esp),%edi # dst
806 - movl ARGBASE+12(%esp),%ecx # len
810 -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
811 - movl $-EFAULT, (%ebx)
824 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S
825 --- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969
826 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002
829 + * INET An implementation of the TCP/IP protocol suite for the LINUX
830 + * operating system. INET is implemented using the BSD Socket
831 + * interface as the means of communication with the user level.
833 + * IP/TCP/UDP checksumming routines
835 + * Authors: Jorge Cwik, <jorge@laser.satlink.net>
836 + * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
837 + * Tom May, <ftom@netcom.com>
838 + * Pentium Pro/II routines:
839 + * Alexander Kjeldaas <astor@guardian.no>
840 + * Finn Arne Gangstad <finnag@guardian.no>
841 + * Lots of code moved from tcp.c and ip.c; see those files
844 + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
846 + *		Andi Kleen, add zeroing on error, converted to pure assembler
847 + * 2002-10-30 Denis Vlasenko
848 + * boot-time benchmarking, 3Dnow/MMX+/SSE versions
850 + * This program is free software; you can redistribute it and/or
851 + * modify it under the terms of the GNU General Public License
852 + * as published by the Free Software Foundation; either version
853 + * 2 of the License, or (at your option) any later version.
857 +** computes a partial checksum, e.g. for TCP/UDP fragments
859 +** unsigned int csum_partial(const unsigned char * buff,
860 +** int len, unsigned int sum)
870 + movl 20(%esp), %eax # arg: sum
871 + movl 16(%esp), %ecx # arg: len
872 + movl 12(%esp), %esi # arg: buf
877 + # not 4-aligned: analyze and align...
881 + # unaligned start addr
883 + js 90f # sz==0, exit
884 + movzbl (%esi), %ebx # eat one byte...
887 + roll $8, %eax # NB: need to be undone at exit!
892 + # Note: 2-aligned, but not 4-aligned
895 + addw (%esi), %ax # eat 2 bytes
900 + # esi is 4-aligned here, call block routine
901 + movl $csum_basic, %ebx # known ok even for ecx==0 etc
902 + cmpl $128, %ecx # use optimized routine
903 + jb 50f # only for large blocks
904 + movl best_csum, %ebx
907 + # handle the last 0-3 bytes without much jumping
909 + notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked
910 + movl $0xffffff, %ebx # by the shll and shrl instructions
913 + andl (%esi), %ebx # esi is 4-aligned so should be ok
917 + # undo csum rotation if start addr was odd
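The notl trick above folds the last 0-3 bytes into the sum without a single branch. In C, the mask it computes looks roughly like this (a model of the notl/shll/shrl sequence; the shift count is taken mod 32, which is what "higher bits are masked" refers to):

	/* r = len & 3 trailing bytes -> mask 0, 0xff, 0xffff or 0xffffff */
	static unsigned int tail_mask(unsigned int r)
	{
		unsigned int shift = (~r << 3) & 31;	/* r=0,1,2,3 -> 24,16,8,0 */
		return 0xffffffu >> shift;	/* little-endian: keeps the low bytes */
	}

The masked dword is then added with a final carry fold, so even r == 0 runs straight through the same code path.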
925 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S
926 --- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969
927 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002
929 +#define PREFETCH(a) prefetch a
930 +#define NAME csum_3dnow
932 +#include "csum_pf.inc"
933 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S
934 --- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969
935 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002
941 +/* Experiments with Ethernet and SLIP connections show that buff
942 +** is aligned on either a 2-byte or 4-byte boundary. We get at
943 +** least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
944 +** Fortunately, it is easy to convert 2-byte alignment to 4-byte
945 +** alignment for the unrolled loop.
955 + leal 50f(%ebx,%ebx,2), %ebx
959 + leal 128(%esi), %esi
960 + adcl -128(%esi), %eax
961 + adcl -124(%esi), %eax
962 + adcl -120(%esi), %eax
963 + adcl -116(%esi), %eax
964 + adcl -112(%esi), %eax
965 + adcl -108(%esi), %eax
966 + adcl -104(%esi), %eax
967 + adcl -100(%esi), %eax
968 + adcl -96(%esi), %eax
969 + adcl -92(%esi), %eax
970 + adcl -88(%esi), %eax
971 + adcl -84(%esi), %eax
972 + adcl -80(%esi), %eax
973 + adcl -76(%esi), %eax
974 + adcl -72(%esi), %eax
975 + adcl -68(%esi), %eax
976 + adcl -64(%esi), %eax
977 + adcl -60(%esi), %eax
978 + adcl -56(%esi), %eax
979 + adcl -52(%esi), %eax
980 + adcl -48(%esi), %eax
981 + adcl -44(%esi), %eax
982 + adcl -40(%esi), %eax
983 + adcl -36(%esi), %eax
984 + adcl -32(%esi), %eax
985 + adcl -28(%esi), %eax
986 + adcl -24(%esi), %eax
987 + adcl -20(%esi), %eax
988 + adcl -16(%esi), %eax
989 + adcl -12(%esi), %eax
990 + adcl -8(%esi), %eax
991 + adcl -4(%esi), %eax
1000 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S
1001 --- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969
1002 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002
1014 + leal 4(%esi), %esi
1021 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc
1022 --- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969
1023 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002
1025 +//#define PREFETCH(a) prefetchnta a
1026 +//#define PREFETCH(a) prefetch a
1027 +//#define PREFETCH(a)
1029 +// How much unrolling do you want?
1030 +//vda: 5 is best on Duron 650
1031 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1032 + // NB: tweak unrolled loop too...
1034 +** computes a partial checksum, e.g. for TCP/UDP fragments
1035 +** unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
1038 +#define ITER_SZ (1<<ITER_BITS)
1039 +#define ITER_MSK ((1<<ITER_BITS)-4)
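For the default ITER_BITS = 5 this gives ITER_SZ = 32 bytes and ITER_MSK = 0x1c, so len & ITER_MSK is the bytes handled by the partial first iteration and len >> ITER_BITS the number of full iterations (see the comments at the routine entry below).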
1047 +# Guaranteed by caller: esi is 4-aligned, ecx>=16
1049 + PREFETCH((%esi)) # Prefetch _each_ cacheline
1050 +	PREFETCH(32(%esi))	# Note! Athlons have 64-byte cache lines,
1051 +	PREFETCH(64(%esi))	# but PIIIs have only 32-byte ones! Prefetching
1052 +	PREFETCH(64+32(%esi))	# every 32 bytes gives ~20% speedup on PIII
1053 + PREFETCH(128(%esi))
1054 + PREFETCH(128+32(%esi))
1055 + PREFETCH(192(%esi))
1056 + PREFETCH(192+32(%esi))
1059 + andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration
1060 + shrl $ITER_BITS, %ecx # = iterations to make
1061 + addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration
1062 + shrl $2, %ebx # = dwords to handle
1064 +	lea 50f(%ebx,%ebx,2), %ebx	# = 50f - 3*dwords_to_handle
1066 + jmp *%ebx # here we go!
1069 + PREFETCH(256(%esi))
1071 + lea ITER_SZ(%esi), %esi # does NOT change CF!
1073 + addl -128(%esi), %eax
1074 + adcl -124(%esi), %eax
1075 + adcl -120(%esi), %eax
1076 + adcl -116(%esi), %eax
1077 + adcl -112(%esi), %eax
1078 + adcl -108(%esi), %eax
1079 + adcl -104(%esi), %eax
1080 + adcl -100(%esi), %eax
1081 + adcl -96(%esi), %eax
1082 + adcl -92(%esi), %eax
1083 + adcl -88(%esi), %eax
1084 + adcl -84(%esi), %eax
1085 + adcl -80(%esi), %eax
1086 + adcl -76(%esi), %eax
1087 + adcl -72(%esi), %eax
1088 + adcl -68(%esi), %eax
1089 + adcl -64(%esi), %eax
1090 + adcl -60(%esi), %eax
1091 + adcl -56(%esi), %eax
1092 + adcl -52(%esi), %eax
1093 + adcl -48(%esi), %eax
1094 + adcl -44(%esi), %eax
1095 + adcl -40(%esi), %eax
1096 + adcl -36(%esi), %eax
1098 + addl -32(%esi), %eax
1099 + adcl -28(%esi), %eax
1100 + adcl -24(%esi), %eax
1101 + adcl -20(%esi), %eax
1102 + adcl -16(%esi), %eax
1103 + adcl -12(%esi), %eax
1104 + adcl -8(%esi), %eax
1105 + adcl -4(%esi), %eax
1108 + dec %ecx # does NOT change CF!
1109 + # We can do just "jge 40b" here, but we can be a bit clever...
1110 + # This little twist gives surprisingly noticeable benefits!
1111 + # Seen 11% increase on random 1K blocks on Duron 650
1113 + cmp $256/ITER_SZ, %ecx
1114 + jae 40b # need prefetch
1115 + jmp 41b # do not need it
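The entry sequence above works because each adcl off(%esi),%eax encodes to exactly 3 bytes, so lea 50f(%ebx,%ebx,2) turns the leftover-dword count into a jump target by plain address arithmetic. The same idea in C is Duff's device; here is an 8-way model, with plain 64-bit adds standing in for the real carry-propagating ones'-complement sum:

	#include <stddef.h>

	/* Duff's-device model of the computed-jump dispatch above
	 * (illustration only: no adcl carry chain, 8-way unroll). */
	unsigned long long sum_dwords(const unsigned int *p, size_t n)
	{
		unsigned long long sum = 0;
		size_t iters = (n + 7) / 8;

		if (n == 0)
			return 0;
		switch (n % 8) {	/* jump into the unrolled loop */
		case 0: do {	sum += *p++;
		case 7:		sum += *p++;
		case 6:		sum += *p++;
		case 5:		sum += *p++;
		case 4:		sum += *p++;
		case 3:		sum += *p++;
		case 2:		sum += *p++;
		case 1:		sum += *p++;
			} while (--iters);
		}
		return sum;
	}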
1120 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S
1121 --- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969
1122 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002
1124 +#define PREFETCH(a) prefetchnta a
1125 +#define NAME csum_ssemmxplus
1127 +#include "csum_pf.inc"
1128 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S
1129 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969
1130 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002
1133 + * INET An implementation of the TCP/IP protocol suite for the LINUX
1134 + * operating system. INET is implemented using the BSD Socket
1135 + * interface as the means of communication with the user level.
1137 + * IP/TCP/UDP checksumming routines
1139 + * Authors: Jorge Cwik, <jorge@laser.satlink.net>
1140 + * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
1141 + * Tom May, <ftom@netcom.com>
1142 + * Pentium Pro/II routines:
1143 + * Alexander Kjeldaas <astor@guardian.no>
1144 + * Finn Arne Gangstad <finnag@guardian.no>
1145 + * Lots of code moved from tcp.c and ip.c; see those files
1148 + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
1150 + *		Andi Kleen, add zeroing on error, converted to pure assembler
1151 + * 2002-10-30 Denis Vlasenko
1152 + * boot-time benchmarking, 3Dnow/MMX+/SSE versions
1154 + * This program is free software; you can redistribute it and/or
1155 + * modify it under the terms of the GNU General Public License
1156 + * as published by the Free Software Foundation; either version
1157 + * 2 of the License, or (at your option) any later version.
1160 +#include <asm/errno.h>
1163 +** computes a partial checksum while copying data, e.g. for TCP/UDP fragments
1165 +** unsigned int csum_partial_copy_generic(const char *src, char *dst,
1166 +**	int len, int sum, int *src_err_ptr, int *dst_err_ptr)
1175 +#define SRC(y...) \
1177 + .section __ex_table, "a";\
1178 + .long 9999b, 6001f ;\
1181 +#define DST(y...) \
1183 + .section __ex_table, "a";\
1184 + .long 9999b, 6002f ;\
1187 +#define KERNEL_FPU_BEGIN \
1188 + call kernel_fpu_begin
1190 +#define KERNEL_FPU_END(r) \
1191 +K( movl %cr0, r ;)\
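The SRC()/DST() macros are the kernel's standard exception-table fixup: every wrapped user-memory access emits an __ex_table entry pairing the instruction address (local label 9999) with a fixup address, 6001f for a faulting source access and 6002f for a faulting destination access. On a fault the handlers in the .fixup section at the end of this file store -EFAULT through the corresponding error pointer and, for source faults, zero the rest of the destination.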
1197 +#include "csumcpy_naive.inc"
1198 +#include "csumcpy_basic.inc"
1199 +#include "csumcpy_ssemmxplus.inc"
1200 +#include "csumcpy_sse.inc"
1203 +.globl csum_partial_copy_generic
1205 +csum_partial_copy_generic:
1212 +#define STK_DERR 40(%ebp)
1213 +#define STK_SERR 36(%ebp)
1214 +#define STK_SUM 32(%ebp)
1215 +#define STK_LEN 28(%ebp)
1216 +#define STK_DST 24(%ebp)
1217 +#define STK_SRC 20(%ebp)
1218 +#define STK_EIP 16(%ebp)
1219 +#define STK_EBX 12(%ebp)
1220 +#define STK_EDI 8(%ebp)
1221 +#define STK_ESI 4(%ebp)
1222 +#define STK_EBP (%ebp)
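These offsets assume the prologue pushes %ebx, %edi, %esi and %ebp (in that order) and then copies %esp into %ebp, which puts the return address at 16(%ebp) and the six arguments at 20(%ebp) through 40(%ebp).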
1224 + movl STK_SRC, %esi #src
1225 + movl STK_DST, %edi #dst
1226 + movl STK_LEN, %ecx #len
1227 + movl STK_SUM, %eax #sum
1229 + testl $3, %edi # Check dst alignment
1232 + # not 4-aligned: analyze and align...
1236 + # unaligned start addr
1238 + js 90f # sz==0, exit
1239 + movzbl (%esi), %ebx # eat one byte...
1243 + roll $8, %eax # NB: need to be undone at exit!
1249 + # xxx 2-aligned, but not 4-aligned
1252 + movw (%esi), %bx # eat 2 bytes
1256 + leal 2(%esi), %esi
1257 + leal 2(%edi), %edi
1260 + # edi is 4-aligned now: call block routine
1261 + movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc
1262 + cmpl $128, %ecx # use optimized routine
1263 + jb 50f # only for large blocks
1264 + movl best_csumcpy, %ebx
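Note that it is the destination, not the source, that gets aligned here: the block routines prefer aligned stores, and the SSE variant's movntps requires a 16-byte-aligned destination (csumcpy_sse.inc further aligns edi from 4 to 16 before entering its main loop).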
1267 + # handle last 0-3 bytes
1271 +SRC( movw (%esi), %cx )
1272 + leal 2(%esi), %esi
1273 +DST( movw %cx, (%edi) )
1274 + leal 2(%edi), %edi
1278 +SRC( movb (%esi), %cl )
1279 +DST( movb %cl, (%edi) )
1280 +75: addl %ecx, %eax
1283 + # undo csum rotation if dst was unaligned
1296 +.section .fixup, "ax"
1297 +6001: movl STK_SERR, %ebx # src_err_ptr
1298 + movl $-EFAULT, (%ebx)
1299 + # zero the complete destination (computing the rest is too much work)
1300 + movl STK_DST, %edi # dst
1301 + movl STK_LEN, %ecx # len
1306 +6002: movl STK_DERR, %ebx # dst_err_ptr
1307 + movl $-EFAULT, (%ebx)
1310 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc
1311 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969
1312 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002
1314 +// Please somebody experiment with unroll length
1315 +// on a PII. Do _not_ optimize for PIII/Athlons/etc,
1316 +// they won't typically use this...
1319 +.globl csumcpy_basic
1329 + leal 50f(%ebx,%ebx), %ebx
1333 + leal 64(%esi), %esi
1334 + leal 64(%edi), %edi
1338 +SRC( movl x(%esi), %ebx ); \
1339 + adcl %ebx, %eax ; \
1340 +DST( movl %ebx, x(%edi) );
1342 + ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52)
1343 + ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36)
1344 + ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20)
1345 + ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4)
1354 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc
1355 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969
1356 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002
1358 +// Heh... at least it's small ;)
1361 +.globl csumcpy_naive
1368 +SRC( movl (%esi), %ebx )
1369 +DST( movl %ebx, (%edi) )
1371 + leal 4(%esi), %esi
1372 + leal 4(%edi), %edi
1379 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc
1380 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969
1381 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002
1383 +// Huge routine; I don't like its size and number
1384 +// of fixups... think of that when you want
1385 +// to unroll the loop more
1386 +// TODO: benchmark and reduce size
1387 +// I won't stand a 1K behemoth just for a 5% speedup
1390 +#define PREFETCH(a) prefetchnta a
1392 +// How much unrolling do you want?
1393 +// vda: celeron 1200: 5 with movaps, 4 with movups
1395 +#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes
1396 + // NB: tweak unrolled loop too...
1400 +#define ITER_SZ (1<<ITER_BITS)
1401 +#define ITER_MSK ((1<<ITER_BITS)-4)
1407 + testl $0xe, %edi # Check alignment
1408 + jnz 5500f # align to 16 bytes
1411 + shrl $ITER_BITS, %ecx
1414 +# "big chunks" loop
1415 + PREFETCH((%esi)) # Prefetch a couple of cachelines
1416 +	PREFETCH(32(%esi))	// Note: Athlons have 64-byte cache lines,
1417 +	PREFETCH(64(%esi))	// but PIIIs have only 32-byte ones! Prefetching
1418 +	PREFETCH(64+32(%esi))	// every 32 bytes gives ~20% speedup on PIII
1419 + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons
1420 + PREFETCH(128+32(%esi)) // let them enjoy 256
1421 + PREFETCH(192(%esi))
1422 + PREFETCH(192+32(%esi))
1424 + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1425 + subl $32, %esp // hopefully this is not too slow...
1426 +K( movl %cr0, %ebx )
1428 + movups %xmm0, (%esp)
1429 + movups %xmm1, 16(%esp)
1434 +#define ROUND0(au,r) \
1435 +SRC( mov##au##ps (%esi), r ;) \
1436 + adcl (%esi), %eax ; \
1437 + adcl 4(%esi), %eax ; \
1438 + adcl 8(%esi), %eax ; \
1439 + adcl 12(%esi), %eax ; \
1440 +DST( movntps r, (%edi) ;) \
1442 +#define ROUND(au,x,r) \
1443 +SRC( mov##au##ps x(%esi), r ;) \
1444 + adcl x(%esi), %eax ; \
1445 + adcl x+4(%esi), %eax ; \
1446 + adcl x+8(%esi), %eax ; \
1447 + adcl x+12(%esi), %eax; \
1448 +DST( movntps r, x(%edi) ;) \
1450 +// ROUND[0]: edi must be 16-aligned!
1451 +// if esi is not 16-aligned, movaps would fault;
1452 +// this is not caught by the testsuite. TODO.
1453 +// We don't need SRC() around adcl's
1454 +// (exception, if any, would be caught by 1st one)
1455 +// (FIXME: can races against interrupts bite us?)
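For reference, one ROUND boils down to this user-space C model with SSE intrinsics (illustration only: the real code is pure asm and keeps the ones'-complement sum with adcl carries):

	#include <xmmintrin.h>
	#include <stdint.h>

	/* Model of ROUND(x,r): re-read the same 16 bytes for the checksum
	 * while the copy streams around the caches. */
	static void round16(const uint32_t *src, float *dst, uint64_t *sum)
	{
		__m128 v = _mm_loadu_ps((const float *)src);	/* movups/movaps */

		*sum += (uint64_t)src[0] + src[1] + src[2] + src[3];
		_mm_stream_ps(dst, v);	/* movntps: needs 16-aligned dst,
					 * bypasses the cache */
	}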
1457 + testl $0xf, %esi # Check esi alignment + clear CF
1459 +10: # esi is NOT 16-aligned
1460 + PREFETCH(256(%esi))
1463 + PREFETCH(256+32(%esi))
1466 + lea ITER_SZ(%esi), %esi
1467 + lea ITER_SZ(%edi), %edi
1470 + loop 10b // Beware: loop and ITER_BITS>6 don't mix
1473 +15: # esi is 16-aligned
1474 + PREFETCH(256(%esi))
1477 + PREFETCH(256+32(%esi))
1480 + lea ITER_SZ(%esi), %esi
1481 + lea ITER_SZ(%edi), %edi
1484 + loop 15b // Beware: loop and ITER_BITS>6 don't mix
1487 + sfence # clean up XMM
1488 + //KERNEL_FPU_END(%ebx)
1489 + movups (%esp), %xmm0
1490 + movups 16(%esp), %xmm1
1492 +K( movl %ebx, %cr0 )
1497 + andl $ITER_MSK, %edx
1499 + shrl $2, %edx # this also clears CF
1501 +SRC( movl (%esi), %ebx )
1503 +DST( movl %ebx, (%edi) )
1510 + # last 1, 2 or 3 bytes: handled by caller
1515 +# xxx 16-align edi and get back
1516 +5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here
1517 + mov %ecx, %edx # edx needed at 20:
1518 + jb 20b # not worthy: too short
1520 +5520: test $0xe, %edi # loop until we are 16-aligned
1522 +SRC( movl (%esi), %ebx )
1524 +DST( movl %ebx, (%edi) )
1530 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc
1531 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969
1532 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002
1535 +#define PREFETCH(a) prefetchnta a
1537 +// How much unrolling do you want?
1539 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1540 + // NB: tweak unrolled loop too...
1544 +#define ITER_SZ (1<<ITER_BITS)
1545 +#define ITER_MSK ((1<<ITER_BITS)-4)
1548 +.globl csumcpy_ssemmxplus
1550 +csumcpy_ssemmxplus:
1552 + shrl $ITER_BITS, %ecx
1555 +# "big chunks" loop
1556 + PREFETCH((%esi)) # Prefetch a couple of cachelines
1557 +	PREFETCH(32(%esi))	// Note: Athlons have 64-byte cache lines,
1558 +	PREFETCH(64(%esi))	// but PIIIs have only 32-byte ones! Prefetching
1559 +	PREFETCH(64+32(%esi))	// every 32 bytes gives ~20% speedup on PIII
1560 + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons
1561 + PREFETCH(128+32(%esi)) // let them enjoy 256
1562 + PREFETCH(192(%esi))
1563 + PREFETCH(192+32(%esi))
1565 + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1566 +K( movl %cr0, %ebx )
1576 +#define ROUND0(r) \
1577 +SRC( movq (%esi), r ;) \
1578 + adcl (%esi), %eax ; \
1579 + adcl 4(%esi), %eax ; \
1580 +DST( movntq r, (%edi) ;) \
1582 +#define ROUND(x,r) \
1583 +SRC( movq x(%esi), r ;) \
1584 + adcl x(%esi), %eax ; \
1585 + adcl x+4(%esi), %eax ; \
1586 +DST( movntq r, x(%edi) ;) \
1588 +// moving the store to the end of a ROUND makes it faster;
1589 +// don't ask me why
1590 +// we don't need SRC() around adcl's
1591 +// (exception, if any, would be caught by 1st one)
1592 +// (FIXME: can races against interrupts bite us?)
1595 + PREFETCH(256(%esi))
1596 + ROUND0(%mm0) // using mm1,2,3 does not speed up things
1600 +/* PREFETCH(256+32(%esi))
1606 + lea ITER_SZ(%esi), %esi
1607 + lea ITER_SZ(%edi), %edi
1610 + loop 10b // Beware: loop and ITER_BITS>5 don't mix
1614 + //KERNEL_FPU_END(%ebx)
1617 +K( movl %ebx, %cr0 )
1622 + andl $ITER_MSK, %edx
1624 + shrl $2, %edx # this also clears CF
1626 +SRC( movl (%esi), %ebx )
1628 +DST( movl %ebx, (%edi) )