jam-15-fast-csum-D.patch
1 New csum functions optimized for different processors.
2 Author: Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua>
3
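For orientation: every routine added below computes the same Internet
one's-complement checksum (RFC 1071) that csum_partial() already returns;
the variants differ only in their inner loops (plain adcl, prefetching
3DNow!/SSE flavours, and MMX+/SSE copying flavours). A minimal C sketch of
the accumulation step, for reference only and not part of the patch itself,
assuming a 4-byte-aligned buffer whose length is a multiple of four:

	/* sketch: 32-bit words summed with end-around carry, which is
	   what the unrolled adcl loops below do in assembly */
	static unsigned int csum_accumulate(const unsigned char *buf,
					    int len, unsigned int sum)
	{
		const unsigned int *p = (const unsigned int *) buf;
		unsigned long long acc = sum;

		while (len >= 4) {
			acc += *p++;	/* adcl (%esi), %eax */
			len -= 4;
		}
		while (acc >> 32)	/* the "adcl $0, %eax" folds */
			acc = (acc & 0xffffffffULL) + (acc >> 32);
		return (unsigned int) acc;
	}
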
4diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile
5--- linux-2.4.20-pre11/arch/i386/lib/Makefile Mon Sep 10 12:31:30 2001
6+++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile Fri Nov 1 23:55:58 2002
 7@@ -7,9 +7,17 @@
8
9 L_TARGET = lib.a
10
11-obj-y = checksum.o old-checksum.o delay.o \
12+obj-y = old-checksum.o delay.o \
13 usercopy.o getuser.o \
14- memcpy.o strstr.o
15+ memcpy.o strstr.o \
16+ bench_csum.o \
 17+               bench_func.o \
18+ csum.o \
19+ csum_basic.o \
20+ csum_naive.o \
21+ csum_3dnow.o \
22+ csum_ssemmxplus.o \
23+ csumcpy.o
24
25 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
26 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
27diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c
28--- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c Wed Dec 31 22:00:00 1969
29+++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c Sat Nov 2 11:51:40 2002
 30@@ -0,0 +1,216 @@
31+#include <linux/mm.h> // for get_pages
32+#include <asm/uaccess.h> // for access_ok in asm/checksum.h
33+#include <linux/in6.h> // for in6_addr in asm/checksum.h
34+#include <asm/byteorder.h> // for ntoh in asm/checksum.h
35+#include <asm/cpufeature.h> // for X86_FEATURE_xx
36+#include <linux/byteorder/generic.h> // for ntohX in asm/checksum.h
37+#include <linux/stddef.h> // for NULL in asm/checksum.h
38+#include <linux/linkage.h> // for asmlinkage in asm/checksum.h
39+#include <linux/module.h>
40+
 41+#include <asm/checksum.h>
 42+#include "bench_func.h"
43+
44+//#define dprintk(a...) printk(a)
45+#define dprintk(a...) ((void)0)
46+
47+/* Features usable for mem optimization:
48+ Intel
49+X86_FEATURE_FPU Onboard FPU
50+X86_FEATURE_MMX Multimedia Extensions
51+X86_FEATURE_XMM Streaming SIMD Extensions
52+X86_FEATURE_XMM2 Streaming SIMD Extensions-2
53+ AMD
54+X86_FEATURE_3DNOW 3DNow!
55+X86_FEATURE_MMXEXT AMD MMX extensions
56+X86_FEATURE_3DNOWEXT AMD 3DNow! extensions
57+ Cyrix
58+X86_FEATURE_CXMMX Cyrix MMX extensions
59+*/
60+
61+typedef typeof(jiffies) jiffies_t;
62+
63+typedef void asm_helper(void);
64+
65+extern asm_helper csum_basic;
66+extern asm_helper csum_naive;
67+extern asm_helper csum_3dnow;
68+extern asm_helper csum_ssemmxplus;
69+
70+static struct candidate csum_runner[] = {
71+ { "basic" , csum_basic , 1, { -1 } },
72+ { "simple" , csum_naive , 1, { -1 } },
73+ { "3Dnow!" , csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } },
74+ { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } },
75+ { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } },
76+};
77+
78+extern asm_helper csumcpy_basic;
79+extern asm_helper csumcpy_naive;
80+extern asm_helper csumcpy_ssemmxplus;
81+extern asm_helper csumcpy_sse;
82+
83+static struct candidate csumcpy_runner[] = {
84+ { "basic" , csumcpy_basic , 2, { -1 } },
85+ { "simple" , csumcpy_naive , 2, { -1 } },
86+ /* higher weight: we prefer these for less cache pollution: */
87+ { "AND MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } },
88+ { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } },
89+ { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } },
90+};
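/* Scoring note: find_best() multiplies each candidate's measured pass
 * count by its weight, so a weight-3 copy routine above beats the
 * weight-2 "basic" one unless it is more than a third slower;
 * e.g. 800 passes * 3 = 2400 still wins over 1000 * 2 = 2000.  The bias
 * favors the non-temporal (movntq/movntps) copies for their lower
 * cache pollution. */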
91+
 92+//====== TODO: split here: above: arch, below: generic
93+
94+/* set this to value bigger than cache(s) */
95+/* TODO: heuristic for buffer size */
 96+#define bufshift 20 /* 10=1KB, 20=1MB etc */
97+/* typical size of a packet */
98+#define chunksz (4*1024)
99+
100+#define bufsz (1<<bufshift)
101+#define chunkcnt (bufsz/chunksz)
102+
103+#define VECTOR_SZ(a) (sizeof(a)/sizeof((a)[0]))
104+
105+asm_helper *best_csum = csum_basic;
106+asm_helper *best_csumcpy = csumcpy_basic;
107+
108+/*
109+** Count the number of iterations done during a fixed period,
110+** and use this to calculate throughput.
111+*/
112+
113+static int duration = 1; // jiffies for each run
114+static int report;
115+
116+static inline void
117+wait_for_jiffy(void) {
118+ jiffies_t now = jiffies;
119+ while(now == jiffies) cpu_relax();
120+}
121+
122+static int
123+bench_csum(struct candidate *cand, char *buf)
124+{
125+ int i, max;
126+ best_csum = (asm_helper*)(cand->f);
127+
128+ max = 0;
129+ // In practice these are pretty repeatable
 130+ // so 3 runs is overkill
131+ for(i=0; i<3; i++) {
132+ int count = 0;
133+ jiffies_t limit;
134+ wait_for_jiffy();
135+ limit = jiffies+duration;
136+ while(time_before(jiffies, limit)) {
137+ int i;
138+ mb();
139+ // interleaved to avoid bias due to prefetch
140+ for(i=0; i<chunkcnt; i+=2)
141+ csum_partial(buf+i*chunksz, chunksz, 0);
142+ for(i=1; i<chunkcnt; i+=2)
143+ csum_partial(buf+i*chunksz, chunksz, 0);
144+ mb();
145+ count++;
146+ mb();
147+ }
148+ dprintk(" count =%6i\n",count);
149+ if(count>max)
150+ max = count;
151+ }
152+
153+ if(report) {
154+ int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
155+ printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
156+ kb_sec / 1000, kb_sec % 1000);
157+ }
158+
159+ return max;
160+}
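/* Throughput math, for reference: each counted pass checksums
 * chunkcnt*chunksz = bufsz = 1 MB (1024 KB).  For example, with HZ=100
 * (typical for 2.4 on i386), duration=4 jiffies and max=20 passes:
 *   kb_sec = 20 * ((4096*256)/1024 * 100) / 4 = 512000 -> "512.000 MB/sec"
 * which corresponds to 20 MB in 40 ms, i.e. roughly 500 MB/s. */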
161+
162+static int
163+bench_csumcpy(struct candidate *cand, char *buf)
164+{
165+ int err;
166+ int i, max;
167+ best_csumcpy = (asm_helper*)(cand->f);
168+
169+ max = 0;
170+ for(i=0; i<3; i++) {
171+ int count = 0;
172+ jiffies_t limit;
173+ wait_for_jiffy();
174+ limit = jiffies+duration;
175+ while(time_before(jiffies, limit)) {
176+ int i;
177+ mb();
178+ // interleaved to avoid bias due to prefetch
179+ for(i=0; i<chunkcnt; i+=2)
180+ csum_partial_copy_generic(buf+i*chunksz,
181+ buf+(chunkcnt-1-i)*chunksz,
182+ chunksz, 0, &err, &err);
183+ for(i=1; i<chunkcnt; i+=2)
184+ csum_partial_copy_generic(buf+i*chunksz,
185+ buf+(chunkcnt-1-i)*chunksz,
186+ chunksz, 0, &err, &err);
187+ mb();
188+ count++;
189+ mb();
190+ }
191+ dprintk(" count =%6i\n",count);
192+ if(count>max)
193+ max = count;
194+ }
195+
196+ if(report) {
197+ int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
198+ printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
199+ kb_sec / 1000, kb_sec % 1000);
200+ }
201+
202+ return max;
203+}
204+
205+static int
206+find_best_csum(void)
207+{
208+ struct candidate *best;
209+ char *buffer = (char *) __get_free_pages(GFP_KERNEL,
210+ (bufshift-PAGE_SHIFT));
211+
212+ printk(KERN_INFO "Measuring network checksumming speed\n");
213+ if(!buffer) {
214+ printk("csum: cannot allocate %i pages\n",
215+ 1<<(bufshift-PAGE_SHIFT)
216+ );
217+ return -ENOMEM;
218+ }
219+ dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT));
220+
 221+ // find # of jiffies suitable for reliable results
 222+ // (at least 20 counts per run, i.e. ~5% accuracy)
223+ while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) {
224+ duration<<=1;
225+ }
226+ dprintk("test run will last %i ticks\n", duration);
227+ report = 1;
228+
229+ best = find_best(bench_csum, buffer, csum_runner,
230+ VECTOR_SZ(csum_runner));
231+ printk("csum: using csum function: %s\n", best->name);
232+ best_csum = (asm_helper*)(best->f);
233+
234+ best = find_best(bench_csumcpy, buffer, csumcpy_runner,
235+ VECTOR_SZ(csumcpy_runner));
236+ printk("csum: using csum_copy function: %s\n", best->name);
237+ best_csumcpy = (asm_helper*)(best->f);
238+
239+ free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT));
240+ dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT));
241+ return 0;
242+}
243+
244+MODULE_LICENSE("GPL");
245+
246+module_init(find_best_csum);
247diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c
248--- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969
249+++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002
250@@ -0,0 +1,53 @@
251+#include <linux/kernel.h> // for KERN_DEBUG
252+
253+#include <asm/bitops.h> // for test_bit
254+#include <asm/processor.h> // cpu caps
255+#include <asm/cpufeature.h> // cpu features constants
256+#include "bench_func.h"
257+
258+//#define dprintk(a...) printk(a)
259+#define dprintk(a...) ((void)0)
260+
261+// 2.4 only, already in 2.5
262+extern inline int
263+boot_cpu_has(int cap)
264+{
265+ return test_bit(cap, boot_cpu_data.x86_capability);
266+}
267+
268+extern inline int
269+cpu_supports(int *cap)
270+{
271+ while(*cap != -1) {
272+ if(!boot_cpu_has(*cap)) {
273+ dprintk("unsupported caps: %i\n", *cap);
274+ return 0;
275+ }
276+ cap++;
277+ }
278+ return 1;
279+}
280+
281+/*
282+** Call all the candidates which can be run on this CPU,
283+** find the best
284+*/
285+struct candidate*
286+find_best(bench_func *bench, char *opaque, struct candidate runner[], int count)
287+{
288+ int score, max = 0;
289+ struct candidate *best = 0;
290+ while(count--) {
291+ if(!cpu_supports(runner->cpu_caps_needed)) {
292+ printk("func %s skipped: not supported by CPU\n", runner->name);
293+ } else {
294+ score = bench(runner,opaque) * runner->weight;
295+ if(max < score) {
296+ max = score;
297+ best = runner;
298+ }
299+ }
300+ runner++;
301+ }
302+ return best;
303+}
304diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h
305--- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969
306+++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002
307@@ -0,0 +1,16 @@
308+#ifndef _BENCH_FUNC_H
309+#define _BENCH_FUNC_H
310+
311+struct candidate {
312+ const char *name;
313+ void *f; // pointer to func
314+ int weight;
315+ int cpu_caps_needed[4];
316+};
317+
318+typedef int bench_func(struct candidate *cand, char *opaque);
319+
320+struct candidate* find_best(bench_func *bench, char *opaque,
321+ struct candidate runner[], int count);
322+
323+#endif
324diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S
325--- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002
326+++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969
327@@ -1,496 +0,0 @@
328-/*
329- * INET An implementation of the TCP/IP protocol suite for the LINUX
330- * operating system. INET is implemented using the BSD Socket
331- * interface as the means of communication with the user level.
332- *
333- * IP/TCP/UDP checksumming routines
334- *
335- * Authors: Jorge Cwik, <jorge@laser.satlink.net>
336- * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
337- * Tom May, <ftom@netcom.com>
338- * Pentium Pro/II routines:
339- * Alexander Kjeldaas <astor@guardian.no>
340- * Finn Arne Gangstad <finnag@guardian.no>
341- * Lots of code moved from tcp.c and ip.c; see those files
342- * for more names.
343- *
344- * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
345- * handling.
346- * Andi Kleen, add zeroing on error
347- * converted to pure assembler
348- *
349- * This program is free software; you can redistribute it and/or
350- * modify it under the terms of the GNU General Public License
351- * as published by the Free Software Foundation; either version
352- * 2 of the License, or (at your option) any later version.
353- */
354-
355-#include <linux/config.h>
356-#include <asm/errno.h>
357-
358-/*
359- * computes a partial checksum, e.g. for TCP/UDP fragments
360- */
361-
362-/*
363-unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
364- */
365-
366-.text
367-.align 4
368-.globl csum_partial
369-
370-#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
371-
372- /*
373- * Experiments with Ethernet and SLIP connections show that buff
374- * is aligned on either a 2-byte or 4-byte boundary. We get at
375- * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
376- * Fortunately, it is easy to convert 2-byte alignment to 4-byte
377- * alignment for the unrolled loop.
378- */
379-csum_partial:
380- pushl %esi
381- pushl %ebx
382- movl 20(%esp),%eax # Function arg: unsigned int sum
383- movl 16(%esp),%ecx # Function arg: int len
384- movl 12(%esp),%esi # Function arg: unsigned char *buff
385- testl $3, %esi # Check alignment.
386- jz 2f # Jump if alignment is ok.
387- testl $1, %esi # Check alignment.
388- jz 10f # Jump if alignment is boundary of 2bytes.
389-
390- # buf is odd
391- dec %ecx
392- jl 8f
393- movzbl (%esi), %ebx
394- adcl %ebx, %eax
395- roll $8, %eax
396- inc %esi
397- testl $2, %esi
398- jz 2f
399-10:
400- subl $2, %ecx # Alignment uses up two bytes.
401- jae 1f # Jump if we had at least two bytes.
402- addl $2, %ecx # ecx was < 2. Deal with it.
403- jmp 4f
404-1: movw (%esi), %bx
405- addl $2, %esi
406- addw %bx, %ax
407- adcl $0, %eax
408-2:
409- movl %ecx, %edx
410- shrl $5, %ecx
411- jz 2f
412- testl %esi, %esi
413-1: movl (%esi), %ebx
414- adcl %ebx, %eax
415- movl 4(%esi), %ebx
416- adcl %ebx, %eax
417- movl 8(%esi), %ebx
418- adcl %ebx, %eax
419- movl 12(%esi), %ebx
420- adcl %ebx, %eax
421- movl 16(%esi), %ebx
422- adcl %ebx, %eax
423- movl 20(%esi), %ebx
424- adcl %ebx, %eax
425- movl 24(%esi), %ebx
426- adcl %ebx, %eax
427- movl 28(%esi), %ebx
428- adcl %ebx, %eax
429- lea 32(%esi), %esi
430- dec %ecx
431- jne 1b
432- adcl $0, %eax
433-2: movl %edx, %ecx
434- andl $0x1c, %edx
435- je 4f
436- shrl $2, %edx # This clears CF
437-3: adcl (%esi), %eax
438- lea 4(%esi), %esi
439- dec %edx
440- jne 3b
441- adcl $0, %eax
442-4: andl $3, %ecx
443- jz 7f
444- cmpl $2, %ecx
445- jb 5f
446- movw (%esi),%cx
447- leal 2(%esi),%esi
448- je 6f
449- shll $16,%ecx
450-5: movb (%esi),%cl
451-6: addl %ecx,%eax
452- adcl $0, %eax
453-7:
454- testl $1, 12(%esp)
455- jz 8f
456- roll $8, %eax
457-8:
458- popl %ebx
459- popl %esi
460- ret
461-
462-#else
463-
464-/* Version for PentiumII/PPro */
465-
466-csum_partial:
467- pushl %esi
468- pushl %ebx
469- movl 20(%esp),%eax # Function arg: unsigned int sum
470- movl 16(%esp),%ecx # Function arg: int len
471- movl 12(%esp),%esi # Function arg: const unsigned char *buf
472-
473- testl $3, %esi
474- jnz 25f
475-10:
476- movl %ecx, %edx
477- movl %ecx, %ebx
478- andl $0x7c, %ebx
479- shrl $7, %ecx
480- addl %ebx,%esi
481- shrl $2, %ebx
482- negl %ebx
483- lea 45f(%ebx,%ebx,2), %ebx
484- testl %esi, %esi
485- jmp *%ebx
486-
487- # Handle 2-byte-aligned regions
488-20: addw (%esi), %ax
489- lea 2(%esi), %esi
490- adcl $0, %eax
491- jmp 10b
492-25:
493- testl $1, %esi
494- jz 30f
495- # buf is odd
496- dec %ecx
497- jl 90f
498- movzbl (%esi), %ebx
499- addl %ebx, %eax
500- adcl $0, %eax
501- roll $8, %eax
502- inc %esi
503- testl $2, %esi
504- jz 10b
505-
506-30: subl $2, %ecx
507- ja 20b
508- je 32f
509- addl $2, %ecx
510- jz 80f
511- movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
512- addl %ebx, %eax
513- adcl $0, %eax
514- jmp 80f
515-32:
516- addw (%esi), %ax # csumming 2 bytes, 2-aligned
517- adcl $0, %eax
518- jmp 80f
519-
520-40:
521- addl -128(%esi), %eax
522- adcl -124(%esi), %eax
523- adcl -120(%esi), %eax
524- adcl -116(%esi), %eax
525- adcl -112(%esi), %eax
526- adcl -108(%esi), %eax
527- adcl -104(%esi), %eax
528- adcl -100(%esi), %eax
529- adcl -96(%esi), %eax
530- adcl -92(%esi), %eax
531- adcl -88(%esi), %eax
532- adcl -84(%esi), %eax
533- adcl -80(%esi), %eax
534- adcl -76(%esi), %eax
535- adcl -72(%esi), %eax
536- adcl -68(%esi), %eax
537- adcl -64(%esi), %eax
538- adcl -60(%esi), %eax
539- adcl -56(%esi), %eax
540- adcl -52(%esi), %eax
541- adcl -48(%esi), %eax
542- adcl -44(%esi), %eax
543- adcl -40(%esi), %eax
544- adcl -36(%esi), %eax
545- adcl -32(%esi), %eax
546- adcl -28(%esi), %eax
547- adcl -24(%esi), %eax
548- adcl -20(%esi), %eax
549- adcl -16(%esi), %eax
550- adcl -12(%esi), %eax
551- adcl -8(%esi), %eax
552- adcl -4(%esi), %eax
553-45:
554- lea 128(%esi), %esi
555- adcl $0, %eax
556- dec %ecx
557- jge 40b
558- movl %edx, %ecx
559-50: andl $3, %ecx
560- jz 80f
561-
562- # Handle the last 1-3 bytes without jumping
563- notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
564- movl $0xffffff,%ebx # by the shll and shrl instructions
565- shll $3,%ecx
566- shrl %cl,%ebx
567- andl -128(%esi),%ebx # esi is 4-aligned so should be ok
568- addl %ebx,%eax
569- adcl $0,%eax
570-80:
571- testl $1, 12(%esp)
572- jz 90f
573- roll $8, %eax
574-90:
575- popl %ebx
576- popl %esi
577- ret
578-
579-#endif
580-
581-/*
582-unsigned int csum_partial_copy_generic (const char *src, char *dst,
583- int len, int sum, int *src_err_ptr, int *dst_err_ptr)
584- */
585-
586-/*
587- * Copy from ds while checksumming, otherwise like csum_partial
588- *
589- * The macros SRC and DST specify the type of access for the instruction.
590- * thus we can call a custom exception handler for all access types.
591- *
592- * FIXME: could someone double-check whether I haven't mixed up some SRC and
593- * DST definitions? It's damn hard to trigger all cases. I hope I got
594- * them all but there's no guarantee.
595- */
596-
597-#define SRC(y...) \
598- 9999: y; \
599- .section __ex_table, "a"; \
600- .long 9999b, 6001f ; \
601- .previous
602-
603-#define DST(y...) \
604- 9999: y; \
605- .section __ex_table, "a"; \
606- .long 9999b, 6002f ; \
607- .previous
608-
609-.align 4
610-.globl csum_partial_copy_generic
611-
612-#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
613-
614-#define ARGBASE 16
615-#define FP 12
616-
617-csum_partial_copy_generic:
618- subl $4,%esp
619- pushl %edi
620- pushl %esi
621- pushl %ebx
622- movl ARGBASE+16(%esp),%eax # sum
623- movl ARGBASE+12(%esp),%ecx # len
624- movl ARGBASE+4(%esp),%esi # src
625- movl ARGBASE+8(%esp),%edi # dst
626-
627- testl $2, %edi # Check alignment.
628- jz 2f # Jump if alignment is ok.
629- subl $2, %ecx # Alignment uses up two bytes.
630- jae 1f # Jump if we had at least two bytes.
631- addl $2, %ecx # ecx was < 2. Deal with it.
632- jmp 4f
633-SRC(1: movw (%esi), %bx )
634- addl $2, %esi
635-DST( movw %bx, (%edi) )
636- addl $2, %edi
637- addw %bx, %ax
638- adcl $0, %eax
639-2:
640- movl %ecx, FP(%esp)
641- shrl $5, %ecx
642- jz 2f
643- testl %esi, %esi
644-SRC(1: movl (%esi), %ebx )
645-SRC( movl 4(%esi), %edx )
646- adcl %ebx, %eax
647-DST( movl %ebx, (%edi) )
648- adcl %edx, %eax
649-DST( movl %edx, 4(%edi) )
650-
651-SRC( movl 8(%esi), %ebx )
652-SRC( movl 12(%esi), %edx )
653- adcl %ebx, %eax
654-DST( movl %ebx, 8(%edi) )
655- adcl %edx, %eax
656-DST( movl %edx, 12(%edi) )
657-
658-SRC( movl 16(%esi), %ebx )
659-SRC( movl 20(%esi), %edx )
660- adcl %ebx, %eax
661-DST( movl %ebx, 16(%edi) )
662- adcl %edx, %eax
663-DST( movl %edx, 20(%edi) )
664-
665-SRC( movl 24(%esi), %ebx )
666-SRC( movl 28(%esi), %edx )
667- adcl %ebx, %eax
668-DST( movl %ebx, 24(%edi) )
669- adcl %edx, %eax
670-DST( movl %edx, 28(%edi) )
671-
672- lea 32(%esi), %esi
673- lea 32(%edi), %edi
674- dec %ecx
675- jne 1b
676- adcl $0, %eax
677-2: movl FP(%esp), %edx
678- movl %edx, %ecx
679- andl $0x1c, %edx
680- je 4f
681- shrl $2, %edx # This clears CF
682-SRC(3: movl (%esi), %ebx )
683- adcl %ebx, %eax
684-DST( movl %ebx, (%edi) )
685- lea 4(%esi), %esi
686- lea 4(%edi), %edi
687- dec %edx
688- jne 3b
689- adcl $0, %eax
690-4: andl $3, %ecx
691- jz 7f
692- cmpl $2, %ecx
693- jb 5f
694-SRC( movw (%esi), %cx )
695- leal 2(%esi), %esi
696-DST( movw %cx, (%edi) )
697- leal 2(%edi), %edi
698- je 6f
699- shll $16,%ecx
700-SRC(5: movb (%esi), %cl )
701-DST( movb %cl, (%edi) )
702-6: addl %ecx, %eax
703- adcl $0, %eax
704-7:
705-5000:
706-
707-# Exception handler:
708-.section .fixup, "ax"
709-
710-6001:
711- movl ARGBASE+20(%esp), %ebx # src_err_ptr
712- movl $-EFAULT, (%ebx)
713-
714- # zero the complete destination - computing the rest
715- # is too much work
716- movl ARGBASE+8(%esp), %edi # dst
717- movl ARGBASE+12(%esp), %ecx # len
718- xorl %eax,%eax
719- rep ; stosb
720-
721- jmp 5000b
722-
723-6002:
724- movl ARGBASE+24(%esp), %ebx # dst_err_ptr
725- movl $-EFAULT,(%ebx)
726- jmp 5000b
727-
728-.previous
729-
730- popl %ebx
731- popl %esi
732- popl %edi
733- popl %ecx # equivalent to addl $4,%esp
734- ret
735-
736-#else
737-
738-/* Version for PentiumII/PPro */
739-
740-#define ROUND1(x) \
741- SRC(movl x(%esi), %ebx ) ; \
742- addl %ebx, %eax ; \
743- DST(movl %ebx, x(%edi) ) ;
744-
745-#define ROUND(x) \
746- SRC(movl x(%esi), %ebx ) ; \
747- adcl %ebx, %eax ; \
748- DST(movl %ebx, x(%edi) ) ;
749-
750-#define ARGBASE 12
751-
752-csum_partial_copy_generic:
753- pushl %ebx
754- pushl %edi
755- pushl %esi
756- movl ARGBASE+4(%esp),%esi #src
757- movl ARGBASE+8(%esp),%edi #dst
758- movl ARGBASE+12(%esp),%ecx #len
759- movl ARGBASE+16(%esp),%eax #sum
760-# movl %ecx, %edx
761- movl %ecx, %ebx
762- movl %esi, %edx
763- shrl $6, %ecx
764- andl $0x3c, %ebx
765- negl %ebx
766- subl %ebx, %esi
767- subl %ebx, %edi
768- lea -1(%esi),%edx
769- andl $-32,%edx
770- lea 3f(%ebx,%ebx), %ebx
771- testl %esi, %esi
772- jmp *%ebx
773-1: addl $64,%esi
774- addl $64,%edi
775- SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
776- ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
777- ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
778- ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
779- ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
780-3: adcl $0,%eax
781- addl $64, %edx
782- dec %ecx
783- jge 1b
784-4: movl ARGBASE+12(%esp),%edx #len
785- andl $3, %edx
786- jz 7f
787- cmpl $2, %edx
788- jb 5f
789-SRC( movw (%esi), %dx )
790- leal 2(%esi), %esi
791-DST( movw %dx, (%edi) )
792- leal 2(%edi), %edi
793- je 6f
794- shll $16,%edx
795-5:
796-SRC( movb (%esi), %dl )
797-DST( movb %dl, (%edi) )
798-6: addl %edx, %eax
799- adcl $0, %eax
800-7:
801-.section .fixup, "ax"
802-6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
803- movl $-EFAULT, (%ebx)
804- # zero the complete destination (computing the rest is too much work)
805- movl ARGBASE+8(%esp),%edi # dst
806- movl ARGBASE+12(%esp),%ecx # len
807- xorl %eax,%eax
808- rep; stosb
809- jmp 7b
810-6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
811- movl $-EFAULT, (%ebx)
812- jmp 7b
813-.previous
814-
815- popl %esi
816- popl %edi
817- popl %ebx
818- ret
819-
820-#undef ROUND
821-#undef ROUND1
822-
823-#endif
824diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S
825--- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969
826+++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002
827@@ -0,0 +1,97 @@
828+/*
829+ * INET An implementation of the TCP/IP protocol suite for the LINUX
830+ * operating system. INET is implemented using the BSD Socket
831+ * interface as the means of communication with the user level.
832+ *
833+ * IP/TCP/UDP checksumming routines
834+ *
835+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
836+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
837+ * Tom May, <ftom@netcom.com>
838+ * Pentium Pro/II routines:
839+ * Alexander Kjeldaas <astor@guardian.no>
840+ * Finn Arne Gangstad <finnag@guardian.no>
841+ * Lots of code moved from tcp.c and ip.c; see those files
842+ * for more names.
843+ *
844+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
845+ * handling.
 846+ *              Andi Kleen, add zeroing on error; converted to pure assembler
847+ * 2002-10-30 Denis Vlasenko
848+ * boot-time benchmarking, 3Dnow/MMX+/SSE versions
849+ *
850+ * This program is free software; you can redistribute it and/or
851+ * modify it under the terms of the GNU General Public License
852+ * as published by the Free Software Foundation; either version
853+ * 2 of the License, or (at your option) any later version.
854+ */
855+
856+/*
857+** computes a partial checksum, e.g. for TCP/UDP fragments
858+**
859+** unsigned int csum_partial(const unsigned char * buff,
860+** int len, unsigned int sum)
861+*/
862+
863+.text
864+.align 4
865+.globl csum_partial
866+
867+csum_partial:
868+ pushl %esi
869+ pushl %ebx
870+ movl 20(%esp), %eax # arg: sum
871+ movl 16(%esp), %ecx # arg: len
872+ movl 12(%esp), %esi # arg: buf
873+
874+ testl $3, %esi
875+ jz 40f
876+20:
877+ # not 4-aligned: analyze and align...
878+ testl $1, %esi
879+ jz 30f
880+
881+ # unaligned start addr
882+ decl %ecx
883+ js 90f # sz==0, exit
884+ movzbl (%esi), %ebx # eat one byte...
885+ addl %ebx, %eax
886+ adcl $0, %eax
887+ roll $8, %eax # NB: need to be undone at exit!
888+ incl %esi
889+ testl $2, %esi
890+ jz 40f
891+30:
892+ # Note: 2-aligned, but not 4-aligned
893+ cmpl $3, %ecx
894+ jbe 60f
895+ addw (%esi), %ax # eat 2 bytes
896+ leal 2(%esi), %esi
897+ adcl $0, %eax
898+ subl $2, %ecx
899+40:
900+ # esi is 4-aligned here, call block routine
901+ movl $csum_basic, %ebx # known ok even for ecx==0 etc
902+ cmpl $128, %ecx # use optimized routine
903+ jb 50f # only for large blocks
904+ movl best_csum, %ebx
905+50: call *%ebx
906+60:
907+ # handle the last 0-3 bytes without much jumping
908+ jecxz 80f
909+ notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked
910+ movl $0xffffff, %ebx # by the shll and shrl instructions
911+ shll $3, %ecx
912+ shrl %cl, %ebx
913+ andl (%esi), %ebx # esi is 4-aligned so should be ok
914+ addl %ebx, %eax
915+ adcl $0, %eax
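 	# Worked example of the mask trick above: with ecx=2 bytes left,
 	# notl gives 0xfffffffd, shll $3 gives 0xffffffe8, and the shift
 	# count (%cl masked to 5 bits) is 8, so 0xffffff>>8 = 0x00ffff
 	# keeps exactly two bytes of the final dword; ecx=1 gives shift 16
 	# (one byte), ecx=3 gives shift 0 (all three bytes).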
916+80:
917+ # undo csum rotation if start addr was odd
918+ testl $1, 12(%esp)
919+ jz 90f
920+ roll $8, %eax
921+90:
922+ popl %ebx
923+ popl %esi
924+ ret
925diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S
926--- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969
927+++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002
928@@ -0,0 +1,4 @@
929+#define PREFETCH(a) prefetch a
930+#define NAME csum_3dnow
931+
932+#include "csum_pf.inc"
933diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S
934--- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969
935+++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002
936@@ -0,0 +1,63 @@
937+.text
938+.align 4
939+.globl csum_basic
940+
941+/* Experiments with Ethernet and SLIP connections show that buff
942+** is aligned on either a 2-byte or 4-byte boundary. We get at
943+** least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
944+** Fortunately, it is easy to convert 2-byte alignment to 4-byte
945+** alignment for the unrolled loop.
946+*/
947+csum_basic:
948+ movl %ecx, %ebx
949+ movl %ecx, %edx
950+ shrl $7, %ecx
951+ andl $0x7c, %ebx
952+ addl %ebx, %esi
953+ shrl $2, %ebx
954+ negl %ebx
955+ leal 50f(%ebx,%ebx,2), %ebx
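 	# (each "adcl disp8(%esi), %eax" below assembles to 3 bytes, so
 	#  50f - 3*n, with %ebx = -n here, lands on the instruction that
 	#  sums the last n dwords of a partial 128-byte block)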
956+ clc
957+ jmp *%ebx
958+40:
959+ leal 128(%esi), %esi
960+ adcl -128(%esi), %eax
961+ adcl -124(%esi), %eax
962+ adcl -120(%esi), %eax
963+ adcl -116(%esi), %eax
964+ adcl -112(%esi), %eax
965+ adcl -108(%esi), %eax
966+ adcl -104(%esi), %eax
967+ adcl -100(%esi), %eax
968+ adcl -96(%esi), %eax
969+ adcl -92(%esi), %eax
970+ adcl -88(%esi), %eax
971+ adcl -84(%esi), %eax
972+ adcl -80(%esi), %eax
973+ adcl -76(%esi), %eax
974+ adcl -72(%esi), %eax
975+ adcl -68(%esi), %eax
976+ adcl -64(%esi), %eax
977+ adcl -60(%esi), %eax
978+ adcl -56(%esi), %eax
979+ adcl -52(%esi), %eax
980+ adcl -48(%esi), %eax
981+ adcl -44(%esi), %eax
982+ adcl -40(%esi), %eax
983+ adcl -36(%esi), %eax
984+ adcl -32(%esi), %eax
985+ adcl -28(%esi), %eax
986+ adcl -24(%esi), %eax
987+ adcl -20(%esi), %eax
988+ adcl -16(%esi), %eax
989+ adcl -12(%esi), %eax
990+ adcl -8(%esi), %eax
991+ adcl -4(%esi), %eax
992+50:
993+ decl %ecx
994+ jge 40b
995+
996+ adcl $0, %eax
997+ movl %edx, %ecx
998+ andl $3, %ecx
999+ ret
1000diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S
1001--- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969
1002+++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002
1003@@ -0,0 +1,17 @@
1004+.text
1005+.align 4
1006+.globl csum_naive
1007+
1008+csum_naive:
1009+ mov %ecx, %edx
1010+ shrl $2, %ecx
1011+ clc
1012+1:
1013+ adcl (%esi), %eax
1014+ leal 4(%esi), %esi
1015+ loop 1b
1016+
1017+ adcl $0, %eax
1018+ mov %edx, %ecx
1019+ andl $3, %ecx
1020+ ret
1021diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc
1022--- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969
1023+++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002
1024@@ -0,0 +1,95 @@
1025+//#define PREFETCH(a) prefetchnta a
1026+//#define PREFETCH(a) prefetch a
1027+//#define PREFETCH(a)
1028+
1029+// How much unrolling do you want?
1030+//vda: 5 is best on Duron 650
1031+#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1032+ // NB: tweak unrolled loop too...
1033+/*
1034+** computes a partial checksum, e.g. for TCP/UDP fragments
1035+** int csum_partial(const char *buff, int len, int sum)
1036+*/
1037+
1038+#define ITER_SZ (1<<ITER_BITS)
1039+#define ITER_MSK ((1<<ITER_BITS)-4)
1040+
1041+.text
1042+.align 4
1043+.globl NAME
1044+
1045+NAME:
1046+
1047+# Guaranteed by caller: esi is 4-aligned, ecx>=16
1048+10:
1049+ PREFETCH((%esi)) # Prefetch _each_ cacheline
 1050+ PREFETCH(32(%esi))	# Note: Athlon cache lines are 64 bytes, but
 1051+ PREFETCH(64(%esi))	# PIII lines are only 32! This gives ~20% speedup
 1052+ PREFETCH(64+32(%esi))	# on PIII
1053+ PREFETCH(128(%esi))
1054+ PREFETCH(128+32(%esi))
1055+ PREFETCH(192(%esi))
1056+ PREFETCH(192+32(%esi))
1057+ movl %ecx, %ebx
1058+ movl %ecx, %edx
1059+ andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration
1060+ shrl $ITER_BITS, %ecx # = iterations to make
1061+ addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration
1062+ shrl $2, %ebx # = dwords to handle
1063+ negl %ebx
 1064+ lea 50f(%ebx,%ebx,2), %ebx # = 50f - 3*dwords_to_handle
1065+ clc
1066+ jmp *%ebx # here we go!
1067+
1068+40:
1069+ PREFETCH(256(%esi))
1070+41:
1071+ lea ITER_SZ(%esi), %esi # does NOT change CF!
1072+/*
1073+ addl -128(%esi), %eax
1074+ adcl -124(%esi), %eax
1075+ adcl -120(%esi), %eax
1076+ adcl -116(%esi), %eax
1077+ adcl -112(%esi), %eax
1078+ adcl -108(%esi), %eax
1079+ adcl -104(%esi), %eax
1080+ adcl -100(%esi), %eax
1081+ adcl -96(%esi), %eax
1082+ adcl -92(%esi), %eax
1083+ adcl -88(%esi), %eax
1084+ adcl -84(%esi), %eax
1085+ adcl -80(%esi), %eax
1086+ adcl -76(%esi), %eax
1087+ adcl -72(%esi), %eax
1088+ adcl -68(%esi), %eax
1089+ adcl -64(%esi), %eax
1090+ adcl -60(%esi), %eax
1091+ adcl -56(%esi), %eax
1092+ adcl -52(%esi), %eax
1093+ adcl -48(%esi), %eax
1094+ adcl -44(%esi), %eax
1095+ adcl -40(%esi), %eax
1096+ adcl -36(%esi), %eax
1097+*/
1098+ addl -32(%esi), %eax
1099+ adcl -28(%esi), %eax
1100+ adcl -24(%esi), %eax
1101+ adcl -20(%esi), %eax
1102+ adcl -16(%esi), %eax
1103+ adcl -12(%esi), %eax
1104+ adcl -8(%esi), %eax
1105+ adcl -4(%esi), %eax
1106+50:
1107+ adcl $0, %eax
1108+ dec %ecx # does NOT change CF!
1109+ # We can do just "jge 40b" here, but we can be a bit clever...
1110+ # This little twist gives surprisingly noticeable benefits!
1111+ # Seen 11% increase on random 1K blocks on Duron 650
1112+ js 60f
1113+ cmp $256/ITER_SZ, %ecx
1114+ jae 40b # need prefetch
1115+ jmp 41b # do not need it
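 	# (while at least 256/ITER_SZ iterations remain we re-enter at 40:
 	#  and prefetch 256 bytes ahead; the final iterations re-enter at
 	#  41: so no prefetch reaches past the end of the buffer)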
1116+60:
1117+ movl %edx, %ecx
1118+ andl $3, %ecx
1119+ ret
1120diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S
1121--- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969
1122+++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002
1123@@ -0,0 +1,4 @@
1124+#define PREFETCH(a) prefetchnta a
1125+#define NAME csum_ssemmxplus
1126+
1127+#include "csum_pf.inc"
1128diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S
1129--- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969
1130+++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002
1131@@ -0,0 +1,178 @@
1132+/*
1133+ * INET An implementation of the TCP/IP protocol suite for the LINUX
1134+ * operating system. INET is implemented using the BSD Socket
1135+ * interface as the means of communication with the user level.
1136+ *
1137+ * IP/TCP/UDP checksumming routines
1138+ *
1139+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
1140+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
1141+ * Tom May, <ftom@netcom.com>
1142+ * Pentium Pro/II routines:
1143+ * Alexander Kjeldaas <astor@guardian.no>
1144+ * Finn Arne Gangstad <finnag@guardian.no>
1145+ * Lots of code moved from tcp.c and ip.c; see those files
1146+ * for more names.
1147+ *
1148+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
1149+ * handling.
 1150+ *              Andi Kleen, add zeroing on error; converted to pure assembler
1151+ * 2002-10-30 Denis Vlasenko
1152+ * boot-time benchmarking, 3Dnow/MMX+/SSE versions
1153+ *
1154+ * This program is free software; you can redistribute it and/or
1155+ * modify it under the terms of the GNU General Public License
1156+ * as published by the Free Software Foundation; either version
1157+ * 2 of the License, or (at your option) any later version.
1158+ */
1159+
1160+#include <asm/errno.h>
1161+
1162+/*
1163+** computes a partial checksum, e.g. for TCP/UDP fragments
1164+**
1165+** unsigned int csum_partial(const unsigned char * buff,
1166+** int len, unsigned int sum)
1167+*/
1168+
1169+#ifdef __KERNEL__
1170+#define K(a...) a
1171+#else
1172+#define K(a...)
1173+#endif
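/* K(x) keeps x only when building in-kernel; a user-space test build
   drops the privileged CR0 handling wrapped in K() below. */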
1174+
1175+#define SRC(y...) \
1176+9999: y ;\
1177+ .section __ex_table, "a";\
1178+ .long 9999b, 6001f ;\
1179+ .previous
1180+
1181+#define DST(y...) \
1182+9999: y ;\
1183+ .section __ex_table, "a";\
1184+ .long 9999b, 6002f ;\
1185+ .previous
1186+
1187+#define KERNEL_FPU_BEGIN \
1188+ call kernel_fpu_begin
1189+
1190+#define KERNEL_FPU_END(r) \
1191+K( movl %cr0, r ;)\
1192+K( orl $8, r ;)\
1193+K( movl r, %cr0 ;)
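/* orl $8 sets CR0.TS again, so the next FPU/SSE use by the interrupted
   task traps and reloads its own state.  The routines below actually
   save the CR0 value they found and restore it instead of using this. */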
1194+
1195+.text
1196+
1197+#include "csumcpy_naive.inc"
1198+#include "csumcpy_basic.inc"
1199+#include "csumcpy_ssemmxplus.inc"
1200+#include "csumcpy_sse.inc"
1201+
1202+.align 4
1203+.globl csum_partial_copy_generic
1204+
1205+csum_partial_copy_generic:
1206+ pushl %ebx
1207+ pushl %edi
1208+ pushl %esi
1209+ pushl %ebp
1210+ movl %esp, %ebp
1211+
1212+#define STK_DERR 40(%ebp)
1213+#define STK_SERR 36(%ebp)
1214+#define STK_SUM 32(%ebp)
1215+#define STK_LEN 28(%ebp)
1216+#define STK_DST 24(%ebp)
1217+#define STK_SRC 20(%ebp)
1218+#define STK_EIP 16(%ebp)
1219+#define STK_EBX 12(%ebp)
1220+#define STK_EDI 8(%ebp)
1221+#define STK_ESI 4(%ebp)
1222+#define STK_EBP (%ebp)
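 # Stack layout after the four pushes and "movl %esp, %ebp" above:
 #   (%ebp) saved ebp, +4 saved esi, +8 saved edi, +12 saved ebx,
 #   +16 return address, +20.. the six arguments (src, dst, len, sum,
 #   src_err_ptr, dst_err_ptr), which is what the STK_* offsets encode.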
1223+
1224+ movl STK_SRC, %esi #src
1225+ movl STK_DST, %edi #dst
1226+ movl STK_LEN, %ecx #len
1227+ movl STK_SUM, %eax #sum
1228+
1229+ testl $3, %edi # Check dst alignment
1230+ jz 40f
1231+
1232+ # not 4-aligned: analyze and align...
1233+ testl $1, %edi
1234+ jz 30f
1235+
1236+ # unaligned start addr
1237+ decl %ecx
1238+ js 90f # sz==0, exit
1239+ movzbl (%esi), %ebx # eat one byte...
1240+ movb %bl, (%edi)
1241+ addl %ebx, %eax
1242+ adcl $0, %eax
1243+ roll $8, %eax # NB: need to be undone at exit!
1244+ incl %esi
1245+ incl %edi
1246+ testl $2, %edi
1247+ jz 40f
1248+30:
 1249+ # Note: 2-aligned, but not 4-aligned
1250+ cmpl $3, %ecx
1251+ jbe 60f
1252+ movw (%esi), %bx # eat 2 bytes
1253+ addw %bx, %ax
1254+ movw %bx, (%edi)
1255+ adcl $0, %eax
1256+ leal 2(%esi), %esi
1257+ leal 2(%edi), %edi
1258+ subl $2, %ecx
1259+40:
1260+ # edi is 4-aligned now: call block routine
1261+ movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc
1262+ cmpl $128, %ecx # use optimized routine
1263+ jb 50f # only for large blocks
1264+ movl best_csumcpy, %ebx
1265+50: call *%ebx
1266+60:
1267+ # handle last 0-3 bytes
1268+ jecxz 80f
1269+ cmpl $2, %ecx
1270+ jb 70f
1271+SRC( movw (%esi), %cx )
1272+ leal 2(%esi), %esi
1273+DST( movw %cx, (%edi) )
1274+ leal 2(%edi), %edi
1275+ je 75f
1276+ shll $16, %ecx
1277+70:
1278+SRC( movb (%esi), %cl )
1279+DST( movb %cl, (%edi) )
1280+75: addl %ecx, %eax
1281+ adcl $0, %eax
1282+80:
1283+ # undo csum rotation if dst was unaligned
1284+ testl $1, STK_DST
1285+ jz 90f
1286+ roll $8, %eax
1287+90:
 1288+ movl %ebp, %esp # restore esp; a fixup path may arrive here with extra data on the stack
1289+ popl %ebp
1290+ popl %esi
1291+ popl %edi
1292+ popl %ebx
1293+ ret
1294+
1295+
1296+.section .fixup, "ax"
1297+6001: movl STK_SERR, %ebx # src_err_ptr
1298+ movl $-EFAULT, (%ebx)
1299+ # zero the complete destination (computing the rest is too much work)
1300+ movl STK_DST, %edi # dst
1301+ movl STK_LEN, %ecx # len
1302+ xorl %eax, %eax
1303+ cld
1304+ rep; stosb
1305+ jmp 90b
1306+6002: movl STK_DERR, %ebx # dst_err_ptr
1307+ movl $-EFAULT, (%ebx)
1308+ jmp 90b
1309+.previous
1310diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc
1311--- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969
1312+++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002
1313@@ -0,0 +1,40 @@
1314+// Please somebody experiment with unroll length
1315+// on a PII. Do _not_ optimize for PIII/Athlons/etc,
1316+// they won't typically use this...
1317+
1318+.align 4
1319+.globl csumcpy_basic
1320+
1321+csumcpy_basic:
1322+ movl %ecx, %ebx
1323+ movl %ecx, %edx
1324+ shrl $6, %ecx
1325+ andl $0x3c, %ebx
1326+ negl %ebx
1327+ subl %ebx, %esi
1328+ subl %ebx, %edi
1329+ leal 50f(%ebx,%ebx), %ebx
1330+ clc
1331+ jmp *%ebx
1332+40:
1333+ leal 64(%esi), %esi
1334+ leal 64(%edi), %edi
1335+
1336+#undef ROUND
1337+#define ROUND(x) \
1338+SRC( movl x(%esi), %ebx ); \
1339+ adcl %ebx, %eax ; \
1340+DST( movl %ebx, x(%edi) );
1341+
1342+ ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52)
1343+ ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36)
1344+ ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20)
1345+ ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4)
1346+50:
1347+ decl %ecx
1348+ jge 40b
1349+
1350+ adcl $0, %eax
1351+ movl %edx, %ecx
1352+ andl $3, %ecx
1353+ ret
1354diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc
1355--- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969
1356+++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002
1357@@ -0,0 +1,21 @@
1358+// Heh... at least it's small ;)
1359+
1360+.align 4
1361+.globl csumcpy_naive
1362+
1363+csumcpy_naive:
1364+ mov %ecx, %edx
1365+ shrl $2, %ecx
1366+ clc
1367+1:
1368+SRC( movl (%esi), %ebx )
1369+DST( movl %ebx, (%edi) )
1370+ adcl %ebx, %eax
1371+ leal 4(%esi), %esi
1372+ leal 4(%edi), %edi
1373+ loop 1b
1374+
1375+ adcl $0, %eax
1376+ mov %edx, %ecx
1377+ and $3, %ecx
1378+ ret
1379diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc
1380--- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969
1381+++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002
1382@@ -0,0 +1,147 @@
 1383+// Huge routine; I don't like its size and the number
 1384+// of fixups... keep that in mind if you want
 1385+// to unroll the loop further
 1386+// TODO: benchmark and reduce size
 1387+// I won't stand a 1K behemoth just for a 5% speedup
1388+
1389+#undef PREFETCH
1390+#define PREFETCH(a) prefetchnta a
1391+
1392+// How much unrolling do you want?
1393+// vda: celeron 1200: 5 with movaps, 4 with movups
1394+#undef ITER_BITS
1395+#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes
1396+ // NB: tweak unrolled loop too...
1397+
1398+#undef ITER_SZ
1399+#undef ITER_MSK
1400+#define ITER_SZ (1<<ITER_BITS)
1401+#define ITER_MSK ((1<<ITER_BITS)-4)
1402+
1403+.align 4
1404+.globl csumcpy_sse
1405+
1406+csumcpy_sse:
1407+ testl $0xe, %edi # Check alignment
1408+ jnz 5500f # align to 16 bytes
1409+1:
1410+ movl %ecx, %edx
1411+ shrl $ITER_BITS, %ecx
1412+ jz 20f
1413+
1414+# "big chunks" loop
1415+ PREFETCH((%esi)) # Prefetch a couple of cachelines
 1416+ PREFETCH(32(%esi))	// Note: Athlon cache lines are 64 bytes, but
 1417+ PREFETCH(64(%esi))	// PIII lines are only 32! This gives ~20% speedup
 1418+ PREFETCH(64+32(%esi))	// on PIII
 1419+ PREFETCH(128(%esi))	// Note2: a 128-byte prefetch distance is slower on Athlons,
 1420+ PREFETCH(128+32(%esi))	// so let them enjoy 256
1421+ PREFETCH(192(%esi))
1422+ PREFETCH(192+32(%esi))
1423+
1424+ //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1425+ subl $32, %esp // hopefully this is not too slow...
1426+K( movl %cr0, %ebx )
1427+K( clts )
1428+ movups %xmm0, (%esp)
1429+ movups %xmm1, 16(%esp)
1430+
1431+
1432+#undef ROUND0
1433+#undef ROUND
1434+#define ROUND0(au,r) \
1435+SRC( mov##au##ps (%esi), r ;) \
1436+ adcl (%esi), %eax ; \
1437+ adcl 4(%esi), %eax ; \
1438+ adcl 8(%esi), %eax ; \
1439+ adcl 12(%esi), %eax ; \
1440+DST( movntps r, (%edi) ;) \
1441+
1442+#define ROUND(au,x,r) \
1443+SRC( mov##au##ps x(%esi), r ;) \
1444+ adcl x(%esi), %eax ; \
1445+ adcl x+4(%esi), %eax ; \
1446+ adcl x+8(%esi), %eax ; \
1447+ adcl x+12(%esi), %eax; \
1448+DST( movntps r, x(%edi) ;) \
1449+
1450+// ROUND[0]: edi must be 16-aligned!
1451+// if esi is not aligned, movaps wouldn't work,
1452+// not caught by testsuite. TODO.
1453+// We don't need SRC() around adcl's
1454+// (exception, if any, would be caught by 1st one)
1455+// (FIXME: can races against interrupts bite us?)
1456+
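// (movntps needs a 16-byte-aligned destination, which is why edi was
//  16-aligned at 5500: below; source alignment only selects movaps
//  vs. movups between the two loop variants.)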
1457+ testl $0xf, %esi # Check esi alignment + clear CF
1458+ jz 15f
1459+10: # esi is NOT 16-aligned
1460+ PREFETCH(256(%esi))
1461+ ROUND0(u,%xmm0)
1462+ ROUND(u,16,%xmm1)
1463+ PREFETCH(256+32(%esi))
1464+ ROUND(u,32,%xmm0)
1465+ ROUND(u,48,%xmm1)
1466+ lea ITER_SZ(%esi), %esi
1467+ lea ITER_SZ(%edi), %edi
1468+ //dec %ecx
1469+ //jnz 10b
1470+ loop 10b // Beware: loop and ITER_BITS>6 don't mix
1471+ adcl $0, %eax
1472+ jmp 19f
1473+15: # esi is 16-aligned
1474+ PREFETCH(256(%esi))
1475+ ROUND0(a,%xmm0)
1476+ ROUND(a,16,%xmm1)
1477+ PREFETCH(256+32(%esi))
1478+ ROUND(a,32,%xmm0)
1479+ ROUND(a,48,%xmm1)
1480+ lea ITER_SZ(%esi), %esi
1481+ lea ITER_SZ(%edi), %edi
1482+ //dec %ecx
1483+ //jnz 15b
1484+ loop 15b // Beware: loop and ITER_BITS>6 don't mix
1485+ adcl $0, %eax
1486+19:
1487+ sfence # clean up XMM
1488+ //KERNEL_FPU_END(%ebx)
1489+ movups (%esp), %xmm0
1490+ movups 16(%esp), %xmm1
1491+ addl $32, %esp
1492+K( movl %ebx, %cr0 )
1493+
1494+20:
1495+ # loop for dwords
1496+ movl %edx, %ecx
1497+ andl $ITER_MSK, %edx
1498+ jz 40f
1499+ shrl $2, %edx # this also clears CF
1500+30:
1501+SRC( movl (%esi), %ebx )
1502+ adcl %ebx, %eax
1503+DST( movl %ebx, (%edi) )
1504+ lea 4(%esi), %esi
1505+ lea 4(%edi), %edi
1506+ dec %edx
1507+ jnz 30b
1508+ adcl $0, %eax
1509+40:
1510+ # last 1, 2 or 3 bytes: handled by caller
1511+ andl $3, %ecx
1512+ ret
1513+
1514+
 1515+# Note: 16-align edi and get back
1516+5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here
1517+ mov %ecx, %edx # edx needed at 20:
 1518+ jb 20b # not worth it: too short
1519+
1520+5520: test $0xe, %edi # loop until we are 16-aligned
1521+ jz 1b
1522+SRC( movl (%esi), %ebx )
1523+ addl $4, %esi
1524+DST( movl %ebx, (%edi) )
1525+ addl $4, %edi
1526+ addl %ebx, %eax
1527+ adcl $0, %eax
1528+ subl $4, %ecx
1529+ jmp 5520b
1530diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc
1531--- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969
1532+++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002
1533@@ -0,0 +1,103 @@
1534+#undef PREFETCH
1535+#define PREFETCH(a) prefetchnta a
1536+
1537+// How much unrolling do you want?
1538+#undef ITER_BITS
1539+#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1540+ // NB: tweak unrolled loop too...
1541+
1542+#undef ITER_SZ
1543+#undef ITER_MSK
1544+#define ITER_SZ (1<<ITER_BITS)
1545+#define ITER_MSK ((1<<ITER_BITS)-4)
1546+
1547+.align 4
1548+.globl csumcpy_ssemmxplus
1549+
1550+csumcpy_ssemmxplus:
1551+ movl %ecx, %edx
1552+ shrl $ITER_BITS, %ecx
1553+ jz 20f
1554+
1555+# "big chunks" loop
1556+ PREFETCH((%esi)) # Prefetch a couple of cachelines
 1557+ PREFETCH(32(%esi))	// Note: Athlon cache lines are 64 bytes, but
 1558+ PREFETCH(64(%esi))	// PIII lines are only 32! This gives ~20% speedup
 1559+ PREFETCH(64+32(%esi))	// on PIII
 1560+ PREFETCH(128(%esi))	// Note2: a 128-byte prefetch distance is slower on Athlons,
 1561+ PREFETCH(128+32(%esi))	// so let them enjoy 256
1562+ PREFETCH(192(%esi))
1563+ PREFETCH(192+32(%esi))
1564+
1565+ //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1566+K( movl %cr0, %ebx )
1567+K( clts )
1568+ subl $108, %esp
1569+ fnsave (%esp)
1570+ fwait
1571+
1572+ clc
1573+
1574+#undef ROUND0
1575+#undef ROUND
1576+#define ROUND0(r) \
1577+SRC( movq (%esi), r ;) \
1578+ adcl (%esi), %eax ; \
1579+ adcl 4(%esi), %eax ; \
1580+DST( movntq r, (%edi) ;) \
1581+
1582+#define ROUND(x,r) \
1583+SRC( movq x(%esi), r ;) \
1584+ adcl x(%esi), %eax ; \
1585+ adcl x+4(%esi), %eax ; \
1586+DST( movntq r, x(%edi) ;) \
1587+
1588+// moving store to the end of a ROUND makes it faster
1589+// don't ask me why
1590+// we don't need SRC() around adcl's
1591+// (exception, if any, would be caught by 1st one)
1592+// (FIXME: can races against interrupts bite us?)
1593+
1594+10:
1595+ PREFETCH(256(%esi))
1596+ ROUND0(%mm0) // using mm1,2,3 does not speed up things
1597+ ROUND(8,%mm0)
1598+ ROUND(16,%mm0)
1599+ ROUND(24,%mm0)
1600+/* PREFETCH(256+32(%esi))
1601+ ROUND(32,%mm0)
1602+ ROUND(40,%mm0)
1603+ ROUND(48,%mm0)
1604+ ROUND(56,%mm0)*/
1605+
1606+ lea ITER_SZ(%esi), %esi
1607+ lea ITER_SZ(%edi), %edi
1608+ //dec %ecx
1609+ //jnz 10b
1610+ loop 10b // Beware: loop and ITER_BITS>5 don't mix
1611+ adcl $0, %eax
1612+
1613+ sfence
1614+ //KERNEL_FPU_END(%ebx)
1615+ frstor (%esp)
1616+ addl $108, %esp
1617+K( movl %ebx, %cr0 )
1618+
1619+20:
1620+ # loop for dwords
1621+ movl %edx, %ecx
1622+ andl $ITER_MSK, %edx
1623+ jz 40f
1624+ shrl $2, %edx # this also clears CF
1625+30:
1626+SRC( movl (%esi), %ebx )
1627+ adcl %ebx, %eax
1628+DST( movl %ebx, (%edi) )
1629+ lea 4(%esi), %esi
1630+ lea 4(%edi), %edi
1631+ dec %edx
1632+ jnz 30b
1633+ adcl $0, %eax
1634+
1635+40: andl $3, %ecx
1636+ ret