Commit | Line | Data |
---|---|---|
e6d11017 JR |
1 | New csum functions optimized for different processors. |
2 | Author: Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> | |
3 | ||
4 | diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile | |
5 | --- linux-2.4.20-pre11/arch/i386/lib/Makefile Mon Sep 10 12:31:30 2001 | |
6 | +++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile Fri Nov 1 23:55:58 2002 | |
4bf063fb | 7 | @@ -7,9 +7,17 @@ |
e6d11017 JR |
8 | |
9 | L_TARGET = lib.a | |
10 | ||
11 | -obj-y = checksum.o old-checksum.o delay.o \ | |
12 | +obj-y = old-checksum.o delay.o \ | |
13 | usercopy.o getuser.o \ | |
14 | - memcpy.o strstr.o | |
15 | + memcpy.o strstr.o \ | |
16 | + bench_csum.o \ | |
4bf063fb | 17 | + bench_func.o \ |
e6d11017 JR |
18 | + csum.o \ |
19 | + csum_basic.o \ | |
20 | + csum_naive.o \ | |
21 | + csum_3dnow.o \ | |
22 | + csum_ssemmxplus.o \ | |
23 | + csumcpy.o | |
24 | ||
25 | obj-$(CONFIG_X86_USE_3DNOW) += mmx.o | |
26 | obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o | |
27 | diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c | |
28 | --- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c Wed Dec 31 22:00:00 1969 | |
29 | +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c Sat Nov 2 11:51:40 2002 | |
4bf063fb | 30 | @@ -0,0 +1,216 @@ |
e6d11017 JR |
31 | +#include <linux/mm.h> // for get_pages |
32 | +#include <asm/uaccess.h> // for access_ok in asm/checksum.h | |
33 | +#include <linux/in6.h> // for in6_addr in asm/checksum.h | |
34 | +#include <asm/byteorder.h> // for ntoh in asm/checksum.h | |
35 | +#include <asm/cpufeature.h> // for X86_FEATURE_xx | |
36 | +#include <linux/byteorder/generic.h> // for ntohX in asm/checksum.h | |
37 | +#include <linux/stddef.h> // for NULL in asm/checksum.h | |
38 | +#include <linux/linkage.h> // for asmlinkage in asm/checksum.h | |
39 | +#include <linux/module.h> | |
40 | + | |
e6d11017 | 41 | +#include <asm/checksum.h> |
4bf063fb | 42 | +#include "bench_func.h" |
e6d11017 JR |
43 | + |
44 | +//#define dprintk(a...) printk(a) | |
45 | +#define dprintk(a...) ((void)0) | |
46 | + | |
47 | +/* Features usable for mem optimization: | |
48 | + Intel | |
49 | +X86_FEATURE_FPU Onboard FPU | |
50 | +X86_FEATURE_MMX Multimedia Extensions | |
51 | +X86_FEATURE_XMM Streaming SIMD Extensions | |
52 | +X86_FEATURE_XMM2 Streaming SIMD Extensions-2 | |
53 | + AMD | |
54 | +X86_FEATURE_3DNOW 3DNow! | |
55 | +X86_FEATURE_MMXEXT AMD MMX extensions | |
56 | +X86_FEATURE_3DNOWEXT AMD 3DNow! extensions | |
57 | + Cyrix | |
58 | +X86_FEATURE_CXMMX Cyrix MMX extensions | |
59 | +*/ | |
60 | + | |
61 | +typedef typeof(jiffies) jiffies_t; | |
62 | + | |
63 | +typedef void asm_helper(void); | |
64 | + | |
65 | +extern asm_helper csum_basic; | |
66 | +extern asm_helper csum_naive; | |
67 | +extern asm_helper csum_3dnow; | |
68 | +extern asm_helper csum_ssemmxplus; | |
69 | + | |
70 | +static struct candidate csum_runner[] = { | |
71 | + { "basic" , csum_basic , 1, { -1 } }, | |
72 | + { "simple" , csum_naive , 1, { -1 } }, | |
73 | + { "3Dnow!" , csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } }, | |
74 | + { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } }, | |
75 | + { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } }, | |
76 | +}; | |
77 | + | |
78 | +extern asm_helper csumcpy_basic; | |
79 | +extern asm_helper csumcpy_naive; | |
80 | +extern asm_helper csumcpy_ssemmxplus; | |
81 | +extern asm_helper csumcpy_sse; | |
82 | + | |
83 | +static struct candidate csumcpy_runner[] = { | |
84 | + { "basic" , csumcpy_basic , 2, { -1 } }, | |
85 | + { "simple" , csumcpy_naive , 2, { -1 } }, | |
86 | + /* higher weight: we prefer these for less cache pollution: */ | |
 87 | + { "AMD MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } }, | |
88 | + { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } }, | |
89 | + { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } }, | |
90 | +}; | |
91 | + | |
92 | +//====== TODO: split here: above: arch, below:generic | |
93 | + | |
94 | +/* set this to value bigger than cache(s) */ | |
95 | +/* TODO: heuristic for buffer size */ | |
96 | +#define bufshift 20 /* 10=1kb, 20=1MB etc */ | |
97 | +/* typical size of a packet */ | |
98 | +#define chunksz (4*1024) | |
99 | + | |
100 | +#define bufsz (1<<bufshift) | |
101 | +#define chunkcnt (bufsz/chunksz) | |
102 | + | |
103 | +#define VECTOR_SZ(a) (sizeof(a)/sizeof((a)[0])) | |
104 | + | |
105 | +asm_helper *best_csum = csum_basic; | |
106 | +asm_helper *best_csumcpy = csumcpy_basic; | |
107 | + | |
108 | +/* | |
109 | +** Count the number of iterations done during a fixed period, | |
110 | +** and use this to calculate throughput. | |
111 | +*/ | |
112 | + | |
113 | +static int duration = 1; // jiffies for each run | |
114 | +static int report; | |
115 | + | |
116 | +static inline void | |
117 | +wait_for_jiffy(void) { | |
118 | + jiffies_t now = jiffies; | |
119 | + while(now == jiffies) cpu_relax(); | |
120 | +} | |
121 | + | |
122 | +static int | |
123 | +bench_csum(struct candidate *cand, char *buf) | |
124 | +{ | |
125 | + int i, max; | |
126 | + best_csum = (asm_helper*)(cand->f); | |
127 | + | |
128 | + max = 0; | |
129 | + // In practice these are pretty repeatable | |
130 | + // so 3 runs is an overkill | |
131 | + for(i=0; i<3; i++) { | |
132 | + int count = 0; | |
133 | + jiffies_t limit; | |
134 | + wait_for_jiffy(); | |
135 | + limit = jiffies+duration; | |
136 | + while(time_before(jiffies, limit)) { | |
137 | + int i; | |
138 | + mb(); | |
139 | + // interleaved to avoid bias due to prefetch | |
140 | + for(i=0; i<chunkcnt; i+=2) | |
141 | + csum_partial(buf+i*chunksz, chunksz, 0); | |
142 | + for(i=1; i<chunkcnt; i+=2) | |
143 | + csum_partial(buf+i*chunksz, chunksz, 0); | |
144 | + mb(); | |
145 | + count++; | |
146 | + mb(); | |
147 | + } | |
148 | + dprintk(" count =%6i\n",count); | |
149 | + if(count>max) | |
150 | + max = count; | |
151 | + } | |
152 | + | |
153 | + if(report) { | |
154 | + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; | |
155 | + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, | |
156 | + kb_sec / 1000, kb_sec % 1000); | |
157 | + } | |
158 | + | |
159 | + return max; | |
160 | +} | |
161 | + | |
162 | +static int | |
163 | +bench_csumcpy(struct candidate *cand, char *buf) | |
164 | +{ | |
165 | + int err; | |
166 | + int i, max; | |
167 | + best_csumcpy = (asm_helper*)(cand->f); | |
168 | + | |
169 | + max = 0; | |
170 | + for(i=0; i<3; i++) { | |
171 | + int count = 0; | |
172 | + jiffies_t limit; | |
173 | + wait_for_jiffy(); | |
174 | + limit = jiffies+duration; | |
175 | + while(time_before(jiffies, limit)) { | |
176 | + int i; | |
177 | + mb(); | |
178 | + // interleaved to avoid bias due to prefetch | |
179 | + for(i=0; i<chunkcnt; i+=2) | |
180 | + csum_partial_copy_generic(buf+i*chunksz, | |
181 | + buf+(chunkcnt-1-i)*chunksz, | |
182 | + chunksz, 0, &err, &err); | |
183 | + for(i=1; i<chunkcnt; i+=2) | |
184 | + csum_partial_copy_generic(buf+i*chunksz, | |
185 | + buf+(chunkcnt-1-i)*chunksz, | |
186 | + chunksz, 0, &err, &err); | |
187 | + mb(); | |
188 | + count++; | |
189 | + mb(); | |
190 | + } | |
191 | + dprintk(" count =%6i\n",count); | |
192 | + if(count>max) | |
193 | + max = count; | |
194 | + } | |
195 | + | |
196 | + if(report) { | |
197 | + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration; | |
198 | + printk(" %-10s:%6d.%03d MB/sec\n", cand->name, | |
199 | + kb_sec / 1000, kb_sec % 1000); | |
200 | + } | |
201 | + | |
202 | + return max; | |
203 | +} | |
204 | + | |
205 | +static int | |
206 | +find_best_csum(void) | |
207 | +{ | |
208 | + struct candidate *best; | |
209 | + char *buffer = (char *) __get_free_pages(GFP_KERNEL, | |
210 | + (bufshift-PAGE_SHIFT)); | |
211 | + | |
212 | + printk(KERN_INFO "Measuring network checksumming speed\n"); | |
213 | + if(!buffer) { | |
214 | + printk("csum: cannot allocate %i pages\n", | |
215 | + 1<<(bufshift-PAGE_SHIFT) | |
216 | + ); | |
217 | + return -ENOMEM; | |
218 | + } | |
219 | + dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT)); | |
220 | + | |
221 | + // find # of jiffies suitable for reliable results | |
 222 | + // (at least 5% accuracy) | |
223 | + while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) { | |
224 | + duration<<=1; | |
225 | + } | |
226 | + dprintk("test run will last %i ticks\n", duration); | |
227 | + report = 1; | |
228 | + | |
229 | + best = find_best(bench_csum, buffer, csum_runner, | |
230 | + VECTOR_SZ(csum_runner)); | |
231 | + printk("csum: using csum function: %s\n", best->name); | |
232 | + best_csum = (asm_helper*)(best->f); | |
233 | + | |
234 | + best = find_best(bench_csumcpy, buffer, csumcpy_runner, | |
235 | + VECTOR_SZ(csumcpy_runner)); | |
236 | + printk("csum: using csum_copy function: %s\n", best->name); | |
237 | + best_csumcpy = (asm_helper*)(best->f); | |
238 | + | |
239 | + free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT)); | |
240 | + dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT)); | |
241 | + return 0; | |
242 | +} | |
243 | + | |
244 | +MODULE_LICENSE("GPL"); | |
245 | + | |
246 | +module_init(find_best_csum); | |
4bf063fb JR |
247 | diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c |
248 | --- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969 | |
249 | +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002 | |
250 | @@ -0,0 +1,53 @@ | |
251 | +#include <linux/kernel.h> // for KERN_DEBUG | |
252 | + | |
253 | +#include <asm/bitops.h> // for test_bit | |
254 | +#include <asm/processor.h> // cpu caps | |
255 | +#include <asm/cpufeature.h> // cpu features constants | |
256 | +#include "bench_func.h" | |
257 | + | |
258 | +//#define dprintk(a...) printk(a) | |
259 | +#define dprintk(a...) ((void)0) | |
260 | + | |
261 | +// 2.4 only, already in 2.5 | |
262 | +extern inline int | |
263 | +boot_cpu_has(int cap) | |
264 | +{ | |
265 | + return test_bit(cap, boot_cpu_data.x86_capability); | |
266 | +} | |
267 | + | |
268 | +extern inline int | |
269 | +cpu_supports(int *cap) | |
270 | +{ | |
271 | + while(*cap != -1) { | |
272 | + if(!boot_cpu_has(*cap)) { | |
273 | + dprintk("unsupported caps: %i\n", *cap); | |
274 | + return 0; | |
275 | + } | |
276 | + cap++; | |
277 | + } | |
278 | + return 1; | |
279 | +} | |
280 | + | |
281 | +/* | |
282 | +** Call all the candidates which can be run on this CPU, | |
283 | +** find the best | |
284 | +*/ | |
285 | +struct candidate* | |
286 | +find_best(bench_func *bench, char *opaque, struct candidate runner[], int count) | |
287 | +{ | |
288 | + int score, max = 0; | |
289 | + struct candidate *best = 0; | |
290 | + while(count--) { | |
291 | + if(!cpu_supports(runner->cpu_caps_needed)) { | |
292 | + printk("func %s skipped: not supported by CPU\n", runner->name); | |
293 | + } else { | |
294 | + score = bench(runner,opaque) * runner->weight; | |
295 | + if(max < score) { | |
296 | + max = score; | |
297 | + best = runner; | |
298 | + } | |
299 | + } | |
300 | + runner++; | |
301 | + } | |
302 | + return best; | |
303 | +} | |
304 | diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h | |
305 | --- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969 | |
306 | +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002 | |
307 | @@ -0,0 +1,16 @@ | |
308 | +#ifndef _BENCH_FUNC_H | |
309 | +#define _BENCH_FUNC_H | |
310 | + | |
311 | +struct candidate { | |
312 | + const char *name; | |
313 | + void *f; // pointer to func | |
314 | + int weight; | |
315 | + int cpu_caps_needed[4]; | |
316 | +}; | |
317 | + | |
318 | +typedef int bench_func(struct candidate *cand, char *opaque); | |
319 | + | |
320 | +struct candidate* find_best(bench_func *bench, char *opaque, | |
321 | + struct candidate runner[], int count); | |
322 | + | |
323 | +#endif | |
e6d11017 JR |
324 | diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S |
325 | --- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002 | |
326 | +++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969 | |
327 | @@ -1,496 +0,0 @@ | |
328 | -/* | |
329 | - * INET An implementation of the TCP/IP protocol suite for the LINUX | |
330 | - * operating system. INET is implemented using the BSD Socket | |
331 | - * interface as the means of communication with the user level. | |
332 | - * | |
333 | - * IP/TCP/UDP checksumming routines | |
334 | - * | |
335 | - * Authors: Jorge Cwik, <jorge@laser.satlink.net> | |
336 | - * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | |
337 | - * Tom May, <ftom@netcom.com> | |
338 | - * Pentium Pro/II routines: | |
339 | - * Alexander Kjeldaas <astor@guardian.no> | |
340 | - * Finn Arne Gangstad <finnag@guardian.no> | |
341 | - * Lots of code moved from tcp.c and ip.c; see those files | |
342 | - * for more names. | |
343 | - * | |
344 | - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception | |
345 | - * handling. | |
346 | - * Andi Kleen, add zeroing on error | |
347 | - * converted to pure assembler | |
348 | - * | |
349 | - * This program is free software; you can redistribute it and/or | |
350 | - * modify it under the terms of the GNU General Public License | |
351 | - * as published by the Free Software Foundation; either version | |
352 | - * 2 of the License, or (at your option) any later version. | |
353 | - */ | |
354 | - | |
355 | -#include <linux/config.h> | |
356 | -#include <asm/errno.h> | |
357 | - | |
358 | -/* | |
359 | - * computes a partial checksum, e.g. for TCP/UDP fragments | |
360 | - */ | |
361 | - | |
362 | -/* | |
363 | -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) | |
364 | - */ | |
365 | - | |
366 | -.text | |
367 | -.align 4 | |
368 | -.globl csum_partial | |
369 | - | |
370 | -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM | |
371 | - | |
372 | - /* | |
373 | - * Experiments with Ethernet and SLIP connections show that buff | |
374 | - * is aligned on either a 2-byte or 4-byte boundary. We get at | |
375 | - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. | |
376 | - * Fortunately, it is easy to convert 2-byte alignment to 4-byte | |
377 | - * alignment for the unrolled loop. | |
378 | - */ | |
379 | -csum_partial: | |
380 | - pushl %esi | |
381 | - pushl %ebx | |
382 | - movl 20(%esp),%eax # Function arg: unsigned int sum | |
383 | - movl 16(%esp),%ecx # Function arg: int len | |
384 | - movl 12(%esp),%esi # Function arg: unsigned char *buff | |
385 | - testl $3, %esi # Check alignment. | |
386 | - jz 2f # Jump if alignment is ok. | |
387 | - testl $1, %esi # Check alignment. | |
388 | - jz 10f # Jump if alignment is boundary of 2bytes. | |
389 | - | |
390 | - # buf is odd | |
391 | - dec %ecx | |
392 | - jl 8f | |
393 | - movzbl (%esi), %ebx | |
394 | - adcl %ebx, %eax | |
395 | - roll $8, %eax | |
396 | - inc %esi | |
397 | - testl $2, %esi | |
398 | - jz 2f | |
399 | -10: | |
400 | - subl $2, %ecx # Alignment uses up two bytes. | |
401 | - jae 1f # Jump if we had at least two bytes. | |
402 | - addl $2, %ecx # ecx was < 2. Deal with it. | |
403 | - jmp 4f | |
404 | -1: movw (%esi), %bx | |
405 | - addl $2, %esi | |
406 | - addw %bx, %ax | |
407 | - adcl $0, %eax | |
408 | -2: | |
409 | - movl %ecx, %edx | |
410 | - shrl $5, %ecx | |
411 | - jz 2f | |
412 | - testl %esi, %esi | |
413 | -1: movl (%esi), %ebx | |
414 | - adcl %ebx, %eax | |
415 | - movl 4(%esi), %ebx | |
416 | - adcl %ebx, %eax | |
417 | - movl 8(%esi), %ebx | |
418 | - adcl %ebx, %eax | |
419 | - movl 12(%esi), %ebx | |
420 | - adcl %ebx, %eax | |
421 | - movl 16(%esi), %ebx | |
422 | - adcl %ebx, %eax | |
423 | - movl 20(%esi), %ebx | |
424 | - adcl %ebx, %eax | |
425 | - movl 24(%esi), %ebx | |
426 | - adcl %ebx, %eax | |
427 | - movl 28(%esi), %ebx | |
428 | - adcl %ebx, %eax | |
429 | - lea 32(%esi), %esi | |
430 | - dec %ecx | |
431 | - jne 1b | |
432 | - adcl $0, %eax | |
433 | -2: movl %edx, %ecx | |
434 | - andl $0x1c, %edx | |
435 | - je 4f | |
436 | - shrl $2, %edx # This clears CF | |
437 | -3: adcl (%esi), %eax | |
438 | - lea 4(%esi), %esi | |
439 | - dec %edx | |
440 | - jne 3b | |
441 | - adcl $0, %eax | |
442 | -4: andl $3, %ecx | |
443 | - jz 7f | |
444 | - cmpl $2, %ecx | |
445 | - jb 5f | |
446 | - movw (%esi),%cx | |
447 | - leal 2(%esi),%esi | |
448 | - je 6f | |
449 | - shll $16,%ecx | |
450 | -5: movb (%esi),%cl | |
451 | -6: addl %ecx,%eax | |
452 | - adcl $0, %eax | |
453 | -7: | |
454 | - testl $1, 12(%esp) | |
455 | - jz 8f | |
456 | - roll $8, %eax | |
457 | -8: | |
458 | - popl %ebx | |
459 | - popl %esi | |
460 | - ret | |
461 | - | |
462 | -#else | |
463 | - | |
464 | -/* Version for PentiumII/PPro */ | |
465 | - | |
466 | -csum_partial: | |
467 | - pushl %esi | |
468 | - pushl %ebx | |
469 | - movl 20(%esp),%eax # Function arg: unsigned int sum | |
470 | - movl 16(%esp),%ecx # Function arg: int len | |
471 | - movl 12(%esp),%esi # Function arg: const unsigned char *buf | |
472 | - | |
473 | - testl $3, %esi | |
474 | - jnz 25f | |
475 | -10: | |
476 | - movl %ecx, %edx | |
477 | - movl %ecx, %ebx | |
478 | - andl $0x7c, %ebx | |
479 | - shrl $7, %ecx | |
480 | - addl %ebx,%esi | |
481 | - shrl $2, %ebx | |
482 | - negl %ebx | |
483 | - lea 45f(%ebx,%ebx,2), %ebx | |
484 | - testl %esi, %esi | |
485 | - jmp *%ebx | |
486 | - | |
487 | - # Handle 2-byte-aligned regions | |
488 | -20: addw (%esi), %ax | |
489 | - lea 2(%esi), %esi | |
490 | - adcl $0, %eax | |
491 | - jmp 10b | |
492 | -25: | |
493 | - testl $1, %esi | |
494 | - jz 30f | |
495 | - # buf is odd | |
496 | - dec %ecx | |
497 | - jl 90f | |
498 | - movzbl (%esi), %ebx | |
499 | - addl %ebx, %eax | |
500 | - adcl $0, %eax | |
501 | - roll $8, %eax | |
502 | - inc %esi | |
503 | - testl $2, %esi | |
504 | - jz 10b | |
505 | - | |
506 | -30: subl $2, %ecx | |
507 | - ja 20b | |
508 | - je 32f | |
509 | - addl $2, %ecx | |
510 | - jz 80f | |
511 | - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned | |
512 | - addl %ebx, %eax | |
513 | - adcl $0, %eax | |
514 | - jmp 80f | |
515 | -32: | |
516 | - addw (%esi), %ax # csumming 2 bytes, 2-aligned | |
517 | - adcl $0, %eax | |
518 | - jmp 80f | |
519 | - | |
520 | -40: | |
521 | - addl -128(%esi), %eax | |
522 | - adcl -124(%esi), %eax | |
523 | - adcl -120(%esi), %eax | |
524 | - adcl -116(%esi), %eax | |
525 | - adcl -112(%esi), %eax | |
526 | - adcl -108(%esi), %eax | |
527 | - adcl -104(%esi), %eax | |
528 | - adcl -100(%esi), %eax | |
529 | - adcl -96(%esi), %eax | |
530 | - adcl -92(%esi), %eax | |
531 | - adcl -88(%esi), %eax | |
532 | - adcl -84(%esi), %eax | |
533 | - adcl -80(%esi), %eax | |
534 | - adcl -76(%esi), %eax | |
535 | - adcl -72(%esi), %eax | |
536 | - adcl -68(%esi), %eax | |
537 | - adcl -64(%esi), %eax | |
538 | - adcl -60(%esi), %eax | |
539 | - adcl -56(%esi), %eax | |
540 | - adcl -52(%esi), %eax | |
541 | - adcl -48(%esi), %eax | |
542 | - adcl -44(%esi), %eax | |
543 | - adcl -40(%esi), %eax | |
544 | - adcl -36(%esi), %eax | |
545 | - adcl -32(%esi), %eax | |
546 | - adcl -28(%esi), %eax | |
547 | - adcl -24(%esi), %eax | |
548 | - adcl -20(%esi), %eax | |
549 | - adcl -16(%esi), %eax | |
550 | - adcl -12(%esi), %eax | |
551 | - adcl -8(%esi), %eax | |
552 | - adcl -4(%esi), %eax | |
553 | -45: | |
554 | - lea 128(%esi), %esi | |
555 | - adcl $0, %eax | |
556 | - dec %ecx | |
557 | - jge 40b | |
558 | - movl %edx, %ecx | |
559 | -50: andl $3, %ecx | |
560 | - jz 80f | |
561 | - | |
562 | - # Handle the last 1-3 bytes without jumping | |
563 | - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked | |
564 | - movl $0xffffff,%ebx # by the shll and shrl instructions | |
565 | - shll $3,%ecx | |
566 | - shrl %cl,%ebx | |
567 | - andl -128(%esi),%ebx # esi is 4-aligned so should be ok | |
568 | - addl %ebx,%eax | |
569 | - adcl $0,%eax | |
570 | -80: | |
571 | - testl $1, 12(%esp) | |
572 | - jz 90f | |
573 | - roll $8, %eax | |
574 | -90: | |
575 | - popl %ebx | |
576 | - popl %esi | |
577 | - ret | |
578 | - | |
579 | -#endif | |
580 | - | |
581 | -/* | |
582 | -unsigned int csum_partial_copy_generic (const char *src, char *dst, | |
583 | - int len, int sum, int *src_err_ptr, int *dst_err_ptr) | |
584 | - */ | |
585 | - | |
586 | -/* | |
587 | - * Copy from ds while checksumming, otherwise like csum_partial | |
588 | - * | |
589 | - * The macros SRC and DST specify the type of access for the instruction. | |
590 | - * thus we can call a custom exception handler for all access types. | |
591 | - * | |
592 | - * FIXME: could someone double-check whether I haven't mixed up some SRC and | |
593 | - * DST definitions? It's damn hard to trigger all cases. I hope I got | |
594 | - * them all but there's no guarantee. | |
595 | - */ | |
596 | - | |
597 | -#define SRC(y...) \ | |
598 | - 9999: y; \ | |
599 | - .section __ex_table, "a"; \ | |
600 | - .long 9999b, 6001f ; \ | |
601 | - .previous | |
602 | - | |
603 | -#define DST(y...) \ | |
604 | - 9999: y; \ | |
605 | - .section __ex_table, "a"; \ | |
606 | - .long 9999b, 6002f ; \ | |
607 | - .previous | |
608 | - | |
609 | -.align 4 | |
610 | -.globl csum_partial_copy_generic | |
611 | - | |
612 | -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM | |
613 | - | |
614 | -#define ARGBASE 16 | |
615 | -#define FP 12 | |
616 | - | |
617 | -csum_partial_copy_generic: | |
618 | - subl $4,%esp | |
619 | - pushl %edi | |
620 | - pushl %esi | |
621 | - pushl %ebx | |
622 | - movl ARGBASE+16(%esp),%eax # sum | |
623 | - movl ARGBASE+12(%esp),%ecx # len | |
624 | - movl ARGBASE+4(%esp),%esi # src | |
625 | - movl ARGBASE+8(%esp),%edi # dst | |
626 | - | |
627 | - testl $2, %edi # Check alignment. | |
628 | - jz 2f # Jump if alignment is ok. | |
629 | - subl $2, %ecx # Alignment uses up two bytes. | |
630 | - jae 1f # Jump if we had at least two bytes. | |
631 | - addl $2, %ecx # ecx was < 2. Deal with it. | |
632 | - jmp 4f | |
633 | -SRC(1: movw (%esi), %bx ) | |
634 | - addl $2, %esi | |
635 | -DST( movw %bx, (%edi) ) | |
636 | - addl $2, %edi | |
637 | - addw %bx, %ax | |
638 | - adcl $0, %eax | |
639 | -2: | |
640 | - movl %ecx, FP(%esp) | |
641 | - shrl $5, %ecx | |
642 | - jz 2f | |
643 | - testl %esi, %esi | |
644 | -SRC(1: movl (%esi), %ebx ) | |
645 | -SRC( movl 4(%esi), %edx ) | |
646 | - adcl %ebx, %eax | |
647 | -DST( movl %ebx, (%edi) ) | |
648 | - adcl %edx, %eax | |
649 | -DST( movl %edx, 4(%edi) ) | |
650 | - | |
651 | -SRC( movl 8(%esi), %ebx ) | |
652 | -SRC( movl 12(%esi), %edx ) | |
653 | - adcl %ebx, %eax | |
654 | -DST( movl %ebx, 8(%edi) ) | |
655 | - adcl %edx, %eax | |
656 | -DST( movl %edx, 12(%edi) ) | |
657 | - | |
658 | -SRC( movl 16(%esi), %ebx ) | |
659 | -SRC( movl 20(%esi), %edx ) | |
660 | - adcl %ebx, %eax | |
661 | -DST( movl %ebx, 16(%edi) ) | |
662 | - adcl %edx, %eax | |
663 | -DST( movl %edx, 20(%edi) ) | |
664 | - | |
665 | -SRC( movl 24(%esi), %ebx ) | |
666 | -SRC( movl 28(%esi), %edx ) | |
667 | - adcl %ebx, %eax | |
668 | -DST( movl %ebx, 24(%edi) ) | |
669 | - adcl %edx, %eax | |
670 | -DST( movl %edx, 28(%edi) ) | |
671 | - | |
672 | - lea 32(%esi), %esi | |
673 | - lea 32(%edi), %edi | |
674 | - dec %ecx | |
675 | - jne 1b | |
676 | - adcl $0, %eax | |
677 | -2: movl FP(%esp), %edx | |
678 | - movl %edx, %ecx | |
679 | - andl $0x1c, %edx | |
680 | - je 4f | |
681 | - shrl $2, %edx # This clears CF | |
682 | -SRC(3: movl (%esi), %ebx ) | |
683 | - adcl %ebx, %eax | |
684 | -DST( movl %ebx, (%edi) ) | |
685 | - lea 4(%esi), %esi | |
686 | - lea 4(%edi), %edi | |
687 | - dec %edx | |
688 | - jne 3b | |
689 | - adcl $0, %eax | |
690 | -4: andl $3, %ecx | |
691 | - jz 7f | |
692 | - cmpl $2, %ecx | |
693 | - jb 5f | |
694 | -SRC( movw (%esi), %cx ) | |
695 | - leal 2(%esi), %esi | |
696 | -DST( movw %cx, (%edi) ) | |
697 | - leal 2(%edi), %edi | |
698 | - je 6f | |
699 | - shll $16,%ecx | |
700 | -SRC(5: movb (%esi), %cl ) | |
701 | -DST( movb %cl, (%edi) ) | |
702 | -6: addl %ecx, %eax | |
703 | - adcl $0, %eax | |
704 | -7: | |
705 | -5000: | |
706 | - | |
707 | -# Exception handler: | |
708 | -.section .fixup, "ax" | |
709 | - | |
710 | -6001: | |
711 | - movl ARGBASE+20(%esp), %ebx # src_err_ptr | |
712 | - movl $-EFAULT, (%ebx) | |
713 | - | |
714 | - # zero the complete destination - computing the rest | |
715 | - # is too much work | |
716 | - movl ARGBASE+8(%esp), %edi # dst | |
717 | - movl ARGBASE+12(%esp), %ecx # len | |
718 | - xorl %eax,%eax | |
719 | - rep ; stosb | |
720 | - | |
721 | - jmp 5000b | |
722 | - | |
723 | -6002: | |
724 | - movl ARGBASE+24(%esp), %ebx # dst_err_ptr | |
725 | - movl $-EFAULT,(%ebx) | |
726 | - jmp 5000b | |
727 | - | |
728 | -.previous | |
729 | - | |
730 | - popl %ebx | |
731 | - popl %esi | |
732 | - popl %edi | |
733 | - popl %ecx # equivalent to addl $4,%esp | |
734 | - ret | |
735 | - | |
736 | -#else | |
737 | - | |
738 | -/* Version for PentiumII/PPro */ | |
739 | - | |
740 | -#define ROUND1(x) \ | |
741 | - SRC(movl x(%esi), %ebx ) ; \ | |
742 | - addl %ebx, %eax ; \ | |
743 | - DST(movl %ebx, x(%edi) ) ; | |
744 | - | |
745 | -#define ROUND(x) \ | |
746 | - SRC(movl x(%esi), %ebx ) ; \ | |
747 | - adcl %ebx, %eax ; \ | |
748 | - DST(movl %ebx, x(%edi) ) ; | |
749 | - | |
750 | -#define ARGBASE 12 | |
751 | - | |
752 | -csum_partial_copy_generic: | |
753 | - pushl %ebx | |
754 | - pushl %edi | |
755 | - pushl %esi | |
756 | - movl ARGBASE+4(%esp),%esi #src | |
757 | - movl ARGBASE+8(%esp),%edi #dst | |
758 | - movl ARGBASE+12(%esp),%ecx #len | |
759 | - movl ARGBASE+16(%esp),%eax #sum | |
760 | -# movl %ecx, %edx | |
761 | - movl %ecx, %ebx | |
762 | - movl %esi, %edx | |
763 | - shrl $6, %ecx | |
764 | - andl $0x3c, %ebx | |
765 | - negl %ebx | |
766 | - subl %ebx, %esi | |
767 | - subl %ebx, %edi | |
768 | - lea -1(%esi),%edx | |
769 | - andl $-32,%edx | |
770 | - lea 3f(%ebx,%ebx), %ebx | |
771 | - testl %esi, %esi | |
772 | - jmp *%ebx | |
773 | -1: addl $64,%esi | |
774 | - addl $64,%edi | |
775 | - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) | |
776 | - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) | |
777 | - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) | |
778 | - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) | |
779 | - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) | |
780 | -3: adcl $0,%eax | |
781 | - addl $64, %edx | |
782 | - dec %ecx | |
783 | - jge 1b | |
784 | -4: movl ARGBASE+12(%esp),%edx #len | |
785 | - andl $3, %edx | |
786 | - jz 7f | |
787 | - cmpl $2, %edx | |
788 | - jb 5f | |
789 | -SRC( movw (%esi), %dx ) | |
790 | - leal 2(%esi), %esi | |
791 | -DST( movw %dx, (%edi) ) | |
792 | - leal 2(%edi), %edi | |
793 | - je 6f | |
794 | - shll $16,%edx | |
795 | -5: | |
796 | -SRC( movb (%esi), %dl ) | |
797 | -DST( movb %dl, (%edi) ) | |
798 | -6: addl %edx, %eax | |
799 | - adcl $0, %eax | |
800 | -7: | |
801 | -.section .fixup, "ax" | |
802 | -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr | |
803 | - movl $-EFAULT, (%ebx) | |
804 | - # zero the complete destination (computing the rest is too much work) | |
805 | - movl ARGBASE+8(%esp),%edi # dst | |
806 | - movl ARGBASE+12(%esp),%ecx # len | |
807 | - xorl %eax,%eax | |
808 | - rep; stosb | |
809 | - jmp 7b | |
810 | -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr | |
811 | - movl $-EFAULT, (%ebx) | |
812 | - jmp 7b | |
813 | -.previous | |
814 | - | |
815 | - popl %esi | |
816 | - popl %edi | |
817 | - popl %ebx | |
818 | - ret | |
819 | - | |
820 | -#undef ROUND | |
821 | -#undef ROUND1 | |
822 | - | |
823 | -#endif | |
824 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S | |
825 | --- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969 | |
826 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002 | |
827 | @@ -0,0 +1,97 @@ | |
828 | +/* | |
829 | + * INET An implementation of the TCP/IP protocol suite for the LINUX | |
830 | + * operating system. INET is implemented using the BSD Socket | |
831 | + * interface as the means of communication with the user level. | |
832 | + * | |
833 | + * IP/TCP/UDP checksumming routines | |
834 | + * | |
835 | + * Authors: Jorge Cwik, <jorge@laser.satlink.net> | |
836 | + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | |
837 | + * Tom May, <ftom@netcom.com> | |
838 | + * Pentium Pro/II routines: | |
839 | + * Alexander Kjeldaas <astor@guardian.no> | |
840 | + * Finn Arne Gangstad <finnag@guardian.no> | |
841 | + * Lots of code moved from tcp.c and ip.c; see those files | |
842 | + * for more names. | |
843 | + * | |
844 | + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception | |
845 | + * handling. | |
846 | + * Andi Kleen, add zeroing on error converted to pure assembler | |
847 | + * 2002-10-30 Denis Vlasenko | |
848 | + * boot-time benchmarking, 3Dnow/MMX+/SSE versions | |
849 | + * | |
850 | + * This program is free software; you can redistribute it and/or | |
851 | + * modify it under the terms of the GNU General Public License | |
852 | + * as published by the Free Software Foundation; either version | |
853 | + * 2 of the License, or (at your option) any later version. | |
854 | + */ | |
855 | + | |
856 | +/* | |
857 | +** computes a partial checksum, e.g. for TCP/UDP fragments | |
858 | +** | |
859 | +** unsigned int csum_partial(const unsigned char * buff, | |
860 | +** int len, unsigned int sum) | |
861 | +*/ | |
862 | + | |
863 | +.text | |
864 | +.align 4 | |
865 | +.globl csum_partial | |
866 | + | |
867 | +csum_partial: | |
868 | + pushl %esi | |
869 | + pushl %ebx | |
870 | + movl 20(%esp), %eax # arg: sum | |
871 | + movl 16(%esp), %ecx # arg: len | |
872 | + movl 12(%esp), %esi # arg: buf | |
873 | + | |
874 | + testl $3, %esi | |
875 | + jz 40f | |
876 | +20: | |
877 | + # not 4-aligned: analyze and align... | |
878 | + testl $1, %esi | |
879 | + jz 30f | |
880 | + | |
881 | + # unaligned start addr | |
882 | + decl %ecx | |
883 | + js 90f # sz==0, exit | |
884 | + movzbl (%esi), %ebx # eat one byte... | |
885 | + addl %ebx, %eax | |
886 | + adcl $0, %eax | |
887 | + roll $8, %eax # NB: need to be undone at exit! | |
888 | + incl %esi | |
889 | + testl $2, %esi | |
890 | + jz 40f | |
891 | +30: | |
892 | + # Note: 2-aligned, but not 4-aligned | |
893 | + cmpl $3, %ecx | |
894 | + jbe 60f | |
895 | + addw (%esi), %ax # eat 2 bytes | |
896 | + leal 2(%esi), %esi | |
897 | + adcl $0, %eax | |
898 | + subl $2, %ecx | |
899 | +40: | |
900 | + # esi is 4-aligned here, call block routine | |
901 | + movl $csum_basic, %ebx # known ok even for ecx==0 etc | |
902 | + cmpl $128, %ecx # use optimized routine | |
903 | + jb 50f # only for large blocks | |
904 | + movl best_csum, %ebx | |
905 | +50: call *%ebx | |
906 | +60: | |
907 | + # handle the last 0-3 bytes without much jumping | |
908 | + jecxz 80f | |
909 | + notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked | |
910 | + movl $0xffffff, %ebx # by the shll and shrl instructions | |
911 | + shll $3, %ecx | |
912 | + shrl %cl, %ebx | |
913 | + andl (%esi), %ebx # esi is 4-aligned so should be ok | |
914 | + addl %ebx, %eax | |
915 | + adcl $0, %eax | |
916 | +80: | |
917 | + # undo csum rotation if start addr was odd | |
918 | + testl $1, 12(%esp) | |
919 | + jz 90f | |
920 | + roll $8, %eax | |
921 | +90: | |
922 | + popl %ebx | |
923 | + popl %esi | |
924 | + ret | |
925 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S | |
926 | --- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969 | |
927 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002 | |
928 | @@ -0,0 +1,4 @@ | |
929 | +#define PREFETCH(a) prefetch a | |
930 | +#define NAME csum_3dnow | |
931 | + | |
932 | +#include "csum_pf.inc" | |
933 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S | |
934 | --- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969 | |
935 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002 | |
936 | @@ -0,0 +1,63 @@ | |
937 | +.text | |
938 | +.align 4 | |
939 | +.globl csum_basic | |
940 | + | |
941 | +/* Experiments with Ethernet and SLIP connections show that buff | |
942 | +** is aligned on either a 2-byte or 4-byte boundary. We get at | |
943 | +** least a twofold speedup on 486 and Pentium if it is 4-byte aligned. | |
944 | +** Fortunately, it is easy to convert 2-byte alignment to 4-byte | |
945 | +** alignment for the unrolled loop. | |
946 | +*/ | |
947 | +csum_basic: | |
948 | + movl %ecx, %ebx | |
949 | + movl %ecx, %edx | |
950 | + shrl $7, %ecx | |
951 | + andl $0x7c, %ebx | |
952 | + addl %ebx, %esi | |
953 | + shrl $2, %ebx | |
954 | + negl %ebx | |
955 | + leal 50f(%ebx,%ebx,2), %ebx | |
956 | + clc | |
957 | + jmp *%ebx | |
958 | +40: | |
959 | + leal 128(%esi), %esi | |
960 | + adcl -128(%esi), %eax | |
961 | + adcl -124(%esi), %eax | |
962 | + adcl -120(%esi), %eax | |
963 | + adcl -116(%esi), %eax | |
964 | + adcl -112(%esi), %eax | |
965 | + adcl -108(%esi), %eax | |
966 | + adcl -104(%esi), %eax | |
967 | + adcl -100(%esi), %eax | |
968 | + adcl -96(%esi), %eax | |
969 | + adcl -92(%esi), %eax | |
970 | + adcl -88(%esi), %eax | |
971 | + adcl -84(%esi), %eax | |
972 | + adcl -80(%esi), %eax | |
973 | + adcl -76(%esi), %eax | |
974 | + adcl -72(%esi), %eax | |
975 | + adcl -68(%esi), %eax | |
976 | + adcl -64(%esi), %eax | |
977 | + adcl -60(%esi), %eax | |
978 | + adcl -56(%esi), %eax | |
979 | + adcl -52(%esi), %eax | |
980 | + adcl -48(%esi), %eax | |
981 | + adcl -44(%esi), %eax | |
982 | + adcl -40(%esi), %eax | |
983 | + adcl -36(%esi), %eax | |
984 | + adcl -32(%esi), %eax | |
985 | + adcl -28(%esi), %eax | |
986 | + adcl -24(%esi), %eax | |
987 | + adcl -20(%esi), %eax | |
988 | + adcl -16(%esi), %eax | |
989 | + adcl -12(%esi), %eax | |
990 | + adcl -8(%esi), %eax | |
991 | + adcl -4(%esi), %eax | |
992 | +50: | |
993 | + decl %ecx | |
994 | + jge 40b | |
995 | + | |
996 | + adcl $0, %eax | |
997 | + movl %edx, %ecx | |
998 | + andl $3, %ecx | |
999 | + ret | |
1000 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S | |
1001 | --- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969 | |
1002 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002 | |
1003 | @@ -0,0 +1,17 @@ | |
1004 | +.text | |
1005 | +.align 4 | |
1006 | +.globl csum_naive | |
1007 | + | |
1008 | +csum_naive: | |
1009 | + mov %ecx, %edx | |
1010 | + shrl $2, %ecx | |
1011 | + clc | |
1012 | +1: | |
1013 | + adcl (%esi), %eax | |
1014 | + leal 4(%esi), %esi | |
1015 | + loop 1b | |
1016 | + | |
1017 | + adcl $0, %eax | |
1018 | + mov %edx, %ecx | |
1019 | + andl $3, %ecx | |
1020 | + ret | |
1021 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc | |
1022 | --- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969 | |
1023 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002 | |
1024 | @@ -0,0 +1,95 @@ | |
1025 | +//#define PREFETCH(a) prefetchnta a | |
1026 | +//#define PREFETCH(a) prefetch a | |
1027 | +//#define PREFETCH(a) | |
1028 | + | |
1029 | +// How much unrolling do you want? | |
1030 | +//vda: 5 is best on Duron 650 | |
1031 | +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes | |
1032 | + // NB: tweak unrolled loop too... | |
1033 | +/* | |
1034 | +** computes a partial checksum, e.g. for TCP/UDP fragments | |
1035 | +** int csum_partial(const char *buff, int len, int sum) | |
1036 | +*/ | |
1037 | + | |
1038 | +#define ITER_SZ (1<<ITER_BITS) | |
1039 | +#define ITER_MSK ((1<<ITER_BITS)-4) | |
1040 | + | |
1041 | +.text | |
1042 | +.align 4 | |
1043 | +.globl NAME | |
1044 | + | |
1045 | +NAME: | |
1046 | + | |
1047 | +# Guaranteed by caller: esi is 4-aligned, ecx>=16 | |
1048 | +10: | |
1049 | + PREFETCH((%esi)) # Prefetch _each_ cacheline | |
1050 | + PREFETCH(32(%esi)) # Note! Athlons have 64 bytes long ones, but | |
1051 | + PREFETCH(64(%esi)) # PIIIs only 32! This gives ~20% speedup | |
1052 | + PREFETCH(64+32(%esi)) # for PIII | |
1053 | + PREFETCH(128(%esi)) | |
1054 | + PREFETCH(128+32(%esi)) | |
1055 | + PREFETCH(192(%esi)) | |
1056 | + PREFETCH(192+32(%esi)) | |
1057 | + movl %ecx, %ebx | |
1058 | + movl %ecx, %edx | |
1059 | + andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration | |
1060 | + shrl $ITER_BITS, %ecx # = iterations to make | |
1061 | + addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration | |
1062 | + shrl $2, %ebx # = dwords to handle | |
1063 | + negl %ebx | |
1064 | + lea 50f(%ebx,%ebx,2), %ebx # = 50f - 3*dwords_to_handle | |
1065 | + clc | |
1066 | + jmp *%ebx # here we go! | |
1067 | + | |
1068 | +40: | |
1069 | + PREFETCH(256(%esi)) | |
1070 | +41: | |
1071 | + lea ITER_SZ(%esi), %esi # does NOT change CF! | |
1072 | +/* | |
1073 | + addl -128(%esi), %eax | |
1074 | + adcl -124(%esi), %eax | |
1075 | + adcl -120(%esi), %eax | |
1076 | + adcl -116(%esi), %eax | |
1077 | + adcl -112(%esi), %eax | |
1078 | + adcl -108(%esi), %eax | |
1079 | + adcl -104(%esi), %eax | |
1080 | + adcl -100(%esi), %eax | |
1081 | + adcl -96(%esi), %eax | |
1082 | + adcl -92(%esi), %eax | |
1083 | + adcl -88(%esi), %eax | |
1084 | + adcl -84(%esi), %eax | |
1085 | + adcl -80(%esi), %eax | |
1086 | + adcl -76(%esi), %eax | |
1087 | + adcl -72(%esi), %eax | |
1088 | + adcl -68(%esi), %eax | |
1089 | + adcl -64(%esi), %eax | |
1090 | + adcl -60(%esi), %eax | |
1091 | + adcl -56(%esi), %eax | |
1092 | + adcl -52(%esi), %eax | |
1093 | + adcl -48(%esi), %eax | |
1094 | + adcl -44(%esi), %eax | |
1095 | + adcl -40(%esi), %eax | |
1096 | + adcl -36(%esi), %eax | |
1097 | +*/ | |
1098 | + addl -32(%esi), %eax | |
1099 | + adcl -28(%esi), %eax | |
1100 | + adcl -24(%esi), %eax | |
1101 | + adcl -20(%esi), %eax | |
1102 | + adcl -16(%esi), %eax | |
1103 | + adcl -12(%esi), %eax | |
1104 | + adcl -8(%esi), %eax | |
1105 | + adcl -4(%esi), %eax | |
1106 | +50: | |
1107 | + adcl $0, %eax | |
1108 | + dec %ecx # does NOT change CF! | |
1109 | + # We can do just "jge 40b" here, but we can be a bit clever... | |
1110 | + # This little twist gives surprisingly noticeable benefits! | |
1111 | + # Seen 11% increase on random 1K blocks on Duron 650 | |
1112 | + js 60f | |
1113 | + cmp $256/ITER_SZ, %ecx | |
1114 | + jae 40b # need prefetch | |
1115 | + jmp 41b # do not need it | |
1116 | +60: | |
1117 | + movl %edx, %ecx | |
1118 | + andl $3, %ecx | |
1119 | + ret | |
1120 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S | |
1121 | --- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969 | |
1122 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002 | |
1123 | @@ -0,0 +1,4 @@ | |
1124 | +#define PREFETCH(a) prefetchnta a | |
1125 | +#define NAME csum_ssemmxplus | |
1126 | + | |
1127 | +#include "csum_pf.inc" | |
1128 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S | |
1129 | --- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969 | |
1130 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002 | |
1131 | @@ -0,0 +1,178 @@ | |
1132 | +/* | |
1133 | + * INET An implementation of the TCP/IP protocol suite for the LINUX | |
1134 | + * operating system. INET is implemented using the BSD Socket | |
1135 | + * interface as the means of communication with the user level. | |
1136 | + * | |
1137 | + * IP/TCP/UDP checksumming routines | |
1138 | + * | |
1139 | + * Authors: Jorge Cwik, <jorge@laser.satlink.net> | |
1140 | + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> | |
1141 | + * Tom May, <ftom@netcom.com> | |
1142 | + * Pentium Pro/II routines: | |
1143 | + * Alexander Kjeldaas <astor@guardian.no> | |
1144 | + * Finn Arne Gangstad <finnag@guardian.no> | |
1145 | + * Lots of code moved from tcp.c and ip.c; see those files | |
1146 | + * for more names. | |
1147 | + * | |
1148 | + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception | |
1149 | + * handling. | |
1150 | + * Andi Kleen, add zeroing on error converted to pure assembler | |
1151 | + * 2002-10-30 Denis Vlasenko | |
1152 | + * boot-time benchmarking, 3Dnow/MMX+/SSE versions | |
1153 | + * | |
1154 | + * This program is free software; you can redistribute it and/or | |
1155 | + * modify it under the terms of the GNU General Public License | |
1156 | + * as published by the Free Software Foundation; either version | |
1157 | + * 2 of the License, or (at your option) any later version. | |
1158 | + */ | |
1159 | + | |
1160 | +#include <asm/errno.h> | |
1161 | + | |
1162 | +/* | |
1163 | +** computes a partial checksum, e.g. for TCP/UDP fragments | |
1164 | +** | |
1165 | +** unsigned int csum_partial(const unsigned char * buff, | |
1166 | +** int len, unsigned int sum) | |
1167 | +*/ | |
1168 | + | |
1169 | +#ifdef __KERNEL__ | |
1170 | +#define K(a...) a | |
1171 | +#else | |
1172 | +#define K(a...) | |
1173 | +#endif | |
1174 | + | |
1175 | +#define SRC(y...) \ | |
1176 | +9999: y ;\ | |
1177 | + .section __ex_table, "a";\ | |
1178 | + .long 9999b, 6001f ;\ | |
1179 | + .previous | |
1180 | + | |
1181 | +#define DST(y...) \ | |
1182 | +9999: y ;\ | |
1183 | + .section __ex_table, "a";\ | |
1184 | + .long 9999b, 6002f ;\ | |
1185 | + .previous | |
1186 | + | |
1187 | +#define KERNEL_FPU_BEGIN \ | |
1188 | + call kernel_fpu_begin | |
1189 | + | |
1190 | +#define KERNEL_FPU_END(r) \ | |
1191 | +K( movl %cr0, r ;)\ | |
1192 | +K( orl $8, r ;)\ | |
1193 | +K( movl r, %cr0 ;) | |
1194 | + | |
1195 | +.text | |
1196 | + | |
1197 | +#include "csumcpy_naive.inc" | |
1198 | +#include "csumcpy_basic.inc" | |
1199 | +#include "csumcpy_ssemmxplus.inc" | |
1200 | +#include "csumcpy_sse.inc" | |
1201 | + | |
1202 | +.align 4 | |
1203 | +.globl csum_partial_copy_generic | |
1204 | + | |
1205 | +csum_partial_copy_generic: | |
1206 | + pushl %ebx | |
1207 | + pushl %edi | |
1208 | + pushl %esi | |
1209 | + pushl %ebp | |
1210 | + movl %esp, %ebp | |
1211 | + | |
1212 | +#define STK_DERR 40(%ebp) | |
1213 | +#define STK_SERR 36(%ebp) | |
1214 | +#define STK_SUM 32(%ebp) | |
1215 | +#define STK_LEN 28(%ebp) | |
1216 | +#define STK_DST 24(%ebp) | |
1217 | +#define STK_SRC 20(%ebp) | |
1218 | +#define STK_EIP 16(%ebp) | |
1219 | +#define STK_EBX 12(%ebp) | |
1220 | +#define STK_EDI 8(%ebp) | |
1221 | +#define STK_ESI 4(%ebp) | |
1222 | +#define STK_EBP (%ebp) | |
1223 | + | |
1224 | + movl STK_SRC, %esi #src | |
1225 | + movl STK_DST, %edi #dst | |
1226 | + movl STK_LEN, %ecx #len | |
1227 | + movl STK_SUM, %eax #sum | |
1228 | + | |
1229 | + testl $3, %edi # Check dst alignment | |
1230 | + jz 40f | |
1231 | + | |
1232 | + # not 4-aligned: analyze and align... | |
1233 | + testl $1, %edi | |
1234 | + jz 30f | |
1235 | + | |
1236 | + # unaligned start addr | |
1237 | + decl %ecx | |
1238 | + js 90f # sz==0, exit | |
1239 | + movzbl (%esi), %ebx # eat one byte... | |
1240 | + movb %bl, (%edi) | |
1241 | + addl %ebx, %eax | |
1242 | + adcl $0, %eax | |
1243 | + roll $8, %eax # NB: need to be undone at exit! | |
1244 | + incl %esi | |
1245 | + incl %edi | |
1246 | + testl $2, %edi | |
1247 | + jz 40f | |
1248 | +30: | |
4bf063fb | 1249 | + # Note: 2-aligned, but not 4-aligned
e6d11017 JR |
1250 | + cmpl $3, %ecx |
1251 | + jbe 60f | |
1252 | + movw (%esi), %bx # eat 2 bytes | |
1253 | + addw %bx, %ax | |
1254 | + movw %bx, (%edi) | |
1255 | + adcl $0, %eax | |
1256 | + leal 2(%esi), %esi | |
1257 | + leal 2(%edi), %edi | |
1258 | + subl $2, %ecx | |
1259 | +40: | |
1260 | + # edi is 4-aligned now: call block routine | |
1261 | + movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc | |
1262 | + cmpl $128, %ecx # use optimized routine | |
1263 | + jb 50f # only for large blocks | |
1264 | + movl best_csumcpy, %ebx | |
1265 | +50: call *%ebx | |
1266 | +60: | |
1267 | + # handle last 0-3 bytes | |
1268 | + jecxz 80f | |
1269 | + cmpl $2, %ecx | |
1270 | + jb 70f | |
1271 | +SRC( movw (%esi), %cx ) | |
1272 | + leal 2(%esi), %esi | |
1273 | +DST( movw %cx, (%edi) ) | |
1274 | + leal 2(%edi), %edi | |
1275 | + je 75f | |
1276 | + shll $16, %ecx | |
1277 | +70: | |
1278 | +SRC( movb (%esi), %cl ) | |
1279 | +DST( movb %cl, (%edi) ) | |
1280 | +75: addl %ecx, %eax | |
1281 | + adcl $0, %eax | |
1282 | +80: | |
1283 | + # undo csum rotation if dst was unaligned | |
1284 | + testl $1, STK_DST | |
1285 | + jz 90f | |
1286 | + roll $8, %eax | |
1287 | +90: | |
1288 | + movl %esp, %ebp | |
1289 | + popl %ebp | |
1290 | + popl %esi | |
1291 | + popl %edi | |
1292 | + popl %ebx | |
1293 | + ret | |
1294 | + | |
1295 | + | |
1296 | +.section .fixup, "ax" | |
1297 | +6001: movl STK_SERR, %ebx # src_err_ptr | |
1298 | + movl $-EFAULT, (%ebx) | |
1299 | + # zero the complete destination (computing the rest is too much work) | |
1300 | + movl STK_DST, %edi # dst | |
1301 | + movl STK_LEN, %ecx # len | |
1302 | + xorl %eax, %eax | |
1303 | + cld | |
1304 | + rep; stosb | |
1305 | + jmp 90b | |
1306 | +6002: movl STK_DERR, %ebx # dst_err_ptr | |
1307 | + movl $-EFAULT, (%ebx) | |
1308 | + jmp 90b | |
1309 | +.previous | |
1310 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc | |
1311 | --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969 | |
1312 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002 | |
1313 | @@ -0,0 +1,40 @@ | |
1314 | +// Please somebody experiment with unroll length | |
1315 | +// on a PII. Do _not_ optimize for PIII/Athlons/etc, | |
1316 | +// they won't typically use this... | |
1317 | + | |
1318 | +.align 4 | |
1319 | +.globl csumcpy_basic | |
1320 | + | |
1321 | +csumcpy_basic: | |
1322 | + movl %ecx, %ebx | |
1323 | + movl %ecx, %edx | |
1324 | + shrl $6, %ecx | |
1325 | + andl $0x3c, %ebx | |
1326 | + negl %ebx | |
1327 | + subl %ebx, %esi | |
1328 | + subl %ebx, %edi | |
1329 | + leal 50f(%ebx,%ebx), %ebx | |
1330 | + clc | |
1331 | + jmp *%ebx | |
1332 | +40: | |
1333 | + leal 64(%esi), %esi | |
1334 | + leal 64(%edi), %edi | |
1335 | + | |
1336 | +#undef ROUND | |
1337 | +#define ROUND(x) \ | |
1338 | +SRC( movl x(%esi), %ebx ); \ | |
1339 | + adcl %ebx, %eax ; \ | |
1340 | +DST( movl %ebx, x(%edi) ); | |
1341 | + | |
1342 | + ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52) | |
1343 | + ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36) | |
1344 | + ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20) | |
1345 | + ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4) | |
1346 | +50: | |
1347 | + decl %ecx | |
1348 | + jge 40b | |
1349 | + | |
1350 | + adcl $0, %eax | |
1351 | + movl %edx, %ecx | |
1352 | + andl $3, %ecx | |
1353 | + ret | |
1354 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc | |
1355 | --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969 | |
1356 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002 | |
1357 | @@ -0,0 +1,21 @@ | |
1358 | +// Heh... at least it's small ;) | |
1359 | + | |
1360 | +.align 4 | |
1361 | +.globl csumcpy_naive | |
1362 | + | |
1363 | +csumcpy_naive: | |
1364 | + mov %ecx, %edx | |
1365 | + shrl $2, %ecx | |
1366 | + clc | |
1367 | +1: | |
1368 | +SRC( movl (%esi), %ebx ) | |
1369 | +DST( movl %ebx, (%edi) ) | |
1370 | + adcl %ebx, %eax | |
1371 | + leal 4(%esi), %esi | |
1372 | + leal 4(%edi), %edi | |
1373 | + loop 1b | |
1374 | + | |
1375 | + adcl $0, %eax | |
1376 | + mov %edx, %ecx | |
1377 | + and $3, %ecx | |
1378 | + ret | |
1379 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc | |
1380 | --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969 | |
1381 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002 | |
1382 | @@ -0,0 +1,147 @@ | |
1383 | +// Huge routine, I don't like its size and number | |
1384 | +// of fixups... think of that when you want | |
1385 | +// to unroll loop more | |
1386 | +// TODO: benchmark and reduce size | |
1387 | +// I won't stand a 1K behemoth just for 5% speedup | |
1388 | + | |
1389 | +#undef PREFETCH | |
1390 | +#define PREFETCH(a) prefetchnta a | |
1391 | + | |
1392 | +// How much unrolling do you want? | |
1393 | +// vda: celeron 1200: 5 with movaps, 4 with movups | |
1394 | +#undef ITER_BITS | |
1395 | +#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes | |
1396 | + // NB: tweak unrolled loop too... | |
1397 | + | |
1398 | +#undef ITER_SZ | |
1399 | +#undef ITER_MSK | |
1400 | +#define ITER_SZ (1<<ITER_BITS) | |
1401 | +#define ITER_MSK ((1<<ITER_BITS)-4) | |
1402 | + | |
1403 | +.align 4 | |
1404 | +.globl csumcpy_sse | |
1405 | + | |
1406 | +csumcpy_sse: | |
1407 | + testl $0xe, %edi # Check alignment | |
1408 | + jnz 5500f # align to 16 bytes | |
1409 | +1: | |
1410 | + movl %ecx, %edx | |
1411 | + shrl $ITER_BITS, %ecx | |
1412 | + jz 20f | |
1413 | + | |
1414 | +# "big chunks" loop | |
1415 | + PREFETCH((%esi)) # Prefetch a couple of cachelines | |
1416 | + PREFETCH(32(%esi)) // Note: Athlons have 64 bytes long ones, but | |
1417 | + PREFETCH(64(%esi)) // PIIIs only 32! This gives ~20% speedup | |
1418 | + PREFETCH(64+32(%esi)) // for PIII | |
1419 | + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons | |
1420 | + PREFETCH(128+32(%esi)) // let them enjoy 256 | |
1421 | + PREFETCH(192(%esi)) | |
1422 | + PREFETCH(192+32(%esi)) | |
1423 | + | |
1424 | + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :( | |
1425 | + subl $32, %esp // hopefully this is not too slow... | |
1426 | +K( movl %cr0, %ebx ) | |
1427 | +K( clts ) | |
1428 | + movups %xmm0, (%esp) | |
1429 | + movups %xmm1, 16(%esp) | |
1430 | + | |
1431 | + | |
1432 | +#undef ROUND0 | |
1433 | +#undef ROUND | |
1434 | +#define ROUND0(au,r) \ | |
1435 | +SRC( mov##au##ps (%esi), r ;) \ | |
1436 | + adcl (%esi), %eax ; \ | |
1437 | + adcl 4(%esi), %eax ; \ | |
1438 | + adcl 8(%esi), %eax ; \ | |
1439 | + adcl 12(%esi), %eax ; \ | |
1440 | +DST( movntps r, (%edi) ;) \ | |
1441 | + | |
1442 | +#define ROUND(au,x,r) \ | |
1443 | +SRC( mov##au##ps x(%esi), r ;) \ | |
1444 | + adcl x(%esi), %eax ; \ | |
1445 | + adcl x+4(%esi), %eax ; \ | |
1446 | + adcl x+8(%esi), %eax ; \ | |
1447 | + adcl x+12(%esi), %eax; \ | |
1448 | +DST( movntps r, x(%edi) ;) \ | |
1449 | + | |
1450 | +// ROUND[0]: edi must be 16-aligned! | |
1451 | +// if esi is not aligned, movaps wouldn't work, | |
1452 | +// not caught by testsuite. TODO. | |
1453 | +// We don't need SRC() around adcl's | |
1454 | +// (exception, if any, would be caught by 1st one) | |
1455 | +// (FIXME: can races against interrupts bite us?) | |
1456 | + | |
1457 | + testl $0xf, %esi # Check esi alignment + clear CF | |
1458 | + jz 15f | |
1459 | +10: # esi is NOT 16-aligned | |
1460 | + PREFETCH(256(%esi)) | |
1461 | + ROUND0(u,%xmm0) | |
1462 | + ROUND(u,16,%xmm1) | |
1463 | + PREFETCH(256+32(%esi)) | |
1464 | + ROUND(u,32,%xmm0) | |
1465 | + ROUND(u,48,%xmm1) | |
1466 | + lea ITER_SZ(%esi), %esi | |
1467 | + lea ITER_SZ(%edi), %edi | |
1468 | + //dec %ecx | |
1469 | + //jnz 10b | |
1470 | + loop 10b // Beware: loop and ITER_BITS>6 don't mix | |
1471 | + adcl $0, %eax | |
1472 | + jmp 19f | |
1473 | +15: # esi is 16-aligned | |
1474 | + PREFETCH(256(%esi)) | |
1475 | + ROUND0(a,%xmm0) | |
1476 | + ROUND(a,16,%xmm1) | |
1477 | + PREFETCH(256+32(%esi)) | |
1478 | + ROUND(a,32,%xmm0) | |
1479 | + ROUND(a,48,%xmm1) | |
1480 | + lea ITER_SZ(%esi), %esi | |
1481 | + lea ITER_SZ(%edi), %edi | |
1482 | + //dec %ecx | |
1483 | + //jnz 15b | |
1484 | + loop 15b // Beware: loop and ITER_BITS>6 don't mix | |
1485 | + adcl $0, %eax | |
1486 | +19: | |
1487 | + sfence # clean up XMM | |
1488 | + //KERNEL_FPU_END(%ebx) | |
1489 | + movups (%esp), %xmm0 | |
1490 | + movups 16(%esp), %xmm1 | |
1491 | + addl $32, %esp | |
1492 | +K( movl %ebx, %cr0 ) | |
1493 | + | |
1494 | +20: | |
1495 | + # loop for dwords | |
1496 | + movl %edx, %ecx | |
1497 | + andl $ITER_MSK, %edx | |
1498 | + jz 40f | |
1499 | + shrl $2, %edx # this also clears CF | |
1500 | +30: | |
1501 | +SRC( movl (%esi), %ebx ) | |
1502 | + adcl %ebx, %eax | |
1503 | +DST( movl %ebx, (%edi) ) | |
1504 | + lea 4(%esi), %esi | |
1505 | + lea 4(%edi), %edi | |
1506 | + dec %edx | |
1507 | + jnz 30b | |
1508 | + adcl $0, %eax | |
1509 | +40: | |
1510 | + # last 1, 2 or 3 bytes: handled by caller | |
1511 | + andl $3, %ecx | |
1512 | + ret | |
1513 | + | |
1514 | + | |
4bf063fb | 1515 | +# Note: align edi to 16 bytes, then jump back
e6d11017 JR |
1516 | +5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here |
1517 | + mov %ecx, %edx # edx needed at 20: | |
1518 | + jb 20b # not worthy: too short | |
1519 | + | |
1520 | +5520: test $0xe, %edi # loop until we are 16-aligned | |
1521 | + jz 1b | |
1522 | +SRC( movl (%esi), %ebx ) | |
1523 | + addl $4, %esi | |
1524 | +DST( movl %ebx, (%edi) ) | |
1525 | + addl $4, %edi | |
1526 | + addl %ebx, %eax | |
1527 | + adcl $0, %eax | |
1528 | + subl $4, %ecx | |
1529 | + jmp 5520b | |
1530 | diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc | |
1531 | --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969 | |
1532 | +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002 | |
1533 | @@ -0,0 +1,103 @@ | |
1534 | +#undef PREFETCH | |
1535 | +#define PREFETCH(a) prefetchnta a | |
1536 | + | |
1537 | +// How much unrolling do you want? | |
1538 | +#undef ITER_BITS | |
1539 | +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes | |
1540 | + // NB: tweak unrolled loop too... | |
1541 | + | |
1542 | +#undef ITER_SZ | |
1543 | +#undef ITER_MSK | |
1544 | +#define ITER_SZ (1<<ITER_BITS) | |
1545 | +#define ITER_MSK ((1<<ITER_BITS)-4) | |
1546 | + | |
1547 | +.align 4 | |
1548 | +.globl csumcpy_ssemmxplus | |
1549 | + | |
1550 | +csumcpy_ssemmxplus: | |
1551 | + movl %ecx, %edx | |
1552 | + shrl $ITER_BITS, %ecx | |
1553 | + jz 20f | |
1554 | + | |
1555 | +# "big chunks" loop | |
1556 | + PREFETCH((%esi)) # Prefetch a couple of cachelines | |
1557 | + PREFETCH(32(%esi)) // Note: Athlons have 64 bytes long ones, but | |
1558 | + PREFETCH(64(%esi)) // PIIIs only 32! This gives ~20% speedup | |
1559 | + PREFETCH(64+32(%esi)) // for PIII | |
1560 | + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons | |
1561 | + PREFETCH(128+32(%esi)) // let them enjoy 256 | |
1562 | + PREFETCH(192(%esi)) | |
1563 | + PREFETCH(192+32(%esi)) | |
1564 | + | |
1565 | + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :( | |
1566 | +K( movl %cr0, %ebx ) | |
1567 | +K( clts ) | |
1568 | + subl $108, %esp | |
1569 | + fnsave (%esp) | |
1570 | + fwait | |
1571 | + | |
1572 | + clc | |
1573 | + | |
1574 | +#undef ROUND0 | |
1575 | +#undef ROUND | |
1576 | +#define ROUND0(r) \ | |
1577 | +SRC( movq (%esi), r ;) \ | |
1578 | + adcl (%esi), %eax ; \ | |
1579 | + adcl 4(%esi), %eax ; \ | |
1580 | +DST( movntq r, (%edi) ;) \ | |
1581 | + | |
1582 | +#define ROUND(x,r) \ | |
1583 | +SRC( movq x(%esi), r ;) \ | |
1584 | + adcl x(%esi), %eax ; \ | |
1585 | + adcl x+4(%esi), %eax ; \ | |
1586 | +DST( movntq r, x(%edi) ;) \ | |
1587 | + | |
1588 | +// moving store to the end of a ROUND makes it faster | |
1589 | +// don't ask me why | |
1590 | +// we don't need SRC() around adcl's | |
1591 | +// (exception, if any, would be caught by 1st one) | |
1592 | +// (FIXME: can races against interrupts bite us?) | |
1593 | + | |
1594 | +10: | |
1595 | + PREFETCH(256(%esi)) | |
1596 | + ROUND0(%mm0) // using mm1,2,3 does not speed up things | |
1597 | + ROUND(8,%mm0) | |
1598 | + ROUND(16,%mm0) | |
1599 | + ROUND(24,%mm0) | |
1600 | +/* PREFETCH(256+32(%esi)) | |
1601 | + ROUND(32,%mm0) | |
1602 | + ROUND(40,%mm0) | |
1603 | + ROUND(48,%mm0) | |
1604 | + ROUND(56,%mm0)*/ | |
1605 | + | |
1606 | + lea ITER_SZ(%esi), %esi | |
1607 | + lea ITER_SZ(%edi), %edi | |
1608 | + //dec %ecx | |
1609 | + //jnz 10b | |
1610 | + loop 10b // Beware: loop and ITER_BITS>5 don't mix | |
1611 | + adcl $0, %eax | |
1612 | + | |
1613 | + sfence | |
1614 | + //KERNEL_FPU_END(%ebx) | |
1615 | + frstor (%esp) | |
1616 | + addl $108, %esp | |
1617 | +K( movl %ebx, %cr0 ) | |
1618 | + | |
1619 | +20: | |
1620 | + # loop for dwords | |
1621 | + movl %edx, %ecx | |
1622 | + andl $ITER_MSK, %edx | |
1623 | + jz 40f | |
1624 | + shrl $2, %edx # this also clears CF | |
1625 | +30: | |
1626 | +SRC( movl (%esi), %ebx ) | |
1627 | + adcl %ebx, %eax | |
1628 | +DST( movl %ebx, (%edi) ) | |
1629 | + lea 4(%esi), %esi | |
1630 | + lea 4(%edi), %edi | |
1631 | + dec %edx | |
1632 | + jnz 30b | |
1633 | + adcl $0, %eax | |
1634 | + | |
1635 | +40: andl $3, %ecx | |
1636 | + ret |