1 New csum functions optimized for different processors.
2 Author: Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua>
4 diff -urN linux-2.4.20-pre11/arch/i386/lib/Makefile linux-2.4.20-pre11csum/arch/i386/lib/Makefile
5 --- linux-2.4.20-pre11/arch/i386/lib/Makefile Mon Sep 10 12:31:30 2001
6 +++ linux-2.4.20-pre11csum/arch/i386/lib/Makefile Fri Nov 1 23:55:58 2002
11 -obj-y = checksum.o old-checksum.o delay.o \
12 +obj-y = old-checksum.o delay.o \
13 usercopy.o getuser.o \
25 obj-$(CONFIG_X86_USE_3DNOW) += mmx.o
26 obj-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
27 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_csum.c linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c
28 --- linux-2.4.20-pre11/arch/i386/lib/bench_csum.c Wed Dec 31 22:00:00 1969
29 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_csum.c Sat Nov 2 11:51:40 2002
31 +#include <linux/mm.h>			// for __get_free_pages
32 +#include <asm/uaccess.h> // for access_ok in asm/checksum.h
33 +#include <linux/in6.h> // for in6_addr in asm/checksum.h
34 +#include <asm/byteorder.h> // for ntoh in asm/checksum.h
35 +#include <asm/cpufeature.h> // for X86_FEATURE_xx
36 +#include <linux/byteorder/generic.h> // for ntohX in asm/checksum.h
37 +#include <linux/stddef.h> // for NULL in asm/checksum.h
38 +#include <linux/linkage.h> // for asmlinkage in asm/checksum.h
39 +#include <linux/module.h>
41 +#include <asm/checksum.h>
42 +#include "bench_func.h"
44 +//#define dprintk(a...) printk(a)
45 +#define dprintk(a...) ((void)0)
47 +/* Features usable for mem optimization:
49 +X86_FEATURE_FPU Onboard FPU
50 +X86_FEATURE_MMX Multimedia Extensions
51 +X86_FEATURE_XMM Streaming SIMD Extensions
52 +X86_FEATURE_XMM2 Streaming SIMD Extensions-2
54 +X86_FEATURE_3DNOW 3DNow!
55 +X86_FEATURE_MMXEXT AMD MMX extensions
56 +X86_FEATURE_3DNOWEXT AMD 3DNow! extensions
58 +X86_FEATURE_CXMMX Cyrix MMX extensions
61 +typedef typeof(jiffies) jiffies_t;
63 +typedef void asm_helper(void);
65 +extern asm_helper csum_basic;
66 +extern asm_helper csum_naive;
67 +extern asm_helper csum_3dnow;
68 +extern asm_helper csum_ssemmxplus;
70 +static struct candidate csum_runner[] = {
71 + { "basic" , csum_basic , 1, { -1 } },
72 + { "simple" , csum_naive , 1, { -1 } },
73 + { "3Dnow!" , csum_3dnow , 1, { X86_FEATURE_3DNOW, -1 } },
74 + { "AMD MMX", csum_ssemmxplus, 1, { X86_FEATURE_MMXEXT, -1 } },
75 + { "SSE1+", csum_ssemmxplus, 1, { X86_FEATURE_XMM, -1 } },
78 +extern asm_helper csumcpy_basic;
79 +extern asm_helper csumcpy_naive;
80 +extern asm_helper csumcpy_ssemmxplus;
81 +extern asm_helper csumcpy_sse;
83 +static struct candidate csumcpy_runner[] = {
84 + { "basic" , csumcpy_basic , 2, { -1 } },
85 + { "simple" , csumcpy_naive , 2, { -1 } },
86 + /* higher weight: we prefer these for less cache pollution: */
87 + { "AND MMX", csumcpy_ssemmxplus, 3, { X86_FEATURE_MMXEXT, -1 } },
88 + { "SSE1+", csumcpy_ssemmxplus, 3, { X86_FEATURE_XMM, -1 } },
89 + { "SSE1" , csumcpy_sse , 3, { X86_FEATURE_XMM, -1 } },
92 +//====== TODO: split here: above: arch, below: generic
94 +/* set this to value bigger than cache(s) */
95 +/* TODO: heuristic for buffer size */
96 +#define bufshift 20 /* 10=1KB, 20=1MB, etc. */
97 +/* typical size of a packet */
98 +#define chunksz (4*1024)
100 +#define bufsz (1<<bufshift)
101 +#define chunkcnt (bufsz/chunksz)
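With the values above this works out to bufsz = 1 MB and chunkcnt = 256, i.e. one benchmark pass checksums 256 distinct 4 KB chunks; 1 MB is comfortably bigger than the caches of the CPUs this targets, so the loop measures memory-bound throughput, not cache-warm speed.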
103 +#define VECTOR_SZ(a) (sizeof(a)/sizeof((a)[0]))
105 +asm_helper *best_csum = csum_basic;
106 +asm_helper *best_csumcpy = csumcpy_basic;
109 +** Count the number of iterations done during a fixed period,
110 +** and use this to calculate throughput.
113 +static int duration = 1; // jiffies for each run
117 +wait_for_jiffy(void) {
118 + jiffies_t now = jiffies;
119 + while(now == jiffies) cpu_relax();
123 +bench_csum(struct candidate *cand, char *buf)
126 + best_csum = (asm_helper*)(cand->f);
129 +	// In practice these are pretty repeatable,
130 +	// so 3 runs is overkill
131 + for(i=0; i<3; i++) {
135 + limit = jiffies+duration;
136 + while(time_before(jiffies, limit)) {
139 + // interleaved to avoid bias due to prefetch
140 + for(i=0; i<chunkcnt; i+=2)
141 + csum_partial(buf+i*chunksz, chunksz, 0);
142 + for(i=1; i<chunkcnt; i+=2)
143 + csum_partial(buf+i*chunksz, chunksz, 0);
148 + dprintk(" count =%6i\n",count);
154 + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
155 + printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
156 + kb_sec / 1000, kb_sec % 1000);
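Worked example of the formula above: with HZ = 100, duration = 1 jiffy and the 1 MB buffer, kb_sec = max * 1024 * 100, so a best count of 25 passes prints as 2560.000 MB/sec. Note the mixed units: the KB here is 1024 bytes, while the printed "MB" is 1000 KB.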
163 +bench_csumcpy(struct candidate *cand, char *buf)
167 + best_csumcpy = (asm_helper*)(cand->f);
170 + for(i=0; i<3; i++) {
174 + limit = jiffies+duration;
175 + while(time_before(jiffies, limit)) {
178 + // interleaved to avoid bias due to prefetch
179 + for(i=0; i<chunkcnt; i+=2)
180 + csum_partial_copy_generic(buf+i*chunksz,
181 + buf+(chunkcnt-1-i)*chunksz,
182 + chunksz, 0, &err, &err);
183 + for(i=1; i<chunkcnt; i+=2)
184 + csum_partial_copy_generic(buf+i*chunksz,
185 + buf+(chunkcnt-1-i)*chunksz,
186 + chunksz, 0, &err, &err);
191 + dprintk(" count =%6i\n",count);
197 + int kb_sec = max * (((chunksz*chunkcnt)/1024) * HZ) / duration;
198 + printk(" %-10s:%6d.%03d MB/sec\n", cand->name,
199 + kb_sec / 1000, kb_sec % 1000);
206 +find_best_csum(void)
208 + struct candidate *best;
209 + char *buffer = (char *) __get_free_pages(GFP_KERNEL,
210 + (bufshift-PAGE_SHIFT));
212 + printk(KERN_INFO "Measuring network checksumming speed\n");
214 + printk("csum: cannot allocate %i pages\n",
215 + 1<<(bufshift-PAGE_SHIFT)
219 + dprintk("allocated %i pages\n",1<<(bufshift-PAGE_SHIFT));
221 + // find # of jiffies suitable for reliable results
222 +	// (at least 5% accuracy: a count of 20+ keeps the +/-1 iteration error under 1/20)
223 + while(bench_csumcpy(&csumcpy_runner[0], buffer)<20) {
226 + dprintk("test run will last %i ticks\n", duration);
229 + best = find_best(bench_csum, buffer, csum_runner,
230 + VECTOR_SZ(csum_runner));
231 + printk("csum: using csum function: %s\n", best->name);
232 + best_csum = (asm_helper*)(best->f);
234 + best = find_best(bench_csumcpy, buffer, csumcpy_runner,
235 + VECTOR_SZ(csumcpy_runner));
236 + printk("csum: using csum_copy function: %s\n", best->name);
237 + best_csumcpy = (asm_helper*)(best->f);
239 + free_pages((unsigned long)buffer, (bufshift-PAGE_SHIFT));
240 + dprintk("freed %i pages\n",1<<(bufshift-PAGE_SHIFT));
244 +MODULE_LICENSE("GPL");
246 +module_init(find_best_csum);
247 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.c linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c
248 --- linux-2.4.20-pre11/arch/i386/lib/bench_func.c Wed Dec 31 22:00:00 1969
249 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.c Fri Nov 1 18:08:37 2002
251 +#include <linux/kernel.h> // for KERN_DEBUG
253 +#include <asm/bitops.h> // for test_bit
254 +#include <asm/processor.h> // cpu caps
255 +#include <asm/cpufeature.h> // cpu features constants
256 +#include "bench_func.h"
258 +//#define dprintk(a...) printk(a)
259 +#define dprintk(a...) ((void)0)
261 +// 2.4 only, already in 2.5
263 +boot_cpu_has(int cap)
265 + return test_bit(cap, boot_cpu_data.x86_capability);
269 +cpu_supports(int *cap)
271 + while(*cap != -1) {
272 + if(!boot_cpu_has(*cap)) {
273 + dprintk("unsupported caps: %i\n", *cap);
282 +** Call all the candidates which can be run on this CPU,
286 +find_best(bench_func *bench, char *opaque, struct candidate runner[], int count)
288 + int score, max = 0;
289 + struct candidate *best = 0;
291 + if(!cpu_supports(runner->cpu_caps_needed)) {
292 + printk("func %s skipped: not supported by CPU\n", runner->name);
294 + score = bench(runner,opaque) * runner->weight;
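Most of the loop body is elided by the hunk above; roughly, find_best() amounts to the sketch below (an illustration, not the exact patch code):

	/* Sketch only: bench each candidate the CPU supports and
	 * keep the one with the highest weighted score. */
	struct candidate *find_best(bench_func *bench, char *opaque,
				    struct candidate runner[], int count)
	{
		int score, max = 0;
		struct candidate *best = NULL;
		int i;

		for (i = 0; i < count; i++, runner++) {
			if (!cpu_supports(runner->cpu_caps_needed)) {
				printk("func %s skipped: not supported by CPU\n",
					runner->name);
				continue;
			}
			/* weight biases the choice, e.g. toward routines
			 * that pollute the cache less */
			score = bench(runner, opaque) * runner->weight;
			if (score > max) {
				max = score;
				best = runner;
			}
		}
		return best;
	}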
304 diff -urN linux-2.4.20-pre11/arch/i386/lib/bench_func.h linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h
305 --- linux-2.4.20-pre11/arch/i386/lib/bench_func.h Wed Dec 31 22:00:00 1969
306 +++ linux-2.4.20-pre11csum/arch/i386/lib/bench_func.h Fri Nov 1 18:08:37 2002
308 +#ifndef _BENCH_FUNC_H
309 +#define _BENCH_FUNC_H
313 + void *f; // pointer to func
315 + int cpu_caps_needed[4];
318 +typedef int bench_func(struct candidate *cand, char *opaque);
320 +struct candidate* find_best(bench_func *bench, char *opaque,
321 + struct candidate runner[], int count);
324 diff -urN linux-2.4.20-pre11/arch/i386/lib/checksum.S linux-2.4.20-pre11csum/arch/i386/lib/checksum.S
325 --- linux-2.4.20-pre11/arch/i386/lib/checksum.S Fri Nov 1 18:06:59 2002
326 +++ linux-2.4.20-pre11csum/arch/i386/lib/checksum.S Wed Dec 31 22:00:00 1969
329 - * INET An implementation of the TCP/IP protocol suite for the LINUX
330 - * operating system. INET is implemented using the BSD Socket
331 - * interface as the means of communication with the user level.
333 - * IP/TCP/UDP checksumming routines
335 - * Authors: Jorge Cwik, <jorge@laser.satlink.net>
336 - * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
337 - * Tom May, <ftom@netcom.com>
338 - * Pentium Pro/II routines:
339 - * Alexander Kjeldaas <astor@guardian.no>
340 - * Finn Arne Gangstad <finnag@guardian.no>
341 - * Lots of code moved from tcp.c and ip.c; see those files
344 - * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
346 - * Andi Kleen, add zeroing on error
347 - * converted to pure assembler
349 - * This program is free software; you can redistribute it and/or
350 - * modify it under the terms of the GNU General Public License
351 - * as published by the Free Software Foundation; either version
352 - * 2 of the License, or (at your option) any later version.
355 -#include <linux/config.h>
356 -#include <asm/errno.h>
359 - * computes a partial checksum, e.g. for TCP/UDP fragments
363 -unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
370 -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
373 - * Experiments with Ethernet and SLIP connections show that buff
374 - * is aligned on either a 2-byte or 4-byte boundary. We get at
375 - * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
376 - * Fortunately, it is easy to convert 2-byte alignment to 4-byte
377 - * alignment for the unrolled loop.
382 - movl 20(%esp),%eax # Function arg: unsigned int sum
383 - movl 16(%esp),%ecx # Function arg: int len
384 - movl 12(%esp),%esi # Function arg: unsigned char *buff
385 - testl $3, %esi # Check alignment.
386 - jz 2f # Jump if alignment is ok.
387 - testl $1, %esi # Check alignment.
388 - jz 10f # Jump if alignment is boundary of 2bytes.
393 - movzbl (%esi), %ebx
400 - subl $2, %ecx # Alignment uses up two bytes.
401 - jae 1f # Jump if we had at least two bytes.
402 - addl $2, %ecx # ecx was < 2. Deal with it.
413 -1: movl (%esi), %ebx
419 - movl 12(%esi), %ebx
421 - movl 16(%esi), %ebx
423 - movl 20(%esi), %ebx
425 - movl 24(%esi), %ebx
427 - movl 28(%esi), %ebx
436 - shrl $2, %edx # This clears CF
437 -3: adcl (%esi), %eax
464 -/* Version for PentiumII/PPro */
469 - movl 20(%esp),%eax # Function arg: unsigned int sum
470 - movl 16(%esp),%ecx # Function arg: int len
471 - movl 12(%esp),%esi # Function arg: const unsigned char *buf
483 - lea 45f(%ebx,%ebx,2), %ebx
487 - # Handle 2-byte-aligned regions
488 -20: addw (%esi), %ax
498 - movzbl (%esi), %ebx
511 - movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
516 - addw (%esi), %ax # csumming 2 bytes, 2-aligned
521 - addl -128(%esi), %eax
522 - adcl -124(%esi), %eax
523 - adcl -120(%esi), %eax
524 - adcl -116(%esi), %eax
525 - adcl -112(%esi), %eax
526 - adcl -108(%esi), %eax
527 - adcl -104(%esi), %eax
528 - adcl -100(%esi), %eax
529 - adcl -96(%esi), %eax
530 - adcl -92(%esi), %eax
531 - adcl -88(%esi), %eax
532 - adcl -84(%esi), %eax
533 - adcl -80(%esi), %eax
534 - adcl -76(%esi), %eax
535 - adcl -72(%esi), %eax
536 - adcl -68(%esi), %eax
537 - adcl -64(%esi), %eax
538 - adcl -60(%esi), %eax
539 - adcl -56(%esi), %eax
540 - adcl -52(%esi), %eax
541 - adcl -48(%esi), %eax
542 - adcl -44(%esi), %eax
543 - adcl -40(%esi), %eax
544 - adcl -36(%esi), %eax
545 - adcl -32(%esi), %eax
546 - adcl -28(%esi), %eax
547 - adcl -24(%esi), %eax
548 - adcl -20(%esi), %eax
549 - adcl -16(%esi), %eax
550 - adcl -12(%esi), %eax
551 - adcl -8(%esi), %eax
552 - adcl -4(%esi), %eax
554 - lea 128(%esi), %esi
562 - # Handle the last 1-3 bytes without jumping
563 - notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
564 - movl $0xffffff,%ebx # by the shll and shrl instructions
567 - andl -128(%esi),%ebx # esi is 4-aligned so should be ok
582 -unsigned int csum_partial_copy_generic (const char *src, char *dst,
583 - int len, int sum, int *src_err_ptr, int *dst_err_ptr)
587 - * Copy from ds while checksumming, otherwise like csum_partial
589 - * The macros SRC and DST specify the type of access for the instruction.
590 - * thus we can call a custom exception handler for all access types.
592 - * FIXME: could someone double-check whether I haven't mixed up some SRC and
593 - * DST definitions? It's damn hard to trigger all cases. I hope I got
594 - * them all but there's no guarantee.
599 - .section __ex_table, "a"; \
600 - .long 9999b, 6001f ; \
605 - .section __ex_table, "a"; \
606 - .long 9999b, 6002f ; \
610 -.globl csum_partial_copy_generic
612 -#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
617 -csum_partial_copy_generic:
622 - movl ARGBASE+16(%esp),%eax # sum
623 - movl ARGBASE+12(%esp),%ecx # len
624 - movl ARGBASE+4(%esp),%esi # src
625 - movl ARGBASE+8(%esp),%edi # dst
627 - testl $2, %edi # Check alignment.
628 - jz 2f # Jump if alignment is ok.
629 - subl $2, %ecx # Alignment uses up two bytes.
630 - jae 1f # Jump if we had at least two bytes.
631 - addl $2, %ecx # ecx was < 2. Deal with it.
633 -SRC(1: movw (%esi), %bx )
635 -DST( movw %bx, (%edi) )
640 - movl %ecx, FP(%esp)
644 -SRC(1: movl (%esi), %ebx )
645 -SRC( movl 4(%esi), %edx )
647 -DST( movl %ebx, (%edi) )
649 -DST( movl %edx, 4(%edi) )
651 -SRC( movl 8(%esi), %ebx )
652 -SRC( movl 12(%esi), %edx )
654 -DST( movl %ebx, 8(%edi) )
656 -DST( movl %edx, 12(%edi) )
658 -SRC( movl 16(%esi), %ebx )
659 -SRC( movl 20(%esi), %edx )
661 -DST( movl %ebx, 16(%edi) )
663 -DST( movl %edx, 20(%edi) )
665 -SRC( movl 24(%esi), %ebx )
666 -SRC( movl 28(%esi), %edx )
668 -DST( movl %ebx, 24(%edi) )
670 -DST( movl %edx, 28(%edi) )
677 -2: movl FP(%esp), %edx
681 - shrl $2, %edx # This clears CF
682 -SRC(3: movl (%esi), %ebx )
684 -DST( movl %ebx, (%edi) )
694 -SRC( movw (%esi), %cx )
696 -DST( movw %cx, (%edi) )
700 -SRC(5: movb (%esi), %cl )
701 -DST( movb %cl, (%edi) )
707 -# Exception handler:
708 -.section .fixup, "ax"
711 - movl ARGBASE+20(%esp), %ebx # src_err_ptr
712 - movl $-EFAULT, (%ebx)
714 - # zero the complete destination - computing the rest
716 - movl ARGBASE+8(%esp), %edi # dst
717 - movl ARGBASE+12(%esp), %ecx # len
724 - movl ARGBASE+24(%esp), %ebx # dst_err_ptr
725 - movl $-EFAULT,(%ebx)
733 - popl %ecx # equivalent to addl $4,%esp
738 -/* Version for PentiumII/PPro */
741 - SRC(movl x(%esi), %ebx ) ; \
742 - addl %ebx, %eax ; \
743 - DST(movl %ebx, x(%edi) ) ;
746 - SRC(movl x(%esi), %ebx ) ; \
747 - adcl %ebx, %eax ; \
748 - DST(movl %ebx, x(%edi) ) ;
752 -csum_partial_copy_generic:
756 - movl ARGBASE+4(%esp),%esi #src
757 - movl ARGBASE+8(%esp),%edi #dst
758 - movl ARGBASE+12(%esp),%ecx #len
759 - movl ARGBASE+16(%esp),%eax #sum
770 - lea 3f(%ebx,%ebx), %ebx
775 - SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
776 - ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
777 - ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
778 - ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
779 - ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
784 -4: movl ARGBASE+12(%esp),%edx #len
789 -SRC( movw (%esi), %dx )
791 -DST( movw %dx, (%edi) )
796 -SRC( movb (%esi), %dl )
797 -DST( movb %dl, (%edi) )
801 -.section .fixup, "ax"
802 -6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
803 - movl $-EFAULT, (%ebx)
804 - # zero the complete destination (computing the rest is too much work)
805 - movl ARGBASE+8(%esp),%edi # dst
806 - movl ARGBASE+12(%esp),%ecx # len
810 -6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
811 - movl $-EFAULT, (%ebx)
824 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum.S linux-2.4.20-pre11csum/arch/i386/lib/csum.S
825 --- linux-2.4.20-pre11/arch/i386/lib/csum.S Wed Dec 31 22:00:00 1969
826 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum.S Fri Nov 1 22:45:31 2002
829 + * INET An implementation of the TCP/IP protocol suite for the LINUX
830 + * operating system. INET is implemented using the BSD Socket
831 + * interface as the means of communication with the user level.
833 + * IP/TCP/UDP checksumming routines
835 + * Authors: Jorge Cwik, <jorge@laser.satlink.net>
836 + * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
837 + * Tom May, <ftom@netcom.com>
838 + * Pentium Pro/II routines:
839 + * Alexander Kjeldaas <astor@guardian.no>
840 + * Finn Arne Gangstad <finnag@guardian.no>
841 + * Lots of code moved from tcp.c and ip.c; see those files
844 + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
846 + *		Andi Kleen, add zeroing on error, converted to pure assembler
847 + * 2002-10-30 Denis Vlasenko
848 + * boot-time benchmarking, 3Dnow/MMX+/SSE versions
850 + * This program is free software; you can redistribute it and/or
851 + * modify it under the terms of the GNU General Public License
852 + * as published by the Free Software Foundation; either version
853 + * 2 of the License, or (at your option) any later version.
857 +** computes a partial checksum, e.g. for TCP/UDP fragments
859 +** unsigned int csum_partial(const unsigned char * buff,
860 +** int len, unsigned int sum)
870 + movl 20(%esp), %eax # arg: sum
871 + movl 16(%esp), %ecx # arg: len
872 + movl 12(%esp), %esi # arg: buf
877 + # not 4-aligned: analyze and align...
881 + # unaligned start addr
883 + js 90f # sz==0, exit
884 + movzbl (%esi), %ebx # eat one byte...
887 + roll $8, %eax # NB: need to be undone at exit!
892 + # Note: 2-aligned, but not 4-aligned
895 + addw (%esi), %ax # eat 2 bytes
900 + # esi is 4-aligned here, call block routine
901 + movl $csum_basic, %ebx # known ok even for ecx==0 etc
902 + cmpl $128, %ecx # use optimized routine
903 + jb 50f # only for large blocks
904 + movl best_csum, %ebx
907 + # handle the last 0-3 bytes without much jumping
909 + notl %ecx # 0->3, 1->2, 2->1, 3->0, higher bits are masked
910 + movl $0xffffff, %ebx # by the shll and shrl instructions
913 + andl (%esi), %ebx # esi is 4-aligned so should be ok
917 + # undo csum rotation if start addr was odd
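The notl trick above folds the last 0-3 bytes into the sum without a single branch. In C, the mask it computes looks roughly like this (a model of the notl/shll/shrl sequence; the shift count is taken mod 32, which is what "higher bits are masked" refers to):

	/* r = len & 3 trailing bytes -> mask 0, 0xff, 0xffff or 0xffffff */
	static unsigned int tail_mask(unsigned int r)
	{
		unsigned int shift = (~r << 3) & 31;	/* r=0,1,2,3 -> 24,16,8,0 */
		return 0xffffffu >> shift;	/* little-endian: keeps the low bytes */
	}

The masked dword is then added with a final carry fold, so even r == 0 runs straight through the same code path.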
925 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S
926 --- linux-2.4.20-pre11/arch/i386/lib/csum_3dnow.S Wed Dec 31 22:00:00 1969
927 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_3dnow.S Fri Nov 1 22:48:32 2002
929 +#define PREFETCH(a) prefetch a
930 +#define NAME csum_3dnow
932 +#include "csum_pf.inc"
933 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_basic.S linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S
934 --- linux-2.4.20-pre11/arch/i386/lib/csum_basic.S Wed Dec 31 22:00:00 1969
935 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_basic.S Fri Nov 1 22:56:19 2002
941 +/* Experiments with Ethernet and SLIP connections show that buff
942 +** is aligned on either a 2-byte or 4-byte boundary. We get at
943 +** least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
944 +** Fortunately, it is easy to convert 2-byte alignment to 4-byte
945 +** alignment for the unrolled loop.
955 + leal 50f(%ebx,%ebx,2), %ebx
959 + leal 128(%esi), %esi
960 + adcl -128(%esi), %eax
961 + adcl -124(%esi), %eax
962 + adcl -120(%esi), %eax
963 + adcl -116(%esi), %eax
964 + adcl -112(%esi), %eax
965 + adcl -108(%esi), %eax
966 + adcl -104(%esi), %eax
967 + adcl -100(%esi), %eax
968 + adcl -96(%esi), %eax
969 + adcl -92(%esi), %eax
970 + adcl -88(%esi), %eax
971 + adcl -84(%esi), %eax
972 + adcl -80(%esi), %eax
973 + adcl -76(%esi), %eax
974 + adcl -72(%esi), %eax
975 + adcl -68(%esi), %eax
976 + adcl -64(%esi), %eax
977 + adcl -60(%esi), %eax
978 + adcl -56(%esi), %eax
979 + adcl -52(%esi), %eax
980 + adcl -48(%esi), %eax
981 + adcl -44(%esi), %eax
982 + adcl -40(%esi), %eax
983 + adcl -36(%esi), %eax
984 + adcl -32(%esi), %eax
985 + adcl -28(%esi), %eax
986 + adcl -24(%esi), %eax
987 + adcl -20(%esi), %eax
988 + adcl -16(%esi), %eax
989 + adcl -12(%esi), %eax
990 + adcl -8(%esi), %eax
991 + adcl -4(%esi), %eax
1000 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_naive.S linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S
1001 --- linux-2.4.20-pre11/arch/i386/lib/csum_naive.S Wed Dec 31 22:00:00 1969
1002 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_naive.S Fri Nov 1 22:36:20 2002
1014 + leal 4(%esi), %esi
1021 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc
1022 --- linux-2.4.20-pre11/arch/i386/lib/csum_pf.inc Wed Dec 31 22:00:00 1969
1023 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_pf.inc Fri Nov 1 22:57:20 2002
1025 +//#define PREFETCH(a) prefetchnta a
1026 +//#define PREFETCH(a) prefetch a
1027 +//#define PREFETCH(a)
1029 +// How much unrolling do you want?
1030 +//vda: 5 is best on Duron 650
1031 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1032 + // NB: tweak unrolled loop too...
1034 +** computes a partial checksum, e.g. for TCP/UDP fragments
1035 +** unsigned int csum_partial(const unsigned char *buff, int len, unsigned int sum)
1038 +#define ITER_SZ (1<<ITER_BITS)
1039 +#define ITER_MSK ((1<<ITER_BITS)-4)
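For the default ITER_BITS = 5 this gives ITER_SZ = 32 bytes and ITER_MSK = 0x1c, so len & ITER_MSK is the bytes handled by the partial first iteration and len >> ITER_BITS the number of full iterations (see the comments at the routine entry below).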
1047 +# Guaranteed by caller: esi is 4-aligned, ecx>=16
1049 + PREFETCH((%esi)) # Prefetch _each_ cacheline
1050 +	PREFETCH(32(%esi))	# Note! Athlons have 64-byte cache lines,
1051 +	PREFETCH(64(%esi))	# but PIIIs have only 32-byte ones! Prefetching
1052 +	PREFETCH(64+32(%esi))	# every 32 bytes gives ~20% speedup on PIII
1053 + PREFETCH(128(%esi))
1054 + PREFETCH(128+32(%esi))
1055 + PREFETCH(192(%esi))
1056 + PREFETCH(192+32(%esi))
1059 + andl $ITER_MSK, %ebx # = bytes to handle in first (partial) iteration
1060 + shrl $ITER_BITS, %ecx # = iterations to make
1061 + addl %ebx, %esi # => 1st byte to handle in 2nd complete iteration
1062 + shrl $2, %ebx # = dwords to handle
1064 +	lea 50f(%ebx,%ebx,2), %ebx	# = 50f - 3*dwords_to_handle
1066 + jmp *%ebx # here we go!
1069 + PREFETCH(256(%esi))
1071 + lea ITER_SZ(%esi), %esi # does NOT change CF!
1073 + addl -128(%esi), %eax
1074 + adcl -124(%esi), %eax
1075 + adcl -120(%esi), %eax
1076 + adcl -116(%esi), %eax
1077 + adcl -112(%esi), %eax
1078 + adcl -108(%esi), %eax
1079 + adcl -104(%esi), %eax
1080 + adcl -100(%esi), %eax
1081 + adcl -96(%esi), %eax
1082 + adcl -92(%esi), %eax
1083 + adcl -88(%esi), %eax
1084 + adcl -84(%esi), %eax
1085 + adcl -80(%esi), %eax
1086 + adcl -76(%esi), %eax
1087 + adcl -72(%esi), %eax
1088 + adcl -68(%esi), %eax
1089 + adcl -64(%esi), %eax
1090 + adcl -60(%esi), %eax
1091 + adcl -56(%esi), %eax
1092 + adcl -52(%esi), %eax
1093 + adcl -48(%esi), %eax
1094 + adcl -44(%esi), %eax
1095 + adcl -40(%esi), %eax
1096 + adcl -36(%esi), %eax
1098 + addl -32(%esi), %eax
1099 + adcl -28(%esi), %eax
1100 + adcl -24(%esi), %eax
1101 + adcl -20(%esi), %eax
1102 + adcl -16(%esi), %eax
1103 + adcl -12(%esi), %eax
1104 + adcl -8(%esi), %eax
1105 + adcl -4(%esi), %eax
1108 + dec %ecx # does NOT change CF!
1109 + # We can do just "jge 40b" here, but we can be a bit clever...
1110 + # This little twist gives surprisingly noticeable benefits!
1111 + # Seen 11% increase on random 1K blocks on Duron 650
1113 + cmp $256/ITER_SZ, %ecx
1114 + jae 40b # need prefetch
1115 + jmp 41b # do not need it
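The entry sequence above works because each adcl off(%esi),%eax encodes to exactly 3 bytes, so lea 50f(%ebx,%ebx,2) turns the leftover-dword count into a jump target by plain address arithmetic. The same idea in C is Duff's device; here is an 8-way model, with plain 64-bit adds standing in for the real carry-propagating ones'-complement sum:

	#include <stddef.h>

	/* Duff's-device model of the computed-jump dispatch above
	 * (illustration only: no adcl carry chain, 8-way unroll). */
	unsigned long long sum_dwords(const unsigned int *p, size_t n)
	{
		unsigned long long sum = 0;
		size_t iters = (n + 7) / 8;

		if (n == 0)
			return 0;
		switch (n % 8) {	/* jump into the unrolled loop */
		case 0: do {	sum += *p++;
		case 7:		sum += *p++;
		case 6:		sum += *p++;
		case 5:		sum += *p++;
		case 4:		sum += *p++;
		case 3:		sum += *p++;
		case 2:		sum += *p++;
		case 1:		sum += *p++;
			} while (--iters);
		}
		return sum;
	}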
1120 diff -urN linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S
1121 --- linux-2.4.20-pre11/arch/i386/lib/csum_ssemmxplus.S Wed Dec 31 22:00:00 1969
1122 +++ linux-2.4.20-pre11csum/arch/i386/lib/csum_ssemmxplus.S Fri Nov 1 22:48:39 2002
1124 +#define PREFETCH(a) prefetchnta a
1125 +#define NAME csum_ssemmxplus
1127 +#include "csum_pf.inc"
1128 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy.S linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S
1129 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy.S Wed Dec 31 22:00:00 1969
1130 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy.S Fri Nov 1 22:49:44 2002
1133 + * INET An implementation of the TCP/IP protocol suite for the LINUX
1134 + * operating system. INET is implemented using the BSD Socket
1135 + * interface as the means of communication with the user level.
1137 + * IP/TCP/UDP checksumming routines
1139 + * Authors: Jorge Cwik, <jorge@laser.satlink.net>
1140 + * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
1141 + * Tom May, <ftom@netcom.com>
1142 + * Pentium Pro/II routines:
1143 + * Alexander Kjeldaas <astor@guardian.no>
1144 + * Finn Arne Gangstad <finnag@guardian.no>
1145 + * Lots of code moved from tcp.c and ip.c; see those files
1148 + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
1150 + *		Andi Kleen, add zeroing on error, converted to pure assembler
1151 + * 2002-10-30 Denis Vlasenko
1152 + * boot-time benchmarking, 3Dnow/MMX+/SSE versions
1154 + * This program is free software; you can redistribute it and/or
1155 + * modify it under the terms of the GNU General Public License
1156 + * as published by the Free Software Foundation; either version
1157 + * 2 of the License, or (at your option) any later version.
1160 +#include <asm/errno.h>
1163 +** computes a partial checksum while copying data, e.g. for TCP/UDP fragments
1165 +** unsigned int csum_partial_copy_generic(const char *src, char *dst,
1166 +**	int len, int sum, int *src_err_ptr, int *dst_err_ptr)
1175 +#define SRC(y...) \
1177 + .section __ex_table, "a";\
1178 + .long 9999b, 6001f ;\
1181 +#define DST(y...) \
1183 + .section __ex_table, "a";\
1184 + .long 9999b, 6002f ;\
1187 +#define KERNEL_FPU_BEGIN \
1188 + call kernel_fpu_begin
1190 +#define KERNEL_FPU_END(r) \
1191 +K( movl %cr0, r ;)\
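The SRC()/DST() macros are the kernel's standard exception-table fixup: every wrapped user-memory access emits an __ex_table entry pairing the instruction address (local label 9999) with a fixup address, 6001f for a faulting source access and 6002f for a faulting destination access. On a fault the handlers in the .fixup section at the end of this file store -EFAULT through the corresponding error pointer and, for source faults, zero the rest of the destination.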
1197 +#include "csumcpy_naive.inc"
1198 +#include "csumcpy_basic.inc"
1199 +#include "csumcpy_ssemmxplus.inc"
1200 +#include "csumcpy_sse.inc"
1203 +.globl csum_partial_copy_generic
1205 +csum_partial_copy_generic:
1212 +#define STK_DERR 40(%ebp)
1213 +#define STK_SERR 36(%ebp)
1214 +#define STK_SUM 32(%ebp)
1215 +#define STK_LEN 28(%ebp)
1216 +#define STK_DST 24(%ebp)
1217 +#define STK_SRC 20(%ebp)
1218 +#define STK_EIP 16(%ebp)
1219 +#define STK_EBX 12(%ebp)
1220 +#define STK_EDI 8(%ebp)
1221 +#define STK_ESI 4(%ebp)
1222 +#define STK_EBP (%ebp)
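These offsets assume the prologue pushes %ebx, %edi, %esi and %ebp (in that order) and then copies %esp into %ebp, which puts the return address at 16(%ebp) and the six arguments at 20(%ebp) through 40(%ebp).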
1224 + movl STK_SRC, %esi #src
1225 + movl STK_DST, %edi #dst
1226 + movl STK_LEN, %ecx #len
1227 + movl STK_SUM, %eax #sum
1229 + testl $3, %edi # Check dst alignment
1232 + # not 4-aligned: analyze and align...
1236 + # unaligned start addr
1238 + js 90f # sz==0, exit
1239 + movzbl (%esi), %ebx # eat one byte...
1243 + roll $8, %eax # NB: need to be undone at exit!
1249 + # xxx 2-aligned, but not 4-aligned
1252 + movw (%esi), %bx # eat 2 bytes
1256 + leal 2(%esi), %esi
1257 + leal 2(%edi), %edi
1260 + # edi is 4-aligned now: call block routine
1261 + movl $csumcpy_basic, %ebx # 'default', known good for ecx==0 etc
1262 + cmpl $128, %ecx # use optimized routine
1263 + jb 50f # only for large blocks
1264 + movl best_csumcpy, %ebx
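Note that it is the destination, not the source, that gets aligned here: the block routines prefer aligned stores, and the SSE variant's movntps requires a 16-byte-aligned destination (csumcpy_sse.inc further aligns edi from 4 to 16 before entering its main loop).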
1267 + # handle last 0-3 bytes
1271 +SRC( movw (%esi), %cx )
1272 + leal 2(%esi), %esi
1273 +DST( movw %cx, (%edi) )
1274 + leal 2(%edi), %edi
1278 +SRC( movb (%esi), %cl )
1279 +DST( movb %cl, (%edi) )
1280 +75: addl %ecx, %eax
1283 + # undo csum rotation if dst was unaligned
1296 +.section .fixup, "ax"
1297 +6001: movl STK_SERR, %ebx # src_err_ptr
1298 + movl $-EFAULT, (%ebx)
1299 + # zero the complete destination (computing the rest is too much work)
1300 + movl STK_DST, %edi # dst
1301 + movl STK_LEN, %ecx # len
1306 +6002: movl STK_DERR, %ebx # dst_err_ptr
1307 + movl $-EFAULT, (%ebx)
1310 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc
1311 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_basic.inc Wed Dec 31 22:00:00 1969
1312 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_basic.inc Fri Nov 1 23:27:28 2002
1314 +// Please somebody experiment with unroll length
1315 +// on a PII. Do _not_ optimize for PIII/Athlons/etc,
1316 +// they won't typically use this...
1319 +.globl csumcpy_basic
1329 + leal 50f(%ebx,%ebx), %ebx
1333 + leal 64(%esi), %esi
1334 + leal 64(%edi), %edi
1338 +SRC( movl x(%esi), %ebx ); \
1339 + adcl %ebx, %eax ; \
1340 +DST( movl %ebx, x(%edi) );
1342 + ROUND(-64) ROUND(-60) ROUND(-56) ROUND(-52)
1343 + ROUND(-48) ROUND(-44) ROUND(-40) ROUND(-36)
1344 + ROUND(-32) ROUND(-28) ROUND(-24) ROUND(-20)
1345 + ROUND(-16) ROUND(-12) ROUND(-8) ROUND(-4)
1354 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc
1355 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_naive.inc Wed Dec 31 22:00:00 1969
1356 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_naive.inc Fri Nov 1 23:27:51 2002
1358 +// Heh... at least it's small ;)
1361 +.globl csumcpy_naive
1368 +SRC( movl (%esi), %ebx )
1369 +DST( movl %ebx, (%edi) )
1371 + leal 4(%esi), %esi
1372 + leal 4(%edi), %edi
1379 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc
1380 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_sse.inc Wed Dec 31 22:00:00 1969
1381 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_sse.inc Fri Nov 1 23:38:32 2002
1383 +// Huge routine; I don't like its size and number
1384 +// of fixups... think of that when you want
1385 +// to unroll the loop more
1386 +// TODO: benchmark and reduce size
1387 +// I won't stand a 1K behemoth just for a 5% speedup
1390 +#define PREFETCH(a) prefetchnta a
1392 +// How much unrolling do you want?
1393 +// vda: celeron 1200: 5 with movaps, 4 with movups
1395 +#define ITER_BITS 6 // ...4,5,6,7 - ...16,32,64,128 bytes
1396 + // NB: tweak unrolled loop too...
1400 +#define ITER_SZ (1<<ITER_BITS)
1401 +#define ITER_MSK ((1<<ITER_BITS)-4)
1407 + testl $0xe, %edi # Check alignment
1408 + jnz 5500f # align to 16 bytes
1411 + shrl $ITER_BITS, %ecx
1414 +# "big chunks" loop
1415 + PREFETCH((%esi)) # Prefetch a couple of cachelines
1416 +	PREFETCH(32(%esi))	// Note: Athlons have 64-byte cache lines,
1417 +	PREFETCH(64(%esi))	// but PIIIs have only 32-byte ones! Prefetching
1418 +	PREFETCH(64+32(%esi))	// every 32 bytes gives ~20% speedup on PIII
1419 + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons
1420 + PREFETCH(128+32(%esi)) // let them enjoy 256
1421 + PREFETCH(192(%esi))
1422 + PREFETCH(192+32(%esi))
1424 + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1425 + subl $32, %esp // hopefully this is not too slow...
1426 +K( movl %cr0, %ebx )
1428 + movups %xmm0, (%esp)
1429 + movups %xmm1, 16(%esp)
1434 +#define ROUND0(au,r) \
1435 +SRC( mov##au##ps (%esi), r ;) \
1436 + adcl (%esi), %eax ; \
1437 + adcl 4(%esi), %eax ; \
1438 + adcl 8(%esi), %eax ; \
1439 + adcl 12(%esi), %eax ; \
1440 +DST( movntps r, (%edi) ;) \
1442 +#define ROUND(au,x,r) \
1443 +SRC( mov##au##ps x(%esi), r ;) \
1444 + adcl x(%esi), %eax ; \
1445 + adcl x+4(%esi), %eax ; \
1446 + adcl x+8(%esi), %eax ; \
1447 + adcl x+12(%esi), %eax; \
1448 +DST( movntps r, x(%edi) ;) \
1450 +// ROUND[0]: edi must be 16-aligned!
1451 +// if esi is not 16-aligned, movaps would fault;
1452 +// this is not caught by the testsuite. TODO.
1453 +// We don't need SRC() around adcl's
1454 +// (exception, if any, would be caught by 1st one)
1455 +// (FIXME: can races against interrupts bite us?)
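For reference, one ROUND boils down to this user-space C model with SSE intrinsics (illustration only: the real code is pure asm and keeps the ones'-complement sum with adcl carries):

	#include <xmmintrin.h>
	#include <stdint.h>

	/* Model of ROUND(x,r): re-read the same 16 bytes for the checksum
	 * while the copy streams around the caches. */
	static void round16(const uint32_t *src, float *dst, uint64_t *sum)
	{
		__m128 v = _mm_loadu_ps((const float *)src);	/* movups/movaps */

		*sum += (uint64_t)src[0] + src[1] + src[2] + src[3];
		_mm_stream_ps(dst, v);	/* movntps: needs 16-aligned dst,
					 * bypasses the cache */
	}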
1457 + testl $0xf, %esi # Check esi alignment + clear CF
1459 +10: # esi is NOT 16-aligned
1460 + PREFETCH(256(%esi))
1463 + PREFETCH(256+32(%esi))
1466 + lea ITER_SZ(%esi), %esi
1467 + lea ITER_SZ(%edi), %edi
1470 + loop 10b // Beware: loop and ITER_BITS>6 don't mix
1473 +15: # esi is 16-aligned
1474 + PREFETCH(256(%esi))
1477 + PREFETCH(256+32(%esi))
1480 + lea ITER_SZ(%esi), %esi
1481 + lea ITER_SZ(%edi), %edi
1484 + loop 15b // Beware: loop and ITER_BITS>6 don't mix
1487 + sfence # clean up XMM
1488 + //KERNEL_FPU_END(%ebx)
1489 + movups (%esp), %xmm0
1490 + movups 16(%esp), %xmm1
1492 +K( movl %ebx, %cr0 )
1497 + andl $ITER_MSK, %edx
1499 + shrl $2, %edx # this also clears CF
1501 +SRC( movl (%esi), %ebx )
1503 +DST( movl %ebx, (%edi) )
1510 + # last 1, 2 or 3 bytes: handled by caller
1515 +# xxx 16-align edi and get back
1516 +5500: cmp $ITER_SZ, %ecx # edi is 4-aligned here
1517 + mov %ecx, %edx # edx needed at 20:
1518 + jb 20b # not worthy: too short
1520 +5520: test $0xe, %edi # loop until we are 16-aligned
1522 +SRC( movl (%esi), %ebx )
1524 +DST( movl %ebx, (%edi) )
1530 diff -urN linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc
1531 --- linux-2.4.20-pre11/arch/i386/lib/csumcpy_ssemmxplus.inc Wed Dec 31 22:00:00 1969
1532 +++ linux-2.4.20-pre11csum/arch/i386/lib/csumcpy_ssemmxplus.inc Fri Nov 1 23:22:58 2002
1535 +#define PREFETCH(a) prefetchnta a
1537 +// How much unrolling do you want?
1539 +#define ITER_BITS 5 // ...5,6,7 - ...32,64,128 bytes
1540 + // NB: tweak unrolled loop too...
1544 +#define ITER_SZ (1<<ITER_BITS)
1545 +#define ITER_MSK ((1<<ITER_BITS)-4)
1548 +.globl csumcpy_ssemmxplus
1550 +csumcpy_ssemmxplus:
1552 + shrl $ITER_BITS, %ecx
1555 +# "big chunks" loop
1556 + PREFETCH((%esi)) # Prefetch a couple of cachelines
1557 +	PREFETCH(32(%esi))	// Note: Athlons have 64-byte cache lines,
1558 +	PREFETCH(64(%esi))	// but PIIIs have only 32-byte ones! Prefetching
1559 +	PREFETCH(64+32(%esi))	// every 32 bytes gives ~20% speedup on PIII
1560 + PREFETCH(128(%esi)) // Note2: 128 pf depth is slower for Athlons
1561 + PREFETCH(128+32(%esi)) // let them enjoy 256
1562 + PREFETCH(192(%esi))
1563 + PREFETCH(192+32(%esi))
1565 + //KERNEL_FPU_BEGIN // We can't use lazy save - can be in irq :(
1566 +K( movl %cr0, %ebx )
1576 +#define ROUND0(r) \
1577 +SRC( movq (%esi), r ;) \
1578 + adcl (%esi), %eax ; \
1579 + adcl 4(%esi), %eax ; \
1580 +DST( movntq r, (%edi) ;) \
1582 +#define ROUND(x,r) \
1583 +SRC( movq x(%esi), r ;) \
1584 + adcl x(%esi), %eax ; \
1585 + adcl x+4(%esi), %eax ; \
1586 +DST( movntq r, x(%edi) ;) \
1588 +// moving the store to the end of a ROUND makes it faster;
1589 +// don't ask me why
1590 +// we don't need SRC() around adcl's
1591 +// (exception, if any, would be caught by 1st one)
1592 +// (FIXME: can races against interrupts bite us?)
1595 + PREFETCH(256(%esi))
1596 + ROUND0(%mm0) // using mm1,2,3 does not speed up things
1600 +/* PREFETCH(256+32(%esi))
1606 + lea ITER_SZ(%esi), %esi
1607 + lea ITER_SZ(%edi), %edi
1610 + loop 10b // Beware: loop and ITER_BITS>5 don't mix
1614 + //KERNEL_FPU_END(%ebx)
1617 +K( movl %ebx, %cr0 )
1622 + andl $ITER_MSK, %edx
1624 + shrl $2, %edx # this also clears CF
1626 +SRC( movl (%esi), %ebx )
1628 +DST( movl %ebx, (%edi) )