1 diff -urN linux-2.4.22/drivers/md/Config.in linux-2.4.22-evms/drivers/md/Config.in
2 --- linux-2.4.22/drivers/md/Config.in 2003-09-15 17:07:45.000000000 +0200
3 +++ linux-2.4.22-evms/drivers/md/Config.in 2003-09-15 17:09:48.000000000 +0200
5 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
6 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
7 dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
8 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9 + dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10 + dep_tristate ' Sparse Device Target' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
14 diff -urN linux-2.4.22/drivers/md/Makefile linux-2.4.22-evms/drivers/md/Makefile
15 --- linux-2.4.22/drivers/md/Makefile 2003-09-15 17:07:45.000000000 +0200
16 +++ linux-2.4.22-evms/drivers/md/Makefile 2003-09-15 17:09:48.000000000 +0200
19 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
20 obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
21 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
22 +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
24 include $(TOPDIR)/Rules.make
26 diff -urN linux-2.4.22/drivers/md/dm-bbr.c linux-2.4.22-evms/drivers/md/dm-bbr.c
27 --- linux-2.4.22/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
28 +++ linux-2.4.22-evms/drivers/md/dm-bbr.c 2003-09-15 17:08:42.000000000 +0200
31 + * Copyright (c) International Business Machines Corp., 2002-2003
33 + * This program is free software; you can redistribute it and/or modify
34 + * it under the terms of the GNU General Public License as published by
35 + * the Free Software Foundation; either version 2 of the License, or
36 + * (at your option) any later version.
38 + * This program is distributed in the hope that it will be useful,
39 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
41 + * the GNU General Public License for more details.
43 + * You should have received a copy of the GNU General Public License
44 + * along with this program; if not, write to the Free Software
45 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
47 + * linux/drivers/md/dm-bbr.c
49 + * Bad-block-relocation (BBR) target for device-mapper.
51 + * The BBR target is designed to remap I/O write failures to another safe
52 + * location on disk. Note that most disk drives have BBR built into them,
53 + * this means that our software BBR will be only activated when all hardware
54 + * BBR replacement sectors have been used.
57 +#include <linux/kernel.h>
58 +#include <linux/module.h>
59 +#include <linux/init.h>
60 +#include <linux/blkdev.h>
61 +#include <linux/spinlock.h>
62 +#include <linux/smp_lock.h>
63 +#include <linux/slab.h>
64 +#include <linux/mempool.h>
67 +#include "dm-daemon.h"
70 +/* Number of active BBR devices. */
71 +static int bbr_instances = 0;
72 +static DECLARE_MUTEX(bbr_instances_lock);
74 +/* Data pertaining to the I/O thread. */
75 +static struct dm_daemon * bbr_io_thread = NULL;
76 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
77 +static LIST_HEAD(bbr_io_list);
78 +static void bbr_io_handler(void);
80 +/* Global pools for bbr_io_buf's and bbr_remap's. */
81 +static kmem_cache_t * bbr_io_buf_cache;
82 +static mempool_t * bbr_io_buf_pool;
83 +static kmem_cache_t * bbr_remap_cache;
84 +static mempool_t * bbr_remap_pool;
86 +static void bbr_free_remap(struct bbr_private * bbr_id);
91 + * Delete the pools for the remap list and I/O anchors.
93 +static void destroy_pools(void)
95 + if (bbr_io_buf_pool) {
96 + mempool_destroy(bbr_io_buf_pool);
97 + bbr_io_buf_pool = NULL;
99 + if (bbr_io_buf_cache) {
100 + kmem_cache_destroy(bbr_io_buf_cache);
101 + bbr_io_buf_cache = NULL;
103 + if (bbr_remap_pool) {
104 + mempool_destroy(bbr_remap_pool);
105 + bbr_remap_pool = NULL;
107 + if (bbr_remap_cache) {
108 + kmem_cache_destroy(bbr_remap_cache);
109 + bbr_remap_cache = NULL;
116 + * Create mempools for the remap list and I/O anchors.
118 +static int create_pools(void)
120 + if (!bbr_remap_cache) {
121 + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
122 + sizeof(struct bbr_runtime_remap),
123 + 0, SLAB_HWCACHE_ALIGN,
125 + if (!bbr_remap_cache) {
126 + DMERR("Unable to create BBR remap cache.");
130 + if (!bbr_remap_pool) {
131 + bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
134 + if (!bbr_remap_pool) {
135 + DMERR("Unable to create BBR remap mempool.");
140 + if (!bbr_io_buf_cache) {
141 + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
142 + sizeof(struct bbr_io_buffer),
143 + 0, SLAB_HWCACHE_ALIGN,
145 + if (!bbr_io_buf_cache) {
146 + DMERR("Unable to create BBR I/O buffer cache.");
150 + if (!bbr_io_buf_pool) {
151 + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
154 + if (!bbr_io_buf_pool) {
155 + DMERR("Unable to create BBR I/O buffer mempool.");
161 + if (!bbr_remap_cache || !bbr_remap_pool ||
162 + !bbr_io_buf_cache || !bbr_io_buf_pool ) {
173 + * Use the dm-daemon services to stop the BBR I/O thread.
175 +static void stop_io_thread(void)
177 + if (bbr_io_thread) {
178 + dm_daemon_stop(bbr_io_thread);
179 + kfree(bbr_io_thread);
180 + bbr_io_thread = NULL;
187 + * Use the dm-daemon services to start the BBR I/O thread.
189 +static int start_io_thread(void)
193 + if (!bbr_io_thread) {
194 + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
195 + if (!bbr_io_thread) {
199 + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
201 + kfree(bbr_io_thread);
212 + * Set up the mempools, I/O thread, and sync-I/O service. This should
213 + * be called only when the first bbr device is created.
215 +static int bbr_global_init(void)
219 + rc = create_pools();
224 + rc = start_io_thread();
242 + * bbr_global_cleanup
244 + * Cleanup the mempools, I/O thread and sync-I/O service. This should
245 + * be called only when the last bbr device is removed.
247 +static void bbr_global_cleanup(void)
254 +static struct bbr_private * bbr_alloc_private(void)
256 + struct bbr_private * bbr_id;
258 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
260 + memset(bbr_id, 0, sizeof(*bbr_id));
261 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
262 + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
268 +static void bbr_free_private(struct bbr_private * bbr_id)
270 + if (bbr_id->bbr_table) {
271 + kfree(bbr_id->bbr_table);
273 + bbr_free_remap(bbr_id);
277 +static u32 crc_table[256];
278 +static u32 crc_table_built = 0;
280 +static void build_crc_table(void)
284 + for (i = 0; i <= 255; i++) {
286 + for (j = 8; j > 0; j--) {
288 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
292 + crc_table[i] = crc;
294 + crc_table_built = 1;
297 +static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize)
299 + unsigned char * current_byte;
300 + u32 temp1, temp2, i;
302 + current_byte = (unsigned char *) buffer;
303 + /* Make sure the crc table is available */
304 + if (!crc_table_built)
306 + /* Process each byte in the buffer. */
307 + for (i = 0; i < buffersize; i++) {
308 + temp1 = (crc >> 8) & 0x00FFFFFF;
309 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
312 + crc = temp1 ^ temp2;
318 + * le_bbr_table_sector_to_cpu
320 + * Convert bbr meta data from on-disk (LE) format
321 + * to the native cpu endian format.
323 +static void le_bbr_table_sector_to_cpu(struct bbr_table * p)
326 + p->signature = le32_to_cpup(&p->signature);
327 + p->crc = le32_to_cpup(&p->crc);
328 + p->sequence_number = le32_to_cpup(&p->sequence_number);
329 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
330 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
331 + p->entries[i].bad_sect =
332 + le64_to_cpup(&p->entries[i].bad_sect);
333 + p->entries[i].replacement_sect =
334 + le64_to_cpup(&p->entries[i].replacement_sect);
339 + * cpu_bbr_table_sector_to_le
341 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
343 +static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
344 + struct bbr_table * le)
347 + le->signature = cpu_to_le32p(&p->signature);
348 + le->crc = cpu_to_le32p(&p->crc);
349 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
350 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
351 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
352 + le->entries[i].bad_sect =
353 + cpu_to_le64p(&p->entries[i].bad_sect);
354 + le->entries[i].replacement_sect =
355 + cpu_to_le64p(&p->entries[i].replacement_sect);
360 + * validate_bbr_table_sector
362 + * Check the specified BBR table sector for a valid signature and CRC. If it's
363 + * valid, endian-convert the table sector.
365 +static int validate_bbr_table_sector(struct bbr_table * p)
368 + int org_crc, final_crc;
370 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
371 + DMERR("BBR table signature doesn't match!");
372 + DMERR("Found 0x%x. Expecting 0x%x",
373 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
379 + DMERR("BBR table sector has no CRC!");
384 + org_crc = le32_to_cpup(&p->crc);
386 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
387 + if (final_crc != org_crc) {
388 + DMERR("CRC failed!");
389 + DMERR("Found 0x%x. Expecting 0x%x",
390 + org_crc, final_crc);
395 + p->crc = cpu_to_le32p(&org_crc);
396 + le_bbr_table_sector_to_cpu(p);
403 + * bbr_binary_tree_insert
405 + * Insert a node into the binary tree.
407 +static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
408 + struct bbr_runtime_remap * newnode)
410 + struct bbr_runtime_remap ** node = root;
411 + while (node && *node) {
412 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
413 + node = &((*node)->right);
415 + node = &((*node)->left);
419 + newnode->left = newnode->right = NULL;
424 + * bbr_binary_search
426 + * Search for a node that contains bad_sect == lsn.
428 +static struct bbr_runtime_remap * bbr_binary_search(
429 + struct bbr_runtime_remap * root,
432 + struct bbr_runtime_remap * node = root;
434 + if (node->remap.bad_sect == lsn) {
437 + if (lsn > node->remap.bad_sect) {
438 + node = node->right;
447 + * bbr_binary_tree_destroy
449 + * Destroy the binary tree.
451 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
452 + struct bbr_private * bbr_id)
454 + struct bbr_runtime_remap ** link = NULL;
455 + struct bbr_runtime_remap * node = root;
459 + link = &(node->left);
464 + link = &(node->right);
465 + node = node->right;
469 + mempool_free(node, bbr_remap_pool);
470 + if (node == root) {
471 + /* If root is deleted, we're done. */
475 + /* Back to root. */
481 +static void bbr_free_remap(struct bbr_private * bbr_id)
483 + spin_lock_irq(&bbr_id->bbr_id_lock);
484 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
485 + bbr_id->remap_root = NULL;
486 + spin_unlock_irq(&bbr_id->bbr_id_lock);
490 + * bbr_insert_remap_entry
492 + * Create a new remap entry and add it to the binary tree for this node.
494 +static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
495 + struct bbr_table_entry * new_bbr_entry)
497 + struct bbr_runtime_remap * newnode;
499 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
501 + DMERR("Could not allocate from remap mempool!");
504 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
505 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
506 + spin_lock_irq(&bbr_id->bbr_id_lock);
507 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
508 + spin_unlock_irq(&bbr_id->bbr_id_lock);
513 + * bbr_table_to_remap_list
515 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
516 + * improve run time performance, the in memory remap list must be sorted by
517 + * the bad sector LBA. This function is called at discovery time to initialize
518 + * the remap list. This function assumes that at least one copy of meta data
521 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
523 + u32 in_use_blks = 0;
525 + struct bbr_table * p;
528 + for (i = 0, p = bbr_id->bbr_table;
529 + i < bbr_id->nr_sects_bbr_table;
531 + if (!p->in_use_cnt) {
534 + in_use_blks += p->in_use_cnt;
535 + for (j = 0; j < p->in_use_cnt; j++) {
536 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
540 + DMWARN("There are %u BBR entries for device %u:%u",
541 + in_use_blks, MAJOR(bbr_id->dev->dev),
542 + MINOR(bbr_id->dev->dev));
544 + return in_use_blks;
548 + * bbr_search_remap_entry
550 + * Search remap entry for the specified sector. If found, return a pointer to
551 + * the table entry. Otherwise, return NULL.
553 +static struct bbr_table_entry * bbr_search_remap_entry(
554 + struct bbr_private * bbr_id,
557 + struct bbr_runtime_remap * p;
559 + spin_lock_irq(&bbr_id->bbr_id_lock);
560 + p = bbr_binary_search(bbr_id->remap_root, lsn);
561 + spin_unlock_irq(&bbr_id->bbr_id_lock);
563 + return (&p->remap);
572 + * If *lsn is in the remap table, return TRUE and modify *lsn,
573 + * else, return FALSE.
575 +static inline int bbr_remap(struct bbr_private * bbr_id,
578 + struct bbr_table_entry * e;
580 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
581 + e = bbr_search_remap_entry(bbr_id, *lsn);
583 + *lsn = e->replacement_sect;
593 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
594 + * table, return TRUE. Else, return FALSE.
596 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
597 + u64 lsn, u64 nr_sects)
601 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
602 + for (cnt = 0, tmp = lsn;
604 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
605 + if (bbr_remap(bbr_id,&tmp)) {
616 + * Read the remap tables from disk and set up the initial remap tree.
618 +static int bbr_setup(struct bbr_private * bbr_id)
620 + struct bbr_table * table = bbr_id->bbr_table;
621 + struct page * page;
622 + struct io_region job;
623 + unsigned int error, offset;
626 + job.dev = bbr_id->dev->dev;
629 + /* Read and verify each BBR table sector individually. */
630 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
631 + job.sector = bbr_id->lba_table1 + i;
632 + page = virt_to_page(table);
633 + offset = (unsigned long)table & ~PAGE_MASK;
634 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
635 + if (rc && bbr_id->lba_table2) {
636 + job.sector = bbr_id->lba_table2 + i;
637 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
643 + rc = validate_bbr_table_sector(table);
648 + atomic_set(&bbr_id->in_use_replacement_blks,
649 + bbr_table_to_remap_list(bbr_id));
653 + DMERR("dm-bbr: error during device setup: %d", rc);
658 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
659 + struct buffer_head * bh,
662 + struct bbr_io_buffer * bbr_io_buf;
664 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
666 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
667 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
668 + bbr_io_buf->bbr_id = bbr_id;
669 + bbr_io_buf->sector = bh->b_rsector;
670 + bbr_io_buf->bh = bh;
671 + bbr_io_buf->rw = rw;
673 + DMWARN("Could not allocate from BBR I/O buffer pool!");
678 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
680 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
684 + * bbr_io_remap_error
685 + * @bbr_id: Private data for the BBR node.
686 + * @rw: READ or WRITE.
687 + * @starting_lsn: Starting sector of request to remap.
688 + * @count: Number of sectors in the request.
689 + * @buffer: Data buffer for the request.
691 + * For the requested range, try to write each sector individually. For each
692 + * sector that fails, find the next available remap location and write the
693 + * data to that new location. Then update the table and write both copies
694 + * of the table to disk. Finally, update the in-memory mapping and do any
695 + * other necessary bookkeeping.
697 +static int bbr_io_remap_error(struct bbr_private * bbr_id,
703 + struct bbr_table * bbr_table;
704 + struct io_region job;
705 + struct page * page;
706 + unsigned long table_sector_index;
707 + unsigned long table_sector_offset;
708 + unsigned long index;
709 + unsigned int offset_in_page, error;
714 + /* Nothing can be done about read errors. */
718 + job.dev = bbr_id->dev->dev;
720 + /* For each sector in the request. */
721 + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
722 + job.sector = starting_lsn + lsn;
724 + page = virt_to_page(buffer);
725 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
726 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
728 + /* Find the next available relocation sector. */
729 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
730 + if (new_lsn >= bbr_id->nr_replacement_blks) {
731 + /* No more replacement sectors available. */
734 + new_lsn += bbr_id->start_replacement_sect;
736 + /* Write the data to its new location. */
737 + DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64,
738 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev),
739 + starting_lsn + lsn, new_lsn);
740 + job.sector = new_lsn;
741 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
743 + /* This replacement sector is bad.
744 + * Try the next one.
746 + DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. Skipping.",
747 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn);
748 + atomic_inc(&bbr_id->in_use_replacement_blks);
752 + /* Add this new entry to the on-disk table. */
753 + table_sector_index = new_lsn -
754 + bbr_id->start_replacement_sect;
755 + table_sector_offset = table_sector_index /
756 + BBR_ENTRIES_PER_SECT;
757 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
759 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
760 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
761 + bbr_table->entries[index].replacement_sect = new_lsn;
762 + bbr_table->in_use_cnt++;
763 + bbr_table->sequence_number++;
764 + bbr_table->crc = 0;
765 + bbr_table->crc = calculate_crc(INITIAL_CRC,
767 + sizeof(struct bbr_table));
769 + /* Write the table to disk. */
770 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
771 + page = virt_to_page(bbr_table);
772 + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
773 + if (bbr_id->lba_table1) {
774 + job.sector = bbr_id->lba_table1 + table_sector_offset;
776 + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
778 + if (bbr_id->lba_table2) {
779 + job.sector = bbr_id->lba_table2 + table_sector_offset;
780 + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
782 + le_bbr_table_sector_to_cpu(bbr_table);
785 + /* Error writing one of the tables to disk. */
786 + DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.",
787 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
791 + /* Insert a new entry in the remapping binary-tree. */
792 + rc = bbr_insert_remap_entry(bbr_id,
793 + &bbr_table->entries[index]);
795 + DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.",
796 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
800 + atomic_inc(&bbr_id->in_use_replacement_blks);
808 + * bbr_io_process_request
810 + * For each sector in this request, check if the sector has already
811 + * been remapped. If so, process all previous sectors in the request,
812 + * followed by the remapped sector. Then reset the starting lsn and
813 + * count, and keep going with the rest of the request as if it were
814 + * a whole new request. If any of the sync_io's return an error,
815 + * call the remapper to relocate the bad sector(s).
817 +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
819 + struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
820 + struct io_region job;
821 + u64 starting_lsn = bbr_io_buf->sector;
822 + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
823 + u64 lsn, remapped_lsn;
824 + char * buffer = bbr_io_buf->bh->b_data;
825 + struct page * page = virt_to_page(buffer);
826 + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
827 + unsigned int error;
828 + int rw = bbr_io_buf->rw;
831 + job.dev = bbr_id->dev->dev;
833 + /* For each sector in this request, check if this sector has already
834 + * been remapped. If so, process all previous sectors in this request,
835 + * followed by the remapped sector. Then reset the starting lsn and
836 + * count and keep going with the rest of the request as if it were
837 + * a whole new request.
839 + for (lsn = 0; lsn < count; lsn++) {
840 + remapped_lsn = starting_lsn + lsn;
841 + rc = bbr_remap(bbr_id, &remapped_lsn);
843 + /* This sector is fine. */
847 + /* Process all sectors in the request up to this one. */
849 + job.sector = starting_lsn;
851 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
853 + /* If this I/O failed, then one of the sectors
854 + * in this request needs to be relocated.
856 + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
862 + buffer += (lsn << SECTOR_SHIFT);
863 + page = virt_to_page(buffer);
864 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
867 + /* Process the remapped sector. */
868 + job.sector = remapped_lsn;
870 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
872 + /* BUGBUG - Need more processing if this caused
873 + * an error. If this I/O failed, then the existing
874 + * remap is now bad, and we need to find a new remap.
875 + * Can't use bbr_io_remap_error(), because the existing
876 + * map entry needs to be changed, not added again, and
877 + * the original table entry also needs to be changed.
882 + buffer += SECTOR_SIZE;
883 + starting_lsn += (lsn + 1);
884 + count -= (lsn + 1);
886 + page = virt_to_page(buffer);
887 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
890 + /* Check for any remaining sectors after the last split. This could
891 + * potentially be the whole request, but that should be a rare case
892 + * because requests should only be processed by the thread if we know
893 + * an error occurred or they contained one or more remapped sectors.
896 + job.sector = starting_lsn;
898 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
900 + /* If this I/O failed, then one of the sectors in this
901 + * request needs to be relocated.
903 + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
917 + * This is the handler for the bbr_io_thread. It continuously loops,
918 + * taking I/O requests off its list and processing them. If nothing
919 + * is on the list, the thread goes back to sleep until specifically
922 + * I/O requests should only be sent to this thread if we know that:
923 + * a) the request contains at least one remapped sector.
925 + * b) the request caused an error on the normal I/O path.
926 + * This function uses synchronous I/O, so sending a request to this
927 + * thread that doesn't need special processing will cause severe
928 + * performance degradation.
930 +static void bbr_io_handler(void)
932 + struct bbr_io_buffer * bbr_io_buf;
933 + struct buffer_head * bh;
934 + unsigned long flags;
938 + /* Process bbr_io_list, one entry at a time. */
939 + spin_lock_irqsave(&bbr_io_list_lock, flags);
940 + if (list_empty(&bbr_io_list)) {
941 + /* No more items on the list. */
942 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
945 + bbr_io_buf = list_entry(bbr_io_list.next,
946 + struct bbr_io_buffer, bbr_io_list);
947 + list_del_init(&bbr_io_buf->bbr_io_list);
948 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
950 + rc = bbr_io_process_request(bbr_io_buf);
952 + /* Clean up and complete the original I/O. */
953 + bbr_io_buf->flags |= BBR_IO_HANDLED;
954 + bh = bbr_io_buf->bh;
955 + if (bh->b_end_io) {
956 + /* If this was the bbr_io_buf for an error on the
957 + * normal WRITE, don't free it here. It will be
958 + * freed later in bbr_callback()
960 + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
961 + free_bbr_io_buf(bbr_io_buf);
962 + bh->b_end_io(bh, rc ? 0 : 1);
970 + * Place the specified bbr_io_buf on the thread's processing list.
972 +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
974 + unsigned long flags;
975 + spin_lock_irqsave(&bbr_io_list_lock, flags);
976 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
977 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
978 + dm_daemon_wake(bbr_io_thread);
984 + * If there are any remapped sectors on this object, send this request over
985 + * to the thread for processing. Otherwise send it down the stack normally.
987 +static int bbr_read(struct bbr_private * bbr_id,
988 + struct buffer_head * bh)
990 + struct bbr_io_buffer * bbr_io_buf;
993 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
994 + !bbr_remap_probe(bbr_id, bh->b_rsector,
995 + bh->b_size >> SECTOR_SHIFT)) {
996 + /* No existing remaps or this request doesn't
997 + * contain any remapped sectors.
999 + bh->b_rdev = bbr_id->dev->dev;
1003 + /* This request has at least one remapped sector. */
1004 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
1005 + if (!bbr_io_buf) {
1006 + /* Can't get memory to track the I/O. */
1007 + bh->b_end_io(bh, 0);
1011 + bbr_schedule_io(bbr_io_buf);
1018 + * This is the callback for normal write requests. Check for an error
1019 + * during the I/O, and send to the thread for processing if necessary.
1021 +static int bbr_callback(struct dm_target * ti,
1022 + struct buffer_head * bh,
1025 + union map_info * map_context)
1027 + struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr;
1032 + /* Will try to relocate the WRITE if:
1033 + * - It is an error, and
1034 + * - It is not an error of BBR relocation, and
1036 + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1037 + DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.",
1038 + MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
1039 + (unsigned long)bbr_io_buf->sector);
1040 + /* Indicate this bbr_io_buf is for an error on normal WRITE */
1041 + bbr_io_buf->flags |= BBR_IO_RELOCATE;
1042 + bbr_schedule_io(bbr_io_buf);
1043 + /* Returns >0 so that DM will let us retry the I/O */
1047 + free_bbr_io_buf(bbr_io_buf);
1054 + * If there are any remapped sectors on this object, send the request over
1055 + * to the thread for processing. Otherwise, register for callback
1056 + * notification, and send the request down normally.
1058 +static int bbr_write(struct bbr_private * bbr_id,
1059 + struct buffer_head * bh,
1060 + union map_info * map_context)
1062 + struct bbr_io_buffer * bbr_io_buf;
1064 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1065 + if (!bbr_io_buf) {
1066 + /* Can't get memory to track the I/O. */
1067 + bh->b_end_io(bh, 0);
1071 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1072 + !bbr_remap_probe(bbr_id, bh->b_rsector,
1073 + bh->b_size >> SECTOR_SHIFT)) {
1074 + /* No existing remaps or this request
1075 + * contains no remapped sectors.
1077 + bh->b_rdev = bbr_id->dev->dev;
1078 + map_context->ptr = bbr_io_buf;
1081 + /* This request contains at least one remapped sector. */
1082 + map_context->ptr = NULL;
1083 + bbr_schedule_io(bbr_io_buf);
1089 + * Construct a bbr mapping
1091 +static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv)
1093 + struct bbr_private * bbr_id;
1099 + ti->error = "dm-bbr requires exactly 8 arguments: "
1100 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1104 + bbr_id = bbr_alloc_private();
1106 + ti->error = "dm-bbr: Error allocating bbr private data.";
1110 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1111 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1112 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1113 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1114 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1115 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1116 + block_size = simple_strtoul(argv[7], &end, 10);
1117 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1119 + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1121 + if (!bbr_id->bbr_table) {
1122 + ti->error = "dm-bbr: Error allocating bbr table.";
1126 + if (dm_get_device(ti, argv[0], 0, ti->len,
1127 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
1128 + ti->error = "dm-bbr: Device lookup failed";
1132 + /* Using a semaphore here is probably overkill,
1133 + * but at least it will be correct.
1135 + down(&bbr_instances_lock);
1136 + if (bbr_instances == 0) {
1137 + rc = bbr_global_init();
1139 + up(&bbr_instances_lock);
1144 + up(&bbr_instances_lock);
1146 + rc = bbr_setup(bbr_id);
1148 + ti->error = "dm-bbr: Device setup failed";
1152 + ti->private = bbr_id;
1156 + down(&bbr_instances_lock);
1158 + if (bbr_instances == 0) {
1159 + bbr_global_cleanup();
1161 + up(&bbr_instances_lock);
1164 + dm_put_device(ti, bbr_id->dev);
1166 + bbr_free_private(bbr_id);
1171 +static void bbr_dtr(struct dm_target * ti)
1173 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1175 + dm_put_device(ti, bbr_id->dev);
1176 + bbr_free_private(bbr_id);
1178 + down(&bbr_instances_lock);
1180 + if (bbr_instances == 0) {
1181 + bbr_global_cleanup();
1183 + up(&bbr_instances_lock);
1186 +static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw,
1187 + union map_info * map_context)
1189 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1191 + bh->b_rsector += bbr_id->offset;
1195 + map_context->ptr = NULL;
1196 + return bbr_read(bbr_id, bh);
1198 + return bbr_write(bbr_id, bh, map_context);
1204 +static int bbr_status(struct dm_target * ti, status_type_t type,
1205 + char * result, unsigned int maxlen)
1207 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1210 + case STATUSTYPE_INFO:
1214 + case STATUSTYPE_TABLE:
1215 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1216 + dm_kdevname(bbr_id->dev->dev), bbr_id->offset,
1217 + bbr_id->lba_table1, bbr_id->lba_table2,
1218 + bbr_id->nr_sects_bbr_table,
1219 + bbr_id->start_replacement_sect,
1220 + bbr_id->nr_replacement_blks,
1221 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
1227 +static struct target_type bbr_target = {
1229 + module: THIS_MODULE,
1233 + end_io: bbr_callback,
1234 + status: bbr_status,
1237 +int __init dm_bbr_init(void)
1239 + int r = dm_register_target(&bbr_target);
1242 + DMERR("dm-bbr: register failed %d", r);
1247 +void __exit dm_bbr_exit(void)
1249 + int r = dm_unregister_target(&bbr_target);
1252 + DMERR("dm-bbr: unregister failed %d", r);
1255 +module_init(dm_bbr_init);
1256 +module_exit(dm_bbr_exit);
1257 +MODULE_LICENSE("GPL");
1258 diff -urN linux-2.4.22/drivers/md/dm-bbr.h linux-2.4.22-evms/drivers/md/dm-bbr.h
1259 --- linux-2.4.22/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1260 +++ linux-2.4.22-evms/drivers/md/dm-bbr.h 2003-09-15 17:08:42.000000000 +0200
1263 + * Copyright (c) International Business Machines Corp., 2002-2003
1265 + * This program is free software; you can redistribute it and/or modify
1266 + * it under the terms of the GNU General Public License as published by
1267 + * the Free Software Foundation; either version 2 of the License, or
1268 + * (at your option) any later version.
1270 + * This program is distributed in the hope that it will be useful,
1271 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1272 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1273 + * the GNU General Public License for more details.
1275 + * You should have received a copy of the GNU General Public License
1276 + * along with this program; if not, write to the Free Software
1277 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1279 + * linux/drivers/md/dm-bbr.h
1281 + * Bad-block-relocation (BBR) target for device-mapper.
1283 + * The BBR target is designed to remap I/O write failures to another safe
1284 + * location on disk. Note that most disk drives have BBR built into them,
1285 + * this means that our software BBR will be only activated when all hardware
1286 + * BBR replacement sectors have been used.
1292 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1293 +#define BBR_ENTRIES_PER_SECT 31
1294 +#define BBR_NR_BUFS 128
1295 +#define INITIAL_CRC 0xFFFFFFFF
1296 +#define CRC_POLYNOMIAL 0xEDB88320L
1299 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1300 + * Use these in place of %Ld, %Lu, and %Lx.
1302 +#if BITS_PER_LONG > 32
1303 +#define PFU64 "%lu"
1305 +#define PFU64 "%Lu"
1309 + * struct bbr_table_entry
1310 + * @bad_sect: LBA of bad location.
1311 + * @replacement_sect: LBA of new location.
1313 + * Structure to describe one BBR remap.
1315 +struct bbr_table_entry {
1317 + u64 replacement_sect;
1321 + * struct bbr_table
1322 + * @signature: Signature on each BBR table sector.
1323 + * @crc: CRC for this table sector.
1324 + * @sequence_number: Used to resolve conflicts when primary and secondary
1325 + * tables do not match.
1326 + * @in_use_cnt: Number of in-use table entries.
1327 + * @entries: Actual table of remaps.
1329 + * Structure to describe each sector of the metadata table. Each sector in this
1330 + * table can describe 31 remapped sectors.
1335 + u32 sequence_number;
1337 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1341 + * struct bbr_runtime_remap
1343 + * Node in the binary tree used to keep track of remaps.
1345 +struct bbr_runtime_remap {
1346 + struct bbr_table_entry remap;
1347 + struct bbr_runtime_remap *left;
1348 + struct bbr_runtime_remap *right;
1352 + * struct bbr_private
1353 + * @dev: Info about underlying device.
1354 + * @bbr_table: Copy of metadata table.
1355 + * @offset: LBA of data area.
1356 + * @lba_table1: LBA of primary BBR table.
1357 + * @lba_table2: LBA of secondary BBR table.
1358 + * @nr_sects_bbr_table: Size of each BBR table.
1359 + * @nr_replacement_blks: Number of replacement blocks.
1360 + * @start_replacement_sect: LBA of start of replacement blocks.
1361 + * @blksize_in_sects: Size of each block.
1362 + * @in_use_replacement_blks: Current number of remapped blocks.
1363 + * @remap_root: Binary tree containing all remaps.
1364 + * @bbr_id_lock: Lock for the binary tree.
1366 + * Private data for each BBR target.
1368 +struct bbr_private {
1369 + struct dm_dev * dev;
1370 + struct bbr_table * bbr_table;
1371 + struct bbr_runtime_remap * remap_root;
1375 + u64 nr_sects_bbr_table;
1376 + u64 start_replacement_sect;
1377 + u64 nr_replacement_blks;
1378 + u32 blksize_in_sects;
1379 + atomic_t in_use_replacement_blks;
1380 + spinlock_t bbr_id_lock;
1383 +#define BBR_IO_HANDLED (1<<0)
1384 +#define BBR_IO_RELOCATE (1<<1)
1387 + * struct bbr_io_buffer
1388 + * @bbr_io_list: Thread's list of bbr_io_buf's.
1389 + * @bbr_id: Object for this request.
1390 + * @bh: Original buffer_head.
1391 + * @sector: Original sector
1392 + * @flags: Operation flag (BBR_IO_*)
1393 + * @rw: READ or WRITE.
1394 + * @rc: Return code from bbr_io_handler.
1396 + * Structure used to track each write request.
1398 +struct bbr_io_buffer {
1399 + struct list_head bbr_io_list;
1400 + struct bbr_private *bbr_id;
1401 + struct buffer_head *bh;
1410 diff -urN linux-2.4.22/drivers/md/dm-snapshot.c linux-2.4.22-evms/drivers/md/dm-snapshot.c
1411 --- linux-2.4.22/drivers/md/dm-snapshot.c 2003-09-15 17:07:45.000000000 +0200
1412 +++ linux-2.4.22-evms/drivers/md/dm-snapshot.c 2003-09-15 17:08:35.000000000 +0200
1415 /* List of snapshots for this origin */
1416 struct list_head snapshots;
1418 + /* Count of snapshots and origins referencing this structure. */
1419 + unsigned int count;
1423 @@ -155,6 +158,35 @@
1427 + * Allocate and initialize an origin structure.
1429 +static struct origin * __alloc_origin(kdev_t dev)
1431 + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1434 + INIT_LIST_HEAD(&o->hash_list);
1435 + INIT_LIST_HEAD(&o->snapshots);
1436 + __insert_origin(o);
1441 +static void __get_origin(struct origin *o)
1446 +static void __put_origin(struct origin *o)
1449 + if (o->count == 0) {
1450 + list_del(&o->hash_list);
1456 * Make a note of the snapshot and its origin so we can look it
1457 * up when the origin has a write on it.
1459 @@ -168,20 +200,37 @@
1463 - o = kmalloc(sizeof(*o), GFP_KERNEL);
1464 + o = __alloc_origin(dev);
1466 up_write(&_origins_lock);
1471 - /* Initialise the struct */
1472 - INIT_LIST_HEAD(&o->snapshots);
1475 + list_add_tail(&snap->list, &o->snapshots);
1477 - __insert_origin(o);
1478 + up_write(&_origins_lock);
1482 +static int register_origin(kdev_t dev)
1486 + down_write(&_origins_lock);
1487 + o = __lookup_origin(dev);
1491 + o = __alloc_origin(dev);
1493 + up_write(&_origins_lock);
1498 - list_add_tail(&snap->list, &o->snapshots);
1501 up_write(&_origins_lock);
1503 @@ -195,11 +244,18 @@
1504 o = __lookup_origin(s->origin->dev);
1507 - if (list_empty(&o->snapshots)) {
1508 - list_del(&o->hash_list);
1513 + up_write(&_origins_lock);
1516 +static void unregister_origin(kdev_t dev)
1520 + down_write(&_origins_lock);
1521 + o = __lookup_origin(dev);
1523 up_write(&_origins_lock);
1526 @@ -1090,6 +1146,13 @@
1530 + r = register_origin(dev->dev);
1532 + ti->error = "Cannot register origin";
1533 + dm_put_device(ti, dev);
1540 @@ -1097,6 +1160,7 @@
1541 static void origin_dtr(struct dm_target *ti)
1543 struct dm_dev *dev = (struct dm_dev *) ti->private;
1544 + unregister_origin(dev->dev);
1545 dm_put_device(ti, dev);
1548 diff -urN linux-2.4.22/drivers/md/dm-sparse.c linux-2.4.22-evms/drivers/md/dm-sparse.c
1549 --- linux-2.4.22/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1550 +++ linux-2.4.22-evms/drivers/md/dm-sparse.c 2003-09-15 17:09:48.000000000 +0200
1552 +/* -*- linux-c -*- */
1555 + * Copyright (c) International Business Machines Corp., 2002
1557 + * This program is free software; you can redistribute it and/or modify
1558 + * it under the terms of the GNU General Public License as published by
1559 + * the Free Software Foundation; either version 2 of the License, or
1560 + * (at your option) any later version.
1562 + * This program is distributed in the hope that it will be useful,
1563 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1564 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1565 + * the GNU General Public License for more details.
1567 + * You should have received a copy of the GNU General Public License
1568 + * along with this program; if not, write to the Free Software
1569 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1571 + * linux/drivers/md/dm-sparse.c
1573 + * Sparse target for device-mapper.
1575 + * This target provides the ability to create a sparse device. This
1576 + * allows a device to pretend to be larger than it really is.
1579 +#include <linux/module.h>
1580 +#include <linux/init.h>
1581 +#include <linux/blkdev.h>
1582 +#include <linux/slab.h>
1583 +#include <linux/mempool.h>
1584 +#include <linux/vmalloc.h>
1589 +#define MAX_HASH_CHAIN_ENTRIES 10
1590 +#define NAME_SIZE 127
1599 +// Entries in the sparse remapping structure
1600 +struct sparse_hash_entry {
1601 + u64 org_chunk; // Chunk number, not LBA.
1602 + u64 sparse_chunk; // Chunk number, not LBA.
1603 + struct sparse_hash_entry * next;
1604 + struct sparse_hash_entry * prev;
1607 +//Private data structure
1608 +struct sparse_volume {
1609 + struct dm_dev *dev;
1610 + struct rw_semaphore sparse_semaphore;
1611 + struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1612 + struct sparse_hash_entry * free_hash_list;
1613 + kmem_cache_t * hash_slab;
1614 + mempool_t * hash_pool;
1616 + u32 chunk_size; // Sectors.
1617 + u32 chunk_shift; // Shift value for chunk size.
1618 + u32 num_chunks; // In this volume.
1619 + u32 next_cow_entry; // Index into current COW table.
1620 + u64 current_cow_sector; // LOGICAL sector of current COW table.
1621 + u32 next_free_chunk; // Index of next free chunk (not LBA!).
1622 + u32 hash_table_size; // Size of the hash table for the remap.
1624 + u64 cow_table[64]; // One sector's worth of COW tables.
1627 +/*************************** OLD SERVICES ****************************/
1629 +/* computes log base 2 of value */
1630 +inline int log2(u32 value) //ok to change to u32?
1633 + long tmp; //ok to change to long?
1638 + while (!(tmp & 1)) {
1649 +/********************************* Functions *********************************/
1651 +/***************************** Hash Functions *****************************/
1653 +/* Take and initialize from the free hash list */
1654 +static struct sparse_hash_entry *
1655 +allocate_sparse_hash_entry( struct sparse_volume * volume,
1657 + u64 sparse_chunk )
1659 + struct sparse_hash_entry * hash_entry;
1661 + hash_entry = volume->free_hash_list;
1662 + if ( hash_entry ) { //should always be the case b/c preallocate these
1663 + volume->free_hash_list = hash_entry->next;
1664 + hash_entry->org_chunk = org_chunk;
1665 + hash_entry->sparse_chunk = sparse_chunk;
1666 + hash_entry->next = NULL;
1667 + hash_entry->prev = NULL;
1670 + return hash_entry;
1674 + * This function inserts a new entry into a sparse hash chain, immediately
1675 + * following the specified entry. This function should not be used to add
1676 + * an entry into an empty list, or as the first entry in an existing list.
1677 + * For that case, use insert_sparse_map_entry_at_head().
1679 +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1680 + struct sparse_hash_entry * base )
1682 + entry->next = base->next;
1683 + entry->prev = base;
1684 + base->next = entry;
1685 + if ( entry->next ) {
1686 + entry->next->prev = entry;
1692 + * This function inserts a new entry into a sparse chain as the first
1693 + * entry in the chain.
1695 +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1696 + struct sparse_hash_entry ** head )
1698 + entry->next = *head;
1699 + entry->prev = NULL;
1701 + if ( entry->next ) {
1702 + entry->next->prev = entry;
1708 + * Delete all items in a single chain in the hash table.
1710 +static int delete_sparse_hash_chain( struct sparse_volume * vol,
1711 + struct sparse_hash_entry * head )
1713 + struct sparse_hash_entry * next;
1716 + next = head->next;
1717 + mempool_free( head, vol->hash_pool );
1724 + * This function will search the hash chain that is anchored at the
1725 + * specified head pointer. If the chunk number is found, a pointer to that
1726 + * entry in the chain is set, and a 1 is returned. If the chunk is not
1727 + * found, a pointer to the previous entry is set and 0 is returned. If the
1728 + * return pointer is NULL, this means either the list is empty, or the
1729 + * specified sector should become the first list item.
1731 +static int search_sparse_hash_chain( u64 chunk,
1732 + struct sparse_hash_entry * head,
1733 + struct sparse_hash_entry ** result )
1735 + struct sparse_hash_entry * curr = head;
1736 + struct sparse_hash_entry * prev = head;
1737 + while ( curr && curr->org_chunk < chunk ) {
1739 + curr = curr->next;
1741 + if (!curr) { // Either an empty chain or went off the end of the chain.
1745 + else if ( curr->org_chunk != chunk ) {
1746 + *result = curr->prev;
1756 + * This function takes a cow table entry (from the on-disk data), and
1757 + * converts it into an appropriate entry for the sparse map, and
1758 + * inserts it into the appropriate map for the specified volume.
1760 +static int add_cow_entry_to_sparse_map( u64 org_chunk,
1762 + struct sparse_volume * volume )
1764 + struct sparse_hash_entry * new_entry;
1765 + struct sparse_hash_entry * target_entry;
1769 + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1774 + hash_value = (long)org_chunk % volume->hash_table_size;
1776 + if (! search_sparse_hash_chain( org_chunk,
1777 + volume->sparse_map[hash_value],
1778 + &target_entry ) ) {
1779 + //should always take this path
1781 + if ( target_entry ) {
1782 + insert_sparse_hash_entry( new_entry, target_entry );
1785 + insert_sparse_hash_entry_at_head
1786 + ( new_entry, &(volume->sparse_map[hash_value]) );
1794 + * Construct the initial hash table state based on
1795 + * existing COW tables on the disk.
1797 +static int build_sparse_maps(struct sparse_volume * volume)
1799 + int rc = 0, done = 0;
1800 + struct io_region job;
1801 + struct page * page;
1802 + unsigned int error, offset;
1806 + // Read in one sector's worth of COW tables.
1807 + job.dev = volume->dev->dev;
1808 + job.sector = volume->current_cow_sector;
1810 + page = virt_to_page(volume->cow_table);
1811 + offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1812 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
1817 + // Translate every valid COW table entry into
1818 + // a sparse map entry.
1819 + for ( volume->next_cow_entry = 0;
1821 + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1822 + volume->cow_table[volume->next_cow_entry] !=
1823 + 0xffffffffffffffff;
1825 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
1827 + if ( (rc = add_cow_entry_to_sparse_map
1828 + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1829 + volume->next_free_chunk, volume ))) {
1833 + // Move on to the next sector if necessary.
1834 + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1835 + volume->current_cow_sector++;
1844 +/************************* Other Functions ************************/
1847 + * Function: sparse_remap_chunk
1849 + * This function performs a sector remap on a sparse volume. This should
1850 + * be called from the I/O path, It first determines the base sector
1851 + * of the chunk containing the specified sector, and saves the remainder.
1852 + * Then it performs a search through the sparse map for the specified
1853 + * volume. If a match is found, the sector number is changed to the new
1854 + * value. If no match is found, the value is left the same, meaning the
1855 + * chunk has not been remapped.
1857 +static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1860 + struct sparse_hash_entry * result;
1866 + down_read(&sparse_volume->sparse_semaphore);
1868 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1869 + chunk = *sector >> sparse_volume->chunk_shift;
1870 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1872 + if ( search_sparse_hash_chain( chunk,
1873 + sparse_volume->sparse_map[hash_value],
1875 + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1879 + up_read(&sparse_volume->sparse_semaphore);
1883 +/* Function: sparse_cow_write
1885 + * Check this sparse node to see if the given sector/chunk has been
1886 + * remapped yet. If it hasn't, create a new hash table entry, update the
1887 + * in-memory COW table, write the COW table to disk.
1890 +static int sparse_cow_write( struct sparse_volume * sparse_volume,
1893 + struct sparse_hash_entry * target_entry, * new_map_entry;
1894 + struct io_region job;
1895 + struct page * page;
1896 + char * cow = NULL;
1897 + unsigned int error, offset;
1899 + u32 hash_value = 0;
1903 + down_write(&sparse_volume->sparse_semaphore);
1905 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1906 + chunk = *sector >> sparse_volume->chunk_shift;
1907 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1909 + if ( search_sparse_hash_chain( chunk,
1910 + sparse_volume->sparse_map[hash_value],
1911 + &target_entry) ) {
1913 + ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1919 + // Is there enough room left on this sparse to remap this chunk?
1920 + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1921 + DMERR("dm-sparse: full no new remaps allowed\n");
1926 + // Create and initialize a new hash table entry for the new remap.
1927 + new_map_entry = allocate_sparse_hash_entry
1928 + (sparse_volume, chunk, sparse_volume->next_free_chunk);
1929 + if ( ! new_map_entry ) {
1930 + // Can't get memory for map entry. Disable this sparse.
1931 + DMERR("dm-sparse: memory error allocating hash entry\n");
1936 + // Always write the COW table so it's safe
1937 + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1939 + // Can't get I/O buffer. Disable this sparse.
1940 + DMERR("dm-sparse: memory error allocating COW table buffer");
1945 + // Add the entry to the hash table.
1946 + if ( target_entry ) {
1947 + insert_sparse_hash_entry( new_map_entry, target_entry );
1950 + insert_sparse_hash_entry_at_head
1952 + &(sparse_volume->sparse_map[hash_value]) );
1955 + sparse_volume->next_free_chunk++;
1957 + // Update the appropriate entry in the COW table.
1958 + sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1959 + cpu_to_le64(chunk);
1960 + sparse_volume->next_cow_entry++;
1962 + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
1964 + //because of ordering issues needs to be synchronous
1965 + job.dev = sparse_volume->dev->dev;
1966 + job.sector = sparse_volume->current_cow_sector;
1968 + page = virt_to_page(cow);
1969 + offset = (unsigned long)cow & ~PAGE_MASK;
1970 + dm_io_sync(1, &job, WRITE, page, offset, &error);
1972 + // Update the in-memory COW table values.
1973 + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
1975 + sparse_volume->next_cow_entry = 0;
1976 + sparse_volume->current_cow_sector++;
1977 + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
1980 + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
1986 + up_write(&sparse_volume->sparse_semaphore);
1994 +/************************ EXPORT FUNCTIONS ************************/
1997 + * Function: sparse_dtr
1999 +static void sparse_dtr( struct dm_target *ti )
2001 + struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2006 + if (vol->sparse_map) {
2007 + for ( i = 0; i < vol->hash_table_size; i++ ) {
2008 + delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2010 + delete_sparse_hash_chain( vol, vol->free_hash_list );
2011 + vfree(vol->sparse_map);
2014 + if (vol->hash_pool)
2015 + mempool_destroy(vol->hash_pool);
2017 + if (vol->hash_slab)
2018 + kmem_cache_destroy(vol->hash_slab);
2020 + dm_put_device(ti, vol->dev);
2022 + if (vol->dm_io_flag) {
2031 + * Function: sparse_ctr
2033 +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2035 + int i, rc = -EINVAL;
2036 + struct sparse_hash_entry *new_entry;
2037 + struct sparse_volume *vol;
2038 + struct dm_dev *dev;
2039 + u32 chunk_size, chunks;
2041 + char* end, slab_name[NAME_SIZE+1];
2043 + if ( argc != 4 ) {
2044 + ti->error="dm-sparse: wrong number of arguments";
2048 + start = simple_strtoull(argv[1], &end, 10);
2050 + ti->error="dm-sparse: Invalid first chunk lba";
2054 + chunk_size = simple_strtoul(argv[2], &end, 10);
2056 + ti->error="dm-sparse: Invalid chunk_size";
2060 + chunks = simple_strtoul(argv[3], &end, 10);
2062 + ti->error="dm-sparse: Invalid number of chunks";
2066 + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2067 + dm_table_get_mode(ti->table), &dev ) ) {
2068 + ti->error = "dm-sparse: Device lookup failed";
2072 + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2074 + ti->error = "dm-sparse: Memory allocation for private-data failed";
2079 + memset( vol, 0, sizeof(struct sparse_volume) );
2081 + rc = dm_io_get(1);
2083 + ti->error = "dm-sparse: failed to initialize dm-io.";
2089 + vol->dm_io_flag = 1;
2090 + vol->chunk_size = chunk_size;
2091 + vol->chunk_shift = log2(chunk_size);
2092 + vol->num_chunks = chunks;
2093 + vol->current_cow_sector = 1;
2094 + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2095 + vol->start = start;
2097 + init_rwsem(&vol->sparse_semaphore);
2099 + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2100 + vol->hash_slab = kmem_cache_create(slab_name,
2101 + sizeof(struct sparse_hash_entry),
2102 + 0, SLAB_HWCACHE_ALIGN,
2104 + if ( ! vol->hash_slab ) {
2105 + ti->error = "dm-sparse: memory allocation error in hash slab create";
2109 + vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2110 + mempool_free_slab,
2112 + if ( ! vol->hash_pool ) {
2113 + ti->error = "dm-sparse: memory allocation error in hash pool create";
2118 + // Sparse hash table
2119 + vol->sparse_map = vmalloc( vol->hash_table_size *
2120 + sizeof( struct sparse_hash_entry * ) );
2121 + if ( ! vol->sparse_map ) {
2122 + ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2127 + memset( vol->sparse_map, 0, vol->hash_table_size *
2128 + sizeof( struct sparse_hash_entry * ) );
2130 + for ( i = 0; i < chunks; i++ ) {
2132 + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2133 + if ( ! new_entry ) {
2134 + ti->error="dm-sparse: memory allocation error in hash table setup";
2139 + new_entry->next = vol->free_hash_list;
2140 + vol->free_hash_list = new_entry;
2143 + rc = build_sparse_maps(vol);
2145 + ti->error = "dm-sparse: error building hash tables";
2150 + ti->private = vol;
2154 + dm_put_device(ti, dev);
2159 + * Function: sparse_map
2161 +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2162 + union map_info *map_context )
2164 + struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2165 + u64 sector = bh->b_rsector;
2170 + // Check if this sector has been remapped
2171 + rc = sparse_remap_chunk( volume, §or );
2173 + if ( rc < 0 ) { //Error
2174 + bh->b_end_io(bh, 0);
2178 + if ( rc == 0 ) { // Remapped I/O : read or write same logic
2179 + bh->b_rsector = volume->start + sector;
2180 + bh->b_rdev = volume->dev->dev;
2184 + // (Previously) un-mapped: read/write different logic
2186 + if ( rw ) { //write :
2187 + rc = sparse_cow_write( volume, §or );
2189 + if ( rc < 0 ) { //Error
2190 + bh->b_end_io(bh, 0);
2194 + bh->b_rsector = volume->start + sector;
2195 + bh->b_rdev = volume->dev->dev;
2199 + //Reading something that was never written
2200 + //return zeros and indicate complete
2201 + memset(bh->b_data, 0x0, bh->b_size);
2202 + bh->b_end_io(bh, 1);
2206 +static int sparse_status( struct dm_target *ti, status_type_t type,
2207 + char *result, unsigned int maxlen )
2209 + struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2213 + case STATUSTYPE_INFO:
2214 + snprintf( result, maxlen, "%d%%",
2215 + ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2218 + case STATUSTYPE_TABLE:
2219 + snprintf( result, maxlen, "%s %Lu %u %u",
2220 + dm_kdevname(vol->dev->dev), vol->start,
2221 + vol->chunk_size, vol->num_chunks );
2231 +/****************** FUNCTION TABLE **********************/
2233 +static struct target_type sparse_target = {
2235 + .module = THIS_MODULE,
2236 + .ctr = sparse_ctr,
2237 + .dtr = sparse_dtr,
2238 + .map = sparse_map,
2239 + .status = sparse_status,
2242 +/********************* REGISTRATION *****************/
2244 +int __init sparse_init(void)
2246 + int rc = dm_register_target(&sparse_target);
2249 + DMWARN("sparse target registration failed");
2254 +void __exit sparse_exit(void)
2256 + if (dm_unregister_target(&sparse_target) )
2257 + DMWARN("sparse target unregistration failed");
2262 +module_init(sparse_init);
2263 +module_exit(sparse_exit);
2264 +MODULE_LICENSE("GPL");
2265 diff -urN linux-2.4.22/drivers/md/multipath.c linux-2.4.22-evms/drivers/md/multipath.c
2266 --- linux-2.4.22/drivers/md/multipath.c 2003-06-13 16:51:34.000000000 +0200
2267 +++ linux-2.4.22-evms/drivers/md/multipath.c 2003-09-15 17:09:36.000000000 +0200
2268 @@ -139,15 +139,16 @@
2269 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2271 multipath_conf_t *conf = mddev_to_conf(mddev);
2272 - int i, disks = MD_SB_DISKS;
2276 * Later we do read balancing on the read side
2277 * now we use the first available disk.
2280 - for (i = 0; i < disks; i++) {
2281 + for (i = 0; i < conf->nr_disks; i++) {
2282 if (conf->multipaths[i].operational) {
2283 + /* first operational is winner! */
2284 *rdev = conf->multipaths[i].dev;
2289 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2291 + atomic_dec(&mp_bh->multipath->nr_pending);
2294 * this branch is our 'one multipath IO has finished' event handler:
2296 @@ -223,19 +226,39 @@
2300 - * This routine returns the disk from which the requested read should
2302 + * Multipath read balance ...
2306 + * If no active paths
2310 + * If active paths == 1
2312 + * - 1st active path encountered
2314 + * If active paths > 1
2316 + * - 1st idle active path encountered
2317 + * - else ... the active path doing the least amount of work.
2320 static int multipath_read_balance (multipath_conf_t *conf)
2324 - for (disk = 0; disk < conf->raid_disks; disk++)
2325 - if (conf->multipaths[disk].operational)
2329 + int i, disk=-1, nr_pending, least_pending=0;
2331 + for (i=0; i<conf->nr_disks; i++) {
2332 + if (conf->multipaths[i].operational) {
2333 + nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2334 + if (nr_pending==0 || conf->working_disks==1)
2336 + if (least_pending==0 || nr_pending<least_pending) {
2338 + least_pending = nr_pending;
2345 static int multipath_make_request (mddev_t *mddev, int rw,
2347 struct buffer_head *bh_req;
2348 struct multipath_bh * mp_bh;
2349 struct multipath_info *multipath;
2352 if (!buffer_locked(bh))
2354 @@ -267,7 +291,16 @@
2356 * read balancing logic:
2358 - multipath = conf->multipaths + multipath_read_balance(conf);
2359 + disk = multipath_read_balance(conf);
2361 + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2362 + buffer_IO_error(bh);
2366 + multipath = conf->multipaths + disk;
2367 + mp_bh->multipath = multipath;
2368 + atomic_inc(&multipath->nr_pending);
2370 bh_req = &mp_bh->bh_req;
2371 memcpy(bh_req, bh, sizeof(*bh));
2372 @@ -331,13 +364,14 @@
2374 multipath_conf_t *conf = mddev_to_conf(mddev);
2375 struct multipath_info * multipaths = conf->multipaths;
2376 - int disks = MD_SB_DISKS;
2377 int other_paths = 1;
2381 + struct md_list_head *tmp;
2383 if (conf->working_disks == 1) {
2385 - for (i = 0; i < disks; i++) {
2386 + for (i = 0; i < MD_SB_DISKS; i++) {
2387 if (multipaths[i].spare) {
2390 @@ -351,16 +385,17 @@
2391 * first check if this is a queued request for a device
2392 * which has just failed.
2394 - for (i = 0; i < disks; i++) {
2395 + for (i = 0; i < MD_SB_DISKS; i++) {
2396 if (multipaths[i].dev==dev && !multipaths[i].operational)
2401 + mdp_super_t *sb = mddev->sb;
2403 * Mark disk as unusable
2405 - for (i = 0; i < disks; i++) {
2406 + for (i = 0; i < MD_SB_DISKS; i++) {
2407 if (multipaths[i].dev==dev && multipaths[i].operational) {
2408 mark_disk_bad(mddev, i);
2411 if (!conf->working_disks) {
2414 - mdp_super_t *sb = mddev->sb;
2416 spare = get_spare(mddev);
2418 @@ -384,6 +418,21 @@
2422 + /* prevent unnecessary work in md_do_recovery() */
2423 + if (conf->working_disks) {
2424 + conf->raid_disks = conf->working_disks
2425 + = sb->raid_disks = sb->active_disks;
2427 + /* update alias disk info to ensure we can do sb commit. */
2428 + ITERATE_RDEV(mddev,rdev,tmp) {
2429 + if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2430 + rdev->alias_device = 0;
2433 + if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2434 + rdev->alias_device = 1;
2442 * This is a kernel thread which:
2444 - * 1. Retries failed read operations on working multipaths.
2445 + * 1. Retries failed operations on working multipaths.
2446 * 2. Updates the raid superblock when problems encounter.
2447 - * 3. Performs writes following reads for array syncronising.
2450 static void multipathd (void *data)
2452 mdk_rdev_t *rdev, *def_rdev = NULL;
2453 struct md_list_head *tmp;
2455 + int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2460 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2463 - * Mark all disks as spare to start with, then pick our
2464 - * active disk. If we have a disk that is marked active
2465 - * in the sb, then use it, else use the first rdev.
2466 + * Mark all disks as spare to start with.
2468 disk->number = desc->number;
2469 disk->raid_disk = desc->raid_disk;
2470 @@ -894,20 +941,21 @@
2471 mark_disk_sync(desc);
2473 if (disk_active(desc)) {
2474 - if(!conf->working_disks) {
2475 - printk(OPERATIONAL, partition_name(rdev->dev),
2477 - disk->operational = 1;
2479 - conf->working_disks++;
2482 - mark_disk_spare(desc);
2485 - mark_disk_spare(desc);
2486 + printk(OPERATIONAL, partition_name(rdev->dev),
2488 + disk->operational = 1;
2490 + conf->working_disks++;
2493 + } else if (disk_faulty(desc)) {
2500 - if(!num_rdevs++) def_rdev = rdev;
2503 if(!conf->working_disks && num_rdevs) {
2504 desc = &sb->disks[def_rdev->desc_nr];
2505 @@ -918,11 +966,12 @@
2507 conf->working_disks++;
2508 mark_disk_active(desc);
2512 - * Make sure our active path is in desc spot 0
2513 + * If there is only 1 active path ... make sure it is in desc spot 0
2515 - if(def_rdev->desc_nr != 0) {
2516 + if (active_disks == 1 && def_rdev->desc_nr != 0) {
2517 rdev = find_rdev_nr(mddev, 0);
2518 desc = &sb->disks[def_rdev->desc_nr];
2520 @@ -940,10 +989,10 @@
2521 def_rdev->desc_nr = 0;
2524 - conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2525 + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2526 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2527 - sb->failed_disks = 0;
2528 - sb->spare_disks = num_rdevs - 1;
2529 + sb->failed_disks = faulty_disks;
2530 + sb->spare_disks = spare_disks;
2531 mddev->sb_dirty = 1;
2532 conf->mddev = mddev;
2533 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2534 diff -urN linux-2.4.22/include/linux/raid/multipath.h linux-2.4.22-evms/include/linux/raid/multipath.h
2535 --- linux-2.4.22/include/linux/raid/multipath.h 2001-11-12 18:51:56.000000000 +0100
2536 +++ linux-2.4.22-evms/include/linux/raid/multipath.h 2003-09-15 17:09:36.000000000 +0200
2541 + atomic_t nr_pending; /* number of pending requests */
2544 struct multipath_private_data {
2546 struct buffer_head *master_bh;
2547 struct buffer_head bh_req;
2548 struct multipath_bh *next_mp; /* next for retry or in free list */
2549 + struct multipath_info *multipath; /* allows end_request to easily dec pending buffer count */
2551 /* bits for multipath_bh.state */
2552 #define MPBH_Uptodate 1