1 diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
2 --- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 15:09:18.503177509 +0100
3 +++ linux-2.4.24/drivers/md/Config.in 2004-01-18 16:05:08.202479073 +0100
5 dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD
6 dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD
7 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
8 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9 + dep_tristate ' Bad Block Relocation Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10 + dep_tristate ' Sparse Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
13 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
14 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
15 diff -urN linux-2.4.24.org/drivers/md/dm-bbr.c linux-2.4.24/drivers/md/dm-bbr.c
16 --- linux-2.4.24.org/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
17 +++ linux-2.4.24/drivers/md/dm-bbr.c 2004-01-18 16:03:13.099546349 +0100
20 + * (C) Copyright IBM Corp. 2002, 2003
22 + * This program is free software; you can redistribute it and/or modify
23 + * it under the terms of the GNU General Public License as published by
24 + * the Free Software Foundation; either version 2 of the License, or
25 + * (at your option) any later version.
27 + * This program is distributed in the hope that it will be useful,
28 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
29 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30 + * the GNU General Public License for more details.
32 + * You should have received a copy of the GNU General Public License
33 + * along with this program; if not, write to the Free Software
34 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36 + * linux/drivers/md/dm-bbr.c
38 + * Bad-block-relocation (BBR) target for device-mapper.
40 + * The BBR target is designed to remap I/O write failures to another safe
41 + * location on disk. Note that most disk drives have BBR built into them,
42 + * this means that our software BBR will only be activated when all hardware
43 + * BBR replacement sectors have been used.
46 +#include <linux/kernel.h>
47 +#include <linux/module.h>
48 +#include <linux/init.h>
49 +#include <linux/blkdev.h>
50 +#include <linux/spinlock.h>
51 +#include <linux/smp_lock.h>
52 +#include <linux/slab.h>
53 +#include <linux/mempool.h>
56 +#include "dm-daemon.h"
59 +/* Number of active BBR devices. */
60 +static int bbr_instances = 0;
61 +static DECLARE_MUTEX(bbr_instances_lock);
63 +/* Data pertaining to the I/O thread. */
64 +static struct dm_daemon * bbr_io_thread = NULL;
65 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
66 +static LIST_HEAD(bbr_io_list);
67 +static void bbr_io_handler(void);
69 +/* Global pools for bbr_io_buf's and bbr_remap's. */
70 +static kmem_cache_t * bbr_io_buf_cache;
71 +static mempool_t * bbr_io_buf_pool;
72 +static kmem_cache_t * bbr_remap_cache;
73 +static mempool_t * bbr_remap_pool;
75 +static void bbr_free_remap(struct bbr_private * bbr_id);
80 + * Delete the pools for the remap list and I/O anchors.
82 +static void destroy_pools(void)
84 + if (bbr_io_buf_pool) {
85 + mempool_destroy(bbr_io_buf_pool);
86 + bbr_io_buf_pool = NULL;
88 + if (bbr_io_buf_cache) {
89 + kmem_cache_destroy(bbr_io_buf_cache);
90 + bbr_io_buf_cache = NULL;
92 + if (bbr_remap_pool) {
93 + mempool_destroy(bbr_remap_pool);
94 + bbr_remap_pool = NULL;
96 + if (bbr_remap_cache) {
97 + kmem_cache_destroy(bbr_remap_cache);
98 + bbr_remap_cache = NULL;
105 + * Create mempools for the remap list and I/O anchors.
107 +static int create_pools(void)
109 + if (!bbr_remap_cache) {
110 + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
111 + sizeof(struct bbr_runtime_remap),
112 + 0, SLAB_HWCACHE_ALIGN,
114 + if (!bbr_remap_cache) {
115 + DMERR("Unable to create BBR remap cache.");
119 + if (!bbr_remap_pool) {
120 + bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
123 + if (!bbr_remap_pool) {
124 + DMERR("Unable to create BBR remap mempool.");
129 + if (!bbr_io_buf_cache) {
130 + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
131 + sizeof(struct bbr_io_buffer),
132 + 0, SLAB_HWCACHE_ALIGN,
134 + if (!bbr_io_buf_cache) {
135 + DMERR("Unable to create BBR I/O buffer cache.");
139 + if (!bbr_io_buf_pool) {
140 + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
143 + if (!bbr_io_buf_pool) {
144 + DMERR("Unable to create BBR I/O buffer mempool.");
150 + if (!bbr_remap_cache || !bbr_remap_pool ||
151 + !bbr_io_buf_cache || !bbr_io_buf_pool ) {
162 + * Use the dm-daemon services to stop the BBR I/O thread.
164 +static void stop_io_thread(void)
166 + if (bbr_io_thread) {
167 + dm_daemon_stop(bbr_io_thread);
168 + kfree(bbr_io_thread);
169 + bbr_io_thread = NULL;
176 + * Use the dm-daemon services to start the BBR I/O thread.
178 +static int start_io_thread(void)
182 + if (!bbr_io_thread) {
183 + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
184 + if (!bbr_io_thread) {
188 + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
190 + kfree(bbr_io_thread);
201 + * Set up the mempools, I/O thread, and sync-I/O service. This should
202 + * be called only when the first bbr device is created.
204 +static int bbr_global_init(void)
208 + rc = create_pools();
213 + rc = start_io_thread();
231 + * bbr_global_cleanup
233 + * Cleanup the mempools, I/O thread and sync-I/O service. This should
234 + * be called only when the last bbr device is removed.
236 +static void bbr_global_cleanup(void)
243 +static struct bbr_private * bbr_alloc_private(void)
245 + struct bbr_private *bbr_id;
247 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
249 + memset(bbr_id, 0, sizeof(*bbr_id));
250 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
251 + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
257 +static void bbr_free_private(struct bbr_private *bbr_id)
259 + if (bbr_id->bbr_table) {
260 + kfree(bbr_id->bbr_table);
262 + bbr_free_remap(bbr_id);
266 +static u32 crc_table[256];
267 +static u32 crc_table_built = 0;
269 +static void build_crc_table(void)
273 + for (i = 0; i <= 255; i++) {
275 + for (j = 8; j > 0; j--) {
277 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
281 + crc_table[i] = crc;
283 + crc_table_built = 1;
286 +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
288 + unsigned char *current_byte;
289 + u32 temp1, temp2, i;
291 + current_byte = (unsigned char *) buffer;
292 + /* Make sure the crc table is available */
293 + if (!crc_table_built)
295 + /* Process each byte in the buffer. */
296 + for (i = 0; i < buffersize; i++) {
297 + temp1 = (crc >> 8) & 0x00FFFFFF;
298 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
301 + crc = temp1 ^ temp2;
307 + * le_bbr_table_sector_to_cpu
309 + * Convert bbr meta data from on-disk (LE) format
310 + * to the native cpu endian format.
312 +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
315 + p->signature = le32_to_cpup(&p->signature);
316 + p->crc = le32_to_cpup(&p->crc);
317 + p->sequence_number = le32_to_cpup(&p->sequence_number);
318 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
319 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
320 + p->entries[i].bad_sect =
321 + le64_to_cpup(&p->entries[i].bad_sect);
322 + p->entries[i].replacement_sect =
323 + le64_to_cpup(&p->entries[i].replacement_sect);
328 + * cpu_bbr_table_sector_to_le
330 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
332 +static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
333 + struct bbr_table * le)
336 + le->signature = cpu_to_le32p(&p->signature);
337 + le->crc = cpu_to_le32p(&p->crc);
338 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
339 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
340 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
341 + le->entries[i].bad_sect =
342 + cpu_to_le64p(&p->entries[i].bad_sect);
343 + le->entries[i].replacement_sect =
344 + cpu_to_le64p(&p->entries[i].replacement_sect);
349 + * validate_bbr_table_sector
351 + * Check the specified BBR table sector for a valid signature and CRC. If it's
352 + * valid, endian-convert the table sector.
354 +static int validate_bbr_table_sector(struct bbr_table * p)
357 + int org_crc, final_crc;
359 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
360 + DMERR("BBR table signature doesn't match!");
361 + DMERR("Found 0x%x. Expecting 0x%x",
362 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
368 + DMERR("BBR table sector has no CRC!");
373 + org_crc = le32_to_cpup(&p->crc);
375 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
376 + if (final_crc != org_crc) {
377 + DMERR("CRC failed!");
378 + DMERR("Found 0x%x. Expecting 0x%x",
379 + org_crc, final_crc);
384 + p->crc = cpu_to_le32p(&org_crc);
385 + le_bbr_table_sector_to_cpu(p);
392 + * bbr_binary_tree_insert
394 + * Insert a node into the binary tree.
396 +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
397 + struct bbr_runtime_remap *newnode)
399 + struct bbr_runtime_remap **node = root;
400 + while (node && *node) {
401 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
402 + node = &((*node)->right);
404 + node = &((*node)->left);
408 + newnode->left = newnode->right = NULL;
413 + * bbr_binary_search
415 + * Search for a node that contains bad_sect == lsn.
417 +static struct bbr_runtime_remap * bbr_binary_search(
418 + struct bbr_runtime_remap *root,
421 + struct bbr_runtime_remap *node = root;
423 + if (node->remap.bad_sect == lsn) {
426 + if (lsn > node->remap.bad_sect) {
427 + node = node->right;
436 + * bbr_binary_tree_destroy
438 + * Destroy the binary tree.
440 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
441 + struct bbr_private * bbr_id)
443 + struct bbr_runtime_remap **link = NULL;
444 + struct bbr_runtime_remap *node = root;
448 + link = &(node->left);
453 + link = &(node->right);
454 + node = node->right;
458 + mempool_free(node, bbr_remap_pool);
459 + if (node == root) {
460 + /* If root is deleted, we're done. */
464 + /* Back to root. */
470 +static void bbr_free_remap(struct bbr_private * bbr_id)
472 + spin_lock_irq(&bbr_id->bbr_id_lock);
473 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
474 + bbr_id->remap_root = NULL;
475 + spin_unlock_irq(&bbr_id->bbr_id_lock);
479 + * bbr_insert_remap_entry
481 + * Create a new remap entry and add it to the binary tree for this node.
483 +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
484 + struct bbr_table_entry *new_bbr_entry)
486 + struct bbr_runtime_remap *newnode;
488 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
490 + DMERR("Could not allocate from remap mempool!");
493 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
494 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
495 + spin_lock_irq(&bbr_id->bbr_id_lock);
496 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
497 + spin_unlock_irq(&bbr_id->bbr_id_lock);
502 + * bbr_table_to_remap_list
504 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
505 + * improve run time performance, the in memory remap list must be sorted by
506 + * the bad sector LBA. This function is called at discovery time to initialize
507 + * the remap list. This function assumes that at least one copy of meta data
510 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
512 + u32 in_use_blks = 0;
514 + struct bbr_table *p;
516 + for (i = 0, p = bbr_id->bbr_table;
517 + i < bbr_id->nr_sects_bbr_table;
519 + if (!p->in_use_cnt) {
522 + in_use_blks += p->in_use_cnt;
523 + for (j = 0; j < p->in_use_cnt; j++) {
524 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
528 + DMWARN("There are %u BBR entries for device %s",
529 + in_use_blks, dm_kdevname(bbr_id->dev->dev));
532 + return in_use_blks;
536 + * bbr_search_remap_entry
538 + * Search remap entry for the specified sector. If found, return a pointer to
539 + * the table entry. Otherwise, return NULL.
541 +static struct bbr_table_entry * bbr_search_remap_entry(
542 + struct bbr_private *bbr_id,
545 + struct bbr_runtime_remap *p;
547 + spin_lock_irq(&bbr_id->bbr_id_lock);
548 + p = bbr_binary_search(bbr_id->remap_root, lsn);
549 + spin_unlock_irq(&bbr_id->bbr_id_lock);
551 + return (&p->remap);
560 + * If *lsn is in the remap table, return TRUE and modify *lsn,
561 + * else, return FALSE.
563 +static inline int bbr_remap(struct bbr_private *bbr_id,
566 + struct bbr_table_entry *e;
568 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
569 + e = bbr_search_remap_entry(bbr_id, *lsn);
571 + *lsn = e->replacement_sect;
581 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
582 + * table return TRUE, Else, return FALSE.
584 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
585 + u64 lsn, u64 nr_sects)
589 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
590 + for (cnt = 0, tmp = lsn;
592 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
593 + if (bbr_remap(bbr_id,&tmp)) {
604 + * Read the remap tables from disk and set up the initial remap tree.
606 +static int bbr_setup(struct bbr_private *bbr_id)
608 + struct bbr_table *table = bbr_id->bbr_table;
610 + struct io_region job;
611 + unsigned int error, offset;
614 + job.dev = bbr_id->dev->dev;
617 + /* Read and verify each BBR table sector individually. */
618 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
619 + job.sector = bbr_id->lba_table1 + i;
620 + page = virt_to_page(table);
621 + offset = (unsigned long)table & ~PAGE_MASK;
622 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
623 + if (rc && bbr_id->lba_table2) {
624 + job.sector = bbr_id->lba_table2 + i;
625 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
631 + rc = validate_bbr_table_sector(table);
636 + atomic_set(&bbr_id->in_use_replacement_blks,
637 + bbr_table_to_remap_list(bbr_id));
641 + DMERR("dm-bbr: error during device setup: %d", rc);
646 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
647 + struct buffer_head * bh,
650 + struct bbr_io_buffer * bbr_io_buf;
652 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
654 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
655 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
656 + bbr_io_buf->bbr_id = bbr_id;
657 + bbr_io_buf->sector = bh->b_rsector;
658 + bbr_io_buf->bh = bh;
659 + bbr_io_buf->rw = rw;
661 + DMWARN("Could not allocate from BBR I/O buffer pool!");
666 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
668 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
672 + * bbr_io_remap_error
673 + * @bbr_id: Private data for the BBR node.
674 + * @rw: READ or WRITE.
675 + * @starting_lsn: Starting sector of request to remap.
676 + * @count: Number of sectors in the request.
677 + * @buffer: Data buffer for the request.
679 + * For the requested range, try to write each sector individually. For each
680 + * sector that fails, find the next available remap location and write the
681 + * data to that new location. Then update the table and write both copies
682 + * of the table to disk. Finally, update the in-memory mapping and do any
683 + * other necessary bookkeeping.
685 +static int bbr_io_remap_error(struct bbr_private *bbr_id,
691 + struct bbr_table *bbr_table;
692 + struct io_region job;
694 + unsigned long table_sector_index;
695 + unsigned long table_sector_offset;
696 + unsigned long index;
697 + unsigned int offset_in_page, error;
702 + /* Nothing can be done about read errors. */
706 + job.dev = bbr_id->dev->dev;
709 + /* For each sector in the request. */
710 + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
711 + job.sector = starting_lsn + lsn;
712 + page = virt_to_page(buffer);
713 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
714 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
716 + /* Find the next available relocation sector. */
717 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
718 + if (new_lsn >= bbr_id->nr_replacement_blks) {
719 + /* No more replacement sectors available. */
722 + new_lsn += bbr_id->start_replacement_sect;
724 + /* Write the data to its new location. */
725 + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
726 + dm_kdevname(bbr_id->dev->dev),
727 + starting_lsn + lsn, new_lsn);
728 + job.sector = new_lsn;
729 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
731 + /* This replacement sector is bad.
732 + * Try the next one.
734 + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
735 + dm_kdevname(bbr_id->dev->dev), new_lsn);
736 + atomic_inc(&bbr_id->in_use_replacement_blks);
740 + /* Add this new entry to the on-disk table. */
741 + table_sector_index = new_lsn -
742 + bbr_id->start_replacement_sect;
743 + table_sector_offset = table_sector_index /
744 + BBR_ENTRIES_PER_SECT;
745 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
747 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
748 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
749 + bbr_table->entries[index].replacement_sect = new_lsn;
750 + bbr_table->in_use_cnt++;
751 + bbr_table->sequence_number++;
752 + bbr_table->crc = 0;
753 + bbr_table->crc = calculate_crc(INITIAL_CRC,
755 + sizeof(struct bbr_table));
757 + /* Write the table to disk. */
758 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
759 + page = virt_to_page(bbr_table);
760 + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
761 + if (bbr_id->lba_table1) {
762 + job.sector = bbr_id->lba_table1 + table_sector_offset;
763 + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
765 + if (bbr_id->lba_table2) {
766 + job.sector = bbr_id->lba_table2 + table_sector_offset;
767 + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
769 + le_bbr_table_sector_to_cpu(bbr_table);
772 + /* Error writing one of the tables to disk. */
773 + DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
774 + dm_kdevname(bbr_id->dev->dev));
778 + /* Insert a new entry in the remapping binary-tree. */
779 + rc = bbr_insert_remap_entry(bbr_id,
780 + &bbr_table->entries[index]);
782 + DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
783 + dm_kdevname(bbr_id->dev->dev));
787 + atomic_inc(&bbr_id->in_use_replacement_blks);
795 + * bbr_io_process_request
797 + * For each sector in this request, check if the sector has already
798 + * been remapped. If so, process all previous sectors in the request,
799 + * followed by the remapped sector. Then reset the starting lsn and
800 + * count, and keep going with the rest of the request as if it were
801 + * a whole new request. If any of the sync_io's return an error,
802 + * call the remapper to relocate the bad sector(s).
804 +static int bbr_io_process_request(struct bbr_io_buffer *bbr_io_buf)
806 + struct bbr_private *bbr_id = bbr_io_buf->bbr_id;
807 + struct io_region job;
808 + u64 starting_lsn = bbr_io_buf->sector;
809 + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
810 + u64 lsn, remapped_lsn;
811 + char *buffer = bbr_io_buf->bh->b_data;
812 + struct page *page = virt_to_page(buffer);
813 + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
814 + unsigned int error;
815 + int rw = bbr_io_buf->rw;
818 + job.dev = bbr_id->dev->dev;
820 + /* For each sector in this request, check if this sector has
821 + * already been remapped. If so, process all previous sectors
822 + * in this request, followed by the remapped sector. Then reset
823 + * the starting lsn and count and keep going with the rest of
824 + * the request as if it were a whole new request.
826 + for (lsn = 0; lsn < count; lsn++) {
827 + remapped_lsn = starting_lsn + lsn;
828 + rc = bbr_remap(bbr_id, &remapped_lsn);
830 + /* This sector is fine. */
834 + /* Process all sectors in the request up to this one. */
836 + job.sector = starting_lsn;
838 + rc = dm_io_sync(1, &job, rw, page,
839 + offset_in_page, &error);
841 + /* If this I/O failed, then one of the
842 + * sectors in this request needs to be
845 + rc = bbr_io_remap_error(bbr_id, rw,
852 + buffer += (lsn << SECTOR_SHIFT);
853 + page = virt_to_page(buffer);
854 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
857 + /* Process the remapped sector. */
858 + job.sector = remapped_lsn;
860 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
862 + /* BUGBUG - Need more processing if this caused
863 + * an error. If this I/O failed, then the
864 + * existing remap is now bad, and we need to
865 + * find a new remap. Can't use
866 + * bbr_io_remap_error(), because the existing
867 + * map entry needs to be changed, not added
868 + * again, and the original table entry also
869 + * needs to be changed.
874 + buffer += SECTOR_SIZE;
875 + starting_lsn += (lsn + 1);
876 + count -= (lsn + 1);
878 + page = virt_to_page(buffer);
879 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
882 + /* Check for any remaining sectors after the last split. This
883 + * could potentially be the whole request, but that should be a
884 + * rare case because requests should only be processed by the
885 + * thread if we know an error occurred or they contained one or
886 + * more remapped sectors.
889 + job.sector = starting_lsn;
891 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
893 + /* If this I/O failed, then one of the sectors
894 + * in this request needs to be relocated.
896 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
910 + * This is the handler for the bbr_io_thread. It continuously loops,
911 + * taking I/O requests off its list and processing them. If nothing
912 + * is on the list, the thread goes back to sleep until specifically
915 + * I/O requests should only be sent to this thread if we know that:
916 + * a) the request contains at least one remapped sector.
918 + * b) the request caused an error on the normal I/O path.
919 + * This function uses synchronous I/O, so sending a request to this
920 + * thread that doesn't need special processing will cause severe
921 + * performance degradation.
923 +static void bbr_io_handler(void)
925 + struct bbr_io_buffer *bbr_io_buf;
926 + struct buffer_head *bh;
927 + unsigned long flags;
931 + /* Process bbr_io_list, one entry at a time. */
932 + spin_lock_irqsave(&bbr_io_list_lock, flags);
933 + if (list_empty(&bbr_io_list)) {
934 + /* No more items on the list. */
935 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
938 + bbr_io_buf = list_entry(bbr_io_list.next,
939 + struct bbr_io_buffer, bbr_io_list);
940 + list_del_init(&bbr_io_buf->bbr_io_list);
941 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
943 + rc = bbr_io_process_request(bbr_io_buf);
945 + /* Clean up and complete the original I/O. */
946 + bbr_io_buf->flags |= BBR_IO_HANDLED;
947 + bh = bbr_io_buf->bh;
948 + if (bh->b_end_io) {
949 + /* If this was the bbr_io_buf for an error on the
950 + * normal WRITE, don't free it here. It will be
951 + * freed later in bbr_callback()
953 + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
954 + free_bbr_io_buf(bbr_io_buf);
955 + bh->b_end_io(bh, rc ? 0 : 1);
963 + * Place the specified bbr_io_buf on the thread's processing list.
965 +static void bbr_schedule_io(struct bbr_io_buffer *bbr_io_buf)
967 + unsigned long flags;
968 + spin_lock_irqsave(&bbr_io_list_lock, flags);
969 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
970 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
971 + dm_daemon_wake(bbr_io_thread);
977 + * If there are any remapped sectors on this object, send this request over
978 + * to the thread for processing. Otherwise send it down the stack normally.
980 +static int bbr_read(struct bbr_private *bbr_id,
981 + struct buffer_head *bh)
983 + struct bbr_io_buffer *bbr_io_buf;
985 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
986 + !bbr_remap_probe(bbr_id, bh->b_rsector,
987 + bh->b_size >> SECTOR_SHIFT)) {
988 + /* No existing remaps or this request doesn't
989 + * contain any remapped sectors.
991 + bh->b_rdev = bbr_id->dev->dev;
995 + /* This request has at least one remapped sector. */
996 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
998 + /* Can't get memory to track the I/O. */
1002 + bbr_schedule_io(bbr_io_buf);
1009 + * This is the callback for normal write requests. Check for an error
1010 + * during the I/O, and send to the thread for processing if necessary.
1012 +static int bbr_callback(struct dm_target *ti, struct buffer_head *bh, int rw,
1013 + int error, union map_info *map_context)
1015 + struct bbr_io_buffer *bbr_io_buf = map_context->ptr;
1020 + /* Will try to relocate the WRITE if:
1021 + * - It is an error, and
1022 + * - It is not an error of BBR relocation, and
1024 + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1025 + DMERR("dm-bbr: device %s: Write failure on sector %lu. Scheduling for retry.",
1026 + dm_kdevname(bh->b_rdev),
1027 + (unsigned long)bbr_io_buf->sector);
1028 + /* Indicate this bbr_io_buf is for an error on normal WRITE */
1029 + bbr_io_buf->flags |= BBR_IO_RELOCATE;
1030 + bbr_schedule_io(bbr_io_buf);
1031 + /* Returns >0 so that DM will let us retry the I/O */
1035 + free_bbr_io_buf(bbr_io_buf);
1042 + * If there are any remapped sectors on this object, send the request over
1043 + * to the thread for processing. Otherwise, register for callback
1044 + * notification, and send the request down normally.
1046 +static int bbr_write(struct bbr_private *bbr_id,
1047 + struct buffer_head *bh,
1048 + union map_info *map_context)
1050 + struct bbr_io_buffer *bbr_io_buf;
1053 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1054 + if (!bbr_io_buf) {
1055 + /* Can't get memory to track the I/O. */
1059 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1060 + !bbr_remap_probe(bbr_id, bh->b_rsector,
1061 + bh->b_size >> SECTOR_SHIFT)) {
1062 + /* No existing remaps or this request
1063 + * contains no remapped sectors.
1065 + bh->b_rdev = bbr_id->dev->dev;
1066 + map_context->ptr = bbr_io_buf;
1068 + /* This request contains at least one remapped sector. */
1069 + bbr_schedule_io(bbr_io_buf);
1077 + * Construct a bbr mapping
1079 +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1081 + struct bbr_private *bbr_id;
1082 + unsigned long block_size;
1087 + ti->error = "dm-bbr requires exactly 8 arguments: "
1088 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1092 + bbr_id = bbr_alloc_private();
1094 + ti->error = "dm-bbr: Error allocating bbr private data.";
1098 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1099 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1100 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1101 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1102 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1103 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1104 + block_size = simple_strtoul(argv[7], &end, 10);
1105 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1107 + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1109 + if (!bbr_id->bbr_table) {
1110 + ti->error = "dm-bbr: Error allocating bbr table.";
1114 + if (dm_get_device(ti, argv[0], 0, ti->len,
1115 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
1116 + ti->error = "dm-bbr: Device lookup failed";
1120 + /* Using a semaphore here is probably overkill,
1121 + * but at least it will be correct.
1123 + down(&bbr_instances_lock);
1124 + if (bbr_instances == 0) {
1125 + rc = bbr_global_init();
1127 + up(&bbr_instances_lock);
1132 + up(&bbr_instances_lock);
1134 + rc = bbr_setup(bbr_id);
1136 + ti->error = "dm-bbr: Device setup failed";
1140 + ti->private = bbr_id;
1144 + down(&bbr_instances_lock);
1146 + if (bbr_instances == 0) {
1147 + bbr_global_cleanup();
1149 + up(&bbr_instances_lock);
1152 + dm_put_device(ti, bbr_id->dev);
1154 + bbr_free_private(bbr_id);
1159 +static void bbr_dtr(struct dm_target *ti)
1161 + struct bbr_private *bbr_id = ti->private;
1163 + dm_put_device(ti, bbr_id->dev);
1164 + bbr_free_private(bbr_id);
1166 + down(&bbr_instances_lock);
1168 + if (bbr_instances == 0) {
1169 + bbr_global_cleanup();
1171 + up(&bbr_instances_lock);
1174 +static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw,
1175 + union map_info *map_context)
1177 + struct bbr_private *bbr_id = ti->private;
1179 + bh->b_rsector += bbr_id->offset;
1180 + map_context->ptr = NULL;
1184 + return bbr_read(bbr_id, bh);
1186 + return bbr_write(bbr_id, bh, map_context);
1192 +static int bbr_status(struct dm_target *ti, status_type_t type,
1193 + char *result, unsigned int maxlen)
1195 + struct bbr_private *bbr_id = ti->private;
1198 + case STATUSTYPE_INFO:
1202 + case STATUSTYPE_TABLE:
1203 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1204 + dm_kdevname(bbr_id->dev->dev),
1205 + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
1206 + bbr_id->nr_sects_bbr_table,
1207 + bbr_id->start_replacement_sect,
1208 + bbr_id->nr_replacement_blks,
1209 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
1215 +static struct target_type bbr_target = {
1217 + module: THIS_MODULE,
1221 + end_io: bbr_callback,
1222 + status: bbr_status,
1225 +int __init dm_bbr_init(void)
1227 + int r = dm_register_target(&bbr_target);
1230 + DMERR("dm-bbr: register failed %d", r);
1235 +void __exit dm_bbr_exit(void)
1237 + int r = dm_unregister_target(&bbr_target);
1240 + DMERR("dm-bbr: unregister failed %d", r);
1243 +module_init(dm_bbr_init);
1244 +module_exit(dm_bbr_exit);
1245 +MODULE_LICENSE("GPL");
1246 diff -urN linux-2.4.24.org/drivers/md/dm-bbr.h linux-2.4.24/drivers/md/dm-bbr.h
1247 --- linux-2.4.24.org/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1248 +++ linux-2.4.24/drivers/md/dm-bbr.h 2004-01-18 16:03:13.101545929 +0100
1251 + * (C) Copyright IBM Corp. 2002, 2003
1253 + * This program is free software; you can redistribute it and/or modify
1254 + * it under the terms of the GNU General Public License as published by
1255 + * the Free Software Foundation; either version 2 of the License, or
1256 + * (at your option) any later version.
1258 + * This program is distributed in the hope that it will be useful,
1259 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1260 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1261 + * the GNU General Public License for more details.
1263 + * You should have received a copy of the GNU General Public License
1264 + * along with this program; if not, write to the Free Software
1265 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1267 + * linux/drivers/md/dm-bbr.h
1269 + * Bad-block-relocation (BBR) target for device-mapper.
1271 + * The BBR target is designed to remap I/O write failures to another safe
1272 + * location on disk. Note that most disk drives have BBR built into them,
1273 + * this means that our software BBR will only be activated when all hardware
1274 + * BBR replacement sectors have been used.
1277 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1278 +#define BBR_ENTRIES_PER_SECT 31
1279 +#define BBR_NR_BUFS 128
1280 +#define INITIAL_CRC 0xFFFFFFFF
1281 +#define CRC_POLYNOMIAL 0xEDB88320L
1284 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1285 + * Use these in place of %Ld, %Lu, and %Lx.
1287 +#if BITS_PER_LONG > 32
1288 +#define PFU64 "%lu"
1290 +#define PFU64 "%Lu"
1294 + * struct bbr_table_entry
1295 + * @bad_sect: LBA of bad location.
1296 + * @replacement_sect: LBA of new location.
1298 + * Structure to describe one BBR remap.
1300 +struct bbr_table_entry {
1302 + u64 replacement_sect;
1306 + * struct bbr_table
1307 + * @signature: Signature on each BBR table sector.
1308 + * @crc: CRC for this table sector.
1309 + * @sequence_number: Used to resolve conflicts when primary and secondary
1310 + * tables do not match.
1311 + * @in_use_cnt: Number of in-use table entries.
1312 + * @entries: Actual table of remaps.
1314 + * Structure to describe each sector of the metadata table. Each sector in this
1315 + * table can describe 31 remapped sectors.
1320 + u32 sequence_number;
1322 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1326 + * struct bbr_runtime_remap
1328 + * Node in the binary tree used to keep track of remaps.
1330 +struct bbr_runtime_remap {
1331 + struct bbr_table_entry remap;
1332 + struct bbr_runtime_remap *left;
1333 + struct bbr_runtime_remap *right;
1337 + * struct bbr_private
1338 + * @dev: Info about underlying device.
1339 + * @bbr_table: Copy of metadata table.
1340 + * @remap_root: Binary tree containing all remaps.
1341 + * @offset: LBA of data area.
1342 + * @lba_table1: LBA of primary BBR table.
1343 + * @lba_table2: LBA of secondary BBR table.
1344 + * @nr_sects_bbr_table: Size of each BBR table.
1345 + * @nr_replacement_blks: Number of replacement blocks.
1346 + * @start_replacement_sect: LBA of start of replacement blocks.
1347 + * @blksize_in_sects: Size of each block.
1348 + * @in_use_replacement_blks: Current number of remapped blocks.
1349 + * @bbr_id_lock: Lock for the binary tree.
1351 + * Private data for each BBR target.
1353 +struct bbr_private {
1354 + struct dm_dev *dev;
1355 + struct bbr_table *bbr_table;
1356 + struct bbr_runtime_remap *remap_root;
1360 + u64 nr_sects_bbr_table;
1361 + u64 start_replacement_sect;
1362 + u64 nr_replacement_blks;
1363 + u32 blksize_in_sects;
1364 + atomic_t in_use_replacement_blks;
1365 + spinlock_t bbr_id_lock;
1368 +#define BBR_IO_HANDLED (1<<0)
1369 +#define BBR_IO_RELOCATE (1<<1)
1372 + * struct bbr_io_buffer
1373 + * @bbr_io_list: Thread's list of bbr_io_buf's.
1374 + * @bbr_id: Object for this request.
1375 + * @bh: Original buffer_head.
1376 + * @sector: Original sector
1377 + * @flags: Operation flag (BBR_IO_*)
1378 + * @rw: READ or WRITE.
1379 + * @rc: Return code from bbr_io_handler.
1381 + * Structure used to track each write request.
1383 +struct bbr_io_buffer {
1384 + struct list_head bbr_io_list;
1385 + struct bbr_private *bbr_id;
1386 + struct buffer_head *bh;
1393 diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
1394 --- linux-2.4.24.org/drivers/md/dm.c 2004-01-18 15:09:18.533171353 +0100
1395 +++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:59:40.046635861 +0100
1396 @@ -951,13 +951,23 @@
1398 DECLARE_WAITQUEUE(wait, current);
1400 - down_write(&md->lock);
1401 + /* Flush IO to the origin device */
1402 + down_read(&md->lock);
1403 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1404 + up_read(&md->lock);
1408 + fsync_dev_lockfs(md->dev);
1409 + up_read(&md->lock);
1413 - * First we set the BLOCK_IO flag so no more ios will be
1415 + * Set the BLOCK_IO flag so no more ios will be mapped.
1417 + down_write(&md->lock);
1418 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1419 + unlockfs(md->dev);
1420 up_write(&md->lock);
1425 /* did we flush everything ? */
1426 if (atomic_read(&md->pending)) {
1427 + unlockfs(md->dev);
1428 clear_bit(DMF_BLOCK_IO, &md->flags);
1431 @@ -1017,6 +1028,7 @@
1432 md->deferred = NULL;
1433 up_write(&md->lock);
1435 + unlockfs(md->dev);
1436 flush_deferred_io(def);
1437 run_task_queue(&tq_disk);
1439 diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
1440 --- linux-2.4.24.org/drivers/md/dm-snapshot.c 2004-01-18 15:09:18.569163966 +0100
1441 +++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 16:02:40.858328124 +0100
1444 /* List of snapshots for this origin */
1445 struct list_head snapshots;
1447 + /* Count of snapshots and origins referencing this structure. */
1448 + unsigned int count;
1452 @@ -155,6 +158,35 @@
1456 + * Allocate and initialize an origin structure.
1458 +static struct origin * __alloc_origin(kdev_t dev)
1460 + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1463 + INIT_LIST_HEAD(&o->hash_list);
1464 + INIT_LIST_HEAD(&o->snapshots);
1465 + __insert_origin(o);
1470 +static void __get_origin(struct origin *o)
1475 +static void __put_origin(struct origin *o)
1478 + if (o->count == 0) {
1479 + list_del(&o->hash_list);
1485 * Make a note of the snapshot and its origin so we can look it
1486 * up when the origin has a write on it.
1488 @@ -168,20 +200,37 @@
1492 - o = kmalloc(sizeof(*o), GFP_KERNEL);
1493 + o = __alloc_origin(dev);
1495 up_write(&_origins_lock);
1500 - /* Initialise the struct */
1501 - INIT_LIST_HEAD(&o->snapshots);
1504 + list_add_tail(&snap->list, &o->snapshots);
1506 - __insert_origin(o);
1507 + up_write(&_origins_lock);
1511 +static int register_origin(kdev_t dev)
1515 + down_write(&_origins_lock);
1516 + o = __lookup_origin(dev);
1520 + o = __alloc_origin(dev);
1522 + up_write(&_origins_lock);
1527 - list_add_tail(&snap->list, &o->snapshots);
1530 up_write(&_origins_lock);
1532 @@ -195,11 +244,18 @@
1533 o = __lookup_origin(s->origin->dev);
1536 - if (list_empty(&o->snapshots)) {
1537 - list_del(&o->hash_list);
1542 + up_write(&_origins_lock);
1545 +static void unregister_origin(kdev_t dev)
1549 + down_write(&_origins_lock);
1550 + o = __lookup_origin(dev);
1552 up_write(&_origins_lock);
1559 - /* Flush IO to the origin device */
1560 - fsync_dev(s->origin->dev);
1562 /* Add snapshot to the list of snapshots for this origin */
1563 if (register_snapshot(s)) {
1565 @@ -1093,6 +1146,13 @@
1569 + r = register_origin(dev->dev);
1571 + ti->error = "Cannot register origin";
1572 + dm_put_device(ti, dev);
1579 @@ -1100,6 +1160,7 @@
1580 static void origin_dtr(struct dm_target *ti)
1582 struct dm_dev *dev = (struct dm_dev *) ti->private;
1583 + unregister_origin(dev->dev);
1584 dm_put_device(ti, dev);
1587 diff -urN linux-2.4.24.org/drivers/md/dm-sparse.c linux-2.4.24/drivers/md/dm-sparse.c
1588 --- linux-2.4.24.org/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1589 +++ linux-2.4.24/drivers/md/dm-sparse.c 2004-01-18 16:04:48.284615142 +0100
1591 +/* -*- linux-c -*- */
1594 + * Copyright (c) International Business Machines Corp., 2002
1596 + * This program is free software; you can redistribute it and/or modify
1597 + * it under the terms of the GNU General Public License as published by
1598 + * the Free Software Foundation; either version 2 of the License, or
1599 + * (at your option) any later version.
1601 + * This program is distributed in the hope that it will be useful,
1602 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1603 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1604 + * the GNU General Public License for more details.
1606 + * You should have received a copy of the GNU General Public License
1607 + * along with this program; if not, write to the Free Software
1608 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1610 + * linux/drivers/md/dm-sparse.c
1612 + * Sparse target for device-mapper.
1614 + * This target provides the ability to create a sparse device. This
1615 + * allows a device to pretend to be larger than it really is.
1618 +#include <linux/module.h>
1619 +#include <linux/init.h>
1620 +#include <linux/blkdev.h>
1621 +#include <linux/slab.h>
1622 +#include <linux/mempool.h>
1623 +#include <linux/vmalloc.h>
1628 +#define MAX_HASH_CHAIN_ENTRIES 10
1629 +#define NAME_SIZE 127
1638 +// Entries in the sparse remapping structure
1639 +struct sparse_hash_entry {
1640 + u64 org_chunk; // Chunk number, not LBA.
1641 + u64 sparse_chunk; // Chunk number, not LBA.
1642 + struct sparse_hash_entry * next;
1643 + struct sparse_hash_entry * prev;
1646 +//Private data structure
1647 +struct sparse_volume {
1648 + struct dm_dev *dev;
1649 + struct rw_semaphore sparse_semaphore;
1650 + struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1651 + struct sparse_hash_entry * free_hash_list;
1652 + kmem_cache_t * hash_slab;
1653 + mempool_t * hash_pool;
1655 + u32 chunk_size; // Sectors.
1656 + u32 chunk_shift; // Shift value for chunk size.
1657 + u32 num_chunks; // In this volume.
1658 + u32 next_cow_entry; // Index into current COW table.
1659 + u64 current_cow_sector; // LOGICAL sector of current COW table.
1660 + u32 next_free_chunk; // Index of next free chunk (not LBA!).
1661 + u32 hash_table_size; // Size of the hash table for the remap.
1663 + u64 cow_table[64]; // One sector's worth of COW tables.
1666 +/*************************** OLD SERVICES ****************************/
1668 +/* computes log base 2 of value */
1669 +inline int log2(u32 value) //ok to change to u32?
1672 + long tmp; //ok to change to long?
1677 + while (!(tmp & 1)) {
1688 +/********************************* Functions *********************************/
1690 +/***************************** Hash Functions *****************************/
1692 +/* Take and initialize from the free hash list */
1693 +static struct sparse_hash_entry *
1694 +allocate_sparse_hash_entry( struct sparse_volume * volume,
1696 + u64 sparse_chunk )
1698 + struct sparse_hash_entry * hash_entry;
1700 + hash_entry = volume->free_hash_list;
1701 + if ( hash_entry ) { // should always succeed because these are preallocated
1702 + volume->free_hash_list = hash_entry->next;
1703 + hash_entry->org_chunk = org_chunk;
1704 + hash_entry->sparse_chunk = sparse_chunk;
1705 + hash_entry->next = NULL;
1706 + hash_entry->prev = NULL;
1709 + return hash_entry;
1713 + * This function inserts a new entry into a sparse hash chain, immediately
1714 + * following the specified entry. This function should not be used to add
1715 + * an entry into an empty list, or as the first entry in an existing list.
1716 + * For that case, use insert_sparse_map_entry_at_head().
1718 +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1719 + struct sparse_hash_entry * base )
1721 + entry->next = base->next;
1722 + entry->prev = base;
1723 + base->next = entry;
1724 + if ( entry->next ) {
1725 + entry->next->prev = entry;
1731 + * This function inserts a new entry into a sparse chain as the first
1732 + * entry in the chain.
1734 +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1735 + struct sparse_hash_entry ** head )
1737 + entry->next = *head;
1738 + entry->prev = NULL;
1740 + if ( entry->next ) {
1741 + entry->next->prev = entry;
1747 + * Delete all items in a single chain in the hash table.
1749 +static int delete_sparse_hash_chain( struct sparse_volume * vol,
1750 + struct sparse_hash_entry * head )
1752 + struct sparse_hash_entry * next;
1755 + next = head->next;
1756 + mempool_free( head, vol->hash_pool );
1763 + * This function will search the hash chain that is anchored at the
1764 + * specified head pointer. If the chunk number is found, a pointer to that
1765 + * entry in the chain is set, and a 1 is returned. If the chunk is not
1766 + * found, a pointer to the previous entry is set and 0 is returned. If the
1767 + * return pointer is NULL, this means either the list is empty, or the
1768 + * specified sector should become the first list item.
1770 +static int search_sparse_hash_chain( u64 chunk,
1771 + struct sparse_hash_entry * head,
1772 + struct sparse_hash_entry ** result )
1774 + struct sparse_hash_entry * curr = head;
1775 + struct sparse_hash_entry * prev = head;
1776 + while ( curr && curr->org_chunk < chunk ) {
1778 + curr = curr->next;
1780 + if (!curr) { // Either an empty chain or went off the end of the chain.
1784 + else if ( curr->org_chunk != chunk ) {
1785 + *result = curr->prev;
1795 + * This function takes a cow table entry (from the on-disk data), and
1796 + * converts it into an appropriate entry for the sparse map, and
1797 + * inserts it into the appropriate map for the specified volume.
1799 +static int add_cow_entry_to_sparse_map( u64 org_chunk,
1801 + struct sparse_volume * volume )
1803 + struct sparse_hash_entry * new_entry;
1804 + struct sparse_hash_entry * target_entry;
1808 + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1813 + hash_value = (long)org_chunk % volume->hash_table_size;
1815 + if (! search_sparse_hash_chain( org_chunk,
1816 + volume->sparse_map[hash_value],
1817 + &target_entry ) ) {
1818 + //should always take this path
1820 + if ( target_entry ) {
1821 + insert_sparse_hash_entry( new_entry, target_entry );
1824 + insert_sparse_hash_entry_at_head
1825 + ( new_entry, &(volume->sparse_map[hash_value]) );
1833 + * Construct the initial hash table state based on
1834 + * existing COW tables on the disk.
1836 +static int build_sparse_maps(struct sparse_volume * volume)
1838 + int rc = 0, done = 0;
1839 + struct io_region job;
1840 + struct page * page;
1841 + unsigned int error, offset;
1845 + // Read in one sector's worth of COW tables.
1846 + job.dev = volume->dev->dev;
1847 + job.sector = volume->current_cow_sector;
1849 + page = virt_to_page(volume->cow_table);
1850 + offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1851 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
1856 + // Translate every valid COW table entry into
1857 + // a sparse map entry.
1858 + for ( volume->next_cow_entry = 0;
1860 + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1861 + volume->cow_table[volume->next_cow_entry] !=
1862 + 0xffffffffffffffff;
1864 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
1866 + if ( (rc = add_cow_entry_to_sparse_map
1867 + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1868 + volume->next_free_chunk, volume ))) {
1872 + // Move on to the next sector if necessary.
1873 + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1874 + volume->current_cow_sector++;
1883 +/************************* Other Functions ************************/
1886 + * Function: sparse_remap_chunk
1888 + * This function performs a sector remap on a sparse volume. This should
1889 + * be called from the I/O path, It first determines the base sector
1890 + * of the chunk containing the specified sector, and saves the remainder.
1891 + * Then it performs a search through the sparse map for the specified
1892 + * volume. If a match is found, the sector number is changed to the new
1893 + * value. If no match is found, the value is left the same, meaning the
1894 + * chunk has not been remapped.
1896 +static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1899 + struct sparse_hash_entry * result;
1905 + down_read(&sparse_volume->sparse_semaphore);
1907 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1908 + chunk = *sector >> sparse_volume->chunk_shift;
1909 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1911 + if ( search_sparse_hash_chain( chunk,
1912 + sparse_volume->sparse_map[hash_value],
1914 + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1918 + up_read(&sparse_volume->sparse_semaphore);
1922 +/* Function: sparse_cow_write
1924 + * Check this sparse node to see if the given sector/chunk has been
1925 + * remapped yet. If it hasn't, create a new hash table entry, update the
1926 + * in-memory COW table, write the COW table to disk.
1929 +static int sparse_cow_write( struct sparse_volume * sparse_volume,
1932 + struct sparse_hash_entry * target_entry, * new_map_entry;
1933 + struct io_region job;
1934 + struct page * page;
1935 + char * cow = NULL;
1936 + unsigned int error, offset;
1938 + u32 hash_value = 0;
1942 + down_write(&sparse_volume->sparse_semaphore);
1944 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1945 + chunk = *sector >> sparse_volume->chunk_shift;
1946 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1948 + if ( search_sparse_hash_chain( chunk,
1949 + sparse_volume->sparse_map[hash_value],
1950 + &target_entry) ) {
1952 + ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1958 + // Is there enough room left on this sparse to remap this chunk?
1959 + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1960 + DMERR("dm-sparse: full no new remaps allowed\n");
1965 + // Create and initialize a new hash table entry for the new remap.
1966 + new_map_entry = allocate_sparse_hash_entry
1967 + (sparse_volume, chunk, sparse_volume->next_free_chunk);
1968 + if ( ! new_map_entry ) {
1969 + // Can't get memory for map entry. Disable this sparse.
1970 + DMERR("dm-sparse: memory error allocating hash entry\n");
1975 + // Always write the COW table so it's safe
1976 + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1978 + // Can't get I/O buffer. Disable this sparse.
1979 + DMERR("dm-sparse: memory error allocating COW table buffer");
1984 + // Add the entry to the hash table.
1985 + if ( target_entry ) {
1986 + insert_sparse_hash_entry( new_map_entry, target_entry );
1989 + insert_sparse_hash_entry_at_head
1991 + &(sparse_volume->sparse_map[hash_value]) );
1994 + sparse_volume->next_free_chunk++;
1996 + // Update the appropriate entry in the COW table.
1997 + sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1998 + cpu_to_le64(chunk);
1999 + sparse_volume->next_cow_entry++;
2001 + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
2003 + //because of ordering issues needs to be synchronous
2004 + job.dev = sparse_volume->dev->dev;
2005 + job.sector = sparse_volume->current_cow_sector;
2007 + page = virt_to_page(cow);
2008 + offset = (unsigned long)cow & ~PAGE_MASK;
2009 + dm_io_sync(1, &job, WRITE, page, offset, &error);
2011 + // Update the in-memory COW table values.
2012 + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
2014 + sparse_volume->next_cow_entry = 0;
2015 + sparse_volume->current_cow_sector++;
2016 + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
2019 + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
2025 + up_write(&sparse_volume->sparse_semaphore);
2033 +/************************ EXPORT FUNCTIONS ************************/
2036 + * Function: sparse_dtr
2038 +static void sparse_dtr( struct dm_target *ti )
2040 + struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2045 + if (vol->sparse_map) {
2046 + for ( i = 0; i < vol->hash_table_size; i++ ) {
2047 + delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2049 + delete_sparse_hash_chain( vol, vol->free_hash_list );
2050 + vfree(vol->sparse_map);
2053 + if (vol->hash_pool)
2054 + mempool_destroy(vol->hash_pool);
2056 + if (vol->hash_slab)
2057 + kmem_cache_destroy(vol->hash_slab);
2059 + dm_put_device(ti, vol->dev);
2061 + if (vol->dm_io_flag) {
2070 + * Function: sparse_ctr
2072 +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2074 + int i, rc = -EINVAL;
2075 + struct sparse_hash_entry *new_entry;
2076 + struct sparse_volume *vol;
2077 + struct dm_dev *dev;
2078 + u32 chunk_size, chunks;
2080 + char* end, slab_name[NAME_SIZE+1];
2082 + if ( argc != 4 ) {
2083 + ti->error="dm-sparse: wrong number of arguments";
2087 + start = simple_strtoull(argv[1], &end, 10);
2089 + ti->error="dm-sparse: Invalid first chunk lba";
2093 + chunk_size = simple_strtoul(argv[2], &end, 10);
2095 + ti->error="dm-sparse: Invalid chunk_size";
2099 + chunks = simple_strtoul(argv[3], &end, 10);
2101 + ti->error="dm-sparse: Invalid number of chunks";
2105 + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2106 + dm_table_get_mode(ti->table), &dev ) ) {
2107 + ti->error = "dm-sparse: Device lookup failed";
2111 + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2113 + ti->error = "dm-sparse: Memory allocation for private-data failed";
2118 + memset( vol, 0, sizeof(struct sparse_volume) );
2120 + rc = dm_io_get(1);
2122 + ti->error = "dm-sparse: failed to initialize dm-io.";
2128 + vol->dm_io_flag = 1;
2129 + vol->chunk_size = chunk_size;
2130 + vol->chunk_shift = log2(chunk_size);
2131 + vol->num_chunks = chunks;
2132 + vol->current_cow_sector = 1;
2133 + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2134 + vol->start = start;
2136 + init_rwsem(&vol->sparse_semaphore);
2138 + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2139 + vol->hash_slab = kmem_cache_create(slab_name,
2140 + sizeof(struct sparse_hash_entry),
2141 + 0, SLAB_HWCACHE_ALIGN,
2143 + if ( ! vol->hash_slab ) {
2144 + ti->error = "dm-sparse: memory allocation error in hash slab create";
2148 + vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2149 + mempool_free_slab,
2151 + if ( ! vol->hash_pool ) {
2152 + ti->error = "dm-sparse: memory allocation error in hash pool create";
2157 + // Sparse hash table
2158 + vol->sparse_map = vmalloc( vol->hash_table_size *
2159 + sizeof( struct sparse_hash_entry * ) );
2160 + if ( ! vol->sparse_map ) {
2161 + ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2166 + memset( vol->sparse_map, 0, vol->hash_table_size *
2167 + sizeof( struct sparse_hash_entry * ) );
2169 + for ( i = 0; i < chunks; i++ ) {
2171 + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2172 + if ( ! new_entry ) {
2173 + ti->error="dm-sparse: memory allocation error in hash table setup";
2178 + new_entry->next = vol->free_hash_list;
2179 + vol->free_hash_list = new_entry;
2182 + rc = build_sparse_maps(vol);
2184 + ti->error = "dm-sparse: error building hash tables";
2189 + ti->private = vol;
2193 + dm_put_device(ti, dev);
2198 + * Function: sparse_map
2200 +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2201 + union map_info *map_context )
2203 + struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2204 + u64 sector = bh->b_rsector;
2207 + // Check if this sector has been remapped
2208 + rc = sparse_remap_chunk( volume, §or );
2210 + if ( rc < 0 ) { //Error
2214 + if ( rc == 0 ) { // Remapped I/O : read or write same logic
2215 + bh->b_rsector = volume->start + sector;
2216 + bh->b_rdev = volume->dev->dev;
2220 + // ( Previously )Un-mapped: read / write different logic
2222 + if ( rw ) { //write :
2223 + rc = sparse_cow_write( volume, §or );
2225 + if ( rc < 0 ) { //Error
2229 + bh->b_rsector = volume->start + sector;
2230 + bh->b_rdev = volume->dev->dev;
2234 + //Reading something that was never written
2235 + //return zeros and indicate complete
2236 + memset(bh->b_data, 0x0, bh->b_size);
2237 + bh->b_end_io(bh, 1);
2241 +static int sparse_status( struct dm_target *ti, status_type_t type,
2242 + char *result, unsigned int maxlen )
2244 + struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2248 + case STATUSTYPE_INFO:
2249 + snprintf( result, maxlen, "%d%%",
2250 + ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2253 + case STATUSTYPE_TABLE:
2254 + snprintf( result, maxlen, "%s %Lu %u %u",
2255 + dm_kdevname(vol->dev->dev), vol->start,
2256 + vol->chunk_size, vol->num_chunks );
2266 +/****************** FUNCTION TABLE **********************/
2268 +static struct target_type sparse_target = {
2270 + .module = THIS_MODULE,
2271 + .ctr = sparse_ctr,
2272 + .dtr = sparse_dtr,
2273 + .map = sparse_map,
2274 + .status = sparse_status,
2277 +/********************* REGISTRATION *****************/
2279 +int __init sparse_init(void)
2281 + int rc = dm_register_target(&sparse_target);
2284 + DMWARN("sparse target registration failed");
2289 +void __exit sparse_exit(void)
2291 + if (dm_unregister_target(&sparse_target) )
2292 + DMWARN("sparse target unregistration failed");
2297 +module_init(sparse_init);
2298 +module_exit(sparse_exit);
2299 +MODULE_LICENSE("GPL");
2300 diff -urN linux-2.4.24.org/drivers/md/lvm.c linux-2.4.24/drivers/md/lvm.c
2301 --- linux-2.4.24.org/drivers/md/lvm.c 2004-01-18 14:58:09.106704262 +0100
2302 +++ linux-2.4.24/drivers/md/lvm.c 2004-01-18 15:57:55.568033496 +0100
2304 #define DEVICE_OFF(device)
2305 #define LOCAL_END_REQUEST
2307 -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
2308 -/* #define LVM_VFS_ENHANCEMENT */
2310 #include <linux/config.h>
2311 #include <linux/module.h>
2312 #include <linux/kernel.h>
2313 @@ -2250,12 +2247,8 @@
2314 if (lv_ptr->lv_access & LV_SNAPSHOT) {
2315 lv_t *org = lv_ptr->lv_snapshot_org, *last;
2317 - /* sync the original logical volume */
2318 - fsync_dev(org->lv_dev);
2319 -#ifdef LVM_VFS_ENHANCEMENT
2320 /* VFS function call to sync and lock the filesystem */
2321 fsync_dev_lockfs(org->lv_dev);
2324 down_write(&org->lv_lock);
2325 org->lv_access |= LV_SNAPSHOT_ORG;
2326 @@ -2281,11 +2274,9 @@
2328 set_device_ro(lv_ptr->lv_dev, 1);
2330 -#ifdef LVM_VFS_ENHANCEMENT
2331 /* VFS function call to unlock the filesystem */
2332 if (lv_ptr->lv_access & LV_SNAPSHOT)
2333 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
2336 lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
2337 lvm_fs_create_lv(vg_ptr, lv_ptr);
2338 diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
2339 --- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 15:09:18.620153502 +0100
2340 +++ linux-2.4.24/drivers/md/Makefile 2004-01-18 16:04:48.278616388 +0100
2342 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
2344 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
2345 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
2346 +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
2348 include $(TOPDIR)/Rules.make
2350 diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c
2351 --- linux-2.4.24.org/drivers/md/md.c 2004-01-18 14:58:09.227678566 +0100
2352 +++ linux-2.4.24/drivers/md/md.c 2004-01-18 16:04:27.702900923 +0100
2353 @@ -2146,6 +2146,8 @@
2357 + if (mddev->curr_resync)
2358 + info.state |= (1 << MD_ARRAY_RECOVERY_RUNNING);
2359 SET_FROM_SB(active_disks);
2360 SET_FROM_SB(working_disks);
2361 SET_FROM_SB(failed_disks);
2362 diff -urN linux-2.4.24.org/drivers/md/multipath.c linux-2.4.24/drivers/md/multipath.c
2363 --- linux-2.4.24.org/drivers/md/multipath.c 2004-01-18 14:58:09.254672832 +0100
2364 +++ linux-2.4.24/drivers/md/multipath.c 2004-01-18 16:04:38.291691263 +0100
2365 @@ -139,15 +139,16 @@
2366 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2368 multipath_conf_t *conf = mddev_to_conf(mddev);
2369 - int i, disks = MD_SB_DISKS;
2373 * Later we do read balancing on the read side
2374 * now we use the first available disk.
2377 - for (i = 0; i < disks; i++) {
2378 + for (i = 0; i < conf->nr_disks; i++) {
2379 if (conf->multipaths[i].operational) {
2380 + /* first operational is winner! */
2381 *rdev = conf->multipaths[i].dev;
2386 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2388 + atomic_dec(&mp_bh->multipath->nr_pending);
2391 * this branch is our 'one multipath IO has finished' event handler:
2393 @@ -223,19 +226,39 @@
2397 - * This routine returns the disk from which the requested read should
2399 + * Multipath read balance ...
2403 + * If no active paths
2407 + * If active paths == 1
2409 + * - 1st active path encountered
2411 + * If active paths > 1
2413 + * - 1st idle active path encountered
2414 + * - else ... the active path doing the least amount of work.
2417 static int multipath_read_balance (multipath_conf_t *conf)
2421 - for (disk = 0; disk < conf->raid_disks; disk++)
2422 - if (conf->multipaths[disk].operational)
2426 + int i, disk=-1, nr_pending, least_pending=0;
2428 + for (i=0; i<conf->nr_disks; i++) {
2429 + if (conf->multipaths[i].operational) {
2430 + nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2431 + if (nr_pending==0 || conf->working_disks==1)
2433 + if (least_pending==0 || nr_pending<least_pending) {
2435 + least_pending = nr_pending;
2442 static int multipath_make_request (mddev_t *mddev, int rw,
2444 struct buffer_head *bh_req;
2445 struct multipath_bh * mp_bh;
2446 struct multipath_info *multipath;
2449 if (!buffer_locked(bh))
2451 @@ -267,7 +291,16 @@
2453 * read balancing logic:
2455 - multipath = conf->multipaths + multipath_read_balance(conf);
2456 + disk = multipath_read_balance(conf);
2458 + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2459 + buffer_IO_error(bh);
2463 + multipath = conf->multipaths + disk;
2464 + mp_bh->multipath = multipath;
2465 + atomic_inc(&multipath->nr_pending);
2467 bh_req = &mp_bh->bh_req;
2468 memcpy(bh_req, bh, sizeof(*bh));
2469 @@ -331,13 +364,14 @@
2471 multipath_conf_t *conf = mddev_to_conf(mddev);
2472 struct multipath_info * multipaths = conf->multipaths;
2473 - int disks = MD_SB_DISKS;
2474 int other_paths = 1;
2478 + struct md_list_head *tmp;
2480 if (conf->working_disks == 1) {
2482 - for (i = 0; i < disks; i++) {
2483 + for (i = 0; i < MD_SB_DISKS; i++) {
2484 if (multipaths[i].spare) {
2487 @@ -351,16 +385,17 @@
2488 * first check if this is a queued request for a device
2489 * which has just failed.
2491 - for (i = 0; i < disks; i++) {
2492 + for (i = 0; i < MD_SB_DISKS; i++) {
2493 if (multipaths[i].dev==dev && !multipaths[i].operational)
2498 + mdp_super_t *sb = mddev->sb;
2500 * Mark disk as unusable
2502 - for (i = 0; i < disks; i++) {
2503 + for (i = 0; i < MD_SB_DISKS; i++) {
2504 if (multipaths[i].dev==dev && multipaths[i].operational) {
2505 mark_disk_bad(mddev, i);
2508 if (!conf->working_disks) {
2511 - mdp_super_t *sb = mddev->sb;
2513 spare = get_spare(mddev);
2515 @@ -384,6 +418,21 @@
2519 + /* prevent unnecessary work in md_do_recovery() */
2520 + if (conf->working_disks) {
2521 + conf->raid_disks = conf->working_disks
2522 + = sb->raid_disks = sb->active_disks;
2524 + /* update alias disk info to insure we can do sb commit. */
2525 + ITERATE_RDEV(mddev,rdev,tmp) {
2526 + if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2527 + rdev->alias_device = 0;
2530 + if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2531 + rdev->alias_device = 1;
2539 * This is a kernel thread which:
2541 - * 1. Retries failed read operations on working multipaths.
2542 + * 1. Retries failed operations on working multipaths.
2543 * 2. Updates the raid superblock when problems encounter.
2544 - * 3. Performs writes following reads for array syncronising.
2547 static void multipathd (void *data)
2549 mdk_rdev_t *rdev, *def_rdev = NULL;
2550 struct md_list_head *tmp;
2552 + int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2557 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2560 - * Mark all disks as spare to start with, then pick our
2561 - * active disk. If we have a disk that is marked active
2562 - * in the sb, then use it, else use the first rdev.
2563 + * Mark all disks as spare to start with.
2565 disk->number = desc->number;
2566 disk->raid_disk = desc->raid_disk;
2567 @@ -894,20 +941,21 @@
2568 mark_disk_sync(desc);
2570 if (disk_active(desc)) {
2571 - if(!conf->working_disks) {
2572 - printk(OPERATIONAL, partition_name(rdev->dev),
2574 - disk->operational = 1;
2576 - conf->working_disks++;
2579 - mark_disk_spare(desc);
2582 - mark_disk_spare(desc);
2583 + printk(OPERATIONAL, partition_name(rdev->dev),
2585 + disk->operational = 1;
2587 + conf->working_disks++;
2590 + } else if (disk_faulty(desc)) {
2597 - if(!num_rdevs++) def_rdev = rdev;
2600 if(!conf->working_disks && num_rdevs) {
2601 desc = &sb->disks[def_rdev->desc_nr];
2602 @@ -918,11 +966,12 @@
2604 conf->working_disks++;
2605 mark_disk_active(desc);
2609 - * Make sure our active path is in desc spot 0
2610 + * If there is only 1 active path ... make sure it is in desc spot 0
2612 - if(def_rdev->desc_nr != 0) {
2613 + if (active_disks == 1 && def_rdev->desc_nr != 0) {
2614 rdev = find_rdev_nr(mddev, 0);
2615 desc = &sb->disks[def_rdev->desc_nr];
2617 @@ -940,10 +989,10 @@
2618 def_rdev->desc_nr = 0;
2621 - conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2622 + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2623 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2624 - sb->failed_disks = 0;
2625 - sb->spare_disks = num_rdevs - 1;
2626 + sb->failed_disks = faulty_disks;
2627 + sb->spare_disks = spare_disks;
2628 mddev->sb_dirty = 1;
2629 conf->mddev = mddev;
2630 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2631 diff -urN linux-2.4.24.org/fs/buffer.c linux-2.4.24/fs/buffer.c
2632 --- linux-2.4.24.org/fs/buffer.c 2004-01-18 14:55:22.305275818 +0100
2633 +++ linux-2.4.24/fs/buffer.c 2004-01-18 15:57:55.602026171 +0100
2634 @@ -419,6 +419,34 @@
2638 +int fsync_dev_lockfs(kdev_t dev)
2640 + /* you are not allowed to try locking all the filesystems
2641 + ** on the system, your chances of getting through without
2642 + ** total deadlock are slim to none.
2645 + return fsync_dev(dev) ;
2647 + sync_buffers(dev, 0);
2650 + /* note, the FS might need to start transactions to
2651 + ** sync the inodes, or the quota, no locking until
2652 + ** after these are done
2655 + DQUOT_SYNC_DEV(dev);
2656 + /* if inodes or quotas could be dirtied during the
2657 + ** sync_supers_lockfs call, the FS is responsible for getting
2658 + ** them on disk, without deadlocking against the lock
2660 + sync_supers_lockfs(dev) ;
2663 + return sync_buffers(dev, 1) ;
2666 asmlinkage long sys_sync(void)
2669 diff -urN linux-2.4.24.org/fs/reiserfs/super.c linux-2.4.24/fs/reiserfs/super.c
2670 --- linux-2.4.24.org/fs/reiserfs/super.c 2004-01-18 14:55:18.875002271 +0100
2671 +++ linux-2.4.24/fs/reiserfs/super.c 2004-01-18 15:57:55.657014322 +0100
2673 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
2674 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
2675 reiserfs_block_writes(&th) ;
2676 - journal_end(&th, s, 1) ;
2677 + journal_end_sync(&th, s, 1) ;
2681 diff -urN linux-2.4.24.org/fs/super.c linux-2.4.24/fs/super.c
2682 --- linux-2.4.24.org/fs/super.c 2004-01-18 14:55:11.177633010 +0100
2683 +++ linux-2.4.24/fs/super.c 2004-01-18 15:57:55.687007859 +0100
2685 LIST_HEAD(super_blocks);
2686 spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
2689 + * lock/unlockfs grab a read lock on s_umount, but you need this lock to
2690 + * make sure no lockfs runs are in progress before inserting/removing
2691 + * supers from the list.
2693 +static DECLARE_MUTEX(lockfs_sem);
2696 * Handling of filesystem drivers list.
2698 @@ -436,6 +443,19 @@
2702 +static void write_super_lockfs(struct super_block *sb)
2705 + if (sb->s_root && sb->s_op) {
2706 + if (sb->s_dirt && sb->s_op->write_super)
2707 + sb->s_op->write_super(sb);
2708 + if (sb->s_op->write_super_lockfs) {
2709 + sb->s_op->write_super_lockfs(sb);
2715 static inline void write_super(struct super_block *sb)
2718 @@ -483,6 +503,39 @@
2719 spin_unlock(&sb_lock);
2723 + * Note: don't check the dirty flag before waiting, we want the lock
2724 + * to happen every time this is called. dev must be non-zero
2726 +void sync_supers_lockfs(kdev_t dev)
2728 + struct super_block * sb;
2730 + down(&lockfs_sem) ;
2732 + sb = get_super(dev);
2734 + write_super_lockfs(sb);
2740 +void unlockfs(kdev_t dev)
2742 + struct super_block * sb;
2745 + sb = get_super(dev);
2747 + if (sb->s_op && sb->s_op->unlockfs)
2748 + sb->s_op->unlockfs(sb) ;
2756 * get_super - get the superblock of a device
2757 * @dev: device to get the superblock for
2762 + down(&lockfs_sem);
2764 spin_lock(&sb_lock);
2767 ((flags ^ old->s_flags) & MS_RDONLY)) {
2768 spin_unlock(&sb_lock);
2773 if (!grab_super(old))
2774 @@ -720,12 +775,14 @@
2776 blkdev_put(bdev, BDEV_FS);
2784 insert_super(s, fs_type);
2786 if (!fs_type->read_super(s, data, flags & MS_VERBOSE ? 1 : 0))
2788 s->s_flags |= MS_ACTIVE;
2789 @@ -833,7 +890,10 @@
2790 if (!deactivate_super(sb))
2793 + down(&lockfs_sem);
2794 down_write(&sb->s_umount);
2798 /* Need to clean after the sucker */
2799 if (fs->fs_flags & FS_LITTER)
2800 diff -urN linux-2.4.24.org/include/linux/fs.h linux-2.4.24/include/linux/fs.h
2801 --- linux-2.4.24.org/include/linux/fs.h 2004-01-18 14:55:29.014855364 +0100
2802 +++ linux-2.4.24/include/linux/fs.h 2004-01-18 15:59:11.694692181 +0100
2803 @@ -1287,6 +1287,7 @@
2804 extern int sync_buffers(kdev_t, int);
2805 extern void sync_dev(kdev_t);
2806 extern int fsync_dev(kdev_t);
2807 +extern int fsync_dev_lockfs(kdev_t);
2808 extern int fsync_super(struct super_block *);
2809 extern int fsync_no_super(kdev_t);
2810 extern void sync_inodes_sb(struct super_block *);
2811 @@ -1305,6 +1306,8 @@
2812 extern int filemap_fdatasync(struct address_space *);
2813 extern int filemap_fdatawait(struct address_space *);
2814 extern void sync_supers(kdev_t dev, int wait);
2815 +extern void sync_supers_lockfs(kdev_t);
2816 +extern void unlockfs(kdev_t);
2817 extern int bmap(struct inode *, int);
2818 extern int notify_change(struct dentry *, struct iattr *);
2819 extern int permission(struct inode *, int);
2820 diff -urN linux-2.4.24.org/include/linux/raid/md_u.h linux-2.4.24/include/linux/raid/md_u.h
2821 --- linux-2.4.24.org/include/linux/raid/md_u.h 2004-01-18 14:55:35.554471508 +0100
2822 +++ linux-2.4.24/include/linux/raid/md_u.h 2004-01-18 16:04:27.764887949 +0100
2827 +#define MD_ARRAY_CLEAN 0
2828 +#define MD_ARRAY_ERRORS 1
2829 +#define MD_ARRAY_RECOVERY_RUNNING 2
2831 typedef struct mdu_array_info_s {
2833 * Generic constant information
2834 diff -urN linux-2.4.24.org/include/linux/raid/multipath.h linux-2.4.24/include/linux/raid/multipath.h
2835 --- linux-2.4.24.org/include/linux/raid/multipath.h 2004-01-18 14:55:35.563469605 +0100
2836 +++ linux-2.4.24/include/linux/raid/multipath.h 2004-01-18 16:04:38.329683369 +0100
2841 + atomic_t nr_pending; /* number of pending requests */
2844 struct multipath_private_data {
2846 struct buffer_head *master_bh;
2847 struct buffer_head bh_req;
2848 struct multipath_bh *next_mp; /* next for retry or in free list */
2849 + struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/
2851 /* bits for multipath_bh.state */
2852 #define MPBH_Uptodate 1
2853 diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c
2854 --- linux-2.4.24.org/kernel/ksyms.c 2004-01-18 14:55:22.698192617 +0100
2855 +++ linux-2.4.24/kernel/ksyms.c 2004-01-18 15:57:55.824978130 +0100
2857 EXPORT_SYMBOL(invalidate_inode_pages);
2858 EXPORT_SYMBOL(truncate_inode_pages);
2859 EXPORT_SYMBOL(fsync_dev);
2860 +EXPORT_SYMBOL(fsync_dev_lockfs);
2861 +EXPORT_SYMBOL(unlockfs);
2862 EXPORT_SYMBOL(fsync_no_super);
2863 EXPORT_SYMBOL(permission);
2864 EXPORT_SYMBOL(vfs_permission);