1 diff -urN linux-2.4.22/drivers/md/Config.in linux-2.4.22-evms/drivers/md/Config.in
2 --- linux-2.4.22/drivers/md/Config.in 2003-09-15 17:07:45.000000000 +0200
3 +++ linux-2.4.22-evms/drivers/md/Config.in 2003-09-15 17:09:48.000000000 +0200
5 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
6 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
7 dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
8 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9 + dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10 + dep_tristate ' Sparse Device Target' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
14 diff -urN linux-2.4.22/drivers/md/Makefile linux-2.4.22-evms/drivers/md/Makefile
15 --- linux-2.4.22/drivers/md/Makefile 2003-09-15 17:07:45.000000000 +0200
16 +++ linux-2.4.22-evms/drivers/md/Makefile 2003-09-15 17:09:48.000000000 +0200
19 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
20 obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
21 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
22 +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
24 include $(TOPDIR)/Rules.make
26 diff -urN linux-2.4.22/drivers/md/dm-bbr.c linux-2.4.22-evms/drivers/md/dm-bbr.c
27 --- linux-2.4.22/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
28 +++ linux-2.4.22-evms/drivers/md/dm-bbr.c 2003-09-15 17:08:42.000000000 +0200
31 + * Copyright (c) International Business Machines Corp., 2002-2003
33 + * This program is free software; you can redistribute it and/or modify
34 + * it under the terms of the GNU General Public License as published by
35 + * the Free Software Foundation; either version 2 of the License, or
36 + * (at your option) any later version.
38 + * This program is distributed in the hope that it will be useful,
39 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
41 + * the GNU General Public License for more details.
43 + * You should have received a copy of the GNU General Public License
44 + * along with this program; if not, write to the Free Software
45 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
47 + * linux/drivers/md/dm-bbr.c
49 + * Bad-block-relocation (BBR) target for device-mapper.
51 + * The BBR target is designed to remap I/O write failures to another safe
52 + * location on disk. Note that most disk drives have BBR built into them,
53 + * this means that our software BBR will be only activated when all hardware
54 + * BBR replacement sectors have been used.
57 +#include <linux/kernel.h>
58 +#include <linux/module.h>
59 +#include <linux/init.h>
60 +#include <linux/blkdev.h>
61 +#include <linux/spinlock.h>
62 +#include <linux/smp_lock.h>
63 +#include <linux/slab.h>
64 +#include <linux/mempool.h>
67 +#include "dm-daemon.h"
70 +/* Number of active BBR devices. */
71 +static int bbr_instances = 0;
72 +static DECLARE_MUTEX(bbr_instances_lock);
74 +/* Data pertaining to the I/O thread. */
75 +static struct dm_daemon * bbr_io_thread = NULL;
76 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
77 +static LIST_HEAD(bbr_io_list);
78 +static void bbr_io_handler(void);
80 +/* Global pools for bbr_io_buf's and bbr_remap's. */
81 +static kmem_cache_t * bbr_io_buf_cache;
82 +static mempool_t * bbr_io_buf_pool;
83 +static kmem_cache_t * bbr_remap_cache;
84 +static mempool_t * bbr_remap_pool;
86 +static void bbr_free_remap(struct bbr_private * bbr_id);
91 + * Delete the pools for the remap list and I/O anchors.
93 +static void destroy_pools(void)
95 + if (bbr_io_buf_pool) {
96 + mempool_destroy(bbr_io_buf_pool);
97 + bbr_io_buf_pool = NULL;
99 + if (bbr_io_buf_cache) {
100 + kmem_cache_destroy(bbr_io_buf_cache);
101 + bbr_io_buf_cache = NULL;
103 + if (bbr_remap_pool) {
104 + mempool_destroy(bbr_remap_pool);
105 + bbr_remap_pool = NULL;
107 + if (bbr_remap_cache) {
108 + kmem_cache_destroy(bbr_remap_cache);
109 + bbr_remap_cache = NULL;
116 + * Create mempools for the remap list and I/O anchors.
118 +static int create_pools(void)
120 + if (!bbr_remap_cache) {
121 + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
122 + sizeof(struct bbr_runtime_remap),
123 + 0, SLAB_HWCACHE_ALIGN,
125 + if (!bbr_remap_cache) {
126 + DMERR("Unable to create BBR remap cache.");
130 + if (!bbr_remap_pool) {
131 + bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
134 + if (!bbr_remap_pool) {
135 + DMERR("Unable to create BBR remap mempool.");
140 + if (!bbr_io_buf_cache) {
141 + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
142 + sizeof(struct bbr_io_buffer),
143 + 0, SLAB_HWCACHE_ALIGN,
145 + if (!bbr_io_buf_cache) {
146 + DMERR("Unable to create BBR I/O buffer cache.");
150 + if (!bbr_io_buf_pool) {
151 + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
154 + if (!bbr_io_buf_pool) {
155 + DMERR("Unable to create BBR I/O buffer mempool.");
161 + if (!bbr_remap_cache || !bbr_remap_pool ||
162 + !bbr_io_buf_cache || !bbr_io_buf_pool ) {
173 + * Use the dm-daemon services to stop the BBR I/O thread.
175 +static void stop_io_thread(void)
177 + if (bbr_io_thread) {
178 + dm_daemon_stop(bbr_io_thread);
179 + kfree(bbr_io_thread);
180 + bbr_io_thread = NULL;
187 + * Use the dm-daemon services to start the BBR I/O thread.
189 +static int start_io_thread(void)
193 + if (!bbr_io_thread) {
194 + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
195 + if (!bbr_io_thread) {
199 + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
201 + kfree(bbr_io_thread);
212 + * Set up the mempools, I/O thread, and sync-I/O service. This should
213 + * be called only when the first bbr device is created.
215 +static int bbr_global_init(void)
219 + rc = create_pools();
224 + rc = start_io_thread();
242 + * bbr_global_cleanup
244 + * Cleanup the mempools, I/O thread and sync-I/O service. This should
245 + * be called only when the last bbr device is removed.
247 +static void bbr_global_cleanup(void)
254 +static struct bbr_private * bbr_alloc_private(void)
256 + struct bbr_private * bbr_id;
258 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
260 + memset(bbr_id, 0, sizeof(*bbr_id));
261 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
262 + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
268 +static void bbr_free_private(struct bbr_private * bbr_id)
270 + if (bbr_id->bbr_table) {
271 + kfree(bbr_id->bbr_table);
273 + bbr_free_remap(bbr_id);
277 +static u32 crc_table[256];
278 +static u32 crc_table_built = 0;
280 +static void build_crc_table(void)
284 + for (i = 0; i <= 255; i++) {
286 + for (j = 8; j > 0; j--) {
288 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
292 + crc_table[i] = crc;
294 + crc_table_built = 1;
297 +static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize)
299 + unsigned char * current_byte;
300 + u32 temp1, temp2, i;
302 + current_byte = (unsigned char *) buffer;
303 + /* Make sure the crc table is available */
304 + if (!crc_table_built)
306 + /* Process each byte in the buffer. */
307 + for (i = 0; i < buffersize; i++) {
308 + temp1 = (crc >> 8) & 0x00FFFFFF;
309 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
312 + crc = temp1 ^ temp2;
318 + * le_bbr_table_sector_to_cpu
320 + * Convert bbr meta data from on-disk (LE) format
321 + * to the native cpu endian format.
323 +static void le_bbr_table_sector_to_cpu(struct bbr_table * p)
326 + p->signature = le32_to_cpup(&p->signature);
327 + p->crc = le32_to_cpup(&p->crc);
328 + p->sequence_number = le32_to_cpup(&p->sequence_number);
329 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
330 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
331 + p->entries[i].bad_sect =
332 + le64_to_cpup(&p->entries[i].bad_sect);
333 + p->entries[i].replacement_sect =
334 + le64_to_cpup(&p->entries[i].replacement_sect);
339 + * cpu_bbr_table_sector_to_le
341 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
343 +static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
344 + struct bbr_table * le)
347 + le->signature = cpu_to_le32p(&p->signature);
348 + le->crc = cpu_to_le32p(&p->crc);
349 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
350 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
351 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
352 + le->entries[i].bad_sect =
353 + cpu_to_le64p(&p->entries[i].bad_sect);
354 + le->entries[i].replacement_sect =
355 + cpu_to_le64p(&p->entries[i].replacement_sect);
360 + * validate_bbr_table_sector
362 + * Check the specified BBR table sector for a valid signature and CRC. If it's
363 + * valid, endian-convert the table sector.
365 +static int validate_bbr_table_sector(struct bbr_table * p)
368 + int org_crc, final_crc;
370 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
371 + DMERR("BBR table signature doesn't match!");
372 + DMERR("Found 0x%x. Expecting 0x%x",
373 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
379 + DMERR("BBR table sector has no CRC!");
384 + org_crc = le32_to_cpup(&p->crc);
386 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
387 + if (final_crc != org_crc) {
388 + DMERR("CRC failed!");
389 + DMERR("Found 0x%x. Expecting 0x%x",
390 + org_crc, final_crc);
395 + p->crc = cpu_to_le32p(&org_crc);
396 + le_bbr_table_sector_to_cpu(p);
403 + * bbr_binary_tree_insert
405 + * Insert a node into the binary tree.
407 +static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
408 + struct bbr_runtime_remap * newnode)
410 + struct bbr_runtime_remap ** node = root;
411 + while (node && *node) {
412 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
413 + node = &((*node)->right);
415 + node = &((*node)->left);
419 + newnode->left = newnode->right = NULL;
424 + * bbr_binary_search
426 + * Search for a node that contains bad_sect == lsn.
428 +static struct bbr_runtime_remap * bbr_binary_search(
429 + struct bbr_runtime_remap * root,
432 + struct bbr_runtime_remap * node = root;
434 + if (node->remap.bad_sect == lsn) {
437 + if (lsn > node->remap.bad_sect) {
438 + node = node->right;
447 + * bbr_binary_tree_destroy
449 + * Destroy the binary tree.
451 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
452 + struct bbr_private * bbr_id)
454 + struct bbr_runtime_remap ** link = NULL;
455 + struct bbr_runtime_remap * node = root;
459 + link = &(node->left);
464 + link = &(node->right);
465 + node = node->right;
469 + mempool_free(node, bbr_remap_pool);
470 + if (node == root) {
471 + /* If root is deleted, we're done. */
475 + /* Back to root. */
481 +static void bbr_free_remap(struct bbr_private * bbr_id)
483 + spin_lock_irq(&bbr_id->bbr_id_lock);
484 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
485 + bbr_id->remap_root = NULL;
486 + spin_unlock_irq(&bbr_id->bbr_id_lock);
490 + * bbr_insert_remap_entry
492 + * Create a new remap entry and add it to the binary tree for this node.
494 +static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
495 + struct bbr_table_entry * new_bbr_entry)
497 + struct bbr_runtime_remap * newnode;
499 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
501 + DMERR("Could not allocate from remap mempool!");
504 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
505 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
506 + spin_lock_irq(&bbr_id->bbr_id_lock);
507 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
508 + spin_unlock_irq(&bbr_id->bbr_id_lock);
513 + * bbr_table_to_remap_list
515 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
516 + * improve run time performance, the in memory remap list must be sorted by
517 + * the bad sector LBA. This function is called at discovery time to initialize
518 + * the remap list. This function assumes that at least one copy of meta data
521 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
523 + u32 in_use_blks = 0;
525 + struct bbr_table * p;
528 + for (i = 0, p = bbr_id->bbr_table;
529 + i < bbr_id->nr_sects_bbr_table;
531 + if (!p->in_use_cnt) {
534 + in_use_blks += p->in_use_cnt;
535 + for (j = 0; j < p->in_use_cnt; j++) {
536 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
540 + DMWARN("There are %u BBR entries for device %u:%u",
541 + in_use_blks, MAJOR(bbr_id->dev->dev),
542 + MINOR(bbr_id->dev->dev));
544 + return in_use_blks;
548 + * bbr_search_remap_entry
550 + * Search remap entry for the specified sector. If found, return a pointer to
551 + * the table entry. Otherwise, return NULL.
553 +static struct bbr_table_entry * bbr_search_remap_entry(
554 + struct bbr_private * bbr_id,
557 + struct bbr_runtime_remap * p;
559 + spin_lock_irq(&bbr_id->bbr_id_lock);
560 + p = bbr_binary_search(bbr_id->remap_root, lsn);
561 + spin_unlock_irq(&bbr_id->bbr_id_lock);
563 + return (&p->remap);
572 + * If *lsn is in the remap table, return TRUE and modify *lsn,
573 + * else, return FALSE.
575 +static inline int bbr_remap(struct bbr_private * bbr_id,
578 + struct bbr_table_entry * e;
580 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
581 + e = bbr_search_remap_entry(bbr_id, *lsn);
583 + *lsn = e->replacement_sect;
593 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
594 + * table, return TRUE. Else, return FALSE.
596 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
597 + u64 lsn, u64 nr_sects)
601 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
602 + for (cnt = 0, tmp = lsn;
604 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
605 + if (bbr_remap(bbr_id,&tmp)) {
616 + * Read the remap tables from disk and set up the initial remap tree.
618 +static int bbr_setup(struct bbr_private * bbr_id)
620 + struct bbr_table * table = bbr_id->bbr_table;
621 + struct page * page;
622 + struct io_region job;
623 + unsigned int error, offset;
626 + job.dev = bbr_id->dev->dev;
629 + /* Read and verify each BBR table sector individually. */
630 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
631 + job.sector = bbr_id->lba_table1 + i;
632 + page = virt_to_page(table);
633 + offset = (unsigned long)table & ~PAGE_MASK;
634 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
635 + if (rc && bbr_id->lba_table2) {
636 + job.sector = bbr_id->lba_table2 + i;
637 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
643 + rc = validate_bbr_table_sector(table);
648 + atomic_set(&bbr_id->in_use_replacement_blks,
649 + bbr_table_to_remap_list(bbr_id));
653 + DMERR("dm-bbr: error during device setup: %d", rc);
658 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
659 + struct buffer_head * bh,
662 + struct bbr_io_buffer * bbr_io_buf;
664 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
666 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
667 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
668 + bbr_io_buf->bbr_id = bbr_id;
669 + bbr_io_buf->sector = bh->b_rsector;
670 + bbr_io_buf->bh = bh;
671 + bbr_io_buf->rw = rw;
673 + DMWARN("Could not allocate from BBR I/O buffer pool!");
678 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
680 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
684 + * bbr_io_remap_error
685 + * @bbr_id: Private data for the BBR node.
686 + * @rw: READ or WRITE.
687 + * @starting_lsn: Starting sector of request to remap.
688 + * @count: Number of sectors in the request.
689 + * @buffer: Data buffer for the request.
691 + * For the requested range, try to write each sector individually. For each
692 + * sector that fails, find the next available remap location and write the
693 + * data to that new location. Then update the table and write both copies
694 + * of the table to disk. Finally, update the in-memory mapping and do any
695 + * other necessary bookkeeping.
697 +static int bbr_io_remap_error(struct bbr_private * bbr_id,
703 + struct bbr_table * bbr_table;
704 + struct io_region job;
705 + struct page * page;
706 + unsigned long table_sector_index;
707 + unsigned long table_sector_offset;
708 + unsigned long index;
709 + unsigned int offset_in_page, error;
714 + /* Nothing can be done about read errors. */
718 + job.dev = bbr_id->dev->dev;
720 + /* For each sector in the request. */
721 + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
722 + job.sector = starting_lsn + lsn;
724 + page = virt_to_page(buffer);
725 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
726 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
728 + /* Find the next available relocation sector. */
729 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
730 + if (new_lsn >= bbr_id->nr_replacement_blks) {
731 + /* No more replacement sectors available. */
734 + new_lsn += bbr_id->start_replacement_sect;
736 + /* Write the data to its new location. */
737 + DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64,
738 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev),
739 + starting_lsn + lsn, new_lsn);
740 + job.sector = new_lsn;
741 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
743 + /* This replacement sector is bad.
744 + * Try the next one.
746 + DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. Skipping.",
747 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn);
748 + atomic_inc(&bbr_id->in_use_replacement_blks);
752 + /* Add this new entry to the on-disk table. */
753 + table_sector_index = new_lsn -
754 + bbr_id->start_replacement_sect;
755 + table_sector_offset = table_sector_index /
756 + BBR_ENTRIES_PER_SECT;
757 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
759 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
760 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
761 + bbr_table->entries[index].replacement_sect = new_lsn;
762 + bbr_table->in_use_cnt++;
763 + bbr_table->sequence_number++;
764 + bbr_table->crc = 0;
765 + bbr_table->crc = calculate_crc(INITIAL_CRC,
767 + sizeof(struct bbr_table));
769 + /* Write the table to disk. */
770 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
771 + page = virt_to_page(bbr_table);
772 + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
773 + if (bbr_id->lba_table1) {
774 + job.sector = bbr_id->lba_table1 + table_sector_offset;
776 + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
778 + if (bbr_id->lba_table2) {
779 + job.sector = bbr_id->lba_table2 + table_sector_offset;
780 + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
782 + le_bbr_table_sector_to_cpu(bbr_table);
785 + /* Error writing one of the tables to disk. */
786 + DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.",
787 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
791 + /* Insert a new entry in the remapping binary-tree. */
792 + rc = bbr_insert_remap_entry(bbr_id,
793 + &bbr_table->entries[index]);
795 + DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.",
796 + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
800 + atomic_inc(&bbr_id->in_use_replacement_blks);
808 + * bbr_io_process_request
810 + * For each sector in this request, check if the sector has already
811 + * been remapped. If so, process all previous sectors in the request,
812 + * followed by the remapped sector. Then reset the starting lsn and
813 + * count, and keep going with the rest of the request as if it were
814 + * a whole new request. If any of the sync_io's return an error,
815 + * call the remapper to relocate the bad sector(s).
817 +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
819 + struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
820 + struct io_region job;
821 + u64 starting_lsn = bbr_io_buf->sector;
822 + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
823 + u64 lsn, remapped_lsn;
824 + char * buffer = bbr_io_buf->bh->b_data;
825 + struct page * page = virt_to_page(buffer);
826 + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
827 + unsigned int error;
828 + int rw = bbr_io_buf->rw;
831 + job.dev = bbr_id->dev->dev;
833 + /* For each sector in this request, check if this sector has already
834 + * been remapped. If so, process all previous sectors in this request,
835 + * followed by the remapped sector. Then reset the starting lsn and
836 + * count and keep going with the rest of the request as if it were
837 + * a whole new request.
839 + for (lsn = 0; lsn < count; lsn++) {
840 + remapped_lsn = starting_lsn + lsn;
841 + rc = bbr_remap(bbr_id, &remapped_lsn);
843 + /* This sector is fine. */
847 + /* Process all sectors in the request up to this one. */
849 + job.sector = starting_lsn;
851 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
853 + /* If this I/O failed, then one of the sectors
854 + * in this request needs to be relocated.
856 + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
862 + buffer += (lsn << SECTOR_SHIFT);
863 + page = virt_to_page(buffer);
864 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
867 + /* Process the remapped sector. */
868 + job.sector = remapped_lsn;
870 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
872 + /* BUGBUG - Need more processing if this caused
873 + * an error. If this I/O failed, then the existing
874 + * remap is now bad, and we need to find a new remap.
875 + * Can't use bbr_io_remap_error(), because the existing
876 + * map entry needs to be changed, not added again, and
877 + * the original table entry also needs to be changed.
882 + buffer += SECTOR_SIZE;
883 + starting_lsn += (lsn + 1);
884 + count -= (lsn + 1);
886 + page = virt_to_page(buffer);
887 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
890 + /* Check for any remaining sectors after the last split. This could
891 + * potentially be the whole request, but that should be a rare case
892 + * because requests should only be processed by the thread if we know
893 + * an error occurred or they contained one or more remapped sectors.
896 + job.sector = starting_lsn;
898 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
900 + /* If this I/O failed, then one of the sectors in this
901 + * request needs to be relocated.
903 + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
917 + * This is the handler for the bbr_io_thread. It continuously loops,
918 + * taking I/O requests off its list and processing them. If nothing
919 + * is on the list, the thread goes back to sleep until specifically
922 + * I/O requests should only be sent to this thread if we know that:
923 + * a) the request contains at least one remapped sector.
925 + * b) the request caused an error on the normal I/O path.
926 + * This function uses synchronous I/O, so sending a request to this
927 + * thread that doesn't need special processing will cause severe
928 + * performance degradation.
930 +static void bbr_io_handler(void)
932 + struct bbr_io_buffer * bbr_io_buf;
933 + struct buffer_head * bh;
934 + unsigned long flags;
938 + /* Process bbr_io_list, one entry at a time. */
939 + spin_lock_irqsave(&bbr_io_list_lock, flags);
940 + if (list_empty(&bbr_io_list)) {
941 + /* No more items on the list. */
942 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
945 + bbr_io_buf = list_entry(bbr_io_list.next,
946 + struct bbr_io_buffer, bbr_io_list);
947 + list_del_init(&bbr_io_buf->bbr_io_list);
948 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
950 + rc = bbr_io_process_request(bbr_io_buf);
952 + /* Clean up and complete the original I/O. */
953 + bbr_io_buf->flags |= BBR_IO_HANDLED;
954 + bh = bbr_io_buf->bh;
955 + if (bh->b_end_io) {
956 + /* If this was the bbr_io_buf for an error on the
957 + * normal WRITE, don't free it here. It will be
958 + * freed later in bbr_callback()
960 + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
961 + free_bbr_io_buf(bbr_io_buf);
962 + bh->b_end_io(bh, rc ? 0 : 1);
970 + * Place the specified bbr_io_buf on the thread's processing list.
972 +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
974 + unsigned long flags;
975 + spin_lock_irqsave(&bbr_io_list_lock, flags);
976 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
977 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
978 + dm_daemon_wake(bbr_io_thread);
984 + * If there are any remapped sectors on this object, send this request over
985 + * to the thread for processing. Otherwise send it down the stack normally.
987 +static int bbr_read(struct bbr_private * bbr_id,
988 + struct buffer_head * bh)
990 + struct bbr_io_buffer * bbr_io_buf;
993 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
994 + !bbr_remap_probe(bbr_id, bh->b_rsector,
995 + bh->b_size >> SECTOR_SHIFT)) {
996 + /* No existing remaps or this request doesn't
997 + * contain any remapped sectors.
999 + bh->b_rdev = bbr_id->dev->dev;
1003 + /* This request has at least one remapped sector. */
1004 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
1005 + if (!bbr_io_buf) {
1006 + /* Can't get memory to track the I/O. */
1007 + bh->b_end_io(bh, 0);
1011 + bbr_schedule_io(bbr_io_buf);
1018 + * This is the callback for normal write requests. Check for an error
1019 + * during the I/O, and send to the thread for processing if necessary.
1021 +static int bbr_callback(struct dm_target * ti,
1022 + struct buffer_head * bh,
1025 + union map_info * map_context)
1027 + struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr;
1032 + /* Will try to relocate the WRITE if:
1033 + * - It is an error, and
1034 + * - It is not an error of BBR relocation, and
1036 + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1037 + DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.",
1038 + MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
1039 + (unsigned long)bbr_io_buf->sector);
1040 + /* Indicate this bbr_io_buf is for an error on normal WRITE */
1041 + bbr_io_buf->flags |= BBR_IO_RELOCATE;
1042 + bbr_schedule_io(bbr_io_buf);
1043 + /* Returns >0 so that DM will let us retry the I/O */
1047 + free_bbr_io_buf(bbr_io_buf);
1054 + * If there are any remapped sectors on this object, send the request over
1055 + * to the thread for processing. Otherwise, register for callback
1056 + * notification, and send the request down normally.
1058 +static int bbr_write(struct bbr_private * bbr_id,
1059 + struct buffer_head * bh,
1060 + union map_info * map_context)
1062 + struct bbr_io_buffer * bbr_io_buf;
1064 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1065 + if (!bbr_io_buf) {
1066 + /* Can't get memory to track the I/O. */
1067 + bh->b_end_io(bh, 0);
1071 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1072 + !bbr_remap_probe(bbr_id, bh->b_rsector,
1073 + bh->b_size >> SECTOR_SHIFT)) {
1074 + /* No existing remaps or this request
1075 + * contains no remapped sectors.
1077 + bh->b_rdev = bbr_id->dev->dev;
1078 + map_context->ptr = bbr_io_buf;
1081 + /* This request contains at least one remapped sector. */
1082 + map_context->ptr = NULL;
1083 + bbr_schedule_io(bbr_io_buf);
1089 + * Construct a bbr mapping
1091 +static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv)
1093 + struct bbr_private * bbr_id;
1099 + ti->error = "dm-bbr requires exactly 8 arguments: "
1100 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1104 + bbr_id = bbr_alloc_private();
1106 + ti->error = "dm-bbr: Error allocating bbr private data.";
1110 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1111 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1112 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1113 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1114 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1115 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1116 + block_size = simple_strtoul(argv[7], &end, 10);
1117 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1119 + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1121 + if (!bbr_id->bbr_table) {
1122 + ti->error = "dm-bbr: Error allocating bbr table.";
1126 + if (dm_get_device(ti, argv[0], 0, ti->len,
1127 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
1128 + ti->error = "dm-bbr: Device lookup failed";
1132 + /* Using a semaphore here is probably overkill,
1133 + * but at least it will be correct.
1135 + down(&bbr_instances_lock);
1136 + if (bbr_instances == 0) {
1137 + rc = bbr_global_init();
1139 + up(&bbr_instances_lock);
1144 + up(&bbr_instances_lock);
1146 + rc = bbr_setup(bbr_id);
1148 + ti->error = "dm-bbr: Device setup failed";
1152 + ti->private = bbr_id;
1156 + down(&bbr_instances_lock);
1158 + if (bbr_instances == 0) {
1159 + bbr_global_cleanup();
1161 + up(&bbr_instances_lock);
1164 + dm_put_device(ti, bbr_id->dev);
1166 + bbr_free_private(bbr_id);
1171 +static void bbr_dtr(struct dm_target * ti)
1173 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1175 + dm_put_device(ti, bbr_id->dev);
1176 + bbr_free_private(bbr_id);
1178 + down(&bbr_instances_lock);
1180 + if (bbr_instances == 0) {
1181 + bbr_global_cleanup();
1183 + up(&bbr_instances_lock);
1186 +static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw,
1187 + union map_info * map_context)
1189 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1191 + bh->b_rsector += bbr_id->offset;
1195 + map_context->ptr = NULL;
1196 + return bbr_read(bbr_id, bh);
1198 + return bbr_write(bbr_id, bh, map_context);
1204 +static int bbr_status(struct dm_target * ti, status_type_t type,
1205 + char * result, unsigned int maxlen)
1207 + struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1210 + case STATUSTYPE_INFO:
1214 + case STATUSTYPE_TABLE:
1215 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1216 + dm_kdevname(bbr_id->dev->dev), bbr_id->offset,
1217 + bbr_id->lba_table1, bbr_id->lba_table2,
1218 + bbr_id->nr_sects_bbr_table,
1219 + bbr_id->start_replacement_sect,
1220 + bbr_id->nr_replacement_blks,
1221 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
1227 +static struct target_type bbr_target = {
1229 + module: THIS_MODULE,
1233 + end_io: bbr_callback,
1234 + status: bbr_status,
1237 +int __init dm_bbr_init(void)
1239 + int r = dm_register_target(&bbr_target);
1242 + DMERR("dm-bbr: register failed %d", r);
1247 +void __exit dm_bbr_exit(void)
1249 + int r = dm_unregister_target(&bbr_target);
1252 + DMERR("dm-bbr: unregister failed %d", r);
1255 +module_init(dm_bbr_init);
1256 +module_exit(dm_bbr_exit);
1257 +MODULE_LICENSE("GPL");
1258 diff -urN linux-2.4.22/drivers/md/dm-bbr.h linux-2.4.22-evms/drivers/md/dm-bbr.h
1259 --- linux-2.4.22/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1260 +++ linux-2.4.22-evms/drivers/md/dm-bbr.h 2003-09-15 17:08:42.000000000 +0200
1263 + * Copyright (c) International Business Machines Corp., 2002-2003
1265 + * This program is free software; you can redistribute it and/or modify
1266 + * it under the terms of the GNU General Public License as published by
1267 + * the Free Software Foundation; either version 2 of the License, or
1268 + * (at your option) any later version.
1270 + * This program is distributed in the hope that it will be useful,
1271 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1272 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1273 + * the GNU General Public License for more details.
1275 + * You should have received a copy of the GNU General Public License
1276 + * along with this program; if not, write to the Free Software
1277 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1279 + * linux/drivers/md/dm-bbr.h
1281 + * Bad-block-relocation (BBR) target for device-mapper.
1283 + * The BBR target is designed to remap I/O write failures to another safe
1284 + * location on disk. Note that most disk drives have BBR built into them,
1285 + * this means that our software BBR will be only activated when all hardware
1286 + * BBR replacement sectors have been used.
1292 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1293 +#define BBR_ENTRIES_PER_SECT 31
1294 +#define BBR_NR_BUFS 128
1295 +#define INITIAL_CRC 0xFFFFFFFF
1296 +#define CRC_POLYNOMIAL 0xEDB88320L
1299 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1300 + * Use these in place of %Ld, %Lu, and %Lx.
1302 +#if BITS_PER_LONG > 32
1303 +#define PFU64 "%lu"
1305 +#define PFU64 "%Lu"
1309 + * struct bbr_table_entry
1310 + * @bad_sect: LBA of bad location.
1311 + * @replacement_sect: LBA of new location.
1313 + * Structure to describe one BBR remap.
1315 +struct bbr_table_entry {
1317 + u64 replacement_sect;
1321 + * struct bbr_table
1322 + * @signature: Signature on each BBR table sector.
1323 + * @crc: CRC for this table sector.
1324 + * @sequence_number: Used to resolve conflicts when primary and secondary
1325 + * tables do not match.
1326 + * @in_use_cnt: Number of in-use table entries.
1327 + * @entries: Actual table of remaps.
1329 + * Structure to describe each sector of the metadata table. Each sector in this
1330 + * table can describe 31 remapped sectors.
1335 + u32 sequence_number;
1337 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1341 + * struct bbr_runtime_remap
1343 + * Node in the binary tree used to keep track of remaps.
1345 +struct bbr_runtime_remap {
1346 + struct bbr_table_entry remap;
1347 + struct bbr_runtime_remap *left;
1348 + struct bbr_runtime_remap *right;
1352 + * struct bbr_private
1353 + * @dev: Info about underlying device.
1354 + * @bbr_table: Copy of metadata table.
1355 + * @offset: LBA of data area.
1356 + * @lba_table1: LBA of primary BBR table.
1357 + * @lba_table2: LBA of secondary BBR table.
1358 + * @nr_sects_bbr_table: Size of each BBR table.
1359 + * @nr_replacement_blks: Number of replacement blocks.
1360 + * @start_replacement_sect: LBA of start of replacement blocks.
1361 + * @blksize_in_sects: Size of each block.
1362 + * @in_use_replacement_blks: Current number of remapped blocks.
1363 + * @remap_root: Binary tree containing all remaps.
1364 + * @bbr_id_lock: Lock for the binary tree.
1366 + * Private data for each BBR target.
1368 +struct bbr_private {
1369 + struct dm_dev * dev;
1370 + struct bbr_table * bbr_table;
1371 + struct bbr_runtime_remap * remap_root;
1375 + u64 nr_sects_bbr_table;
1376 + u64 start_replacement_sect;
1377 + u64 nr_replacement_blks;
1378 + u32 blksize_in_sects;
1379 + atomic_t in_use_replacement_blks;
1380 + spinlock_t bbr_id_lock;
1383 +#define BBR_IO_HANDLED (1<<0)
1384 +#define BBR_IO_RELOCATE (1<<1)
1387 + * struct bbr_io_buffer
1388 + * @bbr_io_list: Thread's list of bbr_io_buf's.
1389 + * @bbr_id: Object for this request.
1390 + * @bh: Original buffer_head.
1391 + * @sector: Original sector
1392 + * @flags: Operation flag (BBR_IO_*)
1393 + * @rw: READ or WRITE.
1394 + * @rc: Return code from bbr_io_handler.
1396 + * Structure used to track each write request.
1398 +struct bbr_io_buffer {
1399 + struct list_head bbr_io_list;
1400 + struct bbr_private *bbr_id;
1401 + struct buffer_head *bh;
1410 diff -urN linux-2.4.22/drivers/md/dm-snapshot.c linux-2.4.22-evms/drivers/md/dm-snapshot.c
1411 --- linux-2.4.22/drivers/md/dm-snapshot.c 2003-09-15 17:07:45.000000000 +0200
1412 +++ linux-2.4.22-evms/drivers/md/dm-snapshot.c 2003-09-15 17:08:35.000000000 +0200
1415 /* List of snapshots for this origin */
1416 struct list_head snapshots;
1418 + /* Count of snapshots and origins referencing this structure. */
1419 + unsigned int count;
1423 @@ -155,6 +158,35 @@
1427 + * Allocate and initialize an origin structure.
1429 +static struct origin * __alloc_origin(kdev_t dev)
1431 + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1434 + INIT_LIST_HEAD(&o->hash_list);
1435 + INIT_LIST_HEAD(&o->snapshots);
1436 + __insert_origin(o);
1441 +static void __get_origin(struct origin *o)
1446 +static void __put_origin(struct origin *o)
1449 + if (o->count == 0) {
1450 + list_del(&o->hash_list);
1456 * Make a note of the snapshot and its origin so we can look it
1457 * up when the origin has a write on it.
1459 @@ -168,20 +200,37 @@
1463 - o = kmalloc(sizeof(*o), GFP_KERNEL);
1464 + o = __alloc_origin(dev);
1466 up_write(&_origins_lock);
1471 - /* Initialise the struct */
1472 - INIT_LIST_HEAD(&o->snapshots);
1475 + list_add_tail(&snap->list, &o->snapshots);
1477 - __insert_origin(o);
1478 + up_write(&_origins_lock);
1482 +static int register_origin(kdev_t dev)
1486 + down_write(&_origins_lock);
1487 + o = __lookup_origin(dev);
1491 + o = __alloc_origin(dev);
1493 + up_write(&_origins_lock);
1498 - list_add_tail(&snap->list, &o->snapshots);
1501 up_write(&_origins_lock);
1503 @@ -195,11 +244,18 @@
1504 o = __lookup_origin(s->origin->dev);
1507 - if (list_empty(&o->snapshots)) {
1508 - list_del(&o->hash_list);
1513 + up_write(&_origins_lock);
1516 +static void unregister_origin(kdev_t dev)
1520 + down_write(&_origins_lock);
1521 + o = __lookup_origin(dev);
1523 up_write(&_origins_lock);
1526 @@ -1090,6 +1146,13 @@
1530 + r = register_origin(dev->dev);
1532 + ti->error = "Cannot register origin";
1533 + dm_put_device(ti, dev);
1540 @@ -1097,6 +1160,7 @@
1541 static void origin_dtr(struct dm_target *ti)
1543 struct dm_dev *dev = (struct dm_dev *) ti->private;
1544 + unregister_origin(dev->dev);
1545 dm_put_device(ti, dev);
1548 diff -urN linux-2.4.22/drivers/md/dm-sparse.c linux-2.4.22-evms/drivers/md/dm-sparse.c
1549 --- linux-2.4.22/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1550 +++ linux-2.4.22-evms/drivers/md/dm-sparse.c 2003-09-15 17:09:48.000000000 +0200
1552 +/* -*- linux-c -*- */
1555 + * Copyright (c) International Business Machines Corp., 2002
1557 + * This program is free software; you can redistribute it and/or modify
1558 + * it under the terms of the GNU General Public License as published by
1559 + * the Free Software Foundation; either version 2 of the License, or
1560 + * (at your option) any later version.
1562 + * This program is distributed in the hope that it will be useful,
1563 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1564 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1565 + * the GNU General Public License for more details.
1567 + * You should have received a copy of the GNU General Public License
1568 + * along with this program; if not, write to the Free Software
1569 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1571 + * linux/drivers/md/dm-sparse.c
1573 + * Sparse target for device-mapper.
1575 + * This target provides the ability to create a sparse device. This
1576 + * allows a device to pretend to be larger than it really is.
1579 +#include <linux/module.h>
1580 +#include <linux/init.h>
1581 +#include <linux/blkdev.h>
1582 +#include <linux/slab.h>
1583 +#include <linux/mempool.h>
1584 +#include <linux/vmalloc.h>
1589 +#define MAX_HASH_CHAIN_ENTRIES 10
1590 +#define NAME_SIZE 127
1599 +// Entries in the sparse remapping structure
1600 +struct sparse_hash_entry {
1601 + u64 org_chunk; // Chunk number, not LBA.
1602 + u64 sparse_chunk; // Chunk number, not LBA.
1603 + struct sparse_hash_entry * next;
1604 + struct sparse_hash_entry * prev;
1607 +//Private data structure
1608 +struct sparse_volume {
1609 + struct dm_dev *dev;
1610 + struct rw_semaphore sparse_semaphore;
1611 + struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1612 + struct sparse_hash_entry * free_hash_list;
1613 + kmem_cache_t * hash_slab;
1614 + mempool_t * hash_pool;
1616 + u32 chunk_size; // Sectors.
1617 + u32 chunk_shift; // Shift value for chunk size.
1618 + u32 num_chunks; // In this volume.
1619 + u32 next_cow_entry; // Index into current COW table.
1620 + u64 current_cow_sector; // LOGICAL sector of current COW table.
1621 + u32 next_free_chunk; // Index of next free chunk (not LBA!).
1622 + u32 hash_table_size; // Size of the hash table for the remap.
1624 + u64 cow_table[64]; // One sector's worth of COW tables.
1627 +/*************************** OLD SERVICES ****************************/
1629 +/* computes log base 2 of value */
1630 +inline int log2(u32 value) //ok to change to u32?
1633 + long tmp; //ok to change to long?
1638 + while (!(tmp & 1)) {
1649 +/********************************* Functions *********************************/
1651 +/***************************** Hash Functions *****************************/
1653 +/* Take and initialize from the free hash list */
1654 +static struct sparse_hash_entry *
1655 +allocate_sparse_hash_entry( struct sparse_volume * volume,
1657 + u64 sparse_chunk )
1659 + struct sparse_hash_entry * hash_entry;
1661 + hash_entry = volume->free_hash_list;
1662 + if ( hash_entry ) { //should always be the case b/c preallocate these
1663 + volume->free_hash_list = hash_entry->next;
1664 + hash_entry->org_chunk = org_chunk;
1665 + hash_entry->sparse_chunk = sparse_chunk;
1666 + hash_entry->next = NULL;
1667 + hash_entry->prev = NULL;
1670 + return hash_entry;
1674 + * This function inserts a new entry into a sparse hash chain, immediately
1675 + * following the specified entry. This function should not be used to add
1676 + * an entry into an empty list, or as the first entry in an existing list.
1677 + * For that case, use insert_sparse_map_entry_at_head().
1679 +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1680 + struct sparse_hash_entry * base )
1682 + entry->next = base->next;
1683 + entry->prev = base;
1684 + base->next = entry;
1685 + if ( entry->next ) {
1686 + entry->next->prev = entry;
1692 + * This function inserts a new entry into a sparse chain as the first
1693 + * entry in the chain.
1695 +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1696 + struct sparse_hash_entry ** head )
1698 + entry->next = *head;
1699 + entry->prev = NULL;
1701 + if ( entry->next ) {
1702 + entry->next->prev = entry;
1708 + * Delete all items in a single chain in the hash table.
1710 +static int delete_sparse_hash_chain( struct sparse_volume * vol,
1711 + struct sparse_hash_entry * head )
1713 + struct sparse_hash_entry * next;
1716 + next = head->next;
1717 + mempool_free( head, vol->hash_pool );
1724 + * This function will search the hash chain that is anchored at the
1725 + * specified head pointer. If the chunk number is found, a pointer to that
1726 + * entry in the chain is set, and a 1 is returned. If the chunk is not
1727 + * found, a pointer to the previous entry is set and 0 is returned. If the
1728 + * return pointer is NULL, this means either the list is empty, or the
1729 + * specified sector should become the first list item.
1731 +static int search_sparse_hash_chain( u64 chunk,
1732 + struct sparse_hash_entry * head,
1733 + struct sparse_hash_entry ** result )
1735 + struct sparse_hash_entry * curr = head;
1736 + struct sparse_hash_entry * prev = head;
1737 + while ( curr && curr->org_chunk < chunk ) {
1739 + curr = curr->next;
1741 + if (!curr) { // Either an empty chain or went off the end of the chain.
1745 + else if ( curr->org_chunk != chunk ) {
1746 + *result = curr->prev;
1756 + * This function takes a cow table entry (from the on-disk data), and
1757 + * converts it into an appropriate entry for the sparse map, and
1758 + * inserts it into the appropriate map for the specified volume.
1760 +static int add_cow_entry_to_sparse_map( u64 org_chunk,
1762 + struct sparse_volume * volume )
1764 + struct sparse_hash_entry * new_entry;
1765 + struct sparse_hash_entry * target_entry;
1769 + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1774 + hash_value = (long)org_chunk % volume->hash_table_size;
1776 + if (! search_sparse_hash_chain( org_chunk,
1777 + volume->sparse_map[hash_value],
1778 + &target_entry ) ) {
1779 + //should always take this path
1781 + if ( target_entry ) {
1782 + insert_sparse_hash_entry( new_entry, target_entry );
1785 + insert_sparse_hash_entry_at_head
1786 + ( new_entry, &(volume->sparse_map[hash_value]) );
1794 + * Construct the initial hash table state based on
1795 + * existing COW tables on the disk.
1797 +static int build_sparse_maps(struct sparse_volume * volume)
1799 + int rc = 0, done = 0;
1800 + struct io_region job;
1801 + struct page * page;
1802 + unsigned int error, offset;
1806 + // Read in one sector's worth of COW tables.
1807 + job.dev = volume->dev->dev;
1808 + job.sector = volume->current_cow_sector;
1810 + page = virt_to_page(volume->cow_table);
1811 + offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1812 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
1817 + // Translate every valid COW table entry into
1818 + // a sparse map entry.
1819 + for ( volume->next_cow_entry = 0;
1821 + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1822 + volume->cow_table[volume->next_cow_entry] !=
1823 + 0xffffffffffffffff;
1825 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
1827 + if ( (rc = add_cow_entry_to_sparse_map
1828 + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1829 + volume->next_free_chunk, volume ))) {
1833 + // Move on to the next sector if necessary.
1834 + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1835 + volume->current_cow_sector++;
1844 +/************************* Other Functions ************************/
1847 + * Function: sparse_remap_chunk
1849 + * This function performs a sector remap on a sparse volume. This should
1850 + * be called from the I/O path, It first determines the base sector
1851 + * of the chunk containing the specified sector, and saves the remainder.
1852 + * Then it performs a search through the sparse map for the specified
1853 + * volume. If a match is found, the sector number is changed to the new
1854 + * value. If no match is found, the value is left the same, meaning the
1855 + * chunk has not been remapped.
1857 +static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1860 + struct sparse_hash_entry * result;
1866 + down_read(&sparse_volume->sparse_semaphore);
1868 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1869 + chunk = *sector >> sparse_volume->chunk_shift;
1870 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1872 + if ( search_sparse_hash_chain( chunk,
1873 + sparse_volume->sparse_map[hash_value],
1875 + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1879 + up_read(&sparse_volume->sparse_semaphore);
1883 +/* Function: sparse_cow_write
1885 + * Check this sparse node to see if the given sector/chunk has been
1886 + * remapped yet. If it hasn't, create a new hash table entry, update the
1887 + * in-memory COW table, write the COW table to disk.
1890 +static int sparse_cow_write( struct sparse_volume * sparse_volume,
1893 + struct sparse_hash_entry * target_entry, * new_map_entry;
1894 + struct io_region job;
1895 + struct page * page;
1896 + char * cow = NULL;
1897 + unsigned int error, offset;
1899 + u32 hash_value = 0;
1903 + down_write(&sparse_volume->sparse_semaphore);
1905 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1906 + chunk = *sector >> sparse_volume->chunk_shift;
1907 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1909 + if ( search_sparse_hash_chain( chunk,
1910 + sparse_volume->sparse_map[hash_value],
1911 + &target_entry) ) {
1913 + ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1919 + // Is there enough room left on this sparse to remap this chunk?
1920 + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1921 + DMERR("dm-sparse: full no new remaps allowed\n");
1926 + // Create and initialize a new hash table entry for the new remap.
1927 + new_map_entry = allocate_sparse_hash_entry
1928 + (sparse_volume, chunk, sparse_volume->next_free_chunk);
1929 + if ( ! new_map_entry ) {
1930 + // Can't get memory for map entry. Disable this sparse.
1931 + DMERR("dm-sparse: memory error allocating hash entry\n");
1936 + // Always write the COW table so it's safe
1937 + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1939 + // Can't get I/O buffer. Disable this sparse.
1940 + DMERR("dm-sparse: memory error allocating COW table buffer");
1945 + // Add the entry to the hash table.
1946 + if ( target_entry ) {
1947 + insert_sparse_hash_entry( new_map_entry, target_entry );
1950 + insert_sparse_hash_entry_at_head
1952 + &(sparse_volume->sparse_map[hash_value]) );
1955 + sparse_volume->next_free_chunk++;
1957 + // Update the appropriate entry in the COW table.
1958 + sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1959 + cpu_to_le64(chunk);
1960 + sparse_volume->next_cow_entry++;
1962 + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
1964 + //because of ordering issues needs to be synchronous
1965 + job.dev = sparse_volume->dev->dev;
1966 + job.sector = sparse_volume->current_cow_sector;
1968 + page = virt_to_page(cow);
1969 + offset = (unsigned long)cow & ~PAGE_MASK;
1970 + dm_io_sync(1, &job, WRITE, page, offset, &error);
1972 + // Update the in-memory COW table values.
1973 + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
1975 + sparse_volume->next_cow_entry = 0;
1976 + sparse_volume->current_cow_sector++;
1977 + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
1980 + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
1986 + up_write(&sparse_volume->sparse_semaphore);
1994 +/************************ EXPORT FUNCTIONS ************************/
1997 + * Function: sparse_dtr
1999 +static void sparse_dtr( struct dm_target *ti )
2001 + struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2006 + if (vol->sparse_map) {
2007 + for ( i = 0; i < vol->hash_table_size; i++ ) {
2008 + delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2010 + delete_sparse_hash_chain( vol, vol->free_hash_list );
2011 + vfree(vol->sparse_map);
2014 + if (vol->hash_pool)
2015 + mempool_destroy(vol->hash_pool);
2017 + if (vol->hash_slab)
2018 + kmem_cache_destroy(vol->hash_slab);
2020 + dm_put_device(ti, vol->dev);
2022 + if (vol->dm_io_flag) {
2031 + * Function: sparse_ctr
2033 +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2035 + int i, rc = -EINVAL;
2036 + struct sparse_hash_entry *new_entry;
2037 + struct sparse_volume *vol;
2038 + struct dm_dev *dev;
2039 + u32 chunk_size, chunks;
2041 + char* end, slab_name[NAME_SIZE+1];
2043 + if ( argc != 4 ) {
2044 + ti->error="dm-sparse: wrong number of arguments";
2048 + start = simple_strtoull(argv[1], &end, 10);
2050 + ti->error="dm-sparse: Invalid first chunk lba";
2054 + chunk_size = simple_strtoul(argv[2], &end, 10);
2056 + ti->error="dm-sparse: Invalid chunk_size";
2060 + chunks = simple_strtoul(argv[3], &end, 10);
2062 + ti->error="dm-sparse: Invalid number of chunks";
2066 + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2067 + dm_table_get_mode(ti->table), &dev ) ) {
2068 + ti->error = "dm-sparse: Device lookup failed";
2072 + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2074 + ti->error = "dm-sparse: Memory allocation for private-data failed";
2079 + memset( vol, 0, sizeof(struct sparse_volume) );
2081 + rc = dm_io_get(1);
2083 + ti->error = "dm-sparse: failed to initialize dm-io.";
2089 + vol->dm_io_flag = 1;
2090 + vol->chunk_size = chunk_size;
2091 + vol->chunk_shift = log2(chunk_size);
2092 + vol->num_chunks = chunks;
2093 + vol->current_cow_sector = 1;
2094 + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2095 + vol->start = start;
2097 + init_rwsem(&vol->sparse_semaphore);
2099 + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2100 + vol->hash_slab = kmem_cache_create(slab_name,
2101 + sizeof(struct sparse_hash_entry),
2102 + 0, SLAB_HWCACHE_ALIGN,
2104 + if ( ! vol->hash_slab ) {
2105 + ti->error = "dm-sparse: memory allocation error in hash slab create";
2109 + vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2110 + mempool_free_slab,
2112 + if ( ! vol->hash_pool ) {
2113 + ti->error = "dm-sparse: memory allocation error in hash pool create";
2118 + // Sparse hash table
2119 + vol->sparse_map = vmalloc( vol->hash_table_size *
2120 + sizeof( struct sparse_hash_entry * ) );
2121 + if ( ! vol->sparse_map ) {
2122 + ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2127 + memset( vol->sparse_map, 0, vol->hash_table_size *
2128 + sizeof( struct sparse_hash_entry * ) );
2130 + for ( i = 0; i < chunks; i++ ) {
2132 + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2133 + if ( ! new_entry ) {
2134 + ti->error="dm-sparse: memory allocation error in hash table setup";
2139 + new_entry->next = vol->free_hash_list;
2140 + vol->free_hash_list = new_entry;
2143 + rc = build_sparse_maps(vol);
2145 + ti->error = "dm-sparse: error building hash tables";
2150 + ti->private = vol;
2154 + dm_put_device(ti, dev);
2159 + * Function: sparse_map
2161 +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2162 + union map_info *map_context )
2164 + struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2165 + u64 sector = bh->b_rsector;
2170 + // Check if this sector has been remapped
2171 + rc = sparse_remap_chunk( volume, §or );
2173 + if ( rc < 0 ) { //Error
2174 + bh->b_end_io(bh, 0);
2178 + if ( rc == 0 ) { // Remapped I/O : read or write same logic
2179 + bh->b_rsector = volume->start + sector;
2180 + bh->b_rdev = volume->dev->dev;
2184 + // (Previously) un-mapped: read/write different logic
2186 + if ( rw ) { //write :
2187 + rc = sparse_cow_write( volume, §or );
2189 + if ( rc < 0 ) { //Error
2190 + bh->b_end_io(bh, 0);
2194 + bh->b_rsector = volume->start + sector;
2195 + bh->b_rdev = volume->dev->dev;
2199 + //Reading something that was never written
2200 + //return zeros and indicate complete
2201 + memset(bh->b_data, 0x0, bh->b_size);
2202 + bh->b_end_io(bh, 1);
2206 +static int sparse_status( struct dm_target *ti, status_type_t type,
2207 + char *result, unsigned int maxlen )
2209 + struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2213 + case STATUSTYPE_INFO:
2214 + snprintf( result, maxlen, "%d%%",
2215 + ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2218 + case STATUSTYPE_TABLE:
2219 + snprintf( result, maxlen, "%s %Lu %u %u",
2220 + dm_kdevname(vol->dev->dev), vol->start,
2221 + vol->chunk_size, vol->num_chunks );
2231 +/****************** FUNCTION TABLE **********************/
2233 +static struct target_type sparse_target = {
2235 + .module = THIS_MODULE,
2236 + .ctr = sparse_ctr,
2237 + .dtr = sparse_dtr,
2238 + .map = sparse_map,
2239 + .status = sparse_status,
2242 +/********************* REGISTRATION *****************/
2244 +int __init sparse_init(void)
2246 + int rc = dm_register_target(&sparse_target);
2249 + DMWARN("sparse target registration failed");
2254 +void __exit sparse_exit(void)
2256 + if (dm_unregister_target(&sparse_target) )
2257 + DMWARN("sparse target unregistration failed");
2262 +module_init(sparse_init);
2263 +module_exit(sparse_exit);
2264 +MODULE_LICENSE("GPL");
2265 diff -urN linux-2.4.22/drivers/md/multipath.c linux-2.4.22-evms/drivers/md/multipath.c
2266 --- linux-2.4.22/drivers/md/multipath.c 2003-06-13 16:51:34.000000000 +0200
2267 +++ linux-2.4.22-evms/drivers/md/multipath.c 2003-09-15 17:09:36.000000000 +0200
2268 @@ -139,15 +139,16 @@
2269 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2271 multipath_conf_t *conf = mddev_to_conf(mddev);
2272 - int i, disks = MD_SB_DISKS;
2276 * Later we do read balancing on the read side
2277 * now we use the first available disk.
2280 - for (i = 0; i < disks; i++) {
2281 + for (i = 0; i < conf->nr_disks; i++) {
2282 if (conf->multipaths[i].operational) {
2283 + /* first operational is winner! */
2284 *rdev = conf->multipaths[i].dev;
2289 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2291 + atomic_dec(&mp_bh->multipath->nr_pending);
2294 * this branch is our 'one multipath IO has finished' event handler:
2296 @@ -223,19 +226,39 @@
2300 - * This routine returns the disk from which the requested read should
2302 + * Multipath read balance ...
2306 + * If no active paths
2310 + * If active paths == 1
2312 + * - 1st active path encountered
2314 + * If active paths > 1
2316 + * - 1st idle active path encountered
2317 + * - else ... the active path doing the least amount of work.
2320 static int multipath_read_balance (multipath_conf_t *conf)
2324 - for (disk = 0; disk < conf->raid_disks; disk++)
2325 - if (conf->multipaths[disk].operational)
2329 + int i, disk=-1, nr_pending, least_pending=0;
2331 + for (i=0; i<conf->nr_disks; i++) {
2332 + if (conf->multipaths[i].operational) {
2333 + nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2334 + if (nr_pending==0 || conf->working_disks==1)
2336 + if (least_pending==0 || nr_pending<least_pending) {
2338 + least_pending = nr_pending;
2345 static int multipath_make_request (mddev_t *mddev, int rw,
2347 struct buffer_head *bh_req;
2348 struct multipath_bh * mp_bh;
2349 struct multipath_info *multipath;
2352 if (!buffer_locked(bh))
2354 @@ -267,7 +291,16 @@
2356 * read balancing logic:
2358 - multipath = conf->multipaths + multipath_read_balance(conf);
2359 + disk = multipath_read_balance(conf);
2361 + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2362 + buffer_IO_error(bh);
2366 + multipath = conf->multipaths + disk;
2367 + mp_bh->multipath = multipath;
2368 + atomic_inc(&multipath->nr_pending);
2370 bh_req = &mp_bh->bh_req;
2371 memcpy(bh_req, bh, sizeof(*bh));
2372 @@ -331,13 +364,14 @@
2374 multipath_conf_t *conf = mddev_to_conf(mddev);
2375 struct multipath_info * multipaths = conf->multipaths;
2376 - int disks = MD_SB_DISKS;
2377 int other_paths = 1;
2381 + struct md_list_head *tmp;
2383 if (conf->working_disks == 1) {
2385 - for (i = 0; i < disks; i++) {
2386 + for (i = 0; i < MD_SB_DISKS; i++) {
2387 if (multipaths[i].spare) {
2390 @@ -351,16 +385,17 @@
2391 * first check if this is a queued request for a device
2392 * which has just failed.
2394 - for (i = 0; i < disks; i++) {
2395 + for (i = 0; i < MD_SB_DISKS; i++) {
2396 if (multipaths[i].dev==dev && !multipaths[i].operational)
2401 + mdp_super_t *sb = mddev->sb;
2403 * Mark disk as unusable
2405 - for (i = 0; i < disks; i++) {
2406 + for (i = 0; i < MD_SB_DISKS; i++) {
2407 if (multipaths[i].dev==dev && multipaths[i].operational) {
2408 mark_disk_bad(mddev, i);
2411 if (!conf->working_disks) {
2414 - mdp_super_t *sb = mddev->sb;
2416 spare = get_spare(mddev);
2418 @@ -384,6 +418,21 @@
2422 + /* prevent unnecessary work in md_do_recovery() */
2423 + if (conf->working_disks) {
2424 + conf->raid_disks = conf->working_disks
2425 + = sb->raid_disks = sb->active_disks;
2427 + /* update alias disk info to ensure we can do sb commit. */
2428 + ITERATE_RDEV(mddev,rdev,tmp) {
2429 + if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2430 + rdev->alias_device = 0;
2433 + if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2434 + rdev->alias_device = 1;
2442 * This is a kernel thread which:
2444 - * 1. Retries failed read operations on working multipaths.
2445 + * 1. Retries failed operations on working multipaths.
2446 * 2. Updates the raid superblock when problems encounter.
2447 - * 3. Performs writes following reads for array syncronising.
2450 static void multipathd (void *data)
2452 mdk_rdev_t *rdev, *def_rdev = NULL;
2453 struct md_list_head *tmp;
2455 + int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2460 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2463 - * Mark all disks as spare to start with, then pick our
2464 - * active disk. If we have a disk that is marked active
2465 - * in the sb, then use it, else use the first rdev.
2466 + * Mark all disks as spare to start with.
2468 disk->number = desc->number;
2469 disk->raid_disk = desc->raid_disk;
2470 @@ -894,20 +941,21 @@
2471 mark_disk_sync(desc);
2473 if (disk_active(desc)) {
2474 - if(!conf->working_disks) {
2475 - printk(OPERATIONAL, partition_name(rdev->dev),
2477 - disk->operational = 1;
2479 - conf->working_disks++;
2482 - mark_disk_spare(desc);
2485 - mark_disk_spare(desc);
2486 + printk(OPERATIONAL, partition_name(rdev->dev),
2488 + disk->operational = 1;
2490 + conf->working_disks++;
2493 + } else if (disk_faulty(desc)) {
2500 - if(!num_rdevs++) def_rdev = rdev;
2503 if(!conf->working_disks && num_rdevs) {
2504 desc = &sb->disks[def_rdev->desc_nr];
2505 @@ -918,11 +966,12 @@
2507 conf->working_disks++;
2508 mark_disk_active(desc);
2512 - * Make sure our active path is in desc spot 0
2513 + * If there is only 1 active path ... make sure it is in desc spot 0
2515 - if(def_rdev->desc_nr != 0) {
2516 + if (active_disks == 1 && def_rdev->desc_nr != 0) {
2517 rdev = find_rdev_nr(mddev, 0);
2518 desc = &sb->disks[def_rdev->desc_nr];
2520 @@ -940,10 +989,10 @@
2521 def_rdev->desc_nr = 0;
2524 - conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2525 + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2526 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2527 - sb->failed_disks = 0;
2528 - sb->spare_disks = num_rdevs - 1;
2529 + sb->failed_disks = faulty_disks;
2530 + sb->spare_disks = spare_disks;
2531 mddev->sb_dirty = 1;
2532 conf->mddev = mddev;
2533 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2534 diff -urN linux-2.4.22/include/linux/raid/multipath.h linux-2.4.22-evms/include/linux/raid/multipath.h
2535 --- linux-2.4.22/include/linux/raid/multipath.h 2001-11-12 18:51:56.000000000 +0100
2536 +++ linux-2.4.22-evms/include/linux/raid/multipath.h 2003-09-15 17:09:36.000000000 +0200
2541 + atomic_t nr_pending; /* number of pending requests */
2544 struct multipath_private_data {
2546 struct buffer_head *master_bh;
2547 struct buffer_head bh_req;
2548 struct multipath_bh *next_mp; /* next for retry or in free list */
2549 + struct multipath_info *multipath; /* allows end_request to easily dec pending buffer count */
2551 /* bits for multipath_bh.state */
2552 #define MPBH_Uptodate 1