diff -urN linux-2.4.22/drivers/md/Config.in linux-2.4.22-evms/drivers/md/Config.in --- linux-2.4.22/drivers/md/Config.in 2003-09-15 17:07:45.000000000 +0200 +++ linux-2.4.22-evms/drivers/md/Config.in 2003-09-15 17:09:48.000000000 +0200 @@ -16,5 +16,9 @@ dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then + dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM + dep_tristate ' Sparse Device Target' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM +fi endmenu diff -urN linux-2.4.22/drivers/md/Makefile linux-2.4.22-evms/drivers/md/Makefile --- linux-2.4.22/drivers/md/Makefile 2003-09-15 17:07:45.000000000 +0200 +++ linux-2.4.22-evms/drivers/md/Makefile 2003-09-15 17:09:48.000000000 +0200 @@ -30,6 +30,8 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o include $(TOPDIR)/Rules.make diff -urN linux-2.4.22/drivers/md/dm-bbr.c linux-2.4.22-evms/drivers/md/dm-bbr.c --- linux-2.4.22/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.22-evms/drivers/md/dm-bbr.c 2003-09-15 17:08:42.000000000 +0200 @@ -0,0 +1,1228 @@ +/* + * Copyright (c) International Business Machines Corp., 2002-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/drivers/md/dm-bbr.c + * + * Bad-block-relocation (BBR) target for device-mapper. + * + * The BBR target is designed to remap I/O write failures to another safe + * location on disk. Note that most disk drives have BBR built into them, + * this means that our software BBR will be only activated when all hardware + * BBR replacement sectors have been used. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "dm.h" +#include "dm-bbr.h" +#include "dm-daemon.h" +#include "dm-io.h" + +/* Number of active BBR devices. */ +static int bbr_instances = 0; +static DECLARE_MUTEX(bbr_instances_lock); + +/* Data pertaining to the I/O thread. */ +static struct dm_daemon * bbr_io_thread = NULL; +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(bbr_io_list); +static void bbr_io_handler(void); + +/* Global pools for bbr_io_buf's and bbr_remap's. */ +static kmem_cache_t * bbr_io_buf_cache; +static mempool_t * bbr_io_buf_pool; +static kmem_cache_t * bbr_remap_cache; +static mempool_t * bbr_remap_pool; + +static void bbr_free_remap(struct bbr_private * bbr_id); + +/** + * destroy_pools + * + * Delete the pools for the remap list and I/O anchors. 
+ **/ +static void destroy_pools(void) +{ + if (bbr_io_buf_pool) { + mempool_destroy(bbr_io_buf_pool); + bbr_io_buf_pool = NULL; + } + if (bbr_io_buf_cache) { + kmem_cache_destroy(bbr_io_buf_cache); + bbr_io_buf_cache = NULL; + } + if (bbr_remap_pool) { + mempool_destroy(bbr_remap_pool); + bbr_remap_pool = NULL; + } + if (bbr_remap_cache) { + kmem_cache_destroy(bbr_remap_cache); + bbr_remap_cache = NULL; + } +} + +/** + * create_pools + * + * Create mempools for the remap list and I/O anchors. + **/ +static int create_pools(void) +{ + if (!bbr_remap_cache) { + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache", + sizeof(struct bbr_runtime_remap), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!bbr_remap_cache) { + DMERR("Unable to create BBR remap cache."); + goto out; + } + } + if (!bbr_remap_pool) { + bbr_remap_pool = mempool_create(64, mempool_alloc_slab, + mempool_free_slab, + bbr_remap_cache); + if (!bbr_remap_pool) { + DMERR("Unable to create BBR remap mempool."); + goto out; + } + } + + if (!bbr_io_buf_cache) { + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache", + sizeof(struct bbr_io_buffer), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!bbr_io_buf_cache) { + DMERR("Unable to create BBR I/O buffer cache."); + goto out; + } + } + if (!bbr_io_buf_pool) { + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab, + mempool_free_slab, + bbr_io_buf_cache); + if (!bbr_io_buf_pool) { + DMERR("Unable to create BBR I/O buffer mempool."); + goto out; + } + } + +out: + if (!bbr_remap_cache || !bbr_remap_pool || + !bbr_io_buf_cache || !bbr_io_buf_pool ) { + destroy_pools(); + return -ENOMEM; + } + + return 0; +} + +/** + * stop_io_thread + * + * Use the dm-daemon services to stop the BBR I/O thread. + **/ +static void stop_io_thread(void) +{ + if (bbr_io_thread) { + dm_daemon_stop(bbr_io_thread); + kfree(bbr_io_thread); + bbr_io_thread = NULL; + } +} + +/** + * start_io_thread + * + * Use the dm-daemon services to start the BBR I/O thread. + **/ +static int start_io_thread(void) +{ + int rc; + + if (!bbr_io_thread) { + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL); + if (!bbr_io_thread) { + return -ENOMEM; + } + + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler); + if (rc) { + kfree(bbr_io_thread); + return rc; + } + } + + return 0; +} + +/** + * bbr_global_init + * + * Set up the mempools, I/O thread, and sync-I/O service. This should + * be called only when the first bbr device is created. + **/ +static int bbr_global_init(void) +{ + int rc; + + rc = create_pools(); + if (rc) { + goto out; + } + + rc = start_io_thread(); + if (rc) { + destroy_pools(); + goto out; + } + + rc = dm_io_get(1); + if (rc) { + destroy_pools(); + stop_io_thread(); + goto out; + } + +out: + return rc; +} + +/** + * bbr_global_cleanup + * + * Clean up the mempools, I/O thread and sync-I/O service. This should + * be called only when the last bbr device is removed.
+ **/ +static void bbr_global_cleanup(void) +{ + destroy_pools(); + stop_io_thread(); + dm_io_put(1); +} + +static struct bbr_private * bbr_alloc_private(void) +{ + struct bbr_private * bbr_id; + + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL); + if (bbr_id) { + memset(bbr_id, 0, sizeof(*bbr_id)); + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0); + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED; + } + + return bbr_id; +} + +static void bbr_free_private(struct bbr_private * bbr_id) +{ + if (bbr_id->bbr_table) { + kfree(bbr_id->bbr_table); + } + bbr_free_remap(bbr_id); + kfree(bbr_id); +} + +static u32 crc_table[256]; +static u32 crc_table_built = 0; + +static void build_crc_table(void) +{ + u32 i, j, crc; + + for (i = 0; i <= 255; i++) { + crc = i; + for (j = 8; j > 0; j--) { + if (crc & 1) + crc = (crc >> 1) ^ CRC_POLYNOMIAL; + else + crc >>= 1; + } + crc_table[i] = crc; + } + crc_table_built = 1; +} + +static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize) +{ + unsigned char * current_byte; + u32 temp1, temp2, i; + + current_byte = (unsigned char *) buffer; + /* Make sure the crc table is available */ + if (!crc_table_built) + build_crc_table(); + /* Process each byte in the buffer. */ + for (i = 0; i < buffersize; i++) { + temp1 = (crc >> 8) & 0x00FFFFFF; + temp2 = crc_table[(crc ^ (u32) * current_byte) & + (u32) 0xff]; + current_byte++; + crc = temp1 ^ temp2; + } + return crc; +} + +/** + * le_bbr_table_sector_to_cpu + * + * Convert bbr meta data from on-disk (LE) format + * to the native cpu endian format. + **/ +static void le_bbr_table_sector_to_cpu(struct bbr_table * p) +{ + int i; + p->signature = le32_to_cpup(&p->signature); + p->crc = le32_to_cpup(&p->crc); + p->sequence_number = le32_to_cpup(&p->sequence_number); + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt); + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { + p->entries[i].bad_sect = + le64_to_cpup(&p->entries[i].bad_sect); + p->entries[i].replacement_sect = + le64_to_cpup(&p->entries[i].replacement_sect); + } +} + +/** + * cpu_bbr_table_sector_to_le + * + * Convert bbr meta data from cpu endian format to on-disk (LE) format + **/ +static void cpu_bbr_table_sector_to_le(struct bbr_table * p, + struct bbr_table * le) +{ + int i; + le->signature = cpu_to_le32p(&p->signature); + le->crc = cpu_to_le32p(&p->crc); + le->sequence_number = cpu_to_le32p(&p->sequence_number); + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt); + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) { + le->entries[i].bad_sect = + cpu_to_le64p(&p->entries[i].bad_sect); + le->entries[i].replacement_sect = + cpu_to_le64p(&p->entries[i].replacement_sect); + } +} + +/** + * validate_bbr_table_sector + * + * Check the specified BBR table sector for a valid signature and CRC. If it's + * valid, endian-convert the table sector. + **/ +static int validate_bbr_table_sector(struct bbr_table * p) +{ + int rc = 0; + int org_crc, final_crc; + + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) { + DMERR("BBR table signature doesn't match!"); + DMERR("Found 0x%x. Expecting 0x%x", + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE); + rc = -EINVAL; + goto out; + } + + if (!p->crc) { + DMERR("BBR table sector has no CRC!"); + rc = -EINVAL; + goto out; + } + + org_crc = le32_to_cpup(&p->crc); + p->crc = 0; + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p)); + if (final_crc != org_crc) { + DMERR("CRC failed!"); + DMERR("Found 0x%x. 
Expecting 0x%x", + org_crc, final_crc); + rc = -EINVAL; + goto out; + } + + p->crc = cpu_to_le32p(&org_crc); + le_bbr_table_sector_to_cpu(p); + +out: + return rc; +} + +/** + * bbr_binary_tree_insert + * + * Insert a node into the binary tree. + **/ +static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root, + struct bbr_runtime_remap * newnode) +{ + struct bbr_runtime_remap ** node = root; + while (node && *node) { + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) { + node = &((*node)->right); + } else { + node = &((*node)->left); + } + } + + newnode->left = newnode->right = NULL; + *node = newnode; +} + +/** + * bbr_binary_search + * + * Search for a node that contains bad_sect == lsn. + **/ +static struct bbr_runtime_remap * bbr_binary_search( + struct bbr_runtime_remap * root, + u64 lsn) +{ + struct bbr_runtime_remap * node = root; + while (node) { + if (node->remap.bad_sect == lsn) { + break; + } + if (lsn > node->remap.bad_sect) { + node = node->right; + } else { + node = node->left; + } + } + return node; +} + +/** + * bbr_binary_tree_destroy + * + * Destroy the binary tree. + **/ +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root, + struct bbr_private * bbr_id) +{ + struct bbr_runtime_remap ** link = NULL; + struct bbr_runtime_remap * node = root; + + while (node) { + if (node->left) { + link = &(node->left); + node = node->left; + continue; + } + if (node->right) { + link = &(node->right); + node = node->right; + continue; + } + + mempool_free(node, bbr_remap_pool); + if (node == root) { + /* If root is deleted, we're done. */ + break; + } + + /* Back to root. */ + node = root; + *link = NULL; + } +} + +static void bbr_free_remap(struct bbr_private * bbr_id) +{ + spin_lock_irq(&bbr_id->bbr_id_lock); + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id); + bbr_id->remap_root = NULL; + spin_unlock_irq(&bbr_id->bbr_id_lock); +} + +/** + * bbr_insert_remap_entry + * + * Create a new remap entry and add it to the binary tree for this node. + **/ +static int bbr_insert_remap_entry(struct bbr_private * bbr_id, + struct bbr_table_entry * new_bbr_entry) +{ + struct bbr_runtime_remap * newnode; + + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO); + if (!newnode) { + DMERR("Could not allocate from remap mempool!"); + return -ENOMEM; + } + newnode->remap.bad_sect = new_bbr_entry->bad_sect; + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect; + spin_lock_irq(&bbr_id->bbr_id_lock); + bbr_binary_tree_insert(&bbr_id->remap_root, newnode); + spin_unlock_irq(&bbr_id->bbr_id_lock); + return 0; +} + +/** + * bbr_table_to_remap_list + * + * The on-disk bbr table is sorted by the replacement sector LBA. In order to + * improve run time performance, the in memory remap list must be sorted by + * the bad sector LBA. This function is called at discovery time to initialize + * the remap list. This function assumes that at least one copy of meta data + * is valid. 
+ **/ +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id) +{ + u32 in_use_blks = 0; + int i, j; + struct bbr_table * p; + + + for (i = 0, p = bbr_id->bbr_table; + i < bbr_id->nr_sects_bbr_table; + i++, p++ ) { + if (!p->in_use_cnt) { + break; + } + in_use_blks += p->in_use_cnt; + for (j = 0; j < p->in_use_cnt; j++) { + bbr_insert_remap_entry(bbr_id, &p->entries[j]); + } + } + if (in_use_blks) + DMWARN("There are %u BBR entries for device %u:%u", + in_use_blks, MAJOR(bbr_id->dev->dev), + MINOR(bbr_id->dev->dev)); + + return in_use_blks; +} + +/** + * bbr_search_remap_entry + * + * Search remap entry for the specified sector. If found, return a pointer to + * the table entry. Otherwise, return NULL. + **/ +static struct bbr_table_entry * bbr_search_remap_entry( + struct bbr_private * bbr_id, + u64 lsn) +{ + struct bbr_runtime_remap * p; + + spin_lock_irq(&bbr_id->bbr_id_lock); + p = bbr_binary_search(bbr_id->remap_root, lsn); + spin_unlock_irq(&bbr_id->bbr_id_lock); + if (p) { + return (&p->remap); + } else { + return NULL; + } +} + +/** + * bbr_remap + * + * If *lsn is in the remap table, return TRUE and modify *lsn, + * else, return FALSE. + **/ +static inline int bbr_remap(struct bbr_private * bbr_id, + u64 * lsn) +{ + struct bbr_table_entry * e; + + if (atomic_read(&bbr_id->in_use_replacement_blks)) { + e = bbr_search_remap_entry(bbr_id, *lsn); + if (e) { + *lsn = e->replacement_sect; + return 1; + } + } + return 0; +} + +/** + * bbr_remap_probe + * + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap + * table return TRUE, Else, return FALSE. + **/ +static inline int bbr_remap_probe(struct bbr_private * bbr_id, + u64 lsn, u64 nr_sects) +{ + u64 tmp, cnt; + + if (atomic_read(&bbr_id->in_use_replacement_blks)) { + for (cnt = 0, tmp = lsn; + cnt < nr_sects; + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) { + if (bbr_remap(bbr_id,&tmp)) { + return 1; + } + } + } + return 0; +} + +/** + * bbr_setup + * + * Read the remap tables from disk and set up the initial remap tree. + **/ +static int bbr_setup(struct bbr_private * bbr_id) +{ + struct bbr_table * table = bbr_id->bbr_table; + struct page * page; + struct io_region job; + unsigned int error, offset; + int i, rc = 0; + + job.dev = bbr_id->dev->dev; + job.count = 1; + + /* Read and verify each BBR table sector individually. 
*/ + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) { + job.sector = bbr_id->lba_table1 + i; + page = virt_to_page(table); + offset = (unsigned long)table & ~PAGE_MASK; + rc = dm_io_sync(1, &job, READ, page, offset, &error); + if (rc && bbr_id->lba_table2) { + job.sector = bbr_id->lba_table2 + i; + rc = dm_io_sync(1, &job, READ, page, offset, &error); + } + if (rc) { + goto out; + } + + rc = validate_bbr_table_sector(table); + if (rc) { + goto out; + } + } + atomic_set(&bbr_id->in_use_replacement_blks, + bbr_table_to_remap_list(bbr_id)); + +out: + if (rc) { + DMERR("dm-bbr: error during device setup: %d", rc); + } + return rc; +} + +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id, + struct buffer_head * bh, + int rw) +{ + struct bbr_io_buffer * bbr_io_buf; + + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO); + if (bbr_io_buf) { + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer)); + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list); + bbr_io_buf->bbr_id = bbr_id; + bbr_io_buf->sector = bh->b_rsector; + bbr_io_buf->bh = bh; + bbr_io_buf->rw = rw; + } else { + DMWARN("Could not allocate from BBR I/O buffer pool!"); + } + return bbr_io_buf; +} + +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf) +{ + mempool_free(bbr_io_buf, bbr_io_buf_pool); +} + +/** + * bbr_io_remap_error + * @bbr_id: Private data for the BBR node. + * @rw: READ or WRITE. + * @starting_lsn: Starting sector of request to remap. + * @count: Number of sectors in the request. + * @buffer: Data buffer for the request. + * + * For the requested range, try to write each sector individually. For each + * sector that fails, find the next available remap location and write the + * data to that new location. Then update the table and write both copies + * of the table to disk. Finally, update the in-memory mapping and do any + * other necessary bookkeeping. + **/ +static int bbr_io_remap_error(struct bbr_private * bbr_id, + int rw, + u64 starting_lsn, + u64 count, + char * buffer) +{ + struct bbr_table * bbr_table; + struct io_region job; + struct page * page; + unsigned long table_sector_index; + unsigned long table_sector_offset; + unsigned long index; + unsigned int offset_in_page, error; + u64 lsn, new_lsn; + int rc; + + if (rw == READ) { + /* Nothing can be done about read errors. */ + return -EIO; + } + + job.dev = bbr_id->dev->dev; + + /* For each sector in the request. */ + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) { + job.sector = starting_lsn + lsn; + job.count = 1; + page = virt_to_page(buffer); + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); + while (rc) { + /* Find the next available relocation sector. */ + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks); + if (new_lsn >= bbr_id->nr_replacement_blks) { + /* No more replacement sectors available. */ + return -EIO; + } + new_lsn += bbr_id->start_replacement_sect; + + /* Write the data to its new location. */ + DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64, + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), + starting_lsn + lsn, new_lsn); + job.sector = new_lsn; + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); + if (rc) { + /* This replacement sector is bad. + * Try the next one. + */ + DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. 
Skipping.", + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn); + atomic_inc(&bbr_id->in_use_replacement_blks); + continue; + } + + /* Add this new entry to the on-disk table. */ + table_sector_index = new_lsn - + bbr_id->start_replacement_sect; + table_sector_offset = table_sector_index / + BBR_ENTRIES_PER_SECT; + index = table_sector_index % BBR_ENTRIES_PER_SECT; + + bbr_table = &bbr_id->bbr_table[table_sector_offset]; + bbr_table->entries[index].bad_sect = starting_lsn + lsn; + bbr_table->entries[index].replacement_sect = new_lsn; + bbr_table->in_use_cnt++; + bbr_table->sequence_number++; + bbr_table->crc = 0; + bbr_table->crc = calculate_crc(INITIAL_CRC, + bbr_table, + sizeof(struct bbr_table)); + + /* Write the table to disk. */ + cpu_bbr_table_sector_to_le(bbr_table, bbr_table); + page = virt_to_page(bbr_table); + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK; + if (bbr_id->lba_table1) { + job.sector = bbr_id->lba_table1 + table_sector_offset; + job.count = 1; + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); + } + if (bbr_id->lba_table2) { + job.sector = bbr_id->lba_table2 + table_sector_offset; + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error); + } + le_bbr_table_sector_to_cpu(bbr_table); + + if (rc) { + /* Error writing one of the tables to disk. */ + DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.", + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev)); + return rc; + } + + /* Insert a new entry in the remapping binary-tree. */ + rc = bbr_insert_remap_entry(bbr_id, + &bbr_table->entries[index]); + if (rc) { + DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.", + MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev)); + return rc; + } + + atomic_inc(&bbr_id->in_use_replacement_blks); + } + } + + return 0; +} + +/** + * bbr_io_process_request + * + * For each sector in this request, check if the sector has already + * been remapped. If so, process all previous sectors in the request, + * followed by the remapped sector. Then reset the starting lsn and + * count, and keep going with the rest of the request as if it were + * a whole new request. If any of the sync_io's return an error, + * call the remapper to relocate the bad sector(s). + **/ +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf) +{ + struct bbr_private * bbr_id = bbr_io_buf->bbr_id; + struct io_region job; + u64 starting_lsn = bbr_io_buf->sector; + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT; + u64 lsn, remapped_lsn; + char * buffer = bbr_io_buf->bh->b_data; + struct page * page = virt_to_page(buffer); + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK; + unsigned int error; + int rw = bbr_io_buf->rw; + int rc = 0; + + job.dev = bbr_id->dev->dev; + + /* For each sector in this request, check if this sector has already + * been remapped. If so, process all previous sectors in this request, + * followed by the remapped sector. Then reset the starting lsn and + * count and keep going with the rest of the request as if it were + * a whole new request. + */ + for (lsn = 0; lsn < count; lsn++) { + remapped_lsn = starting_lsn + lsn; + rc = bbr_remap(bbr_id, &remapped_lsn); + if (!rc) { + /* This sector is fine. */ + continue; + } + + /* Process all sectors in the request up to this one. 
*/ + if (lsn > 0) { + job.sector = starting_lsn; + job.count = lsn; + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); + if (rc) { + /* If this I/O failed, then one of the sectors + * in this request needs to be relocated. + */ + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn, + lsn, buffer); + if (rc) { + return rc; + } + } + buffer += (lsn << SECTOR_SHIFT); + page = virt_to_page(buffer); + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; + } + + /* Process the remapped sector. */ + job.sector = remapped_lsn; + job.count = 1; + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); + if (rc) { + /* BUGBUG - Need more processing if this caused an + * an error. If this I/O failed, then the existing + * remap is now bad, and we need to find a new remap. + * Can't use bbr_io_remap_error(), because the existing + * map entry needs to be changed, not added again, and + * the original table entry also needs to be changed. + */ + return rc; + } + + buffer += SECTOR_SIZE; + starting_lsn += (lsn + 1); + count -= (lsn + 1); + lsn = -1; + page = virt_to_page(buffer); + offset_in_page = (unsigned long)buffer & ~PAGE_MASK; + } + + /* Check for any remaining sectors after the last split. This could + * potentially be the whole request, but that should be a rare case + * because requests should only be processed by the thread if we know + * an error occurred or they contained one or more remapped sectors. + */ + if (count) { + job.sector = starting_lsn; + job.count = count; + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error); + if (rc) { + /* If this I/O failed, then one of the sectors in this + * request needs to be relocated. + */ + rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn, + count, buffer); + if (rc) { + return rc; + } + } + } + + return 0; +} + +/** + * bbr_io_handler + * + * This is the handler for the bbr_io_thread. It continuously loops, + * taking I/O requests off its list and processing them. If nothing + * is on the list, the thread goes back to sleep until specifically + * woken up. + * + * I/O requests should only be sent to this thread if we know that: + * a) the request contains at least one remapped sector. + * or + * b) the request caused an error on the normal I/O path. + * This function uses synchronous I/O, so sending a request to this + * thread that doesn't need special processing will cause severe + * performance degredation. + **/ +static void bbr_io_handler(void) +{ + struct bbr_io_buffer * bbr_io_buf; + struct buffer_head * bh; + unsigned long flags; + int rc; + + while (1) { + /* Process bbr_io_list, one entry at a time. */ + spin_lock_irqsave(&bbr_io_list_lock, flags); + if (list_empty(&bbr_io_list)) { + /* No more items on the list. */ + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + break; + } + bbr_io_buf = list_entry(bbr_io_list.next, + struct bbr_io_buffer, bbr_io_list); + list_del_init(&bbr_io_buf->bbr_io_list); + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + + rc = bbr_io_process_request(bbr_io_buf); + + /* Clean up and complete the original I/O. */ + bbr_io_buf->flags |= BBR_IO_HANDLED; + bh = bbr_io_buf->bh; + if (bh->b_end_io) { + /* If this was the bbr_io_buf for an error on the + * normal WRITE, don't free it here. It will be + * freed later in bbr_callback() + */ + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE)) + free_bbr_io_buf(bbr_io_buf); + bh->b_end_io(bh, rc ? 0 : 1); + } + } +} + +/** + * bbr_schedule_io + * + * Place the specified bbr_io_buf on the thread's processing list. 
+ **/ +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf) +{ + unsigned long flags; + spin_lock_irqsave(&bbr_io_list_lock, flags); + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list); + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + dm_daemon_wake(bbr_io_thread); +} + +/** + * bbr_read + * + * If there are any remapped sectors on this object, send this request over + * to the thread for processing. Otherwise send it down the stack normally. + **/ +static int bbr_read(struct bbr_private * bbr_id, + struct buffer_head * bh) +{ + struct bbr_io_buffer * bbr_io_buf; + + + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + !bbr_remap_probe(bbr_id, bh->b_rsector, + bh->b_size >> SECTOR_SHIFT)) { + /* No existing remaps or this request doesn't + * contain any remapped sectors. + */ + bh->b_rdev = bbr_id->dev->dev; + return 1; + } + + /* This request has at least one remapped sector. */ + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ); + if (!bbr_io_buf) { + /* Can't get memory to track the I/O. */ + bh->b_end_io(bh, 0); + return -ENOMEM; + } + + bbr_schedule_io(bbr_io_buf); + return 0; +} + +/** + * bbr_callback + * + * This is the callback for normal write requests. Check for an error + * during the I/O, and send to the thread for processing if necessary. + **/ +static int bbr_callback(struct dm_target * ti, + struct buffer_head * bh, + int rw, + int error, + union map_info * map_context) +{ + struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr; + + if (!bbr_io_buf) + return error; + + /* Will try to relocate the WRITE if: + * - It is an error, and + * - It is not an error of BBR relocation, and + */ + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) { + DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.", + MAJOR(bh->b_rdev), MINOR(bh->b_rdev), + (unsigned long)bbr_io_buf->sector); + /* Indicate this bbr_io_buf is for an error on normal WRITE */ + bbr_io_buf->flags |= BBR_IO_RELOCATE; + bbr_schedule_io(bbr_io_buf); + /* Returns >0 so that DM will let us retry the I/O */ + return 1; + } + + free_bbr_io_buf(bbr_io_buf); + return error; +} + +/** + * bbr_write + * + * If there are any remapped sectors on this object, send the request over + * to the thread for processing. Otherwise, register for callback + * notification, and send the request down normally. + **/ +static int bbr_write(struct bbr_private * bbr_id, + struct buffer_head * bh, + union map_info * map_context) +{ + struct bbr_io_buffer * bbr_io_buf; + + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE); + if (!bbr_io_buf) { + /* Can't get memory to track the I/O. */ + bh->b_end_io(bh, 0); + return -ENOMEM; + } + + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + !bbr_remap_probe(bbr_id, bh->b_rsector, + bh->b_size >> SECTOR_SHIFT)) { + /* No existing remaps or this request + * contains no remapped sectors. + */ + bh->b_rdev = bbr_id->dev->dev; + map_context->ptr = bbr_io_buf; + return 1; + } else { + /* This request contains at least one remapped sector. 
*/ + map_context->ptr = NULL; + bbr_schedule_io(bbr_io_buf); + } + return 0; +} + +/** + * Construct a bbr mapping + **/ +static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv) +{ + struct bbr_private * bbr_id; + u32 block_size; + char * end; + int rc = -EINVAL; + + if (argc != 8) { + ti->error = "dm-bbr requires exactly 8 arguments: " + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size"; + goto out1; + } + + bbr_id = bbr_alloc_private(); + if (!bbr_id) { + ti->error = "dm-bbr: Error allocating bbr private data."; + goto out1; + } + + bbr_id->offset = simple_strtoull(argv[1], &end, 10); + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10); + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10); + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10); + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10); + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10); + block_size = simple_strtoul(argv[7], &end, 10); + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT); + + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT, + GFP_KERNEL); + if (!bbr_id->bbr_table) { + ti->error = "dm-bbr: Error allocating bbr table."; + goto out2; + } + + if (dm_get_device(ti, argv[0], 0, ti->len, + dm_table_get_mode(ti->table), &bbr_id->dev)) { + ti->error = "dm-bbr: Device lookup failed"; + goto out2; + } + + /* Using a semaphore here is probably overkill, + * but at least it will be correct. + */ + down(&bbr_instances_lock); + if (bbr_instances == 0) { + rc = bbr_global_init(); + if (rc) { + up(&bbr_instances_lock); + goto out3; + } + } + bbr_instances++; + up(&bbr_instances_lock); + + rc = bbr_setup(bbr_id); + if (rc) { + ti->error = "dm-bbr: Device setup failed"; + goto out4; + } + + ti->private = bbr_id; + return 0; + +out4: + down(&bbr_instances_lock); + bbr_instances--; + if (bbr_instances == 0) { + bbr_global_cleanup(); + } + up(&bbr_instances_lock); + +out3: + dm_put_device(ti, bbr_id->dev); +out2: + bbr_free_private(bbr_id); +out1: + return rc; +} + +static void bbr_dtr(struct dm_target * ti) +{ + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; + + dm_put_device(ti, bbr_id->dev); + bbr_free_private(bbr_id); + + down(&bbr_instances_lock); + bbr_instances--; + if (bbr_instances == 0) { + bbr_global_cleanup(); + } + up(&bbr_instances_lock); +} + +static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw, + union map_info * map_context) +{ + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; + + bh->b_rsector += bbr_id->offset; + switch (rw) { + case READ: + case READA: + map_context->ptr = NULL; + return bbr_read(bbr_id, bh); + case WRITE: + return bbr_write(bbr_id, bh, map_context); + default: + return -EIO; + } +} + +static int bbr_status(struct dm_target * ti, status_type_t type, + char * result, unsigned int maxlen) +{ + struct bbr_private * bbr_id = (struct bbr_private *) ti->private; + + switch (type) { + case STATUSTYPE_INFO: + result[0] = '\0'; + break; + + case STATUSTYPE_TABLE: + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u", + dm_kdevname(bbr_id->dev->dev), bbr_id->offset, + bbr_id->lba_table1, bbr_id->lba_table2, + bbr_id->nr_sects_bbr_table, + bbr_id->start_replacement_sect, + bbr_id->nr_replacement_blks, + bbr_id->blksize_in_sects << SECTOR_SHIFT); + break; + } + return 0; +} + +static struct target_type bbr_target = { + name: "bbr", + module: THIS_MODULE, + ctr: bbr_ctr, + 
dtr: bbr_dtr, + map: bbr_map, + end_io: bbr_callback, + status: bbr_status, +}; + +int __init dm_bbr_init(void) +{ + int r = dm_register_target(&bbr_target); + + if (r < 0) + DMERR("dm-bbr: register failed %d", r); + + return r; +} + +void __exit dm_bbr_exit(void) +{ + int r = dm_unregister_target(&bbr_target); + + if (r < 0) + DMERR("dm-bbr: unregister failed %d", r); +} + +module_init(dm_bbr_init); +module_exit(dm_bbr_exit); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22/drivers/md/dm-bbr.h linux-2.4.22-evms/drivers/md/dm-bbr.h --- linux-2.4.22/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.22-evms/drivers/md/dm-bbr.h 2003-09-15 17:08:42.000000000 +0200 @@ -0,0 +1,148 @@ +/* + * Copyright (c) International Business Machines Corp., 2002-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/drivers/md/dm-bbr.h + * + * Bad-block-relocation (BBR) target for device-mapper. + * + * The BBR target is designed to remap I/O write failures to another safe + * location on disk. Note that most disk drives have BBR built into them, + * this means that our software BBR will be only activated when all hardware + * BBR replacement sectors have been used. + */ + +#ifndef _DM_BBR_H_ +#define _DM_BBR_H_ + +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */ +#define BBR_ENTRIES_PER_SECT 31 +#define BBR_NR_BUFS 128 +#define INITIAL_CRC 0xFFFFFFFF +#define CRC_POLYNOMIAL 0xEDB88320L + +/** + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines. + * Use these in place of %Ld, %Lu, and %Lx. + **/ +#if BITS_PER_LONG > 32 +#define PFU64 "%lu" +#else +#define PFU64 "%Lu" +#endif + +/** + * struct bbr_table_entry + * @bad_sect: LBA of bad location. + * @replacement_sect: LBA of new location. + * + * Structure to describe one BBR remap. + **/ +struct bbr_table_entry { + u64 bad_sect; + u64 replacement_sect; +}; + +/** + * struct bbr_table + * @signature: Signature on each BBR table sector. + * @crc: CRC for this table sector. + * @sequence_number: Used to resolve conflicts when primary and secondary + * tables do not match. + * @in_use_cnt: Number of in-use table entries. + * @entries: Actual table of remaps. + * + * Structure to describe each sector of the metadata table. Each sector in this + * table can describe 31 remapped sectors. + **/ +struct bbr_table { + u32 signature; + u32 crc; + u32 sequence_number; + u32 in_use_cnt; + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT]; +}; + +/** + * struct bbr_runtime_remap + * + * Node in the binary tree used to keep track of remaps. + **/ +struct bbr_runtime_remap { + struct bbr_table_entry remap; + struct bbr_runtime_remap *left; + struct bbr_runtime_remap *right; +}; + +/** + * struct bbr_private + * @dev: Info about underlying device. + * @bbr_table: Copy of metadata table. + * @offset: LBA of data area. 
+ * @lba_table1: LBA of primary BBR table. + * @lba_table2: LBA of secondary BBR table. + * @nr_sects_bbr_table: Size of each BBR table. + * @nr_replacement_blks: Number of replacement blocks. + * @start_replacement_sect: LBA of start of replacement blocks. + * @blksize_in_sects: Size of each block. + * @in_use_replacement_blks: Current number of remapped blocks. + * @remap_root: Binary tree containing all remaps. + * @bbr_id_lock: Lock for the binary tree. + * + * Private data for each BBR target. + **/ +struct bbr_private { + struct dm_dev * dev; + struct bbr_table * bbr_table; + struct bbr_runtime_remap * remap_root; + u64 offset; + u64 lba_table1; + u64 lba_table2; + u64 nr_sects_bbr_table; + u64 start_replacement_sect; + u64 nr_replacement_blks; + u32 blksize_in_sects; + atomic_t in_use_replacement_blks; + spinlock_t bbr_id_lock; +}; + +#define BBR_IO_HANDLED (1<<0) +#define BBR_IO_RELOCATE (1<<1) + +/** + * struct bbr_io_buffer + * @bbr_io_list: Thread's list of bbr_io_buf's. + * @bbr_id: Object for this request. + * @bh: Original buffer_head. + * @sector: Original sector + * @flags: Operation flag (BBR_IO_*) + * @rw: READ or WRITE. + * @rc: Return code from bbr_io_handler. + * + * Structure used to track each write request. + **/ +struct bbr_io_buffer { + struct list_head bbr_io_list; + struct bbr_private *bbr_id; + struct buffer_head *bh; + u64 sector; + u32 flags; + s32 rw; + s32 rc; +}; + +#endif + diff -urN linux-2.4.22/drivers/md/dm-snapshot.c linux-2.4.22-evms/drivers/md/dm-snapshot.c --- linux-2.4.22/drivers/md/dm-snapshot.c 2003-09-15 17:07:45.000000000 +0200 +++ linux-2.4.22-evms/drivers/md/dm-snapshot.c 2003-09-15 17:08:35.000000000 +0200 @@ -92,6 +92,9 @@ /* List of snapshots for this origin */ struct list_head snapshots; + + /* Count of snapshots and origins referrencing this structure. */ + unsigned int count; }; /* @@ -155,6 +158,35 @@ } /* + * Allocate and initialize an origin structure. + */ +static struct origin * __alloc_origin(kdev_t dev) +{ + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL); + if (o) { + o->dev = dev; + INIT_LIST_HEAD(&o->hash_list); + INIT_LIST_HEAD(&o->snapshots); + __insert_origin(o); + } + return o; +} + +static void __get_origin(struct origin *o) +{ + o->count++; +} + +static void __put_origin(struct origin *o) +{ + o->count--; + if (o->count == 0) { + list_del(&o->hash_list); + kfree(o); + } +} + +/* * Make a note of the snapshot and its origin so we can look it * up when the origin has a write on it. 
*/ @@ -168,20 +200,37 @@ if (!o) { /* New origin */ - o = kmalloc(sizeof(*o), GFP_KERNEL); + o = __alloc_origin(dev); if (!o) { up_write(&_origins_lock); return -ENOMEM; } + } - /* Initialise the struct */ - INIT_LIST_HEAD(&o->snapshots); - o->dev = dev; + __get_origin(o); + list_add_tail(&snap->list, &o->snapshots); - __insert_origin(o); + up_write(&_origins_lock); + return 0; +} + +static int register_origin(kdev_t dev) +{ + struct origin *o; + + down_write(&_origins_lock); + o = __lookup_origin(dev); + + if (!o) { + /* New origin */ + o = __alloc_origin(dev); + if (!o) { + up_write(&_origins_lock); + return -ENOMEM; + } } - list_add_tail(&snap->list, &o->snapshots); + __get_origin(o); up_write(&_origins_lock); return 0; @@ -195,11 +244,18 @@ o = __lookup_origin(s->origin->dev); list_del(&s->list); - if (list_empty(&o->snapshots)) { - list_del(&o->hash_list); - kfree(o); - } + __put_origin(o); + + up_write(&_origins_lock); +} + +static void unregister_origin(kdev_t dev) +{ + struct origin *o; + down_write(&_origins_lock); + o = __lookup_origin(dev); + __put_origin(o); up_write(&_origins_lock); } @@ -1090,6 +1146,13 @@ return r; } + r = register_origin(dev->dev); + if (r) { + ti->error = "Cannot register origin"; + dm_put_device(ti, dev); + return r; + } + ti->private = dev; return 0; } @@ -1097,6 +1160,7 @@ static void origin_dtr(struct dm_target *ti) { struct dm_dev *dev = (struct dm_dev *) ti->private; + unregister_origin(dev->dev); dm_put_device(ti, dev); } diff -urN linux-2.4.22/drivers/md/dm-sparse.c linux-2.4.22-evms/drivers/md/dm-sparse.c --- linux-2.4.22/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.22-evms/drivers/md/dm-sparse.c 2003-09-15 17:09:48.000000000 +0200 @@ -0,0 +1,713 @@ +/* -*- linux-c -*- */ + +/* + * Copyright (c) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/drivers/md/dm-sparse.c + * + * Sparse target for device-mapper. + * + * This target provides the ability to create a sparse device. This + * allows a device to pretend to be larger than it really is. + */ + +#include +#include +#include +#include +#include +#include + +#include "dm.h" +#include "dm-io.h" + +#define MAX_HASH_CHAIN_ENTRIES 10 +#define NAME_SIZE 127 + +/* Sparse Ioctl + device + start + chunk_size + chunks + */ + +// Entries in the sparse remapping structure +struct sparse_hash_entry { + u64 org_chunk; // Chunk number, not LBA. + u64 sparse_chunk; // Chunk number, not LBA. 
+ struct sparse_hash_entry * next; + struct sparse_hash_entry * prev; +}; + +//Private data structure +struct sparse_volume { + struct dm_dev *dev; + struct rw_semaphore sparse_semaphore; + struct sparse_hash_entry ** sparse_map; // Hash table of remappings + struct sparse_hash_entry * free_hash_list; + kmem_cache_t * hash_slab; + mempool_t * hash_pool; + u32 dm_io_flag; + u32 chunk_size; // Sectors. + u32 chunk_shift; // Shift value for chunk size. + u32 num_chunks; // In this volume. + u32 next_cow_entry; // Index into current COW table. + u64 current_cow_sector; // LOGICAL sector of current COW table. + u32 next_free_chunk; // Index of next free chunk (not LBA!). + u32 hash_table_size; // Size of the hash table for the remap. + u64 start; + u64 cow_table[64]; // One sector's worth of COW tables. +}; + +/*************************** OLD SERVICES ****************************/ + +/* computes log base 2 of value */ +inline int log2(u32 value) //ok to change to u32? +{ + int result = -1; + long tmp; //ok to change to long? + + if (value) { + tmp = value; + result++; + while (!(tmp & 1)) { + result++; + tmp >>= 1; + } + if (tmp != 1) { + result = -2; + } + } + return result; +} + +/********************************* Functions *********************************/ + +/***************************** Hash Functions *****************************/ + +/* Take and initialize from the free hash list */ +static struct sparse_hash_entry * +allocate_sparse_hash_entry( struct sparse_volume * volume, + u64 org_chunk, + u64 sparse_chunk ) +{ + struct sparse_hash_entry * hash_entry; + + hash_entry = volume->free_hash_list; + if ( hash_entry ) { //should always be the case b/c preallocate these + volume->free_hash_list = hash_entry->next; + hash_entry->org_chunk = org_chunk; + hash_entry->sparse_chunk = sparse_chunk; + hash_entry->next = NULL; + hash_entry->prev = NULL; + } + + return hash_entry; +} + +/* + * This function inserts a new entry into a sparse hash chain, immediately + * following the specified entry. This function should not be used to add + * an entry into an empty list, or as the first entry in an existing list. + * For that case, use insert_sparse_map_entry_at_head(). + */ +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry, + struct sparse_hash_entry * base ) +{ + entry->next = base->next; + entry->prev = base; + base->next = entry; + if ( entry->next ) { + entry->next->prev = entry; + } + return 0; +} + +/* + * This function inserts a new entry into a sparse chain as the first + * entry in the chain. + */ +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry, + struct sparse_hash_entry ** head ) +{ + entry->next = *head; + entry->prev = NULL; + *head = entry; + if ( entry->next ) { + entry->next->prev = entry; + } + return 0; +} + +/* + * Delete all items in a single chain in the hash table. + */ +static int delete_sparse_hash_chain( struct sparse_volume * vol, + struct sparse_hash_entry * head ) +{ + struct sparse_hash_entry * next; + + while ( head ) { + next = head->next; + mempool_free( head, vol->hash_pool ); + head = next; + } + return 0; +} + +/* + * This function will search the hash chain that is anchored at the + * specified head pointer. If the chunk number is found, a pointer to that + * entry in the chain is set, and a 1 is returned. If the chunk is not + * found, a pointer to the previous entry is set and 0 is returned. 
If the + * return pointer is NULL, this means either the list is empty, or the + * specified sector should become the first list item. + */ +static int search_sparse_hash_chain( u64 chunk, + struct sparse_hash_entry * head, + struct sparse_hash_entry ** result ) +{ + struct sparse_hash_entry * curr = head; + struct sparse_hash_entry * prev = head; + while ( curr && curr->org_chunk < chunk ) { + prev = curr; + curr = curr->next; + } + if (!curr) { // Either an empty chain or went off the end of the chain. + *result = prev; + return 0; + } + else if ( curr->org_chunk != chunk ) { + *result = curr->prev; + return 0; + } + else { + *result = curr; + return 1; + } +} + +/* + * This function takes a cow table entry (from the on-disk data), and + * converts it into an appropriate entry for the sparse map, and + * inserts it into the appropriate map for the specified volume. + */ +static int add_cow_entry_to_sparse_map( u64 org_chunk, + u64 sparse_chunk, + struct sparse_volume * volume ) +{ + struct sparse_hash_entry * new_entry; + struct sparse_hash_entry * target_entry; + u32 hash_value; + int rc = -EINVAL; + + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk); + if (!new_entry) { + return -ENOMEM; + } + + hash_value = (long)org_chunk % volume->hash_table_size; + + if (! search_sparse_hash_chain( org_chunk, + volume->sparse_map[hash_value], + &target_entry ) ) { + //should always take this path + + if ( target_entry ) { + insert_sparse_hash_entry( new_entry, target_entry ); + } + else { + insert_sparse_hash_entry_at_head + ( new_entry, &(volume->sparse_map[hash_value]) ); + } + rc = 0; + } + return rc; +} + +/* + * Construct the initial hash table state based on + * existing COW tables on the disk. + */ +static int build_sparse_maps(struct sparse_volume * volume) +{ + int rc = 0, done = 0; + struct io_region job; + struct page * page; + unsigned int error, offset; + + while (!done) { + + // Read in one sector's worth of COW tables. + job.dev = volume->dev->dev; + job.sector = volume->current_cow_sector; + job.count = 1; + page = virt_to_page(volume->cow_table); + offset = (unsigned long)volume->cow_table & ~PAGE_MASK; + rc = dm_io_sync(1, &job, READ, page, offset, &error); + if (rc) { + return rc; + } + + // Translate every valid COW table entry into + // a sparse map entry. + for ( volume->next_cow_entry = 0; + + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) && + volume->cow_table[volume->next_cow_entry] != + 0xffffffffffffffff; + + volume->next_cow_entry++, volume->next_free_chunk++ ) { + + if ( (rc = add_cow_entry_to_sparse_map + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ), + volume->next_free_chunk, volume ))) { + return( rc ); + } + } + // Move on to the next sector if necessary. + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) { + volume->current_cow_sector++; + } + else { + done = 1; + } + } + return 0; +} + +/************************* Other Functions ************************/ + +/* + * Function: sparse_remap_chunk + * + * This function performs a sector remap on a sparse volume. This should + * be called from the I/O path, It first determines the base sector + * of the chunk containing the specified sector, and saves the remainder. + * Then it performs a search through the sparse map for the specified + * volume. If a match is found, the sector number is changed to the new + * value. If no match is found, the value is left the same, meaning the + * chunk has not been remapped. 
+ */ +static int sparse_remap_chunk( struct sparse_volume * sparse_volume, + u64 * sector ) +{ + struct sparse_hash_entry * result; + u64 chunk; + u32 hash_value; + u32 remainder; + int rc = 1; + + down_read(&sparse_volume->sparse_semaphore); + + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); + chunk = *sector >> sparse_volume->chunk_shift; + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; + + if ( search_sparse_hash_chain( chunk, + sparse_volume->sparse_map[hash_value], + &result) ) { + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift ) + + remainder; + rc = 0; + } + up_read(&sparse_volume->sparse_semaphore); + return rc; +} + +/* Function: sparse_cow_write + * + * Check this sparse node to see if the given sector/chunk has been + * remapped yet. If it hasn't, create a new hash table entry, update the + * in-memory COW table, write the COW table to disk. + */ + +static int sparse_cow_write( struct sparse_volume * sparse_volume, + u64 * sector ) +{ + struct sparse_hash_entry * target_entry, * new_map_entry; + struct io_region job; + struct page * page; + char * cow = NULL; + unsigned int error, offset; + u64 chunk; + u32 hash_value = 0; + u32 remainder; + int rc; + + down_write(&sparse_volume->sparse_semaphore); + + remainder = *sector & (u64)(sparse_volume->chunk_size - 1); + chunk = *sector >> sparse_volume->chunk_shift; + hash_value = ((u32)chunk) % sparse_volume->hash_table_size; + + if ( search_sparse_hash_chain( chunk, + sparse_volume->sparse_map[hash_value], + &target_entry) ) { + *sector = + ( target_entry->sparse_chunk << sparse_volume->chunk_shift ) + + remainder; + rc = 0; + goto out; + } + + // Is there enough room left on this sparse to remap this chunk? + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) { + DMERR("dm-sparse: full no new remaps allowed\n"); + rc = -ENOSPC; + goto out; + } + + // Create and initialize a new hash table entry for the new remap. + new_map_entry = allocate_sparse_hash_entry + (sparse_volume, chunk, sparse_volume->next_free_chunk); + if ( ! new_map_entry ) { + // Can't get memory for map entry. Disable this sparse. + DMERR("dm-sparse: memory error allocating hash entry\n"); + rc = -ENOMEM; + goto out; + } + + //Always write cow table so its safe + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL ); + if (! cow ) { + // Can't get I/O buffer. Disable this sparse. + DMERR("dm-sparse: memory error allocating COW table buffer"); + rc = -ENOMEM; + goto out; + } + + // Add the entry to the hash table. + if ( target_entry ) { + insert_sparse_hash_entry( new_map_entry, target_entry ); + } + else { + insert_sparse_hash_entry_at_head + ( new_map_entry, + &(sparse_volume->sparse_map[hash_value]) ); + } + + sparse_volume->next_free_chunk++; + + // Update the appropriate entry in the COW table. + sparse_volume->cow_table[sparse_volume->next_cow_entry] = + cpu_to_le64(chunk); + sparse_volume->next_cow_entry++; + + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE); + + //because of ordering issues needs to be synchronous + job.dev = sparse_volume->dev->dev; + job.sector = sparse_volume->current_cow_sector; + job.count = 1; + page = virt_to_page(cow); + offset = (unsigned long)cow & ~PAGE_MASK; + dm_io_sync(1, &job, WRITE, page, offset, &error); + + // Update the in-memory COW table values. 
+ if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) ) + { + sparse_volume->next_cow_entry = 0; + sparse_volume->current_cow_sector++; + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE); + } + + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift ) + + remainder; + + rc = 0; + + out: + up_write(&sparse_volume->sparse_semaphore); + if ( cow ) { + kfree( cow ); + } + + return rc; +} + +/************************ EXPORT FUNCTIONS ************************/ + +/* + * Function: sparse_dtr + */ +static void sparse_dtr( struct dm_target *ti ) +{ + struct sparse_volume * vol = (struct sparse_volume *)ti->private; + int i; + + if (vol) { + + if (vol->sparse_map) { + for ( i = 0; i < vol->hash_table_size; i++ ) { + delete_sparse_hash_chain( vol, vol->sparse_map[i] ); + } + delete_sparse_hash_chain( vol, vol->free_hash_list ); + vfree(vol->sparse_map); + } + + if (vol->hash_pool) + mempool_destroy(vol->hash_pool); + + if (vol->hash_slab) + kmem_cache_destroy(vol->hash_slab); + + dm_put_device(ti, vol->dev); + + if (vol->dm_io_flag) { + dm_io_put(1); + } + + kfree( vol ); + } +} + +/* + * Function: sparse_ctr + */ +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv ) +{ + int i, rc = -EINVAL; + struct sparse_hash_entry *new_entry; + struct sparse_volume *vol; + struct dm_dev *dev; + u32 chunk_size, chunks; + u64 start; + char* end, slab_name[NAME_SIZE+1]; + + if ( argc != 4 ) { + ti->error="dm-sparse: wrong number of arguments"; + return rc; + } + + start = simple_strtoull(argv[1], &end, 10); + if (*end) { + ti->error="dm-sparse: Invalid first chunk lba"; + return rc; + } + + chunk_size = simple_strtoul(argv[2], &end, 10); + if (*end) { + ti->error="dm-sparse: Invalid chunk_size"; + return rc; + } + + chunks = simple_strtoul(argv[3], &end, 10); + if (*end) { + ti->error="dm-sparse: Invalid number of chunks"; + return rc; + } + + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size, + dm_table_get_mode(ti->table), &dev ) ) { + ti->error = "dm-sparse: Device lookup failed"; + return rc; + } + + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL); + if ( !vol ) { + ti->error = "dm-sparse: Memory allocation for private-data failed"; + rc = -ENOMEM; + goto out; + } + + memset( vol, 0, sizeof(struct sparse_volume) ); + + rc = dm_io_get(1); + if (rc) { + ti->error = "dm-sparse: failed to initialize dm-io."; + sparse_dtr(ti); + return rc; + } + + // Initialize + vol->dm_io_flag = 1; + vol->chunk_size = chunk_size; + vol->chunk_shift = log2(chunk_size); + vol->num_chunks = chunks; + vol->current_cow_sector = 1; + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1; + vol->start = start; + vol->dev = dev; + init_rwsem(&vol->sparse_semaphore); + + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol); + vol->hash_slab = kmem_cache_create(slab_name, + sizeof(struct sparse_hash_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if ( ! vol->hash_slab ) { + ti->error = "dm-sparse: memory allocation error in hash slab create"; + sparse_dtr(ti); + return -ENOMEM; + } + vol->hash_pool = mempool_create(1, mempool_alloc_slab, + mempool_free_slab, + vol->hash_slab); + if ( ! vol->hash_pool ) { + ti->error = "dm-sparse: memory allocation error in hash pool create"; + sparse_dtr(ti); + return -ENOMEM; + } + + // Sparse hash table + vol->sparse_map = vmalloc( vol->hash_table_size * + sizeof( struct sparse_hash_entry * ) ); + if ( ! 
vol->sparse_map ) { + ti->error = "dm-sparse: Memory allocation error in sparse_map create"; + sparse_dtr(ti); + return -ENOMEM; + } + + memset( vol->sparse_map, 0, vol->hash_table_size * + sizeof( struct sparse_hash_entry * ) ); + + for ( i = 0; i < chunks; i++ ) { + + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL ); + if ( ! new_entry ) { + ti->error="dm-sparse: memory allocation error in hash table setup"; + sparse_dtr(ti); + return -ENOMEM; + } + + new_entry->next = vol->free_hash_list; + vol->free_hash_list = new_entry; + } + + rc = build_sparse_maps(vol); + if (rc) { + ti->error = "dm-sparse: error building hash tables"; + sparse_dtr(ti); + return rc; + } + + ti->private = vol; + return rc; + + out: + dm_put_device(ti, dev); + return rc; +} + +/* + * Function: sparse_map + */ +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw, + union map_info *map_context ) +{ + struct sparse_volume * volume = (struct sparse_volume*)ti->private; + u64 sector = bh->b_rsector; + int rc; + + + + // Check if this sector has been remapped + rc = sparse_remap_chunk( volume, &sector ); + + if ( rc < 0 ) { //Error + bh->b_end_io(bh, 0); + return rc; + } + + if ( rc == 0 ) { // Remapped I/O : read or write same logic + bh->b_rsector = volume->start + sector; + bh->b_rdev = volume->dev->dev; + return 1; + } + + // ( Previously )Un-mapped: read / write different logic + + if ( rw ) { //write : + rc = sparse_cow_write( volume, &sector ); + + if ( rc < 0 ) { //Error + bh->b_end_io(bh, 0); + return rc; + } + //Send write on + bh->b_rsector = volume->start + sector; + bh->b_rdev = volume->dev->dev; + return 1; + } + + //Reading something that was never written + //return zeros and indicate complete + memset(bh->b_data, 0x0, bh->b_size); + bh->b_end_io(bh, 1); + return 0; +} + +static int sparse_status( struct dm_target *ti, status_type_t type, + char *result, unsigned int maxlen ) +{ + struct sparse_volume * vol = (struct sparse_volume * )ti->private; + + switch(type) { + + case STATUSTYPE_INFO: + snprintf( result, maxlen, "%d%%", + ( vol->next_free_chunk * 100 ) / vol->num_chunks ); + break; + + case STATUSTYPE_TABLE: + snprintf( result, maxlen, "%s %Lu %u %u", + dm_kdevname(vol->dev->dev), vol->start, + vol->chunk_size, vol->num_chunks ); + break; + + default: + break; + } + + return 0; +} + +/****************** FUNCTION TABLE **********************/ + +static struct target_type sparse_target = { + .name = "sparse", + .module = THIS_MODULE, + .ctr = sparse_ctr, + .dtr = sparse_dtr, + .map = sparse_map, + .status = sparse_status, +}; + +/********************* REGISTRATION *****************/ + +int __init sparse_init(void) +{ + int rc = dm_register_target(&sparse_target); + + if ( rc < 0 ) + DMWARN("sparse target registration failed"); + + return rc; +} + +void __exit sparse_exit(void) +{ + if (dm_unregister_target(&sparse_target) ) + DMWARN("sparse target unregistration failed"); + + return; +} + +module_init(sparse_init); +module_exit(sparse_exit); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.22/drivers/md/multipath.c linux-2.4.22-evms/drivers/md/multipath.c --- linux-2.4.22/drivers/md/multipath.c 2003-06-13 16:51:34.000000000 +0200 +++ linux-2.4.22-evms/drivers/md/multipath.c 2003-09-15 17:09:36.000000000 +0200 @@ -139,15 +139,16 @@ static int multipath_map (mddev_t *mddev, kdev_t *rdev) { multipath_conf_t *conf = mddev_to_conf(mddev); - int i, disks = MD_SB_DISKS; + int i; /* * Later we do read balancing on the read side * now we use the first available disk.
	 */
-	for (i = 0; i < disks; i++) {
+	for (i = 0; i < conf->nr_disks; i++) {
 		if (conf->multipaths[i].operational) {
+			/* first operational is winner! */
 			*rdev = conf->multipaths[i].dev;
 			return (0);
 		}
@@ -191,6 +192,8 @@
 {
 	struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
 
+	atomic_dec(&mp_bh->multipath->nr_pending);
+
 	/*
 	 * this branch is our 'one multipath IO has finished' event handler:
 	 */
@@ -223,19 +226,39 @@
 }
 
 /*
- * This routine returns the disk from which the requested read should
- * be done.
+ * Multipath read balance ...
+ *
+ * Returns:
+ *
+ *	If no active paths
+ *
+ *		- Error ( -1 )
+ *
+ *	If active paths == 1
+ *
+ *		- 1st active path encountered
+ *
+ *	If active paths > 1
+ *
+ *		- 1st idle active path encountered
+ *		- else ... the active path doing the least amount of work.
 */
-
 static int multipath_read_balance (multipath_conf_t *conf)
 {
-	int disk;
-
-	for (disk = 0; disk < conf->raid_disks; disk++)
-		if (conf->multipaths[disk].operational)
-			return disk;
-	BUG();
-	return 0;
+	int i, disk=-1, nr_pending, least_pending=0;
+
+	for (i=0; i<conf->nr_disks; i++) {
+		if (conf->multipaths[i].operational) {
+			nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
+			if (nr_pending==0 || conf->working_disks==1)
+				return i;
+			if (least_pending==0 || nr_pending<least_pending) {
+				least_pending = nr_pending;
+				disk = i;
+			}
+		}
+	}
+
+	return disk;
 }
-	multipath = conf->multipaths + multipath_read_balance(conf);
+	disk = multipath_read_balance(conf);
+	if (disk==-1) {
+		printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
+		buffer_IO_error(bh);
+		return 0;
+	}
+
+	multipath = conf->multipaths + disk;
+	mp_bh->multipath = multipath;
+	atomic_inc(&multipath->nr_pending);
 
 	bh_req = &mp_bh->bh_req;
 	memcpy(bh_req, bh, sizeof(*bh));
@@ -331,13 +364,14 @@
 {
 	multipath_conf_t *conf = mddev_to_conf(mddev);
 	struct multipath_info * multipaths = conf->multipaths;
-	int disks = MD_SB_DISKS;
 	int other_paths = 1;
-	int i;
+	int i, first = 1;
+	mdk_rdev_t *rdev;
+	struct md_list_head *tmp;
 
 	if (conf->working_disks == 1) {
 		other_paths = 0;
-		for (i = 0; i < disks; i++) {
+		for (i = 0; i < MD_SB_DISKS; i++) {
 			if (multipaths[i].spare) {
 				other_paths = 1;
 				break;
@@ -351,16 +385,17 @@
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
-		for (i = 0; i < disks; i++) {
+		for (i = 0; i < MD_SB_DISKS; i++) {
 			if (multipaths[i].dev==dev && !multipaths[i].operational)
 				return 0;
 		}
 		printk (LAST_DISK);
 	} else {
+		mdp_super_t *sb = mddev->sb;
 		/*
 		 * Mark disk as unusable
 		 */
-		for (i = 0; i < disks; i++) {
+		for (i = 0; i < MD_SB_DISKS; i++) {
 			if (multipaths[i].dev==dev && multipaths[i].operational) {
 				mark_disk_bad(mddev, i);
 				break;
@@ -369,7 +404,6 @@
 	if (!conf->working_disks) {
 		int err = 1;
 		mdp_disk_t *spare;
-		mdp_super_t *sb = mddev->sb;
 
 		spare = get_spare(mddev);
 		if (spare) {
@@ -384,6 +418,21 @@
 			sb->spare_disks--;
 		}
 	}
+	/* prevent unnecessary work in md_do_recovery() */
+	if (conf->working_disks) {
+		conf->raid_disks = conf->working_disks
+			= sb->raid_disks = sb->active_disks;
+	}
+	/* update alias disk info to ensure we can do sb commit. */
+	ITERATE_RDEV(mddev,rdev,tmp) {
+		if (first && disk_active(&sb->disks[rdev->desc_nr])) {
+			rdev->alias_device = 0;
+			first = 0;
+		} else {
+			if (!disk_faulty(&sb->disks[rdev->desc_nr]))
+				rdev->alias_device = 1;
+		}
+	}
 	}
 	return 0;
 }
@@ -677,9 +726,8 @@
 /*
  * This is a kernel thread which:
  *
- *	1. Retries failed read operations on working multipaths.
+ *	1. Retries failed operations on working multipaths.
  *	2. Updates the raid superblock when problems encounter.
- *	3. Performs writes following reads for array syncronising.
 */
 static void multipathd (void *data)
@@ -833,6 +881,7 @@
 	mdk_rdev_t *rdev, *def_rdev = NULL;
 	struct md_list_head *tmp;
 	int num_rdevs = 0;
+	int active_disks = 0, spare_disks = 0, faulty_disks = 0;
 
 	MOD_INC_USE_COUNT;
 
@@ -881,9 +930,7 @@
 			printk(NOT_IN_SYNC, partition_name(rdev->dev));
 
 		/*
-		 * Mark all disks as spare to start with, then pick our
-		 * active disk.  If we have a disk that is marked active
-		 * in the sb, then use it, else use the first rdev.
+		 * Mark all disks as spare to start with.
 		 */
 		disk->number = desc->number;
 		disk->raid_disk = desc->raid_disk;
@@ -894,20 +941,21 @@
 		mark_disk_sync(desc);
 
 		if (disk_active(desc)) {
-			if(!conf->working_disks) {
-				printk(OPERATIONAL, partition_name(rdev->dev),
-					desc->raid_disk);
-				disk->operational = 1;
-				disk->spare = 0;
-				conf->working_disks++;
-				def_rdev = rdev;
-			} else {
-				mark_disk_spare(desc);
-			}
-		} else
-			mark_disk_spare(desc);
+			printk(OPERATIONAL, partition_name(rdev->dev),
+				desc->raid_disk);
+			disk->operational = 1;
+			disk->spare = 0;
+			conf->working_disks++;
+			def_rdev = rdev;
+			active_disks++;
+		} else if (disk_faulty(desc)) {
+			disk->spare = 0;
+			faulty_disks++;
+		} else {
+			spare_disks++;
+		}
 
-		if(!num_rdevs++) def_rdev = rdev;
+		num_rdevs++;
 	}
 	if(!conf->working_disks && num_rdevs) {
 		desc = &sb->disks[def_rdev->desc_nr];
@@ -918,11 +966,12 @@
 		disk->spare = 0;
 		conf->working_disks++;
 		mark_disk_active(desc);
+		active_disks++;
 	}
 	/*
-	 * Make sure our active path is in desc spot 0
+	 * If there is only 1 active path ... make sure it is in desc spot 0
 	 */
-	if(def_rdev->desc_nr != 0) {
+	if (active_disks == 1 && def_rdev->desc_nr != 0) {
 		rdev = find_rdev_nr(mddev, 0);
 		desc = &sb->disks[def_rdev->desc_nr];
 		desc2 = sb->disks;
@@ -940,10 +989,10 @@
 			def_rdev->desc_nr = 0;
 		}
 	}
-	conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
+	conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
 	conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
-	sb->failed_disks = 0;
-	sb->spare_disks = num_rdevs - 1;
+	sb->failed_disks = faulty_disks;
+	sb->spare_disks = spare_disks;
 	mddev->sb_dirty = 1;
 	conf->mddev = mddev;
 	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
diff -urN linux-2.4.22/include/linux/raid/multipath.h linux-2.4.22-evms/include/linux/raid/multipath.h
--- linux-2.4.22/include/linux/raid/multipath.h	2001-11-12 18:51:56.000000000 +0100
+++ linux-2.4.22-evms/include/linux/raid/multipath.h	2003-09-15 17:09:36.000000000 +0200
@@ -15,6 +15,7 @@
 	int		spare;
 
 	int		used_slot;
+	atomic_t	nr_pending;	/* number of pending requests */
 };
 
 struct multipath_private_data {
@@ -63,6 +64,7 @@
 	struct buffer_head	*master_bh;
 	struct buffer_head	bh_req;
 	struct multipath_bh	*next_mp;	/* next for retry or in free list */
+	struct multipath_info	*multipath;	/* allows end_request to easily dec pending buffer count */
 };
 /* bits for multipath_bh.state */
 #define	MPBH_Uptodate	1
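
For context, the sparse target registered above is driven through the normal device-mapper table interface: sparse_ctr expects exactly four arguments (backing device, sector of the first chunk, chunk size in sectors, number of chunks). A rough usage sketch follows; the device name and geometry are purely illustrative and not taken from the patch:

    # hypothetical example: 1 GiB virtual device (2097152 sectors) backed by
    # 65536 chunks of 8 sectors (256 MiB) on /dev/sdb1, chunk data starting at
    # sector 8 of that device.
    #   <start> <length> sparse <device> <first_chunk_lba> <chunk_size> <num_chunks>
    echo "0 2097152 sparse /dev/sdb1 8 8 65536" | dmsetup create sparse0

With such a mapping, sparse_status reports the percentage of chunks consumed for STATUSTYPE_INFO and echoes the same four constructor parameters back for STATUSTYPE_TABLE.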