linux-2.4.25-evms-2.2.1.patch
1diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
2--- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 15:09:18.503177509 +0100
3+++ linux-2.4.24/drivers/md/Config.in 2004-01-18 16:05:08.202479073 +0100
4@@ -12,6 +12,10 @@
5 dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD
6 dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD
7 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
8+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9+ dep_tristate ' Bad Block Relocation Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10+ dep_tristate ' Sparse Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
11+fi
12
13 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
14 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
15diff -urN linux-2.4.24.org/drivers/md/dm-bbr.c linux-2.4.24/drivers/md/dm-bbr.c
16--- linux-2.4.24.org/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
17+++ linux-2.4.24/drivers/md/dm-bbr.c 2004-01-18 16:03:13.099546349 +0100
18@@ -0,0 +1,1227 @@
19+/*
20+ * (C) Copyright IBM Corp. 2002, 2003
21+ *
22+ * This program is free software; you can redistribute it and/or modify
23+ * it under the terms of the GNU General Public License as published by
24+ * the Free Software Foundation; either version 2 of the License, or
25+ * (at your option) any later version.
26+ *
27+ * This program is distributed in the hope that it will be useful,
28+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
29+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30+ * the GNU General Public License for more details.
31+ *
32+ * You should have received a copy of the GNU General Public License
33+ * along with this program; if not, write to the Free Software
34+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
35+ *
36+ * linux/drivers/md/dm-bbr.c
37+ *
38+ * Bad-block-relocation (BBR) target for device-mapper.
39+ *
40+ * The BBR target is designed to remap I/O write failures to another safe
41+ * location on disk. Note that most disk drives have BBR built into them,
42+ * this means that our software BBR will be only activated when all hardware
43+ * BBR replacement sectors have been used.
44+ */
45+
46+#include <linux/kernel.h>
47+#include <linux/module.h>
48+#include <linux/init.h>
49+#include <linux/blkdev.h>
50+#include <linux/spinlock.h>
51+#include <linux/smp_lock.h>
52+#include <linux/slab.h>
53+#include <linux/mempool.h>
54+#include "dm.h"
55+#include "dm-bbr.h"
56+#include "dm-daemon.h"
57+#include "dm-io.h"
58+
59+/* Number of active BBR devices. */
60+static int bbr_instances = 0;
61+static DECLARE_MUTEX(bbr_instances_lock);
62+
63+/* Data pertaining to the I/O thread. */
64+static struct dm_daemon * bbr_io_thread = NULL;
65+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
66+static LIST_HEAD(bbr_io_list);
67+static void bbr_io_handler(void);
68+
69+/* Global pools for bbr_io_buf's and bbr_remap's. */
70+static kmem_cache_t * bbr_io_buf_cache;
71+static mempool_t * bbr_io_buf_pool;
72+static kmem_cache_t * bbr_remap_cache;
73+static mempool_t * bbr_remap_pool;
74+
75+static void bbr_free_remap(struct bbr_private * bbr_id);
76+
77+/**
78+ * destroy_pools
79+ *
80+ * Delete the pools for the remap list and I/O anchors.
81+ **/
82+static void destroy_pools(void)
83+{
84+ if (bbr_io_buf_pool) {
85+ mempool_destroy(bbr_io_buf_pool);
86+ bbr_io_buf_pool = NULL;
87+ }
88+ if (bbr_io_buf_cache) {
89+ kmem_cache_destroy(bbr_io_buf_cache);
90+ bbr_io_buf_cache = NULL;
91+ }
92+ if (bbr_remap_pool) {
93+ mempool_destroy(bbr_remap_pool);
94+ bbr_remap_pool = NULL;
95+ }
96+ if (bbr_remap_cache) {
97+ kmem_cache_destroy(bbr_remap_cache);
98+ bbr_remap_cache = NULL;
99+ }
100+}
101+
102+/**
103+ * create_pools
104+ *
105+ * Create mempools for the remap list and I/O anchors.
106+ **/
107+static int create_pools(void)
108+{
109+ if (!bbr_remap_cache) {
110+ bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
111+ sizeof(struct bbr_runtime_remap),
112+ 0, SLAB_HWCACHE_ALIGN,
113+ NULL, NULL);
114+ if (!bbr_remap_cache) {
115+ DMERR("Unable to create BBR remap cache.");
116+ goto out;
117+ }
118+ }
119+ if (!bbr_remap_pool) {
120+ bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
121+ mempool_free_slab,
122+ bbr_remap_cache);
123+ if (!bbr_remap_pool) {
124+ DMERR("Unable to create BBR remap mempool.");
125+ goto out;
126+ }
127+ }
128+
129+ if (!bbr_io_buf_cache) {
130+ bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
131+ sizeof(struct bbr_io_buffer),
132+ 0, SLAB_HWCACHE_ALIGN,
133+ NULL, NULL);
134+ if (!bbr_io_buf_cache) {
135+ DMERR("Unable to create BBR I/O buffer cache.");
136+ goto out;
137+ }
138+ }
139+ if (!bbr_io_buf_pool) {
140+ bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
141+ mempool_free_slab,
142+ bbr_io_buf_cache);
143+ if (!bbr_io_buf_pool) {
144+ DMERR("Unable to create BBR I/O buffer mempool.");
145+ goto out;
146+ }
147+ }
148+
149+out:
150+ if (!bbr_remap_cache || !bbr_remap_pool ||
151+ !bbr_io_buf_cache || !bbr_io_buf_pool ) {
152+ destroy_pools();
153+ return -ENOMEM;
154+ }
155+
156+ return 0;
157+}
158+
159+/**
160+ * stop_io_thread
161+ *
162+ * Use the dm-daemon services to stop the BBR I/O thread.
163+ **/
164+static void stop_io_thread(void)
165+{
166+ if (bbr_io_thread) {
167+ dm_daemon_stop(bbr_io_thread);
168+ kfree(bbr_io_thread);
169+ bbr_io_thread = NULL;
170+ }
171+}
172+
173+/**
174+ * start_io_thread
175+ *
176+ * Use the dm-daemon services to start the BBR I/O thread.
177+ **/
178+static int start_io_thread(void)
179+{
180+ int rc;
181+
182+ if (!bbr_io_thread) {
183+ bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
184+ if (!bbr_io_thread) {
185+ return -ENOMEM;
186+ }
187+
188+ rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
189+ if (rc) {
190+ kfree(bbr_io_thread);
191+ return rc;
192+ }
193+ }
194+
195+ return 0;
196+}
197+
198+/**
199+ * bbr_global_init
200+ *
201+ * Set up the mempools, I/O thread, and sync-I/O service. This should
202+ * be called only when the first bbr device is created.
203+ **/
204+static int bbr_global_init(void)
205+{
206+ int rc;
207+
208+ rc = create_pools();
209+ if (rc) {
210+ goto out;
211+ }
212+
213+ rc = start_io_thread();
214+ if (rc) {
215+ destroy_pools();
216+ goto out;
217+ }
218+
219+ rc = dm_io_get(1);
220+ if (rc) {
221+ destroy_pools();
222+ stop_io_thread();
223+ goto out;
224+ }
225+
226+out:
227+ return rc;
228+}
229+
230+/**
231+ * bbr_global_cleanup
232+ *
233+ * Clean up the mempools, I/O thread and sync-I/O service. This should
234+ * be called only when the last bbr device is removed.
235+ **/
236+static void bbr_global_cleanup(void)
237+{
238+ destroy_pools();
239+ stop_io_thread();
240+ dm_io_put(1);
241+}
242+
243+static struct bbr_private * bbr_alloc_private(void)
244+{
245+ struct bbr_private *bbr_id;
246+
247+ bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
248+ if (bbr_id) {
249+ memset(bbr_id, 0, sizeof(*bbr_id));
250+ bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
251+ bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
252+ }
253+
254+ return bbr_id;
255+}
256+
257+static void bbr_free_private(struct bbr_private *bbr_id)
258+{
259+ if (bbr_id->bbr_table) {
260+ kfree(bbr_id->bbr_table);
261+ }
262+ bbr_free_remap(bbr_id);
263+ kfree(bbr_id);
264+}
265+
266+static u32 crc_table[256];
267+static u32 crc_table_built = 0;
268+
269+static void build_crc_table(void)
270+{
271+ u32 i, j, crc;
272+
273+ for (i = 0; i <= 255; i++) {
274+ crc = i;
275+ for (j = 8; j > 0; j--) {
276+ if (crc & 1)
277+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
278+ else
279+ crc >>= 1;
280+ }
281+ crc_table[i] = crc;
282+ }
283+ crc_table_built = 1;
284+}
285+
286+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
287+{
288+ unsigned char *current_byte;
289+ u32 temp1, temp2, i;
290+
291+ current_byte = (unsigned char *) buffer;
292+ /* Make sure the crc table is available */
293+ if (!crc_table_built)
294+ build_crc_table();
295+ /* Process each byte in the buffer. */
296+ for (i = 0; i < buffersize; i++) {
297+ temp1 = (crc >> 8) & 0x00FFFFFF;
298+ temp2 = crc_table[(crc ^ (u32) * current_byte) &
299+ (u32) 0xff];
300+ current_byte++;
301+ crc = temp1 ^ temp2;
302+ }
303+ return crc;
304+}
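
The checksum above is the standard reflected CRC-32 (polynomial 0xEDB88320, initial value 0xFFFFFFFF, no final inversion), computed over a table sector with its crc field zeroed. A minimal user-space sketch of the same computation follows; it assumes nothing about the kernel environment, and the helper names and the zero-filled buffer are invented purely for the demo.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc_tab[256];

static void build_tab(void)
{
	uint32_t i, j, crc;

	for (i = 0; i < 256; i++) {
		crc = i;
		for (j = 0; j < 8; j++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
		crc_tab[i] = crc;
	}
}

static uint32_t crc32_update(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--)
		crc = (crc >> 8) ^ crc_tab[(crc ^ *p++) & 0xff];
	return crc;
}

int main(void)
{
	unsigned char sector[512];

	memset(sector, 0, sizeof(sector));	/* stand-in for a table sector with crc zeroed */
	build_tab();
	printf("crc = 0x%08x\n",
	       (unsigned)crc32_update(0xFFFFFFFFu, sector, sizeof(sector)));
	return 0;
}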
305+
306+/**
307+ * le_bbr_table_sector_to_cpu
308+ *
309+ * Convert bbr meta data from on-disk (LE) format
310+ * to the native cpu endian format.
311+ **/
312+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
313+{
314+ int i;
315+ p->signature = le32_to_cpup(&p->signature);
316+ p->crc = le32_to_cpup(&p->crc);
317+ p->sequence_number = le32_to_cpup(&p->sequence_number);
318+ p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
319+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
320+ p->entries[i].bad_sect =
321+ le64_to_cpup(&p->entries[i].bad_sect);
322+ p->entries[i].replacement_sect =
323+ le64_to_cpup(&p->entries[i].replacement_sect);
324+ }
325+}
326+
327+/**
328+ * cpu_bbr_table_sector_to_le
329+ *
330+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
331+ **/
332+static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
333+ struct bbr_table * le)
334+{
335+ int i;
336+ le->signature = cpu_to_le32p(&p->signature);
337+ le->crc = cpu_to_le32p(&p->crc);
338+ le->sequence_number = cpu_to_le32p(&p->sequence_number);
339+ le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
340+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
341+ le->entries[i].bad_sect =
342+ cpu_to_le64p(&p->entries[i].bad_sect);
343+ le->entries[i].replacement_sect =
344+ cpu_to_le64p(&p->entries[i].replacement_sect);
345+ }
346+}
347+
348+/**
349+ * validate_bbr_table_sector
350+ *
351+ * Check the specified BBR table sector for a valid signature and CRC. If it's
352+ * valid, endian-convert the table sector.
353+ **/
354+static int validate_bbr_table_sector(struct bbr_table * p)
355+{
356+ int rc = 0;
357+ int org_crc, final_crc;
358+
359+ if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
360+ DMERR("BBR table signature doesn't match!");
361+ DMERR("Found 0x%x. Expecting 0x%x",
362+ le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
363+ rc = -EINVAL;
364+ goto out;
365+ }
366+
367+ if (!p->crc) {
368+ DMERR("BBR table sector has no CRC!");
369+ rc = -EINVAL;
370+ goto out;
371+ }
372+
373+ org_crc = le32_to_cpup(&p->crc);
374+ p->crc = 0;
375+ final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
376+ if (final_crc != org_crc) {
377+ DMERR("CRC failed!");
378+ DMERR("Found 0x%x. Expecting 0x%x",
379+ org_crc, final_crc);
380+ rc = -EINVAL;
381+ goto out;
382+ }
383+
384+ p->crc = cpu_to_le32p(&org_crc);
385+ le_bbr_table_sector_to_cpu(p);
386+
387+out:
388+ return rc;
389+}
390+
391+/**
392+ * bbr_binary_tree_insert
393+ *
394+ * Insert a node into the binary tree.
395+ **/
396+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
397+ struct bbr_runtime_remap *newnode)
398+{
399+ struct bbr_runtime_remap **node = root;
400+ while (node && *node) {
401+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
402+ node = &((*node)->right);
403+ } else {
404+ node = &((*node)->left);
405+ }
406+ }
407+
408+ newnode->left = newnode->right = NULL;
409+ *node = newnode;
410+}
411+
412+/**
413+ * bbr_binary_search
414+ *
415+ * Search for a node that contains bad_sect == lsn.
416+ **/
417+static struct bbr_runtime_remap * bbr_binary_search(
418+ struct bbr_runtime_remap *root,
419+ u64 lsn)
420+{
421+ struct bbr_runtime_remap *node = root;
422+ while (node) {
423+ if (node->remap.bad_sect == lsn) {
424+ break;
425+ }
426+ if (lsn > node->remap.bad_sect) {
427+ node = node->right;
428+ } else {
429+ node = node->left;
430+ }
431+ }
432+ return node;
433+}
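
The remap lookup structure used above is a plain, unbalanced binary search tree keyed on the bad-sector LBA. A small user-space sketch of the same insert/search shape; the node type and the sector values are invented for the example, and the driver's spinlock is omitted.

#include <stdint.h>
#include <stdio.h>

struct node {
	uint64_t bad, replacement;
	struct node *left, *right;
};

static void tree_insert(struct node **root, struct node *n)
{
	struct node **p = root;

	/* walk to the leaf slot where this bad sector belongs */
	while (*p)
		p = (n->bad > (*p)->bad) ? &(*p)->right : &(*p)->left;
	n->left = n->right = NULL;
	*p = n;
}

static struct node *tree_search(struct node *root, uint64_t bad)
{
	while (root && root->bad != bad)
		root = (bad > root->bad) ? root->right : root->left;
	return root;
}

int main(void)
{
	struct node a = { 100, 5000 }, b = { 42, 5001 }, *root = NULL, *hit;

	tree_insert(&root, &a);
	tree_insert(&root, &b);
	hit = tree_search(root, 42);
	if (hit)
		printf("sector 42 remapped to %llu\n",
		       (unsigned long long)hit->replacement);
	return 0;
}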
434+
435+/**
436+ * bbr_binary_tree_destroy
437+ *
438+ * Destroy the binary tree.
439+ **/
440+static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
441+ struct bbr_private * bbr_id)
442+{
443+ struct bbr_runtime_remap **link = NULL;
444+ struct bbr_runtime_remap *node = root;
445+
446+ while (node) {
447+ if (node->left) {
448+ link = &(node->left);
449+ node = node->left;
450+ continue;
451+ }
452+ if (node->right) {
453+ link = &(node->right);
454+ node = node->right;
455+ continue;
456+ }
457+
458+ mempool_free(node, bbr_remap_pool);
459+ if (node == root) {
460+ /* If root is deleted, we're done. */
461+ break;
462+ }
463+
464+ /* Back to root. */
465+ node = root;
466+ *link = NULL;
467+ }
468+}
469+
470+static void bbr_free_remap(struct bbr_private * bbr_id)
471+{
472+ spin_lock_irq(&bbr_id->bbr_id_lock);
473+ bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
474+ bbr_id->remap_root = NULL;
475+ spin_unlock_irq(&bbr_id->bbr_id_lock);
476+}
477+
478+/**
479+ * bbr_insert_remap_entry
480+ *
481+ * Create a new remap entry and add it to the binary tree for this node.
482+ **/
483+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
484+ struct bbr_table_entry *new_bbr_entry)
485+{
486+ struct bbr_runtime_remap *newnode;
487+
488+ newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
489+ if (!newnode) {
490+ DMERR("Could not allocate from remap mempool!");
491+ return -ENOMEM;
492+ }
493+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
494+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
495+ spin_lock_irq(&bbr_id->bbr_id_lock);
496+ bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
497+ spin_unlock_irq(&bbr_id->bbr_id_lock);
498+ return 0;
499+}
500+
501+/**
502+ * bbr_table_to_remap_list
503+ *
504+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
505+ * improve run-time performance, the in-memory remap list must be sorted by
506+ * the bad sector LBA. This function is called at discovery time to initialize
507+ * the remap list. This function assumes that at least one copy of meta data
508+ * is valid.
509+ **/
510+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
511+{
512+ u32 in_use_blks = 0;
513+ int i, j;
514+ struct bbr_table *p;
515+
516+ for (i = 0, p = bbr_id->bbr_table;
517+ i < bbr_id->nr_sects_bbr_table;
518+ i++, p++) {
519+ if (!p->in_use_cnt) {
520+ break;
521+ }
522+ in_use_blks += p->in_use_cnt;
523+ for (j = 0; j < p->in_use_cnt; j++) {
524+ bbr_insert_remap_entry(bbr_id, &p->entries[j]);
525+ }
526+ }
527+ if (in_use_blks) {
528+ DMWARN("There are %u BBR entries for device %s",
529+ in_use_blks, dm_kdevname(bbr_id->dev->dev));
530+ }
531+
532+ return in_use_blks;
533+}
534+
535+/**
536+ * bbr_search_remap_entry
537+ *
538+ * Search remap entry for the specified sector. If found, return a pointer to
539+ * the table entry. Otherwise, return NULL.
540+ **/
541+static struct bbr_table_entry * bbr_search_remap_entry(
542+ struct bbr_private *bbr_id,
543+ u64 lsn)
544+{
545+ struct bbr_runtime_remap *p;
546+
547+ spin_lock_irq(&bbr_id->bbr_id_lock);
548+ p = bbr_binary_search(bbr_id->remap_root, lsn);
549+ spin_unlock_irq(&bbr_id->bbr_id_lock);
550+ if (p) {
551+ return (&p->remap);
552+ } else {
553+ return NULL;
554+ }
555+}
556+
557+/**
558+ * bbr_remap
559+ *
560+ * If *lsn is in the remap table, return TRUE and modify *lsn,
561+ * else, return FALSE.
562+ **/
563+static inline int bbr_remap(struct bbr_private *bbr_id,
564+ u64 *lsn)
565+{
566+ struct bbr_table_entry *e;
567+
568+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
569+ e = bbr_search_remap_entry(bbr_id, *lsn);
570+ if (e) {
571+ *lsn = e->replacement_sect;
572+ return 1;
573+ }
574+ }
575+ return 0;
576+}
577+
578+/**
579+ * bbr_remap_probe
580+ *
581+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
582+ * table, return TRUE. Otherwise, return FALSE.
583+ **/
584+static inline int bbr_remap_probe(struct bbr_private * bbr_id,
585+ u64 lsn, u64 nr_sects)
586+{
587+ u64 tmp, cnt;
588+
589+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
590+ for (cnt = 0, tmp = lsn;
591+ cnt < nr_sects;
592+ cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
593+ if (bbr_remap(bbr_id,&tmp)) {
594+ return 1;
595+ }
596+ }
597+ }
598+ return 0;
599+}
600+
601+/**
602+ * bbr_setup
603+ *
604+ * Read the remap tables from disk and set up the initial remap tree.
605+ **/
606+static int bbr_setup(struct bbr_private *bbr_id)
607+{
608+ struct bbr_table *table = bbr_id->bbr_table;
609+ struct page *page;
610+ struct io_region job;
611+ unsigned int error, offset;
612+ int i, rc = 0;
613+
614+ job.dev = bbr_id->dev->dev;
615+ job.count = 1;
616+
617+ /* Read and verify each BBR table sector individually. */
618+ for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
619+ job.sector = bbr_id->lba_table1 + i;
620+ page = virt_to_page(table);
621+ offset = (unsigned long)table & ~PAGE_MASK;
622+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
623+ if (rc && bbr_id->lba_table2) {
624+ job.sector = bbr_id->lba_table2 + i;
625+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
626+ }
627+ if (rc) {
628+ goto out;
629+ }
630+
631+ rc = validate_bbr_table_sector(table);
632+ if (rc) {
633+ goto out;
634+ }
635+ }
636+ atomic_set(&bbr_id->in_use_replacement_blks,
637+ bbr_table_to_remap_list(bbr_id));
638+
639+out:
640+ if (rc) {
641+ DMERR("dm-bbr: error during device setup: %d", rc);
642+ }
643+ return rc;
644+}
645+
646+static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
647+ struct buffer_head * bh,
648+ int rw)
649+{
650+ struct bbr_io_buffer * bbr_io_buf;
651+
652+ bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
653+ if (bbr_io_buf) {
654+ memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
655+ INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
656+ bbr_io_buf->bbr_id = bbr_id;
657+ bbr_io_buf->sector = bh->b_rsector;
658+ bbr_io_buf->bh = bh;
659+ bbr_io_buf->rw = rw;
660+ } else {
661+ DMWARN("Could not allocate from BBR I/O buffer pool!");
662+ }
663+ return bbr_io_buf;
664+}
665+
666+static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
667+{
668+ mempool_free(bbr_io_buf, bbr_io_buf_pool);
669+}
670+
671+/**
672+ * bbr_io_remap_error
673+ * @bbr_id: Private data for the BBR node.
674+ * @rw: READ or WRITE.
675+ * @starting_lsn: Starting sector of request to remap.
676+ * @count: Number of sectors in the request.
677+ * @buffer: Data buffer for the request.
678+ *
679+ * For the requested range, try to write each sector individually. For each
680+ * sector that fails, find the next available remap location and write the
681+ * data to that new location. Then update the table and write both copies
682+ * of the table to disk. Finally, update the in-memory mapping and do any
683+ * other necessary bookkeeping.
684+ **/
685+static int bbr_io_remap_error(struct bbr_private *bbr_id,
686+ int rw,
687+ u64 starting_lsn,
688+ u64 count,
689+ char *buffer)
690+{
691+ struct bbr_table *bbr_table;
692+ struct io_region job;
693+ struct page *page;
694+ unsigned long table_sector_index;
695+ unsigned long table_sector_offset;
696+ unsigned long index;
697+ unsigned int offset_in_page, error;
698+ u64 lsn, new_lsn;
699+ int rc;
700+
701+ if (rw == READ) {
702+ /* Nothing can be done about read errors. */
703+ return -EIO;
704+ }
705+
706+ job.dev = bbr_id->dev->dev;
707+ job.count = 1;
708+
709+ /* For each sector in the request. */
710+ for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
711+ job.sector = starting_lsn + lsn;
712+ page = virt_to_page(buffer);
713+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
714+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
715+ while (rc) {
716+ /* Find the next available relocation sector. */
717+ new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
718+ if (new_lsn >= bbr_id->nr_replacement_blks) {
719+ /* No more replacement sectors available. */
720+ return -EIO;
721+ }
722+ new_lsn += bbr_id->start_replacement_sect;
723+
724+ /* Write the data to its new location. */
725+ DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
726+ dm_kdevname(bbr_id->dev->dev),
727+ starting_lsn + lsn, new_lsn);
728+ job.sector = new_lsn;
729+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
730+ if (rc) {
731+ /* This replacement sector is bad.
732+ * Try the next one.
733+ */
734+ DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
735+ dm_kdevname(bbr_id->dev->dev), new_lsn);
736+ atomic_inc(&bbr_id->in_use_replacement_blks);
737+ continue;
738+ }
739+
740+ /* Add this new entry to the on-disk table. */
741+ table_sector_index = new_lsn -
742+ bbr_id->start_replacement_sect;
743+ table_sector_offset = table_sector_index /
744+ BBR_ENTRIES_PER_SECT;
745+ index = table_sector_index % BBR_ENTRIES_PER_SECT;
746+
747+ bbr_table = &bbr_id->bbr_table[table_sector_offset];
748+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
749+ bbr_table->entries[index].replacement_sect = new_lsn;
750+ bbr_table->in_use_cnt++;
751+ bbr_table->sequence_number++;
752+ bbr_table->crc = 0;
753+ bbr_table->crc = calculate_crc(INITIAL_CRC,
754+ bbr_table,
755+ sizeof(struct bbr_table));
756+
757+ /* Write the table to disk. */
758+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
759+ page = virt_to_page(bbr_table);
760+ offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
761+ if (bbr_id->lba_table1) {
762+ job.sector = bbr_id->lba_table1 + table_sector_offset;
763+ rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
764+ }
765+ if (bbr_id->lba_table2) {
766+ job.sector = bbr_id->lba_table2 + table_sector_offset;
767+ rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
768+ }
769+ le_bbr_table_sector_to_cpu(bbr_table);
770+
771+ if (rc) {
772+ /* Error writing one of the tables to disk. */
773+ DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
774+ dm_kdevname(bbr_id->dev->dev));
775+ return rc;
776+ }
777+
778+ /* Insert a new entry in the remapping binary-tree. */
779+ rc = bbr_insert_remap_entry(bbr_id,
780+ &bbr_table->entries[index]);
781+ if (rc) {
782+ DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
783+ dm_kdevname(bbr_id->dev->dev));
784+ return rc;
785+ }
786+
787+ atomic_inc(&bbr_id->in_use_replacement_blks);
788+ }
789+ }
790+
791+ return 0;
792+}
793+
794+/**
795+ * bbr_io_process_request
796+ *
797+ * For each sector in this request, check if the sector has already
798+ * been remapped. If so, process all previous sectors in the request,
799+ * followed by the remapped sector. Then reset the starting lsn and
800+ * count, and keep going with the rest of the request as if it were
801+ * a whole new request. If any of the sync_io's return an error,
802+ * call the remapper to relocate the bad sector(s).
803+ **/
804+static int bbr_io_process_request(struct bbr_io_buffer *bbr_io_buf)
805+{
806+ struct bbr_private *bbr_id = bbr_io_buf->bbr_id;
807+ struct io_region job;
808+ u64 starting_lsn = bbr_io_buf->sector;
809+ u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
810+ u64 lsn, remapped_lsn;
811+ char *buffer = bbr_io_buf->bh->b_data;
812+ struct page *page = virt_to_page(buffer);
813+ unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
814+ unsigned int error;
815+ int rw = bbr_io_buf->rw;
816+ int rc = 0;
817+
818+ job.dev = bbr_id->dev->dev;
819+
820+ /* For each sector in this request, check if this sector has
821+ * already been remapped. If so, process all previous sectors
822+ * in this request, followed by the remapped sector. Then reset
823+ * the starting lsn and count and keep going with the rest of
824+ * the request as if it were a whole new request.
825+ */
826+ for (lsn = 0; lsn < count; lsn++) {
827+ remapped_lsn = starting_lsn + lsn;
828+ rc = bbr_remap(bbr_id, &remapped_lsn);
829+ if (!rc) {
830+ /* This sector is fine. */
831+ continue;
832+ }
833+
834+ /* Process all sectors in the request up to this one. */
835+ if (lsn > 0) {
836+ job.sector = starting_lsn;
837+ job.count = lsn;
838+ rc = dm_io_sync(1, &job, rw, page,
839+ offset_in_page, &error);
840+ if (rc) {
841+ /* If this I/O failed, then one of the
842+ * sectors in this request needs to be
843+ * relocated.
844+ */
845+ rc = bbr_io_remap_error(bbr_id, rw,
846+ starting_lsn,
847+ lsn, buffer);
848+ if (rc) {
849+ return rc;
850+ }
851+ }
852+ buffer += (lsn << SECTOR_SHIFT);
853+ page = virt_to_page(buffer);
854+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
855+ }
856+
857+ /* Process the remapped sector. */
858+ job.sector = remapped_lsn;
859+ job.count = 1;
860+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
861+ if (rc) {
862+ /* BUGBUG - Need more processing if this caused
863+ * an error. If this I/O failed, then the
864+ * existing remap is now bad, and we need to
865+ * find a new remap. Can't use
866+ * bbr_io_remap_error(), because the existing
867+ * map entry needs to be changed, not added
868+ * again, and the original table entry also
869+ * needs to be changed.
870+ */
871+ return rc;
872+ }
873+
874+ buffer += SECTOR_SIZE;
875+ starting_lsn += (lsn + 1);
876+ count -= (lsn + 1);
877+ lsn = -1;
878+ page = virt_to_page(buffer);
879+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
880+ }
881+
882+ /* Check for any remaining sectors after the last split. This
883+ * could potentially be the whole request, but that should be a
884+ * rare case because requests should only be processed by the
885+ * thread if we know an error occurred or they contained one or
886+ * more remapped sectors.
887+ */
888+ if (count) {
889+ job.sector = starting_lsn;
890+ job.count = count;
891+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
892+ if (rc) {
893+ /* If this I/O failed, then one of the sectors
894+ * in this request needs to be relocated.
895+ */
896+ rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
897+ count, buffer);
898+ if (rc) {
899+ return rc;
900+ }
901+ }
902+ }
903+
904+ return 0;
905+}
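
To make the splitting logic above concrete, here is a user-space sketch that models only the loop's arithmetic, not the dm_io_sync() calls or the error-relocation path. The remap() stub and all sector numbers are invented: with an 8-sector request starting at LSN 100 and sector 103 remapped to 5000, it prints the three pieces the request is broken into.

#include <stdint.h>
#include <stdio.h>

/* Toy remap: only sector 103 is relocated, to 5000 (values invented). */
static int remap(uint64_t *lsn)
{
	if (*lsn == 103) {
		*lsn = 5000;
		return 1;
	}
	return 0;
}

/* Mirror the splitting arithmetic of bbr_io_process_request(). */
static void split(uint64_t start, uint64_t count)
{
	uint64_t lsn, remapped;

	for (lsn = 0; lsn < count; lsn++) {
		remapped = start + lsn;
		if (!remap(&remapped))
			continue;
		if (lsn > 0)
			printf("issue %llu..%llu\n",
			       (unsigned long long)start,
			       (unsigned long long)(start + lsn - 1));
		printf("issue remapped sector %llu\n",
		       (unsigned long long)remapped);
		start += lsn + 1;
		count -= lsn + 1;
		lsn = (uint64_t)-1;	/* restart the scan at the new start */
	}
	if (count)
		printf("issue %llu..%llu\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + count - 1));
}

int main(void)
{
	split(100, 8);	/* sectors 100..107; 103 is remapped */
	return 0;
}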
906+
907+/**
908+ * bbr_io_handler
909+ *
910+ * This is the handler for the bbr_io_thread. It continuously loops,
911+ * taking I/O requests off its list and processing them. If nothing
912+ * is on the list, the thread goes back to sleep until specifically
913+ * woken up.
914+ *
915+ * I/O requests should only be sent to this thread if we know that:
916+ * a) the request contains at least one remapped sector.
917+ * or
918+ * b) the request caused an error on the normal I/O path.
919+ * This function uses synchronous I/O, so sending a request to this
920+ * thread that doesn't need special processing will cause severe
921+ * performance degradation.
922+ **/
923+static void bbr_io_handler(void)
924+{
925+ struct bbr_io_buffer *bbr_io_buf;
926+ struct buffer_head *bh;
927+ unsigned long flags;
928+ int rc;
929+
930+ while (1) {
931+ /* Process bbr_io_list, one entry at a time. */
932+ spin_lock_irqsave(&bbr_io_list_lock, flags);
933+ if (list_empty(&bbr_io_list)) {
934+ /* No more items on the list. */
935+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
936+ break;
937+ }
938+ bbr_io_buf = list_entry(bbr_io_list.next,
939+ struct bbr_io_buffer, bbr_io_list);
940+ list_del_init(&bbr_io_buf->bbr_io_list);
941+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
942+
943+ rc = bbr_io_process_request(bbr_io_buf);
944+
945+ /* Clean up and complete the original I/O. */
946+ bbr_io_buf->flags |= BBR_IO_HANDLED;
947+ bh = bbr_io_buf->bh;
948+ if (bh->b_end_io) {
949+ /* If this was the bbr_io_buf for an error on the
950+ * normal WRITE, don't free it here. It will be
951+ * freed later in bbr_callback()
952+ */
953+ if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
954+ free_bbr_io_buf(bbr_io_buf);
955+ bh->b_end_io(bh, rc ? 0 : 1);
956+ }
957+ }
958+}
959+
960+/**
961+ * bbr_schedule_io
962+ *
963+ * Place the specified bbr_io_buf on the thread's processing list.
964+ **/
965+static void bbr_schedule_io(struct bbr_io_buffer *bbr_io_buf)
966+{
967+ unsigned long flags;
968+ spin_lock_irqsave(&bbr_io_list_lock, flags);
969+ list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
970+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
971+ dm_daemon_wake(bbr_io_thread);
972+}
973+
974+/**
975+ * bbr_read
976+ *
977+ * If there are any remapped sectors on this object, send this request over
978+ * to the thread for processing. Otherwise send it down the stack normally.
979+ **/
980+static int bbr_read(struct bbr_private *bbr_id,
981+ struct buffer_head *bh)
982+{
983+ struct bbr_io_buffer *bbr_io_buf;
984+
985+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
986+ !bbr_remap_probe(bbr_id, bh->b_rsector,
987+ bh->b_size >> SECTOR_SHIFT)) {
988+ /* No existing remaps or this request doesn't
989+ * contain any remapped sectors.
990+ */
991+ bh->b_rdev = bbr_id->dev->dev;
992+ return 1;
993+ }
994+
995+ /* This request has at least one remapped sector. */
996+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
997+ if (!bbr_io_buf) {
998+ /* Can't get memory to track the I/O. */
999+ return -ENOMEM;
1000+ }
1001+
1002+ bbr_schedule_io(bbr_io_buf);
1003+ return 0;
1004+}
1005+
1006+/**
1007+ * bbr_callback
1008+ *
1009+ * This is the callback for normal write requests. Check for an error
1010+ * during the I/O, and send to the thread for processing if necessary.
1011+ **/
1012+static int bbr_callback(struct dm_target *ti, struct buffer_head *bh, int rw,
1013+ int error, union map_info *map_context)
1014+{
1015+ struct bbr_io_buffer *bbr_io_buf = map_context->ptr;
1016+
1017+ if (!bbr_io_buf)
1018+ return error;
1019+
1020+ /* Will try to relocate the WRITE if:
1021+ * - It is an error, and
1022+ * - It is not an error of BBR relocation.
1023+ */
1024+ if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1025+ DMERR("dm-bbr: device %s: Write failure on sector %lu. Scheduling for retry.",
1026+ dm_kdevname(bh->b_rdev),
1027+ (unsigned long)bbr_io_buf->sector);
1028+ /* Indicate this bbr_io_buf is for an error on normal WRITE */
1029+ bbr_io_buf->flags |= BBR_IO_RELOCATE;
1030+ bbr_schedule_io(bbr_io_buf);
1031+ /* Returns >0 so that DM will let us retry the I/O */
1032+ return 1;
1033+ }
1034+
1035+ free_bbr_io_buf(bbr_io_buf);
1036+ return error;
1037+}
1038+
1039+/**
1040+ * bbr_write
1041+ *
1042+ * If there are any remapped sectors on this object, send the request over
1043+ * to the thread for processing. Otherwise, register for callback
1044+ * notification, and send the request down normally.
1045+ **/
1046+static int bbr_write(struct bbr_private *bbr_id,
1047+ struct buffer_head *bh,
1048+ union map_info *map_context)
1049+{
1050+ struct bbr_io_buffer *bbr_io_buf;
1051+ int rc = 1;
1052+
1053+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1054+ if (!bbr_io_buf) {
1055+ /* Can't get memory to track the I/O. */
1056+ return -ENOMEM;
1057+ }
1058+
1059+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1060+ !bbr_remap_probe(bbr_id, bh->b_rsector,
1061+ bh->b_size >> SECTOR_SHIFT)) {
1062+ /* No existing remaps or this request
1063+ * contains no remapped sectors.
1064+ */
1065+ bh->b_rdev = bbr_id->dev->dev;
1066+ map_context->ptr = bbr_io_buf;
1067+ } else {
1068+ /* This request contains at least one remapped sector. */
1069+ bbr_schedule_io(bbr_io_buf);
1070+ rc = 0;
1071+ }
1072+
1073+ return rc;
1074+}
1075+
1076+/**
1077+ * Construct a bbr mapping
1078+ **/
1079+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1080+{
1081+ struct bbr_private *bbr_id;
1082+ unsigned long block_size;
1083+ char *end;
1084+ int rc = -EINVAL;
1085+
1086+ if (argc != 8) {
1087+ ti->error = "dm-bbr requires exactly 8 arguments: "
1088+ "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1089+ goto out1;
1090+ }
1091+
1092+ bbr_id = bbr_alloc_private();
1093+ if (!bbr_id) {
1094+ ti->error = "dm-bbr: Error allocating bbr private data.";
1095+ goto out1;
1096+ }
1097+
1098+ bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1099+ bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1100+ bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1101+ bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1102+ bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1103+ bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1104+ block_size = simple_strtoul(argv[7], &end, 10);
1105+ bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1106+
1107+ bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1108+ GFP_KERNEL);
1109+ if (!bbr_id->bbr_table) {
1110+ ti->error = "dm-bbr: Error allocating bbr table.";
1111+ goto out2;
1112+ }
1113+
1114+ if (dm_get_device(ti, argv[0], 0, ti->len,
1115+ dm_table_get_mode(ti->table), &bbr_id->dev)) {
1116+ ti->error = "dm-bbr: Device lookup failed";
1117+ goto out2;
1118+ }
1119+
1120+ /* Using a semaphore here is probably overkill,
1121+ * but at least it will be correct.
1122+ */
1123+ down(&bbr_instances_lock);
1124+ if (bbr_instances == 0) {
1125+ rc = bbr_global_init();
1126+ if (rc) {
1127+ up(&bbr_instances_lock);
1128+ goto out3;
1129+ }
1130+ }
1131+ bbr_instances++;
1132+ up(&bbr_instances_lock);
1133+
1134+ rc = bbr_setup(bbr_id);
1135+ if (rc) {
1136+ ti->error = "dm-bbr: Device setup failed";
1137+ goto out4;
1138+ }
1139+
1140+ ti->private = bbr_id;
1141+ return 0;
1142+
1143+out4:
1144+ down(&bbr_instances_lock);
1145+ bbr_instances--;
1146+ if (bbr_instances == 0) {
1147+ bbr_global_cleanup();
1148+ }
1149+ up(&bbr_instances_lock);
1150+
1151+out3:
1152+ dm_put_device(ti, bbr_id->dev);
1153+out2:
1154+ bbr_free_private(bbr_id);
1155+out1:
1156+ return rc;
1157+}
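
For reference, the eight arguments checked above arrive on an ordinary device-mapper table line (logical start, length, target name, then the target arguments). A purely hypothetical example, with every device name and number invented only to show the positions:

0 2097024 bbr /dev/sdb1 0 2097024 2097279 255 2097534 127 4096

Reading the bbr arguments left to right: device, offset, table1_lsn, table2_lsn, table_size (sectors), start_replacement, nr_replacement_blks, block_size (bytes).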
1158+
1159+static void bbr_dtr(struct dm_target *ti)
1160+{
1161+ struct bbr_private *bbr_id = ti->private;
1162+
1163+ dm_put_device(ti, bbr_id->dev);
1164+ bbr_free_private(bbr_id);
1165+
1166+ down(&bbr_instances_lock);
1167+ bbr_instances--;
1168+ if (bbr_instances == 0) {
1169+ bbr_global_cleanup();
1170+ }
1171+ up(&bbr_instances_lock);
1172+}
1173+
1174+static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw,
1175+ union map_info *map_context)
1176+{
1177+ struct bbr_private *bbr_id = ti->private;
1178+
1179+ bh->b_rsector += bbr_id->offset;
1180+ map_context->ptr = NULL;
1181+ switch (rw) {
1182+ case READ:
1183+ case READA:
1184+ return bbr_read(bbr_id, bh);
1185+ case WRITE:
1186+ return bbr_write(bbr_id, bh, map_context);
1187+ default:
1188+ return -EIO;
1189+ }
1190+}
1191+
1192+static int bbr_status(struct dm_target *ti, status_type_t type,
1193+ char *result, unsigned int maxlen)
1194+{
1195+ struct bbr_private *bbr_id = ti->private;
1196+
1197+ switch (type) {
1198+ case STATUSTYPE_INFO:
1199+ result[0] = '\0';
1200+ break;
1201+
1202+ case STATUSTYPE_TABLE:
1203+ snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1204+ dm_kdevname(bbr_id->dev->dev),
1205+ bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
1206+ bbr_id->nr_sects_bbr_table,
1207+ bbr_id->start_replacement_sect,
1208+ bbr_id->nr_replacement_blks,
1209+ bbr_id->blksize_in_sects << SECTOR_SHIFT);
1210+ break;
1211+ }
1212+ return 0;
1213+}
1214+
1215+static struct target_type bbr_target = {
1216+ name: "bbr",
1217+ module: THIS_MODULE,
1218+ ctr: bbr_ctr,
1219+ dtr: bbr_dtr,
1220+ map: bbr_map,
1221+ end_io: bbr_callback,
1222+ status: bbr_status,
1223+};
1224+
1225+int __init dm_bbr_init(void)
1226+{
1227+ int r = dm_register_target(&bbr_target);
1228+
1229+ if (r < 0)
1230+ DMERR("dm-bbr: register failed %d", r);
1231+
1232+ return r;
1233+}
1234+
1235+void __exit dm_bbr_exit(void)
1236+{
1237+ int r = dm_unregister_target(&bbr_target);
1238+
1239+ if (r < 0)
1240+ DMERR("dm-bbr: unregister failed %d", r);
1241+}
1242+
1243+module_init(dm_bbr_init);
1244+module_exit(dm_bbr_exit);
1245+MODULE_LICENSE("GPL");
1246diff -urN linux-2.4.24.org/drivers/md/dm-bbr.h linux-2.4.24/drivers/md/dm-bbr.h
1247--- linux-2.4.24.org/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1248+++ linux-2.4.24/drivers/md/dm-bbr.h 2004-01-18 16:03:13.101545929 +0100
1249@@ -0,0 +1,143 @@
1250+/*
1251+ * (C) Copyright IBM Corp. 2002, 2003
1252+ *
1253+ * This program is free software; you can redistribute it and/or modify
1254+ * it under the terms of the GNU General Public License as published by
1255+ * the Free Software Foundation; either version 2 of the License, or
1256+ * (at your option) any later version.
1257+ *
1258+ * This program is distributed in the hope that it will be useful,
1259+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1260+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1261+ * the GNU General Public License for more details.
1262+ *
1263+ * You should have received a copy of the GNU General Public License
1264+ * along with this program; if not, write to the Free Software
1265+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1266+ *
1267+ * linux/drivers/md/dm-bbr.h
1268+ *
1269+ * Bad-block-relocation (BBR) target for device-mapper.
1270+ *
1271+ * The BBR target is designed to remap I/O write failures to another safe
1272+ * location on disk. Note that most disk drives have BBR built into them;
1273+ * this means that our software BBR will only be activated when all hardware
1274+ * BBR replacement sectors have been used.
1275+ */
1276+
1277+#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1278+#define BBR_ENTRIES_PER_SECT 31
1279+#define BBR_NR_BUFS 128
1280+#define INITIAL_CRC 0xFFFFFFFF
1281+#define CRC_POLYNOMIAL 0xEDB88320L
1282+
1283+/**
1284+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1285+ * Use these in place of %Ld, %Lu, and %Lx.
1286+ **/
1287+#if BITS_PER_LONG > 32
1288+#define PFU64 "%lu"
1289+#else
1290+#define PFU64 "%Lu"
1291+#endif
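
A tiny usage sketch of the macro, written as a user-space stand-in and assuming a 64-bit (LP64) host so the first branch applies; the typedef and the sector value are invented for the demo.

#include <stdio.h>

typedef unsigned long u64;	/* LP64 stand-in for the kernel type */
#define PFU64 "%lu"		/* BITS_PER_LONG > 32 branch */

int main(void)
{
	u64 bad_sect = 123456789012UL;	/* invented sector number */

	printf("dm-bbr: relocating bad sector " PFU64 "\n", bad_sect);
	return 0;
}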
1292+
1293+/**
1294+ * struct bbr_table_entry
1295+ * @bad_sect: LBA of bad location.
1296+ * @replacement_sect: LBA of new location.
1297+ *
1298+ * Structure to describe one BBR remap.
1299+ **/
1300+struct bbr_table_entry {
1301+ u64 bad_sect;
1302+ u64 replacement_sect;
1303+};
1304+
1305+/**
1306+ * struct bbr_table
1307+ * @signature: Signature on each BBR table sector.
1308+ * @crc: CRC for this table sector.
1309+ * @sequence_number: Used to resolve conflicts when primary and secondary
1310+ * tables do not match.
1311+ * @in_use_cnt: Number of in-use table entries.
1312+ * @entries: Actual table of remaps.
1313+ *
1314+ * Structure to describe each sector of the metadata table. Each sector in this
1315+ * table can describe 31 remapped sectors.
1316+ **/
1317+struct bbr_table {
1318+ u32 signature;
1319+ u32 crc;
1320+ u32 sequence_number;
1321+ u32 in_use_cnt;
1322+ struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1323+};
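
As the comment notes, each table sector describes 31 remaps; with the field sizes above that works out to a 16-byte header plus 31 x 16-byte entries, exactly one 512-byte sector. A quick user-space check of that arithmetic, with the structures mirrored using stdint types:

#include <stdint.h>
#include <stdio.h>

#define BBR_ENTRIES_PER_SECT 31

struct bbr_table_entry {
	uint64_t bad_sect;
	uint64_t replacement_sect;
};

struct bbr_table {
	uint32_t signature;
	uint32_t crc;
	uint32_t sequence_number;
	uint32_t in_use_cnt;
	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
};

int main(void)
{
	/* 16-byte header + 31 * 16-byte entries = 512 bytes = one sector */
	printf("sizeof(struct bbr_table) = %zu\n", sizeof(struct bbr_table));
	return 0;
}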
1324+
1325+/**
1326+ * struct bbr_runtime_remap
1327+ *
1328+ * Node in the binary tree used to keep track of remaps.
1329+ **/
1330+struct bbr_runtime_remap {
1331+ struct bbr_table_entry remap;
1332+ struct bbr_runtime_remap *left;
1333+ struct bbr_runtime_remap *right;
1334+};
1335+
1336+/**
1337+ * struct bbr_private
1338+ * @dev: Info about underlying device.
1339+ * @bbr_table: Copy of metadata table.
1340+ * @remap_root: Binary tree containing all remaps.
1341+ * @offset: LBA of data area.
1342+ * @lba_table1: LBA of primary BBR table.
1343+ * @lba_table2: LBA of secondary BBR table.
1344+ * @nr_sects_bbr_table: Size of each BBR table.
1345+ * @nr_replacement_blks: Number of replacement blocks.
1346+ * @start_replacement_sect: LBA of start of replacement blocks.
1347+ * @blksize_in_sects: Size of each block.
1348+ * @in_use_replacement_blks: Current number of remapped blocks.
1349+ * @bbr_id_lock: Lock for the binary tree.
1350+ *
1351+ * Private data for each BBR target.
1352+ **/
1353+struct bbr_private {
1354+ struct dm_dev *dev;
1355+ struct bbr_table *bbr_table;
1356+ struct bbr_runtime_remap *remap_root;
1357+ u64 offset;
1358+ u64 lba_table1;
1359+ u64 lba_table2;
1360+ u64 nr_sects_bbr_table;
1361+ u64 start_replacement_sect;
1362+ u64 nr_replacement_blks;
1363+ u32 blksize_in_sects;
1364+ atomic_t in_use_replacement_blks;
1365+ spinlock_t bbr_id_lock;
1366+};
1367+
1368+#define BBR_IO_HANDLED (1<<0)
1369+#define BBR_IO_RELOCATE (1<<1)
1370+
1371+/**
1372+ * struct bbr_io_buffer
1373+ * @bbr_io_list: Thread's list of bbr_io_buf's.
1374+ * @bbr_id: Object for this request.
1375+ * @bh: Original buffer_head.
1376+ * @sector: Original sector
1377+ * @flags: Operation flag (BBR_IO_*)
1378+ * @rw: READ or WRITE.
1379+ * @rc: Return code from bbr_io_handler.
1380+ *
1381+ * Structure used to track each write request.
1382+ **/
1383+struct bbr_io_buffer {
1384+ struct list_head bbr_io_list;
1385+ struct bbr_private *bbr_id;
1386+ struct buffer_head *bh;
1387+ u64 sector;
1388+ u32 flags;
1389+ s32 rw;
1390+ s32 rc;
1391+};
1392+
1393diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
1394--- linux-2.4.24.org/drivers/md/dm.c 2004-01-18 15:09:18.533171353 +0100
1395+++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:59:40.046635861 +0100
1396@@ -951,13 +951,23 @@
1397 int r = 0;
1398 DECLARE_WAITQUEUE(wait, current);
1399
1400- down_write(&md->lock);
1401+ /* Flush IO to the origin device */
1402+ down_read(&md->lock);
1403+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1404+ up_read(&md->lock);
1405+ return -EINVAL;
1406+ }
1407+
1408+ fsync_dev_lockfs(md->dev);
1409+ up_read(&md->lock);
1410+
1411
1412 /*
1413- * First we set the BLOCK_IO flag so no more ios will be
1414- * mapped.
1415+ * Set the BLOCK_IO flag so no more ios will be mapped.
1416 */
1417+ down_write(&md->lock);
1418 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1419+ unlockfs(md->dev);
1420 up_write(&md->lock);
1421 return -EINVAL;
1422 }
1423@@ -986,6 +996,7 @@
1424
1425 /* did we flush everything ? */
1426 if (atomic_read(&md->pending)) {
1427+ unlockfs(md->dev);
1428 clear_bit(DMF_BLOCK_IO, &md->flags);
1429 r = -EINTR;
1430 } else {
1431@@ -1017,6 +1028,7 @@
1432 md->deferred = NULL;
1433 up_write(&md->lock);
1434
1435+ unlockfs(md->dev);
1436 flush_deferred_io(def);
1437 run_task_queue(&tq_disk);
1438
1439diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
1440--- linux-2.4.24.org/drivers/md/dm-snapshot.c 2004-01-18 15:09:18.569163966 +0100
1441+++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 16:02:40.858328124 +0100
1442@@ -92,6 +92,9 @@
1443
1444 /* List of snapshots for this origin */
1445 struct list_head snapshots;
1446+
1447+	/* Count of snapshots and origins referencing this structure. */
1448+ unsigned int count;
1449 };
1450
1451 /*
1452@@ -155,6 +158,35 @@
1453 }
1454
1455 /*
1456+ * Allocate and initialize an origin structure.
1457+ */
1458+static struct origin * __alloc_origin(kdev_t dev)
1459+{
1460+ struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1461+ if (o) {
1462+ o->dev = dev;
1463+ INIT_LIST_HEAD(&o->hash_list);
1464+ INIT_LIST_HEAD(&o->snapshots);
1465+ __insert_origin(o);
1466+ }
1467+ return o;
1468+}
1469+
1470+static void __get_origin(struct origin *o)
1471+{
1472+ o->count++;
1473+}
1474+
1475+static void __put_origin(struct origin *o)
1476+{
1477+ o->count--;
1478+ if (o->count == 0) {
1479+ list_del(&o->hash_list);
1480+ kfree(o);
1481+ }
1482+}
1483+
1484+/*
1485 * Make a note of the snapshot and its origin so we can look it
1486 * up when the origin has a write on it.
1487 */
1488@@ -168,20 +200,37 @@
1489
1490 if (!o) {
1491 /* New origin */
1492- o = kmalloc(sizeof(*o), GFP_KERNEL);
1493+ o = __alloc_origin(dev);
1494 if (!o) {
1495 up_write(&_origins_lock);
1496 return -ENOMEM;
1497 }
1498+ }
1499
1500- /* Initialise the struct */
1501- INIT_LIST_HEAD(&o->snapshots);
1502- o->dev = dev;
1503+ __get_origin(o);
1504+ list_add_tail(&snap->list, &o->snapshots);
1505
1506- __insert_origin(o);
1507+ up_write(&_origins_lock);
1508+ return 0;
1509+}
1510+
1511+static int register_origin(kdev_t dev)
1512+{
1513+ struct origin *o;
1514+
1515+ down_write(&_origins_lock);
1516+ o = __lookup_origin(dev);
1517+
1518+ if (!o) {
1519+ /* New origin */
1520+ o = __alloc_origin(dev);
1521+ if (!o) {
1522+ up_write(&_origins_lock);
1523+ return -ENOMEM;
1524+ }
1525 }
1526
1527- list_add_tail(&snap->list, &o->snapshots);
1528+ __get_origin(o);
1529
1530 up_write(&_origins_lock);
1531 return 0;
1532@@ -195,11 +244,18 @@
1533 o = __lookup_origin(s->origin->dev);
1534
1535 list_del(&s->list);
1536- if (list_empty(&o->snapshots)) {
1537- list_del(&o->hash_list);
1538- kfree(o);
1539- }
1540+ __put_origin(o);
1541+
1542+ up_write(&_origins_lock);
1543+}
1544+
1545+static void unregister_origin(kdev_t dev)
1546+{
1547+ struct origin *o;
1548
1549+ down_write(&_origins_lock);
1550+ o = __lookup_origin(dev);
1551+ __put_origin(o);
1552 up_write(&_origins_lock);
1553 }
1554
1555@@ -524,9 +580,6 @@
1556 goto bad5;
1557 }
1558
1559- /* Flush IO to the origin device */
1560- fsync_dev(s->origin->dev);
1561-
1562 /* Add snapshot to the list of snapshots for this origin */
1563 if (register_snapshot(s)) {
1564 r = -EINVAL;
1565@@ -1093,6 +1146,13 @@
1566 return r;
1567 }
1568
1569+ r = register_origin(dev->dev);
1570+ if (r) {
1571+ ti->error = "Cannot register origin";
1572+ dm_put_device(ti, dev);
1573+ return r;
1574+ }
1575+
1576 ti->private = dev;
1577 return 0;
1578 }
1579@@ -1100,6 +1160,7 @@
1580 static void origin_dtr(struct dm_target *ti)
1581 {
1582 struct dm_dev *dev = (struct dm_dev *) ti->private;
1583+ unregister_origin(dev->dev);
1584 dm_put_device(ti, dev);
1585 }
1586
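
The dm-snapshot hunks above replace the old free-when-the-snapshot-list-empties logic with a reference count on the shared origin structure, taken by both snapshot and origin targets. A minimal user-space sketch of that counting pattern, with names shortened and the _origins_lock locking omitted:

#include <stdlib.h>
#include <stdio.h>

struct origin {
	int count;	/* snapshots + origin targets using this */
};

static struct origin *origin_alloc(void)
{
	return calloc(1, sizeof(struct origin));
}

static void origin_get(struct origin *o)
{
	o->count++;
}

static void origin_put(struct origin *o)
{
	if (--o->count == 0) {
		printf("last reference dropped, freeing origin\n");
		free(o);
	}
}

int main(void)
{
	struct origin *o = origin_alloc();

	if (!o)
		return 1;
	origin_get(o);	/* register_snapshot()   */
	origin_get(o);	/* register_origin()     */
	origin_put(o);	/* unregister_snapshot() */
	origin_put(o);	/* unregister_origin() -> freed here */
	return 0;
}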
1587diff -urN linux-2.4.24.org/drivers/md/dm-sparse.c linux-2.4.24/drivers/md/dm-sparse.c
1588--- linux-2.4.24.org/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1589+++ linux-2.4.24/drivers/md/dm-sparse.c 2004-01-18 16:04:48.284615142 +0100
1590@@ -0,0 +1,709 @@
1591+/* -*- linux-c -*- */
1592+
1593+/*
1594+ * Copyright (c) International Business Machines Corp., 2002
1595+ *
1596+ * This program is free software; you can redistribute it and/or modify
1597+ * it under the terms of the GNU General Public License as published by
1598+ * the Free Software Foundation; either version 2 of the License, or
1599+ * (at your option) any later version.
1600+ *
1601+ * This program is distributed in the hope that it will be useful,
1602+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1603+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1604+ * the GNU General Public License for more details.
1605+ *
1606+ * You should have received a copy of the GNU General Public License
1607+ * along with this program; if not, write to the Free Software
1608+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1609+ *
1610+ * linux/drivers/md/dm-sparse.c
1611+ *
1612+ * Sparse target for device-mapper.
1613+ *
1614+ * This target provides the ability to create a sparse device. This
1615+ * allows a device to pretend to be larger than it really is.
1616+ */
1617+
1618+#include <linux/module.h>
1619+#include <linux/init.h>
1620+#include <linux/blkdev.h>
1621+#include <linux/slab.h>
1622+#include <linux/mempool.h>
1623+#include <linux/vmalloc.h>
1624+
1625+#include "dm.h"
1626+#include "dm-io.h"
1627+
1628+#define MAX_HASH_CHAIN_ENTRIES 10
1629+#define NAME_SIZE 127
1630+
1631+/* Sparse Ioctl
1632+ device
1633+ start
1634+ chunk_size
1635+ chunks
1636+ */
1637+
1638+// Entries in the sparse remapping structure
1639+struct sparse_hash_entry {
1640+ u64 org_chunk; // Chunk number, not LBA.
1641+ u64 sparse_chunk; // Chunk number, not LBA.
1642+ struct sparse_hash_entry * next;
1643+ struct sparse_hash_entry * prev;
1644+};
1645+
1646+//Private data structure
1647+struct sparse_volume {
1648+ struct dm_dev *dev;
1649+ struct rw_semaphore sparse_semaphore;
1650+ struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1651+ struct sparse_hash_entry * free_hash_list;
1652+ kmem_cache_t * hash_slab;
1653+ mempool_t * hash_pool;
1654+ u32 dm_io_flag;
1655+ u32 chunk_size; // Sectors.
1656+ u32 chunk_shift; // Shift value for chunk size.
1657+ u32 num_chunks; // In this volume.
1658+ u32 next_cow_entry; // Index into current COW table.
1659+ u64 current_cow_sector; // LOGICAL sector of current COW table.
1660+ u32 next_free_chunk; // Index of next free chunk (not LBA!).
1661+ u32 hash_table_size; // Size of the hash table for the remap.
1662+ u64 start;
1663+ u64 cow_table[64]; // One sector's worth of COW tables.
1664+};
1665+
1666+/*************************** OLD SERVICES ****************************/
1667+
1668+/* computes log base 2 of value */
1669+inline int log2(u32 value) //ok to change to u32?
1670+{
1671+ int result = -1;
1672+ long tmp; //ok to change to long?
1673+
1674+ if (value) {
1675+ tmp = value;
1676+ result++;
1677+ while (!(tmp & 1)) {
1678+ result++;
1679+ tmp >>= 1;
1680+ }
1681+ if (tmp != 1) {
1682+ result = -2;
1683+ }
1684+ }
1685+ return result;
1686+}
1687+
1688+/********************************* Functions *********************************/
1689+
1690+/***************************** Hash Functions *****************************/
1691+
1692+/* Take and initialize from the free hash list */
1693+static struct sparse_hash_entry *
1694+allocate_sparse_hash_entry( struct sparse_volume * volume,
1695+ u64 org_chunk,
1696+ u64 sparse_chunk )
1697+{
1698+ struct sparse_hash_entry * hash_entry;
1699+
1700+ hash_entry = volume->free_hash_list;
1701+ if ( hash_entry ) { //should always be the case b/c preallocate these
1702+ volume->free_hash_list = hash_entry->next;
1703+ hash_entry->org_chunk = org_chunk;
1704+ hash_entry->sparse_chunk = sparse_chunk;
1705+ hash_entry->next = NULL;
1706+ hash_entry->prev = NULL;
1707+ }
1708+
1709+ return hash_entry;
1710+}
1711+
1712+/*
1713+ * This function inserts a new entry into a sparse hash chain, immediately
1714+ * following the specified entry. This function should not be used to add
1715+ * an entry into an empty list, or as the first entry in an existing list.
1716+ * For that case, use insert_sparse_map_entry_at_head().
1717+ */
1718+static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1719+ struct sparse_hash_entry * base )
1720+{
1721+ entry->next = base->next;
1722+ entry->prev = base;
1723+ base->next = entry;
1724+ if ( entry->next ) {
1725+ entry->next->prev = entry;
1726+ }
1727+ return 0;
1728+}
1729+
1730+/*
1731+ * This function inserts a new entry into a sparse chain as the first
1732+ * entry in the chain.
1733+ */
1734+static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1735+ struct sparse_hash_entry ** head )
1736+{
1737+ entry->next = *head;
1738+ entry->prev = NULL;
1739+ *head = entry;
1740+ if ( entry->next ) {
1741+ entry->next->prev = entry;
1742+ }
1743+ return 0;
1744+}
1745+
1746+/*
1747+ * Delete all items in a single chain in the hash table.
1748+ */
1749+static int delete_sparse_hash_chain( struct sparse_volume * vol,
1750+ struct sparse_hash_entry * head )
1751+{
1752+ struct sparse_hash_entry * next;
1753+
1754+ while ( head ) {
1755+ next = head->next;
1756+ mempool_free( head, vol->hash_pool );
1757+ head = next;
1758+ }
1759+ return 0;
1760+}
1761+
1762+/*
1763+ * This function will search the hash chain that is anchored at the
1764+ * specified head pointer. If the chunk number is found, a pointer to that
1765+ * entry in the chain is set, and a 1 is returned. If the chunk is not
1766+ * found, a pointer to the previous entry is set and 0 is returned. If the
1767+ * return pointer is NULL, this means either the list is empty, or the
1768+ * specified sector should become the first list item.
1769+ */
1770+static int search_sparse_hash_chain( u64 chunk,
1771+ struct sparse_hash_entry * head,
1772+ struct sparse_hash_entry ** result )
1773+{
1774+ struct sparse_hash_entry * curr = head;
1775+ struct sparse_hash_entry * prev = head;
1776+ while ( curr && curr->org_chunk < chunk ) {
1777+ prev = curr;
1778+ curr = curr->next;
1779+ }
1780+ if (!curr) { // Either an empty chain or went off the end of the chain.
1781+ *result = prev;
1782+ return 0;
1783+ }
1784+ else if ( curr->org_chunk != chunk ) {
1785+ *result = curr->prev;
1786+ return 0;
1787+ }
1788+ else {
1789+ *result = curr;
1790+ return 1;
1791+ }
1792+}
1793+
1794+/*
1795+ * This function takes a cow table entry (from the on-disk data), and
1796+ * converts it into an appropriate entry for the sparse map, and
1797+ * inserts it into the appropriate map for the specified volume.
1798+ */
1799+static int add_cow_entry_to_sparse_map( u64 org_chunk,
1800+ u64 sparse_chunk,
1801+ struct sparse_volume * volume )
1802+{
1803+ struct sparse_hash_entry * new_entry;
1804+ struct sparse_hash_entry * target_entry;
1805+ u32 hash_value;
1806+ int rc = -EINVAL;
1807+
1808+ new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1809+ if (!new_entry) {
1810+ return -ENOMEM;
1811+ }
1812+
1813+ hash_value = (long)org_chunk % volume->hash_table_size;
1814+
1815+ if (! search_sparse_hash_chain( org_chunk,
1816+ volume->sparse_map[hash_value],
1817+ &target_entry ) ) {
1818+ //should always take this path
1819+
1820+ if ( target_entry ) {
1821+ insert_sparse_hash_entry( new_entry, target_entry );
1822+ }
1823+ else {
1824+ insert_sparse_hash_entry_at_head
1825+ ( new_entry, &(volume->sparse_map[hash_value]) );
1826+ }
1827+ rc = 0;
1828+ }
1829+ return rc;
1830+}
1831+
1832+/*
1833+ * Construct the initial hash table state based on
1834+ * existing COW tables on the disk.
1835+ */
1836+static int build_sparse_maps(struct sparse_volume * volume)
1837+{
1838+ int rc = 0, done = 0;
1839+ struct io_region job;
1840+ struct page * page;
1841+ unsigned int error, offset;
1842+
1843+ while (!done) {
1844+
1845+ // Read in one sector's worth of COW tables.
1846+ job.dev = volume->dev->dev;
1847+ job.sector = volume->current_cow_sector;
1848+ job.count = 1;
1849+ page = virt_to_page(volume->cow_table);
1850+ offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1851+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
1852+ if (rc) {
1853+ return rc;
1854+ }
1855+
1856+ // Translate every valid COW table entry into
1857+ // a sparse map entry.
1858+ for ( volume->next_cow_entry = 0;
1859+
1860+ volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1861+ volume->cow_table[volume->next_cow_entry] !=
1862+ 0xffffffffffffffff;
1863+
1864+ volume->next_cow_entry++, volume->next_free_chunk++ ) {
1865+
1866+ if ( (rc = add_cow_entry_to_sparse_map
1867+ ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1868+ volume->next_free_chunk, volume ))) {
1869+ return( rc );
1870+ }
1871+ }
1872+ // Move on to the next sector if necessary.
1873+ if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1874+ volume->current_cow_sector++;
1875+ }
1876+ else {
1877+ done = 1;
1878+ }
1879+ }
1880+ return 0;
1881+}
1882+
1883+/************************* Other Functions ************************/
1884+
1885+/*
1886+ * Function: sparse_remap_chunk
1887+ *
1888+ * This function performs a sector remap on a sparse volume. This should
1889+ * be called from the I/O path, It first determines the base sector
1890+ * of the chunk containing the specified sector, and saves the remainder.
1891+ * Then it performs a search through the sparse map for the specified
1892+ * volume. If a match is found, the sector number is changed to the new
1893+ * value. If no match is found, the value is left the same, meaning the
1894+ * chunk has not been remapped.
1895+ */
1896+static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1897+ u64 * sector )
1898+{
1899+ struct sparse_hash_entry * result;
1900+ u64 chunk;
1901+ u32 hash_value;
1902+ u32 remainder;
1903+ int rc = 1;
1904+
1905+ down_read(&sparse_volume->sparse_semaphore);
1906+
1907+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1908+ chunk = *sector >> sparse_volume->chunk_shift;
1909+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1910+
1911+ if ( search_sparse_hash_chain( chunk,
1912+ sparse_volume->sparse_map[hash_value],
1913+ &result) ) {
1914+ *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1915+ + remainder;
1916+ rc = 0;
1917+ }
1918+ up_read(&sparse_volume->sparse_semaphore);
1919+ return rc;
1920+}
1921+
1922+/* Function: sparse_cow_write
1923+ *
1924+ * Check this sparse node to see if the given sector/chunk has been
1925+ * remapped yet. If it hasn't, create a new hash table entry, update the
1926+ * in-memory COW table, write the COW table to disk.
1927+ */
1928+
1929+static int sparse_cow_write( struct sparse_volume * sparse_volume,
1930+ u64 * sector )
1931+{
1932+ struct sparse_hash_entry * target_entry, * new_map_entry;
1933+ struct io_region job;
1934+ struct page * page;
1935+ char * cow = NULL;
1936+ unsigned int error, offset;
1937+ u64 chunk;
1938+ u32 hash_value = 0;
1939+ u32 remainder;
1940+ int rc;
1941+
1942+ down_write(&sparse_volume->sparse_semaphore);
1943+
1944+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1945+ chunk = *sector >> sparse_volume->chunk_shift;
1946+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1947+
1948+ if ( search_sparse_hash_chain( chunk,
1949+ sparse_volume->sparse_map[hash_value],
1950+ &target_entry) ) {
1951+ *sector =
1952+ ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1953+ + remainder;
1954+ rc = 0;
1955+ goto out;
1956+ }
1957+
1958+ // Is there enough room left on this sparse to remap this chunk?
1959+ if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1960+ DMERR("dm-sparse: full no new remaps allowed\n");
1961+ rc = -ENOSPC;
1962+ goto out;
1963+ }
1964+
1965+ // Create and initialize a new hash table entry for the new remap.
1966+ new_map_entry = allocate_sparse_hash_entry
1967+ (sparse_volume, chunk, sparse_volume->next_free_chunk);
1968+ if ( ! new_map_entry ) {
1969+ // Can't get memory for map entry. Disable this sparse.
1970+ DMERR("dm-sparse: memory error allocating hash entry\n");
1971+ rc = -ENOMEM;
1972+ goto out;
1973+ }
1974+
1975+ // Always write the COW table so it's safe.
1976+ cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1977+ if (! cow ) {
1978+ // Can't get I/O buffer. Disable this sparse.
1979+ DMERR("dm-sparse: memory error allocating COW table buffer");
1980+ rc = -ENOMEM;
1981+ goto out;
1982+ }
1983+
1984+ // Add the entry to the hash table.
1985+ if ( target_entry ) {
1986+ insert_sparse_hash_entry( new_map_entry, target_entry );
1987+ }
1988+ else {
1989+ insert_sparse_hash_entry_at_head
1990+ ( new_map_entry,
1991+ &(sparse_volume->sparse_map[hash_value]) );
1992+ }
1993+
1994+ sparse_volume->next_free_chunk++;
1995+
1996+ // Update the appropriate entry in the COW table.
1997+ sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1998+ cpu_to_le64(chunk);
1999+ sparse_volume->next_cow_entry++;
2000+
2001+ memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
2002+
2003+ // Because of ordering issues, this write needs to be synchronous.
2004+ job.dev = sparse_volume->dev->dev;
2005+ job.sector = sparse_volume->current_cow_sector;
2006+ job.count = 1;
2007+ page = virt_to_page(cow);
2008+ offset = (unsigned long)cow & ~PAGE_MASK;
2009+ dm_io_sync(1, &job, WRITE, page, offset, &error);
2010+
2011+ // Update the in-memory COW table values.
2012+ if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
2013+ {
2014+ sparse_volume->next_cow_entry = 0;
2015+ sparse_volume->current_cow_sector++;
2016+ memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
2017+ }
2018+
2019+ *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
2020+ + remainder;
2021+
2022+ rc = 0;
2023+
2024+ out:
2025+ up_write(&sparse_volume->sparse_semaphore);
2026+ if ( cow ) {
2027+ kfree( cow );
2028+ }
2029+
2030+ return rc;
2031+}
2032+
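A standalone sketch (not part of the patch) of the in-memory COW-table bookkeeping that ends sparse_cow_write(): each new remap appends one entry, and once a sector's worth of entries is full the code advances to the next on-disk COW sector and resets the table to the all-ones terminator pattern. The struct and field names below simply mirror the sparse_volume fields.

#include <stdint.h>
#include <string.h>

#define SECTOR_SIZE 512
#define COW_ENTRIES (SECTOR_SIZE / sizeof(uint64_t))

struct cow_state {
	uint64_t table[COW_ENTRIES];  /* mirrors sparse_volume->cow_table */
	unsigned next_entry;          /* mirrors next_cow_entry */
	uint64_t current_sector;      /* mirrors current_cow_sector */
};

static void cow_record(struct cow_state *c, uint64_t logical_chunk)
{
	c->table[c->next_entry++] = logical_chunk;  /* cpu_to_le64() in the kernel */
	if (c->next_entry >= COW_ENTRIES) {
		c->next_entry = 0;
		c->current_sector++;
		memset(c->table, 0xff, sizeof(c->table));  /* all-ones marks unused entries */
	}
}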
2033+/************************ EXPORT FUNCTIONS ************************/
2034+
2035+/*
2036+ * Function: sparse_dtr
2037+ */
2038+static void sparse_dtr( struct dm_target *ti )
2039+{
2040+ struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2041+ int i;
2042+
2043+ if (vol) {
2044+
2045+ if (vol->sparse_map) {
2046+ for ( i = 0; i < vol->hash_table_size; i++ ) {
2047+ delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2048+ }
2049+ delete_sparse_hash_chain( vol, vol->free_hash_list );
2050+ vfree(vol->sparse_map);
2051+ }
2052+
2053+ if (vol->hash_pool)
2054+ mempool_destroy(vol->hash_pool);
2055+
2056+ if (vol->hash_slab)
2057+ kmem_cache_destroy(vol->hash_slab);
2058+
2059+ dm_put_device(ti, vol->dev);
2060+
2061+ if (vol->dm_io_flag) {
2062+ dm_io_put(1);
2063+ }
2064+
2065+ kfree( vol );
2066+ }
2067+}
2068+
2069+/*
2070+ * Function: sparse_ctr
2071+ */
2072+static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2073+{
2074+ int i, rc = -EINVAL;
2075+ struct sparse_hash_entry *new_entry;
2076+ struct sparse_volume *vol;
2077+ struct dm_dev *dev;
2078+ u32 chunk_size, chunks;
2079+ u64 start;
2080+ char* end, slab_name[NAME_SIZE+1];
2081+
2082+ if ( argc != 4 ) {
2083+ ti->error="dm-sparse: wrong number of arguments";
2084+ return rc;
2085+ }
2086+
2087+ start = simple_strtoull(argv[1], &end, 10);
2088+ if (*end) {
2089+ ti->error="dm-sparse: Invalid first chunk lba";
2090+ return rc;
2091+ }
2092+
2093+ chunk_size = simple_strtoul(argv[2], &end, 10);
2094+ if (*end) {
2095+ ti->error="dm-sparse: Invalid chunk_size";
2096+ return rc;
2097+ }
2098+
2099+ chunks = simple_strtoul(argv[3], &end, 10);
2100+ if (*end) {
2101+ ti->error="dm-sparse: Invalid number of chunks";
2102+ return rc;
2103+ }
2104+
2105+ if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2106+ dm_table_get_mode(ti->table), &dev ) ) {
2107+ ti->error = "dm-sparse: Device lookup failed";
2108+ return rc;
2109+ }
2110+
2111+ vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2112+ if ( !vol ) {
2113+ ti->error = "dm-sparse: Memory allocation for private-data failed";
2114+ rc = -ENOMEM;
2115+ goto out;
2116+ }
2117+
2118+ memset( vol, 0, sizeof(struct sparse_volume) );
2119+
2120+ rc = dm_io_get(1);
2121+ if (rc) {
2122+ ti->error = "dm-sparse: failed to initialize dm-io.";
2123+ sparse_dtr(ti);
2124+ return rc;
2125+ }
2126+
2127+ // Initialize
2128+ vol->dm_io_flag = 1;
2129+ vol->chunk_size = chunk_size;
2130+ vol->chunk_shift = log2(chunk_size);
2131+ vol->num_chunks = chunks;
2132+ vol->current_cow_sector = 1;
2133+ vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2134+ vol->start = start;
2135+ vol->dev = dev;
2136+ init_rwsem(&vol->sparse_semaphore);
2137+
2138+ snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2139+ vol->hash_slab = kmem_cache_create(slab_name,
2140+ sizeof(struct sparse_hash_entry),
2141+ 0, SLAB_HWCACHE_ALIGN,
2142+ NULL, NULL);
2143+ if ( ! vol->hash_slab ) {
2144+ ti->error = "dm-sparse: memory allocation error in hash slab create";
2145+ sparse_dtr(ti);
2146+ return -ENOMEM;
2147+ }
2148+ vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2149+ mempool_free_slab,
2150+ vol->hash_slab);
2151+ if ( ! vol->hash_pool ) {
2152+ ti->error = "dm-sparse: memory allocation error in hash pool create";
2153+ sparse_dtr(ti);
2154+ return -ENOMEM;
2155+ }
2156+
2157+ // Sparse hash table
2158+ vol->sparse_map = vmalloc( vol->hash_table_size *
2159+ sizeof( struct sparse_hash_entry * ) );
2160+ if ( ! vol->sparse_map ) {
2161+ ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2162+ sparse_dtr(ti);
2163+ return -ENOMEM;
2164+ }
2165+
2166+ memset( vol->sparse_map, 0, vol->hash_table_size *
2167+ sizeof( struct sparse_hash_entry * ) );
2168+
2169+ for ( i = 0; i < chunks; i++ ) {
2170+
2171+ new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2172+ if ( ! new_entry ) {
2173+ ti->error="dm-sparse: memory allocation error in hash table setup";
2174+ sparse_dtr(ti);
2175+ return -ENOMEM;
2176+ }
2177+
2178+ new_entry->next = vol->free_hash_list;
2179+ vol->free_hash_list = new_entry;
2180+ }
2181+
2182+ rc = build_sparse_maps(vol);
2183+ if (rc) {
2184+ ti->error = "dm-sparse: error building hash tables";
2185+ sparse_dtr(ti);
2186+ return rc;
2187+ }
2188+
2189+ ti->private = vol;
2190+ return rc;
2191+
2192+ out:
2193+ dm_put_device(ti, dev);
2194+ return rc;
2195+}
2196+
2197+/*
2198+ * Function: sparse_map
2199+ */
2200+static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2201+ union map_info *map_context )
2202+{
2203+ struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2204+ u64 sector = bh->b_rsector;
2205+ int rc;
2206+
2207+ // Check if this sector has been remapped
2208+ rc = sparse_remap_chunk( volume, &sector );
2209+
2210+ if ( rc < 0 ) { //Error
2211+ return rc;
2212+ }
2213+
2214+	if ( rc == 0 ) { // Remapped I/O: reads and writes use the same logic
2215+ bh->b_rsector = volume->start + sector;
2216+ bh->b_rdev = volume->dev->dev;
2217+ return 1;
2218+ }
2219+
2220+	// Previously unmapped: reads and writes take different paths
2221+
2222+ if ( rw ) { //write :
2223+ rc = sparse_cow_write( volume, &sector );
2224+
2225+ if ( rc < 0 ) { //Error
2226+ return rc;
2227+ }
2228+ //Send write on
2229+ bh->b_rsector = volume->start + sector;
2230+ bh->b_rdev = volume->dev->dev;
2231+ return 1;
2232+ }
2233+
2234+ //Reading something that was never written
2235+ //return zeros and indicate complete
2236+ memset(bh->b_data, 0x0, bh->b_size);
2237+ bh->b_end_io(bh, 1);
2238+ return 0;
2239+}
2240+
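To summarize the three outcomes of sparse_map() above, here is a small sketch (not part of the patch); the enum and function names are illustrative only.

/* Outcome of mapping one buffer head through the sparse target. */
enum sparse_outcome {
	SPARSE_REDIRECT,           /* chunk already remapped: rewrite b_rsector/b_rdev, return 1 */
	SPARSE_COW_THEN_REDIRECT,  /* unmapped write: allocate a chunk via COW, then redirect */
	SPARSE_ZERO_FILL           /* unmapped read: memset zeros, call b_end_io, return 0 */
};

static enum sparse_outcome classify_io(int already_mapped, int is_write)
{
	if (already_mapped)
		return SPARSE_REDIRECT;
	if (is_write)
		return SPARSE_COW_THEN_REDIRECT;
	return SPARSE_ZERO_FILL;
}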
2241+static int sparse_status( struct dm_target *ti, status_type_t type,
2242+ char *result, unsigned int maxlen )
2243+{
2244+ struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2245+
2246+ switch(type) {
2247+
2248+ case STATUSTYPE_INFO:
2249+ snprintf( result, maxlen, "%d%%",
2250+ ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2251+ break;
2252+
2253+ case STATUSTYPE_TABLE:
2254+ snprintf( result, maxlen, "%s %Lu %u %u",
2255+ dm_kdevname(vol->dev->dev), vol->start,
2256+ vol->chunk_size, vol->num_chunks );
2257+ break;
2258+
2259+ default:
2260+ break;
2261+ }
2262+
2263+ return 0;
2264+}
2265+
2266+/****************** FUNCTION TABLE **********************/
2267+
2268+static struct target_type sparse_target = {
2269+ .name = "sparse",
2270+ .module = THIS_MODULE,
2271+ .ctr = sparse_ctr,
2272+ .dtr = sparse_dtr,
2273+ .map = sparse_map,
2274+ .status = sparse_status,
2275+};
2276+
2277+/********************* REGISTRATION *****************/
2278+
2279+int __init sparse_init(void)
2280+{
2281+ int rc = dm_register_target(&sparse_target);
2282+
2283+ if ( rc < 0 )
2284+ DMWARN("sparse target registration failed");
2285+
2286+ return rc;
2287+}
2288+
2289+void __exit sparse_exit(void)
2290+{
2291+ if (dm_unregister_target(&sparse_target) )
2292+ DMWARN("sparse target unregistration failed");
2293+
2294+ return;
2295+}
2296+
2297+module_init(sparse_init);
2298+module_exit(sparse_exit);
2299+MODULE_LICENSE("GPL");
2300diff -urN linux-2.4.24.org/drivers/md/lvm.c linux-2.4.24/drivers/md/lvm.c
2301--- linux-2.4.24.org/drivers/md/lvm.c 2004-01-18 14:58:09.106704262 +0100
2302+++ linux-2.4.24/drivers/md/lvm.c 2004-01-18 15:57:55.568033496 +0100
2303@@ -236,9 +236,6 @@
2304 #define DEVICE_OFF(device)
2305 #define LOCAL_END_REQUEST
2306
2307-/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
2308-/* #define LVM_VFS_ENHANCEMENT */
2309-
2310 #include <linux/config.h>
2311 #include <linux/module.h>
2312 #include <linux/kernel.h>
2313@@ -2250,12 +2247,8 @@
2314 if (lv_ptr->lv_access & LV_SNAPSHOT) {
2315 lv_t *org = lv_ptr->lv_snapshot_org, *last;
2316
2317- /* sync the original logical volume */
2318- fsync_dev(org->lv_dev);
2319-#ifdef LVM_VFS_ENHANCEMENT
2320 /* VFS function call to sync and lock the filesystem */
2321 fsync_dev_lockfs(org->lv_dev);
2322-#endif
2323
2324 down_write(&org->lv_lock);
2325 org->lv_access |= LV_SNAPSHOT_ORG;
2326@@ -2281,11 +2274,9 @@
2327 else
2328 set_device_ro(lv_ptr->lv_dev, 1);
2329
2330-#ifdef LVM_VFS_ENHANCEMENT
2331 /* VFS function call to unlock the filesystem */
2332 if (lv_ptr->lv_access & LV_SNAPSHOT)
2333 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
2334-#endif
2335
2336 lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
2337 lvm_fs_create_lv(vg_ptr, lv_ptr);
2338diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
2339--- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 15:09:18.620153502 +0100
2340+++ linux-2.4.24/drivers/md/Makefile 2004-01-18 16:04:48.278616388 +0100
2341@@ -28,6 +28,8 @@
2342 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
2343
2344 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
2345+obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
2346+obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
2347
2348 include $(TOPDIR)/Rules.make
2349
2350diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c
2351--- linux-2.4.24.org/drivers/md/md.c 2004-01-18 14:58:09.227678566 +0100
2352+++ linux-2.4.24/drivers/md/md.c 2004-01-18 16:04:27.702900923 +0100
2353@@ -2146,6 +2146,8 @@
2354
2355 SET_FROM_SB(utime);
2356 SET_FROM_SB(state);
2357+ if (mddev->curr_resync)
2358+ info.state |= (1 << MD_ARRAY_RECOVERY_RUNNING);
2359 SET_FROM_SB(active_disks);
2360 SET_FROM_SB(working_disks);
2361 SET_FROM_SB(failed_disks);
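A userspace sketch (not part of the patch) of how the new state bit could be consumed, assuming the patched md_u.h (which adds MD_ARRAY_RECOVERY_RUNNING) is visible to the compiler; GET_ARRAY_INFO and mdu_array_info_t are the existing md ioctl interface.

#include <sys/ioctl.h>
#include <linux/raid/md_u.h>   /* mdu_array_info_t, GET_ARRAY_INFO, MD_ARRAY_* */

/* Returns 1 if recovery/resync is running, 0 if not, -1 on ioctl error. */
static int array_recovery_running(int md_fd)
{
	mdu_array_info_t info;

	if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
		return -1;
	return !!(info.state & (1 << MD_ARRAY_RECOVERY_RUNNING));
}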
2362diff -urN linux-2.4.24.org/drivers/md/multipath.c linux-2.4.24/drivers/md/multipath.c
2363--- linux-2.4.24.org/drivers/md/multipath.c 2004-01-18 14:58:09.254672832 +0100
2364+++ linux-2.4.24/drivers/md/multipath.c 2004-01-18 16:04:38.291691263 +0100
2365@@ -139,15 +139,16 @@
2366 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2367 {
2368 multipath_conf_t *conf = mddev_to_conf(mddev);
2369- int i, disks = MD_SB_DISKS;
2370+ int i;
2371
2372 /*
2373 * Later we do read balancing on the read side
2374 * now we use the first available disk.
2375 */
2376
2377- for (i = 0; i < disks; i++) {
2378+ for (i = 0; i < conf->nr_disks; i++) {
2379 if (conf->multipaths[i].operational) {
2380+ /* first operational is winner! */
2381 *rdev = conf->multipaths[i].dev;
2382 return (0);
2383 }
2384@@ -191,6 +192,8 @@
2385 {
2386 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2387
2388+ atomic_dec(&mp_bh->multipath->nr_pending);
2389+
2390 /*
2391 * this branch is our 'one multipath IO has finished' event handler:
2392 */
2393@@ -223,19 +226,39 @@
2394 }
2395
2396 /*
2397- * This routine returns the disk from which the requested read should
2398- * be done.
2399+ * Multipath read balance ...
2400+ *
2401+ * Returns:
2402+ *
2403+ * If no active paths
2404+ *
2405+ * - Error ( -1 )
2406+ *
2407+ * If active paths == 1
2408+ *
2409+ * - 1st active path encountered
2410+ *
2411+ * If active paths > 1
2412+ *
2413+ * - 1st idle active path encountered
2414+ * - else ... the active path doing the least amount of work.
2415 */
2416-
2417 static int multipath_read_balance (multipath_conf_t *conf)
2418 {
2419- int disk;
2420-
2421- for (disk = 0; disk < conf->raid_disks; disk++)
2422- if (conf->multipaths[disk].operational)
2423- return disk;
2424- BUG();
2425- return 0;
2426+ int i, disk=-1, nr_pending, least_pending=0;
2427+
2428+ for (i=0; i<conf->nr_disks; i++) {
2429+ if (conf->multipaths[i].operational) {
2430+ nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2431+ if (nr_pending==0 || conf->working_disks==1)
2432+ return i;
2433+ if (least_pending==0 || nr_pending<least_pending) {
2434+ disk = i;
2435+ least_pending = nr_pending;
2436+ }
2437+ }
2438+ }
2439+ return disk;
2440 }
2441
2442 static int multipath_make_request (mddev_t *mddev, int rw,
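The rewritten balance policy above can be read as the following standalone sketch (not part of the patch): prefer the first idle operational path (or any operational path when only one disk is working), otherwise take the operational path with the fewest pending requests, and report -1 when no path is operational. The array parameters are illustrative stand-ins for the conf->multipaths[] fields.

/* operational[i] and pending[i] stand in for conf->multipaths[i].operational
 * and atomic_read(&conf->multipaths[i].nr_pending). */
static int pick_read_path(const int operational[], const int pending[],
			  int nr_disks, int working_disks)
{
	int i, disk = -1, least_pending = 0;

	for (i = 0; i < nr_disks; i++) {
		if (!operational[i])
			continue;
		if (pending[i] == 0 || working_disks == 1)
			return i;                    /* idle path (or only choice) wins */
		if (least_pending == 0 || pending[i] < least_pending) {
			disk = i;                    /* remember least-loaded path so far */
			least_pending = pending[i];
		}
	}
	return disk;                                     /* -1 if nothing operational */
}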
2443@@ -245,6 +268,7 @@
2444 struct buffer_head *bh_req;
2445 struct multipath_bh * mp_bh;
2446 struct multipath_info *multipath;
2447+ int disk;
2448
2449 if (!buffer_locked(bh))
2450 BUG();
2451@@ -267,7 +291,16 @@
2452 /*
2453 * read balancing logic:
2454 */
2455- multipath = conf->multipaths + multipath_read_balance(conf);
2456+ disk = multipath_read_balance(conf);
2457+ if (disk==-1) {
2458+ printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2459+ buffer_IO_error(bh);
2460+ return 0;
2461+ }
2462+
2463+ multipath = conf->multipaths + disk;
2464+ mp_bh->multipath = multipath;
2465+ atomic_inc(&multipath->nr_pending);
2466
2467 bh_req = &mp_bh->bh_req;
2468 memcpy(bh_req, bh, sizeof(*bh));
2469@@ -331,13 +364,14 @@
2470 {
2471 multipath_conf_t *conf = mddev_to_conf(mddev);
2472 struct multipath_info * multipaths = conf->multipaths;
2473- int disks = MD_SB_DISKS;
2474 int other_paths = 1;
2475- int i;
2476+ int i, first = 1;
2477+ mdk_rdev_t *rdev;
2478+ struct md_list_head *tmp;
2479
2480 if (conf->working_disks == 1) {
2481 other_paths = 0;
2482- for (i = 0; i < disks; i++) {
2483+ for (i = 0; i < MD_SB_DISKS; i++) {
2484 if (multipaths[i].spare) {
2485 other_paths = 1;
2486 break;
2487@@ -351,16 +385,17 @@
2488 * first check if this is a queued request for a device
2489 * which has just failed.
2490 */
2491- for (i = 0; i < disks; i++) {
2492+ for (i = 0; i < MD_SB_DISKS; i++) {
2493 if (multipaths[i].dev==dev && !multipaths[i].operational)
2494 return 0;
2495 }
2496 printk (LAST_DISK);
2497 } else {
2498+ mdp_super_t *sb = mddev->sb;
2499 /*
2500 * Mark disk as unusable
2501 */
2502- for (i = 0; i < disks; i++) {
2503+ for (i = 0; i < MD_SB_DISKS; i++) {
2504 if (multipaths[i].dev==dev && multipaths[i].operational) {
2505 mark_disk_bad(mddev, i);
2506 break;
2507@@ -369,7 +404,6 @@
2508 if (!conf->working_disks) {
2509 int err = 1;
2510 mdp_disk_t *spare;
2511- mdp_super_t *sb = mddev->sb;
2512
2513 spare = get_spare(mddev);
2514 if (spare) {
2515@@ -384,6 +418,21 @@
2516 sb->spare_disks--;
2517 }
2518 }
2519+ /* prevent unnecessary work in md_do_recovery() */
2520+ if (conf->working_disks) {
2521+ conf->raid_disks = conf->working_disks
2522+ = sb->raid_disks = sb->active_disks;
2523+ }
2524+	/* update alias disk info to ensure we can do sb commit. */
2525+ ITERATE_RDEV(mddev,rdev,tmp) {
2526+ if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2527+ rdev->alias_device = 0;
2528+ first = 0;
2529+ } else {
2530+ if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2531+ rdev->alias_device = 1;
2532+ }
2533+ }
2534 }
2535 return 0;
2536 }
2537@@ -677,9 +726,8 @@
2538 /*
2539 * This is a kernel thread which:
2540 *
2541- * 1. Retries failed read operations on working multipaths.
2542+ * 1. Retries failed operations on working multipaths.
2543 * 2. Updates the raid superblock when problems encounter.
2544- * 3. Performs writes following reads for array syncronising.
2545 */
2546
2547 static void multipathd (void *data)
2548@@ -833,6 +881,7 @@
2549 mdk_rdev_t *rdev, *def_rdev = NULL;
2550 struct md_list_head *tmp;
2551 int num_rdevs = 0;
2552+ int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2553
2554 MOD_INC_USE_COUNT;
2555
2556@@ -881,9 +930,7 @@
2557 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2558
2559 /*
2560- * Mark all disks as spare to start with, then pick our
2561- * active disk. If we have a disk that is marked active
2562- * in the sb, then use it, else use the first rdev.
2563+ * Mark all disks as spare to start with.
2564 */
2565 disk->number = desc->number;
2566 disk->raid_disk = desc->raid_disk;
2567@@ -894,20 +941,21 @@
2568 mark_disk_sync(desc);
2569
2570 if (disk_active(desc)) {
2571- if(!conf->working_disks) {
2572- printk(OPERATIONAL, partition_name(rdev->dev),
2573- desc->raid_disk);
2574- disk->operational = 1;
2575- disk->spare = 0;
2576- conf->working_disks++;
2577- def_rdev = rdev;
2578- } else {
2579- mark_disk_spare(desc);
2580- }
2581- } else
2582- mark_disk_spare(desc);
2583+ printk(OPERATIONAL, partition_name(rdev->dev),
2584+ desc->raid_disk);
2585+ disk->operational = 1;
2586+ disk->spare = 0;
2587+ conf->working_disks++;
2588+ def_rdev = rdev;
2589+ active_disks++;
2590+ } else if (disk_faulty(desc)) {
2591+ disk->spare = 0;
2592+ faulty_disks++;
2593+ } else {
2594+ spare_disks++;
2595+ }
2596
2597- if(!num_rdevs++) def_rdev = rdev;
2598+ num_rdevs++;
2599 }
2600 if(!conf->working_disks && num_rdevs) {
2601 desc = &sb->disks[def_rdev->desc_nr];
2602@@ -918,11 +966,12 @@
2603 disk->spare = 0;
2604 conf->working_disks++;
2605 mark_disk_active(desc);
2606+ active_disks++;
2607 }
2608 /*
2609- * Make sure our active path is in desc spot 0
2610+ * If there is only 1 active path ... make sure it is in desc spot 0
2611 */
2612- if(def_rdev->desc_nr != 0) {
2613+ if (active_disks == 1 && def_rdev->desc_nr != 0) {
2614 rdev = find_rdev_nr(mddev, 0);
2615 desc = &sb->disks[def_rdev->desc_nr];
2616 desc2 = sb->disks;
2617@@ -940,10 +989,10 @@
2618 def_rdev->desc_nr = 0;
2619 }
2620 }
2621- conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2622+ conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2623 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2624- sb->failed_disks = 0;
2625- sb->spare_disks = num_rdevs - 1;
2626+ sb->failed_disks = faulty_disks;
2627+ sb->spare_disks = spare_disks;
2628 mddev->sb_dirty = 1;
2629 conf->mddev = mddev;
2630 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2631diff -urN linux-2.4.24.org/fs/buffer.c linux-2.4.24/fs/buffer.c
2632--- linux-2.4.24.org/fs/buffer.c 2004-01-18 14:55:22.305275818 +0100
2633+++ linux-2.4.24/fs/buffer.c 2004-01-18 15:57:55.602026171 +0100
2634@@ -419,6 +419,34 @@
2635 fsync_dev(dev);
2636 }
2637
2638+int fsync_dev_lockfs(kdev_t dev)
2639+{
2640+ /* you are not allowed to try locking all the filesystems
2641+ ** on the system; your chances of getting through without
2642+ ** total deadlock are slim to none.
2643+ */
2644+ if (!dev)
2645+ return fsync_dev(dev) ;
2646+
2647+ sync_buffers(dev, 0);
2648+
2649+ lock_kernel();
2650+ /* note, the FS might need to start transactions to
2651+ ** sync the inodes or the quota; no locking until
2652+ ** after these are done
2653+ */
2654+ sync_inodes(dev);
2655+ DQUOT_SYNC_DEV(dev);
2656+ /* if inodes or quotas could be dirtied during the
2657+ ** sync_supers_lockfs call, the FS is responsible for getting
2658+ ** them on disk, without deadlocking against the lock
2659+ */
2660+ sync_supers_lockfs(dev) ;
2661+ unlock_kernel();
2662+
2663+ return sync_buffers(dev, 1) ;
2664+}
2665+
2666 asmlinkage long sys_sync(void)
2667 {
2668 fsync_dev(0);
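A kernel-side sketch (not part of the patch) of how a caller such as the lvm.c snapshot path earlier in this patch is expected to pair the new exports: freeze with fsync_dev_lockfs(), do the work, then always thaw with unlockfs(). do_snapshot_setup() is a hypothetical placeholder for the caller's own critical section.

#include <linux/fs.h>   /* fsync_dev_lockfs(), unlockfs(), kdev_t (with this patch applied) */

static int do_snapshot_setup(kdev_t dev);   /* hypothetical caller-specific work */

static int snapshot_with_frozen_fs(kdev_t dev)
{
	int rc;

	fsync_dev_lockfs(dev);        /* flush dirty data and lock the filesystem */
	rc = do_snapshot_setup(dev);  /* work done while the fs is quiesced */
	unlockfs(dev);                /* thaw; must always follow the lock call */
	return rc;
}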
2669diff -urN linux-2.4.24.org/fs/reiserfs/super.c linux-2.4.24/fs/reiserfs/super.c
2670--- linux-2.4.24.org/fs/reiserfs/super.c 2004-01-18 14:55:18.875002271 +0100
2671+++ linux-2.4.24/fs/reiserfs/super.c 2004-01-18 15:57:55.657014322 +0100
2672@@ -84,7 +84,7 @@
2673 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
2674 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
2675 reiserfs_block_writes(&th) ;
2676- journal_end(&th, s, 1) ;
2677+ journal_end_sync(&th, s, 1) ;
2678 }
2679 s->s_dirt = 0;
2680 unlock_kernel() ;
2681diff -urN linux-2.4.24.org/fs/super.c linux-2.4.24/fs/super.c
2682--- linux-2.4.24.org/fs/super.c 2004-01-18 14:55:11.177633010 +0100
2683+++ linux-2.4.24/fs/super.c 2004-01-18 15:57:55.687007859 +0100
2684@@ -38,6 +38,13 @@
2685 LIST_HEAD(super_blocks);
2686 spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
2687
2688+/*
2689+ * lock/unlockfs grab a read lock on s_umount, but you need this lock to
2690+ * make sure no lockfs runs are in progress before inserting/removing
2691+ * supers from the list.
2692+ */
2693+static DECLARE_MUTEX(lockfs_sem);
2694+
2695 /*
2696 * Handling of filesystem drivers list.
2697 * Rules:
2698@@ -436,6 +443,19 @@
2699 put_super(sb);
2700 }
2701
2702+static void write_super_lockfs(struct super_block *sb)
2703+{
2704+ lock_super(sb);
2705+ if (sb->s_root && sb->s_op) {
2706+ if (sb->s_dirt && sb->s_op->write_super)
2707+ sb->s_op->write_super(sb);
2708+ if (sb->s_op->write_super_lockfs) {
2709+ sb->s_op->write_super_lockfs(sb);
2710+ }
2711+ }
2712+ unlock_super(sb);
2713+}
2714+
2715 static inline void write_super(struct super_block *sb)
2716 {
2717 lock_super(sb);
2718@@ -483,6 +503,39 @@
2719 spin_unlock(&sb_lock);
2720 }
2721
2722+/*
2723+ * Note: don't check the dirty flag before waiting; we want the lock
2724+ * to happen every time this is called. dev must be non-zero.
2725+ */
2726+void sync_supers_lockfs(kdev_t dev)
2727+{
2728+ struct super_block * sb;
2729+
2730+ down(&lockfs_sem) ;
2731+ if (dev) {
2732+ sb = get_super(dev);
2733+ if (sb) {
2734+ write_super_lockfs(sb);
2735+ drop_super(sb);
2736+ }
2737+ }
2738+}
2739+
2740+void unlockfs(kdev_t dev)
2741+{
2742+ struct super_block * sb;
2743+
2744+ if (dev) {
2745+ sb = get_super(dev);
2746+ if (sb) {
2747+ if (sb->s_op && sb->s_op->unlockfs)
2748+ sb->s_op->unlockfs(sb) ;
2749+ drop_super(sb);
2750+ }
2751+ }
2752+ up(&lockfs_sem) ;
2753+}
2754+
2755 /**
2756 * get_super - get the superblock of a device
2757 * @dev: device to get the superblock for
2758@@ -702,6 +755,7 @@
2759 goto out1;
2760
2761 error = -EBUSY;
2762+ down(&lockfs_sem);
2763 restart:
2764 spin_lock(&sb_lock);
2765
2766@@ -713,6 +767,7 @@
2767 ((flags ^ old->s_flags) & MS_RDONLY)) {
2768 spin_unlock(&sb_lock);
2769 destroy_super(s);
2770+ up(&lockfs_sem);
2771 goto out1;
2772 }
2773 if (!grab_super(old))
2774@@ -720,12 +775,14 @@
2775 destroy_super(s);
2776 blkdev_put(bdev, BDEV_FS);
2777 path_release(&nd);
2778+ up(&lockfs_sem);
2779 return old;
2780 }
2781 s->s_dev = dev;
2782 s->s_bdev = bdev;
2783 s->s_flags = flags;
2784 insert_super(s, fs_type);
2785+ up(&lockfs_sem);
2786 if (!fs_type->read_super(s, data, flags & MS_VERBOSE ? 1 : 0))
2787 goto Einval;
2788 s->s_flags |= MS_ACTIVE;
2789@@ -833,7 +890,10 @@
2790 if (!deactivate_super(sb))
2791 return;
2792
2793+ down(&lockfs_sem);
2794 down_write(&sb->s_umount);
2795+ up(&lockfs_sem);
2796+
2797 sb->s_root = NULL;
2798 /* Need to clean after the sucker */
2799 if (fs->fs_flags & FS_LITTER)
2800diff -urN linux-2.4.24.org/include/linux/fs.h linux-2.4.24/include/linux/fs.h
2801--- linux-2.4.24.org/include/linux/fs.h 2004-01-18 14:55:29.014855364 +0100
2802+++ linux-2.4.24/include/linux/fs.h 2004-01-18 15:59:11.694692181 +0100
2803@@ -1287,6 +1287,7 @@
2804 extern int sync_buffers(kdev_t, int);
2805 extern void sync_dev(kdev_t);
2806 extern int fsync_dev(kdev_t);
2807+extern int fsync_dev_lockfs(kdev_t);
2808 extern int fsync_super(struct super_block *);
2809 extern int fsync_no_super(kdev_t);
2810 extern void sync_inodes_sb(struct super_block *);
2811@@ -1305,6 +1306,8 @@
2812 extern int filemap_fdatasync(struct address_space *);
2813 extern int filemap_fdatawait(struct address_space *);
2814 extern void sync_supers(kdev_t dev, int wait);
2815+extern void sync_supers_lockfs(kdev_t);
2816+extern void unlockfs(kdev_t);
2817 extern int bmap(struct inode *, int);
2818 extern int notify_change(struct dentry *, struct iattr *);
2819 extern int permission(struct inode *, int);
2820diff -urN linux-2.4.24.org/include/linux/raid/md_u.h linux-2.4.24/include/linux/raid/md_u.h
2821--- linux-2.4.24.org/include/linux/raid/md_u.h 2004-01-18 14:55:35.554471508 +0100
2822+++ linux-2.4.24/include/linux/raid/md_u.h 2004-01-18 16:04:27.764887949 +0100
2823@@ -50,6 +50,10 @@
2824 int patchlevel;
2825 } mdu_version_t;
2826
2827+#define MD_ARRAY_CLEAN 0
2828+#define MD_ARRAY_ERRORS 1
2829+#define MD_ARRAY_RECOVERY_RUNNING 2
2830+
2831 typedef struct mdu_array_info_s {
2832 /*
2833 * Generic constant information
2834diff -urN linux-2.4.24.org/include/linux/raid/multipath.h linux-2.4.24/include/linux/raid/multipath.h
2835--- linux-2.4.24.org/include/linux/raid/multipath.h 2004-01-18 14:55:35.563469605 +0100
2836+++ linux-2.4.24/include/linux/raid/multipath.h 2004-01-18 16:04:38.329683369 +0100
2837@@ -15,6 +15,7 @@
2838 int spare;
2839
2840 int used_slot;
2841+ atomic_t nr_pending; /* number of pending requests */
2842 };
2843
2844 struct multipath_private_data {
2845@@ -63,6 +64,7 @@
2846 struct buffer_head *master_bh;
2847 struct buffer_head bh_req;
2848 struct multipath_bh *next_mp; /* next for retry or in free list */
2849+ struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/
2850 };
2851 /* bits for multipath_bh.state */
2852 #define MPBH_Uptodate 1
2853diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c
2854--- linux-2.4.24.org/kernel/ksyms.c 2004-01-18 14:55:22.698192617 +0100
2855+++ linux-2.4.24/kernel/ksyms.c 2004-01-18 15:57:55.824978130 +0100
2856@@ -200,6 +200,8 @@
2857 EXPORT_SYMBOL(invalidate_inode_pages);
2858 EXPORT_SYMBOL(truncate_inode_pages);
2859 EXPORT_SYMBOL(fsync_dev);
2860+EXPORT_SYMBOL(fsync_dev_lockfs);
2861+EXPORT_SYMBOL(unlockfs);
2862 EXPORT_SYMBOL(fsync_no_super);
2863 EXPORT_SYMBOL(permission);
2864 EXPORT_SYMBOL(vfs_permission);