1 diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
2 --- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 15:09:18.503177509 +0100
3 +++ linux-2.4.24/drivers/md/Config.in 2004-01-18 16:05:08.202479073 +0100
5 dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD
6 dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD
7 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
8 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9 + dep_tristate ' Bad Block Relocation Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10 + dep_tristate ' Sparse Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
13 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
14 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
15 diff -urN linux-2.4.24.org/drivers/md/dm-bbr.c linux-2.4.24/drivers/md/dm-bbr.c
16 --- linux-2.4.24.org/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
17 +++ linux-2.4.24/drivers/md/dm-bbr.c 2004-01-18 16:03:13.099546349 +0100
20 + * (C) Copyright IBM Corp. 2002, 2003
22 + * This program is free software; you can redistribute it and/or modify
23 + * it under the terms of the GNU General Public License as published by
24 + * the Free Software Foundation; either version 2 of the License, or
25 + * (at your option) any later version.
27 + * This program is distributed in the hope that it will be useful,
28 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
29 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30 + * the GNU General Public License for more details.
32 + * You should have received a copy of the GNU General Public License
33 + * along with this program; if not, write to the Free Software
34 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36 + * linux/drivers/md/dm-bbr.c
38 + * Bad-block-relocation (BBR) target for device-mapper.
40 + * The BBR target is designed to remap I/O write failures to another safe
41 + * location on disk. Note that most disk drives have BBR built into them,
42 + * this means that our software BBR will only be activated when all hardware
43 + * BBR replacement sectors have been used.
46 +#include <linux/kernel.h>
47 +#include <linux/module.h>
48 +#include <linux/init.h>
49 +#include <linux/blkdev.h>
50 +#include <linux/spinlock.h>
51 +#include <linux/smp_lock.h>
52 +#include <linux/slab.h>
53 +#include <linux/mempool.h>
56 +#include "dm-daemon.h"
59 +/* Number of active BBR devices. */
60 +static int bbr_instances = 0;
61 +static DECLARE_MUTEX(bbr_instances_lock);
63 +/* Data pertaining to the I/O thread. */
64 +static struct dm_daemon * bbr_io_thread = NULL;
65 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
66 +static LIST_HEAD(bbr_io_list);
67 +static void bbr_io_handler(void);
69 +/* Global pools for bbr_io_buf's and bbr_remap's. */
70 +static kmem_cache_t * bbr_io_buf_cache;
71 +static mempool_t * bbr_io_buf_pool;
72 +static kmem_cache_t * bbr_remap_cache;
73 +static mempool_t * bbr_remap_pool;
75 +static void bbr_free_remap(struct bbr_private * bbr_id);
80 + * Delete the pools for the remap list and I/O anchors.
82 +static void destroy_pools(void)
84 + if (bbr_io_buf_pool) {
85 + mempool_destroy(bbr_io_buf_pool);
86 + bbr_io_buf_pool = NULL;
88 + if (bbr_io_buf_cache) {
89 + kmem_cache_destroy(bbr_io_buf_cache);
90 + bbr_io_buf_cache = NULL;
92 + if (bbr_remap_pool) {
93 + mempool_destroy(bbr_remap_pool);
94 + bbr_remap_pool = NULL;
96 + if (bbr_remap_cache) {
97 + kmem_cache_destroy(bbr_remap_cache);
98 + bbr_remap_cache = NULL;
105 + * Create mempools for the remap list and I/O anchors.
107 +static int create_pools(void)
109 + if (!bbr_remap_cache) {
110 + bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
111 + sizeof(struct bbr_runtime_remap),
112 + 0, SLAB_HWCACHE_ALIGN,
114 + if (!bbr_remap_cache) {
115 + DMERR("Unable to create BBR remap cache.");
119 + if (!bbr_remap_pool) {
120 + bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
123 + if (!bbr_remap_pool) {
124 + DMERR("Unable to create BBR remap mempool.");
129 + if (!bbr_io_buf_cache) {
130 + bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
131 + sizeof(struct bbr_io_buffer),
132 + 0, SLAB_HWCACHE_ALIGN,
134 + if (!bbr_io_buf_cache) {
135 + DMERR("Unable to create BBR I/O buffer cache.");
139 + if (!bbr_io_buf_pool) {
140 + bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
143 + if (!bbr_io_buf_pool) {
144 + DMERR("Unable to create BBR I/O buffer mempool.");
150 + if (!bbr_remap_cache || !bbr_remap_pool ||
151 + !bbr_io_buf_cache || !bbr_io_buf_pool ) {
162 + * Use the dm-daemon services to stop the BBR I/O thread.
164 +static void stop_io_thread(void)
166 + if (bbr_io_thread) {
167 + dm_daemon_stop(bbr_io_thread);
168 + kfree(bbr_io_thread);
169 + bbr_io_thread = NULL;
176 + * Use the dm-daemon services to start the BBR I/O thread.
178 +static int start_io_thread(void)
182 + if (!bbr_io_thread) {
183 + bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
184 + if (!bbr_io_thread) {
188 + rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
190 + kfree(bbr_io_thread);
201 + * Set up the mempools, I/O thread, and sync-I/O service. This should
202 + * be called only when the first bbr device is created.
204 +static int bbr_global_init(void)
208 + rc = create_pools();
213 + rc = start_io_thread();
231 + * bbr_global_cleanup
233 + * Cleanup the mempools, I/O thread and sync-I/O service. This should
234 + * be called only when the last bbr device is removed.
236 +static void bbr_global_cleanup(void)
243 +static struct bbr_private * bbr_alloc_private(void)
245 + struct bbr_private *bbr_id;
247 + bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
249 + memset(bbr_id, 0, sizeof(*bbr_id));
250 + bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
251 + bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
257 +static void bbr_free_private(struct bbr_private *bbr_id)
259 + if (bbr_id->bbr_table) {
260 + kfree(bbr_id->bbr_table);
262 + bbr_free_remap(bbr_id);
266 +static u32 crc_table[256];
267 +static u32 crc_table_built = 0;
269 +static void build_crc_table(void)
273 + for (i = 0; i <= 255; i++) {
275 + for (j = 8; j > 0; j--) {
277 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
281 + crc_table[i] = crc;
283 + crc_table_built = 1;
286 +static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
288 + unsigned char *current_byte;
289 + u32 temp1, temp2, i;
291 + current_byte = (unsigned char *) buffer;
292 + /* Make sure the crc table is available */
293 + if (!crc_table_built)
295 + /* Process each byte in the buffer. */
296 + for (i = 0; i < buffersize; i++) {
297 + temp1 = (crc >> 8) & 0x00FFFFFF;
298 + temp2 = crc_table[(crc ^ (u32) * current_byte) &
301 + crc = temp1 ^ temp2;
307 + * le_bbr_table_sector_to_cpu
309 + * Convert bbr meta data from on-disk (LE) format
310 + * to the native cpu endian format.
312 +static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
315 + p->signature = le32_to_cpup(&p->signature);
316 + p->crc = le32_to_cpup(&p->crc);
317 + p->sequence_number = le32_to_cpup(&p->sequence_number);
318 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
319 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
320 + p->entries[i].bad_sect =
321 + le64_to_cpup(&p->entries[i].bad_sect);
322 + p->entries[i].replacement_sect =
323 + le64_to_cpup(&p->entries[i].replacement_sect);
328 + * cpu_bbr_table_sector_to_le
330 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
332 +static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
333 + struct bbr_table * le)
336 + le->signature = cpu_to_le32p(&p->signature);
337 + le->crc = cpu_to_le32p(&p->crc);
338 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
339 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
340 + for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
341 + le->entries[i].bad_sect =
342 + cpu_to_le64p(&p->entries[i].bad_sect);
343 + le->entries[i].replacement_sect =
344 + cpu_to_le64p(&p->entries[i].replacement_sect);
349 + * validate_bbr_table_sector
351 + * Check the specified BBR table sector for a valid signature and CRC. If it's
352 + * valid, endian-convert the table sector.
354 +static int validate_bbr_table_sector(struct bbr_table * p)
357 + int org_crc, final_crc;
359 + if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
360 + DMERR("BBR table signature doesn't match!");
361 + DMERR("Found 0x%x. Expecting 0x%x",
362 + le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
368 + DMERR("BBR table sector has no CRC!");
373 + org_crc = le32_to_cpup(&p->crc);
375 + final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
376 + if (final_crc != org_crc) {
377 + DMERR("CRC failed!");
378 + DMERR("Found 0x%x. Expecting 0x%x",
379 + org_crc, final_crc);
384 + p->crc = cpu_to_le32p(&org_crc);
385 + le_bbr_table_sector_to_cpu(p);
392 + * bbr_binary_tree_insert
394 + * Insert a node into the binary tree.
396 +static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
397 + struct bbr_runtime_remap *newnode)
399 + struct bbr_runtime_remap **node = root;
400 + while (node && *node) {
401 + if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
402 + node = &((*node)->right);
404 + node = &((*node)->left);
408 + newnode->left = newnode->right = NULL;
413 + * bbr_binary_search
415 + * Search for a node that contains bad_sect == lsn.
417 +static struct bbr_runtime_remap * bbr_binary_search(
418 + struct bbr_runtime_remap *root,
421 + struct bbr_runtime_remap *node = root;
423 + if (node->remap.bad_sect == lsn) {
426 + if (lsn > node->remap.bad_sect) {
427 + node = node->right;
436 + * bbr_binary_tree_destroy
438 + * Destroy the binary tree.
440 +static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
441 + struct bbr_private * bbr_id)
443 + struct bbr_runtime_remap **link = NULL;
444 + struct bbr_runtime_remap *node = root;
448 + link = &(node->left);
453 + link = &(node->right);
454 + node = node->right;
458 + mempool_free(node, bbr_remap_pool);
459 + if (node == root) {
460 + /* If root is deleted, we're done. */
464 + /* Back to root. */
470 +static void bbr_free_remap(struct bbr_private * bbr_id)
472 + spin_lock_irq(&bbr_id->bbr_id_lock);
473 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
474 + bbr_id->remap_root = NULL;
475 + spin_unlock_irq(&bbr_id->bbr_id_lock);
479 + * bbr_insert_remap_entry
481 + * Create a new remap entry and add it to the binary tree for this node.
483 +static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
484 + struct bbr_table_entry *new_bbr_entry)
486 + struct bbr_runtime_remap *newnode;
488 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
490 + DMERR("Could not allocate from remap mempool!");
493 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
494 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
495 + spin_lock_irq(&bbr_id->bbr_id_lock);
496 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
497 + spin_unlock_irq(&bbr_id->bbr_id_lock);
502 + * bbr_table_to_remap_list
504 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
505 + * improve run time performance, the in memory remap list must be sorted by
506 + * the bad sector LBA. This function is called at discovery time to initialize
507 + * the remap list. This function assumes that at least one copy of meta data
510 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
512 + u32 in_use_blks = 0;
514 + struct bbr_table *p;
516 + for (i = 0, p = bbr_id->bbr_table;
517 + i < bbr_id->nr_sects_bbr_table;
519 + if (!p->in_use_cnt) {
522 + in_use_blks += p->in_use_cnt;
523 + for (j = 0; j < p->in_use_cnt; j++) {
524 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
528 + DMWARN("There are %u BBR entries for device %s",
529 + in_use_blks, dm_kdevname(bbr_id->dev->dev));
532 + return in_use_blks;
536 + * bbr_search_remap_entry
538 + * Search remap entry for the specified sector. If found, return a pointer to
539 + * the table entry. Otherwise, return NULL.
541 +static struct bbr_table_entry * bbr_search_remap_entry(
542 + struct bbr_private *bbr_id,
545 + struct bbr_runtime_remap *p;
547 + spin_lock_irq(&bbr_id->bbr_id_lock);
548 + p = bbr_binary_search(bbr_id->remap_root, lsn);
549 + spin_unlock_irq(&bbr_id->bbr_id_lock);
551 + return (&p->remap);
560 + * If *lsn is in the remap table, return TRUE and modify *lsn,
561 + * else, return FALSE.
563 +static inline int bbr_remap(struct bbr_private *bbr_id,
566 + struct bbr_table_entry *e;
568 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
569 + e = bbr_search_remap_entry(bbr_id, *lsn);
571 + *lsn = e->replacement_sect;
581 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
582 + * table return TRUE, Else, return FALSE.
584 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
585 + u64 lsn, u64 nr_sects)
589 + if (atomic_read(&bbr_id->in_use_replacement_blks)) {
590 + for (cnt = 0, tmp = lsn;
592 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
593 + if (bbr_remap(bbr_id,&tmp)) {
604 + * Read the remap tables from disk and set up the initial remap tree.
606 +static int bbr_setup(struct bbr_private *bbr_id)
608 + struct bbr_table *table = bbr_id->bbr_table;
610 + struct io_region job;
611 + unsigned int error, offset;
614 + job.dev = bbr_id->dev->dev;
617 + /* Read and verify each BBR table sector individually. */
618 + for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
619 + job.sector = bbr_id->lba_table1 + i;
620 + page = virt_to_page(table);
621 + offset = (unsigned long)table & ~PAGE_MASK;
622 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
623 + if (rc && bbr_id->lba_table2) {
624 + job.sector = bbr_id->lba_table2 + i;
625 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
631 + rc = validate_bbr_table_sector(table);
636 + atomic_set(&bbr_id->in_use_replacement_blks,
637 + bbr_table_to_remap_list(bbr_id));
641 + DMERR("dm-bbr: error during device setup: %d", rc);
646 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
647 + struct buffer_head * bh,
650 + struct bbr_io_buffer * bbr_io_buf;
652 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
654 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
655 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
656 + bbr_io_buf->bbr_id = bbr_id;
657 + bbr_io_buf->sector = bh->b_rsector;
658 + bbr_io_buf->bh = bh;
659 + bbr_io_buf->rw = rw;
661 + DMWARN("Could not allocate from BBR I/O buffer pool!");
666 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
668 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
672 + * bbr_io_remap_error
673 + * @bbr_id: Private data for the BBR node.
674 + * @rw: READ or WRITE.
675 + * @starting_lsn: Starting sector of request to remap.
676 + * @count: Number of sectors in the request.
677 + * @buffer: Data buffer for the request.
679 + * For the requested range, try to write each sector individually. For each
680 + * sector that fails, find the next available remap location and write the
681 + * data to that new location. Then update the table and write both copies
682 + * of the table to disk. Finally, update the in-memory mapping and do any
683 + * other necessary bookkeeping.
685 +static int bbr_io_remap_error(struct bbr_private *bbr_id,
691 + struct bbr_table *bbr_table;
692 + struct io_region job;
694 + unsigned long table_sector_index;
695 + unsigned long table_sector_offset;
696 + unsigned long index;
697 + unsigned int offset_in_page, error;
702 + /* Nothing can be done about read errors. */
706 + job.dev = bbr_id->dev->dev;
709 + /* For each sector in the request. */
710 + for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
711 + job.sector = starting_lsn + lsn;
712 + page = virt_to_page(buffer);
713 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
714 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
716 + /* Find the next available relocation sector. */
717 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
718 + if (new_lsn >= bbr_id->nr_replacement_blks) {
719 + /* No more replacement sectors available. */
722 + new_lsn += bbr_id->start_replacement_sect;
724 + /* Write the data to its new location. */
725 + DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
726 + dm_kdevname(bbr_id->dev->dev),
727 + starting_lsn + lsn, new_lsn);
728 + job.sector = new_lsn;
729 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
731 + /* This replacement sector is bad.
732 + * Try the next one.
734 + DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
735 + dm_kdevname(bbr_id->dev->dev), new_lsn);
736 + atomic_inc(&bbr_id->in_use_replacement_blks);
740 + /* Add this new entry to the on-disk table. */
741 + table_sector_index = new_lsn -
742 + bbr_id->start_replacement_sect;
743 + table_sector_offset = table_sector_index /
744 + BBR_ENTRIES_PER_SECT;
745 + index = table_sector_index % BBR_ENTRIES_PER_SECT;
747 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
748 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
749 + bbr_table->entries[index].replacement_sect = new_lsn;
750 + bbr_table->in_use_cnt++;
751 + bbr_table->sequence_number++;
752 + bbr_table->crc = 0;
753 + bbr_table->crc = calculate_crc(INITIAL_CRC,
755 + sizeof(struct bbr_table));
757 + /* Write the table to disk. */
758 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
759 + page = virt_to_page(bbr_table);
760 + offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
761 + if (bbr_id->lba_table1) {
762 + job.sector = bbr_id->lba_table1 + table_sector_offset;
763 + rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
765 + if (bbr_id->lba_table2) {
766 + job.sector = bbr_id->lba_table2 + table_sector_offset;
767 + rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
769 + le_bbr_table_sector_to_cpu(bbr_table);
772 + /* Error writing one of the tables to disk. */
773 + DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
774 + dm_kdevname(bbr_id->dev->dev));
778 + /* Insert a new entry in the remapping binary-tree. */
779 + rc = bbr_insert_remap_entry(bbr_id,
780 + &bbr_table->entries[index]);
782 + DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
783 + dm_kdevname(bbr_id->dev->dev));
787 + atomic_inc(&bbr_id->in_use_replacement_blks);
795 + * bbr_io_process_request
797 + * For each sector in this request, check if the sector has already
798 + * been remapped. If so, process all previous sectors in the request,
799 + * followed by the remapped sector. Then reset the starting lsn and
800 + * count, and keep going with the rest of the request as if it were
801 + * a whole new request. If any of the sync_io's return an error,
802 + * call the remapper to relocate the bad sector(s).
804 +static int bbr_io_process_request(struct bbr_io_buffer *bbr_io_buf)
806 + struct bbr_private *bbr_id = bbr_io_buf->bbr_id;
807 + struct io_region job;
808 + u64 starting_lsn = bbr_io_buf->sector;
809 + u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
810 + u64 lsn, remapped_lsn;
811 + char *buffer = bbr_io_buf->bh->b_data;
812 + struct page *page = virt_to_page(buffer);
813 + unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
814 + unsigned int error;
815 + int rw = bbr_io_buf->rw;
818 + job.dev = bbr_id->dev->dev;
820 + /* For each sector in this request, check if this sector has
821 + * already been remapped. If so, process all previous sectors
822 + * in this request, followed by the remapped sector. Then reset
823 + * the starting lsn and count and keep going with the rest of
824 + * the request as if it were a whole new request.
826 + for (lsn = 0; lsn < count; lsn++) {
827 + remapped_lsn = starting_lsn + lsn;
828 + rc = bbr_remap(bbr_id, &remapped_lsn);
830 + /* This sector is fine. */
834 + /* Process all sectors in the request up to this one. */
836 + job.sector = starting_lsn;
838 + rc = dm_io_sync(1, &job, rw, page,
839 + offset_in_page, &error);
841 + /* If this I/O failed, then one of the
842 + * sectors in this request needs to be
845 + rc = bbr_io_remap_error(bbr_id, rw,
852 + buffer += (lsn << SECTOR_SHIFT);
853 + page = virt_to_page(buffer);
854 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
857 + /* Process the remapped sector. */
858 + job.sector = remapped_lsn;
860 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
862 + /* BUGBUG - Need more processing if this caused
863 + * an error. If this I/O failed, then the
864 + * existing remap is now bad, and we need to
865 + * find a new remap. Can't use
866 + * bbr_io_remap_error(), because the existing
867 + * map entry needs to be changed, not added
868 + * again, and the original table entry also
869 + * needs to be changed.
874 + buffer += SECTOR_SIZE;
875 + starting_lsn += (lsn + 1);
876 + count -= (lsn + 1);
878 + page = virt_to_page(buffer);
879 + offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
882 + /* Check for any remaining sectors after the last split. This
883 + * could potentially be the whole request, but that should be a
884 + * rare case because requests should only be processed by the
885 + * thread if we know an error occurred or they contained one or
886 + * more remapped sectors.
889 + job.sector = starting_lsn;
891 + rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
893 + /* If this I/O failed, then one of the sectors
894 + * in this request needs to be relocated.
896 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
910 + * This is the handler for the bbr_io_thread. It continuously loops,
911 + * taking I/O requests off its list and processing them. If nothing
912 + * is on the list, the thread goes back to sleep until specifically
915 + * I/O requests should only be sent to this thread if we know that:
916 + * a) the request contains at least one remapped sector.
918 + * b) the request caused an error on the normal I/O path.
919 + * This function uses synchronous I/O, so sending a request to this
920 + * thread that doesn't need special processing will cause severe
921 + * performance degradation.
923 +static void bbr_io_handler(void)
925 + struct bbr_io_buffer *bbr_io_buf;
926 + struct buffer_head *bh;
927 + unsigned long flags;
931 + /* Process bbr_io_list, one entry at a time. */
932 + spin_lock_irqsave(&bbr_io_list_lock, flags);
933 + if (list_empty(&bbr_io_list)) {
934 + /* No more items on the list. */
935 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
938 + bbr_io_buf = list_entry(bbr_io_list.next,
939 + struct bbr_io_buffer, bbr_io_list);
940 + list_del_init(&bbr_io_buf->bbr_io_list);
941 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
943 + rc = bbr_io_process_request(bbr_io_buf);
945 + /* Clean up and complete the original I/O. */
946 + bbr_io_buf->flags |= BBR_IO_HANDLED;
947 + bh = bbr_io_buf->bh;
948 + if (bh->b_end_io) {
949 + /* If this was the bbr_io_buf for an error on the
950 + * normal WRITE, don't free it here. It will be
951 + * freed later in bbr_callback()
953 + if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
954 + free_bbr_io_buf(bbr_io_buf);
955 + bh->b_end_io(bh, rc ? 0 : 1);
963 + * Place the specified bbr_io_buf on the thread's processing list.
965 +static void bbr_schedule_io(struct bbr_io_buffer *bbr_io_buf)
967 + unsigned long flags;
968 + spin_lock_irqsave(&bbr_io_list_lock, flags);
969 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
970 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
971 + dm_daemon_wake(bbr_io_thread);
977 + * If there are any remapped sectors on this object, send this request over
978 + * to the thread for processing. Otherwise send it down the stack normally.
980 +static int bbr_read(struct bbr_private *bbr_id,
981 + struct buffer_head *bh)
983 + struct bbr_io_buffer *bbr_io_buf;
985 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
986 + !bbr_remap_probe(bbr_id, bh->b_rsector,
987 + bh->b_size >> SECTOR_SHIFT)) {
988 + /* No existing remaps or this request doesn't
989 + * contain any remapped sectors.
991 + bh->b_rdev = bbr_id->dev->dev;
995 + /* This request has at least one remapped sector. */
996 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
998 + /* Can't get memory to track the I/O. */
1002 + bbr_schedule_io(bbr_io_buf);
1009 + * This is the callback for normal write requests. Check for an error
1010 + * during the I/O, and send to the thread for processing if necessary.
1012 +static int bbr_callback(struct dm_target *ti, struct buffer_head *bh, int rw,
1013 + int error, union map_info *map_context)
1015 + struct bbr_io_buffer *bbr_io_buf = map_context->ptr;
1020 + /* Will try to relocate the WRITE if:
1021 + * - It is an error, and
1022 + * - It is not an error of BBR relocation, and
1024 + if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1025 + DMERR("dm-bbr: device %s: Write failure on sector %lu. Scheduling for retry.",
1026 + dm_kdevname(bh->b_rdev),
1027 + (unsigned long)bbr_io_buf->sector);
1028 + /* Indicate this bbr_io_buf is for an error on normal WRITE */
1029 + bbr_io_buf->flags |= BBR_IO_RELOCATE;
1030 + bbr_schedule_io(bbr_io_buf);
1031 + /* Returns >0 so that DM will let us retry the I/O */
1035 + free_bbr_io_buf(bbr_io_buf);
1042 + * If there are any remapped sectors on this object, send the request over
1043 + * to the thread for processing. Otherwise, register for callback
1044 + * notification, and send the request down normally.
1046 +static int bbr_write(struct bbr_private *bbr_id,
1047 + struct buffer_head *bh,
1048 + union map_info *map_context)
1050 + struct bbr_io_buffer *bbr_io_buf;
1053 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1054 + if (!bbr_io_buf) {
1055 + /* Can't get memory to track the I/O. */
1059 + if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1060 + !bbr_remap_probe(bbr_id, bh->b_rsector,
1061 + bh->b_size >> SECTOR_SHIFT)) {
1062 + /* No existing remaps or this request
1063 + * contains no remapped sectors.
1065 + bh->b_rdev = bbr_id->dev->dev;
1066 + map_context->ptr = bbr_io_buf;
1068 + /* This request contains at least one remapped sector. */
1069 + bbr_schedule_io(bbr_io_buf);
1077 + * Construct a bbr mapping
1079 +static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1081 + struct bbr_private *bbr_id;
1082 + unsigned long block_size;
1087 + ti->error = "dm-bbr requires exactly 8 arguments: "
1088 + "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1092 + bbr_id = bbr_alloc_private();
1094 + ti->error = "dm-bbr: Error allocating bbr private data.";
1098 + bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1099 + bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1100 + bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1101 + bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1102 + bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1103 + bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1104 + block_size = simple_strtoul(argv[7], &end, 10);
1105 + bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1107 + bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1109 + if (!bbr_id->bbr_table) {
1110 + ti->error = "dm-bbr: Error allocating bbr table.";
1114 + if (dm_get_device(ti, argv[0], 0, ti->len,
1115 + dm_table_get_mode(ti->table), &bbr_id->dev)) {
1116 + ti->error = "dm-bbr: Device lookup failed";
1120 + /* Using a semaphore here is probably overkill,
1121 + * but at least it will be correct.
1123 + down(&bbr_instances_lock);
1124 + if (bbr_instances == 0) {
1125 + rc = bbr_global_init();
1127 + up(&bbr_instances_lock);
1132 + up(&bbr_instances_lock);
1134 + rc = bbr_setup(bbr_id);
1136 + ti->error = "dm-bbr: Device setup failed";
1140 + ti->private = bbr_id;
1144 + down(&bbr_instances_lock);
1146 + if (bbr_instances == 0) {
1147 + bbr_global_cleanup();
1149 + up(&bbr_instances_lock);
1152 + dm_put_device(ti, bbr_id->dev);
1154 + bbr_free_private(bbr_id);
1159 +static void bbr_dtr(struct dm_target *ti)
1161 + struct bbr_private *bbr_id = ti->private;
1163 + dm_put_device(ti, bbr_id->dev);
1164 + bbr_free_private(bbr_id);
1166 + down(&bbr_instances_lock);
1168 + if (bbr_instances == 0) {
1169 + bbr_global_cleanup();
1171 + up(&bbr_instances_lock);
1174 +static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw,
1175 + union map_info *map_context)
1177 + struct bbr_private *bbr_id = ti->private;
1179 + bh->b_rsector += bbr_id->offset;
1180 + map_context->ptr = NULL;
1184 + return bbr_read(bbr_id, bh);
1186 + return bbr_write(bbr_id, bh, map_context);
1192 +static int bbr_status(struct dm_target *ti, status_type_t type,
1193 + char *result, unsigned int maxlen)
1195 + struct bbr_private *bbr_id = ti->private;
1198 + case STATUSTYPE_INFO:
1202 + case STATUSTYPE_TABLE:
1203 + snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1204 + dm_kdevname(bbr_id->dev->dev),
1205 + bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
1206 + bbr_id->nr_sects_bbr_table,
1207 + bbr_id->start_replacement_sect,
1208 + bbr_id->nr_replacement_blks,
1209 + bbr_id->blksize_in_sects << SECTOR_SHIFT);
1215 +static struct target_type bbr_target = {
1217 + module: THIS_MODULE,
1221 + end_io: bbr_callback,
1222 + status: bbr_status,
1225 +int __init dm_bbr_init(void)
1227 + int r = dm_register_target(&bbr_target);
1230 + DMERR("dm-bbr: register failed %d", r);
1235 +void __exit dm_bbr_exit(void)
1237 + int r = dm_unregister_target(&bbr_target);
1240 + DMERR("dm-bbr: unregister failed %d", r);
1243 +module_init(dm_bbr_init);
1244 +module_exit(dm_bbr_exit);
1245 +MODULE_LICENSE("GPL");
1246 diff -urN linux-2.4.24.org/drivers/md/dm-bbr.h linux-2.4.24/drivers/md/dm-bbr.h
1247 --- linux-2.4.24.org/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1248 +++ linux-2.4.24/drivers/md/dm-bbr.h 2004-01-18 16:03:13.101545929 +0100
1251 + * (C) Copyright IBM Corp. 2002, 2003
1253 + * This program is free software; you can redistribute it and/or modify
1254 + * it under the terms of the GNU General Public License as published by
1255 + * the Free Software Foundation; either version 2 of the License, or
1256 + * (at your option) any later version.
1258 + * This program is distributed in the hope that it will be useful,
1259 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1260 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1261 + * the GNU General Public License for more details.
1263 + * You should have received a copy of the GNU General Public License
1264 + * along with this program; if not, write to the Free Software
1265 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1267 + * linux/drivers/md/dm-bbr.h
1269 + * Bad-block-relocation (BBR) target for device-mapper.
1271 + * The BBR target is designed to remap I/O write failures to another safe
1272 + * location on disk. Note that most disk drives have BBR built into them,
1273 + * this means that our software BBR will only be activated when all hardware
1274 + * BBR replacement sectors have been used.
1277 +#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1278 +#define BBR_ENTRIES_PER_SECT 31
1279 +#define BBR_NR_BUFS 128
1280 +#define INITIAL_CRC 0xFFFFFFFF
1281 +#define CRC_POLYNOMIAL 0xEDB88320L
1284 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1285 + * Use these in place of %Ld, %Lu, and %Lx.
1287 +#if BITS_PER_LONG > 32
1288 +#define PFU64 "%lu"
1290 +#define PFU64 "%Lu"
1294 + * struct bbr_table_entry
1295 + * @bad_sect: LBA of bad location.
1296 + * @replacement_sect: LBA of new location.
1298 + * Structure to describe one BBR remap.
1300 +struct bbr_table_entry {
1302 + u64 replacement_sect;
1306 + * struct bbr_table
1307 + * @signature: Signature on each BBR table sector.
1308 + * @crc: CRC for this table sector.
1309 + * @sequence_number: Used to resolve conflicts when primary and secondary
1310 + * tables do not match.
1311 + * @in_use_cnt: Number of in-use table entries.
1312 + * @entries: Actual table of remaps.
1314 + * Structure to describe each sector of the metadata table. Each sector in this
1315 + * table can describe 31 remapped sectors.
1320 + u32 sequence_number;
1322 + struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1326 + * struct bbr_runtime_remap
1328 + * Node in the binary tree used to keep track of remaps.
1330 +struct bbr_runtime_remap {
1331 + struct bbr_table_entry remap;
1332 + struct bbr_runtime_remap *left;
1333 + struct bbr_runtime_remap *right;
1337 + * struct bbr_private
1338 + * @dev: Info about underlying device.
1339 + * @bbr_table: Copy of metadata table.
1340 + * @remap_root: Binary tree containing all remaps.
1341 + * @offset: LBA of data area.
1342 + * @lba_table1: LBA of primary BBR table.
1343 + * @lba_table2: LBA of secondary BBR table.
1344 + * @nr_sects_bbr_table: Size of each BBR table.
1345 + * @nr_replacement_blks: Number of replacement blocks.
1346 + * @start_replacement_sect: LBA of start of replacement blocks.
1347 + * @blksize_in_sects: Size of each block.
1348 + * @in_use_replacement_blks: Current number of remapped blocks.
1349 + * @bbr_id_lock: Lock for the binary tree.
1351 + * Private data for each BBR target.
1353 +struct bbr_private {
1354 + struct dm_dev *dev;
1355 + struct bbr_table *bbr_table;
1356 + struct bbr_runtime_remap *remap_root;
1360 + u64 nr_sects_bbr_table;
1361 + u64 start_replacement_sect;
1362 + u64 nr_replacement_blks;
1363 + u32 blksize_in_sects;
1364 + atomic_t in_use_replacement_blks;
1365 + spinlock_t bbr_id_lock;
1368 +#define BBR_IO_HANDLED (1<<0)
1369 +#define BBR_IO_RELOCATE (1<<1)
1372 + * struct bbr_io_buffer
1373 + * @bbr_io_list: Thread's list of bbr_io_buf's.
1374 + * @bbr_id: Object for this request.
1375 + * @bh: Original buffer_head.
1376 + * @sector: Original sector
1377 + * @flags: Operation flag (BBR_IO_*)
1378 + * @rw: READ or WRITE.
1379 + * @rc: Return code from bbr_io_handler.
1381 + * Structure used to track each write request.
1383 +struct bbr_io_buffer {
1384 + struct list_head bbr_io_list;
1385 + struct bbr_private *bbr_id;
1386 + struct buffer_head *bh;
1393 diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
1394 --- linux-2.4.24.org/drivers/md/dm.c 2004-01-18 15:09:18.533171353 +0100
1395 +++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:59:40.046635861 +0100
1396 @@ -951,13 +951,23 @@
1398 DECLARE_WAITQUEUE(wait, current);
1400 - down_write(&md->lock);
1401 + /* Flush IO to the origin device */
1402 + down_read(&md->lock);
1403 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1404 + up_read(&md->lock);
1408 + fsync_dev_lockfs(md->dev);
1409 + up_read(&md->lock);
1413 - * First we set the BLOCK_IO flag so no more ios will be
1415 + * Set the BLOCK_IO flag so no more ios will be mapped.
1417 + down_write(&md->lock);
1418 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1419 + unlockfs(md->dev);
1420 up_write(&md->lock);
1425 /* did we flush everything ? */
1426 if (atomic_read(&md->pending)) {
1427 + unlockfs(md->dev);
1428 clear_bit(DMF_BLOCK_IO, &md->flags);
1431 @@ -1017,6 +1028,7 @@
1432 md->deferred = NULL;
1433 up_write(&md->lock);
1435 + unlockfs(md->dev);
1436 flush_deferred_io(def);
1437 run_task_queue(&tq_disk);
1439 diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
1440 --- linux-2.4.24.org/drivers/md/dm-snapshot.c 2004-01-18 15:09:18.569163966 +0100
1441 +++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 16:02:40.858328124 +0100
1444 /* List of snapshots for this origin */
1445 struct list_head snapshots;
1447 + /* Count of snapshots and origins referencing this structure. */
1448 + unsigned int count;
1452 @@ -155,6 +158,35 @@
1456 + * Allocate and initialize an origin structure.
1458 +static struct origin * __alloc_origin(kdev_t dev)
1460 + struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1463 + INIT_LIST_HEAD(&o->hash_list);
1464 + INIT_LIST_HEAD(&o->snapshots);
1465 + __insert_origin(o);
1470 +static void __get_origin(struct origin *o)
1475 +static void __put_origin(struct origin *o)
1478 + if (o->count == 0) {
1479 + list_del(&o->hash_list);
1485 * Make a note of the snapshot and its origin so we can look it
1486 * up when the origin has a write on it.
1488 @@ -168,20 +200,37 @@
1492 - o = kmalloc(sizeof(*o), GFP_KERNEL);
1493 + o = __alloc_origin(dev);
1495 up_write(&_origins_lock);
1500 - /* Initialise the struct */
1501 - INIT_LIST_HEAD(&o->snapshots);
1504 + list_add_tail(&snap->list, &o->snapshots);
1506 - __insert_origin(o);
1507 + up_write(&_origins_lock);
1511 +static int register_origin(kdev_t dev)
1515 + down_write(&_origins_lock);
1516 + o = __lookup_origin(dev);
1520 + o = __alloc_origin(dev);
1522 + up_write(&_origins_lock);
1527 - list_add_tail(&snap->list, &o->snapshots);
1530 up_write(&_origins_lock);
1532 @@ -195,11 +244,18 @@
1533 o = __lookup_origin(s->origin->dev);
1536 - if (list_empty(&o->snapshots)) {
1537 - list_del(&o->hash_list);
1542 + up_write(&_origins_lock);
1545 +static void unregister_origin(kdev_t dev)
1549 + down_write(&_origins_lock);
1550 + o = __lookup_origin(dev);
1552 up_write(&_origins_lock);
1559 - /* Flush IO to the origin device */
1560 - fsync_dev(s->origin->dev);
1562 /* Add snapshot to the list of snapshots for this origin */
1563 if (register_snapshot(s)) {
1565 @@ -1093,6 +1146,13 @@
1569 + r = register_origin(dev->dev);
1571 + ti->error = "Cannot register origin";
1572 + dm_put_device(ti, dev);
1579 @@ -1100,6 +1160,7 @@
1580 static void origin_dtr(struct dm_target *ti)
1582 struct dm_dev *dev = (struct dm_dev *) ti->private;
1583 + unregister_origin(dev->dev);
1584 dm_put_device(ti, dev);
1587 diff -urN linux-2.4.24.org/drivers/md/dm-sparse.c linux-2.4.24/drivers/md/dm-sparse.c
1588 --- linux-2.4.24.org/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1589 +++ linux-2.4.24/drivers/md/dm-sparse.c 2004-01-18 16:04:48.284615142 +0100
1591 +/* -*- linux-c -*- */
1594 + * Copyright (c) International Business Machines Corp., 2002
1596 + * This program is free software; you can redistribute it and/or modify
1597 + * it under the terms of the GNU General Public License as published by
1598 + * the Free Software Foundation; either version 2 of the License, or
1599 + * (at your option) any later version.
1601 + * This program is distributed in the hope that it will be useful,
1602 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
1603 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1604 + * the GNU General Public License for more details.
1606 + * You should have received a copy of the GNU General Public License
1607 + * along with this program; if not, write to the Free Software
1608 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1610 + * linux/drivers/md/dm-sparse.c
1612 + * Sparse target for device-mapper.
1614 + * This target provides the ability to create a sparse device. This
1615 + * allows a device to pretend to be larger than it really is.
1618 +#include <linux/module.h>
1619 +#include <linux/init.h>
1620 +#include <linux/blkdev.h>
1621 +#include <linux/slab.h>
1622 +#include <linux/mempool.h>
1623 +#include <linux/vmalloc.h>
1628 +#define MAX_HASH_CHAIN_ENTRIES 10
1629 +#define NAME_SIZE 127
1638 +// Entries in the sparse remapping structure
1639 +struct sparse_hash_entry {
1640 + u64 org_chunk; // Chunk number, not LBA.
1641 + u64 sparse_chunk; // Chunk number, not LBA.
1642 + struct sparse_hash_entry * next;
1643 + struct sparse_hash_entry * prev;
1646 +//Private data structure
1647 +struct sparse_volume {
1648 + struct dm_dev *dev;
1649 + struct rw_semaphore sparse_semaphore;
1650 + struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1651 + struct sparse_hash_entry * free_hash_list;
1652 + kmem_cache_t * hash_slab;
1653 + mempool_t * hash_pool;
1655 + u32 chunk_size; // Sectors.
1656 + u32 chunk_shift; // Shift value for chunk size.
1657 + u32 num_chunks; // In this volume.
1658 + u32 next_cow_entry; // Index into current COW table.
1659 + u64 current_cow_sector; // LOGICAL sector of current COW table.
1660 + u32 next_free_chunk; // Index of next free chunk (not LBA!).
1661 + u32 hash_table_size; // Size of the hash table for the remap.
1663 + u64 cow_table[64]; // One sector's worth of COW tables.
1666 +/*************************** OLD SERVICES ****************************/
1668 +/* computes log base 2 of value */
1669 +inline int log2(u32 value) //ok to change to u32?
1672 + long tmp; //ok to change to long?
1677 + while (!(tmp & 1)) {
1688 +/********************************* Functions *********************************/
1690 +/***************************** Hash Functions *****************************/
1692 +/* Take and initialize from the free hash list */
1693 +static struct sparse_hash_entry *
1694 +allocate_sparse_hash_entry( struct sparse_volume * volume,
1696 + u64 sparse_chunk )
1698 + struct sparse_hash_entry * hash_entry;
1700 + hash_entry = volume->free_hash_list;
1701 + if ( hash_entry ) { // should always succeed because these are preallocated
1702 + volume->free_hash_list = hash_entry->next;
1703 + hash_entry->org_chunk = org_chunk;
1704 + hash_entry->sparse_chunk = sparse_chunk;
1705 + hash_entry->next = NULL;
1706 + hash_entry->prev = NULL;
1709 + return hash_entry;
1713 + * This function inserts a new entry into a sparse hash chain, immediately
1714 + * following the specified entry. This function should not be used to add
1715 + * an entry into an empty list, or as the first entry in an existing list.
1716 + * For that case, use insert_sparse_map_entry_at_head().
1718 +static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1719 + struct sparse_hash_entry * base )
1721 + entry->next = base->next;
1722 + entry->prev = base;
1723 + base->next = entry;
1724 + if ( entry->next ) {
1725 + entry->next->prev = entry;
1731 + * This function inserts a new entry into a sparse chain as the first
1732 + * entry in the chain.
1734 +static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1735 + struct sparse_hash_entry ** head )
1737 + entry->next = *head;
1738 + entry->prev = NULL;
1740 + if ( entry->next ) {
1741 + entry->next->prev = entry;
1747 + * Delete all items in a single chain in the hash table.
1749 +static int delete_sparse_hash_chain( struct sparse_volume * vol,
1750 + struct sparse_hash_entry * head )
1752 + struct sparse_hash_entry * next;
1755 + next = head->next;
1756 + mempool_free( head, vol->hash_pool );
1763 + * This function will search the hash chain that is anchored at the
1764 + * specified head pointer. If the chunk number is found, a pointer to that
1765 + * entry in the chain is set, and a 1 is returned. If the chunk is not
1766 + * found, a pointer to the previous entry is set and 0 is returned. If the
1767 + * return pointer is NULL, this means either the list is empty, or the
1768 + * specified sector should become the first list item.
1770 +static int search_sparse_hash_chain( u64 chunk,
1771 + struct sparse_hash_entry * head,
1772 + struct sparse_hash_entry ** result )
1774 + struct sparse_hash_entry * curr = head;
1775 + struct sparse_hash_entry * prev = head;
1776 + while ( curr && curr->org_chunk < chunk ) {
1778 + curr = curr->next;
1780 + if (!curr) { // Either an empty chain or went off the end of the chain.
1784 + else if ( curr->org_chunk != chunk ) {
1785 + *result = curr->prev;
1795 + * This function takes a cow table entry (from the on-disk data), and
1796 + * converts it into an appropriate entry for the sparse map, and
1797 + * inserts it into the appropriate map for the specified volume.
1799 +static int add_cow_entry_to_sparse_map( u64 org_chunk,
1801 + struct sparse_volume * volume )
1803 + struct sparse_hash_entry * new_entry;
1804 + struct sparse_hash_entry * target_entry;
1808 + new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1813 + hash_value = (long)org_chunk % volume->hash_table_size;
1815 + if (! search_sparse_hash_chain( org_chunk,
1816 + volume->sparse_map[hash_value],
1817 + &target_entry ) ) {
1818 + //should always take this path
1820 + if ( target_entry ) {
1821 + insert_sparse_hash_entry( new_entry, target_entry );
1824 + insert_sparse_hash_entry_at_head
1825 + ( new_entry, &(volume->sparse_map[hash_value]) );
1833 + * Construct the initial hash table state based on
1834 + * existing COW tables on the disk.
1836 +static int build_sparse_maps(struct sparse_volume * volume)
1838 + int rc = 0, done = 0;
1839 + struct io_region job;
1840 + struct page * page;
1841 + unsigned int error, offset;
1845 + // Read in one sector's worth of COW tables.
1846 + job.dev = volume->dev->dev;
1847 + job.sector = volume->current_cow_sector;
1849 + page = virt_to_page(volume->cow_table);
1850 + offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1851 + rc = dm_io_sync(1, &job, READ, page, offset, &error);
1856 + // Translate every valid COW table entry into
1857 + // a sparse map entry.
1858 + for ( volume->next_cow_entry = 0;
1860 + volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1861 + volume->cow_table[volume->next_cow_entry] !=
1862 + 0xffffffffffffffff;
1864 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
1866 + if ( (rc = add_cow_entry_to_sparse_map
1867 + ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1868 + volume->next_free_chunk, volume ))) {
1872 + // Move on to the next sector if necessary.
1873 + if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1874 + volume->current_cow_sector++;
1883 +/************************* Other Functions ************************/
1886 + * Function: sparse_remap_chunk
1888 + * This function performs a sector remap on a sparse volume. This should
1889 + * be called from the I/O path, It first determines the base sector
1890 + * of the chunk containing the specified sector, and saves the remainder.
1891 + * Then it performs a search through the sparse map for the specified
1892 + * volume. If a match is found, the sector number is changed to the new
1893 + * value. If no match is found, the value is left the same, meaning the
1894 + * chunk has not been remapped.
1896 +static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1899 + struct sparse_hash_entry * result;
1905 + down_read(&sparse_volume->sparse_semaphore);
1907 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1908 + chunk = *sector >> sparse_volume->chunk_shift;
1909 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1911 + if ( search_sparse_hash_chain( chunk,
1912 + sparse_volume->sparse_map[hash_value],
1914 + *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1918 + up_read(&sparse_volume->sparse_semaphore);
1922 +/* Function: sparse_cow_write
1924 + * Check this sparse node to see if the given sector/chunk has been
1925 + * remapped yet. If it hasn't, create a new hash table entry, update the
1926 + * in-memory COW table, write the COW table to disk.
1929 +static int sparse_cow_write( struct sparse_volume * sparse_volume,
1932 + struct sparse_hash_entry * target_entry, * new_map_entry;
1933 + struct io_region job;
1934 + struct page * page;
1935 + char * cow = NULL;
1936 + unsigned int error, offset;
1938 + u32 hash_value = 0;
1942 + down_write(&sparse_volume->sparse_semaphore);
1944 + remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1945 + chunk = *sector >> sparse_volume->chunk_shift;
1946 + hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1948 + if ( search_sparse_hash_chain( chunk,
1949 + sparse_volume->sparse_map[hash_value],
1950 + &target_entry) ) {
1952 + ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1958 + // Is there enough room left on this sparse to remap this chunk?
1959 + if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1960 + DMERR("dm-sparse: full no new remaps allowed\n");
1965 + // Create and initialize a new hash table entry for the new remap.
1966 + new_map_entry = allocate_sparse_hash_entry
1967 + (sparse_volume, chunk, sparse_volume->next_free_chunk);
1968 + if ( ! new_map_entry ) {
1969 + // Can't get memory for map entry. Disable this sparse.
1970 + DMERR("dm-sparse: memory error allocating hash entry\n");
1975 + // Always write the COW table so it's safe
1976 + cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1978 + // Can't get I/O buffer. Disable this sparse.
1979 + DMERR("dm-sparse: memory error allocating COW table buffer");
1984 + // Add the entry to the hash table.
1985 + if ( target_entry ) {
1986 + insert_sparse_hash_entry( new_map_entry, target_entry );
1989 + insert_sparse_hash_entry_at_head
1991 + &(sparse_volume->sparse_map[hash_value]) );
1994 + sparse_volume->next_free_chunk++;
1996 + // Update the appropriate entry in the COW table.
1997 + sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1998 + cpu_to_le64(chunk);
1999 + sparse_volume->next_cow_entry++;
2001 + memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
2003 + //because of ordering issues needs to be synchronous
2004 + job.dev = sparse_volume->dev->dev;
2005 + job.sector = sparse_volume->current_cow_sector;
2007 + page = virt_to_page(cow);
2008 + offset = (unsigned long)cow & ~PAGE_MASK;
2009 + dm_io_sync(1, &job, WRITE, page, offset, &error);
2011 + // Update the in-memory COW table values.
2012 + if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
2014 + sparse_volume->next_cow_entry = 0;
2015 + sparse_volume->current_cow_sector++;
2016 + memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
2019 + *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
2025 + up_write(&sparse_volume->sparse_semaphore);
2033 +/************************ EXPORT FUNCTIONS ************************/
2036 + * Function: sparse_dtr
2038 +static void sparse_dtr( struct dm_target *ti )
2040 + struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2045 + if (vol->sparse_map) {
2046 + for ( i = 0; i < vol->hash_table_size; i++ ) {
2047 + delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2049 + delete_sparse_hash_chain( vol, vol->free_hash_list );
2050 + vfree(vol->sparse_map);
2053 + if (vol->hash_pool)
2054 + mempool_destroy(vol->hash_pool);
2056 + if (vol->hash_slab)
2057 + kmem_cache_destroy(vol->hash_slab);
2059 + dm_put_device(ti, vol->dev);
2061 + if (vol->dm_io_flag) {
2070 + * Function: sparse_ctr
2072 +static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2074 + int i, rc = -EINVAL;
2075 + struct sparse_hash_entry *new_entry;
2076 + struct sparse_volume *vol;
2077 + struct dm_dev *dev;
2078 + u32 chunk_size, chunks;
2080 + char* end, slab_name[NAME_SIZE+1];
2082 + if ( argc != 4 ) {
2083 + ti->error="dm-sparse: wrong number of arguments";
2087 + start = simple_strtoull(argv[1], &end, 10);
2089 + ti->error="dm-sparse: Invalid first chunk lba";
2093 + chunk_size = simple_strtoul(argv[2], &end, 10);
2095 + ti->error="dm-sparse: Invalid chunk_size";
2099 + chunks = simple_strtoul(argv[3], &end, 10);
2101 + ti->error="dm-sparse: Invalid number of chunks";
2105 + if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2106 + dm_table_get_mode(ti->table), &dev ) ) {
2107 + ti->error = "dm-sparse: Device lookup failed";
2111 + vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2113 + ti->error = "dm-sparse: Memory allocation for private-data failed";
2118 + memset( vol, 0, sizeof(struct sparse_volume) );
2120 + rc = dm_io_get(1);
2122 + ti->error = "dm-sparse: failed to initialize dm-io.";
2128 + vol->dm_io_flag = 1;
2129 + vol->chunk_size = chunk_size;
2130 + vol->chunk_shift = log2(chunk_size);
2131 + vol->num_chunks = chunks;
2132 + vol->current_cow_sector = 1;
2133 + vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2134 + vol->start = start;
2136 + init_rwsem(&vol->sparse_semaphore);
2138 + snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2139 + vol->hash_slab = kmem_cache_create(slab_name,
2140 + sizeof(struct sparse_hash_entry),
2141 + 0, SLAB_HWCACHE_ALIGN,
2143 + if ( ! vol->hash_slab ) {
2144 + ti->error = "dm-sparse: memory allocation error in hash slab create";
2148 + vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2149 + mempool_free_slab,
2151 + if ( ! vol->hash_pool ) {
2152 + ti->error = "dm-sparse: memory allocation error in hash pool create";
2157 + // Sparse hash table
2158 + vol->sparse_map = vmalloc( vol->hash_table_size *
2159 + sizeof( struct sparse_hash_entry * ) );
2160 + if ( ! vol->sparse_map ) {
2161 + ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2166 + memset( vol->sparse_map, 0, vol->hash_table_size *
2167 + sizeof( struct sparse_hash_entry * ) );
2169 + for ( i = 0; i < chunks; i++ ) {
2171 + new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2172 + if ( ! new_entry ) {
2173 + ti->error="dm-sparse: memory allocation error in hash table setup";
2178 + new_entry->next = vol->free_hash_list;
2179 + vol->free_hash_list = new_entry;
2182 + rc = build_sparse_maps(vol);
2184 + ti->error = "dm-sparse: error building hash tables";
2189 + ti->private = vol;
2193 + dm_put_device(ti, dev);
2198 + * Function: sparse_map
2200 +static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2201 + union map_info *map_context )
2203 + struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2204 + u64 sector = bh->b_rsector;
2207 + // Check if this sector has been remapped
2208 + rc = sparse_remap_chunk( volume, §or );
2210 + if ( rc < 0 ) { //Error
2214 + if ( rc == 0 ) { // Remapped I/O : read or write same logic
2215 + bh->b_rsector = volume->start + sector;
2216 + bh->b_rdev = volume->dev->dev;
2220 + // ( Previously )Un-mapped: read / write different logic
2222 + if ( rw ) { //write :
2223 + rc = sparse_cow_write( volume, §or );
2225 + if ( rc < 0 ) { //Error
2229 + bh->b_rsector = volume->start + sector;
2230 + bh->b_rdev = volume->dev->dev;
2234 + //Reading something that was never written
2235 + //return zeros and indicate complete
2236 + memset(bh->b_data, 0x0, bh->b_size);
2237 + bh->b_end_io(bh, 1);
2241 +static int sparse_status( struct dm_target *ti, status_type_t type,
2242 + char *result, unsigned int maxlen )
2244 + struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2248 + case STATUSTYPE_INFO:
2249 + snprintf( result, maxlen, "%d%%",
2250 + ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2253 + case STATUSTYPE_TABLE:
2254 + snprintf( result, maxlen, "%s %Lu %u %u",
2255 + dm_kdevname(vol->dev->dev), vol->start,
2256 + vol->chunk_size, vol->num_chunks );
2266 +/****************** FUNCTION TABLE **********************/
2268 +static struct target_type sparse_target = {
2270 + .module = THIS_MODULE,
2271 + .ctr = sparse_ctr,
2272 + .dtr = sparse_dtr,
2273 + .map = sparse_map,
2274 + .status = sparse_status,
2277 +/********************* REGISTRATION *****************/
2279 +int __init sparse_init(void)
2281 + int rc = dm_register_target(&sparse_target);
2284 + DMWARN("sparse target registration failed");
2289 +void __exit sparse_exit(void)
2291 + if (dm_unregister_target(&sparse_target) )
2292 + DMWARN("sparse target unregistration failed");
2297 +module_init(sparse_init);
2298 +module_exit(sparse_exit);
2299 +MODULE_LICENSE("GPL");
2300 diff -urN linux-2.4.24.org/drivers/md/lvm.c linux-2.4.24/drivers/md/lvm.c
2301 --- linux-2.4.24.org/drivers/md/lvm.c 2004-01-18 14:58:09.106704262 +0100
2302 +++ linux-2.4.24/drivers/md/lvm.c 2004-01-18 15:57:55.568033496 +0100
2304 #define DEVICE_OFF(device)
2305 #define LOCAL_END_REQUEST
2307 -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
2308 -/* #define LVM_VFS_ENHANCEMENT */
2310 #include <linux/config.h>
2311 #include <linux/module.h>
2312 #include <linux/kernel.h>
2313 @@ -2250,12 +2247,8 @@
2314 if (lv_ptr->lv_access & LV_SNAPSHOT) {
2315 lv_t *org = lv_ptr->lv_snapshot_org, *last;
2317 - /* sync the original logical volume */
2318 - fsync_dev(org->lv_dev);
2319 -#ifdef LVM_VFS_ENHANCEMENT
2320 /* VFS function call to sync and lock the filesystem */
2321 fsync_dev_lockfs(org->lv_dev);
2324 down_write(&org->lv_lock);
2325 org->lv_access |= LV_SNAPSHOT_ORG;
2326 @@ -2281,11 +2274,9 @@
2328 set_device_ro(lv_ptr->lv_dev, 1);
2330 -#ifdef LVM_VFS_ENHANCEMENT
2331 /* VFS function call to unlock the filesystem */
2332 if (lv_ptr->lv_access & LV_SNAPSHOT)
2333 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
2336 lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
2337 lvm_fs_create_lv(vg_ptr, lv_ptr);
2338 diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
2339 --- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 15:09:18.620153502 +0100
2340 +++ linux-2.4.24/drivers/md/Makefile 2004-01-18 16:04:48.278616388 +0100
2342 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
2344 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
2345 +obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
2346 +obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
2348 include $(TOPDIR)/Rules.make
2350 diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c
2351 --- linux-2.4.24.org/drivers/md/md.c 2004-01-18 14:58:09.227678566 +0100
2352 +++ linux-2.4.24/drivers/md/md.c 2004-01-18 16:04:27.702900923 +0100
2353 @@ -2146,6 +2146,8 @@
2357 + if (mddev->curr_resync)
2358 + info.state |= (1 << MD_ARRAY_RECOVERY_RUNNING);
2359 SET_FROM_SB(active_disks);
2360 SET_FROM_SB(working_disks);
2361 SET_FROM_SB(failed_disks);
2362 diff -urN linux-2.4.24.org/drivers/md/multipath.c linux-2.4.24/drivers/md/multipath.c
2363 --- linux-2.4.24.org/drivers/md/multipath.c 2004-01-18 14:58:09.254672832 +0100
2364 +++ linux-2.4.24/drivers/md/multipath.c 2004-01-18 16:04:38.291691263 +0100
2365 @@ -139,15 +139,16 @@
2366 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2368 multipath_conf_t *conf = mddev_to_conf(mddev);
2369 - int i, disks = MD_SB_DISKS;
2373 * Later we do read balancing on the read side
2374 * now we use the first available disk.
2377 - for (i = 0; i < disks; i++) {
2378 + for (i = 0; i < conf->nr_disks; i++) {
2379 if (conf->multipaths[i].operational) {
2380 + /* first operational is winner! */
2381 *rdev = conf->multipaths[i].dev;
2386 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2388 + atomic_dec(&mp_bh->multipath->nr_pending);
2391 * this branch is our 'one multipath IO has finished' event handler:
2393 @@ -223,19 +226,39 @@
2397 - * This routine returns the disk from which the requested read should
2399 + * Multipath read balance ...
2403 + * If no active paths
2407 + * If active paths == 1
2409 + * - 1st active path encountered
2411 + * If active paths > 1
2413 + * - 1st idle active path encountered
2414 + * - else ... the active path doing the least amount of work.
2417 static int multipath_read_balance (multipath_conf_t *conf)
2421 - for (disk = 0; disk < conf->raid_disks; disk++)
2422 - if (conf->multipaths[disk].operational)
2426 + int i, disk=-1, nr_pending, least_pending=0;
2428 + for (i=0; i<conf->nr_disks; i++) {
2429 + if (conf->multipaths[i].operational) {
2430 + nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2431 + if (nr_pending==0 || conf->working_disks==1)
2433 + if (least_pending==0 || nr_pending<least_pending) {
2435 + least_pending = nr_pending;
2442 static int multipath_make_request (mddev_t *mddev, int rw,
2444 struct buffer_head *bh_req;
2445 struct multipath_bh * mp_bh;
2446 struct multipath_info *multipath;
2449 if (!buffer_locked(bh))
2451 @@ -267,7 +291,16 @@
2453 * read balancing logic:
2455 - multipath = conf->multipaths + multipath_read_balance(conf);
2456 + disk = multipath_read_balance(conf);
2458 + printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2459 + buffer_IO_error(bh);
2463 + multipath = conf->multipaths + disk;
2464 + mp_bh->multipath = multipath;
2465 + atomic_inc(&multipath->nr_pending);
2467 bh_req = &mp_bh->bh_req;
2468 memcpy(bh_req, bh, sizeof(*bh));
2469 @@ -331,13 +364,14 @@
2471 multipath_conf_t *conf = mddev_to_conf(mddev);
2472 struct multipath_info * multipaths = conf->multipaths;
2473 - int disks = MD_SB_DISKS;
2474 int other_paths = 1;
2478 + struct md_list_head *tmp;
2480 if (conf->working_disks == 1) {
2482 - for (i = 0; i < disks; i++) {
2483 + for (i = 0; i < MD_SB_DISKS; i++) {
2484 if (multipaths[i].spare) {
2487 @@ -351,16 +385,17 @@
2488 * first check if this is a queued request for a device
2489 * which has just failed.
2491 - for (i = 0; i < disks; i++) {
2492 + for (i = 0; i < MD_SB_DISKS; i++) {
2493 if (multipaths[i].dev==dev && !multipaths[i].operational)
2498 + mdp_super_t *sb = mddev->sb;
2500 * Mark disk as unusable
2502 - for (i = 0; i < disks; i++) {
2503 + for (i = 0; i < MD_SB_DISKS; i++) {
2504 if (multipaths[i].dev==dev && multipaths[i].operational) {
2505 mark_disk_bad(mddev, i);
2508 if (!conf->working_disks) {
2511 - mdp_super_t *sb = mddev->sb;
2513 spare = get_spare(mddev);
2515 @@ -384,6 +418,21 @@
2519 + /* prevent unnecessary work in md_do_recovery() */
2520 + if (conf->working_disks) {
2521 + conf->raid_disks = conf->working_disks
2522 + = sb->raid_disks = sb->active_disks;
2524 + /* update alias disk info to insure we can do sb commit. */
2525 + ITERATE_RDEV(mddev,rdev,tmp) {
2526 + if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2527 + rdev->alias_device = 0;
2530 + if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2531 + rdev->alias_device = 1;
2539 * This is a kernel thread which:
2541 - * 1. Retries failed read operations on working multipaths.
2542 + * 1. Retries failed operations on working multipaths.
2543 * 2. Updates the raid superblock when problems encounter.
2544 - * 3. Performs writes following reads for array syncronising.
2547 static void multipathd (void *data)
2549 mdk_rdev_t *rdev, *def_rdev = NULL;
2550 struct md_list_head *tmp;
2552 + int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2557 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2560 - * Mark all disks as spare to start with, then pick our
2561 - * active disk. If we have a disk that is marked active
2562 - * in the sb, then use it, else use the first rdev.
2563 + * Mark all disks as spare to start with.
2565 disk->number = desc->number;
2566 disk->raid_disk = desc->raid_disk;
2567 @@ -894,20 +941,21 @@
2568 mark_disk_sync(desc);
2570 if (disk_active(desc)) {
2571 - if(!conf->working_disks) {
2572 - printk(OPERATIONAL, partition_name(rdev->dev),
2574 - disk->operational = 1;
2576 - conf->working_disks++;
2579 - mark_disk_spare(desc);
2582 - mark_disk_spare(desc);
2583 + printk(OPERATIONAL, partition_name(rdev->dev),
2585 + disk->operational = 1;
2587 + conf->working_disks++;
2590 + } else if (disk_faulty(desc)) {
2597 - if(!num_rdevs++) def_rdev = rdev;
2600 if(!conf->working_disks && num_rdevs) {
2601 desc = &sb->disks[def_rdev->desc_nr];
2602 @@ -918,11 +966,12 @@
2604 conf->working_disks++;
2605 mark_disk_active(desc);
2609 - * Make sure our active path is in desc spot 0
2610 + * If there is only 1 active path ... make sure it is in desc spot 0
2612 - if(def_rdev->desc_nr != 0) {
2613 + if (active_disks == 1 && def_rdev->desc_nr != 0) {
2614 rdev = find_rdev_nr(mddev, 0);
2615 desc = &sb->disks[def_rdev->desc_nr];
2617 @@ -940,10 +989,10 @@
2618 def_rdev->desc_nr = 0;
2621 - conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2622 + conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2623 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2624 - sb->failed_disks = 0;
2625 - sb->spare_disks = num_rdevs - 1;
2626 + sb->failed_disks = faulty_disks;
2627 + sb->spare_disks = spare_disks;
2628 mddev->sb_dirty = 1;
2629 conf->mddev = mddev;
2630 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2631 diff -urN linux-2.4.24.org/fs/buffer.c linux-2.4.24/fs/buffer.c
2632 --- linux-2.4.24.org/fs/buffer.c 2004-01-18 14:55:22.305275818 +0100
2633 +++ linux-2.4.24/fs/buffer.c 2004-01-18 15:57:55.602026171 +0100
2634 @@ -419,6 +419,34 @@
2638 +int fsync_dev_lockfs(kdev_t dev)
2640 + /* you are not allowed to try locking all the filesystems
2641 + ** on the system, your chances of getting through without
2642 + ** total deadlock are slim to none.
2645 + return fsync_dev(dev) ;
2647 + sync_buffers(dev, 0);
2650 + /* note, the FS might need to start transactions to
2651 + ** sync the inodes, or the quota, no locking until
2652 + ** after these are done
2655 + DQUOT_SYNC_DEV(dev);
2656 + /* if inodes or quotas could be dirtied during the
2657 + ** sync_supers_lockfs call, the FS is responsible for getting
2658 + ** them on disk, without deadlocking against the lock
2660 + sync_supers_lockfs(dev) ;
2663 + return sync_buffers(dev, 1) ;
2666 asmlinkage long sys_sync(void)
2669 diff -urN linux-2.4.24.org/fs/reiserfs/super.c linux-2.4.24/fs/reiserfs/super.c
2670 --- linux-2.4.24.org/fs/reiserfs/super.c 2004-01-18 14:55:18.875002271 +0100
2671 +++ linux-2.4.24/fs/reiserfs/super.c 2004-01-18 15:57:55.657014322 +0100
2673 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
2674 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
2675 reiserfs_block_writes(&th) ;
2676 - journal_end(&th, s, 1) ;
2677 + journal_end_sync(&th, s, 1) ;
2681 diff -urN linux-2.4.24.org/fs/super.c linux-2.4.24/fs/super.c
2682 --- linux-2.4.24.org/fs/super.c 2004-01-18 14:55:11.177633010 +0100
2683 +++ linux-2.4.24/fs/super.c 2004-01-18 15:57:55.687007859 +0100
2685 LIST_HEAD(super_blocks);
2686 spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
2689 + * lock/unlockfs grab a read lock on s_umount, but you need this lock to
2690 + * make sure no lockfs runs are in progress before inserting/removing
2691 + * supers from the list.
2693 +static DECLARE_MUTEX(lockfs_sem);
2696 * Handling of filesystem drivers list.
2698 @@ -436,6 +443,19 @@
2702 +static void write_super_lockfs(struct super_block *sb)
2705 + if (sb->s_root && sb->s_op) {
2706 + if (sb->s_dirt && sb->s_op->write_super)
2707 + sb->s_op->write_super(sb);
2708 + if (sb->s_op->write_super_lockfs) {
2709 + sb->s_op->write_super_lockfs(sb);
2715 static inline void write_super(struct super_block *sb)
2718 @@ -483,6 +503,39 @@
2719 spin_unlock(&sb_lock);
2723 + * Note: don't check the dirty flag before waiting, we want the lock
2724 + * to happen every time this is called. dev must be non-zero
2726 +void sync_supers_lockfs(kdev_t dev)
2728 + struct super_block * sb;
2730 + down(&lockfs_sem) ;
2732 + sb = get_super(dev);
2734 + write_super_lockfs(sb);
2740 +void unlockfs(kdev_t dev)
2742 + struct super_block * sb;
2745 + sb = get_super(dev);
2747 + if (sb->s_op && sb->s_op->unlockfs)
2748 + sb->s_op->unlockfs(sb) ;
2756 * get_super - get the superblock of a device
2757 * @dev: device to get the superblock for
2762 + down(&lockfs_sem);
2764 spin_lock(&sb_lock);
2767 ((flags ^ old->s_flags) & MS_RDONLY)) {
2768 spin_unlock(&sb_lock);
2773 if (!grab_super(old))
2774 @@ -720,12 +775,14 @@
2776 blkdev_put(bdev, BDEV_FS);
2784 insert_super(s, fs_type);
2786 if (!fs_type->read_super(s, data, flags & MS_VERBOSE ? 1 : 0))
2788 s->s_flags |= MS_ACTIVE;
2789 @@ -833,7 +890,10 @@
2790 if (!deactivate_super(sb))
2793 + down(&lockfs_sem);
2794 down_write(&sb->s_umount);
2798 /* Need to clean after the sucker */
2799 if (fs->fs_flags & FS_LITTER)
2800 diff -urN linux-2.4.24.org/include/linux/fs.h linux-2.4.24/include/linux/fs.h
2801 --- linux-2.4.24.org/include/linux/fs.h 2004-01-18 14:55:29.014855364 +0100
2802 +++ linux-2.4.24/include/linux/fs.h 2004-01-18 15:59:11.694692181 +0100
2803 @@ -1287,6 +1287,7 @@
2804 extern int sync_buffers(kdev_t, int);
2805 extern void sync_dev(kdev_t);
2806 extern int fsync_dev(kdev_t);
2807 +extern int fsync_dev_lockfs(kdev_t);
2808 extern int fsync_super(struct super_block *);
2809 extern int fsync_no_super(kdev_t);
2810 extern void sync_inodes_sb(struct super_block *);
2811 @@ -1305,6 +1306,8 @@
2812 extern int filemap_fdatasync(struct address_space *);
2813 extern int filemap_fdatawait(struct address_space *);
2814 extern void sync_supers(kdev_t dev, int wait);
2815 +extern void sync_supers_lockfs(kdev_t);
2816 +extern void unlockfs(kdev_t);
2817 extern int bmap(struct inode *, int);
2818 extern int notify_change(struct dentry *, struct iattr *);
2819 extern int permission(struct inode *, int);
2820 diff -urN linux-2.4.24.org/include/linux/raid/md_u.h linux-2.4.24/include/linux/raid/md_u.h
2821 --- linux-2.4.24.org/include/linux/raid/md_u.h 2004-01-18 14:55:35.554471508 +0100
2822 +++ linux-2.4.24/include/linux/raid/md_u.h 2004-01-18 16:04:27.764887949 +0100
2827 +#define MD_ARRAY_CLEAN 0
2828 +#define MD_ARRAY_ERRORS 1
2829 +#define MD_ARRAY_RECOVERY_RUNNING 2
2831 typedef struct mdu_array_info_s {
2833 * Generic constant information
2834 diff -urN linux-2.4.24.org/include/linux/raid/multipath.h linux-2.4.24/include/linux/raid/multipath.h
2835 --- linux-2.4.24.org/include/linux/raid/multipath.h 2004-01-18 14:55:35.563469605 +0100
2836 +++ linux-2.4.24/include/linux/raid/multipath.h 2004-01-18 16:04:38.329683369 +0100
2841 + atomic_t nr_pending; /* number of pending requests */
2844 struct multipath_private_data {
2846 struct buffer_head *master_bh;
2847 struct buffer_head bh_req;
2848 struct multipath_bh *next_mp; /* next for retry or in free list */
2849 + struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/
2851 /* bits for multipath_bh.state */
2852 #define MPBH_Uptodate 1
2853 diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c
2854 --- linux-2.4.24.org/kernel/ksyms.c 2004-01-18 14:55:22.698192617 +0100
2855 +++ linux-2.4.24/kernel/ksyms.c 2004-01-18 15:57:55.824978130 +0100
2857 EXPORT_SYMBOL(invalidate_inode_pages);
2858 EXPORT_SYMBOL(truncate_inode_pages);
2859 EXPORT_SYMBOL(fsync_dev);
2860 +EXPORT_SYMBOL(fsync_dev_lockfs);
2861 +EXPORT_SYMBOL(unlockfs);
2862 EXPORT_SYMBOL(fsync_no_super);
2863 EXPORT_SYMBOL(permission);
2864 EXPORT_SYMBOL(vfs_permission);