1diff -urN linux-2.4.22/drivers/md/Config.in linux-2.4.22-evms/drivers/md/Config.in
2--- linux-2.4.22/drivers/md/Config.in 2003-09-15 17:07:45.000000000 +0200
3+++ linux-2.4.22-evms/drivers/md/Config.in 2003-09-15 17:09:48.000000000 +0200
4@@ -16,5 +16,9 @@
5 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
6 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
7 dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
8+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9+ dep_tristate ' Bad Block Relocation Device Target' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10+ dep_tristate ' Sparse Device Target' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
11+fi
12
13 endmenu
14diff -urN linux-2.4.22/drivers/md/Makefile linux-2.4.22-evms/drivers/md/Makefile
15--- linux-2.4.22/drivers/md/Makefile 2003-09-15 17:07:45.000000000 +0200
16+++ linux-2.4.22-evms/drivers/md/Makefile 2003-09-15 17:09:48.000000000 +0200
17@@ -30,6 +30,8 @@
18
19 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
20 obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
21+obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
22+obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
23
24 include $(TOPDIR)/Rules.make
25
26diff -urN linux-2.4.22/drivers/md/dm-bbr.c linux-2.4.22-evms/drivers/md/dm-bbr.c
27--- linux-2.4.22/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
28+++ linux-2.4.22-evms/drivers/md/dm-bbr.c 2003-09-15 17:08:42.000000000 +0200
29@@ -0,0 +1,1228 @@
30+/*
31+ * Copyright (c) International Business Machines Corp., 2002-2003
32+ *
33+ * This program is free software; you can redistribute it and/or modify
34+ * it under the terms of the GNU General Public License as published by
35+ * the Free Software Foundation; either version 2 of the License, or
36+ * (at your option) any later version.
37+ *
38+ * This program is distributed in the hope that it will be useful,
39+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
40+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
41+ * the GNU General Public License for more details.
42+ *
43+ * You should have received a copy of the GNU General Public License
44+ * along with this program; if not, write to the Free Software
45+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
46+ *
47+ * linux/drivers/md/dm-bbr.c
48+ *
49+ * Bad-block-relocation (BBR) target for device-mapper.
50+ *
51+ * The BBR target is designed to remap I/O write failures to another safe
52+ * location on disk. Note that most disk drives have BBR built into them,
53+ * this means that our software BBR will be only activated when all hardware
54+ * BBR replacement sectors have been used.
55+ */
56+
57+#include <linux/kernel.h>
58+#include <linux/module.h>
59+#include <linux/init.h>
60+#include <linux/blkdev.h>
61+#include <linux/spinlock.h>
62+#include <linux/smp_lock.h>
63+#include <linux/slab.h>
64+#include <linux/mempool.h>
65+#include "dm.h"
66+#include "dm-bbr.h"
67+#include "dm-daemon.h"
68+#include "dm-io.h"
69+
70+/* Number of active BBR devices. */
71+static int bbr_instances = 0;
72+static DECLARE_MUTEX(bbr_instances_lock);
73+
74+/* Data pertaining to the I/O thread. */
75+static struct dm_daemon * bbr_io_thread = NULL;
76+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
77+static LIST_HEAD(bbr_io_list);
78+static void bbr_io_handler(void);
79+
80+/* Global pools for bbr_io_buf's and bbr_remap's. */
81+static kmem_cache_t * bbr_io_buf_cache;
82+static mempool_t * bbr_io_buf_pool;
83+static kmem_cache_t * bbr_remap_cache;
84+static mempool_t * bbr_remap_pool;
85+
86+static void bbr_free_remap(struct bbr_private * bbr_id);
87+
88+/**
89+ * destroy_pools
90+ *
91+ * Delete the pools for the remap list and I/O anchors.
92+ **/
93+static void destroy_pools(void)
94+{
95+ if (bbr_io_buf_pool) {
96+ mempool_destroy(bbr_io_buf_pool);
97+ bbr_io_buf_pool = NULL;
98+ }
99+ if (bbr_io_buf_cache) {
100+ kmem_cache_destroy(bbr_io_buf_cache);
101+ bbr_io_buf_cache = NULL;
102+ }
103+ if (bbr_remap_pool) {
104+ mempool_destroy(bbr_remap_pool);
105+ bbr_remap_pool = NULL;
106+ }
107+ if (bbr_remap_cache) {
108+ kmem_cache_destroy(bbr_remap_cache);
109+ bbr_remap_cache = NULL;
110+ }
111+}
112+
113+/**
114+ * create_pools
115+ *
116+ * Create mempools for the remap list and I/O anchors.
117+ **/
118+static int create_pools(void)
119+{
120+ if (!bbr_remap_cache) {
121+ bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
122+ sizeof(struct bbr_runtime_remap),
123+ 0, SLAB_HWCACHE_ALIGN,
124+ NULL, NULL);
125+ if (!bbr_remap_cache) {
126+ DMERR("Unable to create BBR remap cache.");
127+ goto out;
128+ }
129+ }
130+ if (!bbr_remap_pool) {
131+ bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
132+ mempool_free_slab,
133+ bbr_remap_cache);
134+ if (!bbr_remap_pool) {
135+ DMERR("Unable to create BBR remap mempool.");
136+ goto out;
137+ }
138+ }
139+
140+ if (!bbr_io_buf_cache) {
141+ bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
142+ sizeof(struct bbr_io_buffer),
143+ 0, SLAB_HWCACHE_ALIGN,
144+ NULL, NULL);
145+ if (!bbr_io_buf_cache) {
146+ DMERR("Unable to create BBR I/O buffer cache.");
147+ goto out;
148+ }
149+ }
150+ if (!bbr_io_buf_pool) {
151+ bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
152+ mempool_free_slab,
153+ bbr_io_buf_cache);
154+ if (!bbr_io_buf_pool) {
155+ DMERR("Unable to create BBR I/O buffer mempool.");
156+ goto out;
157+ }
158+ }
159+
160+out:
161+ if (!bbr_remap_cache || !bbr_remap_pool ||
162+ !bbr_io_buf_cache || !bbr_io_buf_pool ) {
163+ destroy_pools();
164+ return -ENOMEM;
165+ }
166+
167+ return 0;
168+}
169+
170+/**
171+ * stop_io_thread
172+ *
173+ * Use the dm-daemon services to stop the BBR I/O thread.
174+ **/
175+static void stop_io_thread(void)
176+{
177+ if (bbr_io_thread) {
178+ dm_daemon_stop(bbr_io_thread);
179+ kfree(bbr_io_thread);
180+ bbr_io_thread = NULL;
181+ }
182+}
183+
184+/**
185+ * start_io_thread
186+ *
187+ * Use the dm-daemon services to start the BBR I/O thread.
188+ **/
189+static int start_io_thread(void)
190+{
191+ int rc;
192+
193+ if (!bbr_io_thread) {
194+ bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
195+ if (!bbr_io_thread) {
196+ return -ENOMEM;
197+ }
198+
199+ rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
200+ if (rc) {
201+ kfree(bbr_io_thread);
202+ return rc;
203+ }
204+ }
205+
206+ return 0;
207+}
208+
209+/**
210+ * bbr_global_init
211+ *
212+ * Set up the mempools, I/O thread, and sync-I/O service. This should
213+ * be called only when the first bbr device is created.
214+ **/
215+static int bbr_global_init(void)
216+{
217+ int rc;
218+
219+ rc = create_pools();
220+ if (rc) {
221+ goto out;
222+ }
223+
224+ rc = start_io_thread();
225+ if (rc) {
226+ destroy_pools();
227+ goto out;
228+ }
229+
230+ rc = dm_io_get(1);
231+ if (rc) {
232+ destroy_pools();
233+ stop_io_thread();
234+ goto out;
235+ }
236+
237+out:
238+ return rc;
239+}
240+
241+/**
242+ * bbr_global_cleanup
243+ *
244+ * Clean up the mempools, I/O thread, and sync-I/O service. This should
245+ * be called only when the last bbr device is removed.
246+ **/
247+static void bbr_global_cleanup(void)
248+{
249+ destroy_pools();
250+ stop_io_thread();
251+ dm_io_put(1);
252+}
253+
254+static struct bbr_private * bbr_alloc_private(void)
255+{
256+ struct bbr_private * bbr_id;
257+
258+ bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
259+ if (bbr_id) {
260+ memset(bbr_id, 0, sizeof(*bbr_id));
261+ bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
262+ bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
263+ }
264+
265+ return bbr_id;
266+}
267+
268+static void bbr_free_private(struct bbr_private * bbr_id)
269+{
270+ if (bbr_id->bbr_table) {
271+ kfree(bbr_id->bbr_table);
272+ }
273+ bbr_free_remap(bbr_id);
274+ kfree(bbr_id);
275+}
276+
277+static u32 crc_table[256];
278+static u32 crc_table_built = 0;
279+
280+static void build_crc_table(void)
281+{
282+ u32 i, j, crc;
283+
284+ for (i = 0; i <= 255; i++) {
285+ crc = i;
286+ for (j = 8; j > 0; j--) {
287+ if (crc & 1)
288+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
289+ else
290+ crc >>= 1;
291+ }
292+ crc_table[i] = crc;
293+ }
294+ crc_table_built = 1;
295+}
296+
297+static u32 calculate_crc(u32 crc, void * buffer, u32 buffersize)
298+{
299+ unsigned char * current_byte;
300+ u32 temp1, temp2, i;
301+
302+ current_byte = (unsigned char *) buffer;
303+ /* Make sure the crc table is available */
304+ if (!crc_table_built)
305+ build_crc_table();
306+ /* Process each byte in the buffer. */
307+ for (i = 0; i < buffersize; i++) {
308+ temp1 = (crc >> 8) & 0x00FFFFFF;
309+ temp2 = crc_table[(crc ^ (u32) * current_byte) &
310+ (u32) 0xff];
311+ current_byte++;
312+ crc = temp1 ^ temp2;
313+ }
314+ return crc;
315+}
316+
317+/**
318+ * le_bbr_table_sector_to_cpu
319+ *
320+ * Convert bbr metadata from on-disk (LE) format
321+ * to the native CPU-endian format.
322+ **/
323+static void le_bbr_table_sector_to_cpu(struct bbr_table * p)
324+{
325+ int i;
326+ p->signature = le32_to_cpup(&p->signature);
327+ p->crc = le32_to_cpup(&p->crc);
328+ p->sequence_number = le32_to_cpup(&p->sequence_number);
329+ p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
330+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
331+ p->entries[i].bad_sect =
332+ le64_to_cpup(&p->entries[i].bad_sect);
333+ p->entries[i].replacement_sect =
334+ le64_to_cpup(&p->entries[i].replacement_sect);
335+ }
336+}
337+
338+/**
339+ * cpu_bbr_table_sector_to_le
340+ *
341+ * Convert bbr metadata from CPU-endian format to on-disk (LE) format
342+ **/
343+static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
344+ struct bbr_table * le)
345+{
346+ int i;
347+ le->signature = cpu_to_le32p(&p->signature);
348+ le->crc = cpu_to_le32p(&p->crc);
349+ le->sequence_number = cpu_to_le32p(&p->sequence_number);
350+ le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
351+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
352+ le->entries[i].bad_sect =
353+ cpu_to_le64p(&p->entries[i].bad_sect);
354+ le->entries[i].replacement_sect =
355+ cpu_to_le64p(&p->entries[i].replacement_sect);
356+ }
357+}
358+
359+/**
360+ * validate_bbr_table_sector
361+ *
362+ * Check the specified BBR table sector for a valid signature and CRC. If it's
363+ * valid, endian-convert the table sector.
364+ **/
365+static int validate_bbr_table_sector(struct bbr_table * p)
366+{
367+ int rc = 0;
368+ int org_crc, final_crc;
369+
370+ if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
371+ DMERR("BBR table signature doesn't match!");
372+ DMERR("Found 0x%x. Expecting 0x%x",
373+ le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
374+ rc = -EINVAL;
375+ goto out;
376+ }
377+
378+ if (!p->crc) {
379+ DMERR("BBR table sector has no CRC!");
380+ rc = -EINVAL;
381+ goto out;
382+ }
383+
384+ org_crc = le32_to_cpup(&p->crc);
385+ p->crc = 0;
386+ final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
387+ if (final_crc != org_crc) {
388+ DMERR("CRC failed!");
389+ DMERR("Found 0x%x. Expecting 0x%x",
390+ org_crc, final_crc);
391+ rc = -EINVAL;
392+ goto out;
393+ }
394+
395+ p->crc = cpu_to_le32p(&org_crc);
396+ le_bbr_table_sector_to_cpu(p);
397+
398+out:
399+ return rc;
400+}
401+
402+/**
403+ * bbr_binary_tree_insert
404+ *
405+ * Insert a node into the binary tree.
406+ **/
407+static void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
408+ struct bbr_runtime_remap * newnode)
409+{
410+ struct bbr_runtime_remap ** node = root;
411+ while (node && *node) {
412+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
413+ node = &((*node)->right);
414+ } else {
415+ node = &((*node)->left);
416+ }
417+ }
418+
419+ newnode->left = newnode->right = NULL;
420+ *node = newnode;
421+}
422+
423+/**
424+ * bbr_binary_search
425+ *
426+ * Search for a node that contains bad_sect == lsn.
427+ **/
428+static struct bbr_runtime_remap * bbr_binary_search(
429+ struct bbr_runtime_remap * root,
430+ u64 lsn)
431+{
432+ struct bbr_runtime_remap * node = root;
433+ while (node) {
434+ if (node->remap.bad_sect == lsn) {
435+ break;
436+ }
437+ if (lsn > node->remap.bad_sect) {
438+ node = node->right;
439+ } else {
440+ node = node->left;
441+ }
442+ }
443+ return node;
444+}
445+
446+/**
447+ * bbr_binary_tree_destroy
448+ *
449+ * Destroy the binary tree.
450+ **/
451+static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
452+ struct bbr_private * bbr_id)
453+{
454+ struct bbr_runtime_remap ** link = NULL;
455+ struct bbr_runtime_remap * node = root;
456+
457+ while (node) {
458+ if (node->left) {
459+ link = &(node->left);
460+ node = node->left;
461+ continue;
462+ }
463+ if (node->right) {
464+ link = &(node->right);
465+ node = node->right;
466+ continue;
467+ }
468+
469+ mempool_free(node, bbr_remap_pool);
470+ if (node == root) {
471+ /* If root is deleted, we're done. */
472+ break;
473+ }
474+
475+ /* Back to root. */
476+ node = root;
477+ *link = NULL;
478+ }
479+}
480+
481+static void bbr_free_remap(struct bbr_private * bbr_id)
482+{
483+ spin_lock_irq(&bbr_id->bbr_id_lock);
484+ bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
485+ bbr_id->remap_root = NULL;
486+ spin_unlock_irq(&bbr_id->bbr_id_lock);
487+}
488+
489+/**
490+ * bbr_insert_remap_entry
491+ *
492+ * Create a new remap entry and add it to the binary tree for this node.
493+ **/
494+static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
495+ struct bbr_table_entry * new_bbr_entry)
496+{
497+ struct bbr_runtime_remap * newnode;
498+
499+ newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
500+ if (!newnode) {
501+ DMERR("Could not allocate from remap mempool!");
502+ return -ENOMEM;
503+ }
504+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
505+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
506+ spin_lock_irq(&bbr_id->bbr_id_lock);
507+ bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
508+ spin_unlock_irq(&bbr_id->bbr_id_lock);
509+ return 0;
510+}
511+
512+/**
513+ * bbr_table_to_remap_list
514+ *
515+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
516+ * improve run-time performance, the in-memory remap list must be sorted by
517+ * the bad sector LBA. This function is called at discovery time to initialize
518+ * the remap list. This function assumes that at least one copy of the metadata
519+ * is valid.
520+ **/
521+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
522+{
523+ u32 in_use_blks = 0;
524+ int i, j;
525+ struct bbr_table * p;
526+
527+
528+ for (i = 0, p = bbr_id->bbr_table;
529+ i < bbr_id->nr_sects_bbr_table;
530+ i++, p++ ) {
531+ if (!p->in_use_cnt) {
532+ break;
533+ }
534+ in_use_blks += p->in_use_cnt;
535+ for (j = 0; j < p->in_use_cnt; j++) {
536+ bbr_insert_remap_entry(bbr_id, &p->entries[j]);
537+ }
538+ }
539+ if (in_use_blks)
540+ DMWARN("There are %u BBR entries for device %u:%u",
541+ in_use_blks, MAJOR(bbr_id->dev->dev),
542+ MINOR(bbr_id->dev->dev));
543+
544+ return in_use_blks;
545+}
546+
547+/**
548+ * bbr_search_remap_entry
549+ *
550+ * Search for a remap entry for the specified sector. If found, return a pointer to
551+ * the table entry. Otherwise, return NULL.
552+ **/
553+static struct bbr_table_entry * bbr_search_remap_entry(
554+ struct bbr_private * bbr_id,
555+ u64 lsn)
556+{
557+ struct bbr_runtime_remap * p;
558+
559+ spin_lock_irq(&bbr_id->bbr_id_lock);
560+ p = bbr_binary_search(bbr_id->remap_root, lsn);
561+ spin_unlock_irq(&bbr_id->bbr_id_lock);
562+ if (p) {
563+ return (&p->remap);
564+ } else {
565+ return NULL;
566+ }
567+}
568+
569+/**
570+ * bbr_remap
571+ *
572+ * If *lsn is in the remap table, return TRUE and modify *lsn;
573+ * otherwise, return FALSE.
574+ **/
575+static inline int bbr_remap(struct bbr_private * bbr_id,
576+ u64 * lsn)
577+{
578+ struct bbr_table_entry * e;
579+
580+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
581+ e = bbr_search_remap_entry(bbr_id, *lsn);
582+ if (e) {
583+ *lsn = e->replacement_sect;
584+ return 1;
585+ }
586+ }
587+ return 0;
588+}
589+
590+/**
591+ * bbr_remap_probe
592+ *
593+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
594+ * table, return TRUE; otherwise, return FALSE.
595+ **/
596+static inline int bbr_remap_probe(struct bbr_private * bbr_id,
597+ u64 lsn, u64 nr_sects)
598+{
599+ u64 tmp, cnt;
600+
601+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
602+ for (cnt = 0, tmp = lsn;
603+ cnt < nr_sects;
604+ cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
605+ if (bbr_remap(bbr_id,&tmp)) {
606+ return 1;
607+ }
608+ }
609+ }
610+ return 0;
611+}
612+
613+/**
614+ * bbr_setup
615+ *
616+ * Read the remap tables from disk and set up the initial remap tree.
617+ **/
618+static int bbr_setup(struct bbr_private * bbr_id)
619+{
620+ struct bbr_table * table = bbr_id->bbr_table;
621+ struct page * page;
622+ struct io_region job;
623+ unsigned int error, offset;
624+ int i, rc = 0;
625+
626+ job.dev = bbr_id->dev->dev;
627+ job.count = 1;
628+
629+ /* Read and verify each BBR table sector individually. */
630+ for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
631+ job.sector = bbr_id->lba_table1 + i;
632+ page = virt_to_page(table);
633+ offset = (unsigned long)table & ~PAGE_MASK;
634+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
635+ if (rc && bbr_id->lba_table2) {
636+ job.sector = bbr_id->lba_table2 + i;
637+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
638+ }
639+ if (rc) {
640+ goto out;
641+ }
642+
643+ rc = validate_bbr_table_sector(table);
644+ if (rc) {
645+ goto out;
646+ }
647+ }
648+ atomic_set(&bbr_id->in_use_replacement_blks,
649+ bbr_table_to_remap_list(bbr_id));
650+
651+out:
652+ if (rc) {
653+ DMERR("dm-bbr: error during device setup: %d", rc);
654+ }
655+ return rc;
656+}
657+
658+static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
659+ struct buffer_head * bh,
660+ int rw)
661+{
662+ struct bbr_io_buffer * bbr_io_buf;
663+
664+ bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
665+ if (bbr_io_buf) {
666+ memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
667+ INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
668+ bbr_io_buf->bbr_id = bbr_id;
669+ bbr_io_buf->sector = bh->b_rsector;
670+ bbr_io_buf->bh = bh;
671+ bbr_io_buf->rw = rw;
672+ } else {
673+ DMWARN("Could not allocate from BBR I/O buffer pool!");
674+ }
675+ return bbr_io_buf;
676+}
677+
678+static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
679+{
680+ mempool_free(bbr_io_buf, bbr_io_buf_pool);
681+}
682+
683+/**
684+ * bbr_io_remap_error
685+ * @bbr_id: Private data for the BBR node.
686+ * @rw: READ or WRITE.
687+ * @starting_lsn: Starting sector of request to remap.
688+ * @count: Number of sectors in the request.
689+ * @buffer: Data buffer for the request.
690+ *
691+ * For the requested range, try to write each sector individually. For each
692+ * sector that fails, find the next available remap location and write the
693+ * data to that new location. Then update the table and write both copies
694+ * of the table to disk. Finally, update the in-memory mapping and do any
695+ * other necessary bookkeeping.
696+ **/
697+static int bbr_io_remap_error(struct bbr_private * bbr_id,
698+ int rw,
699+ u64 starting_lsn,
700+ u64 count,
701+ char * buffer)
702+{
703+ struct bbr_table * bbr_table;
704+ struct io_region job;
705+ struct page * page;
706+ unsigned long table_sector_index;
707+ unsigned long table_sector_offset;
708+ unsigned long index;
709+ unsigned int offset_in_page, error;
710+ u64 lsn, new_lsn;
711+ int rc;
712+
713+ if (rw == READ) {
714+ /* Nothing can be done about read errors. */
715+ return -EIO;
716+ }
717+
718+ job.dev = bbr_id->dev->dev;
719+
720+ /* For each sector in the request. */
721+ for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
722+ job.sector = starting_lsn + lsn;
723+ job.count = 1;
724+ page = virt_to_page(buffer);
725+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
726+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
727+ while (rc) {
728+ /* Find the next available relocation sector. */
729+ new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
730+ if (new_lsn >= bbr_id->nr_replacement_blks) {
731+ /* No more replacement sectors available. */
732+ return -EIO;
733+ }
734+ new_lsn += bbr_id->start_replacement_sect;
735+
736+ /* Write the data to its new location. */
737+ DMWARN("dm-bbr: device %u:%u: Trying to remap bad sector "PFU64" to sector "PFU64,
738+ MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev),
739+ starting_lsn + lsn, new_lsn);
740+ job.sector = new_lsn;
741+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
742+ if (rc) {
743+ /* This replacement sector is bad.
744+ * Try the next one.
745+ */
746+ DMERR("dm-bbr: device %u:%u: replacement sector "PFU64" is bad. Skipping.",
747+ MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev), new_lsn);
748+ atomic_inc(&bbr_id->in_use_replacement_blks);
749+ continue;
750+ }
751+
752+ /* Add this new entry to the on-disk table. */
753+ table_sector_index = new_lsn -
754+ bbr_id->start_replacement_sect;
755+ table_sector_offset = table_sector_index /
756+ BBR_ENTRIES_PER_SECT;
757+ index = table_sector_index % BBR_ENTRIES_PER_SECT;
758+
759+ bbr_table = &bbr_id->bbr_table[table_sector_offset];
760+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
761+ bbr_table->entries[index].replacement_sect = new_lsn;
762+ bbr_table->in_use_cnt++;
763+ bbr_table->sequence_number++;
764+ bbr_table->crc = 0;
765+ bbr_table->crc = calculate_crc(INITIAL_CRC,
766+ bbr_table,
767+ sizeof(struct bbr_table));
768+
769+ /* Write the table to disk. */
770+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
771+ page = virt_to_page(bbr_table);
772+ offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
773+ if (bbr_id->lba_table1) {
774+ job.sector = bbr_id->lba_table1 + table_sector_offset;
775+ job.count = 1;
776+ rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
777+ }
778+ if (bbr_id->lba_table2) {
779+ job.sector = bbr_id->lba_table2 + table_sector_offset;
780+ rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
781+ }
782+ le_bbr_table_sector_to_cpu(bbr_table);
783+
784+ if (rc) {
785+ /* Error writing one of the tables to disk. */
786+ DMERR("dm-bbr: device %u:%u: error updating BBR tables on disk.",
787+ MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
788+ return rc;
789+ }
790+
791+ /* Insert a new entry in the remapping binary-tree. */
792+ rc = bbr_insert_remap_entry(bbr_id,
793+ &bbr_table->entries[index]);
794+ if (rc) {
795+ DMERR("dm-bbr: device %u:%u: error adding new entry to remap tree.",
796+ MAJOR(bbr_id->dev->dev), MINOR(bbr_id->dev->dev));
797+ return rc;
798+ }
799+
800+ atomic_inc(&bbr_id->in_use_replacement_blks);
801+ }
802+ }
803+
804+ return 0;
805+}
806+
807+/**
808+ * bbr_io_process_request
809+ *
810+ * For each sector in this request, check if the sector has already
811+ * been remapped. If so, process all previous sectors in the request,
812+ * followed by the remapped sector. Then reset the starting lsn and
813+ * count, and keep going with the rest of the request as if it were
814+ * a whole new request. If any of the synchronous I/Os return an error,
815+ * call the remapper to relocate the bad sector(s).
816+ **/
817+static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
818+{
819+ struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
820+ struct io_region job;
821+ u64 starting_lsn = bbr_io_buf->sector;
822+ u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
823+ u64 lsn, remapped_lsn;
824+ char * buffer = bbr_io_buf->bh->b_data;
825+ struct page * page = virt_to_page(buffer);
826+ unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
827+ unsigned int error;
828+ int rw = bbr_io_buf->rw;
829+ int rc = 0;
830+
831+ job.dev = bbr_id->dev->dev;
832+
833+ /* For each sector in this request, check if this sector has already
834+ * been remapped. If so, process all previous sectors in this request,
835+ * followed by the remapped sector. Then reset the starting lsn and
836+ * count and keep going with the rest of the request as if it were
837+ * a whole new request.
838+ */
839+ for (lsn = 0; lsn < count; lsn++) {
840+ remapped_lsn = starting_lsn + lsn;
841+ rc = bbr_remap(bbr_id, &remapped_lsn);
842+ if (!rc) {
843+ /* This sector is fine. */
844+ continue;
845+ }
846+
847+ /* Process all sectors in the request up to this one. */
848+ if (lsn > 0) {
849+ job.sector = starting_lsn;
850+ job.count = lsn;
851+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
852+ if (rc) {
853+ /* If this I/O failed, then one of the sectors
854+ * in this request needs to be relocated.
855+ */
856+ rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
857+ lsn, buffer);
858+ if (rc) {
859+ return rc;
860+ }
861+ }
862+ buffer += (lsn << SECTOR_SHIFT);
863+ page = virt_to_page(buffer);
864+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
865+ }
866+
867+ /* Process the remapped sector. */
868+ job.sector = remapped_lsn;
869+ job.count = 1;
870+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
871+ if (rc) {
872+ /* BUGBUG - Need more processing if this caused an
873+ * error. If this I/O failed, then the existing
874+ * remap is now bad, and we need to find a new remap.
875+ * Can't use bbr_io_remap_error(), because the existing
876+ * map entry needs to be changed, not added again, and
877+ * the original table entry also needs to be changed.
878+ */
879+ return rc;
880+ }
881+
882+ buffer += SECTOR_SIZE;
883+ starting_lsn += (lsn + 1);
884+ count -= (lsn + 1);
885+ lsn = -1; /* loop increment restarts lsn at 0 for the rest of the request */
886+ page = virt_to_page(buffer);
887+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
888+ }
889+
890+ /* Check for any remaining sectors after the last split. This could
891+ * potentially be the whole request, but that should be a rare case
892+ * because requests should only be processed by the thread if we know
893+ * an error occurred or they contained one or more remapped sectors.
894+ */
895+ if (count) {
896+ job.sector = starting_lsn;
897+ job.count = count;
898+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
899+ if (rc) {
900+ /* If this I/O failed, then one of the sectors in this
901+ * request needs to be relocated.
902+ */
903+ rc = bbr_io_remap_error(bbr_id, bbr_io_buf->rw, starting_lsn,
904+ count, buffer);
905+ if (rc) {
906+ return rc;
907+ }
908+ }
909+ }
910+
911+ return 0;
912+}
913+
914+/**
915+ * bbr_io_handler
916+ *
917+ * This is the handler for the bbr_io_thread. It continuously loops,
918+ * taking I/O requests off its list and processing them. If nothing
919+ * is on the list, the thread goes back to sleep until specifically
920+ * woken up.
921+ *
922+ * I/O requests should only be sent to this thread if we know that:
923+ * a) the request contains at least one remapped sector.
924+ * or
925+ * b) the request caused an error on the normal I/O path.
926+ * This function uses synchronous I/O, so sending a request to this
927+ * thread that doesn't need special processing will cause severe
928+ * performance degradation.
929+ **/
930+static void bbr_io_handler(void)
931+{
932+ struct bbr_io_buffer * bbr_io_buf;
933+ struct buffer_head * bh;
934+ unsigned long flags;
935+ int rc;
936+
937+ while (1) {
938+ /* Process bbr_io_list, one entry at a time. */
939+ spin_lock_irqsave(&bbr_io_list_lock, flags);
940+ if (list_empty(&bbr_io_list)) {
941+ /* No more items on the list. */
942+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
943+ break;
944+ }
945+ bbr_io_buf = list_entry(bbr_io_list.next,
946+ struct bbr_io_buffer, bbr_io_list);
947+ list_del_init(&bbr_io_buf->bbr_io_list);
948+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
949+
950+ rc = bbr_io_process_request(bbr_io_buf);
951+
952+ /* Clean up and complete the original I/O. */
953+ bbr_io_buf->flags |= BBR_IO_HANDLED;
954+ bh = bbr_io_buf->bh;
955+ if (bh->b_end_io) {
956+ /* If this was the bbr_io_buf for an error on the
957+ * normal WRITE, don't free it here. It will be
958+ * freed later in bbr_callback()
959+ */
960+ if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
961+ free_bbr_io_buf(bbr_io_buf);
962+ bh->b_end_io(bh, rc ? 0 : 1);
963+ }
964+ }
965+}
966+
967+/**
968+ * bbr_schedule_io
969+ *
970+ * Place the specified bbr_io_buf on the thread's processing list.
971+ **/
972+static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
973+{
974+ unsigned long flags;
975+ spin_lock_irqsave(&bbr_io_list_lock, flags);
976+ list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
977+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
978+ dm_daemon_wake(bbr_io_thread);
979+}
980+
981+/**
982+ * bbr_read
983+ *
984+ * If there are any remapped sectors on this object, send this request over
985+ * to the thread for processing. Otherwise send it down the stack normally.
986+ **/
987+static int bbr_read(struct bbr_private * bbr_id,
988+ struct buffer_head * bh)
989+{
990+ struct bbr_io_buffer * bbr_io_buf;
991+
992+
993+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
994+ !bbr_remap_probe(bbr_id, bh->b_rsector,
995+ bh->b_size >> SECTOR_SHIFT)) {
996+ /* No existing remaps or this request doesn't
997+ * contain any remapped sectors.
998+ */
999+ bh->b_rdev = bbr_id->dev->dev;
1000+ return 1;
1001+ }
1002+
1003+ /* This request has at least one remapped sector. */
1004+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
1005+ if (!bbr_io_buf) {
1006+ /* Can't get memory to track the I/O. */
1007+ bh->b_end_io(bh, 0);
1008+ return -ENOMEM;
1009+ }
1010+
1011+ bbr_schedule_io(bbr_io_buf);
1012+ return 0;
1013+}
1014+
1015+/**
1016+ * bbr_callback
1017+ *
1018+ * This is the callback for normal write requests. Check for an error
1019+ * during the I/O, and send to the thread for processing if necessary.
1020+ **/
1021+static int bbr_callback(struct dm_target * ti,
1022+ struct buffer_head * bh,
1023+ int rw,
1024+ int error,
1025+ union map_info * map_context)
1026+{
1027+ struct bbr_io_buffer * bbr_io_buf = (struct bbr_io_buffer *) map_context->ptr;
1028+
1029+ if (!bbr_io_buf)
1030+ return error;
1031+
1032+ /* Will try to relocate the WRITE if:
1033+ * - It is an error, and
1034+ * - It is not an error from BBR relocation itself.
1035+ */
1036+ if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1037+ DMERR("dm-bbr: device %u:%u: Write failure on sector %lu. Scheduling for retry.",
1038+ MAJOR(bh->b_rdev), MINOR(bh->b_rdev),
1039+ (unsigned long)bbr_io_buf->sector);
1040+ /* Indicate this bbr_io_buf is for an error on normal WRITE */
1041+ bbr_io_buf->flags |= BBR_IO_RELOCATE;
1042+ bbr_schedule_io(bbr_io_buf);
1043+ /* Returns >0 so that DM will let us retry the I/O */
1044+ return 1;
1045+ }
1046+
1047+ free_bbr_io_buf(bbr_io_buf);
1048+ return error;
1049+}
1050+
1051+/**
1052+ * bbr_write
1053+ *
1054+ * If there are any remapped sectors on this object, send the request over
1055+ * to the thread for processing. Otherwise, register for callback
1056+ * notification, and send the request down normally.
1057+ **/
1058+static int bbr_write(struct bbr_private * bbr_id,
1059+ struct buffer_head * bh,
1060+ union map_info * map_context)
1061+{
1062+ struct bbr_io_buffer * bbr_io_buf;
1063+
1064+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1065+ if (!bbr_io_buf) {
1066+ /* Can't get memory to track the I/O. */
1067+ bh->b_end_io(bh, 0);
1068+ return -ENOMEM;
1069+ }
1070+
1071+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1072+ !bbr_remap_probe(bbr_id, bh->b_rsector,
1073+ bh->b_size >> SECTOR_SHIFT)) {
1074+ /* No existing remaps or this request
1075+ * contains no remapped sectors.
1076+ */
1077+ bh->b_rdev = bbr_id->dev->dev;
1078+ map_context->ptr = bbr_io_buf;
1079+ return 1;
1080+ } else {
1081+ /* This request contains at least one remapped sector. */
1082+ map_context->ptr = NULL;
1083+ bbr_schedule_io(bbr_io_buf);
1084+ }
1085+ return 0;
1086+}
1087+
1088+/**
1089+ * Construct a bbr mapping
1090+ **/
1091+static int bbr_ctr(struct dm_target * ti, unsigned int argc, char ** argv)
1092+{
1093+ struct bbr_private * bbr_id;
1094+ u32 block_size;
1095+ char * end;
1096+ int rc = -EINVAL;
1097+
1098+ if (argc != 8) {
1099+ ti->error = "dm-bbr requires exactly 8 arguments: "
1100+ "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1101+ goto out1;
1102+ }
1103+
1104+ bbr_id = bbr_alloc_private();
1105+ if (!bbr_id) {
1106+ ti->error = "dm-bbr: Error allocating bbr private data.";
1107+ goto out1;
1108+ }
1109+
1110+ bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1111+ bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1112+ bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1113+ bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1114+ bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1115+ bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1116+ block_size = simple_strtoul(argv[7], &end, 10);
1117+ bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1118+
1119+ bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1120+ GFP_KERNEL);
1121+ if (!bbr_id->bbr_table) {
1122+ ti->error = "dm-bbr: Error allocating bbr table.";
1123+ goto out2;
1124+ }
1125+
1126+ if (dm_get_device(ti, argv[0], 0, ti->len,
1127+ dm_table_get_mode(ti->table), &bbr_id->dev)) {
1128+ ti->error = "dm-bbr: Device lookup failed";
1129+ goto out2;
1130+ }
1131+
1132+ /* Using a semaphore here is probably overkill,
1133+ * but at least it will be correct.
1134+ */
1135+ down(&bbr_instances_lock);
1136+ if (bbr_instances == 0) {
1137+ rc = bbr_global_init();
1138+ if (rc) {
1139+ up(&bbr_instances_lock);
1140+ goto out3;
1141+ }
1142+ }
1143+ bbr_instances++;
1144+ up(&bbr_instances_lock);
1145+
1146+ rc = bbr_setup(bbr_id);
1147+ if (rc) {
1148+ ti->error = "dm-bbr: Device setup failed";
1149+ goto out4;
1150+ }
1151+
1152+ ti->private = bbr_id;
1153+ return 0;
1154+
1155+out4:
1156+ down(&bbr_instances_lock);
1157+ bbr_instances--;
1158+ if (bbr_instances == 0) {
1159+ bbr_global_cleanup();
1160+ }
1161+ up(&bbr_instances_lock);
1162+
1163+out3:
1164+ dm_put_device(ti, bbr_id->dev);
1165+out2:
1166+ bbr_free_private(bbr_id);
1167+out1:
1168+ return rc;
1169+}
1170+
1171+static void bbr_dtr(struct dm_target * ti)
1172+{
1173+ struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1174+
1175+ dm_put_device(ti, bbr_id->dev);
1176+ bbr_free_private(bbr_id);
1177+
1178+ down(&bbr_instances_lock);
1179+ bbr_instances--;
1180+ if (bbr_instances == 0) {
1181+ bbr_global_cleanup();
1182+ }
1183+ up(&bbr_instances_lock);
1184+}
1185+
1186+static int bbr_map(struct dm_target * ti, struct buffer_head * bh, int rw,
1187+ union map_info * map_context)
1188+{
1189+ struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1190+
1191+ bh->b_rsector += bbr_id->offset;
1192+ switch (rw) {
1193+ case READ:
1194+ case READA:
1195+ map_context->ptr = NULL;
1196+ return bbr_read(bbr_id, bh);
1197+ case WRITE:
1198+ return bbr_write(bbr_id, bh, map_context);
1199+ default:
1200+ return -EIO;
1201+ }
1202+}
1203+
1204+static int bbr_status(struct dm_target * ti, status_type_t type,
1205+ char * result, unsigned int maxlen)
1206+{
1207+ struct bbr_private * bbr_id = (struct bbr_private *) ti->private;
1208+
1209+ switch (type) {
1210+ case STATUSTYPE_INFO:
1211+ result[0] = '\0';
1212+ break;
1213+
1214+ case STATUSTYPE_TABLE:
1215+ snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1216+ dm_kdevname(bbr_id->dev->dev), bbr_id->offset,
1217+ bbr_id->lba_table1, bbr_id->lba_table2,
1218+ bbr_id->nr_sects_bbr_table,
1219+ bbr_id->start_replacement_sect,
1220+ bbr_id->nr_replacement_blks,
1221+ bbr_id->blksize_in_sects << SECTOR_SHIFT);
1222+ break;
1223+ }
1224+ return 0;
1225+}
1226+
1227+static struct target_type bbr_target = {
1228+ name: "bbr",
1229+ module: THIS_MODULE,
1230+ ctr: bbr_ctr,
1231+ dtr: bbr_dtr,
1232+ map: bbr_map,
1233+ end_io: bbr_callback,
1234+ status: bbr_status,
1235+};
1236+
1237+int __init dm_bbr_init(void)
1238+{
1239+ int r = dm_register_target(&bbr_target);
1240+
1241+ if (r < 0)
1242+ DMERR("dm-bbr: register failed %d", r);
1243+
1244+ return r;
1245+}
1246+
1247+void __exit dm_bbr_exit(void)
1248+{
1249+ int r = dm_unregister_target(&bbr_target);
1250+
1251+ if (r < 0)
1252+ DMERR("dm-bbr: unregister failed %d", r);
1253+}
1254+
1255+module_init(dm_bbr_init);
1256+module_exit(dm_bbr_exit);
1257+MODULE_LICENSE("GPL");
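For reference (not part of the patch): bbr_ctr() above takes its eight table arguments in the order spelled out by its own usage string, i.e. device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size. A minimal sketch of the sizing arithmetic those arguments imply, assuming only BBR_ENTRIES_PER_SECT = 31 from dm-bbr.h (each 512-byte table sector describes at most 31 remaps, so each copy of the table needs one sector per 31 replacement blocks, rounded up):

/* Illustrative sketch only; mirrors BBR_ENTRIES_PER_SECT from dm-bbr.h. */
#define BBR_ENTRIES_PER_SECT 31

/* Table sectors needed per table copy for a given replacement-block pool. */
static unsigned long bbr_table_sectors(unsigned long nr_replacement_blks)
{
	return (nr_replacement_blks + BBR_ENTRIES_PER_SECT - 1) /
	       BBR_ENTRIES_PER_SECT;
}

For example, a pool of 1024 replacement blocks needs bbr_table_sectors(1024) = 34 sectors for table1 and another 34 for the table2 copy, which is the value table_size (nr_sects_bbr_table) would carry.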
1258diff -urN linux-2.4.22/drivers/md/dm-bbr.h linux-2.4.22-evms/drivers/md/dm-bbr.h
1259--- linux-2.4.22/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1260+++ linux-2.4.22-evms/drivers/md/dm-bbr.h 2003-09-15 17:08:42.000000000 +0200
1261@@ -0,0 +1,148 @@
1262+/*
1263+ * Copyright (c) International Business Machines Corp., 2002-2003
1264+ *
1265+ * This program is free software; you can redistribute it and/or modify
1266+ * it under the terms of the GNU General Public License as published by
1267+ * the Free Software Foundation; either version 2 of the License, or
1268+ * (at your option) any later version.
1269+ *
1270+ * This program is distributed in the hope that it will be useful,
1271+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1272+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1273+ * the GNU General Public License for more details.
1274+ *
1275+ * You should have received a copy of the GNU General Public License
1276+ * along with this program; if not, write to the Free Software
1277+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1278+ *
1279+ * linux/drivers/md/dm-bbr.h
1280+ *
1281+ * Bad-block-relocation (BBR) target for device-mapper.
1282+ *
1283+ * The BBR target is designed to remap I/O write failures to another safe
1284+ * location on disk. Note that most disk drives have BBR built into them;
1285+ * this means that our software BBR will only be activated when all hardware
1286+ * BBR replacement sectors have been used.
1287+ */
1288+
1289+#ifndef _DM_BBR_H_
1290+#define _DM_BBR_H_
1291+
1292+#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1293+#define BBR_ENTRIES_PER_SECT 31
1294+#define BBR_NR_BUFS 128
1295+#define INITIAL_CRC 0xFFFFFFFF
1296+#define CRC_POLYNOMIAL 0xEDB88320L
1297+
1298+/**
1299+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1300+ * Use these in place of %Ld, %Lu, and %Lx.
1301+ **/
1302+#if BITS_PER_LONG > 32
1303+#define PFU64 "%lu"
1304+#else
1305+#define PFU64 "%Lu"
1306+#endif
1307+
1308+/**
1309+ * struct bbr_table_entry
1310+ * @bad_sect: LBA of bad location.
1311+ * @replacement_sect: LBA of new location.
1312+ *
1313+ * Structure to describe one BBR remap.
1314+ **/
1315+struct bbr_table_entry {
1316+ u64 bad_sect;
1317+ u64 replacement_sect;
1318+};
1319+
1320+/**
1321+ * struct bbr_table
1322+ * @signature: Signature on each BBR table sector.
1323+ * @crc: CRC for this table sector.
1324+ * @sequence_number: Used to resolve conflicts when primary and secondary
1325+ * tables do not match.
1326+ * @in_use_cnt: Number of in-use table entries.
1327+ * @entries: Actual table of remaps.
1328+ *
1329+ * Structure to describe each sector of the metadata table. Each sector in this
1330+ * table can describe 31 remapped sectors.
1331+ **/
1332+struct bbr_table {
1333+ u32 signature;
1334+ u32 crc;
1335+ u32 sequence_number;
1336+ u32 in_use_cnt;
1337+ struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1338+};
1339+
1340+/**
1341+ * struct bbr_runtime_remap
1342+ *
1343+ * Node in the binary tree used to keep track of remaps.
1344+ **/
1345+struct bbr_runtime_remap {
1346+ struct bbr_table_entry remap;
1347+ struct bbr_runtime_remap *left;
1348+ struct bbr_runtime_remap *right;
1349+};
1350+
1351+/**
1352+ * struct bbr_private
1353+ * @dev: Info about underlying device.
1354+ * @bbr_table: Copy of metadata table.
1355+ * @offset: LBA of data area.
1356+ * @lba_table1: LBA of primary BBR table.
1357+ * @lba_table2: LBA of secondary BBR table.
1358+ * @nr_sects_bbr_table: Size of each BBR table.
1359+ * @nr_replacement_blks: Number of replacement blocks.
1360+ * @start_replacement_sect: LBA of start of replacement blocks.
1361+ * @blksize_in_sects: Size of each block.
1362+ * @in_use_replacement_blks: Current number of remapped blocks.
1363+ * @remap_root: Binary tree containing all remaps.
1364+ * @bbr_id_lock: Lock for the binary tree.
1365+ *
1366+ * Private data for each BBR target.
1367+ **/
1368+struct bbr_private {
1369+ struct dm_dev * dev;
1370+ struct bbr_table * bbr_table;
1371+ struct bbr_runtime_remap * remap_root;
1372+ u64 offset;
1373+ u64 lba_table1;
1374+ u64 lba_table2;
1375+ u64 nr_sects_bbr_table;
1376+ u64 start_replacement_sect;
1377+ u64 nr_replacement_blks;
1378+ u32 blksize_in_sects;
1379+ atomic_t in_use_replacement_blks;
1380+ spinlock_t bbr_id_lock;
1381+};
1382+
1383+#define BBR_IO_HANDLED (1<<0)
1384+#define BBR_IO_RELOCATE (1<<1)
1385+
1386+/**
1387+ * struct bbr_io_buffer
1388+ * @bbr_io_list: Thread's list of bbr_io_buf's.
1389+ * @bbr_id: Object for this request.
1390+ * @bh: Original buffer_head.
1391+ * @sector: Original sector.
1392+ * @flags: Operation flags (BBR_IO_*).
1393+ * @rw: READ or WRITE.
1394+ * @rc: Return code from bbr_io_handler.
1395+ *
1396+ * Structure used to track each write request.
1397+ **/
1398+struct bbr_io_buffer {
1399+ struct list_head bbr_io_list;
1400+ struct bbr_private *bbr_id;
1401+ struct buffer_head *bh;
1402+ u64 sector;
1403+ u32 flags;
1404+ s32 rw;
1405+ s32 rc;
1406+};
1407+
1408+#endif
1409+
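A quick check (not from the patch; assumes the usual C alignment rules and userspace stdint types matching the kernel's u32/u64) of why BBR_ENTRIES_PER_SECT is 31: the four u32 header fields plus 31 sixteen-byte entries fill exactly one 512-byte sector, which is why bbr_setup() and bbr_io_remap_error() move the table one sector at a time.

#include <assert.h>
#include <stdint.h>

/* Userspace re-declaration of the on-disk layout from dm-bbr.h above. */
struct bbr_table_entry {
	uint64_t bad_sect;
	uint64_t replacement_sect;
};

struct bbr_table {
	uint32_t signature;
	uint32_t crc;
	uint32_t sequence_number;
	uint32_t in_use_cnt;
	struct bbr_table_entry entries[31];	/* BBR_ENTRIES_PER_SECT */
};

int main(void)
{
	/* 4 * 4 + 31 * 16 = 16 + 496 = 512 bytes: one table block per sector. */
	assert(sizeof(struct bbr_table) == 512);
	return 0;
}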
1410diff -urN linux-2.4.22/drivers/md/dm-snapshot.c linux-2.4.22-evms/drivers/md/dm-snapshot.c
1411--- linux-2.4.22/drivers/md/dm-snapshot.c 2003-09-15 17:07:45.000000000 +0200
1412+++ linux-2.4.22-evms/drivers/md/dm-snapshot.c 2003-09-15 17:08:35.000000000 +0200
1413@@ -92,6 +92,9 @@
1414
1415 /* List of snapshots for this origin */
1416 struct list_head snapshots;
1417+
1418+ /* Count of snapshots and origins referencing this structure. */
1419+ unsigned int count;
1420 };
1421
1422 /*
1423@@ -155,6 +158,35 @@
1424 }
1425
1426 /*
1427+ * Allocate and initialize an origin structure.
1428+ */
1429+static struct origin * __alloc_origin(kdev_t dev)
1430+{
1431+ struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1432+ if (o) {
1433+ o->dev = dev;
+ o->count = 0;
1434+ INIT_LIST_HEAD(&o->hash_list);
1435+ INIT_LIST_HEAD(&o->snapshots);
1436+ __insert_origin(o);
1437+ }
1438+ return o;
1439+}
1440+
1441+static void __get_origin(struct origin *o)
1442+{
1443+ o->count++;
1444+}
1445+
1446+static void __put_origin(struct origin *o)
1447+{
1448+ o->count--;
1449+ if (o->count == 0) {
1450+ list_del(&o->hash_list);
1451+ kfree(o);
1452+ }
1453+}
1454+
1455+/*
1456 * Make a note of the snapshot and its origin so we can look it
1457 * up when the origin has a write on it.
1458 */
1459@@ -168,20 +200,37 @@
1460
1461 if (!o) {
1462 /* New origin */
1463- o = kmalloc(sizeof(*o), GFP_KERNEL);
1464+ o = __alloc_origin(dev);
1465 if (!o) {
1466 up_write(&_origins_lock);
1467 return -ENOMEM;
1468 }
1469+ }
1470
1471- /* Initialise the struct */
1472- INIT_LIST_HEAD(&o->snapshots);
1473- o->dev = dev;
1474+ __get_origin(o);
1475+ list_add_tail(&snap->list, &o->snapshots);
1476
1477- __insert_origin(o);
1478+ up_write(&_origins_lock);
1479+ return 0;
1480+}
1481+
1482+static int register_origin(kdev_t dev)
1483+{
1484+ struct origin *o;
1485+
1486+ down_write(&_origins_lock);
1487+ o = __lookup_origin(dev);
1488+
1489+ if (!o) {
1490+ /* New origin */
1491+ o = __alloc_origin(dev);
1492+ if (!o) {
1493+ up_write(&_origins_lock);
1494+ return -ENOMEM;
1495+ }
1496 }
1497
1498- list_add_tail(&snap->list, &o->snapshots);
1499+ __get_origin(o);
1500
1501 up_write(&_origins_lock);
1502 return 0;
1503@@ -195,11 +244,18 @@
1504 o = __lookup_origin(s->origin->dev);
1505
1506 list_del(&s->list);
1507- if (list_empty(&o->snapshots)) {
1508- list_del(&o->hash_list);
1509- kfree(o);
1510- }
1511+ __put_origin(o);
1512+
1513+ up_write(&_origins_lock);
1514+}
1515+
1516+static void unregister_origin(kdev_t dev)
1517+{
1518+ struct origin *o;
1519
1520+ down_write(&_origins_lock);
1521+ o = __lookup_origin(dev);
1522+ __put_origin(o);
1523 up_write(&_origins_lock);
1524 }
1525
1526@@ -1090,6 +1146,13 @@
1527 return r;
1528 }
1529
1530+ r = register_origin(dev->dev);
1531+ if (r) {
1532+ ti->error = "Cannot register origin";
1533+ dm_put_device(ti, dev);
1534+ return r;
1535+ }
1536+
1537 ti->private = dev;
1538 return 0;
1539 }
1540@@ -1097,6 +1160,7 @@
1541 static void origin_dtr(struct dm_target *ti)
1542 {
1543 struct dm_dev *dev = (struct dm_dev *) ti->private;
1544+ unregister_origin(dev->dev);
1545 dm_put_device(ti, dev);
1546 }
1547
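A sketch (not from the patch) of the origin lifetime implied by the dm-snapshot hunks above: both snapshot targets and the new origin target now take a reference on the shared struct origin, so it is freed only when the last holder drops it rather than as soon as the snapshot list becomes empty. The snapshot-side call sites are assumed to be snapshot_ctr()/snapshot_dtr(), where register_snapshot()/unregister_snapshot() are called in the stock driver; the origin-side call sites are visible in the hunks above.

#include <assert.h>

/* Minimal userspace model of the new reference counting; illustrative only. */
struct origin_model { unsigned int count; };

static void get_origin(struct origin_model *o)
{
	o->count++;			/* __get_origin() */
}

/* Returns 1 when the real __put_origin() would unhash and kfree the origin. */
static int put_origin(struct origin_model *o)
{
	return --o->count == 0;
}

int main(void)
{
	struct origin_model o = { .count = 0 };

	get_origin(&o);			/* register_snapshot(): first snapshot   */
	get_origin(&o);			/* register_origin() from origin_ctr()   */
	assert(!put_origin(&o));	/* snapshot removed: origin must survive */
	assert(put_origin(&o));		/* origin target removed: now it is freed */
	return 0;
}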
1548diff -urN linux-2.4.22/drivers/md/dm-sparse.c linux-2.4.22-evms/drivers/md/dm-sparse.c
1549--- linux-2.4.22/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1550+++ linux-2.4.22-evms/drivers/md/dm-sparse.c 2003-09-15 17:09:48.000000000 +0200
1551@@ -0,0 +1,713 @@
1552+/* -*- linux-c -*- */
1553+
1554+/*
1555+ * Copyright (c) International Business Machines Corp., 2002
1556+ *
1557+ * This program is free software; you can redistribute it and/or modify
1558+ * it under the terms of the GNU General Public License as published by
1559+ * the Free Software Foundation; either version 2 of the License, or
1560+ * (at your option) any later version.
1561+ *
1562+ * This program is distributed in the hope that it will be useful,
1563+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1564+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1565+ * the GNU General Public License for more details.
1566+ *
1567+ * You should have received a copy of the GNU General Public License
1568+ * along with this program; if not, write to the Free Software
1569+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1570+ *
1571+ * linux/drivers/md/dm-sparse.c
1572+ *
1573+ * Sparse target for device-mapper.
1574+ *
1575+ * This target provides the ability to create a sparse device. This
1576+ * allows a device to pretend to be larger than it really is.
1577+ */
1578+
1579+#include <linux/module.h>
1580+#include <linux/init.h>
1581+#include <linux/blkdev.h>
1582+#include <linux/slab.h>
1583+#include <linux/mempool.h>
1584+#include <linux/vmalloc.h>
1585+
1586+#include "dm.h"
1587+#include "dm-io.h"
1588+
1589+#define MAX_HASH_CHAIN_ENTRIES 10
1590+#define NAME_SIZE 127
1591+
1592+/* Sparse Ioctl
1593+ device
1594+ start
1595+ chunk_size
1596+ chunks
1597+ */
1598+
1599+// Entries in the sparse remapping structure
1600+struct sparse_hash_entry {
1601+ u64 org_chunk; // Chunk number, not LBA.
1602+ u64 sparse_chunk; // Chunk number, not LBA.
1603+ struct sparse_hash_entry * next;
1604+ struct sparse_hash_entry * prev;
1605+};
1606+
1607+//Private data structure
1608+struct sparse_volume {
1609+ struct dm_dev *dev;
1610+ struct rw_semaphore sparse_semaphore;
1611+ struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1612+ struct sparse_hash_entry * free_hash_list;
1613+ kmem_cache_t * hash_slab;
1614+ mempool_t * hash_pool;
1615+ u32 dm_io_flag;
1616+ u32 chunk_size; // Sectors.
1617+ u32 chunk_shift; // Shift value for chunk size.
1618+ u32 num_chunks; // In this volume.
1619+ u32 next_cow_entry; // Index into current COW table.
1620+ u64 current_cow_sector; // LOGICAL sector of current COW table.
1621+ u32 next_free_chunk; // Index of next free chunk (not LBA!).
1622+ u32 hash_table_size; // Size of the hash table for the remap.
1623+ u64 start;
1624+ u64 cow_table[64]; // One sector's worth of COW tables.
1625+};
1626+
1627+/*************************** OLD SERVICES ****************************/
1628+
1629+/* computes log base 2 of value */
1630+inline int log2(u32 value) //ok to change to u32?
1631+{
1632+ int result = -1;
1633+ long tmp; //ok to change to long?
1634+
1635+ if (value) {
1636+ tmp = value;
1637+ result++;
1638+ while (!(tmp & 1)) {
1639+ result++;
1640+ tmp >>= 1;
1641+ }
1642+ if (tmp != 1) {
1643+ result = -2;
1644+ }
1645+ }
1646+ return result;
1647+}
1648+
1649+/********************************* Functions *********************************/
1650+
1651+/***************************** Hash Functions *****************************/
1652+
1653+/* Take an entry from the free hash list and initialize it. */
1654+static struct sparse_hash_entry *
1655+allocate_sparse_hash_entry( struct sparse_volume * volume,
1656+ u64 org_chunk,
1657+ u64 sparse_chunk )
1658+{
1659+ struct sparse_hash_entry * hash_entry;
1660+
1661+ hash_entry = volume->free_hash_list;
1662+ if ( hash_entry ) { // should always be the case because we preallocate these
1663+ volume->free_hash_list = hash_entry->next;
1664+ hash_entry->org_chunk = org_chunk;
1665+ hash_entry->sparse_chunk = sparse_chunk;
1666+ hash_entry->next = NULL;
1667+ hash_entry->prev = NULL;
1668+ }
1669+
1670+ return hash_entry;
1671+}
1672+
1673+/*
1674+ * This function inserts a new entry into a sparse hash chain, immediately
1675+ * following the specified entry. This function should not be used to add
1676+ * an entry into an empty list, or as the first entry in an existing list.
1677+ * For that case, use insert_sparse_hash_entry_at_head().
1678+ */
1679+static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1680+ struct sparse_hash_entry * base )
1681+{
1682+ entry->next = base->next;
1683+ entry->prev = base;
1684+ base->next = entry;
1685+ if ( entry->next ) {
1686+ entry->next->prev = entry;
1687+ }
1688+ return 0;
1689+}
1690+
1691+/*
1692+ * This function inserts a new entry into a sparse chain as the first
1693+ * entry in the chain.
1694+ */
1695+static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1696+ struct sparse_hash_entry ** head )
1697+{
1698+ entry->next = *head;
1699+ entry->prev = NULL;
1700+ *head = entry;
1701+ if ( entry->next ) {
1702+ entry->next->prev = entry;
1703+ }
1704+ return 0;
1705+}
1706+
1707+/*
1708+ * Delete all items in a single chain in the hash table.
1709+ */
1710+static int delete_sparse_hash_chain( struct sparse_volume * vol,
1711+ struct sparse_hash_entry * head )
1712+{
1713+ struct sparse_hash_entry * next;
1714+
1715+ while ( head ) {
1716+ next = head->next;
1717+ mempool_free( head, vol->hash_pool );
1718+ head = next;
1719+ }
1720+ return 0;
1721+}
1722+
1723+/*
1724+ * This function will search the hash chain that is anchored at the
1725+ * specified head pointer. If the chunk number is found, a pointer to that
1726+ * entry in the chain is set, and a 1 is returned. If the chunk is not
1727+ * found, a pointer to the previous entry is set and 0 is returned. If the
1728+ * return pointer is NULL, this means either the list is empty, or the
1729+ * specified chunk should become the first list item.
1730+ */
1731+static int search_sparse_hash_chain( u64 chunk,
1732+ struct sparse_hash_entry * head,
1733+ struct sparse_hash_entry ** result )
1734+{
1735+ struct sparse_hash_entry * curr = head;
1736+ struct sparse_hash_entry * prev = head;
1737+ while ( curr && curr->org_chunk < chunk ) {
1738+ prev = curr;
1739+ curr = curr->next;
1740+ }
1741+ if (!curr) { // Either an empty chain or went off the end of the chain.
1742+ *result = prev;
1743+ return 0;
1744+ }
1745+ else if ( curr->org_chunk != chunk ) {
1746+ *result = curr->prev;
1747+ return 0;
1748+ }
1749+ else {
1750+ *result = curr;
1751+ return 1;
1752+ }
1753+}
1754+
1755+/*
1756+ * This function takes a COW table entry (from the on-disk data),
1757+ * converts it into an appropriate entry for the sparse map, and
1758+ * inserts it into the map for the specified volume.
1759+ */
1760+static int add_cow_entry_to_sparse_map( u64 org_chunk,
1761+ u64 sparse_chunk,
1762+ struct sparse_volume * volume )
1763+{
1764+ struct sparse_hash_entry * new_entry;
1765+ struct sparse_hash_entry * target_entry;
1766+ u32 hash_value;
1767+ int rc = -EINVAL;
1768+
1769+ new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1770+ if (!new_entry) {
1771+ return -ENOMEM;
1772+ }
1773+
1774+ hash_value = (long)org_chunk % volume->hash_table_size;
1775+
1776+ if (! search_sparse_hash_chain( org_chunk,
1777+ volume->sparse_map[hash_value],
1778+ &target_entry ) ) {
1779+ //should always take this path
1780+
1781+ if ( target_entry ) {
1782+ insert_sparse_hash_entry( new_entry, target_entry );
1783+ }
1784+ else {
1785+ insert_sparse_hash_entry_at_head
1786+ ( new_entry, &(volume->sparse_map[hash_value]) );
1787+ }
1788+ rc = 0;
1789+ }
1790+ return rc;
1791+}
1792+
1793+/*
1794+ * Construct the initial hash table state based on
1795+ * existing COW tables on the disk.
1796+ */
1797+static int build_sparse_maps(struct sparse_volume * volume)
1798+{
1799+ int rc = 0, done = 0;
1800+ struct io_region job;
1801+ struct page * page;
1802+ unsigned int error, offset;
1803+
1804+ while (!done) {
1805+
1806+ // Read in one sector's worth of COW tables.
1807+ job.dev = volume->dev->dev;
1808+ job.sector = volume->current_cow_sector;
1809+ job.count = 1;
1810+ page = virt_to_page(volume->cow_table);
1811+ offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1812+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
1813+ if (rc) {
1814+ return rc;
1815+ }
1816+
1817+ // Translate every valid COW table entry into
1818+ // a sparse map entry.
1819+ for ( volume->next_cow_entry = 0;
1820+
1821+ volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1822+ volume->cow_table[volume->next_cow_entry] !=
1823+ 0xffffffffffffffff;
1824+
1825+ volume->next_cow_entry++, volume->next_free_chunk++ ) {
1826+
1827+ if ( (rc = add_cow_entry_to_sparse_map
1828+ ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1829+ volume->next_free_chunk, volume ))) {
1830+ return( rc );
1831+ }
1832+ }
1833+ // Move on to the next sector if necessary.
1834+ if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1835+ volume->current_cow_sector++;
1836+ }
1837+ else {
1838+ done = 1;
1839+ }
1840+ }
1841+ return 0;
1842+}
1843+
1844+/************************* Other Functions ************************/
1845+
1846+/*
1847+ * Function: sparse_remap_chunk
1848+ *
1849+ * This function performs a sector remap on a sparse volume. This should
1850+ * be called from the I/O path, It first determines the base sector
1851+ * of the chunk containing the specified sector, and saves the remainder.
1852+ * Then it performs a search through the sparse map for the specified
1853+ * volume. If a match is found, the sector number is changed to the new
1854+ * value. If no match is found, the value is left the same, meaning the
1855+ * chunk has not been remapped.
1856+ */
1857+static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1858+ u64 * sector )
1859+{
1860+ struct sparse_hash_entry * result;
1861+ u64 chunk;
1862+ u32 hash_value;
1863+ u32 remainder;
1864+ int rc = 1;
1865+
1866+ down_read(&sparse_volume->sparse_semaphore);
1867+
1868+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1869+ chunk = *sector >> sparse_volume->chunk_shift;
1870+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1871+
1872+ if ( search_sparse_hash_chain( chunk,
1873+ sparse_volume->sparse_map[hash_value],
1874+ &result) ) {
1875+ *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1876+ + remainder;
1877+ rc = 0;
1878+ }
1879+ up_read(&sparse_volume->sparse_semaphore);
1880+ return rc;
1881+}
1882+
1883+/* Function: sparse_cow_write
1884+ *
1885+ * Check this sparse node to see if the given sector/chunk has been
1886+ * remapped yet. If it hasn't, create a new hash table entry, update the
1887+ * in-memory COW table, and write the COW table to disk.
1888+ */
1889+
1890+static int sparse_cow_write( struct sparse_volume * sparse_volume,
1891+ u64 * sector )
1892+{
1893+ struct sparse_hash_entry * target_entry, * new_map_entry;
1894+ struct io_region job;
1895+ struct page * page;
1896+ char * cow = NULL;
1897+ unsigned int error, offset;
1898+ u64 chunk;
1899+ u32 hash_value = 0;
1900+ u32 remainder;
1901+ int rc;
1902+
1903+ down_write(&sparse_volume->sparse_semaphore);
1904+
1905+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1906+ chunk = *sector >> sparse_volume->chunk_shift;
1907+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1908+
1909+ if ( search_sparse_hash_chain( chunk,
1910+ sparse_volume->sparse_map[hash_value],
1911+ &target_entry) ) {
1912+ *sector =
1913+ ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1914+ + remainder;
1915+ rc = 0;
1916+ goto out;
1917+ }
1918+
1919+ // Is there enough room left on this sparse to remap this chunk?
1920+ if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1921+ DMERR("dm-sparse: full, no new remaps allowed\n");
1922+ rc = -ENOSPC;
1923+ goto out;
1924+ }
1925+
1926+ // Create and initialize a new hash table entry for the new remap.
1927+ new_map_entry = allocate_sparse_hash_entry
1928+ (sparse_volume, chunk, sparse_volume->next_free_chunk);
1929+ if ( ! new_map_entry ) {
1930+ // Can't get memory for the map entry. Fail this write.
1931+ DMERR("dm-sparse: memory error allocating hash entry\n");
1932+ rc = -ENOMEM;
1933+ goto out;
1934+ }
1935+
1936+ // Always write the COW table so it is safe on disk.
1937+ cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1938+ if (! cow ) {
1939+ // Can't get an I/O buffer. Fail this write.
1940+ DMERR("dm-sparse: memory error allocating COW table buffer");
1941+ rc = -ENOMEM;
1942+ goto out;
1943+ }
1944+
1945+ // Add the entry to the hash table.
1946+ if ( target_entry ) {
1947+ insert_sparse_hash_entry( new_map_entry, target_entry );
1948+ }
1949+ else {
1950+ insert_sparse_hash_entry_at_head
1951+ ( new_map_entry,
1952+ &(sparse_volume->sparse_map[hash_value]) );
1953+ }
1954+
1955+ sparse_volume->next_free_chunk++;
1956+
1957+ // Update the appropriate entry in the COW table.
1958+ sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1959+ cpu_to_le64(chunk);
1960+ sparse_volume->next_cow_entry++;
1961+
1962+ memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
1963+
1964+ // Because of ordering issues, this write needs to be synchronous.
1965+ job.dev = sparse_volume->dev->dev;
1966+ job.sector = sparse_volume->current_cow_sector;
1967+ job.count = 1;
1968+ page = virt_to_page(cow);
1969+ offset = (unsigned long)cow & ~PAGE_MASK;
1970+ dm_io_sync(1, &job, WRITE, page, offset, &error);
1971+
1972+ // Update the in-memory COW table values.
1973+ if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
1974+ {
1975+ sparse_volume->next_cow_entry = 0;
1976+ sparse_volume->current_cow_sector++;
1977+ memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
1978+ }
1979+
1980+ *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
1981+ + remainder;
1982+
1983+ rc = 0;
1984+
1985+ out:
1986+ up_write(&sparse_volume->sparse_semaphore);
1987+ if ( cow ) {
1988+ kfree( cow );
1989+ }
1990+
1991+ return rc;
1992+}
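The COW table behind this is a single 512-byte sector of little-endian u64 chunk numbers that is rewritten synchronously after every new remap; once all SECTOR_SIZE/sizeof(u64) slots are used, the code advances to the next metadata sector and resets the in-memory copy to the 0xff fill pattern. A compact userspace sketch of that bookkeeping (illustrative names only, not from the patch):

/* Userspace sketch of the COW-table bookkeeping in sparse_cow_write():
 * append a chunk number, then advance to the next metadata sector once the
 * current one is full. */
#include <stdint.h>
#include <string.h>

#define SECTOR_SIZE        512
#define ENTRIES_PER_SECTOR (SECTOR_SIZE / sizeof(uint64_t))   /* 64 */

struct cow_state {
	uint64_t table[ENTRIES_PER_SECTOR]; /* in-memory copy of one metadata sector */
	unsigned next_entry;                /* next free slot in that sector */
	uint64_t current_sector;            /* on-disk sector the copy belongs to */
};

static void cow_append(struct cow_state *c, uint64_t logical_chunk)
{
	c->table[c->next_entry++] = logical_chunk; /* the kernel stores cpu_to_le64() */

	/* ...the whole sector is then written out synchronously... */

	if (c->next_entry >= ENTRIES_PER_SECTOR) {
		c->next_entry = 0;
		c->current_sector++;
		memset(c->table, 0xff, SECTOR_SIZE); /* 0xff..ff marks unused slots */
	}
}

int main(void)
{
	struct cow_state c = { .next_entry = 0, .current_sector = 1 };

	memset(c.table, 0xff, SECTOR_SIZE); /* fresh table: every slot unused */
	cow_append(&c, 42);                 /* record a remap of logical chunk 42 */
	return 0;
}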
1993+
1994+/************************ EXPORT FUNCTIONS ************************/
1995+
1996+/*
1997+ * Function: sparse_dtr
1998+ */
1999+static void sparse_dtr( struct dm_target *ti )
2000+{
2001+ struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2002+ int i;
2003+
2004+ if (vol) {
2005+
2006+ if (vol->sparse_map) {
2007+ for ( i = 0; i < vol->hash_table_size; i++ ) {
2008+ delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2009+ }
2010+ delete_sparse_hash_chain( vol, vol->free_hash_list );
2011+ vfree(vol->sparse_map);
2012+ }
2013+
2014+ if (vol->hash_pool)
2015+ mempool_destroy(vol->hash_pool);
2016+
2017+ if (vol->hash_slab)
2018+ kmem_cache_destroy(vol->hash_slab);
2019+
2020+ dm_put_device(ti, vol->dev);
2021+
2022+ if (vol->dm_io_flag) {
2023+ dm_io_put(1);
2024+ }
2025+
2026+ kfree( vol );
2027+ }
2028+}
2029+
2030+/*
2031+ * Function: sparse_ctr
2032+ */
2033+static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2034+{
2035+ int i, rc = -EINVAL;
2036+ struct sparse_hash_entry *new_entry;
2037+ struct sparse_volume *vol;
2038+ struct dm_dev *dev;
2039+ u32 chunk_size, chunks;
2040+ u64 start;
2041+ char* end, slab_name[NAME_SIZE+1];
2042+
2043+ if ( argc != 4 ) {
2044+ ti->error="dm-sparse: wrong number of arguments";
2045+ return rc;
2046+ }
2047+
2048+ start = simple_strtoull(argv[1], &end, 10);
2049+ if (*end) {
2050+ ti->error="dm-sparse: Invalid first chunk lba";
2051+ return rc;
2052+ }
2053+
2054+ chunk_size = simple_strtoul(argv[2], &end, 10);
2055+ if (*end) {
2056+ ti->error="dm-sparse: Invalid chunk_size";
2057+ return rc;
2058+ }
2059+
2060+ chunks = simple_strtoul(argv[3], &end, 10);
2061+ if (*end) {
2062+ ti->error="dm-sparse: Invalid number of chunks";
2063+ return rc;
2064+ }
2065+
2066+ if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2067+ dm_table_get_mode(ti->table), &dev ) ) {
2068+ ti->error = "dm-sparse: Device lookup failed";
2069+ return rc;
2070+ }
2071+
2072+ vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2073+ if ( !vol ) {
2074+ ti->error = "dm-sparse: Memory allocation for private-data failed";
2075+ rc = -ENOMEM;
2076+ goto out;
2077+ }
2078+
2079+ memset( vol, 0, sizeof(struct sparse_volume) );
2080+
2081+ rc = dm_io_get(1);
2082+ if (rc) {
2083+ ti->error = "dm-sparse: failed to initialize dm-io.";
2084+ sparse_dtr(ti);
2085+ return rc;
2086+ }
2087+
2088+ // Initialize
2089+ vol->dm_io_flag = 1;
2090+ vol->chunk_size = chunk_size;
2091+ vol->chunk_shift = log2(chunk_size);
2092+ vol->num_chunks = chunks;
2093+ vol->current_cow_sector = 1;
2094+ vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2095+ vol->start = start;
2096+ vol->dev = dev;
2097+ init_rwsem(&vol->sparse_semaphore);
2098+
2099+ snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2100+ vol->hash_slab = kmem_cache_create(slab_name,
2101+ sizeof(struct sparse_hash_entry),
2102+ 0, SLAB_HWCACHE_ALIGN,
2103+ NULL, NULL);
2104+ if ( ! vol->hash_slab ) {
2105+ ti->error = "dm-sparse: memory allocation error in hash slab create";
2106+ sparse_dtr(ti);
2107+ return -ENOMEM;
2108+ }
2109+ vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2110+ mempool_free_slab,
2111+ vol->hash_slab);
2112+ if ( ! vol->hash_pool ) {
2113+ ti->error = "dm-sparse: memory allocation error in hash pool create";
2114+ sparse_dtr(ti);
2115+ return -ENOMEM;
2116+ }
2117+
2118+ // Sparse hash table
2119+ vol->sparse_map = vmalloc( vol->hash_table_size *
2120+ sizeof( struct sparse_hash_entry * ) );
2121+ if ( ! vol->sparse_map ) {
2122+ ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2123+ sparse_dtr(ti);
2124+ return -ENOMEM;
2125+ }
2126+
2127+ memset( vol->sparse_map, 0, vol->hash_table_size *
2128+ sizeof( struct sparse_hash_entry * ) );
2129+
2130+ for ( i = 0; i < chunks; i++ ) {
2131+
2132+ new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2133+ if ( ! new_entry ) {
2134+ ti->error="dm-sparse: memory allocation error in hash table setup";
2135+ sparse_dtr(ti);
2136+ return -ENOMEM;
2137+ }
2138+
2139+ new_entry->next = vol->free_hash_list;
2140+ vol->free_hash_list = new_entry;
2141+ }
2142+
2143+ rc = build_sparse_maps(vol);
2144+ if (rc) {
2145+ ti->error = "dm-sparse: error building hash tables";
2146+ sparse_dtr(ti);
2147+ return rc;
2148+ }
2149+
2150+ ti->private = vol;
2151+ return rc;
2152+
2153+ out:
2154+ dm_put_device(ti, dev);
2155+ return rc;
2156+}
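For reference, the constructor takes exactly four arguments: the backing device, the LBA of the first data chunk, the chunk size in sectors, and the number of chunks that may ever be remapped. A hypothetical dmsetup table line (illustrative only, not taken from the patch or its documentation) would therefore look like "0 2097152 sparse /dev/hdb1 8 128 4096", i.e. a 2097152-sector logical device backed by 4096 chunks of 128 sectors each, starting at sector 8 of /dev/hdb1. Because the remap path uses shift-and-mask arithmetic, the chunk size is assumed to be a power of two.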
2157+
2158+/*
2159+ * Function: sparse_map
2160+ */
2161+static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2162+ union map_info *map_context )
2163+{
2164+ struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2165+ u64 sector = bh->b_rsector;
2166+ int rc;
2167+
2170+ // Check if this sector has been remapped
2171+ rc = sparse_remap_chunk( volume, &sector );
2172+
2173+ if ( rc < 0 ) { //Error
2174+ bh->b_end_io(bh, 0);
2175+ return rc;
2176+ }
2177+
2178+ if ( rc == 0 ) { // Remapped I/O: reads and writes use the same logic
2179+ bh->b_rsector = volume->start + sector;
2180+ bh->b_rdev = volume->dev->dev;
2181+ return 1;
2182+ }
2183+
2184+ // (Previously) un-mapped: reads and writes take different paths
2185+
2186+ if ( rw ) { // Write:
2187+ rc = sparse_cow_write( volume, &sector );
2188+
2189+ if ( rc < 0 ) { //Error
2190+ bh->b_end_io(bh, 0);
2191+ return rc;
2192+ }
2193+ // Send the write on.
2194+ bh->b_rsector = volume->start + sector;
2195+ bh->b_rdev = volume->dev->dev;
2196+ return 1;
2197+ }
2198+
2199+ // Reading something that was never written:
2200+ // return zeros and indicate completion.
2201+ memset(bh->b_data, 0x0, bh->b_size);
2202+ bh->b_end_io(bh, 1);
2203+ return 0;
2204+}
2205+
2206+static int sparse_status( struct dm_target *ti, status_type_t type,
2207+ char *result, unsigned int maxlen )
2208+{
2209+ struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2210+
2211+ switch(type) {
2212+
2213+ case STATUSTYPE_INFO:
2214+ snprintf( result, maxlen, "%d%%",
2215+ ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2216+ break;
2217+
2218+ case STATUSTYPE_TABLE:
2219+ snprintf( result, maxlen, "%s %Lu %u %u",
2220+ dm_kdevname(vol->dev->dev), vol->start,
2221+ vol->chunk_size, vol->num_chunks );
2222+ break;
2223+
2224+ default:
2225+ break;
2226+ }
2227+
2228+ return 0;
2229+}
2230+
2231+/****************** FUNCTION TABLE **********************/
2232+
2233+static struct target_type sparse_target = {
2234+ .name = "sparse",
2235+ .module = THIS_MODULE,
2236+ .ctr = sparse_ctr,
2237+ .dtr = sparse_dtr,
2238+ .map = sparse_map,
2239+ .status = sparse_status,
2240+};
2241+
2242+/********************* REGISTRATION *****************/
2243+
2244+int __init sparse_init(void)
2245+{
2246+ int rc = dm_register_target(&sparse_target);
2247+
2248+ if ( rc < 0 )
2249+ DMWARN("sparse target registration failed");
2250+
2251+ return rc;
2252+}
2253+
2254+void __exit sparse_exit(void)
2255+{
2256+ if (dm_unregister_target(&sparse_target) )
2257+ DMWARN("sparse target unregistration failed");
2258+
2259+ return;
2260+}
2261+
2262+module_init(sparse_init);
2263+module_exit(sparse_exit);
2264+MODULE_LICENSE("GPL");
2265diff -urN linux-2.4.22/drivers/md/multipath.c linux-2.4.22-evms/drivers/md/multipath.c
2266--- linux-2.4.22/drivers/md/multipath.c 2003-06-13 16:51:34.000000000 +0200
2267+++ linux-2.4.22-evms/drivers/md/multipath.c 2003-09-15 17:09:36.000000000 +0200
2268@@ -139,15 +139,16 @@
2269 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2270 {
2271 multipath_conf_t *conf = mddev_to_conf(mddev);
2272- int i, disks = MD_SB_DISKS;
2273+ int i;
2274
2275 /*
2276 * Later we do read balancing on the read side
2277 * now we use the first available disk.
2278 */
2279
2280- for (i = 0; i < disks; i++) {
2281+ for (i = 0; i < conf->nr_disks; i++) {
2282 if (conf->multipaths[i].operational) {
2283+ /* first operational is winner! */
2284 *rdev = conf->multipaths[i].dev;
2285 return (0);
2286 }
2287@@ -191,6 +192,8 @@
2288 {
2289 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2290
2291+ atomic_dec(&mp_bh->multipath->nr_pending);
2292+
2293 /*
2294 * this branch is our 'one multipath IO has finished' event handler:
2295 */
2296@@ -223,19 +226,39 @@
2297 }
2298
2299 /*
2300- * This routine returns the disk from which the requested read should
2301- * be done.
2302+ * Multipath read balance ...
2303+ *
2304+ * Returns:
2305+ *
2306+ * If no active paths
2307+ *
2308+ * - Error ( -1 )
2309+ *
2310+ * If active paths == 1
2311+ *
2312+ * - 1st active path encountered
2313+ *
2314+ * If active paths > 1
2315+ *
2316+ * - 1st idle active path encountered
2317+ * - else ... the active path doing the least amount of work.
2318 */
2319-
2320 static int multipath_read_balance (multipath_conf_t *conf)
2321 {
2322- int disk;
2323-
2324- for (disk = 0; disk < conf->raid_disks; disk++)
2325- if (conf->multipaths[disk].operational)
2326- return disk;
2327- BUG();
2328- return 0;
2329+ int i, disk=-1, nr_pending, least_pending=0;
2330+
2331+ for (i=0; i<conf->nr_disks; i++) {
2332+ if (conf->multipaths[i].operational) {
2333+ nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2334+ if (nr_pending==0 || conf->working_disks==1)
2335+ return i;
2336+ if (least_pending==0 || nr_pending<least_pending) {
2337+ disk = i;
2338+ least_pending = nr_pending;
2339+ }
2340+ }
2341+ }
2342+ return disk;
2343 }
2344
2345 static int multipath_make_request (mddev_t *mddev, int rw,
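The selection policy of the rewritten multipath_read_balance() above — the first idle operational path wins, otherwise the operational path with the fewest pending requests is used — can be sketched in userspace as follows (the single-working-disk shortcut is omitted; names are illustrative, not from the patch):

/* Userspace sketch of the read-balance policy: first idle operational path,
 * else the operational path with the least pending work. */
#include <stdio.h>

struct path {
	int operational;
	int nr_pending;
};

static int pick_path(const struct path *p, int nr_paths)
{
	int i, best = -1, least = 0;

	for (i = 0; i < nr_paths; i++) {
		if (!p[i].operational)
			continue;
		if (p[i].nr_pending == 0)
			return i;		/* idle path: take it immediately */
		if (least == 0 || p[i].nr_pending < least) {
			best = i;
			least = p[i].nr_pending;
		}
	}
	return best;				/* -1 when no operational path is left */
}

int main(void)
{
	struct path paths[] = { {0, 0}, {1, 3}, {1, 1} };

	/* Path 0 has failed, paths 1 and 2 are busy; path 2 has the least work. */
	printf("%d\n", pick_path(paths, 3));
	return 0;
}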
2346@@ -245,6 +268,7 @@
2347 struct buffer_head *bh_req;
2348 struct multipath_bh * mp_bh;
2349 struct multipath_info *multipath;
2350+ int disk;
2351
2352 if (!buffer_locked(bh))
2353 BUG();
2354@@ -267,7 +291,16 @@
2355 /*
2356 * read balancing logic:
2357 */
2358- multipath = conf->multipaths + multipath_read_balance(conf);
2359+ disk = multipath_read_balance(conf);
2360+ if (disk==-1) {
2361+ printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2362+ buffer_IO_error(bh);
2363+ return 0;
2364+ }
2365+
2366+ multipath = conf->multipaths + disk;
2367+ mp_bh->multipath = multipath;
2368+ atomic_inc(&multipath->nr_pending);
2369
2370 bh_req = &mp_bh->bh_req;
2371 memcpy(bh_req, bh, sizeof(*bh));
2372@@ -331,13 +364,14 @@
2373 {
2374 multipath_conf_t *conf = mddev_to_conf(mddev);
2375 struct multipath_info * multipaths = conf->multipaths;
2376- int disks = MD_SB_DISKS;
2377 int other_paths = 1;
2378- int i;
2379+ int i, first = 1;
2380+ mdk_rdev_t *rdev;
2381+ struct md_list_head *tmp;
2382
2383 if (conf->working_disks == 1) {
2384 other_paths = 0;
2385- for (i = 0; i < disks; i++) {
2386+ for (i = 0; i < MD_SB_DISKS; i++) {
2387 if (multipaths[i].spare) {
2388 other_paths = 1;
2389 break;
2390@@ -351,16 +385,17 @@
2391 * first check if this is a queued request for a device
2392 * which has just failed.
2393 */
2394- for (i = 0; i < disks; i++) {
2395+ for (i = 0; i < MD_SB_DISKS; i++) {
2396 if (multipaths[i].dev==dev && !multipaths[i].operational)
2397 return 0;
2398 }
2399 printk (LAST_DISK);
2400 } else {
2401+ mdp_super_t *sb = mddev->sb;
2402 /*
2403 * Mark disk as unusable
2404 */
2405- for (i = 0; i < disks; i++) {
2406+ for (i = 0; i < MD_SB_DISKS; i++) {
2407 if (multipaths[i].dev==dev && multipaths[i].operational) {
2408 mark_disk_bad(mddev, i);
2409 break;
2410@@ -369,7 +404,6 @@
2411 if (!conf->working_disks) {
2412 int err = 1;
2413 mdp_disk_t *spare;
2414- mdp_super_t *sb = mddev->sb;
2415
2416 spare = get_spare(mddev);
2417 if (spare) {
2418@@ -384,6 +418,21 @@
2419 sb->spare_disks--;
2420 }
2421 }
2422+ /* prevent unnecessary work in md_do_recovery() */
2423+ if (conf->working_disks) {
2424+ conf->raid_disks = conf->working_disks
2425+ = sb->raid_disks = sb->active_disks;
2426+ }
2427+ /* update alias disk info to ensure we can do the sb commit. */
2428+ ITERATE_RDEV(mddev,rdev,tmp) {
2429+ if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2430+ rdev->alias_device = 0;
2431+ first = 0;
2432+ } else {
2433+ if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2434+ rdev->alias_device = 1;
2435+ }
2436+ }
2437 }
2438 return 0;
2439 }
2440@@ -677,9 +726,8 @@
2441 /*
2442 * This is a kernel thread which:
2443 *
2444- * 1. Retries failed read operations on working multipaths.
2445+ * 1. Retries failed operations on working multipaths.
 2446 * 2. Updates the raid superblock when problems are encountered.
2447- * 3. Performs writes following reads for array syncronising.
2448 */
2449
2450 static void multipathd (void *data)
2451@@ -833,6 +881,7 @@
2452 mdk_rdev_t *rdev, *def_rdev = NULL;
2453 struct md_list_head *tmp;
2454 int num_rdevs = 0;
2455+ int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2456
2457 MOD_INC_USE_COUNT;
2458
2459@@ -881,9 +930,7 @@
2460 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2461
2462 /*
2463- * Mark all disks as spare to start with, then pick our
2464- * active disk. If we have a disk that is marked active
2465- * in the sb, then use it, else use the first rdev.
2466+ * Mark all disks as spare to start with.
2467 */
2468 disk->number = desc->number;
2469 disk->raid_disk = desc->raid_disk;
2470@@ -894,20 +941,21 @@
2471 mark_disk_sync(desc);
2472
2473 if (disk_active(desc)) {
2474- if(!conf->working_disks) {
2475- printk(OPERATIONAL, partition_name(rdev->dev),
2476- desc->raid_disk);
2477- disk->operational = 1;
2478- disk->spare = 0;
2479- conf->working_disks++;
2480- def_rdev = rdev;
2481- } else {
2482- mark_disk_spare(desc);
2483- }
2484- } else
2485- mark_disk_spare(desc);
2486+ printk(OPERATIONAL, partition_name(rdev->dev),
2487+ desc->raid_disk);
2488+ disk->operational = 1;
2489+ disk->spare = 0;
2490+ conf->working_disks++;
2491+ def_rdev = rdev;
2492+ active_disks++;
2493+ } else if (disk_faulty(desc)) {
2494+ disk->spare = 0;
2495+ faulty_disks++;
2496+ } else {
2497+ spare_disks++;
2498+ }
2499
2500- if(!num_rdevs++) def_rdev = rdev;
2501+ num_rdevs++;
2502 }
2503 if(!conf->working_disks && num_rdevs) {
2504 desc = &sb->disks[def_rdev->desc_nr];
2505@@ -918,11 +966,12 @@
2506 disk->spare = 0;
2507 conf->working_disks++;
2508 mark_disk_active(desc);
2509+ active_disks++;
2510 }
2511 /*
2512- * Make sure our active path is in desc spot 0
2513+ * If there is only 1 active path ... make sure it is in desc spot 0
2514 */
2515- if(def_rdev->desc_nr != 0) {
2516+ if (active_disks == 1 && def_rdev->desc_nr != 0) {
2517 rdev = find_rdev_nr(mddev, 0);
2518 desc = &sb->disks[def_rdev->desc_nr];
2519 desc2 = sb->disks;
2520@@ -940,10 +989,10 @@
2521 def_rdev->desc_nr = 0;
2522 }
2523 }
2524- conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2525+ conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2526 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2527- sb->failed_disks = 0;
2528- sb->spare_disks = num_rdevs - 1;
2529+ sb->failed_disks = faulty_disks;
2530+ sb->spare_disks = spare_disks;
2531 mddev->sb_dirty = 1;
2532 conf->mddev = mddev;
2533 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2534diff -urN linux-2.4.22/include/linux/raid/multipath.h linux-2.4.22-evms/include/linux/raid/multipath.h
2535--- linux-2.4.22/include/linux/raid/multipath.h 2001-11-12 18:51:56.000000000 +0100
2536+++ linux-2.4.22-evms/include/linux/raid/multipath.h 2003-09-15 17:09:36.000000000 +0200
2537@@ -15,6 +15,7 @@
2538 int spare;
2539
2540 int used_slot;
2541+ atomic_t nr_pending; /* number of pending requests */
2542 };
2543
2544 struct multipath_private_data {
2545@@ -63,6 +64,7 @@
2546 struct buffer_head *master_bh;
2547 struct buffer_head bh_req;
2548 struct multipath_bh *next_mp; /* next for retry or in free list */
2549+ struct multipath_info *multipath; /* allows end_request to easily decrement the pending buffer count */
2550 };
2551 /* bits for multipath_bh.state */
2552 #define MPBH_Uptodate 1