linux-2.4.25-evms-2.2.1.patch
1diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
2--- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 15:09:18.503177509 +0100
3+++ linux-2.4.24/drivers/md/Config.in 2004-01-18 16:05:08.202479073 +0100
4@@ -12,6 +12,10 @@
5 dep_tristate ' RAID-1 (mirroring) mode' CONFIG_MD_RAID1 $CONFIG_BLK_DEV_MD
6 dep_tristate ' RAID-4/RAID-5 mode' CONFIG_MD_RAID5 $CONFIG_BLK_DEV_MD
7 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
8+if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
9+ dep_tristate ' Bad Block Relocation Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_BBR $CONFIG_BLK_DEV_DM
10+ dep_tristate ' Sparse Device Target (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_SPARSE $CONFIG_BLK_DEV_DM
11+fi
12
13 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
14 dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
15diff -urN linux-2.4.24.org/drivers/md/dm-bbr.c linux-2.4.24/drivers/md/dm-bbr.c
16--- linux-2.4.24.org/drivers/md/dm-bbr.c 1970-01-01 01:00:00.000000000 +0100
17+++ linux-2.4.24/drivers/md/dm-bbr.c 2004-01-18 16:03:13.099546349 +0100
18@@ -0,0 +1,1227 @@
19+/*
20+ * (C) Copyright IBM Corp. 2002, 2003
21+ *
22+ * This program is free software; you can redistribute it and/or modify
23+ * it under the terms of the GNU General Public License as published by
24+ * the Free Software Foundation; either version 2 of the License, or
25+ * (at your option) any later version.
26+ *
27+ * This program is distributed in the hope that it will be useful,
28+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
29+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
30+ * the GNU General Public License for more details.
31+ *
32+ * You should have received a copy of the GNU General Public License
33+ * along with this program; if not, write to the Free Software
34+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
35+ *
36+ * linux/drivers/md/dm-bbr.c
37+ *
38+ * Bad-block-relocation (BBR) target for device-mapper.
39+ *
40+ * The BBR target is designed to remap I/O write failures to another safe
41+ * location on disk. Note that most disk drives have BBR built into them,
42+ * this means that our software BBR will be only activated when all hardware
43+ * BBR replacement sectors have been used.
44+ */
45+
46+#include <linux/kernel.h>
47+#include <linux/module.h>
48+#include <linux/init.h>
49+#include <linux/blkdev.h>
50+#include <linux/spinlock.h>
51+#include <linux/smp_lock.h>
52+#include <linux/slab.h>
53+#include <linux/mempool.h>
54+#include "dm.h"
55+#include "dm-bbr.h"
56+#include "dm-daemon.h"
57+#include "dm-io.h"
58+
59+/* Number of active BBR devices. */
60+static int bbr_instances = 0;
61+static DECLARE_MUTEX(bbr_instances_lock);
62+
63+/* Data pertaining to the I/O thread. */
64+static struct dm_daemon * bbr_io_thread = NULL;
65+static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
66+static LIST_HEAD(bbr_io_list);
67+static void bbr_io_handler(void);
68+
69+/* Global pools for bbr_io_buf's and bbr_remap's. */
70+static kmem_cache_t * bbr_io_buf_cache;
71+static mempool_t * bbr_io_buf_pool;
72+static kmem_cache_t * bbr_remap_cache;
73+static mempool_t * bbr_remap_pool;
74+
75+static void bbr_free_remap(struct bbr_private * bbr_id);
76+
77+/**
78+ * destroy_pools
79+ *
80+ * Delete the pools for the remap list and I/O anchors.
81+ **/
82+static void destroy_pools(void)
83+{
84+ if (bbr_io_buf_pool) {
85+ mempool_destroy(bbr_io_buf_pool);
86+ bbr_io_buf_pool = NULL;
87+ }
88+ if (bbr_io_buf_cache) {
89+ kmem_cache_destroy(bbr_io_buf_cache);
90+ bbr_io_buf_cache = NULL;
91+ }
92+ if (bbr_remap_pool) {
93+ mempool_destroy(bbr_remap_pool);
94+ bbr_remap_pool = NULL;
95+ }
96+ if (bbr_remap_cache) {
97+ kmem_cache_destroy(bbr_remap_cache);
98+ bbr_remap_cache = NULL;
99+ }
100+}
101+
102+/**
103+ * create_pools
104+ *
105+ * Create mempools for the remap list and I/O anchors.
106+ **/
107+static int create_pools(void)
108+{
109+ if (!bbr_remap_cache) {
110+ bbr_remap_cache = kmem_cache_create("BBR_Remap_Cache",
111+ sizeof(struct bbr_runtime_remap),
112+ 0, SLAB_HWCACHE_ALIGN,
113+ NULL, NULL);
114+ if (!bbr_remap_cache) {
115+ DMERR("Unable to create BBR remap cache.");
116+ goto out;
117+ }
118+ }
119+ if (!bbr_remap_pool) {
120+ bbr_remap_pool = mempool_create(64, mempool_alloc_slab,
121+ mempool_free_slab,
122+ bbr_remap_cache);
123+ if (!bbr_remap_pool) {
124+ DMERR("Unable to create BBR remap mempool.");
125+ goto out;
126+ }
127+ }
128+
129+ if (!bbr_io_buf_cache) {
130+ bbr_io_buf_cache = kmem_cache_create("BBR_IO_Buf_Cache",
131+ sizeof(struct bbr_io_buffer),
132+ 0, SLAB_HWCACHE_ALIGN,
133+ NULL, NULL);
134+ if (!bbr_io_buf_cache) {
135+ DMERR("Unable to create BBR I/O buffer cache.");
136+ goto out;
137+ }
138+ }
139+ if (!bbr_io_buf_pool) {
140+ bbr_io_buf_pool = mempool_create(256, mempool_alloc_slab,
141+ mempool_free_slab,
142+ bbr_io_buf_cache);
143+ if (!bbr_io_buf_pool) {
144+ DMERR("Unable to create BBR I/O buffer mempool.");
145+ goto out;
146+ }
147+ }
148+
149+out:
150+ if (!bbr_remap_cache || !bbr_remap_pool ||
151+ !bbr_io_buf_cache || !bbr_io_buf_pool ) {
152+ destroy_pools();
153+ return -ENOMEM;
154+ }
155+
156+ return 0;
157+}
158+
159+/**
160+ * stop_io_thread
161+ *
162+ * Use the dm-daemon services to stop the BBR I/O thread.
163+ **/
164+static void stop_io_thread(void)
165+{
166+ if (bbr_io_thread) {
167+ dm_daemon_stop(bbr_io_thread);
168+ kfree(bbr_io_thread);
169+ bbr_io_thread = NULL;
170+ }
171+}
172+
173+/**
174+ * start_io_thread
175+ *
176+ * Use the dm-daemon services to start the BBR I/O thread.
177+ **/
178+static int start_io_thread(void)
179+{
180+ int rc;
181+
182+ if (!bbr_io_thread) {
183+ bbr_io_thread = kmalloc(sizeof(*bbr_io_thread), GFP_KERNEL);
184+ if (!bbr_io_thread) {
185+ return -ENOMEM;
186+ }
187+
188+ rc = dm_daemon_start(bbr_io_thread, "bbr_io", bbr_io_handler);
189+ if (rc) {
190+ kfree(bbr_io_thread);
191+ return rc;
192+ }
193+ }
194+
195+ return 0;
196+}
197+
198+/**
199+ * bbr_global_init
200+ *
201+ * Set up the mempools, I/O thread, and sync-I/O service. This should
202+ * be called only when the first bbr device is created.
203+ **/
204+static int bbr_global_init(void)
205+{
206+ int rc;
207+
208+ rc = create_pools();
209+ if (rc) {
210+ goto out;
211+ }
212+
213+ rc = start_io_thread();
214+ if (rc) {
215+ destroy_pools();
216+ goto out;
217+ }
218+
219+ rc = dm_io_get(1);
220+ if (rc) {
221+ destroy_pools();
222+ stop_io_thread();
223+ goto out;
224+ }
225+
226+out:
227+ return rc;
228+}
229+
230+/**
231+ * bbr_global_cleanup
232+ *
233+ * Clean up the mempools, I/O thread and sync-I/O service. This should
234+ * be called only when the last bbr device is removed.
235+ **/
236+static void bbr_global_cleanup(void)
237+{
238+ destroy_pools();
239+ stop_io_thread();
240+ dm_io_put(1);
241+}
242+
243+static struct bbr_private * bbr_alloc_private(void)
244+{
245+ struct bbr_private *bbr_id;
246+
247+ bbr_id = kmalloc(sizeof(*bbr_id), GFP_KERNEL);
248+ if (bbr_id) {
249+ memset(bbr_id, 0, sizeof(*bbr_id));
250+ bbr_id->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
251+ bbr_id->bbr_id_lock = SPIN_LOCK_UNLOCKED;
252+ }
253+
254+ return bbr_id;
255+}
256+
257+static void bbr_free_private(struct bbr_private *bbr_id)
258+{
259+ if (bbr_id->bbr_table) {
260+ kfree(bbr_id->bbr_table);
261+ }
262+ bbr_free_remap(bbr_id);
263+ kfree(bbr_id);
264+}
265+
266+static u32 crc_table[256];
267+static u32 crc_table_built = 0;
268+
269+static void build_crc_table(void)
270+{
271+ u32 i, j, crc;
272+
273+ for (i = 0; i <= 255; i++) {
274+ crc = i;
275+ for (j = 8; j > 0; j--) {
276+ if (crc & 1)
277+ crc = (crc >> 1) ^ CRC_POLYNOMIAL;
278+ else
279+ crc >>= 1;
280+ }
281+ crc_table[i] = crc;
282+ }
283+ crc_table_built = 1;
284+}
285+
286+static u32 calculate_crc(u32 crc, void *buffer, u32 buffersize)
287+{
288+ unsigned char *current_byte;
289+ u32 temp1, temp2, i;
290+
291+ current_byte = (unsigned char *) buffer;
292+ /* Make sure the crc table is available */
293+ if (!crc_table_built)
294+ build_crc_table();
295+ /* Process each byte in the buffer. */
296+ for (i = 0; i < buffersize; i++) {
297+ temp1 = (crc >> 8) & 0x00FFFFFF;
298+ temp2 = crc_table[(crc ^ (u32) * current_byte) &
299+ (u32) 0xff];
300+ current_byte++;
301+ crc = temp1 ^ temp2;
302+ }
303+ return crc;
304+}
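
The checksum above is the standard reflected CRC-32 (polynomial 0xEDB88320, initial value 0xFFFFFFFF, no final inversion), computed over a table sector with its crc field zeroed. A minimal user-space sketch of the same computation follows; it assumes nothing about the kernel environment, and the helper names and the zero-filled buffer are invented purely for the demo.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc_tab[256];

static void build_tab(void)
{
	uint32_t i, j, crc;

	for (i = 0; i < 256; i++) {
		crc = i;
		for (j = 0; j < 8; j++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320u : crc >> 1;
		crc_tab[i] = crc;
	}
}

static uint32_t crc32_update(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--)
		crc = (crc >> 8) ^ crc_tab[(crc ^ *p++) & 0xff];
	return crc;
}

int main(void)
{
	unsigned char sector[512];

	memset(sector, 0, sizeof(sector));	/* stand-in for a table sector with crc zeroed */
	build_tab();
	printf("crc = 0x%08x\n",
	       (unsigned)crc32_update(0xFFFFFFFFu, sector, sizeof(sector)));
	return 0;
}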
305+
306+/**
307+ * le_bbr_table_sector_to_cpu
308+ *
309+ * Convert bbr meta data from on-disk (LE) format
310+ * to the native cpu endian format.
311+ **/
312+static void le_bbr_table_sector_to_cpu(struct bbr_table *p)
313+{
314+ int i;
315+ p->signature = le32_to_cpup(&p->signature);
316+ p->crc = le32_to_cpup(&p->crc);
317+ p->sequence_number = le32_to_cpup(&p->sequence_number);
318+ p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
319+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
320+ p->entries[i].bad_sect =
321+ le64_to_cpup(&p->entries[i].bad_sect);
322+ p->entries[i].replacement_sect =
323+ le64_to_cpup(&p->entries[i].replacement_sect);
324+ }
325+}
326+
327+/**
328+ * cpu_bbr_table_sector_to_le
329+ *
330+ * Convert bbr meta data from cpu endian format to on-disk (LE) format
331+ **/
332+static void cpu_bbr_table_sector_to_le(struct bbr_table * p,
333+ struct bbr_table * le)
334+{
335+ int i;
336+ le->signature = cpu_to_le32p(&p->signature);
337+ le->crc = cpu_to_le32p(&p->crc);
338+ le->sequence_number = cpu_to_le32p(&p->sequence_number);
339+ le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
340+ for (i = 0; i < BBR_ENTRIES_PER_SECT; i++) {
341+ le->entries[i].bad_sect =
342+ cpu_to_le64p(&p->entries[i].bad_sect);
343+ le->entries[i].replacement_sect =
344+ cpu_to_le64p(&p->entries[i].replacement_sect);
345+ }
346+}
347+
348+/**
349+ * validate_bbr_table_sector
350+ *
351+ * Check the specified BBR table sector for a valid signature and CRC. If it's
352+ * valid, endian-convert the table sector.
353+ **/
354+static int validate_bbr_table_sector(struct bbr_table * p)
355+{
356+ int rc = 0;
357+ int org_crc, final_crc;
358+
359+ if (le32_to_cpup(&p->signature) != BBR_TABLE_SIGNATURE) {
360+ DMERR("BBR table signature doesn't match!");
361+ DMERR("Found 0x%x. Expecting 0x%x",
362+ le32_to_cpup(&p->signature), BBR_TABLE_SIGNATURE);
363+ rc = -EINVAL;
364+ goto out;
365+ }
366+
367+ if (!p->crc) {
368+ DMERR("BBR table sector has no CRC!");
369+ rc = -EINVAL;
370+ goto out;
371+ }
372+
373+ org_crc = le32_to_cpup(&p->crc);
374+ p->crc = 0;
375+ final_crc = calculate_crc(INITIAL_CRC, (void *)p, sizeof(*p));
376+ if (final_crc != org_crc) {
377+ DMERR("CRC failed!");
378+ DMERR("Found 0x%x. Expecting 0x%x",
379+ org_crc, final_crc);
380+ rc = -EINVAL;
381+ goto out;
382+ }
383+
384+ p->crc = cpu_to_le32p(&org_crc);
385+ le_bbr_table_sector_to_cpu(p);
386+
387+out:
388+ return rc;
389+}
390+
391+/**
392+ * bbr_binary_tree_insert
393+ *
394+ * Insert a node into the binary tree.
395+ **/
396+static void bbr_binary_tree_insert(struct bbr_runtime_remap **root,
397+ struct bbr_runtime_remap *newnode)
398+{
399+ struct bbr_runtime_remap **node = root;
400+ while (node && *node) {
401+ if (newnode->remap.bad_sect > (*node)->remap.bad_sect) {
402+ node = &((*node)->right);
403+ } else {
404+ node = &((*node)->left);
405+ }
406+ }
407+
408+ newnode->left = newnode->right = NULL;
409+ *node = newnode;
410+}
411+
412+/**
413+ * bbr_binary_search
414+ *
415+ * Search for a node that contains bad_sect == lsn.
416+ **/
417+static struct bbr_runtime_remap * bbr_binary_search(
418+ struct bbr_runtime_remap *root,
419+ u64 lsn)
420+{
421+ struct bbr_runtime_remap *node = root;
422+ while (node) {
423+ if (node->remap.bad_sect == lsn) {
424+ break;
425+ }
426+ if (lsn > node->remap.bad_sect) {
427+ node = node->right;
428+ } else {
429+ node = node->left;
430+ }
431+ }
432+ return node;
433+}
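
The remap lookup structure used above is a plain, unbalanced binary search tree keyed on the bad-sector LBA. A small user-space sketch of the same insert/search shape; the node type and the sector values are invented for the example, and the driver's spinlock is omitted.

#include <stdint.h>
#include <stdio.h>

struct node {
	uint64_t bad, replacement;
	struct node *left, *right;
};

static void tree_insert(struct node **root, struct node *n)
{
	struct node **p = root;

	/* walk to the leaf slot where this bad sector belongs */
	while (*p)
		p = (n->bad > (*p)->bad) ? &(*p)->right : &(*p)->left;
	n->left = n->right = NULL;
	*p = n;
}

static struct node *tree_search(struct node *root, uint64_t bad)
{
	while (root && root->bad != bad)
		root = (bad > root->bad) ? root->right : root->left;
	return root;
}

int main(void)
{
	struct node a = { 100, 5000 }, b = { 42, 5001 }, *root = NULL, *hit;

	tree_insert(&root, &a);
	tree_insert(&root, &b);
	hit = tree_search(root, 42);
	if (hit)
		printf("sector 42 remapped to %llu\n",
		       (unsigned long long)hit->replacement);
	return 0;
}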
434+
435+/**
436+ * bbr_binary_tree_destroy
437+ *
438+ * Destroy the binary tree.
439+ **/
440+static void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
441+ struct bbr_private * bbr_id)
442+{
443+ struct bbr_runtime_remap **link = NULL;
444+ struct bbr_runtime_remap *node = root;
445+
446+ while (node) {
447+ if (node->left) {
448+ link = &(node->left);
449+ node = node->left;
450+ continue;
451+ }
452+ if (node->right) {
453+ link = &(node->right);
454+ node = node->right;
455+ continue;
456+ }
457+
458+ mempool_free(node, bbr_remap_pool);
459+ if (node == root) {
460+ /* If root is deleted, we're done. */
461+ break;
462+ }
463+
464+ /* Back to root. */
465+ node = root;
466+ *link = NULL;
467+ }
468+}
469+
470+static void bbr_free_remap(struct bbr_private * bbr_id)
471+{
472+ spin_lock_irq(&bbr_id->bbr_id_lock);
473+ bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
474+ bbr_id->remap_root = NULL;
475+ spin_unlock_irq(&bbr_id->bbr_id_lock);
476+}
477+
478+/**
479+ * bbr_insert_remap_entry
480+ *
481+ * Create a new remap entry and add it to the binary tree for this node.
482+ **/
483+static int bbr_insert_remap_entry(struct bbr_private *bbr_id,
484+ struct bbr_table_entry *new_bbr_entry)
485+{
486+ struct bbr_runtime_remap *newnode;
487+
488+ newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
489+ if (!newnode) {
490+ DMERR("Could not allocate from remap mempool!");
491+ return -ENOMEM;
492+ }
493+ newnode->remap.bad_sect = new_bbr_entry->bad_sect;
494+ newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
495+ spin_lock_irq(&bbr_id->bbr_id_lock);
496+ bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
497+ spin_unlock_irq(&bbr_id->bbr_id_lock);
498+ return 0;
499+}
500+
501+/**
502+ * bbr_table_to_remap_list
503+ *
504+ * The on-disk bbr table is sorted by the replacement sector LBA. In order to
505+ * improve run-time performance, the in-memory remap list must be sorted by
506+ * the bad sector LBA. This function is called at discovery time to initialize
507+ * the remap list. This function assumes that at least one copy of meta data
508+ * is valid.
509+ **/
510+static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
511+{
512+ u32 in_use_blks = 0;
513+ int i, j;
514+ struct bbr_table *p;
515+
516+ for (i = 0, p = bbr_id->bbr_table;
517+ i < bbr_id->nr_sects_bbr_table;
518+ i++, p++) {
519+ if (!p->in_use_cnt) {
520+ break;
521+ }
522+ in_use_blks += p->in_use_cnt;
523+ for (j = 0; j < p->in_use_cnt; j++) {
524+ bbr_insert_remap_entry(bbr_id, &p->entries[j]);
525+ }
526+ }
527+ if (in_use_blks) {
528+ DMWARN("There are %u BBR entries for device %s",
529+ in_use_blks, dm_kdevname(bbr_id->dev->dev));
530+ }
531+
532+ return in_use_blks;
533+}
534+
535+/**
536+ * bbr_search_remap_entry
537+ *
538+ * Search remap entry for the specified sector. If found, return a pointer to
539+ * the table entry. Otherwise, return NULL.
540+ **/
541+static struct bbr_table_entry * bbr_search_remap_entry(
542+ struct bbr_private *bbr_id,
543+ u64 lsn)
544+{
545+ struct bbr_runtime_remap *p;
546+
547+ spin_lock_irq(&bbr_id->bbr_id_lock);
548+ p = bbr_binary_search(bbr_id->remap_root, lsn);
549+ spin_unlock_irq(&bbr_id->bbr_id_lock);
550+ if (p) {
551+ return (&p->remap);
552+ } else {
553+ return NULL;
554+ }
555+}
556+
557+/**
558+ * bbr_remap
559+ *
560+ * If *lsn is in the remap table, return TRUE and modify *lsn,
561+ * else, return FALSE.
562+ **/
563+static inline int bbr_remap(struct bbr_private *bbr_id,
564+ u64 *lsn)
565+{
566+ struct bbr_table_entry *e;
567+
568+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
569+ e = bbr_search_remap_entry(bbr_id, *lsn);
570+ if (e) {
571+ *lsn = e->replacement_sect;
572+ return 1;
573+ }
574+ }
575+ return 0;
576+}
577+
578+/**
579+ * bbr_remap_probe
580+ *
581+ * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
582+ * table, return TRUE. Otherwise, return FALSE.
583+ **/
584+static inline int bbr_remap_probe(struct bbr_private * bbr_id,
585+ u64 lsn, u64 nr_sects)
586+{
587+ u64 tmp, cnt;
588+
589+ if (atomic_read(&bbr_id->in_use_replacement_blks)) {
590+ for (cnt = 0, tmp = lsn;
591+ cnt < nr_sects;
592+ cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
593+ if (bbr_remap(bbr_id,&tmp)) {
594+ return 1;
595+ }
596+ }
597+ }
598+ return 0;
599+}
600+
601+/**
602+ * bbr_setup
603+ *
604+ * Read the remap tables from disk and set up the initial remap tree.
605+ **/
606+static int bbr_setup(struct bbr_private *bbr_id)
607+{
608+ struct bbr_table *table = bbr_id->bbr_table;
609+ struct page *page;
610+ struct io_region job;
611+ unsigned int error, offset;
612+ int i, rc = 0;
613+
614+ job.dev = bbr_id->dev->dev;
615+ job.count = 1;
616+
617+ /* Read and verify each BBR table sector individually. */
618+ for (i = 0; i < bbr_id->nr_sects_bbr_table; i++, table++) {
619+ job.sector = bbr_id->lba_table1 + i;
620+ page = virt_to_page(table);
621+ offset = (unsigned long)table & ~PAGE_MASK;
622+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
623+ if (rc && bbr_id->lba_table2) {
624+ job.sector = bbr_id->lba_table2 + i;
625+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
626+ }
627+ if (rc) {
628+ goto out;
629+ }
630+
631+ rc = validate_bbr_table_sector(table);
632+ if (rc) {
633+ goto out;
634+ }
635+ }
636+ atomic_set(&bbr_id->in_use_replacement_blks,
637+ bbr_table_to_remap_list(bbr_id));
638+
639+out:
640+ if (rc) {
641+ DMERR("dm-bbr: error during device setup: %d", rc);
642+ }
643+ return rc;
644+}
645+
646+static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
647+ struct buffer_head * bh,
648+ int rw)
649+{
650+ struct bbr_io_buffer * bbr_io_buf;
651+
652+ bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
653+ if (bbr_io_buf) {
654+ memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
655+ INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
656+ bbr_io_buf->bbr_id = bbr_id;
657+ bbr_io_buf->sector = bh->b_rsector;
658+ bbr_io_buf->bh = bh;
659+ bbr_io_buf->rw = rw;
660+ } else {
661+ DMWARN("Could not allocate from BBR I/O buffer pool!");
662+ }
663+ return bbr_io_buf;
664+}
665+
666+static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
667+{
668+ mempool_free(bbr_io_buf, bbr_io_buf_pool);
669+}
670+
671+/**
672+ * bbr_io_remap_error
673+ * @bbr_id: Private data for the BBR node.
674+ * @rw: READ or WRITE.
675+ * @starting_lsn: Starting sector of request to remap.
676+ * @count: Number of sectors in the request.
677+ * @buffer: Data buffer for the request.
678+ *
679+ * For the requested range, try to write each sector individually. For each
680+ * sector that fails, find the next available remap location and write the
681+ * data to that new location. Then update the table and write both copies
682+ * of the table to disk. Finally, update the in-memory mapping and do any
683+ * other necessary bookkeeping.
684+ **/
685+static int bbr_io_remap_error(struct bbr_private *bbr_id,
686+ int rw,
687+ u64 starting_lsn,
688+ u64 count,
689+ char *buffer)
690+{
691+ struct bbr_table *bbr_table;
692+ struct io_region job;
693+ struct page *page;
694+ unsigned long table_sector_index;
695+ unsigned long table_sector_offset;
696+ unsigned long index;
697+ unsigned int offset_in_page, error;
698+ u64 lsn, new_lsn;
699+ int rc;
700+
701+ if (rw == READ) {
702+ /* Nothing can be done about read errors. */
703+ return -EIO;
704+ }
705+
706+ job.dev = bbr_id->dev->dev;
707+ job.count = 1;
708+
709+ /* For each sector in the request. */
710+ for (lsn = 0; lsn < count; lsn++, buffer += SECTOR_SIZE) {
711+ job.sector = starting_lsn + lsn;
712+ page = virt_to_page(buffer);
713+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
714+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
715+ while (rc) {
716+ /* Find the next available relocation sector. */
717+ new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
718+ if (new_lsn >= bbr_id->nr_replacement_blks) {
719+ /* No more replacement sectors available. */
720+ return -EIO;
721+ }
722+ new_lsn += bbr_id->start_replacement_sect;
723+
724+ /* Write the data to its new location. */
725+ DMWARN("dm-bbr: device %s: Trying to remap bad sector "PFU64" to sector "PFU64,
726+ dm_kdevname(bbr_id->dev->dev),
727+ starting_lsn + lsn, new_lsn);
728+ job.sector = new_lsn;
729+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
730+ if (rc) {
731+ /* This replacement sector is bad.
732+ * Try the next one.
733+ */
734+ DMERR("dm-bbr: device %s: replacement sector "PFU64" is bad. Skipping.",
735+ dm_kdevname(bbr_id->dev->dev), new_lsn);
736+ atomic_inc(&bbr_id->in_use_replacement_blks);
737+ continue;
738+ }
739+
740+ /* Add this new entry to the on-disk table. */
741+ table_sector_index = new_lsn -
742+ bbr_id->start_replacement_sect;
743+ table_sector_offset = table_sector_index /
744+ BBR_ENTRIES_PER_SECT;
745+ index = table_sector_index % BBR_ENTRIES_PER_SECT;
746+
747+ bbr_table = &bbr_id->bbr_table[table_sector_offset];
748+ bbr_table->entries[index].bad_sect = starting_lsn + lsn;
749+ bbr_table->entries[index].replacement_sect = new_lsn;
750+ bbr_table->in_use_cnt++;
751+ bbr_table->sequence_number++;
752+ bbr_table->crc = 0;
753+ bbr_table->crc = calculate_crc(INITIAL_CRC,
754+ bbr_table,
755+ sizeof(struct bbr_table));
756+
757+ /* Write the table to disk. */
758+ cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
759+ page = virt_to_page(bbr_table);
760+ offset_in_page = (unsigned long)bbr_table & ~PAGE_MASK;
761+ if (bbr_id->lba_table1) {
762+ job.sector = bbr_id->lba_table1 + table_sector_offset;
763+ rc = dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
764+ }
765+ if (bbr_id->lba_table2) {
766+ job.sector = bbr_id->lba_table2 + table_sector_offset;
767+ rc |= dm_io_sync(1, &job, WRITE, page, offset_in_page, &error);
768+ }
769+ le_bbr_table_sector_to_cpu(bbr_table);
770+
771+ if (rc) {
772+ /* Error writing one of the tables to disk. */
773+ DMERR("dm-bbr: device %s: error updating BBR tables on disk.",
774+ dm_kdevname(bbr_id->dev->dev));
775+ return rc;
776+ }
777+
778+ /* Insert a new entry in the remapping binary-tree. */
779+ rc = bbr_insert_remap_entry(bbr_id,
780+ &bbr_table->entries[index]);
781+ if (rc) {
782+ DMERR("dm-bbr: device %s: error adding new entry to remap tree.",
783+ dm_kdevname(bbr_id->dev->dev));
784+ return rc;
785+ }
786+
787+ atomic_inc(&bbr_id->in_use_replacement_blks);
788+ }
789+ }
790+
791+ return 0;
792+}
793+
794+/**
795+ * bbr_io_process_request
796+ *
797+ * For each sector in this request, check if the sector has already
798+ * been remapped. If so, process all previous sectors in the request,
799+ * followed by the remapped sector. Then reset the starting lsn and
800+ * count, and keep going with the rest of the request as if it were
801+ * a whole new request. If any of the sync_io's return an error,
802+ * call the remapper to relocate the bad sector(s).
803+ **/
804+static int bbr_io_process_request(struct bbr_io_buffer *bbr_io_buf)
805+{
806+ struct bbr_private *bbr_id = bbr_io_buf->bbr_id;
807+ struct io_region job;
808+ u64 starting_lsn = bbr_io_buf->sector;
809+ u64 count = bbr_io_buf->bh->b_size >> SECTOR_SHIFT;
810+ u64 lsn, remapped_lsn;
811+ char *buffer = bbr_io_buf->bh->b_data;
812+ struct page *page = virt_to_page(buffer);
813+ unsigned int offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
814+ unsigned int error;
815+ int rw = bbr_io_buf->rw;
816+ int rc = 0;
817+
818+ job.dev = bbr_id->dev->dev;
819+
820+ /* For each sector in this request, check if this sector has
821+ * already been remapped. If so, process all previous sectors
822+ * in this request, followed by the remapped sector. Then reset
823+ * the starting lsn and count and keep going with the rest of
824+ * the request as if it were a whole new request.
825+ */
826+ for (lsn = 0; lsn < count; lsn++) {
827+ remapped_lsn = starting_lsn + lsn;
828+ rc = bbr_remap(bbr_id, &remapped_lsn);
829+ if (!rc) {
830+ /* This sector is fine. */
831+ continue;
832+ }
833+
834+ /* Process all sectors in the request up to this one. */
835+ if (lsn > 0) {
836+ job.sector = starting_lsn;
837+ job.count = lsn;
838+ rc = dm_io_sync(1, &job, rw, page,
839+ offset_in_page, &error);
840+ if (rc) {
841+ /* If this I/O failed, then one of the
842+ * sectors in this request needs to be
843+ * relocated.
844+ */
845+ rc = bbr_io_remap_error(bbr_id, rw,
846+ starting_lsn,
847+ lsn, buffer);
848+ if (rc) {
849+ return rc;
850+ }
851+ }
852+ buffer += (lsn << SECTOR_SHIFT);
853+ page = virt_to_page(buffer);
854+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
855+ }
856+
857+ /* Process the remapped sector. */
858+ job.sector = remapped_lsn;
859+ job.count = 1;
860+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
861+ if (rc) {
862+ /* BUGBUG - Need more processing if this caused
863+ * an error. If this I/O failed, then the
864+ * existing remap is now bad, and we need to
865+ * find a new remap. Can't use
866+ * bbr_io_remap_error(), because the existing
867+ * map entry needs to be changed, not added
868+ * again, and the original table entry also
869+ * needs to be changed.
870+ */
871+ return rc;
872+ }
873+
874+ buffer += SECTOR_SIZE;
875+ starting_lsn += (lsn + 1);
876+ count -= (lsn + 1);
877+ lsn = -1;
878+ page = virt_to_page(buffer);
879+ offset_in_page = (unsigned long)buffer & ~PAGE_MASK;
880+ }
881+
882+ /* Check for any remaining sectors after the last split. This
883+ * could potentially be the whole request, but that should be a
884+ * rare case because requests should only be processed by the
885+ * thread if we know an error occurred or they contained one or
886+ * more remapped sectors.
887+ */
888+ if (count) {
889+ job.sector = starting_lsn;
890+ job.count = count;
891+ rc = dm_io_sync(1, &job, rw, page, offset_in_page, &error);
892+ if (rc) {
893+ /* If this I/O failed, then one of the sectors
894+ * in this request needs to be relocated.
895+ */
896+ rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
897+ count, buffer);
898+ if (rc) {
899+ return rc;
900+ }
901+ }
902+ }
903+
904+ return 0;
905+}
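
To make the splitting logic above concrete, here is a user-space sketch that models only the loop's arithmetic, not the dm_io_sync() calls or the error-relocation path. The remap() stub and all sector numbers are invented: with an 8-sector request starting at LSN 100 and sector 103 remapped to 5000, it prints the three pieces the request is broken into.

#include <stdint.h>
#include <stdio.h>

/* Toy remap: only sector 103 is relocated, to 5000 (values invented). */
static int remap(uint64_t *lsn)
{
	if (*lsn == 103) {
		*lsn = 5000;
		return 1;
	}
	return 0;
}

/* Mirror the splitting arithmetic of bbr_io_process_request(). */
static void split(uint64_t start, uint64_t count)
{
	uint64_t lsn, remapped;

	for (lsn = 0; lsn < count; lsn++) {
		remapped = start + lsn;
		if (!remap(&remapped))
			continue;
		if (lsn > 0)
			printf("issue %llu..%llu\n",
			       (unsigned long long)start,
			       (unsigned long long)(start + lsn - 1));
		printf("issue remapped sector %llu\n",
		       (unsigned long long)remapped);
		start += lsn + 1;
		count -= lsn + 1;
		lsn = (uint64_t)-1;	/* restart the scan at the new start */
	}
	if (count)
		printf("issue %llu..%llu\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + count - 1));
}

int main(void)
{
	split(100, 8);	/* sectors 100..107; 103 is remapped */
	return 0;
}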
906+
907+/**
908+ * bbr_io_handler
909+ *
910+ * This is the handler for the bbr_io_thread. It continuously loops,
911+ * taking I/O requests off its list and processing them. If nothing
912+ * is on the list, the thread goes back to sleep until specifically
913+ * woken up.
914+ *
915+ * I/O requests should only be sent to this thread if we know that:
916+ * a) the request contains at least one remapped sector.
917+ * or
918+ * b) the request caused an error on the normal I/O path.
919+ * This function uses synchronous I/O, so sending a request to this
920+ * thread that doesn't need special processing will cause severe
921+ * performance degradation.
922+ **/
923+static void bbr_io_handler(void)
924+{
925+ struct bbr_io_buffer *bbr_io_buf;
926+ struct buffer_head *bh;
927+ unsigned long flags;
928+ int rc;
929+
930+ while (1) {
931+ /* Process bbr_io_list, one entry at a time. */
932+ spin_lock_irqsave(&bbr_io_list_lock, flags);
933+ if (list_empty(&bbr_io_list)) {
934+ /* No more items on the list. */
935+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
936+ break;
937+ }
938+ bbr_io_buf = list_entry(bbr_io_list.next,
939+ struct bbr_io_buffer, bbr_io_list);
940+ list_del_init(&bbr_io_buf->bbr_io_list);
941+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
942+
943+ rc = bbr_io_process_request(bbr_io_buf);
944+
945+ /* Clean up and complete the original I/O. */
946+ bbr_io_buf->flags |= BBR_IO_HANDLED;
947+ bh = bbr_io_buf->bh;
948+ if (bh->b_end_io) {
949+ /* If this was the bbr_io_buf for an error on the
950+ * normal WRITE, don't free it here. It will be
951+ * freed later in bbr_callback()
952+ */
953+ if (!(bbr_io_buf->flags & BBR_IO_RELOCATE))
954+ free_bbr_io_buf(bbr_io_buf);
955+ bh->b_end_io(bh, rc ? 0 : 1);
956+ }
957+ }
958+}
959+
960+/**
961+ * bbr_schedule_io
962+ *
963+ * Place the specified bbr_io_buf on the thread's processing list.
964+ **/
965+static void bbr_schedule_io(struct bbr_io_buffer *bbr_io_buf)
966+{
967+ unsigned long flags;
968+ spin_lock_irqsave(&bbr_io_list_lock, flags);
969+ list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
970+ spin_unlock_irqrestore(&bbr_io_list_lock, flags);
971+ dm_daemon_wake(bbr_io_thread);
972+}
973+
974+/**
975+ * bbr_read
976+ *
977+ * If there are any remapped sectors on this object, send this request over
978+ * to the thread for processing. Otherwise send it down the stack normally.
979+ **/
980+static int bbr_read(struct bbr_private *bbr_id,
981+ struct buffer_head *bh)
982+{
983+ struct bbr_io_buffer *bbr_io_buf;
984+
985+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
986+ !bbr_remap_probe(bbr_id, bh->b_rsector,
987+ bh->b_size >> SECTOR_SHIFT)) {
988+ /* No existing remaps or this request doesn't
989+ * contain any remapped sectors.
990+ */
991+ bh->b_rdev = bbr_id->dev->dev;
992+ return 1;
993+ }
994+
995+ /* This request has at least one remapped sector. */
996+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
997+ if (!bbr_io_buf) {
998+ /* Can't get memory to track the I/O. */
999+ return -ENOMEM;
1000+ }
1001+
1002+ bbr_schedule_io(bbr_io_buf);
1003+ return 0;
1004+}
1005+
1006+/**
1007+ * bbr_callback
1008+ *
1009+ * This is the callback for normal write requests. Check for an error
1010+ * during the I/O, and send to the thread for processing if necessary.
1011+ **/
1012+static int bbr_callback(struct dm_target *ti, struct buffer_head *bh, int rw,
1013+ int error, union map_info *map_context)
1014+{
1015+ struct bbr_io_buffer *bbr_io_buf = map_context->ptr;
1016+
1017+ if (!bbr_io_buf)
1018+ return error;
1019+
1020+ /* Will try to relocate the WRITE if:
1021+ * - It is an error, and
1022+ * - It is not an error of BBR relocation.
1023+ */
1024+ if (error && !(bbr_io_buf->flags & BBR_IO_HANDLED)) {
1025+ DMERR("dm-bbr: device %s: Write failure on sector %lu. Scheduling for retry.",
1026+ dm_kdevname(bh->b_rdev),
1027+ (unsigned long)bbr_io_buf->sector);
1028+ /* Indicate this bbr_io_buf is for an error on normal WRITE */
1029+ bbr_io_buf->flags |= BBR_IO_RELOCATE;
1030+ bbr_schedule_io(bbr_io_buf);
1031+ /* Returns >0 so that DM will let us retry the I/O */
1032+ return 1;
1033+ }
1034+
1035+ free_bbr_io_buf(bbr_io_buf);
1036+ return error;
1037+}
1038+
1039+/**
1040+ * bbr_write
1041+ *
1042+ * If there are any remapped sectors on this object, send the request over
1043+ * to the thread for processing. Otherwise, register for callback
1044+ * notification, and send the request down normally.
1045+ **/
1046+static int bbr_write(struct bbr_private *bbr_id,
1047+ struct buffer_head *bh,
1048+ union map_info *map_context)
1049+{
1050+ struct bbr_io_buffer *bbr_io_buf;
1051+ int rc = 1;
1052+
1053+ bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
1054+ if (!bbr_io_buf) {
1055+ /* Can't get memory to track the I/O. */
1056+ return -ENOMEM;
1057+ }
1058+
1059+ if (atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
1060+ !bbr_remap_probe(bbr_id, bh->b_rsector,
1061+ bh->b_size >> SECTOR_SHIFT)) {
1062+ /* No existing remaps or this request
1063+ * contains no remapped sectors.
1064+ */
1065+ bh->b_rdev = bbr_id->dev->dev;
1066+ map_context->ptr = bbr_io_buf;
1067+ } else {
1068+ /* This request contains at least one remapped sector. */
1069+ bbr_schedule_io(bbr_io_buf);
1070+ rc = 0;
1071+ }
1072+
1073+ return rc;
1074+}
1075+
1076+/**
1077+ * Construct a bbr mapping
1078+ **/
1079+static int bbr_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1080+{
1081+ struct bbr_private *bbr_id;
1082+ unsigned long block_size;
1083+ char *end;
1084+ int rc = -EINVAL;
1085+
1086+ if (argc != 8) {
1087+ ti->error = "dm-bbr requires exactly 8 arguments: "
1088+ "device offset table1_lsn table2_lsn table_size start_replacement nr_replacement_blks block_size";
1089+ goto out1;
1090+ }
1091+
1092+ bbr_id = bbr_alloc_private();
1093+ if (!bbr_id) {
1094+ ti->error = "dm-bbr: Error allocating bbr private data.";
1095+ goto out1;
1096+ }
1097+
1098+ bbr_id->offset = simple_strtoull(argv[1], &end, 10);
1099+ bbr_id->lba_table1 = simple_strtoull(argv[2], &end, 10);
1100+ bbr_id->lba_table2 = simple_strtoull(argv[3], &end, 10);
1101+ bbr_id->nr_sects_bbr_table = simple_strtoull(argv[4], &end, 10);
1102+ bbr_id->start_replacement_sect = simple_strtoull(argv[5], &end, 10);
1103+ bbr_id->nr_replacement_blks = simple_strtoull(argv[6], &end, 10);
1104+ block_size = simple_strtoul(argv[7], &end, 10);
1105+ bbr_id->blksize_in_sects = (block_size >> SECTOR_SHIFT);
1106+
1107+ bbr_id->bbr_table = kmalloc(bbr_id->nr_sects_bbr_table << SECTOR_SHIFT,
1108+ GFP_KERNEL);
1109+ if (!bbr_id->bbr_table) {
1110+ ti->error = "dm-bbr: Error allocating bbr table.";
1111+ goto out2;
1112+ }
1113+
1114+ if (dm_get_device(ti, argv[0], 0, ti->len,
1115+ dm_table_get_mode(ti->table), &bbr_id->dev)) {
1116+ ti->error = "dm-bbr: Device lookup failed";
1117+ goto out2;
1118+ }
1119+
1120+ /* Using a semaphore here is probably overkill,
1121+ * but at least it will be correct.
1122+ */
1123+ down(&bbr_instances_lock);
1124+ if (bbr_instances == 0) {
1125+ rc = bbr_global_init();
1126+ if (rc) {
1127+ up(&bbr_instances_lock);
1128+ goto out3;
1129+ }
1130+ }
1131+ bbr_instances++;
1132+ up(&bbr_instances_lock);
1133+
1134+ rc = bbr_setup(bbr_id);
1135+ if (rc) {
1136+ ti->error = "dm-bbr: Device setup failed";
1137+ goto out4;
1138+ }
1139+
1140+ ti->private = bbr_id;
1141+ return 0;
1142+
1143+out4:
1144+ down(&bbr_instances_lock);
1145+ bbr_instances--;
1146+ if (bbr_instances == 0) {
1147+ bbr_global_cleanup();
1148+ }
1149+ up(&bbr_instances_lock);
1150+
1151+out3:
1152+ dm_put_device(ti, bbr_id->dev);
1153+out2:
1154+ bbr_free_private(bbr_id);
1155+out1:
1156+ return rc;
1157+}
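
For reference, the eight arguments checked above arrive on an ordinary device-mapper table line (logical start, length, target name, then the target arguments). A purely hypothetical example, with every device name and number invented only to show the positions:

0 2097024 bbr /dev/sdb1 0 2097024 2097279 255 2097534 127 4096

Reading the bbr arguments left to right: device, offset, table1_lsn, table2_lsn, table_size (sectors), start_replacement, nr_replacement_blks, block_size (bytes).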
1158+
1159+static void bbr_dtr(struct dm_target *ti)
1160+{
1161+ struct bbr_private *bbr_id = ti->private;
1162+
1163+ dm_put_device(ti, bbr_id->dev);
1164+ bbr_free_private(bbr_id);
1165+
1166+ down(&bbr_instances_lock);
1167+ bbr_instances--;
1168+ if (bbr_instances == 0) {
1169+ bbr_global_cleanup();
1170+ }
1171+ up(&bbr_instances_lock);
1172+}
1173+
1174+static int bbr_map(struct dm_target *ti, struct buffer_head *bh, int rw,
1175+ union map_info *map_context)
1176+{
1177+ struct bbr_private *bbr_id = ti->private;
1178+
1179+ bh->b_rsector += bbr_id->offset;
1180+ map_context->ptr = NULL;
1181+ switch (rw) {
1182+ case READ:
1183+ case READA:
1184+ return bbr_read(bbr_id, bh);
1185+ case WRITE:
1186+ return bbr_write(bbr_id, bh, map_context);
1187+ default:
1188+ return -EIO;
1189+ }
1190+}
1191+
1192+static int bbr_status(struct dm_target *ti, status_type_t type,
1193+ char *result, unsigned int maxlen)
1194+{
1195+ struct bbr_private *bbr_id = ti->private;
1196+
1197+ switch (type) {
1198+ case STATUSTYPE_INFO:
1199+ result[0] = '\0';
1200+ break;
1201+
1202+ case STATUSTYPE_TABLE:
1203+ snprintf(result, maxlen, "%s "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" "PFU64" %u",
1204+ dm_kdevname(bbr_id->dev->dev),
1205+ bbr_id->offset, bbr_id->lba_table1, bbr_id->lba_table2,
1206+ bbr_id->nr_sects_bbr_table,
1207+ bbr_id->start_replacement_sect,
1208+ bbr_id->nr_replacement_blks,
1209+ bbr_id->blksize_in_sects << SECTOR_SHIFT);
1210+ break;
1211+ }
1212+ return 0;
1213+}
1214+
1215+static struct target_type bbr_target = {
1216+ name: "bbr",
1217+ module: THIS_MODULE,
1218+ ctr: bbr_ctr,
1219+ dtr: bbr_dtr,
1220+ map: bbr_map,
1221+ end_io: bbr_callback,
1222+ status: bbr_status,
1223+};
1224+
1225+int __init dm_bbr_init(void)
1226+{
1227+ int r = dm_register_target(&bbr_target);
1228+
1229+ if (r < 0)
1230+ DMERR("dm-bbr: register failed %d", r);
1231+
1232+ return r;
1233+}
1234+
1235+void __exit dm_bbr_exit(void)
1236+{
1237+ int r = dm_unregister_target(&bbr_target);
1238+
1239+ if (r < 0)
1240+ DMERR("dm-bbr: unregister failed %d", r);
1241+}
1242+
1243+module_init(dm_bbr_init);
1244+module_exit(dm_bbr_exit);
1245+MODULE_LICENSE("GPL");
1246diff -urN linux-2.4.24.org/drivers/md/dm-bbr.h linux-2.4.24/drivers/md/dm-bbr.h
1247--- linux-2.4.24.org/drivers/md/dm-bbr.h 1970-01-01 01:00:00.000000000 +0100
1248+++ linux-2.4.24/drivers/md/dm-bbr.h 2004-01-18 16:03:13.101545929 +0100
1249@@ -0,0 +1,143 @@
1250+/*
1251+ * (C) Copyright IBM Corp. 2002, 2003
1252+ *
1253+ * This program is free software; you can redistribute it and/or modify
1254+ * it under the terms of the GNU General Public License as published by
1255+ * the Free Software Foundation; either version 2 of the License, or
1256+ * (at your option) any later version.
1257+ *
1258+ * This program is distributed in the hope that it will be useful,
1259+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1260+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1261+ * the GNU General Public License for more details.
1262+ *
1263+ * You should have received a copy of the GNU General Public License
1264+ * along with this program; if not, write to the Free Software
1265+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1266+ *
1267+ * linux/drivers/md/dm-bbr.h
1268+ *
1269+ * Bad-block-relocation (BBR) target for device-mapper.
1270+ *
1271+ * The BBR target is designed to remap I/O write failures to another safe
1272+ * location on disk. Note that most disk drives have BBR built into them;
1273+ * this means that our software BBR will only be activated when all hardware
1274+ * BBR replacement sectors have been used.
1275+ */
1276+
1277+#define BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
1278+#define BBR_ENTRIES_PER_SECT 31
1279+#define BBR_NR_BUFS 128
1280+#define INITIAL_CRC 0xFFFFFFFF
1281+#define CRC_POLYNOMIAL 0xEDB88320L
1282+
1283+/**
1284+ * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
1285+ * Use these in place of %Ld, %Lu, and %Lx.
1286+ **/
1287+#if BITS_PER_LONG > 32
1288+#define PFU64 "%lu"
1289+#else
1290+#define PFU64 "%Lu"
1291+#endif
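
A tiny usage sketch of the macro, written as a user-space stand-in and assuming a 64-bit (LP64) host so the first branch applies; the typedef and the sector value are invented for the demo.

#include <stdio.h>

typedef unsigned long u64;	/* LP64 stand-in for the kernel type */
#define PFU64 "%lu"		/* BITS_PER_LONG > 32 branch */

int main(void)
{
	u64 bad_sect = 123456789012UL;	/* invented sector number */

	printf("dm-bbr: relocating bad sector " PFU64 "\n", bad_sect);
	return 0;
}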
1292+
1293+/**
1294+ * struct bbr_table_entry
1295+ * @bad_sect: LBA of bad location.
1296+ * @replacement_sect: LBA of new location.
1297+ *
1298+ * Structure to describe one BBR remap.
1299+ **/
1300+struct bbr_table_entry {
1301+ u64 bad_sect;
1302+ u64 replacement_sect;
1303+};
1304+
1305+/**
1306+ * struct bbr_table
1307+ * @signature: Signature on each BBR table sector.
1308+ * @crc: CRC for this table sector.
1309+ * @sequence_number: Used to resolve conflicts when primary and secondary
1310+ * tables do not match.
1311+ * @in_use_cnt: Number of in-use table entries.
1312+ * @entries: Actual table of remaps.
1313+ *
1314+ * Structure to describe each sector of the metadata table. Each sector in this
1315+ * table can describe 31 remapped sectors.
1316+ **/
1317+struct bbr_table {
1318+ u32 signature;
1319+ u32 crc;
1320+ u32 sequence_number;
1321+ u32 in_use_cnt;
1322+ struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
1323+};
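
As the comment notes, each table sector describes 31 remaps; with the field sizes above that works out to a 16-byte header plus 31 x 16-byte entries, exactly one 512-byte sector. A quick user-space check of that arithmetic, with the structures mirrored using stdint types:

#include <stdint.h>
#include <stdio.h>

#define BBR_ENTRIES_PER_SECT 31

struct bbr_table_entry {
	uint64_t bad_sect;
	uint64_t replacement_sect;
};

struct bbr_table {
	uint32_t signature;
	uint32_t crc;
	uint32_t sequence_number;
	uint32_t in_use_cnt;
	struct bbr_table_entry entries[BBR_ENTRIES_PER_SECT];
};

int main(void)
{
	/* 16-byte header + 31 * 16-byte entries = 512 bytes = one sector */
	printf("sizeof(struct bbr_table) = %zu\n", sizeof(struct bbr_table));
	return 0;
}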
1324+
1325+/**
1326+ * struct bbr_runtime_remap
1327+ *
1328+ * Node in the binary tree used to keep track of remaps.
1329+ **/
1330+struct bbr_runtime_remap {
1331+ struct bbr_table_entry remap;
1332+ struct bbr_runtime_remap *left;
1333+ struct bbr_runtime_remap *right;
1334+};
1335+
1336+/**
1337+ * struct bbr_private
1338+ * @dev: Info about underlying device.
1339+ * @bbr_table: Copy of metadata table.
1340+ * @remap_root: Binary tree containing all remaps.
1341+ * @offset: LBA of data area.
1342+ * @lba_table1: LBA of primary BBR table.
1343+ * @lba_table2: LBA of secondary BBR table.
1344+ * @nr_sects_bbr_table: Size of each BBR table.
1345+ * @nr_replacement_blks: Number of replacement blocks.
1346+ * @start_replacement_sect: LBA of start of replacement blocks.
1347+ * @blksize_in_sects: Size of each block.
1348+ * @in_use_replacement_blks: Current number of remapped blocks.
1349+ * @bbr_id_lock: Lock for the binary tree.
1350+ *
1351+ * Private data for each BBR target.
1352+ **/
1353+struct bbr_private {
1354+ struct dm_dev *dev;
1355+ struct bbr_table *bbr_table;
1356+ struct bbr_runtime_remap *remap_root;
1357+ u64 offset;
1358+ u64 lba_table1;
1359+ u64 lba_table2;
1360+ u64 nr_sects_bbr_table;
1361+ u64 start_replacement_sect;
1362+ u64 nr_replacement_blks;
1363+ u32 blksize_in_sects;
1364+ atomic_t in_use_replacement_blks;
1365+ spinlock_t bbr_id_lock;
1366+};
1367+
1368+#define BBR_IO_HANDLED (1<<0)
1369+#define BBR_IO_RELOCATE (1<<1)
1370+
1371+/**
1372+ * struct bbr_io_buffer
1373+ * @bbr_io_list: Thread's list of bbr_io_buf's.
1374+ * @bbr_id: Object for this request.
1375+ * @bh: Original buffer_head.
1376+ * @sector: Original sector
1377+ * @flags: Operation flag (BBR_IO_*)
1378+ * @rw: READ or WRITE.
1379+ * @rc: Return code from bbr_io_handler.
1380+ *
1381+ * Structure used to track each write request.
1382+ **/
1383+struct bbr_io_buffer {
1384+ struct list_head bbr_io_list;
1385+ struct bbr_private *bbr_id;
1386+ struct buffer_head *bh;
1387+ u64 sector;
1388+ u32 flags;
1389+ s32 rw;
1390+ s32 rc;
1391+};
1392+
1393diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
1394--- linux-2.4.24.org/drivers/md/dm.c 2004-01-18 15:09:18.533171353 +0100
1395+++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:59:40.046635861 +0100
1396@@ -951,13 +951,23 @@
1397 int r = 0;
1398 DECLARE_WAITQUEUE(wait, current);
1399
1400- down_write(&md->lock);
1401+ /* Flush IO to the origin device */
1402+ down_read(&md->lock);
1403+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1404+ up_read(&md->lock);
1405+ return -EINVAL;
1406+ }
1407+
1408+ fsync_dev_lockfs(md->dev);
1409+ up_read(&md->lock);
1410+
1411
1412 /*
1413- * First we set the BLOCK_IO flag so no more ios will be
1414- * mapped.
1415+ * Set the BLOCK_IO flag so no more ios will be mapped.
1416 */
1417+ down_write(&md->lock);
1418 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1419+ unlockfs(md->dev);
1420 up_write(&md->lock);
1421 return -EINVAL;
1422 }
1423@@ -986,6 +996,7 @@
1424
1425 /* did we flush everything ? */
1426 if (atomic_read(&md->pending)) {
1427+ unlockfs(md->dev);
1428 clear_bit(DMF_BLOCK_IO, &md->flags);
1429 r = -EINTR;
1430 } else {
1431@@ -1017,6 +1028,7 @@
1432 md->deferred = NULL;
1433 up_write(&md->lock);
1434
1435+ unlockfs(md->dev);
1436 flush_deferred_io(def);
1437 run_task_queue(&tq_disk);
1438
1439diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
1440--- linux-2.4.24.org/drivers/md/dm-snapshot.c 2004-01-18 15:09:18.569163966 +0100
1441+++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 16:02:40.858328124 +0100
1442@@ -92,6 +92,9 @@
1443
1444 /* List of snapshots for this origin */
1445 struct list_head snapshots;
1446+
1447+	/* Count of snapshots and origins referencing this structure. */
1448+ unsigned int count;
1449 };
1450
1451 /*
1452@@ -155,6 +158,35 @@
1453 }
1454
1455 /*
1456+ * Allocate and initialize an origin structure.
1457+ */
1458+static struct origin * __alloc_origin(kdev_t dev)
1459+{
1460+ struct origin *o = kmalloc(sizeof(*o), GFP_KERNEL);
1461+ if (o) {
1462+ o->dev = dev;
1463+ INIT_LIST_HEAD(&o->hash_list);
1464+ INIT_LIST_HEAD(&o->snapshots);
1465+ __insert_origin(o);
1466+ }
1467+ return o;
1468+}
1469+
1470+static void __get_origin(struct origin *o)
1471+{
1472+ o->count++;
1473+}
1474+
1475+static void __put_origin(struct origin *o)
1476+{
1477+ o->count--;
1478+ if (o->count == 0) {
1479+ list_del(&o->hash_list);
1480+ kfree(o);
1481+ }
1482+}
1483+
1484+/*
1485 * Make a note of the snapshot and its origin so we can look it
1486 * up when the origin has a write on it.
1487 */
1488@@ -168,20 +200,37 @@
1489
1490 if (!o) {
1491 /* New origin */
1492- o = kmalloc(sizeof(*o), GFP_KERNEL);
1493+ o = __alloc_origin(dev);
1494 if (!o) {
1495 up_write(&_origins_lock);
1496 return -ENOMEM;
1497 }
1498+ }
1499
1500- /* Initialise the struct */
1501- INIT_LIST_HEAD(&o->snapshots);
1502- o->dev = dev;
1503+ __get_origin(o);
1504+ list_add_tail(&snap->list, &o->snapshots);
1505
1506- __insert_origin(o);
1507+ up_write(&_origins_lock);
1508+ return 0;
1509+}
1510+
1511+static int register_origin(kdev_t dev)
1512+{
1513+ struct origin *o;
1514+
1515+ down_write(&_origins_lock);
1516+ o = __lookup_origin(dev);
1517+
1518+ if (!o) {
1519+ /* New origin */
1520+ o = __alloc_origin(dev);
1521+ if (!o) {
1522+ up_write(&_origins_lock);
1523+ return -ENOMEM;
1524+ }
1525 }
1526
1527- list_add_tail(&snap->list, &o->snapshots);
1528+ __get_origin(o);
1529
1530 up_write(&_origins_lock);
1531 return 0;
1532@@ -195,11 +244,18 @@
1533 o = __lookup_origin(s->origin->dev);
1534
1535 list_del(&s->list);
1536- if (list_empty(&o->snapshots)) {
1537- list_del(&o->hash_list);
1538- kfree(o);
1539- }
1540+ __put_origin(o);
1541+
1542+ up_write(&_origins_lock);
1543+}
1544+
1545+static void unregister_origin(kdev_t dev)
1546+{
1547+ struct origin *o;
1548
1549+ down_write(&_origins_lock);
1550+ o = __lookup_origin(dev);
1551+ __put_origin(o);
1552 up_write(&_origins_lock);
1553 }
1554
1555@@ -524,9 +580,6 @@
1556 goto bad5;
1557 }
1558
1559- /* Flush IO to the origin device */
1560- fsync_dev(s->origin->dev);
1561-
1562 /* Add snapshot to the list of snapshots for this origin */
1563 if (register_snapshot(s)) {
1564 r = -EINVAL;
1565@@ -1093,6 +1146,13 @@
1566 return r;
1567 }
1568
1569+ r = register_origin(dev->dev);
1570+ if (r) {
1571+ ti->error = "Cannot register origin";
1572+ dm_put_device(ti, dev);
1573+ return r;
1574+ }
1575+
1576 ti->private = dev;
1577 return 0;
1578 }
1579@@ -1100,6 +1160,7 @@
1580 static void origin_dtr(struct dm_target *ti)
1581 {
1582 struct dm_dev *dev = (struct dm_dev *) ti->private;
1583+ unregister_origin(dev->dev);
1584 dm_put_device(ti, dev);
1585 }
1586
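
The dm-snapshot hunks above replace the old free-when-the-snapshot-list-empties logic with a reference count on the shared origin structure, taken by both snapshot and origin targets. A minimal user-space sketch of that counting pattern, with names shortened and the _origins_lock locking omitted:

#include <stdlib.h>
#include <stdio.h>

struct origin {
	int count;	/* snapshots + origin targets using this */
};

static struct origin *origin_alloc(void)
{
	return calloc(1, sizeof(struct origin));
}

static void origin_get(struct origin *o)
{
	o->count++;
}

static void origin_put(struct origin *o)
{
	if (--o->count == 0) {
		printf("last reference dropped, freeing origin\n");
		free(o);
	}
}

int main(void)
{
	struct origin *o = origin_alloc();

	if (!o)
		return 1;
	origin_get(o);	/* register_snapshot()   */
	origin_get(o);	/* register_origin()     */
	origin_put(o);	/* unregister_snapshot() */
	origin_put(o);	/* unregister_origin() -> freed here */
	return 0;
}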
1587diff -urN linux-2.4.24.org/drivers/md/dm-sparse.c linux-2.4.24/drivers/md/dm-sparse.c
1588--- linux-2.4.24.org/drivers/md/dm-sparse.c 1970-01-01 01:00:00.000000000 +0100
1589+++ linux-2.4.24/drivers/md/dm-sparse.c 2004-01-18 16:04:48.284615142 +0100
1590@@ -0,0 +1,709 @@
1591+/* -*- linux-c -*- */
1592+
1593+/*
1594+ * Copyright (c) International Business Machines Corp., 2002
1595+ *
1596+ * This program is free software; you can redistribute it and/or modify
1597+ * it under the terms of the GNU General Public License as published by
1598+ * the Free Software Foundation; either version 2 of the License, or
1599+ * (at your option) any later version.
1600+ *
1601+ * This program is distributed in the hope that it will be useful,
1602+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
1603+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
1604+ * the GNU General Public License for more details.
1605+ *
1606+ * You should have received a copy of the GNU General Public License
1607+ * along with this program; if not, write to the Free Software
1608+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
1609+ *
1610+ * linux/drivers/md/dm-sparse.c
1611+ *
1612+ * Sparse target for device-mapper.
1613+ *
1614+ * This target provides the ability to create a sparse device. This
1615+ * allows a device to pretend to be larger than it really is.
1616+ */
1617+
1618+#include <linux/module.h>
1619+#include <linux/init.h>
1620+#include <linux/blkdev.h>
1621+#include <linux/slab.h>
1622+#include <linux/mempool.h>
1623+#include <linux/vmalloc.h>
1624+
1625+#include "dm.h"
1626+#include "dm-io.h"
1627+
1628+#define MAX_HASH_CHAIN_ENTRIES 10
1629+#define NAME_SIZE 127
1630+
1631+/* Sparse Ioctl
1632+ device
1633+ start
1634+ chunk_size
1635+ chunks
1636+ */
1637+
1638+// Entries in the sparse remapping structure
1639+struct sparse_hash_entry {
1640+ u64 org_chunk; // Chunk number, not LBA.
1641+ u64 sparse_chunk; // Chunk number, not LBA.
1642+ struct sparse_hash_entry * next;
1643+ struct sparse_hash_entry * prev;
1644+};
1645+
1646+//Private data structure
1647+struct sparse_volume {
1648+ struct dm_dev *dev;
1649+ struct rw_semaphore sparse_semaphore;
1650+ struct sparse_hash_entry ** sparse_map; // Hash table of remappings
1651+ struct sparse_hash_entry * free_hash_list;
1652+ kmem_cache_t * hash_slab;
1653+ mempool_t * hash_pool;
1654+ u32 dm_io_flag;
1655+ u32 chunk_size; // Sectors.
1656+ u32 chunk_shift; // Shift value for chunk size.
1657+ u32 num_chunks; // In this volume.
1658+ u32 next_cow_entry; // Index into current COW table.
1659+ u64 current_cow_sector; // LOGICAL sector of current COW table.
1660+ u32 next_free_chunk; // Index of next free chunk (not LBA!).
1661+ u32 hash_table_size; // Size of the hash table for the remap.
1662+ u64 start;
1663+ u64 cow_table[64]; // One sector's worth of COW tables.
1664+};
1665+
1666+/*************************** OLD SERVICES ****************************/
1667+
1668+/* computes log base 2 of value */
1669+inline int log2(u32 value) //ok to change to u32?
1670+{
1671+ int result = -1;
1672+ long tmp; //ok to change to long?
1673+
1674+ if (value) {
1675+ tmp = value;
1676+ result++;
1677+ while (!(tmp & 1)) {
1678+ result++;
1679+ tmp >>= 1;
1680+ }
1681+ if (tmp != 1) {
1682+ result = -2;
1683+ }
1684+ }
1685+ return result;
1686+}
1687+
1688+/********************************* Functions *********************************/
1689+
1690+/***************************** Hash Functions *****************************/
1691+
1692+/* Take and initialize from the free hash list */
1693+static struct sparse_hash_entry *
1694+allocate_sparse_hash_entry( struct sparse_volume * volume,
1695+ u64 org_chunk,
1696+ u64 sparse_chunk )
1697+{
1698+ struct sparse_hash_entry * hash_entry;
1699+
1700+ hash_entry = volume->free_hash_list;
1701+ if ( hash_entry ) { //should always be the case b/c preallocate these
1702+ volume->free_hash_list = hash_entry->next;
1703+ hash_entry->org_chunk = org_chunk;
1704+ hash_entry->sparse_chunk = sparse_chunk;
1705+ hash_entry->next = NULL;
1706+ hash_entry->prev = NULL;
1707+ }
1708+
1709+ return hash_entry;
1710+}
1711+
1712+/*
1713+ * This function inserts a new entry into a sparse hash chain, immediately
1714+ * following the specified entry. This function should not be used to add
1715+ * an entry into an empty list, or as the first entry in an existing list.
1716+ * For that case, use insert_sparse_map_entry_at_head().
1717+ */
1718+static int insert_sparse_hash_entry( struct sparse_hash_entry * entry,
1719+ struct sparse_hash_entry * base )
1720+{
1721+ entry->next = base->next;
1722+ entry->prev = base;
1723+ base->next = entry;
1724+ if ( entry->next ) {
1725+ entry->next->prev = entry;
1726+ }
1727+ return 0;
1728+}
1729+
1730+/*
1731+ * This function inserts a new entry into a sparse chain as the first
1732+ * entry in the chain.
1733+ */
1734+static int insert_sparse_hash_entry_at_head( struct sparse_hash_entry * entry,
1735+ struct sparse_hash_entry ** head )
1736+{
1737+ entry->next = *head;
1738+ entry->prev = NULL;
1739+ *head = entry;
1740+ if ( entry->next ) {
1741+ entry->next->prev = entry;
1742+ }
1743+ return 0;
1744+}
1745+
1746+/*
1747+ * Delete all items in a single chain in the hash table.
1748+ */
1749+static int delete_sparse_hash_chain( struct sparse_volume * vol,
1750+ struct sparse_hash_entry * head )
1751+{
1752+ struct sparse_hash_entry * next;
1753+
1754+ while ( head ) {
1755+ next = head->next;
1756+ mempool_free( head, vol->hash_pool );
1757+ head = next;
1758+ }
1759+ return 0;
1760+}
1761+
1762+/*
1763+ * This function will search the hash chain that is anchored at the
1764+ * specified head pointer. If the chunk number is found, a pointer to that
1765+ * entry in the chain is set, and a 1 is returned. If the chunk is not
1766+ * found, a pointer to the previous entry is set and 0 is returned. If the
1767+ * return pointer is NULL, this means either the list is empty, or the
1768+ * specified sector should become the first list item.
1769+ */
1770+static int search_sparse_hash_chain( u64 chunk,
1771+ struct sparse_hash_entry * head,
1772+ struct sparse_hash_entry ** result )
1773+{
1774+ struct sparse_hash_entry * curr = head;
1775+ struct sparse_hash_entry * prev = head;
1776+ while ( curr && curr->org_chunk < chunk ) {
1777+ prev = curr;
1778+ curr = curr->next;
1779+ }
1780+ if (!curr) { // Either an empty chain or went off the end of the chain.
1781+ *result = prev;
1782+ return 0;
1783+ }
1784+ else if ( curr->org_chunk != chunk ) {
1785+ *result = curr->prev;
1786+ return 0;
1787+ }
1788+ else {
1789+ *result = curr;
1790+ return 1;
1791+ }
1792+}
1793+
1794+/*
1795+ * This function takes a cow table entry (from the on-disk data), and
1796+ * converts it into an appropriate entry for the sparse map, and
1797+ * inserts it into the appropriate map for the specified volume.
1798+ */
1799+static int add_cow_entry_to_sparse_map( u64 org_chunk,
1800+ u64 sparse_chunk,
1801+ struct sparse_volume * volume )
1802+{
1803+ struct sparse_hash_entry * new_entry;
1804+ struct sparse_hash_entry * target_entry;
1805+ u32 hash_value;
1806+ int rc = -EINVAL;
1807+
1808+ new_entry = allocate_sparse_hash_entry(volume, org_chunk, sparse_chunk);
1809+ if (!new_entry) {
1810+ return -ENOMEM;
1811+ }
1812+
1813+ hash_value = (long)org_chunk % volume->hash_table_size;
1814+
1815+ if (! search_sparse_hash_chain( org_chunk,
1816+ volume->sparse_map[hash_value],
1817+ &target_entry ) ) {
1818+ //should always take this path
1819+
1820+ if ( target_entry ) {
1821+ insert_sparse_hash_entry( new_entry, target_entry );
1822+ }
1823+ else {
1824+ insert_sparse_hash_entry_at_head
1825+ ( new_entry, &(volume->sparse_map[hash_value]) );
1826+ }
1827+ rc = 0;
1828+ }
1829+ return rc;
1830+}
1831+
1832+/*
1833+ * Construct the initial hash table state based on
1834+ * existing COW tables on the disk.
1835+ */
1836+static int build_sparse_maps(struct sparse_volume * volume)
1837+{
1838+ int rc = 0, done = 0;
1839+ struct io_region job;
1840+ struct page * page;
1841+ unsigned int error, offset;
1842+
1843+ while (!done) {
1844+
1845+ // Read in one sector's worth of COW tables.
1846+ job.dev = volume->dev->dev;
1847+ job.sector = volume->current_cow_sector;
1848+ job.count = 1;
1849+ page = virt_to_page(volume->cow_table);
1850+ offset = (unsigned long)volume->cow_table & ~PAGE_MASK;
1851+ rc = dm_io_sync(1, &job, READ, page, offset, &error);
1852+ if (rc) {
1853+ return rc;
1854+ }
1855+
1856+ // Translate every valid COW table entry into
1857+ // a sparse map entry.
1858+ for ( volume->next_cow_entry = 0;
1859+
1860+ volume->next_cow_entry < (SECTOR_SIZE/sizeof(u64)) &&
1861+ volume->cow_table[volume->next_cow_entry] !=
1862+ 0xffffffffffffffff;
1863+
1864+ volume->next_cow_entry++, volume->next_free_chunk++ ) {
1865+
1866+ if ( (rc = add_cow_entry_to_sparse_map
1867+ ( le64_to_cpu( volume->cow_table[volume->next_cow_entry] ),
1868+ volume->next_free_chunk, volume ))) {
1869+ return( rc );
1870+ }
1871+ }
1872+ // Move on to the next sector if necessary.
1873+ if ( volume->next_cow_entry == (SECTOR_SIZE/sizeof(u64)) ) {
1874+ volume->current_cow_sector++;
1875+ }
1876+ else {
1877+ done = 1;
1878+ }
1879+ }
1880+ return 0;
1881+}
1882+
1883+/************************* Other Functions ************************/
1884+
1885+/*
1886+ * Function: sparse_remap_chunk
1887+ *
1888+ * This function performs a sector remap on a sparse volume. This should
1889+ * be called from the I/O path, It first determines the base sector
1890+ * of the chunk containing the specified sector, and saves the remainder.
1891+ * Then it performs a search through the sparse map for the specified
1892+ * volume. If a match is found, the sector number is changed to the new
1893+ * value. If no match is found, the value is left the same, meaning the
1894+ * chunk has not been remapped.
1895+ */
1896+static int sparse_remap_chunk( struct sparse_volume * sparse_volume,
1897+ u64 * sector )
1898+{
1899+ struct sparse_hash_entry * result;
1900+ u64 chunk;
1901+ u32 hash_value;
1902+ u32 remainder;
1903+ int rc = 1;
1904+
1905+ down_read(&sparse_volume->sparse_semaphore);
1906+
1907+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1908+ chunk = *sector >> sparse_volume->chunk_shift;
1909+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1910+
1911+ if ( search_sparse_hash_chain( chunk,
1912+ sparse_volume->sparse_map[hash_value],
1913+ &result) ) {
1914+ *sector = ( result->sparse_chunk << sparse_volume->chunk_shift )
1915+ + remainder;
1916+ rc = 0;
1917+ }
1918+ up_read(&sparse_volume->sparse_semaphore);
1919+ return rc;
1920+}
1921+
1922+/* Function: sparse_cow_write
1923+ *
1924+ * Check this sparse node to see if the given sector/chunk has been
1925+ * remapped yet. If it hasn't, create a new hash table entry, update the
1926+ * in-memory COW table, write the COW table to disk.
1927+ */
1928+
1929+static int sparse_cow_write( struct sparse_volume * sparse_volume,
1930+ u64 * sector )
1931+{
1932+ struct sparse_hash_entry * target_entry, * new_map_entry;
1933+ struct io_region job;
1934+ struct page * page;
1935+ char * cow = NULL;
1936+ unsigned int error, offset;
1937+ u64 chunk;
1938+ u32 hash_value = 0;
1939+ u32 remainder;
1940+ int rc;
1941+
1942+ down_write(&sparse_volume->sparse_semaphore);
1943+
1944+ remainder = *sector & (u64)(sparse_volume->chunk_size - 1);
1945+ chunk = *sector >> sparse_volume->chunk_shift;
1946+ hash_value = ((u32)chunk) % sparse_volume->hash_table_size;
1947+
1948+ if ( search_sparse_hash_chain( chunk,
1949+ sparse_volume->sparse_map[hash_value],
1950+ &target_entry) ) {
1951+ *sector =
1952+ ( target_entry->sparse_chunk << sparse_volume->chunk_shift )
1953+ + remainder;
1954+ rc = 0;
1955+ goto out;
1956+ }
1957+
1958+ // Is there enough room left on this sparse to remap this chunk?
1959+ if ( sparse_volume->next_free_chunk >= sparse_volume->num_chunks ) {
1960+ DMERR("dm-sparse: full no new remaps allowed\n");
1961+ rc = -ENOSPC;
1962+ goto out;
1963+ }
1964+
1965+ // Create and initialize a new hash table entry for the new remap.
1966+ new_map_entry = allocate_sparse_hash_entry
1967+ (sparse_volume, chunk, sparse_volume->next_free_chunk);
1968+ if ( ! new_map_entry ) {
1969+ // Can't get memory for map entry. Disable this sparse.
1970+ DMERR("dm-sparse: memory error allocating hash entry\n");
1971+ rc = -ENOMEM;
1972+ goto out;
1973+ }
1974+
1975+ // Always write the COW table so it's safe.
1976+ cow = kmalloc( SECTOR_SIZE, GFP_KERNEL );
1977+ if (! cow ) {
1978+ // Can't get I/O buffer. Disable this sparse.
1979+ DMERR("dm-sparse: memory error allocating COW table buffer");
1980+ rc = -ENOMEM;
1981+ goto out;
1982+ }
1983+
1984+ // Add the entry to the hash table.
1985+ if ( target_entry ) {
1986+ insert_sparse_hash_entry( new_map_entry, target_entry );
1987+ }
1988+ else {
1989+ insert_sparse_hash_entry_at_head
1990+ ( new_map_entry,
1991+ &(sparse_volume->sparse_map[hash_value]) );
1992+ }
1993+
1994+ sparse_volume->next_free_chunk++;
1995+
1996+ // Update the appropriate entry in the COW table.
1997+ sparse_volume->cow_table[sparse_volume->next_cow_entry] =
1998+ cpu_to_le64(chunk);
1999+ sparse_volume->next_cow_entry++;
2000+
2001+ memcpy(cow, sparse_volume->cow_table, SECTOR_SIZE);
2002+
2003+ // Because of ordering issues, this write needs to be synchronous.
2004+ job.dev = sparse_volume->dev->dev;
2005+ job.sector = sparse_volume->current_cow_sector;
2006+ job.count = 1;
2007+ page = virt_to_page(cow);
2008+ offset = (unsigned long)cow & ~PAGE_MASK;
2009+ dm_io_sync(1, &job, WRITE, page, offset, &error);
2010+
2011+ // Update the in-memory COW table values.
2012+ if ( sparse_volume->next_cow_entry >= (SECTOR_SIZE/sizeof(u64)) )
2013+ {
2014+ sparse_volume->next_cow_entry = 0;
2015+ sparse_volume->current_cow_sector++;
2016+ memset(sparse_volume->cow_table, 0xff, SECTOR_SIZE);
2017+ }
2018+
2019+ *sector = ( new_map_entry->sparse_chunk << sparse_volume->chunk_shift )
2020+ + remainder;
2021+
2022+ rc = 0;
2023+
2024+ out:
2025+ up_write(&sparse_volume->sparse_semaphore);
2026+ if ( cow ) {
2027+ kfree( cow );
2028+ }
2029+
2030+ return rc;
2031+}
2032+
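A standalone sketch (not part of the patch) of the in-memory COW-table bookkeeping that ends sparse_cow_write(): each new remap appends one entry, and once a sector's worth of entries is full the code advances to the next on-disk COW sector and resets the table to the all-ones terminator pattern. The struct and field names below simply mirror the sparse_volume fields.

#include <stdint.h>
#include <string.h>

#define SECTOR_SIZE 512
#define COW_ENTRIES (SECTOR_SIZE / sizeof(uint64_t))

struct cow_state {
	uint64_t table[COW_ENTRIES];  /* mirrors sparse_volume->cow_table */
	unsigned next_entry;          /* mirrors next_cow_entry */
	uint64_t current_sector;      /* mirrors current_cow_sector */
};

static void cow_record(struct cow_state *c, uint64_t logical_chunk)
{
	c->table[c->next_entry++] = logical_chunk;  /* cpu_to_le64() in the kernel */
	if (c->next_entry >= COW_ENTRIES) {
		c->next_entry = 0;
		c->current_sector++;
		memset(c->table, 0xff, sizeof(c->table));  /* all-ones marks unused entries */
	}
}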
2033+/************************ EXPORT FUNCTIONS ************************/
2034+
2035+/*
2036+ * Function: sparse_dtr
2037+ */
2038+static void sparse_dtr( struct dm_target *ti )
2039+{
2040+ struct sparse_volume * vol = (struct sparse_volume *)ti->private;
2041+ int i;
2042+
2043+ if (vol) {
2044+
2045+ if (vol->sparse_map) {
2046+ for ( i = 0; i < vol->hash_table_size; i++ ) {
2047+ delete_sparse_hash_chain( vol, vol->sparse_map[i] );
2048+ }
2049+ delete_sparse_hash_chain( vol, vol->free_hash_list );
2050+ vfree(vol->sparse_map);
2051+ }
2052+
2053+ if (vol->hash_pool)
2054+ mempool_destroy(vol->hash_pool);
2055+
2056+ if (vol->hash_slab)
2057+ kmem_cache_destroy(vol->hash_slab);
2058+
2059+ dm_put_device(ti, vol->dev);
2060+
2061+ if (vol->dm_io_flag) {
2062+ dm_io_put(1);
2063+ }
2064+
2065+ kfree( vol );
2066+ }
2067+}
2068+
2069+/*
2070+ * Function: sparse_ctr
2071+ */
2072+static int sparse_ctr( struct dm_target *ti, unsigned int argc, char** argv )
2073+{
2074+ int i, rc = -EINVAL;
2075+ struct sparse_hash_entry *new_entry;
2076+ struct sparse_volume *vol;
2077+ struct dm_dev *dev;
2078+ u32 chunk_size, chunks;
2079+ u64 start;
2080+ char* end, slab_name[NAME_SIZE+1];
2081+
2082+ if ( argc != 4 ) {
2083+ ti->error="dm-sparse: wrong number of arguments";
2084+ return rc;
2085+ }
2086+
2087+ start = simple_strtoull(argv[1], &end, 10);
2088+ if (*end) {
2089+ ti->error="dm-sparse: Invalid first chunk lba";
2090+ return rc;
2091+ }
2092+
2093+ chunk_size = simple_strtoul(argv[2], &end, 10);
2094+ if (*end) {
2095+ ti->error="dm-sparse: Invalid chunk_size";
2096+ return rc;
2097+ }
2098+
2099+ chunks = simple_strtoul(argv[3], &end, 10);
2100+ if (*end) {
2101+ ti->error="dm-sparse: Invalid number of chunks";
2102+ return rc;
2103+ }
2104+
2105+ if ( dm_get_device( ti, argv[0], ti->begin, start + chunks * chunk_size,
2106+ dm_table_get_mode(ti->table), &dev ) ) {
2107+ ti->error = "dm-sparse: Device lookup failed";
2108+ return rc;
2109+ }
2110+
2111+ vol = kmalloc(sizeof(struct sparse_volume), GFP_KERNEL);
2112+ if ( !vol ) {
2113+ ti->error = "dm-sparse: Memory allocation for private-data failed";
2114+ rc = -ENOMEM;
2115+ goto out;
2116+ }
2117+
2118+ memset( vol, 0, sizeof(struct sparse_volume) );
2119+
2120+ rc = dm_io_get(1);
2121+ if (rc) {
2122+ ti->error = "dm-sparse: failed to initialize dm-io.";
2123+ sparse_dtr(ti);
2124+ return rc;
2125+ }
2126+
2127+ // Initialize
2128+ vol->dm_io_flag = 1;
2129+ vol->chunk_size = chunk_size;
2130+ vol->chunk_shift = log2(chunk_size);
2131+ vol->num_chunks = chunks;
2132+ vol->current_cow_sector = 1;
2133+ vol->hash_table_size = chunks / MAX_HASH_CHAIN_ENTRIES + 1;
2134+ vol->start = start;
2135+ vol->dev = dev;
2136+ init_rwsem(&vol->sparse_semaphore);
2137+
2138+ snprintf(slab_name, NAME_SIZE, "sparse-%p", vol);
2139+ vol->hash_slab = kmem_cache_create(slab_name,
2140+ sizeof(struct sparse_hash_entry),
2141+ 0, SLAB_HWCACHE_ALIGN,
2142+ NULL, NULL);
2143+ if ( ! vol->hash_slab ) {
2144+ ti->error = "dm-sparse: memory allocation error in hash slab create";
2145+ sparse_dtr(ti);
2146+ return -ENOMEM;
2147+ }
2148+ vol->hash_pool = mempool_create(1, mempool_alloc_slab,
2149+ mempool_free_slab,
2150+ vol->hash_slab);
2151+ if ( ! vol->hash_pool ) {
2152+ ti->error = "dm-sparse: memory allocation error in hash pool create";
2153+ sparse_dtr(ti);
2154+ return -ENOMEM;
2155+ }
2156+
2157+ // Sparse hash table
2158+ vol->sparse_map = vmalloc( vol->hash_table_size *
2159+ sizeof( struct sparse_hash_entry * ) );
2160+ if ( ! vol->sparse_map ) {
2161+ ti->error = "dm-sparse: Memory allocation error in sparse_map create";
2162+ sparse_dtr(ti);
2163+ return -ENOMEM;
2164+ }
2165+
2166+ memset( vol->sparse_map, 0, vol->hash_table_size *
2167+ sizeof( struct sparse_hash_entry * ) );
2168+
2169+ for ( i = 0; i < chunks; i++ ) {
2170+
2171+ new_entry = mempool_alloc(vol->hash_pool, GFP_KERNEL );
2172+ if ( ! new_entry ) {
2173+ ti->error="dm-sparse: memory allocation error in hash table setup";
2174+ sparse_dtr(ti);
2175+ return -ENOMEM;
2176+ }
2177+
2178+ new_entry->next = vol->free_hash_list;
2179+ vol->free_hash_list = new_entry;
2180+ }
2181+
2182+ rc = build_sparse_maps(vol);
2183+ if (rc) {
2184+ ti->error = "dm-sparse: error building hash tables";
2185+ sparse_dtr(ti);
2186+ return rc;
2187+ }
2188+
2189+ ti->private = vol;
2190+ return rc;
2191+
2192+ out:
2193+ dm_put_device(ti, dev);
2194+ return rc;
2195+}
2196+
2197+/*
2198+ * Function: sparse_map
2199+ */
2200+static int sparse_map( struct dm_target * ti, struct buffer_head * bh, int rw,
2201+ union map_info *map_context )
2202+{
2203+ struct sparse_volume * volume = (struct sparse_volume*)ti->private;
2204+ u64 sector = bh->b_rsector;
2205+ int rc;
2206+
2207+ // Check if this sector has been remapped
2208+ rc = sparse_remap_chunk( volume, &sector );
2209+
2210+ if ( rc < 0 ) { //Error
2211+ return rc;
2212+ }
2213+
2214+	if ( rc == 0 ) { // Remapped I/O: reads and writes use the same logic
2215+ bh->b_rsector = volume->start + sector;
2216+ bh->b_rdev = volume->dev->dev;
2217+ return 1;
2218+ }
2219+
2220+	// Previously unmapped: reads and writes take different paths
2221+
2222+ if ( rw ) { //write :
2223+ rc = sparse_cow_write( volume, &sector );
2224+
2225+ if ( rc < 0 ) { //Error
2226+ return rc;
2227+ }
2228+ //Send write on
2229+ bh->b_rsector = volume->start + sector;
2230+ bh->b_rdev = volume->dev->dev;
2231+ return 1;
2232+ }
2233+
2234+ //Reading something that was never written
2235+ //return zeros and indicate complete
2236+ memset(bh->b_data, 0x0, bh->b_size);
2237+ bh->b_end_io(bh, 1);
2238+ return 0;
2239+}
2240+
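To summarize the three outcomes of sparse_map() above, here is a small sketch (not part of the patch); the enum and function names are illustrative only.

/* Outcome of mapping one buffer head through the sparse target. */
enum sparse_outcome {
	SPARSE_REDIRECT,           /* chunk already remapped: rewrite b_rsector/b_rdev, return 1 */
	SPARSE_COW_THEN_REDIRECT,  /* unmapped write: allocate a chunk via COW, then redirect */
	SPARSE_ZERO_FILL           /* unmapped read: memset zeros, call b_end_io, return 0 */
};

static enum sparse_outcome classify_io(int already_mapped, int is_write)
{
	if (already_mapped)
		return SPARSE_REDIRECT;
	if (is_write)
		return SPARSE_COW_THEN_REDIRECT;
	return SPARSE_ZERO_FILL;
}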
2241+static int sparse_status( struct dm_target *ti, status_type_t type,
2242+ char *result, unsigned int maxlen )
2243+{
2244+ struct sparse_volume * vol = (struct sparse_volume * )ti->private;
2245+
2246+ switch(type) {
2247+
2248+ case STATUSTYPE_INFO:
2249+ snprintf( result, maxlen, "%d%%",
2250+ ( vol->next_free_chunk * 100 ) / vol->num_chunks );
2251+ break;
2252+
2253+ case STATUSTYPE_TABLE:
2254+ snprintf( result, maxlen, "%s %Lu %u %u",
2255+ dm_kdevname(vol->dev->dev), vol->start,
2256+ vol->chunk_size, vol->num_chunks );
2257+ break;
2258+
2259+ default:
2260+ break;
2261+ }
2262+
2263+ return 0;
2264+}
2265+
2266+/****************** FUNCTION TABLE **********************/
2267+
2268+static struct target_type sparse_target = {
2269+ .name = "sparse",
2270+ .module = THIS_MODULE,
2271+ .ctr = sparse_ctr,
2272+ .dtr = sparse_dtr,
2273+ .map = sparse_map,
2274+ .status = sparse_status,
2275+};
2276+
2277+/********************* REGISTRATION *****************/
2278+
2279+int __init sparse_init(void)
2280+{
2281+ int rc = dm_register_target(&sparse_target);
2282+
2283+ if ( rc < 0 )
2284+ DMWARN("sparse target registration failed");
2285+
2286+ return rc;
2287+}
2288+
2289+void __exit sparse_exit(void)
2290+{
2291+ if (dm_unregister_target(&sparse_target) )
2292+ DMWARN("sparse target unregistration failed");
2293+
2294+ return;
2295+}
2296+
2297+module_init(sparse_init);
2298+module_exit(sparse_exit);
2299+MODULE_LICENSE("GPL");
2300diff -urN linux-2.4.24.org/drivers/md/lvm.c linux-2.4.24/drivers/md/lvm.c
2301--- linux-2.4.24.org/drivers/md/lvm.c 2004-01-18 14:58:09.106704262 +0100
2302+++ linux-2.4.24/drivers/md/lvm.c 2004-01-18 15:57:55.568033496 +0100
2303@@ -236,9 +236,6 @@
2304 #define DEVICE_OFF(device)
2305 #define LOCAL_END_REQUEST
2306
2307-/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
2308-/* #define LVM_VFS_ENHANCEMENT */
2309-
2310 #include <linux/config.h>
2311 #include <linux/module.h>
2312 #include <linux/kernel.h>
2313@@ -2250,12 +2247,8 @@
2314 if (lv_ptr->lv_access & LV_SNAPSHOT) {
2315 lv_t *org = lv_ptr->lv_snapshot_org, *last;
2316
2317- /* sync the original logical volume */
2318- fsync_dev(org->lv_dev);
2319-#ifdef LVM_VFS_ENHANCEMENT
2320 /* VFS function call to sync and lock the filesystem */
2321 fsync_dev_lockfs(org->lv_dev);
2322-#endif
2323
2324 down_write(&org->lv_lock);
2325 org->lv_access |= LV_SNAPSHOT_ORG;
2326@@ -2281,11 +2274,9 @@
2327 else
2328 set_device_ro(lv_ptr->lv_dev, 1);
2329
2330-#ifdef LVM_VFS_ENHANCEMENT
2331 /* VFS function call to unlock the filesystem */
2332 if (lv_ptr->lv_access & LV_SNAPSHOT)
2333 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
2334-#endif
2335
2336 lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
2337 lvm_fs_create_lv(vg_ptr, lv_ptr);
2338diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
2339--- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 15:09:18.620153502 +0100
2340+++ linux-2.4.24/drivers/md/Makefile 2004-01-18 16:04:48.278616388 +0100
2341@@ -28,6 +28,8 @@
2342 obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
2343
2344 obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
2345+obj-$(CONFIG_BLK_DEV_DM_BBR) += dm-bbr.o
2346+obj-$(CONFIG_BLK_DEV_DM_SPARSE) += dm-sparse.o
2347
2348 include $(TOPDIR)/Rules.make
2349
2350diff -urN linux-2.4.24.org/drivers/md/md.c linux-2.4.24/drivers/md/md.c
2351--- linux-2.4.24.org/drivers/md/md.c 2004-01-18 14:58:09.227678566 +0100
2352+++ linux-2.4.24/drivers/md/md.c 2004-01-18 16:04:27.702900923 +0100
2353@@ -2146,6 +2146,8 @@
2354
2355 SET_FROM_SB(utime);
2356 SET_FROM_SB(state);
2357+ if (mddev->curr_resync)
2358+ info.state |= (1 << MD_ARRAY_RECOVERY_RUNNING);
2359 SET_FROM_SB(active_disks);
2360 SET_FROM_SB(working_disks);
2361 SET_FROM_SB(failed_disks);
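A userspace sketch (not part of the patch) of how the new state bit could be consumed, assuming the patched md_u.h (which adds MD_ARRAY_RECOVERY_RUNNING) is visible to the compiler; GET_ARRAY_INFO and mdu_array_info_t are the existing md ioctl interface.

#include <sys/ioctl.h>
#include <linux/raid/md_u.h>   /* mdu_array_info_t, GET_ARRAY_INFO, MD_ARRAY_* */

/* Returns 1 if recovery/resync is running, 0 if not, -1 on ioctl error. */
static int array_recovery_running(int md_fd)
{
	mdu_array_info_t info;

	if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
		return -1;
	return !!(info.state & (1 << MD_ARRAY_RECOVERY_RUNNING));
}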
2362diff -urN linux-2.4.24.org/drivers/md/multipath.c linux-2.4.24/drivers/md/multipath.c
2363--- linux-2.4.24.org/drivers/md/multipath.c 2004-01-18 14:58:09.254672832 +0100
2364+++ linux-2.4.24/drivers/md/multipath.c 2004-01-18 16:04:38.291691263 +0100
2365@@ -139,15 +139,16 @@
2366 static int multipath_map (mddev_t *mddev, kdev_t *rdev)
2367 {
2368 multipath_conf_t *conf = mddev_to_conf(mddev);
2369- int i, disks = MD_SB_DISKS;
2370+ int i;
2371
2372 /*
2373 * Later we do read balancing on the read side
2374 * now we use the first available disk.
2375 */
2376
2377- for (i = 0; i < disks; i++) {
2378+ for (i = 0; i < conf->nr_disks; i++) {
2379 if (conf->multipaths[i].operational) {
2380+ /* first operational is winner! */
2381 *rdev = conf->multipaths[i].dev;
2382 return (0);
2383 }
2384@@ -191,6 +192,8 @@
2385 {
2386 struct multipath_bh * mp_bh = (struct multipath_bh *)(bh->b_private);
2387
2388+ atomic_dec(&mp_bh->multipath->nr_pending);
2389+
2390 /*
2391 * this branch is our 'one multipath IO has finished' event handler:
2392 */
2393@@ -223,19 +226,39 @@
2394 }
2395
2396 /*
2397- * This routine returns the disk from which the requested read should
2398- * be done.
2399+ * Multipath read balance ...
2400+ *
2401+ * Returns:
2402+ *
2403+ * If no active paths
2404+ *
2405+ * - Error ( -1 )
2406+ *
2407+ * If active paths == 1
2408+ *
2409+ * - 1st active path encountered
2410+ *
2411+ * If active paths > 1
2412+ *
2413+ * - 1st idle active path encountered
2414+ * - else ... the active path doing the least amount of work.
2415 */
2416-
2417 static int multipath_read_balance (multipath_conf_t *conf)
2418 {
2419- int disk;
2420-
2421- for (disk = 0; disk < conf->raid_disks; disk++)
2422- if (conf->multipaths[disk].operational)
2423- return disk;
2424- BUG();
2425- return 0;
2426+ int i, disk=-1, nr_pending, least_pending=0;
2427+
2428+ for (i=0; i<conf->nr_disks; i++) {
2429+ if (conf->multipaths[i].operational) {
2430+ nr_pending = atomic_read(&conf->multipaths[i].nr_pending);
2431+ if (nr_pending==0 || conf->working_disks==1)
2432+ return i;
2433+ if (least_pending==0 || nr_pending<least_pending) {
2434+ disk = i;
2435+ least_pending = nr_pending;
2436+ }
2437+ }
2438+ }
2439+ return disk;
2440 }
2441
2442 static int multipath_make_request (mddev_t *mddev, int rw,
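The rewritten balance policy above can be read as the following standalone sketch (not part of the patch): prefer the first idle operational path (or any operational path when only one disk is working), otherwise take the operational path with the fewest pending requests, and report -1 when no path is operational. The array parameters are illustrative stand-ins for the conf->multipaths[] fields.

/* operational[i] and pending[i] stand in for conf->multipaths[i].operational
 * and atomic_read(&conf->multipaths[i].nr_pending). */
static int pick_read_path(const int operational[], const int pending[],
			  int nr_disks, int working_disks)
{
	int i, disk = -1, least_pending = 0;

	for (i = 0; i < nr_disks; i++) {
		if (!operational[i])
			continue;
		if (pending[i] == 0 || working_disks == 1)
			return i;                    /* idle path (or only choice) wins */
		if (least_pending == 0 || pending[i] < least_pending) {
			disk = i;                    /* remember least-loaded path so far */
			least_pending = pending[i];
		}
	}
	return disk;                                     /* -1 if nothing operational */
}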
2443@@ -245,6 +268,7 @@
2444 struct buffer_head *bh_req;
2445 struct multipath_bh * mp_bh;
2446 struct multipath_info *multipath;
2447+ int disk;
2448
2449 if (!buffer_locked(bh))
2450 BUG();
2451@@ -267,7 +291,16 @@
2452 /*
2453 * read balancing logic:
2454 */
2455- multipath = conf->multipaths + multipath_read_balance(conf);
2456+ disk = multipath_read_balance(conf);
2457+ if (disk==-1) {
2458+ printk (KERN_ERR "multipath_make_request: no more operational IO paths.\n");
2459+ buffer_IO_error(bh);
2460+ return 0;
2461+ }
2462+
2463+ multipath = conf->multipaths + disk;
2464+ mp_bh->multipath = multipath;
2465+ atomic_inc(&multipath->nr_pending);
2466
2467 bh_req = &mp_bh->bh_req;
2468 memcpy(bh_req, bh, sizeof(*bh));
2469@@ -331,13 +364,14 @@
2470 {
2471 multipath_conf_t *conf = mddev_to_conf(mddev);
2472 struct multipath_info * multipaths = conf->multipaths;
2473- int disks = MD_SB_DISKS;
2474 int other_paths = 1;
2475- int i;
2476+ int i, first = 1;
2477+ mdk_rdev_t *rdev;
2478+ struct md_list_head *tmp;
2479
2480 if (conf->working_disks == 1) {
2481 other_paths = 0;
2482- for (i = 0; i < disks; i++) {
2483+ for (i = 0; i < MD_SB_DISKS; i++) {
2484 if (multipaths[i].spare) {
2485 other_paths = 1;
2486 break;
2487@@ -351,16 +385,17 @@
2488 * first check if this is a queued request for a device
2489 * which has just failed.
2490 */
2491- for (i = 0; i < disks; i++) {
2492+ for (i = 0; i < MD_SB_DISKS; i++) {
2493 if (multipaths[i].dev==dev && !multipaths[i].operational)
2494 return 0;
2495 }
2496 printk (LAST_DISK);
2497 } else {
2498+ mdp_super_t *sb = mddev->sb;
2499 /*
2500 * Mark disk as unusable
2501 */
2502- for (i = 0; i < disks; i++) {
2503+ for (i = 0; i < MD_SB_DISKS; i++) {
2504 if (multipaths[i].dev==dev && multipaths[i].operational) {
2505 mark_disk_bad(mddev, i);
2506 break;
2507@@ -369,7 +404,6 @@
2508 if (!conf->working_disks) {
2509 int err = 1;
2510 mdp_disk_t *spare;
2511- mdp_super_t *sb = mddev->sb;
2512
2513 spare = get_spare(mddev);
2514 if (spare) {
2515@@ -384,6 +418,21 @@
2516 sb->spare_disks--;
2517 }
2518 }
2519+ /* prevent unnecessary work in md_do_recovery() */
2520+ if (conf->working_disks) {
2521+ conf->raid_disks = conf->working_disks
2522+ = sb->raid_disks = sb->active_disks;
2523+ }
2524+	/* update alias disk info to ensure we can do sb commit. */
2525+ ITERATE_RDEV(mddev,rdev,tmp) {
2526+ if (first && disk_active(&sb->disks[rdev->desc_nr])) {
2527+ rdev->alias_device = 0;
2528+ first = 0;
2529+ } else {
2530+ if (!disk_faulty(&sb->disks[rdev->desc_nr]))
2531+ rdev->alias_device = 1;
2532+ }
2533+ }
2534 }
2535 return 0;
2536 }
2537@@ -677,9 +726,8 @@
2538 /*
2539 * This is a kernel thread which:
2540 *
2541- * 1. Retries failed read operations on working multipaths.
2542+ * 1. Retries failed operations on working multipaths.
2543 * 2. Updates the raid superblock when problems encounter.
2544- * 3. Performs writes following reads for array syncronising.
2545 */
2546
2547 static void multipathd (void *data)
2548@@ -833,6 +881,7 @@
2549 mdk_rdev_t *rdev, *def_rdev = NULL;
2550 struct md_list_head *tmp;
2551 int num_rdevs = 0;
2552+ int active_disks = 0, spare_disks = 0, faulty_disks = 0;
2553
2554 MOD_INC_USE_COUNT;
2555
2556@@ -881,9 +930,7 @@
2557 printk(NOT_IN_SYNC, partition_name(rdev->dev));
2558
2559 /*
2560- * Mark all disks as spare to start with, then pick our
2561- * active disk. If we have a disk that is marked active
2562- * in the sb, then use it, else use the first rdev.
2563+ * Mark all disks as spare to start with.
2564 */
2565 disk->number = desc->number;
2566 disk->raid_disk = desc->raid_disk;
2567@@ -894,20 +941,21 @@
2568 mark_disk_sync(desc);
2569
2570 if (disk_active(desc)) {
2571- if(!conf->working_disks) {
2572- printk(OPERATIONAL, partition_name(rdev->dev),
2573- desc->raid_disk);
2574- disk->operational = 1;
2575- disk->spare = 0;
2576- conf->working_disks++;
2577- def_rdev = rdev;
2578- } else {
2579- mark_disk_spare(desc);
2580- }
2581- } else
2582- mark_disk_spare(desc);
2583+ printk(OPERATIONAL, partition_name(rdev->dev),
2584+ desc->raid_disk);
2585+ disk->operational = 1;
2586+ disk->spare = 0;
2587+ conf->working_disks++;
2588+ def_rdev = rdev;
2589+ active_disks++;
2590+ } else if (disk_faulty(desc)) {
2591+ disk->spare = 0;
2592+ faulty_disks++;
2593+ } else {
2594+ spare_disks++;
2595+ }
2596
2597- if(!num_rdevs++) def_rdev = rdev;
2598+ num_rdevs++;
2599 }
2600 if(!conf->working_disks && num_rdevs) {
2601 desc = &sb->disks[def_rdev->desc_nr];
2602@@ -918,11 +966,12 @@
2603 disk->spare = 0;
2604 conf->working_disks++;
2605 mark_disk_active(desc);
2606+ active_disks++;
2607 }
2608 /*
2609- * Make sure our active path is in desc spot 0
2610+ * If there is only 1 active path ... make sure it is in desc spot 0
2611 */
2612- if(def_rdev->desc_nr != 0) {
2613+ if (active_disks == 1 && def_rdev->desc_nr != 0) {
2614 rdev = find_rdev_nr(mddev, 0);
2615 desc = &sb->disks[def_rdev->desc_nr];
2616 desc2 = sb->disks;
2617@@ -940,10 +989,10 @@
2618 def_rdev->desc_nr = 0;
2619 }
2620 }
2621- conf->raid_disks = sb->raid_disks = sb->active_disks = 1;
2622+ conf->raid_disks = sb->raid_disks = sb->active_disks = active_disks;
2623 conf->nr_disks = sb->nr_disks = sb->working_disks = num_rdevs;
2624- sb->failed_disks = 0;
2625- sb->spare_disks = num_rdevs - 1;
2626+ sb->failed_disks = faulty_disks;
2627+ sb->spare_disks = spare_disks;
2628 mddev->sb_dirty = 1;
2629 conf->mddev = mddev;
2630 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
2631diff -urN linux-2.4.24.org/fs/buffer.c linux-2.4.24/fs/buffer.c
2632--- linux-2.4.24.org/fs/buffer.c 2004-01-18 14:55:22.305275818 +0100
2633+++ linux-2.4.24/fs/buffer.c 2004-01-18 15:57:55.602026171 +0100
2634@@ -419,6 +419,34 @@
2635 fsync_dev(dev);
2636 }
2637
2638+int fsync_dev_lockfs(kdev_t dev)
2639+{
2640+ /* you are not allowed to try locking all the filesystems
2641+ ** on the system; your chances of getting through without
2642+ ** total deadlock are slim to none.
2643+ */
2644+ if (!dev)
2645+ return fsync_dev(dev) ;
2646+
2647+ sync_buffers(dev, 0);
2648+
2649+ lock_kernel();
2650+ /* note, the FS might need to start transactions to
2651+ ** sync the inodes or the quota; no locking until
2652+ ** after these are done
2653+ */
2654+ sync_inodes(dev);
2655+ DQUOT_SYNC_DEV(dev);
2656+ /* if inodes or quotas could be dirtied during the
2657+ ** sync_supers_lockfs call, the FS is responsible for getting
2658+ ** them on disk, without deadlocking against the lock
2659+ */
2660+ sync_supers_lockfs(dev) ;
2661+ unlock_kernel();
2662+
2663+ return sync_buffers(dev, 1) ;
2664+}
2665+
2666 asmlinkage long sys_sync(void)
2667 {
2668 fsync_dev(0);
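A kernel-side sketch (not part of the patch) of how a caller such as the lvm.c snapshot path earlier in this patch is expected to pair the new exports: freeze with fsync_dev_lockfs(), do the work, then always thaw with unlockfs(). do_snapshot_setup() is a hypothetical placeholder for the caller's own critical section.

#include <linux/fs.h>   /* fsync_dev_lockfs(), unlockfs(), kdev_t (with this patch applied) */

static int do_snapshot_setup(kdev_t dev);   /* hypothetical caller-specific work */

static int snapshot_with_frozen_fs(kdev_t dev)
{
	int rc;

	fsync_dev_lockfs(dev);        /* flush dirty data and lock the filesystem */
	rc = do_snapshot_setup(dev);  /* work done while the fs is quiesced */
	unlockfs(dev);                /* thaw; must always follow the lock call */
	return rc;
}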
2669diff -urN linux-2.4.24.org/fs/reiserfs/super.c linux-2.4.24/fs/reiserfs/super.c
2670--- linux-2.4.24.org/fs/reiserfs/super.c 2004-01-18 14:55:18.875002271 +0100
2671+++ linux-2.4.24/fs/reiserfs/super.c 2004-01-18 15:57:55.657014322 +0100
2672@@ -84,7 +84,7 @@
2673 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
2674 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
2675 reiserfs_block_writes(&th) ;
2676- journal_end(&th, s, 1) ;
2677+ journal_end_sync(&th, s, 1) ;
2678 }
2679 s->s_dirt = 0;
2680 unlock_kernel() ;
2681diff -urN linux-2.4.24.org/fs/super.c linux-2.4.24/fs/super.c
2682--- linux-2.4.24.org/fs/super.c 2004-01-18 14:55:11.177633010 +0100
2683+++ linux-2.4.24/fs/super.c 2004-01-18 15:57:55.687007859 +0100
2684@@ -38,6 +38,13 @@
2685 LIST_HEAD(super_blocks);
2686 spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
2687
2688+/*
2689+ * lock/unlockfs grab a read lock on s_umount, but you need this lock to
2690+ * make sure no lockfs runs are in progress before inserting/removing
2691+ * supers from the list.
2692+ */
2693+static DECLARE_MUTEX(lockfs_sem);
2694+
2695 /*
2696 * Handling of filesystem drivers list.
2697 * Rules:
2698@@ -436,6 +443,19 @@
2699 put_super(sb);
2700 }
2701
2702+static void write_super_lockfs(struct super_block *sb)
2703+{
2704+ lock_super(sb);
2705+ if (sb->s_root && sb->s_op) {
2706+ if (sb->s_dirt && sb->s_op->write_super)
2707+ sb->s_op->write_super(sb);
2708+ if (sb->s_op->write_super_lockfs) {
2709+ sb->s_op->write_super_lockfs(sb);
2710+ }
2711+ }
2712+ unlock_super(sb);
2713+}
2714+
2715 static inline void write_super(struct super_block *sb)
2716 {
2717 lock_super(sb);
2718@@ -483,6 +503,39 @@
2719 spin_unlock(&sb_lock);
2720 }
2721
2722+/*
2723+ * Note: don't check the dirty flag before waiting; we want the lock
2724+ * to happen every time this is called. dev must be non-zero.
2725+ */
2726+void sync_supers_lockfs(kdev_t dev)
2727+{
2728+ struct super_block * sb;
2729+
2730+ down(&lockfs_sem) ;
2731+ if (dev) {
2732+ sb = get_super(dev);
2733+ if (sb) {
2734+ write_super_lockfs(sb);
2735+ drop_super(sb);
2736+ }
2737+ }
2738+}
2739+
2740+void unlockfs(kdev_t dev)
2741+{
2742+ struct super_block * sb;
2743+
2744+ if (dev) {
2745+ sb = get_super(dev);
2746+ if (sb) {
2747+ if (sb->s_op && sb->s_op->unlockfs)
2748+ sb->s_op->unlockfs(sb) ;
2749+ drop_super(sb);
2750+ }
2751+ }
2752+ up(&lockfs_sem) ;
2753+}
2754+
2755 /**
2756 * get_super - get the superblock of a device
2757 * @dev: device to get the superblock for
2758@@ -702,6 +755,7 @@
2759 goto out1;
2760
2761 error = -EBUSY;
2762+ down(&lockfs_sem);
2763 restart:
2764 spin_lock(&sb_lock);
2765
2766@@ -713,6 +767,7 @@
2767 ((flags ^ old->s_flags) & MS_RDONLY)) {
2768 spin_unlock(&sb_lock);
2769 destroy_super(s);
2770+ up(&lockfs_sem);
2771 goto out1;
2772 }
2773 if (!grab_super(old))
2774@@ -720,12 +775,14 @@
2775 destroy_super(s);
2776 blkdev_put(bdev, BDEV_FS);
2777 path_release(&nd);
2778+ up(&lockfs_sem);
2779 return old;
2780 }
2781 s->s_dev = dev;
2782 s->s_bdev = bdev;
2783 s->s_flags = flags;
2784 insert_super(s, fs_type);
2785+ up(&lockfs_sem);
2786 if (!fs_type->read_super(s, data, flags & MS_VERBOSE ? 1 : 0))
2787 goto Einval;
2788 s->s_flags |= MS_ACTIVE;
2789@@ -833,7 +890,10 @@
2790 if (!deactivate_super(sb))
2791 return;
2792
2793+ down(&lockfs_sem);
2794 down_write(&sb->s_umount);
2795+ up(&lockfs_sem);
2796+
2797 sb->s_root = NULL;
2798 /* Need to clean after the sucker */
2799 if (fs->fs_flags & FS_LITTER)
2800diff -urN linux-2.4.24.org/include/linux/fs.h linux-2.4.24/include/linux/fs.h
2801--- linux-2.4.24.org/include/linux/fs.h 2004-01-18 14:55:29.014855364 +0100
2802+++ linux-2.4.24/include/linux/fs.h 2004-01-18 15:59:11.694692181 +0100
2803@@ -1287,6 +1287,7 @@
2804 extern int sync_buffers(kdev_t, int);
2805 extern void sync_dev(kdev_t);
2806 extern int fsync_dev(kdev_t);
2807+extern int fsync_dev_lockfs(kdev_t);
2808 extern int fsync_super(struct super_block *);
2809 extern int fsync_no_super(kdev_t);
2810 extern void sync_inodes_sb(struct super_block *);
2811@@ -1305,6 +1306,8 @@
2812 extern int filemap_fdatasync(struct address_space *);
2813 extern int filemap_fdatawait(struct address_space *);
2814 extern void sync_supers(kdev_t dev, int wait);
2815+extern void sync_supers_lockfs(kdev_t);
2816+extern void unlockfs(kdev_t);
2817 extern int bmap(struct inode *, int);
2818 extern int notify_change(struct dentry *, struct iattr *);
2819 extern int permission(struct inode *, int);
2820diff -urN linux-2.4.24.org/include/linux/raid/md_u.h linux-2.4.24/include/linux/raid/md_u.h
2821--- linux-2.4.24.org/include/linux/raid/md_u.h 2004-01-18 14:55:35.554471508 +0100
2822+++ linux-2.4.24/include/linux/raid/md_u.h 2004-01-18 16:04:27.764887949 +0100
2823@@ -50,6 +50,10 @@
2824 int patchlevel;
2825 } mdu_version_t;
2826
2827+#define MD_ARRAY_CLEAN 0
2828+#define MD_ARRAY_ERRORS 1
2829+#define MD_ARRAY_RECOVERY_RUNNING 2
2830+
2831 typedef struct mdu_array_info_s {
2832 /*
2833 * Generic constant information
2834diff -urN linux-2.4.24.org/include/linux/raid/multipath.h linux-2.4.24/include/linux/raid/multipath.h
2835--- linux-2.4.24.org/include/linux/raid/multipath.h 2004-01-18 14:55:35.563469605 +0100
2836+++ linux-2.4.24/include/linux/raid/multipath.h 2004-01-18 16:04:38.329683369 +0100
2837@@ -15,6 +15,7 @@
2838 int spare;
2839
2840 int used_slot;
2841+ atomic_t nr_pending; /* number of pending requests */
2842 };
2843
2844 struct multipath_private_data {
2845@@ -63,6 +64,7 @@
2846 struct buffer_head *master_bh;
2847 struct buffer_head bh_req;
2848 struct multipath_bh *next_mp; /* next for retry or in free list */
2849+ struct multipath_info *multipath; /* allows end_request to easilly dec pending buffer count*/
2850 };
2851 /* bits for multipath_bh.state */
2852 #define MPBH_Uptodate 1
2853diff -urN linux-2.4.24.org/kernel/ksyms.c linux-2.4.24/kernel/ksyms.c
2854--- linux-2.4.24.org/kernel/ksyms.c 2004-01-18 14:55:22.698192617 +0100
2855+++ linux-2.4.24/kernel/ksyms.c 2004-01-18 15:57:55.824978130 +0100
2856@@ -200,6 +200,8 @@
2857 EXPORT_SYMBOL(invalidate_inode_pages);
2858 EXPORT_SYMBOL(truncate_inode_pages);
2859 EXPORT_SYMBOL(fsync_dev);
2860+EXPORT_SYMBOL(fsync_dev_lockfs);
2861+EXPORT_SYMBOL(unlockfs);
2862 EXPORT_SYMBOL(fsync_no_super);
2863 EXPORT_SYMBOL(permission);
2864 EXPORT_SYMBOL(vfs_permission);