packages/kernel.git: linux-2.4.25-dm-3.patch
1diff -urN linux-2.4.24.org/arch/mips64/kernel/ioctl32.c linux-2.4.24/arch/mips64/kernel/ioctl32.c
2--- linux-2.4.24.org/arch/mips64/kernel/ioctl32.c 2004-01-18 14:59:17.636181134 +0100
3+++ linux-2.4.24/arch/mips64/kernel/ioctl32.c 2004-01-18 15:01:17.736881093 +0100
4@@ -62,6 +62,7 @@
5
6 #include <linux/mtd/mtd.h>
7 #include <linux/serial.h>
8+#include <linux/dm-ioctl.h>
9
10 #ifdef CONFIG_SIBYTE_TBPROF
11 #include <asm/sibyte/trace_prof.h>
12@@ -2324,6 +2325,22 @@
13 IOCTL32_DEFAULT(RESTART_ARRAY_RW),
14 #endif /* CONFIG_MD */
15
16+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
17+ IOCTL32_DEFAULT(DM_VERSION),
18+ IOCTL32_DEFAULT(DM_REMOVE_ALL),
19+ IOCTL32_DEFAULT(DM_DEV_CREATE),
20+ IOCTL32_DEFAULT(DM_DEV_REMOVE),
21+ IOCTL32_DEFAULT(DM_TABLE_LOAD),
22+ IOCTL32_DEFAULT(DM_DEV_SUSPEND),
23+ IOCTL32_DEFAULT(DM_DEV_RENAME),
24+ IOCTL32_DEFAULT(DM_TABLE_DEPS),
25+ IOCTL32_DEFAULT(DM_DEV_STATUS),
26+ IOCTL32_DEFAULT(DM_TABLE_STATUS),
27+ IOCTL32_DEFAULT(DM_DEV_WAIT),
28+ IOCTL32_DEFAULT(DM_LIST_DEVICES),
29+ IOCTL32_DEFAULT(DM_TABLE_CLEAR),
30+#endif /* CONFIG_BLK_DEV_DM */
31+
32 #ifdef CONFIG_SIBYTE_TBPROF
33 IOCTL32_DEFAULT(SBPROF_ZBSTART),
34 IOCTL32_DEFAULT(SBPROF_ZBSTOP),
35diff -urN linux-2.4.24.org/arch/parisc/kernel/ioctl32.c linux-2.4.24/arch/parisc/kernel/ioctl32.c
36--- linux-2.4.24.org/arch/parisc/kernel/ioctl32.c 2004-01-18 14:59:20.929484849 +0100
37+++ linux-2.4.24/arch/parisc/kernel/ioctl32.c 2004-01-18 15:01:17.742879834 +0100
38@@ -55,6 +55,7 @@
39 #define max max */
40 #include <linux/lvm.h>
41 #endif /* LVM */
42+#include <linux/dm-ioctl.h>
43
44 #include <scsi/scsi.h>
45 /* Ugly hack. */
46@@ -3423,6 +3424,22 @@
47 COMPATIBLE_IOCTL(LV_BMAP)
48 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
49 #endif /* LVM */
50+/* Device-Mapper */
51+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
52+COMPATIBLE_IOCTL(DM_VERSION)
53+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
54+COMPATIBLE_IOCTL(DM_DEV_CREATE)
55+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
56+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
57+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
58+COMPATIBLE_IOCTL(DM_DEV_RENAME)
59+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
60+COMPATIBLE_IOCTL(DM_DEV_STATUS)
61+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
62+COMPATIBLE_IOCTL(DM_DEV_WAIT)
63+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
64+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
65+#endif /* CONFIG_BLK_DEV_DM */
66 #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
67 COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
68 COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
69diff -urN linux-2.4.24.org/arch/ppc64/kernel/ioctl32.c linux-2.4.24/arch/ppc64/kernel/ioctl32.c
70--- linux-2.4.24.org/arch/ppc64/kernel/ioctl32.c 2004-01-18 14:58:17.568907286 +0100
71+++ linux-2.4.24/arch/ppc64/kernel/ioctl32.c 2004-01-18 15:01:17.754877316 +0100
72@@ -66,6 +66,7 @@
73 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
74 #include <linux/lvm.h>
75 #endif /* LVM */
76+#include <linux/dm-ioctl.h>
77
78 #include <scsi/scsi.h>
79 /* Ugly hack. */
80@@ -4408,6 +4409,22 @@
81 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
82 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
83 COMPATIBLE_IOCTL(NBD_DISCONNECT),
84+/* device-mapper */
85+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
86+COMPATIBLE_IOCTL(DM_VERSION),
87+COMPATIBLE_IOCTL(DM_REMOVE_ALL),
88+COMPATIBLE_IOCTL(DM_DEV_CREATE),
89+COMPATIBLE_IOCTL(DM_DEV_REMOVE),
90+COMPATIBLE_IOCTL(DM_TABLE_LOAD),
91+COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
92+COMPATIBLE_IOCTL(DM_DEV_RENAME),
93+COMPATIBLE_IOCTL(DM_TABLE_DEPS),
94+COMPATIBLE_IOCTL(DM_DEV_STATUS),
95+COMPATIBLE_IOCTL(DM_TABLE_STATUS),
96+COMPATIBLE_IOCTL(DM_DEV_WAIT),
97+COMPATIBLE_IOCTL(DM_LIST_DEVICES),
98+COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
99+#endif /* CONFIG_BLK_DEV_DM */
100 /* Remove *PRIVATE in 2.5 */
101 COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
102 COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
103diff -urN linux-2.4.24.org/arch/s390x/kernel/ioctl32.c linux-2.4.24/arch/s390x/kernel/ioctl32.c
104--- linux-2.4.24.org/arch/s390x/kernel/ioctl32.c 2004-01-18 14:59:24.825661296 +0100
105+++ linux-2.4.24/arch/s390x/kernel/ioctl32.c 2004-01-18 15:01:17.759876266 +0100
106@@ -30,6 +30,7 @@
107 #include <linux/blk.h>
108 #include <linux/elevator.h>
109 #include <linux/raw.h>
110+#include <linux/dm-ioctl.h>
111 #include <asm/types.h>
112 #include <asm/uaccess.h>
113 #include <asm/dasd.h>
114@@ -627,6 +628,20 @@
115
116 IOCTL32_DEFAULT(SIOCGSTAMP),
117
118+ IOCTL32_DEFAULT(DM_VERSION),
119+ IOCTL32_DEFAULT(DM_REMOVE_ALL),
120+ IOCTL32_DEFAULT(DM_DEV_CREATE),
121+ IOCTL32_DEFAULT(DM_DEV_REMOVE),
122+ IOCTL32_DEFAULT(DM_TABLE_LOAD),
123+ IOCTL32_DEFAULT(DM_DEV_SUSPEND),
124+ IOCTL32_DEFAULT(DM_DEV_RENAME),
125+ IOCTL32_DEFAULT(DM_TABLE_DEPS),
126+ IOCTL32_DEFAULT(DM_DEV_STATUS),
127+ IOCTL32_DEFAULT(DM_TABLE_STATUS),
128+ IOCTL32_DEFAULT(DM_DEV_WAIT),
129+ IOCTL32_DEFAULT(DM_LIST_DEVICES),
130+ IOCTL32_DEFAULT(DM_TABLE_CLEAR),
131+
132 IOCTL32_DEFAULT(LOOP_SET_FD),
133 IOCTL32_DEFAULT(LOOP_CLR_FD),
134
135diff -urN linux-2.4.24.org/arch/sparc64/kernel/ioctl32.c linux-2.4.24/arch/sparc64/kernel/ioctl32.c
136--- linux-2.4.24.org/arch/sparc64/kernel/ioctl32.c 2004-01-18 14:58:59.210079599 +0100
137+++ linux-2.4.24/arch/sparc64/kernel/ioctl32.c 2004-01-18 15:01:17.768874378 +0100
138@@ -56,6 +56,7 @@
139 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
140 #include <linux/lvm.h>
141 #endif /* LVM */
142+#include <linux/dm-ioctl.h>
143
144 #include <scsi/scsi.h>
145 /* Ugly hack. */
146@@ -5086,6 +5087,22 @@
147 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
148 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
149 COMPATIBLE_IOCTL(NBD_DISCONNECT)
150+/* device-mapper */
151+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
152+COMPATIBLE_IOCTL(DM_VERSION)
153+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
154+COMPATIBLE_IOCTL(DM_DEV_CREATE)
155+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
156+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
157+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
158+COMPATIBLE_IOCTL(DM_DEV_RENAME)
159+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
160+COMPATIBLE_IOCTL(DM_DEV_STATUS)
161+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
162+COMPATIBLE_IOCTL(DM_DEV_WAIT)
163+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
164+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
165+#endif /* CONFIG_BLK_DEV_DM */
166 /* Linux-1394 */
167 #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
168 COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
169diff -urN linux-2.4.24.org/arch/x86_64/ia32/ia32_ioctl.c linux-2.4.24/arch/x86_64/ia32/ia32_ioctl.c
170--- linux-2.4.24.org/arch/x86_64/ia32/ia32_ioctl.c 2004-01-18 14:58:15.119427333 +0100
171+++ linux-2.4.24/arch/x86_64/ia32/ia32_ioctl.c 2004-01-18 15:01:17.778872279 +0100
172@@ -67,6 +67,7 @@
173 #define max max
174 #include <linux/lvm.h>
175 #endif /* LVM */
176+#include <linux/dm-ioctl.h>
177
178 #include <scsi/scsi.h>
179 /* Ugly hack. */
180@@ -4051,6 +4052,22 @@
181 COMPATIBLE_IOCTL(LV_BMAP)
182 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
183 #endif /* LVM */
184+/* Device-Mapper */
185+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
186+COMPATIBLE_IOCTL(DM_VERSION)
187+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
188+COMPATIBLE_IOCTL(DM_DEV_CREATE)
189+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
190+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
191+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
192+COMPATIBLE_IOCTL(DM_DEV_RENAME)
193+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
194+COMPATIBLE_IOCTL(DM_DEV_STATUS)
195+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
196+COMPATIBLE_IOCTL(DM_DEV_WAIT)
197+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
198+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
199+#endif /* CONFIG_BLK_DEV_DM */
200 #ifdef CONFIG_AUTOFS_FS
201 COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
202 COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
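The compat tables above simply pass the device-mapper ioctls through unchanged, so a 32-bit control tool keeps working on a 64-bit kernel. As a rough illustration only (not part of the patch, and assuming the userspace view of <linux/dm-ioctl.h> plus a /dev/mapper/control node created by the administrator or devfs), querying the driver's interface version looks roughly like this:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

/* Ask the running driver for its ioctl interface version. */
int dm_query_version(void)
{
	struct dm_ioctl dmi;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0)
		return -1;

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* tell the kernel which */
	dmi.version[1] = DM_VERSION_MINOR;	/* interface we speak    */
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);

	if (ioctl(fd, DM_VERSION, &dmi) < 0) {
		close(fd);
		return -1;
	}

	/* dmi.version[] now holds the kernel's interface version. */
	close(fd);
	return 0;
}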
203diff -urN linux-2.4.24.org/Documentation/Configure.help linux-2.4.24/Documentation/Configure.help
204--- linux-2.4.24.org/Documentation/Configure.help 2004-01-18 14:59:47.177940541 +0100
205+++ linux-2.4.24/Documentation/Configure.help 2004-01-18 15:01:13.758716197 +0100
206@@ -1952,6 +1952,20 @@
207 want), say M here and read <file:Documentation/modules.txt>. The
208 module will be called lvm-mod.o.
209
210+Device-mapper support
211+CONFIG_BLK_DEV_DM
212+ Device-mapper is a low level volume manager. It works by allowing
213+ people to specify mappings for ranges of logical sectors. Various
214+ mapping types are available, in addition people may write their own
215+ modules containing custom mappings if they wish.
216+
217+ Higher level volume managers such as LVM2 use this driver.
218+
219+ If you want to compile this as a module, say M here and read
220+ <file:Documentation/modules.txt>. The module will be called dm-mod.o.
221+
222+ If unsure, say N.
223+
224 Multiple devices driver support (RAID and LVM)
225 CONFIG_MD
226 Support multiple physical spindles through a single logical device.
227diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
228--- linux-2.4.24.org/drivers/md/Config.in 2004-01-18 14:58:09.306661789 +0100
229+++ linux-2.4.24/drivers/md/Config.in 2004-01-18 15:01:13.770713678 +0100
230@@ -14,5 +14,6 @@
231 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
232
233 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
234+dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
235
236 endmenu
237diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
238--- linux-2.4.24.org/drivers/md/dm.c 1970-01-01 01:00:00.000000000 +0100
239+++ linux-2.4.24/drivers/md/dm.c 2004-01-18 15:01:29.214472770 +0100
240@@ -0,0 +1,1115 @@
241+/*
242+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
243+ *
244+ * This file is released under the GPL.
245+ */
246+
247+#include "dm.h"
248+#include "kcopyd.h"
249+
250+#include <linux/init.h>
251+#include <linux/module.h>
252+#include <linux/blk.h>
253+#include <linux/blkpg.h>
254+#include <linux/mempool.h>
255+#include <linux/slab.h>
256+#include <linux/major.h>
257+#include <linux/kdev_t.h>
258+#include <linux/lvm.h>
259+
260+#include <asm/uaccess.h>
261+
262+static const char *_name = DM_NAME;
263+#define DEFAULT_READ_AHEAD 64
264+
265+struct dm_io {
266+ struct mapped_device *md;
267+
268+ struct dm_target *ti;
269+ int rw;
270+ union map_info map_context;
271+ void (*end_io) (struct buffer_head * bh, int uptodate);
272+ void *context;
273+};
274+
275+struct deferred_io {
276+ int rw;
277+ struct buffer_head *bh;
278+ struct deferred_io *next;
279+};
280+
281+/*
282+ * Bits for the md->flags field.
283+ */
284+#define DMF_BLOCK_IO 0
285+#define DMF_SUSPENDED 1
286+
287+struct mapped_device {
288+ struct rw_semaphore lock;
289+ atomic_t holders;
290+
291+ kdev_t dev;
292+ unsigned long flags;
293+
294+ /*
295+ * A list of ios that arrived while we were suspended.
296+ */
297+ atomic_t pending;
298+ wait_queue_head_t wait;
299+ struct deferred_io *deferred;
300+
301+ /*
302+ * The current mapping.
303+ */
304+ struct dm_table *map;
305+
306+ /*
307+ * io objects are allocated from here.
308+ */
309+ mempool_t *io_pool;
310+
311+ /*
312+ * Event handling.
313+ */
314+ uint32_t event_nr;
315+ wait_queue_head_t eventq;
316+};
317+
318+#define MIN_IOS 256
319+static kmem_cache_t *_io_cache;
320+
321+static struct mapped_device *get_kdev(kdev_t dev);
322+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
323+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
324+
325+/*-----------------------------------------------------------------
326+ * In order to avoid the 256 minor number limit we are going to
327+ * register more major numbers as necessary.
328+ *---------------------------------------------------------------*/
329+#define MAX_MINORS (1 << MINORBITS)
330+
331+struct major_details {
332+ unsigned int major;
333+
334+ int transient;
335+ struct list_head transient_list;
336+
337+ unsigned int first_free_minor;
338+ int nr_free_minors;
339+
340+ struct mapped_device *mds[MAX_MINORS];
341+ int blk_size[MAX_MINORS];
342+ int blksize_size[MAX_MINORS];
343+ int hardsect_size[MAX_MINORS];
344+};
345+
346+static struct rw_semaphore _dev_lock;
347+static struct major_details *_majors[MAX_BLKDEV];
348+
349+/*
350+ * This holds a list of majors that non-specified device numbers
351+ * may be allocated from. Only majors with free minors appear on
352+ * this list.
353+ */
354+static LIST_HEAD(_transients_free);
355+
356+static int __alloc_major(unsigned int major, struct major_details **result)
357+{
358+ int r;
359+ unsigned int transient = !major;
360+ struct major_details *maj;
361+
362+ /* Major already allocated? */
363+ if (major && _majors[major])
364+ return 0;
365+
366+ maj = kmalloc(sizeof(*maj), GFP_KERNEL);
367+ if (!maj)
368+ return -ENOMEM;
369+
370+ memset(maj, 0, sizeof(*maj));
371+ INIT_LIST_HEAD(&maj->transient_list);
372+
373+ maj->nr_free_minors = MAX_MINORS;
374+
375+ r = register_blkdev(major, _name, &dm_blk_dops);
376+ if (r < 0) {
377+ DMERR("register_blkdev failed for %d", major);
378+ kfree(maj);
379+ return r;
380+ }
381+ if (r > 0)
382+ major = r;
383+
384+ maj->major = major;
385+
386+ if (transient) {
387+ maj->transient = transient;
388+ list_add_tail(&maj->transient_list, &_transients_free);
389+ }
390+
391+ _majors[major] = maj;
392+
393+ blk_size[major] = maj->blk_size;
394+ blksize_size[major] = maj->blksize_size;
395+ hardsect_size[major] = maj->hardsect_size;
396+ read_ahead[major] = DEFAULT_READ_AHEAD;
397+
398+ blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
399+
400+ *result = maj;
401+ return 0;
402+}
403+
404+static void __free_major(struct major_details *maj)
405+{
406+ unsigned int major = maj->major;
407+
408+ list_del(&maj->transient_list);
409+
410+ read_ahead[major] = 0;
411+ blk_size[major] = NULL;
412+ blksize_size[major] = NULL;
413+ hardsect_size[major] = NULL;
414+
415+ _majors[major] = NULL;
416+ kfree(maj);
417+
418+ if (unregister_blkdev(major, _name) < 0)
419+ DMERR("devfs_unregister_blkdev failed");
420+}
421+
422+static void free_all_majors(void)
423+{
424+ unsigned int major = ARRAY_SIZE(_majors);
425+
426+ down_write(&_dev_lock);
427+
428+ while (major--)
429+ if (_majors[major])
430+ __free_major(_majors[major]);
431+
432+ up_write(&_dev_lock);
433+}
434+
435+static void free_dev(kdev_t dev)
436+{
437+ unsigned int major = major(dev);
438+ unsigned int minor = minor(dev);
439+ struct major_details *maj;
440+
441+ down_write(&_dev_lock);
442+
443+ maj = _majors[major];
444+ if (!maj)
445+ goto out;
446+
447+ maj->mds[minor] = NULL;
448+ maj->nr_free_minors++;
449+
450+ if (maj->nr_free_minors == MAX_MINORS) {
451+ __free_major(maj);
452+ goto out;
453+ }
454+
455+ if (!maj->transient)
456+ goto out;
457+
458+ if (maj->nr_free_minors == 1)
459+ list_add_tail(&maj->transient_list, &_transients_free);
460+
461+ if (minor < maj->first_free_minor)
462+ maj->first_free_minor = minor;
463+
464+ out:
465+ up_write(&_dev_lock);
466+}
467+
468+static void __alloc_minor(struct major_details *maj, unsigned int minor,
469+ struct mapped_device *md)
470+{
471+ maj->mds[minor] = md;
472+ md->dev = mk_kdev(maj->major, minor);
473+ maj->nr_free_minors--;
474+
475+ if (maj->transient && !maj->nr_free_minors)
476+ list_del_init(&maj->transient_list);
477+}
478+
479+/*
480+ * See if requested kdev_t is available.
481+ */
482+static int specific_dev(kdev_t dev, struct mapped_device *md)
483+{
484+ int r = 0;
485+ unsigned int major = major(dev);
486+ unsigned int minor = minor(dev);
487+ struct major_details *maj;
488+
489+ if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
490+ DMWARN("device number requested out of range (%d, %d)",
491+ major, minor);
492+ return -EINVAL;
493+ }
494+
495+ down_write(&_dev_lock);
496+ maj = _majors[major];
497+
498+ /* Register requested major? */
499+ if (!maj) {
500+ r = __alloc_major(major, &maj);
501+ if (r)
502+ goto out;
503+
504+ major = maj->major;
505+ }
506+
507+ if (maj->mds[minor]) {
508+ r = -EBUSY;
509+ goto out;
510+ }
511+
512+ __alloc_minor(maj, minor, md);
513+
514+ out:
515+ up_write(&_dev_lock);
516+
517+ return r;
518+}
519+
520+/*
521+ * Find first unused device number, requesting a new major number if required.
522+ */
523+static int first_free_dev(struct mapped_device *md)
524+{
525+ int r = 0;
526+ struct major_details *maj;
527+
528+ down_write(&_dev_lock);
529+
530+ if (list_empty(&_transients_free)) {
531+ r = __alloc_major(0, &maj);
532+ if (r)
533+ goto out;
534+ } else
535+ maj = list_entry(_transients_free.next, struct major_details,
536+ transient_list);
537+
538+ while (maj->mds[maj->first_free_minor++])
539+ ;
540+
541+ __alloc_minor(maj, maj->first_free_minor - 1, md);
542+
543+ out:
544+ up_write(&_dev_lock);
545+
546+ return r;
547+}
548+
549+static struct mapped_device *get_kdev(kdev_t dev)
550+{
551+ struct mapped_device *md;
552+ struct major_details *maj;
553+
554+ down_read(&_dev_lock);
555+ maj = _majors[major(dev)];
556+ if (!maj) {
557+ md = NULL;
558+ goto out;
559+ }
560+ md = maj->mds[minor(dev)];
561+ if (md)
562+ dm_get(md);
563+ out:
564+ up_read(&_dev_lock);
565+
566+ return md;
567+}
568+
569+/*-----------------------------------------------------------------
570+ * init/exit code
571+ *---------------------------------------------------------------*/
572+
573+static __init int local_init(void)
574+{
575+ init_rwsem(&_dev_lock);
576+
577+ /* allocate a slab for the dm_ios */
578+ _io_cache = kmem_cache_create("dm io",
579+ sizeof(struct dm_io), 0, 0, NULL, NULL);
580+
581+ if (!_io_cache)
582+ return -ENOMEM;
583+
584+ return 0;
585+}
586+
587+static void local_exit(void)
588+{
589+ kmem_cache_destroy(_io_cache);
590+ free_all_majors();
591+
592+ DMINFO("cleaned up");
593+}
594+
595+/*
596+ * We have a lot of init/exit functions, so it seems easier to
597+ * store them in an array. The disposable macro 'xx'
598+ * expands a prefix into a pair of function names.
599+ */
600+static struct {
601+ int (*init) (void);
602+ void (*exit) (void);
603+
604+} _inits[] = {
605+#define xx(n) {n ## _init, n ## _exit},
606+ xx(local)
607+ xx(dm_target)
608+ xx(dm_linear)
609+ xx(dm_stripe)
610+ xx(dm_interface)
611+ xx(kcopyd)
612+ xx(dm_snapshot)
613+#undef xx
614+};
615+
616+static int __init dm_init(void)
617+{
618+ const int count = ARRAY_SIZE(_inits);
619+
620+ int r, i;
621+
622+ for (i = 0; i < count; i++) {
623+ r = _inits[i].init();
624+ if (r)
625+ goto bad;
626+ }
627+
628+ return 0;
629+
630+ bad:
631+ while (i--)
632+ _inits[i].exit();
633+
634+ return r;
635+}
636+
637+static void __exit dm_exit(void)
638+{
639+ int i = ARRAY_SIZE(_inits);
640+
641+ while (i--)
642+ _inits[i].exit();
643+}
644+
645+/*
646+ * Block device functions
647+ */
648+static int dm_blk_open(struct inode *inode, struct file *file)
649+{
650+ struct mapped_device *md;
651+
652+ md = get_kdev(inode->i_rdev);
653+ if (!md)
654+ return -ENXIO;
655+
656+ return 0;
657+}
658+
659+static int dm_blk_close(struct inode *inode, struct file *file)
660+{
661+ struct mapped_device *md;
662+
663+ md = get_kdev(inode->i_rdev);
664+ dm_put(md); /* put the reference gained by dm_blk_open */
665+ dm_put(md);
666+ return 0;
667+}
668+
669+static inline struct dm_io *alloc_io(struct mapped_device *md)
670+{
671+ return mempool_alloc(md->io_pool, GFP_NOIO);
672+}
673+
674+static inline void free_io(struct mapped_device *md, struct dm_io *io)
675+{
676+ mempool_free(io, md->io_pool);
677+}
678+
679+static inline struct deferred_io *alloc_deferred(void)
680+{
681+ return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
682+}
683+
684+static inline void free_deferred(struct deferred_io *di)
685+{
686+ kfree(di);
687+}
688+
689+static inline sector_t volume_size(kdev_t dev)
690+{
691+ return blk_size[major(dev)][minor(dev)] << 1;
692+}
693+
694+/* FIXME: check this */
695+static int dm_blk_ioctl(struct inode *inode, struct file *file,
696+ unsigned int command, unsigned long a)
697+{
698+ kdev_t dev = inode->i_rdev;
699+ long size;
700+
701+ switch (command) {
702+ case BLKROSET:
703+ case BLKROGET:
704+ case BLKRASET:
705+ case BLKRAGET:
706+ case BLKFLSBUF:
707+ case BLKSSZGET:
708+ //case BLKRRPART: /* Re-read partition tables */
709+ //case BLKPG:
710+ case BLKELVGET:
711+ case BLKELVSET:
712+ case BLKBSZGET:
713+ case BLKBSZSET:
714+ return blk_ioctl(dev, command, a);
715+ break;
716+
717+ case BLKGETSIZE:
718+ size = volume_size(dev);
719+ if (copy_to_user((void *) a, &size, sizeof(long)))
720+ return -EFAULT;
721+ break;
722+
723+ case BLKGETSIZE64:
724+ size = volume_size(dev);
725+ if (put_user((u64) ((u64) size) << 9, (u64 *) a))
726+ return -EFAULT;
727+ break;
728+
729+ case BLKRRPART:
730+ return -ENOTTY;
731+
732+ case LV_BMAP:
733+ return dm_user_bmap(inode, (struct lv_bmap *) a);
734+
735+ default:
736+ DMWARN("unknown block ioctl 0x%x", command);
737+ return -ENOTTY;
738+ }
739+
740+ return 0;
741+}
742+
743+/*
744+ * Add the buffer to the list of deferred io.
745+ */
746+static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
747+{
748+ struct deferred_io *di;
749+
750+ di = alloc_deferred();
751+ if (!di)
752+ return -ENOMEM;
753+
754+ down_write(&md->lock);
755+
756+ if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
757+ up_write(&md->lock);
758+ free_deferred(di);
759+ return 1;
760+ }
761+
762+ di->bh = bh;
763+ di->rw = rw;
764+ di->next = md->deferred;
765+ md->deferred = di;
766+
767+ up_write(&md->lock);
768+ return 0; /* deferred successfully */
769+}
770+
771+/*
772+ * bh->b_end_io routine that decrements the pending count
773+ * and then calls the original bh->b_end_io fn.
774+ */
775+static void dec_pending(struct buffer_head *bh, int uptodate)
776+{
777+ int r;
778+ struct dm_io *io = bh->b_private;
779+ dm_endio_fn endio = io->ti->type->end_io;
780+
781+ if (endio) {
782+ r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
783+ &io->map_context);
784+ if (r < 0)
785+ uptodate = 0;
786+
787+ else if (r > 0)
788+ /* the target wants another shot at the io */
789+ return;
790+ }
791+
792+ if (atomic_dec_and_test(&io->md->pending))
793+ /* nudge anyone waiting on suspend queue */
794+ wake_up(&io->md->wait);
795+
796+ bh->b_end_io = io->end_io;
797+ bh->b_private = io->context;
798+ free_io(io->md, io);
799+
800+ bh->b_end_io(bh, uptodate);
801+}
802+
803+/*
804+ * Do the bh mapping for a given leaf
805+ */
806+static inline int __map_buffer(struct mapped_device *md, int rw,
807+ struct buffer_head *bh, struct dm_io *io)
808+{
809+ struct dm_target *ti;
810+
811+ if (!md->map)
812+ return -EINVAL;
813+
814+ ti = dm_table_find_target(md->map, bh->b_rsector);
815+ if (!ti->type)
816+ return -EINVAL;
817+
818+ /* hook the end io request fn */
819+ atomic_inc(&md->pending);
820+ io->md = md;
821+ io->ti = ti;
822+ io->rw = rw;
823+ io->end_io = bh->b_end_io;
824+ io->context = bh->b_private;
825+ bh->b_end_io = dec_pending;
826+ bh->b_private = io;
827+
828+ return ti->type->map(ti, bh, rw, &io->map_context);
829+}
830+
831+/*
832+ * Checks to see if we should be deferring io, if so it queues it
833+ * and returns 1.
834+ */
835+static inline int __deferring(struct mapped_device *md, int rw,
836+ struct buffer_head *bh)
837+{
838+ int r;
839+
840+ /*
841+ * If we're suspended we have to queue this io for later.
842+ */
843+ while (test_bit(DMF_BLOCK_IO, &md->flags)) {
844+ up_read(&md->lock);
845+
846+ /*
847+ * There's no point deferring a read ahead
848+ * request, just drop it.
849+ */
850+ if (rw == READA) {
851+ down_read(&md->lock);
852+ return -EIO;
853+ }
854+
855+ r = queue_io(md, bh, rw);
856+ down_read(&md->lock);
857+
858+ if (r < 0)
859+ return r;
860+
861+ if (r == 0)
862+ return 1; /* deferred successfully */
863+
864+ }
865+
866+ return 0;
867+}
868+
869+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
870+{
871+ int r;
872+ struct dm_io *io;
873+ struct mapped_device *md;
874+
875+ md = get_kdev(bh->b_rdev);
876+ if (!md) {
877+ buffer_IO_error(bh);
878+ return 0;
879+ }
880+
881+ io = alloc_io(md);
882+ down_read(&md->lock);
883+
884+ r = __deferring(md, rw, bh);
885+ if (r < 0)
886+ goto bad;
887+
888+ else if (!r) {
889+ /* not deferring */
890+ r = __map_buffer(md, rw, bh, io);
891+ if (r < 0)
892+ goto bad;
893+ } else
894+ r = 0;
895+
896+ up_read(&md->lock);
897+ dm_put(md);
898+ return r;
899+
900+ bad:
901+ buffer_IO_error(bh);
902+ up_read(&md->lock);
903+ dm_put(md);
904+ return 0;
905+}
906+
907+static int check_dev_size(kdev_t dev, unsigned long block)
908+{
909+ unsigned int major = major(dev);
910+ unsigned int minor = minor(dev);
911+
912+ /* FIXME: check this */
913+ unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
914+ unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
915+
916+ return (sector > max_sector) ? 0 : 1;
917+}
918+
919+/*
920+ * Creates a dummy buffer head and maps it (for lilo).
921+ */
922+static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
923+ kdev_t *r_dev, unsigned long *r_block)
924+{
925+ struct buffer_head bh;
926+ struct dm_target *ti;
927+ union map_info map_context;
928+ int r;
929+
930+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
931+ return -EPERM;
932+ }
933+
934+ if (!check_dev_size(dev, block)) {
935+ return -EINVAL;
936+ }
937+
938+ if (!md->map)
939+ return -EINVAL;
940+
941+ /* setup dummy bh */
942+ memset(&bh, 0, sizeof(bh));
943+ bh.b_blocknr = block;
944+ bh.b_dev = bh.b_rdev = dev;
945+ bh.b_size = blksize_size[major(dev)][minor(dev)];
946+ bh.b_rsector = block * (bh.b_size >> 9);
947+
948+ /* find target */
949+ ti = dm_table_find_target(md->map, bh.b_rsector);
950+
951+ /* do the mapping */
952+ r = ti->type->map(ti, &bh, READ, &map_context);
953+ ti->type->end_io(ti, &bh, READ, 0, &map_context);
954+
955+ if (!r) {
956+ *r_dev = bh.b_rdev;
957+ *r_block = bh.b_rsector / (bh.b_size >> 9);
958+ }
959+
960+ return r;
961+}
962+
963+/*
964+ * Marshals arguments and results between user and kernel space.
965+ */
966+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
967+{
968+ struct mapped_device *md;
969+ unsigned long block, r_block;
970+ kdev_t r_dev;
971+ int r;
972+
973+ if (get_user(block, &lvb->lv_block))
974+ return -EFAULT;
975+
976+ md = get_kdev(inode->i_rdev);
977+ if (!md)
978+ return -ENXIO;
979+
980+ down_read(&md->lock);
981+ r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
982+ up_read(&md->lock);
983+ dm_put(md);
984+
985+ if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
986+ put_user(r_block, &lvb->lv_block)))
987+ r = -EFAULT;
988+
989+ return r;
990+}
991+
992+static void free_md(struct mapped_device *md)
993+{
994+ free_dev(md->dev);
995+ mempool_destroy(md->io_pool);
996+ kfree(md);
997+}
998+
999+/*
1000+ * Allocate and initialise a blank device with a given minor.
1001+ */
1002+static struct mapped_device *alloc_md(kdev_t dev)
1003+{
1004+ int r;
1005+ struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
1006+
1007+ if (!md) {
1008+ DMWARN("unable to allocate device, out of memory.");
1009+ return NULL;
1010+ }
1011+
1012+ memset(md, 0, sizeof(*md));
1013+
1014+ /* Allocate suitable device number */
1015+ if (!dev)
1016+ r = first_free_dev(md);
1017+ else
1018+ r = specific_dev(dev, md);
1019+
1020+ if (r) {
1021+ kfree(md);
1022+ return NULL;
1023+ }
1024+
1025+ md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
1026+ mempool_free_slab, _io_cache);
1027+ if (!md->io_pool) {
1028+ free_md(md);
1029+ kfree(md);
1030+ return NULL;
1031+ }
1032+
1033+ init_rwsem(&md->lock);
1034+ atomic_set(&md->holders, 1);
1035+ atomic_set(&md->pending, 0);
1036+ init_waitqueue_head(&md->wait);
1037+ init_waitqueue_head(&md->eventq);
1038+
1039+ return md;
1040+}
1041+
1042+/*
1043+ * The hardsect size for a mapped device is the largest hardsect size
1044+ * from the devices it maps onto.
1045+ */
1046+static int __find_hardsect_size(struct list_head *devices)
1047+{
1048+ int result = 512, size;
1049+ struct list_head *tmp;
1050+
1051+ list_for_each (tmp, devices) {
1052+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
1053+ size = get_hardsect_size(dd->dev);
1054+ if (size > result)
1055+ result = size;
1056+ }
1057+
1058+ return result;
1059+}
1060+
1061+/*
1062+ * Bind a table to the device.
1063+ */
1064+static void event_callback(void *context)
1065+{
1066+ struct mapped_device *md = (struct mapped_device *) context;
1067+
1068+ down_write(&md->lock);
1069+ md->event_nr++;
1070+ wake_up_interruptible(&md->eventq);
1071+ up_write(&md->lock);
1072+}
1073+
1074+static int __bind(struct mapped_device *md, struct dm_table *t)
1075+{
1076+ unsigned int minor = minor(md->dev);
1077+ unsigned int major = major(md->dev);
1078+ md->map = t;
1079+
1080+ /* in k */
1081+ blk_size[major][minor] = dm_table_get_size(t) >> 1;
1082+ blksize_size[major][minor] = BLOCK_SIZE;
1083+ hardsect_size[major][minor] =
1084+ __find_hardsect_size(dm_table_get_devices(t));
1085+ register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
1086+
1087+ dm_table_event_callback(md->map, event_callback, md);
1088+ dm_table_get(t);
1089+ return 0;
1090+}
1091+
1092+static void __unbind(struct mapped_device *md)
1093+{
1094+ unsigned int minor = minor(md->dev);
1095+ unsigned int major = major(md->dev);
1096+
1097+ if (md->map) {
1098+ dm_table_event_callback(md->map, NULL, NULL);
1099+ dm_table_put(md->map);
1100+ md->map = NULL;
1101+
1102+ }
1103+
1104+ blk_size[major][minor] = 0;
1105+ blksize_size[major][minor] = 0;
1106+ hardsect_size[major][minor] = 0;
1107+}
1108+
1109+/*
1110+ * Constructor for a new device.
1111+ */
1112+int dm_create(kdev_t dev, struct mapped_device **result)
1113+{
1114+ struct mapped_device *md;
1115+
1116+ md = alloc_md(dev);
1117+ if (!md)
1118+ return -ENXIO;
1119+
1120+ __unbind(md); /* Ensure zero device size */
1121+
1122+ *result = md;
1123+ return 0;
1124+}
1125+
1126+void dm_get(struct mapped_device *md)
1127+{
1128+ atomic_inc(&md->holders);
1129+}
1130+
1131+void dm_put(struct mapped_device *md)
1132+{
1133+ if (atomic_dec_and_test(&md->holders)) {
1134+ if (md->map)
1135+ dm_table_suspend_targets(md->map);
1136+ __unbind(md);
1137+ free_md(md);
1138+ }
1139+}
1140+
1141+/*
1142+ * Requeue the deferred io by calling generic_make_request.
1143+ */
1144+static void flush_deferred_io(struct deferred_io *c)
1145+{
1146+ struct deferred_io *n;
1147+
1148+ while (c) {
1149+ n = c->next;
1150+ generic_make_request(c->rw, c->bh);
1151+ free_deferred(c);
1152+ c = n;
1153+ }
1154+}
1155+
1156+/*
1157+ * Swap in a new table (destroying old one).
1158+ */
1159+int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1160+{
1161+ int r;
1162+
1163+ down_write(&md->lock);
1164+
1165+ /*
1166+ * The device must be suspended, or have no table bound yet.
1167+ */
1168+ if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
1169+ up_write(&md->lock);
1170+ return -EPERM;
1171+ }
1172+
1173+ __unbind(md);
1174+ r = __bind(md, table);
1175+ if (r)
1176+ return r;
1177+
1178+ up_write(&md->lock);
1179+ return 0;
1180+}
1181+
1182+/*
1183+ * We need to be able to change a mapping table under a mounted
1184+ * filesystem. For example we might want to move some data in
1185+ * the background. Before the table can be swapped with
1186+ * dm_bind_table, dm_suspend must be called to flush any in
1187+ * flight io and ensure that any further io gets deferred.
1188+ */
1189+int dm_suspend(struct mapped_device *md)
1190+{
1191+ int r = 0;
1192+ DECLARE_WAITQUEUE(wait, current);
1193+
1194+ down_write(&md->lock);
1195+
1196+ /*
1197+ * First we set the BLOCK_IO flag so no more ios will be
1198+ * mapped.
1199+ */
1200+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1201+ up_write(&md->lock);
1202+ return -EINVAL;
1203+ }
1204+
1205+ set_bit(DMF_BLOCK_IO, &md->flags);
1206+ add_wait_queue(&md->wait, &wait);
1207+ up_write(&md->lock);
1208+
1209+ /*
1210+ * Then we wait for the already mapped ios to
1211+ * complete.
1212+ */
1213+ run_task_queue(&tq_disk);
1214+ while (1) {
1215+ set_current_state(TASK_INTERRUPTIBLE);
1216+
1217+ if (!atomic_read(&md->pending) || signal_pending(current))
1218+ break;
1219+
1220+ schedule();
1221+ }
1222+ set_current_state(TASK_RUNNING);
1223+
1224+ down_write(&md->lock);
1225+ remove_wait_queue(&md->wait, &wait);
1226+
1227+ /* did we flush everything ? */
1228+ if (atomic_read(&md->pending)) {
1229+ clear_bit(DMF_BLOCK_IO, &md->flags);
1230+ r = -EINTR;
1231+ } else {
1232+ set_bit(DMF_SUSPENDED, &md->flags);
1233+ if (md->map)
1234+ dm_table_suspend_targets(md->map);
1235+ }
1236+ up_write(&md->lock);
1237+
1238+ return r;
1239+}
1240+
1241+int dm_resume(struct mapped_device *md)
1242+{
1243+ struct deferred_io *def;
1244+
1245+ down_write(&md->lock);
1246+ if (!test_bit(DMF_SUSPENDED, &md->flags)) {
1247+ up_write(&md->lock);
1248+ return -EINVAL;
1249+ }
1250+
1251+ if (md->map)
1252+ dm_table_resume_targets(md->map);
1253+
1254+ clear_bit(DMF_SUSPENDED, &md->flags);
1255+ clear_bit(DMF_BLOCK_IO, &md->flags);
1256+ def = md->deferred;
1257+ md->deferred = NULL;
1258+ up_write(&md->lock);
1259+
1260+ flush_deferred_io(def);
1261+ run_task_queue(&tq_disk);
1262+
1263+ return 0;
1264+}
1265+
1266+struct dm_table *dm_get_table(struct mapped_device *md)
1267+{
1268+ struct dm_table *t;
1269+
1270+ down_read(&md->lock);
1271+ t = md->map;
1272+ if (t)
1273+ dm_table_get(t);
1274+ up_read(&md->lock);
1275+
1276+ return t;
1277+}
1278+
1279+/*-----------------------------------------------------------------
1280+ * Event notification.
1281+ *---------------------------------------------------------------*/
1282+uint32_t dm_get_event_nr(struct mapped_device *md)
1283+{
1284+ uint32_t r;
1285+
1286+ down_read(&md->lock);
1287+ r = md->event_nr;
1288+ up_read(&md->lock);
1289+
1290+ return r;
1291+}
1292+
1293+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
1294+ uint32_t event_nr)
1295+{
1296+ down_write(&md->lock);
1297+ if (event_nr != md->event_nr) {
1298+ up_write(&md->lock);
1299+ return 1;
1300+ }
1301+
1302+ add_wait_queue(&md->eventq, wq);
1303+ up_write(&md->lock);
1304+
1305+ return 0;
1306+}
1307+
1308+const char *dm_kdevname(kdev_t dev)
1309+{
1310+ static char buffer[32];
1311+ sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
1312+ return buffer;
1313+}
1314+
1315+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
1316+{
1317+ down_write(&md->lock);
1318+ remove_wait_queue(&md->eventq, wq);
1319+ up_write(&md->lock);
1320+}
1321+
1322+kdev_t dm_kdev(struct mapped_device *md)
1323+{
1324+ kdev_t dev;
1325+
1326+ down_read(&md->lock);
1327+ dev = md->dev;
1328+ up_read(&md->lock);
1329+
1330+ return dev;
1331+}
1332+
1333+int dm_suspended(struct mapped_device *md)
1334+{
1335+ return test_bit(DMF_SUSPENDED, &md->flags);
1336+}
1337+
1338+struct block_device_operations dm_blk_dops = {
1339+ .open = dm_blk_open,
1340+ .release = dm_blk_close,
1341+ .ioctl = dm_blk_ioctl,
1342+ .owner = THIS_MODULE
1343+};
1344+
1345+/*
1346+ * module hooks
1347+ */
1348+module_init(dm_init);
1349+module_exit(dm_exit);
1350+
1351+MODULE_DESCRIPTION(DM_NAME " driver");
1352+MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
1353+MODULE_LICENSE("GPL");
1354+
1355+EXPORT_SYMBOL(dm_kdevname);
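Taken together, dm_suspend(), dm_swap_table() and dm_resume() above implement the table-replacement protocol described in the comment before dm_suspend(). A minimal sketch (not part of the patch; this is the sort of sequence the ioctl interface, dm-ioctl.c, is expected to drive) of replacing a live table:

/*
 * Illustrative only: swap in a new table on a live mapped_device and
 * replay any io that was deferred while it was suspended.
 */
static int example_replace_table(struct mapped_device *md, struct dm_table *t)
{
	int r;

	r = dm_suspend(md);		/* sets DMF_BLOCK_IO, waits for pending io */
	if (r)
		return r;

	r = dm_swap_table(md, t);	/* __unbind the old table, __bind the new one */
	if (r)
		return r;

	return dm_resume(md);		/* clears the flags, flushes deferred io */
}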
1356diff -urN linux-2.4.24.org/drivers/md/dm-daemon.c linux-2.4.24/drivers/md/dm-daemon.c
1357--- linux-2.4.24.org/drivers/md/dm-daemon.c 1970-01-01 01:00:00.000000000 +0100
1358+++ linux-2.4.24/drivers/md/dm-daemon.c 2004-01-18 15:01:21.977991002 +0100
1359@@ -0,0 +1,113 @@
1360+/*
1361+ * Copyright (C) 2003 Sistina Software
1362+ *
1363+ * This file is released under the LGPL.
1364+ */
1365+
1366+#include "dm.h"
1367+#include "dm-daemon.h"
1368+
1369+#include <linux/module.h>
1370+#include <linux/sched.h>
1371+
1372+static int daemon(void *arg)
1373+{
1374+ struct dm_daemon *dd = (struct dm_daemon *) arg;
1375+ DECLARE_WAITQUEUE(wq, current);
1376+
1377+ daemonize();
1378+ reparent_to_init();
1379+
1380+ /* block all signals */
1381+ spin_lock_irq(&current->sigmask_lock);
1382+ sigfillset(&current->blocked);
1383+ flush_signals(current);
1384+ spin_unlock_irq(&current->sigmask_lock);
1385+
1386+ strcpy(current->comm, dd->name);
1387+ atomic_set(&dd->please_die, 0);
1388+
1389+ add_wait_queue(&dd->job_queue, &wq);
1390+
1391+ down(&dd->run_lock);
1392+ up(&dd->start_lock);
1393+
1394+ /*
1395+ * dd->fn() could do anything, very likely it will
1396+ * suspend. So we can't set the state to
1397+ * TASK_INTERRUPTIBLE before calling it. In order to
1398+ * prevent a race with a waking thread we do this little
1399+ * dance with the dd->woken variable.
1400+ */
1401+ while (1) {
1402+ do {
1403+ set_current_state(TASK_RUNNING);
1404+
1405+ if (atomic_read(&dd->please_die))
1406+ goto out;
1407+
1408+ atomic_set(&dd->woken, 0);
1409+ dd->fn();
1410+ yield();
1411+
1412+ set_current_state(TASK_INTERRUPTIBLE);
1413+ } while (atomic_read(&dd->woken));
1414+
1415+ schedule();
1416+ }
1417+
1418+ out:
1419+ remove_wait_queue(&dd->job_queue, &wq);
1420+ up(&dd->run_lock);
1421+ return 0;
1422+}
1423+
1424+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
1425+{
1426+ pid_t pid = 0;
1427+
1428+ /*
1429+ * Initialise the dm_daemon.
1430+ */
1431+ dd->fn = fn;
1432+ strncpy(dd->name, name, sizeof(dd->name) - 1);
1433+ sema_init(&dd->start_lock, 1);
1434+ sema_init(&dd->run_lock, 1);
1435+ init_waitqueue_head(&dd->job_queue);
1436+
1437+ /*
1438+ * Start the new thread.
1439+ */
1440+ down(&dd->start_lock);
1441+ pid = kernel_thread(daemon, dd, 0);
1442+ if (pid <= 0) {
1443+ DMERR("Failed to start %s thread", name);
1444+ return -EAGAIN;
1445+ }
1446+
1447+ /*
1448+ * wait for the daemon to up this mutex.
1449+ */
1450+ down(&dd->start_lock);
1451+ up(&dd->start_lock);
1452+
1453+ return 0;
1454+}
1455+
1456+void dm_daemon_stop(struct dm_daemon *dd)
1457+{
1458+ atomic_set(&dd->please_die, 1);
1459+ dm_daemon_wake(dd);
1460+ down(&dd->run_lock);
1461+ up(&dd->run_lock);
1462+}
1463+
1464+void dm_daemon_wake(struct dm_daemon *dd)
1465+{
1466+ atomic_set(&dd->woken, 1);
1467+ wake_up_interruptible(&dd->job_queue);
1468+}
1469+
1470+EXPORT_SYMBOL(dm_daemon_start);
1471+EXPORT_SYMBOL(dm_daemon_stop);
1472+EXPORT_SYMBOL(dm_daemon_wake);
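The helper above gives each client a single worker thread that repeatedly calls its fn() until told to die. A minimal usage sketch (not part of the patch; the names are illustrative):

/* Illustrative only: a client of the dm-daemon helper. */
static struct dm_daemon _example_daemon;

static void example_do_work(void)
{
	/* drain whatever job queue the client maintains */
}

static int example_init(void)
{
	return dm_daemon_start(&_example_daemon, "example", example_do_work);
}

static void example_exit(void)
{
	dm_daemon_stop(&_example_daemon);	/* wakes the thread and waits for it */
}

/* producers call dm_daemon_wake(&_example_daemon) after queueing a job */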
1473diff -urN linux-2.4.24.org/drivers/md/dm-daemon.h linux-2.4.24/drivers/md/dm-daemon.h
1474--- linux-2.4.24.org/drivers/md/dm-daemon.h 1970-01-01 01:00:00.000000000 +0100
1475+++ linux-2.4.24/drivers/md/dm-daemon.h 2004-01-18 15:01:21.980990372 +0100
1476@@ -0,0 +1,29 @@
1477+/*
1478+ * Copyright (C) 2003 Sistina Software
1479+ *
1480+ * This file is released under the LGPL.
1481+ */
1482+
1483+#ifndef DM_DAEMON_H
1484+#define DM_DAEMON_H
1485+
1486+#include <asm/atomic.h>
1487+#include <asm/semaphore.h>
1488+
1489+struct dm_daemon {
1490+ void (*fn)(void);
1491+ char name[16];
1492+ atomic_t please_die;
1493+ struct semaphore start_lock;
1494+ struct semaphore run_lock;
1495+
1496+ atomic_t woken;
1497+ wait_queue_head_t job_queue;
1498+};
1499+
1500+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
1501+void dm_daemon_stop(struct dm_daemon *dd);
1502+void dm_daemon_wake(struct dm_daemon *dd);
1503+int dm_daemon_running(struct dm_daemon *dd);
1504+
1505+#endif
1506diff -urN linux-2.4.24.org/drivers/md/dm-exception-store.c linux-2.4.24/drivers/md/dm-exception-store.c
1507--- linux-2.4.24.org/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100
1508+++ linux-2.4.24/drivers/md/dm-exception-store.c 2004-01-18 15:01:29.225470463 +0100
1509@@ -0,0 +1,673 @@
1510+/*
1511+ * dm-snapshot.c
1512+ *
1513+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
1514+ *
1515+ * This file is released under the GPL.
1516+ */
1517+
1518+#include "dm-snapshot.h"
1519+#include "dm-io.h"
1520+#include "kcopyd.h"
1521+
1522+#include <linux/mm.h>
1523+#include <linux/pagemap.h>
1524+#include <linux/vmalloc.h>
1525+#include <linux/slab.h>
1526+
1527+/*-----------------------------------------------------------------
1528+ * Persistent snapshots, by persistent we mean that the snapshot
1529+ * will survive a reboot.
1530+ *---------------------------------------------------------------*/
1531+
1532+/*
1533+ * We need to store a record of which parts of the origin have
1534+ * been copied to the snapshot device. The snapshot code
1535+ * requires that we copy exception chunks to chunk aligned areas
1536+ * of the COW store. It makes sense therefore, to store the
1537+ * metadata in chunk size blocks.
1538+ *
1539+ * There is no backward or forward compatibility implemented,
1540+ * snapshots with different disk versions than the kernel will
1541+ * not be usable. It is expected that "lvcreate" will blank out
1542+ * the start of a fresh COW device before calling the snapshot
1543+ * constructor.
1544+ *
1545+ * The first chunk of the COW device just contains the header.
1546+ * After this there is a chunk filled with exception metadata,
1547+ * followed by as many exception chunks as can fit in the
1548+ * metadata areas.
1549+ *
1550+ * All on disk structures are in little-endian format. The end
1551+ * of the exceptions info is indicated by an exception with a
1552+ * new_chunk of 0, which is invalid since it would point to the
1553+ * header chunk.
1554+ */
1555+
1556+/*
1557+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
1558+ */
1559+#define SNAP_MAGIC 0x70416e53
1560+
1561+/*
1562+ * The on-disk version of the metadata.
1563+ */
1564+#define SNAPSHOT_DISK_VERSION 1
1565+
1566+struct disk_header {
1567+ uint32_t magic;
1568+
1569+ /*
1570+ * Is this snapshot valid. There is no way of recovering
1571+ * an invalid snapshot.
1572+ */
1573+ uint32_t valid;
1574+
1575+ /*
1576+	 * Simple, incrementing version. No backward
1577+ * compatibility.
1578+ */
1579+ uint32_t version;
1580+
1581+ /* In sectors */
1582+ uint32_t chunk_size;
1583+};
1584+
1585+struct disk_exception {
1586+ uint64_t old_chunk;
1587+ uint64_t new_chunk;
1588+};
1589+
1590+struct commit_callback {
1591+ void (*callback)(void *, int success);
1592+ void *context;
1593+};
1594+
1595+/*
1596+ * The top level structure for a persistent exception store.
1597+ */
1598+struct pstore {
1599+ struct dm_snapshot *snap; /* up pointer to my snapshot */
1600+ int version;
1601+ int valid;
1602+ uint32_t chunk_size;
1603+ uint32_t exceptions_per_area;
1604+
1605+ /*
1606+ * Now that we have an asynchronous kcopyd there is no
1607+	 * need for large chunk sizes, so it won't hurt to have a
1608+	 * whole chunk's worth of metadata in memory at once.
1609+ */
1610+ void *area;
1611+
1612+ /*
1613+ * Used to keep track of which metadata area the data in
1614+ * 'chunk' refers to.
1615+ */
1616+ uint32_t current_area;
1617+
1618+ /*
1619+ * The next free chunk for an exception.
1620+ */
1621+ uint32_t next_free;
1622+
1623+ /*
1624+ * The index of next free exception in the current
1625+ * metadata area.
1626+ */
1627+ uint32_t current_committed;
1628+
1629+ atomic_t pending_count;
1630+ uint32_t callback_count;
1631+ struct commit_callback *callbacks;
1632+};
1633+
1634+static inline unsigned int sectors_to_pages(unsigned int sectors)
1635+{
1636+ return sectors / (PAGE_SIZE / SECTOR_SIZE);
1637+}
1638+
1639+static int alloc_area(struct pstore *ps)
1640+{
1641+ int r = -ENOMEM;
1642+ size_t i, len, nr_pages;
1643+ struct page *page, *last = NULL;
1644+
1645+ len = ps->chunk_size << SECTOR_SHIFT;
1646+
1647+ /*
1648+ * Allocate the chunk_size block of memory that will hold
1649+ * a single metadata area.
1650+ */
1651+ ps->area = vmalloc(len);
1652+ if (!ps->area)
1653+ return r;
1654+
1655+ nr_pages = sectors_to_pages(ps->chunk_size);
1656+
1657+ /*
1658+ * We lock the pages for ps->area into memory since
1659+ * they'll be doing a lot of io. We also chain them
1660+ * together ready for dm-io.
1661+ */
1662+ for (i = 0; i < nr_pages; i++) {
1663+ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
1664+ LockPage(page);
1665+ if (last)
1666+ last->list.next = &page->list;
1667+ last = page;
1668+ }
1669+
1670+ return 0;
1671+}
1672+
1673+static void free_area(struct pstore *ps)
1674+{
1675+ size_t i, nr_pages;
1676+ struct page *page;
1677+
1678+ nr_pages = sectors_to_pages(ps->chunk_size);
1679+ for (i = 0; i < nr_pages; i++) {
1680+ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
1681+ page->list.next = NULL;
1682+ UnlockPage(page);
1683+ }
1684+
1685+ vfree(ps->area);
1686+}
1687+
1688+/*
1689+ * Read or write a chunk aligned and sized block of data from a device.
1690+ */
1691+static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
1692+{
1693+ struct io_region where;
1694+ unsigned int bits;
1695+
1696+ where.dev = ps->snap->cow->dev;
1697+ where.sector = ps->chunk_size * chunk;
1698+ where.count = ps->chunk_size;
1699+
1700+ return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
1701+}
1702+
1703+/*
1704+ * Read or write a metadata area. Remembering to skip the first
1705+ * chunk which holds the header.
1706+ */
1707+static int area_io(struct pstore *ps, uint32_t area, int rw)
1708+{
1709+ int r;
1710+ uint32_t chunk;
1711+
1712+ /* convert a metadata area index to a chunk index */
1713+ chunk = 1 + ((ps->exceptions_per_area + 1) * area);
1714+
1715+ r = chunk_io(ps, chunk, rw);
1716+ if (r)
1717+ return r;
1718+
1719+ ps->current_area = area;
1720+ return 0;
1721+}
1722+
1723+static int zero_area(struct pstore *ps, uint32_t area)
1724+{
1725+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
1726+ return area_io(ps, area, WRITE);
1727+}
1728+
1729+static int read_header(struct pstore *ps, int *new_snapshot)
1730+{
1731+ int r;
1732+ struct disk_header *dh;
1733+
1734+ r = chunk_io(ps, 0, READ);
1735+ if (r)
1736+ return r;
1737+
1738+ dh = (struct disk_header *) ps->area;
1739+
1740+ if (le32_to_cpu(dh->magic) == 0) {
1741+ *new_snapshot = 1;
1742+
1743+ } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
1744+ *new_snapshot = 0;
1745+ ps->valid = le32_to_cpu(dh->valid);
1746+ ps->version = le32_to_cpu(dh->version);
1747+ ps->chunk_size = le32_to_cpu(dh->chunk_size);
1748+
1749+ } else {
1750+ DMWARN("Invalid/corrupt snapshot");
1751+ r = -ENXIO;
1752+ }
1753+
1754+ return r;
1755+}
1756+
1757+static int write_header(struct pstore *ps)
1758+{
1759+ struct disk_header *dh;
1760+
1761+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
1762+
1763+ dh = (struct disk_header *) ps->area;
1764+ dh->magic = cpu_to_le32(SNAP_MAGIC);
1765+ dh->valid = cpu_to_le32(ps->valid);
1766+ dh->version = cpu_to_le32(ps->version);
1767+ dh->chunk_size = cpu_to_le32(ps->chunk_size);
1768+
1769+ return chunk_io(ps, 0, WRITE);
1770+}
1771+
1772+/*
1773+ * Access functions for the disk exceptions, these do the endian conversions.
1774+ */
1775+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
1776+{
1777+ if (index >= ps->exceptions_per_area)
1778+ return NULL;
1779+
1780+ return ((struct disk_exception *) ps->area) + index;
1781+}
1782+
1783+static int read_exception(struct pstore *ps,
1784+ uint32_t index, struct disk_exception *result)
1785+{
1786+ struct disk_exception *e;
1787+
1788+ e = get_exception(ps, index);
1789+ if (!e)
1790+ return -EINVAL;
1791+
1792+ /* copy it */
1793+ result->old_chunk = le64_to_cpu(e->old_chunk);
1794+ result->new_chunk = le64_to_cpu(e->new_chunk);
1795+
1796+ return 0;
1797+}
1798+
1799+static int write_exception(struct pstore *ps,
1800+ uint32_t index, struct disk_exception *de)
1801+{
1802+ struct disk_exception *e;
1803+
1804+ e = get_exception(ps, index);
1805+ if (!e)
1806+ return -EINVAL;
1807+
1808+ /* copy it */
1809+ e->old_chunk = cpu_to_le64(de->old_chunk);
1810+ e->new_chunk = cpu_to_le64(de->new_chunk);
1811+
1812+ return 0;
1813+}
1814+
1815+/*
1816+ * Registers the exceptions that are present in the current area.
1817+ * 'full' is filled in to indicate if the area has been
1818+ * filled.
1819+ */
1820+static int insert_exceptions(struct pstore *ps, int *full)
1821+{
1822+ int r;
1823+ unsigned int i;
1824+ struct disk_exception de;
1825+
1826+ /* presume the area is full */
1827+ *full = 1;
1828+
1829+ for (i = 0; i < ps->exceptions_per_area; i++) {
1830+ r = read_exception(ps, i, &de);
1831+
1832+ if (r)
1833+ return r;
1834+
1835+ /*
1836+ * If the new_chunk is pointing at the start of
1837+ * the COW device, where the first metadata area
1838+ * is we know that we've hit the end of the
1839+ * exceptions. Therefore the area is not full.
1840+ */
1841+ if (de.new_chunk == 0LL) {
1842+ ps->current_committed = i;
1843+ *full = 0;
1844+ break;
1845+ }
1846+
1847+ /*
1848+ * Keep track of the start of the free chunks.
1849+ */
1850+ if (ps->next_free <= de.new_chunk)
1851+ ps->next_free = de.new_chunk + 1;
1852+
1853+ /*
1854+ * Otherwise we add the exception to the snapshot.
1855+ */
1856+ r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
1857+ if (r)
1858+ return r;
1859+ }
1860+
1861+ return 0;
1862+}
1863+
1864+static int read_exceptions(struct pstore *ps)
1865+{
1866+ uint32_t area;
1867+ int r, full = 1;
1868+
1869+ /*
1870+	 * Keep reading chunks and inserting exceptions until
1871+ * we find a partially full area.
1872+ */
1873+ for (area = 0; full; area++) {
1874+ r = area_io(ps, area, READ);
1875+ if (r)
1876+ return r;
1877+
1878+ r = insert_exceptions(ps, &full);
1879+ if (r)
1880+ return r;
1881+ }
1882+
1883+ return 0;
1884+}
1885+
1886+static inline struct pstore *get_info(struct exception_store *store)
1887+{
1888+ return (struct pstore *) store->context;
1889+}
1890+
1891+static void persistent_fraction_full(struct exception_store *store,
1892+ sector_t *numerator, sector_t *denominator)
1893+{
1894+ *numerator = get_info(store)->next_free * store->snap->chunk_size;
1895+ *denominator = get_dev_size(store->snap->cow->dev);
1896+}
1897+
1898+static void persistent_destroy(struct exception_store *store)
1899+{
1900+ struct pstore *ps = get_info(store);
1901+
1902+ dm_io_put(sectors_to_pages(ps->chunk_size));
1903+ vfree(ps->callbacks);
1904+ free_area(ps);
1905+ kfree(ps);
1906+}
1907+
1908+static int persistent_read_metadata(struct exception_store *store)
1909+{
1910+ int r, new_snapshot;
1911+ struct pstore *ps = get_info(store);
1912+
1913+ /*
1914+ * Read the snapshot header.
1915+ */
1916+ r = read_header(ps, &new_snapshot);
1917+ if (r)
1918+ return r;
1919+
1920+ /*
1921+ * Do we need to setup a new snapshot ?
1922+ */
1923+ if (new_snapshot) {
1924+ r = write_header(ps);
1925+ if (r) {
1926+ DMWARN("write_header failed");
1927+ return r;
1928+ }
1929+
1930+ r = zero_area(ps, 0);
1931+ if (r) {
1932+ DMWARN("zero_area(0) failed");
1933+ return r;
1934+ }
1935+
1936+ } else {
1937+ /*
1938+ * Sanity checks.
1939+ */
1940+ if (!ps->valid) {
1941+ DMWARN("snapshot is marked invalid");
1942+ return -EINVAL;
1943+ }
1944+
1945+ if (ps->version != SNAPSHOT_DISK_VERSION) {
1946+ DMWARN("unable to handle snapshot disk version %d",
1947+ ps->version);
1948+ return -EINVAL;
1949+ }
1950+
1951+ /*
1952+ * Read the metadata.
1953+ */
1954+ r = read_exceptions(ps);
1955+ if (r)
1956+ return r;
1957+ }
1958+
1959+ return 0;
1960+}
1961+
1962+static int persistent_prepare(struct exception_store *store,
1963+ struct exception *e)
1964+{
1965+ struct pstore *ps = get_info(store);
1966+ uint32_t stride;
1967+ sector_t size = get_dev_size(store->snap->cow->dev);
1968+
1969+ /* Is there enough room ? */
1970+ if (size < ((ps->next_free + 1) * store->snap->chunk_size))
1971+ return -ENOSPC;
1972+
1973+ e->new_chunk = ps->next_free;
1974+
1975+ /*
1976+ * Move onto the next free pending, making sure to take
1977+ * into account the location of the metadata chunks.
1978+ */
1979+ stride = (ps->exceptions_per_area + 1);
1980+ if ((++ps->next_free % stride) == 1)
1981+ ps->next_free++;
1982+
1983+ atomic_inc(&ps->pending_count);
1984+ return 0;
1985+}
1986+
1987+static void persistent_commit(struct exception_store *store,
1988+ struct exception *e,
1989+ void (*callback) (void *, int success),
1990+ void *callback_context)
1991+{
1992+ int r;
1993+ unsigned int i;
1994+ struct pstore *ps = get_info(store);
1995+ struct disk_exception de;
1996+ struct commit_callback *cb;
1997+
1998+ de.old_chunk = e->old_chunk;
1999+ de.new_chunk = e->new_chunk;
2000+ write_exception(ps, ps->current_committed++, &de);
2001+
2002+ /*
2003+ * Add the callback to the back of the array. This code
2004+ * is the only place where the callback array is
2005+ * manipulated, and we know that it will never be called
2006+ * multiple times concurrently.
2007+ */
2008+ cb = ps->callbacks + ps->callback_count++;
2009+ cb->callback = callback;
2010+ cb->context = callback_context;
2011+
2012+ /*
2013+ * If there are no more exceptions in flight, or we have
2014+ * filled this metadata area we commit the exceptions to
2015+ * disk.
2016+ */
2017+ if (atomic_dec_and_test(&ps->pending_count) ||
2018+ (ps->current_committed == ps->exceptions_per_area)) {
2019+ r = area_io(ps, ps->current_area, WRITE);
2020+ if (r)
2021+ ps->valid = 0;
2022+
2023+ for (i = 0; i < ps->callback_count; i++) {
2024+ cb = ps->callbacks + i;
2025+ cb->callback(cb->context, r == 0 ? 1 : 0);
2026+ }
2027+
2028+ ps->callback_count = 0;
2029+ }
2030+
2031+ /*
2032+ * Have we completely filled the current area ?
2033+ */
2034+ if (ps->current_committed == ps->exceptions_per_area) {
2035+ ps->current_committed = 0;
2036+ r = zero_area(ps, ps->current_area + 1);
2037+ if (r)
2038+ ps->valid = 0;
2039+ }
2040+}
2041+
2042+static void persistent_drop(struct exception_store *store)
2043+{
2044+ struct pstore *ps = get_info(store);
2045+
2046+ ps->valid = 0;
2047+ if (write_header(ps))
2048+ DMWARN("write header failed");
2049+}
2050+
2051+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
2052+{
2053+ int r;
2054+ struct pstore *ps;
2055+
2056+ r = dm_io_get(sectors_to_pages(chunk_size));
2057+ if (r)
2058+ return r;
2059+
2060+ /* allocate the pstore */
2061+ ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2062+ if (!ps) {
2063+ r = -ENOMEM;
2064+ goto bad;
2065+ }
2066+
2067+ ps->snap = store->snap;
2068+ ps->valid = 1;
2069+ ps->version = SNAPSHOT_DISK_VERSION;
2070+ ps->chunk_size = chunk_size;
2071+ ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
2072+ sizeof(struct disk_exception);
2073+ ps->next_free = 2; /* skipping the header and first area */
2074+ ps->current_committed = 0;
2075+
2076+ r = alloc_area(ps);
2077+ if (r)
2078+ goto bad;
2079+
2080+ /*
2081+ * Allocate space for all the callbacks.
2082+ */
2083+ ps->callback_count = 0;
2084+ atomic_set(&ps->pending_count, 0);
2085+ ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
2086+ sizeof(*ps->callbacks));
2087+
2088+ if (!ps->callbacks) {
2089+ r = -ENOMEM;
2090+ goto bad;
2091+ }
2092+
2093+ store->destroy = persistent_destroy;
2094+ store->read_metadata = persistent_read_metadata;
2095+ store->prepare_exception = persistent_prepare;
2096+ store->commit_exception = persistent_commit;
2097+ store->drop_snapshot = persistent_drop;
2098+ store->fraction_full = persistent_fraction_full;
2099+ store->context = ps;
2100+
2101+ return 0;
2102+
2103+ bad:
2104+ dm_io_put(sectors_to_pages(chunk_size));
2105+ if (ps) {
2106+ if (ps->callbacks)
2107+ vfree(ps->callbacks);
2108+
2109+ kfree(ps);
2110+ }
2111+ return r;
2112+}
2113+
2114+/*-----------------------------------------------------------------
2115+ * Implementation of the store for non-persistent snapshots.
2116+ *---------------------------------------------------------------*/
2117+struct transient_c {
2118+ sector_t next_free;
2119+};
2120+
2121+void transient_destroy(struct exception_store *store)
2122+{
2123+ kfree(store->context);
2124+}
2125+
2126+int transient_read_metadata(struct exception_store *store)
2127+{
2128+ return 0;
2129+}
2130+
2131+int transient_prepare(struct exception_store *store, struct exception *e)
2132+{
2133+ struct transient_c *tc = (struct transient_c *) store->context;
2134+ sector_t size = get_dev_size(store->snap->cow->dev);
2135+
2136+ if (size < (tc->next_free + store->snap->chunk_size))
2137+ return -1;
2138+
2139+ e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
2140+ tc->next_free += store->snap->chunk_size;
2141+
2142+ return 0;
2143+}
2144+
2145+void transient_commit(struct exception_store *store,
2146+ struct exception *e,
2147+ void (*callback) (void *, int success),
2148+ void *callback_context)
2149+{
2150+ /* Just succeed */
2151+ callback(callback_context, 1);
2152+}
2153+
2154+static void transient_fraction_full(struct exception_store *store,
2155+ sector_t *numerator, sector_t *denominator)
2156+{
2157+ *numerator = ((struct transient_c *) store->context)->next_free;
2158+ *denominator = get_dev_size(store->snap->cow->dev);
2159+}
2160+
2161+int dm_create_transient(struct exception_store *store,
2162+ struct dm_snapshot *s, int blocksize)
2163+{
2164+ struct transient_c *tc;
2165+
2166+ memset(store, 0, sizeof(*store));
2167+ store->destroy = transient_destroy;
2168+ store->read_metadata = transient_read_metadata;
2169+ store->prepare_exception = transient_prepare;
2170+ store->commit_exception = transient_commit;
2171+ store->fraction_full = transient_fraction_full;
2172+ store->snap = s;
2173+
2174+ tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
2175+ if (!tc)
2176+ return -ENOMEM;
2177+
2178+ tc->next_free = 0;
2179+ store->context = tc;
2180+
2181+ return 0;
2182+}
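To make the persistent store geometry above concrete (a worked example; it assumes the on-disk struct disk_exception is a pair of 64-bit chunk numbers, i.e. 16 bytes): creating a store with chunk_size = 16 sectors gives exceptions_per_area = (16 << SECTOR_SHIFT) / 16 = 8192 / 16 = 512, so each metadata area holds 512 exception records, next_free starts at chunk 2 (just past the header and the first area), and zero_area() is called for the following area each time current_committed wraps back to 0.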
2183diff -urN linux-2.4.24.org/drivers/md/dm.h linux-2.4.24/drivers/md/dm.h
2184--- linux-2.4.24.org/drivers/md/dm.h 1970-01-01 01:00:00.000000000 +0100
2185+++ linux-2.4.24/drivers/md/dm.h 2004-01-18 15:01:29.219471722 +0100
2186@@ -0,0 +1,176 @@
2187+/*
2188+ * Internal header file for device mapper
2189+ *
2190+ * Copyright (C) 2001, 2002 Sistina Software
2191+ *
2192+ * This file is released under the LGPL.
2193+ */
2194+
2195+#ifndef DM_INTERNAL_H
2196+#define DM_INTERNAL_H
2197+
2198+#include <linux/fs.h>
2199+#include <linux/device-mapper.h>
2200+#include <linux/list.h>
2201+#include <linux/blkdev.h>
2202+
2203+#define DM_NAME "device-mapper"
2204+#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
2205+#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
2206+#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
2207+
2208+/*
2209+ * FIXME: I think this should be with the definition of sector_t
2210+ * in types.h.
2211+ */
2212+#ifdef CONFIG_LBD
2213+#define SECTOR_FORMAT "%Lu"
2214+#else
2215+#define SECTOR_FORMAT "%lu"
2216+#endif
2217+
2218+#define SECTOR_SHIFT 9
2219+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
2220+
2221+extern struct block_device_operations dm_blk_dops;
2222+
2223+/*
2224+ * List of devices that a metadevice uses and should open/close.
2225+ */
2226+struct dm_dev {
2227+ struct list_head list;
2228+
2229+ atomic_t count;
2230+ int mode;
2231+ kdev_t dev;
2232+ struct block_device *bdev;
2233+};
2234+
2235+struct dm_table;
2236+struct mapped_device;
2237+
2238+/*-----------------------------------------------------------------
2239+ * Functions for manipulating a struct mapped_device.
2240+ * Drop the reference with dm_put when you finish with the object.
2241+ *---------------------------------------------------------------*/
2242+int dm_create(kdev_t dev, struct mapped_device **md);
2243+
2244+/*
2245+ * Reference counting for md.
2246+ */
2247+void dm_get(struct mapped_device *md);
2248+void dm_put(struct mapped_device *md);
2249+
2250+/*
2251+ * A device can still be used while suspended, but I/O is deferred.
2252+ */
2253+int dm_suspend(struct mapped_device *md);
2254+int dm_resume(struct mapped_device *md);
2255+
2256+/*
2257+ * The device must be suspended before calling this method.
2258+ */
2259+int dm_swap_table(struct mapped_device *md, struct dm_table *t);
2260+
2261+/*
2262+ * Drop a reference on the table when you've finished with the
2263+ * result.
2264+ */
2265+struct dm_table *dm_get_table(struct mapped_device *md);
2266+
2267+/*
2268+ * Event functions.
2269+ */
2270+uint32_t dm_get_event_nr(struct mapped_device *md);
2271+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
2272+ uint32_t event_nr);
2273+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
2274+
2275+/*
2276+ * Info functions.
2277+ */
2278+kdev_t dm_kdev(struct mapped_device *md);
2279+int dm_suspended(struct mapped_device *md);
2280+
2281+/*-----------------------------------------------------------------
2282+ * Functions for manipulating a table. Tables are also reference
2283+ * counted.
2284+ *---------------------------------------------------------------*/
2285+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
2286+
2287+void dm_table_get(struct dm_table *t);
2288+void dm_table_put(struct dm_table *t);
2289+
2290+int dm_table_add_target(struct dm_table *t, const char *type,
2291+ sector_t start, sector_t len, char *params);
2292+int dm_table_complete(struct dm_table *t);
2293+void dm_table_event_callback(struct dm_table *t,
2294+ void (*fn)(void *), void *context);
2295+void dm_table_event(struct dm_table *t);
2296+sector_t dm_table_get_size(struct dm_table *t);
2297+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
2298+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
2299+unsigned int dm_table_get_num_targets(struct dm_table *t);
2300+struct list_head *dm_table_get_devices(struct dm_table *t);
2301+int dm_table_get_mode(struct dm_table *t);
2302+void dm_table_suspend_targets(struct dm_table *t);
2303+void dm_table_resume_targets(struct dm_table *t);
2304+
2305+/*-----------------------------------------------------------------
2306+ * A registry of target types.
2307+ *---------------------------------------------------------------*/
2308+int dm_target_init(void);
2309+void dm_target_exit(void);
2310+struct target_type *dm_get_target_type(const char *name);
2311+void dm_put_target_type(struct target_type *t);
2312+
2313+
2314+/*-----------------------------------------------------------------
2315+ * Useful inlines.
2316+ *---------------------------------------------------------------*/
2317+static inline int array_too_big(unsigned long fixed, unsigned long obj,
2318+ unsigned long num)
2319+{
2320+ return (num > (ULONG_MAX - fixed) / obj);
2321+}
2322+
2323+/*
2324+ * ceiling(n / size) * size
2325+ */
2326+static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
2327+{
2328+ unsigned long r = n % size;
2329+ return n + (r ? (size - r) : 0);
2330+}
2331+
2332+/*
2333+ * Ceiling(n / size)
2334+ */
2335+static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
2336+{
2337+ return dm_round_up(n, size) / size;
2338+}
2339+
2340+const char *dm_kdevname(kdev_t dev);
2341+void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
2342+
2343+/*
2344+ * The device-mapper can be driven through one of two interfaces:
2345+ * ioctl or filesystem, depending on which patch you have applied.
2346+ */
2347+int dm_interface_init(void);
2348+void dm_interface_exit(void);
2349+
2350+/*
2351+ * Targets for linear and striped mappings
2352+ */
2353+int dm_linear_init(void);
2354+void dm_linear_exit(void);
2355+
2356+int dm_stripe_init(void);
2357+void dm_stripe_exit(void);
2358+
2359+int dm_snapshot_init(void);
2360+void dm_snapshot_exit(void);
2361+
2362+#endif
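As a quick worked example of the two rounding helpers above: with n = 10 and size = 8 the remainder r is 2, so dm_round_up(10, 8) returns 10 + (8 - 2) = 16 and dm_div_up(10, 8) returns 16 / 8 = 2; when size already divides n (say n = 16, size = 8), r is 0 and the results are simply 16 and 2.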
2363diff -urN linux-2.4.24.org/drivers/md/dm-io.c linux-2.4.24/drivers/md/dm-io.c
2364--- linux-2.4.24.org/drivers/md/dm-io.c 1970-01-01 01:00:00.000000000 +0100
2365+++ linux-2.4.24/drivers/md/dm-io.c 2004-01-18 15:01:25.790191115 +0100
2366@@ -0,0 +1,361 @@
2367+/*
2368+ * Copyright (C) 2003 Sistina Software
2369+ *
2370+ * This file is released under the GPL.
2371+ */
2372+
2373+#include "dm-io.h"
2374+
2375+#include <linux/mempool.h>
2376+#include <linux/module.h>
2377+#include <linux/slab.h>
2378+#include <linux/sched.h>
2379+#include <linux/bitops.h>
2380+
2381+/* FIXME: can we shrink this ? */
2382+struct io_context {
2383+ int rw;
2384+ unsigned int error;
2385+ atomic_t count;
2386+ struct task_struct *sleeper;
2387+ io_notify_fn callback;
2388+ void *context;
2389+};
2390+
2391+/*
2392+ * We maintain a pool of buffer heads for dispatching the io.
2393+ */
2394+static unsigned int _num_bhs;
2395+static mempool_t *_buffer_pool;
2396+
2397+/*
2398+ * io contexts are only dynamically allocated for asynchronous
2399+ * io. Since async io is likely to be the majority of io we'll
2400+ * have the same number of io contexts as buffer heads ! (FIXME:
2401+ * must reduce this).
2402+ */
2403+mempool_t *_io_pool;
2404+
2405+static void *alloc_bh(int gfp_mask, void *pool_data)
2406+{
2407+ struct buffer_head *bh;
2408+
2409+ bh = kmem_cache_alloc(bh_cachep, gfp_mask);
2410+ if (bh) {
2411+ bh->b_reqnext = NULL;
2412+ init_waitqueue_head(&bh->b_wait);
2413+ INIT_LIST_HEAD(&bh->b_inode_buffers);
2414+ }
2415+
2416+ return bh;
2417+}
2418+
2419+static void *alloc_io(int gfp_mask, void *pool_data)
2420+{
2421+ return kmalloc(sizeof(struct io_context), gfp_mask);
2422+}
2423+
2424+static void free_io(void *element, void *pool_data)
2425+{
2426+ kfree(element);
2427+}
2428+
2429+static unsigned int pages_to_buffers(unsigned int pages)
2430+{
2431+ return 4 * pages; /* too many ? */
2432+}
2433+
2434+static int resize_pool(unsigned int new_bhs)
2435+{
2436+ int r = 0;
2437+
2438+ if (_buffer_pool) {
2439+ if (new_bhs == 0) {
2440+ /* free off the pools */
2441+ mempool_destroy(_buffer_pool);
2442+ mempool_destroy(_io_pool);
2443+ _buffer_pool = _io_pool = NULL;
2444+ } else {
2445+ /* resize the pools */
2446+ r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
2447+ if (!r)
2448+ r = mempool_resize(_io_pool,
2449+ new_bhs, GFP_KERNEL);
2450+ }
2451+ } else {
2452+ /* create new pools */
2453+ _buffer_pool = mempool_create(new_bhs, alloc_bh,
2454+ mempool_free_slab, bh_cachep);
2455+ if (!_buffer_pool)
2456+ r = -ENOMEM;
2457+
2458+ _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
2459+ if (!_io_pool) {
2460+ mempool_destroy(_buffer_pool);
2461+ _buffer_pool = NULL;
2462+ r = -ENOMEM;
2463+ }
2464+ }
2465+
2466+ if (!r)
2467+ _num_bhs = new_bhs;
2468+
2469+ return r;
2470+}
2471+
2472+int dm_io_get(unsigned int num_pages)
2473+{
2474+ return resize_pool(_num_bhs + pages_to_buffers(num_pages));
2475+}
2476+
2477+void dm_io_put(unsigned int num_pages)
2478+{
2479+ resize_pool(_num_bhs - pages_to_buffers(num_pages));
2480+}
2481+
2482+/*-----------------------------------------------------------------
2483+ * We need to keep track of which region a buffer is doing io
2484+ * for. In order to save a memory allocation we store this in an
2485+ * unused field of the buffer head, and provide these access
2486+ * functions.
2487+ *
2488+ * FIXME: add compile time check that an unsigned int can fit
2489+ * into a pointer.
2490+ *
2491+ *---------------------------------------------------------------*/
2492+static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
2493+{
2494+ bh->b_journal_head = (void *) region;
2495+}
2496+
2497+static inline int bh_get_region(struct buffer_head *bh)
2498+{
2499+ return (unsigned int) bh->b_journal_head;
2500+}
2501+
2502+/*-----------------------------------------------------------------
2503+ * We need an io object to keep track of the number of bhs that
2504+ * have been dispatched for a particular io.
2505+ *---------------------------------------------------------------*/
2506+static void dec_count(struct io_context *io, unsigned int region, int error)
2507+{
2508+ if (error)
2509+ set_bit(region, &io->error);
2510+
2511+ if (atomic_dec_and_test(&io->count)) {
2512+ if (io->sleeper)
2513+ wake_up_process(io->sleeper);
2514+
2515+ else {
2516+ int r = io->error;
2517+ io_notify_fn fn = io->callback;
2518+ void *context = io->context;
2519+
2520+ mempool_free(io, _io_pool);
2521+ fn(r, context);
2522+ }
2523+ }
2524+}
2525+
2526+static void endio(struct buffer_head *bh, int uptodate)
2527+{
2528+ struct io_context *io = (struct io_context *) bh->b_private;
2529+
2530+ if (!uptodate && io->rw != WRITE) {
2531+ /*
2532+ * We need to zero this region, otherwise people
2533+ * like kcopyd may write the arbitrary contents
2534+ * of the page.
2535+ */
2536+ memset(bh->b_data, 0, bh->b_size);
2537+ }
2538+
2539+ dec_count((struct io_context *) bh->b_private,
2540+ bh_get_region(bh), !uptodate);
2541+ mempool_free(bh, _buffer_pool);
2542+}
2543+
2544+/*
2545+ * Primitives for alignment calculations.
2546+ */
2547+int fls(unsigned n)
2548+{
2549+ return generic_fls32(n);
2550+}
2551+
2552+static inline int log2_floor(unsigned n)
2553+{
2554+ return ffs(n) - 1;
2555+}
2556+
2557+static inline int log2_align(unsigned n)
2558+{
2559+ return fls(n) - 1;
2560+}
2561+
2562+/*
2563+ * Returns the next block for io.
2564+ */
2565+static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
2566+ unsigned int block_size,
2567+ struct page *p, unsigned int offset,
2568+ unsigned int region, struct io_context *io)
2569+{
2570+ struct buffer_head *bh;
2571+ sector_t b = *block;
2572+ sector_t blocks_per_page = PAGE_SIZE / block_size;
2573+ unsigned int this_size; /* holds the size of the current io */
2574+ sector_t len;
2575+
2576+ if (!blocks_per_page) {
2577+ DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported",
2578+ PAGE_SIZE, block_size);
2579+ return 0;
2580+ }
2581+
2582+ while ((offset < PAGE_SIZE) && (b != end_block)) {
2583+ bh = mempool_alloc(_buffer_pool, GFP_NOIO);
2584+ init_buffer(bh, endio, io);
2585+ bh_set_region(bh, region);
2586+
2587+ /*
2588+ * Block size must be a power of 2 and aligned
2589+ * correctly.
2590+ */
2591+
2592+ len = min(end_block - b, blocks_per_page);
2593+ len = min(len, blocks_per_page - offset / block_size);
2594+
2595+ if (!len) {
2596+ DMERR("dm-io: Invalid offset/block_size (%u/%u).",
2597+ offset, block_size);
2598+ return 0;
2599+ }
2600+
2601+ this_size = 1 << log2_align(len);
2602+ if (b)
2603+ this_size = min(this_size,
2604+ (unsigned) 1 << log2_floor(b));
2605+
2606+ /*
2607+ * Add in the job offset.
2608+ */
2609+ bh->b_blocknr = (b / this_size);
2610+ bh->b_size = block_size * this_size;
2611+ set_bh_page(bh, p, offset);
2612+ bh->b_this_page = bh;
2613+
2614+ bh->b_dev = dev;
2615+ atomic_set(&bh->b_count, 1);
2616+
2617+ bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
2618+ (1 << BH_Lock));
2619+
2620+ if (io->rw == WRITE)
2621+ clear_bit(BH_Dirty, &bh->b_state);
2622+
2623+ atomic_inc(&io->count);
2624+ submit_bh(io->rw, bh);
2625+
2626+ b += this_size;
2627+ offset += block_size * this_size;
2628+ }
2629+
2630+ *block = b;
2631+ return (b == end_block);
2632+}
2633+
2634+static void do_region(unsigned int region, struct io_region *where,
2635+ struct page *page, unsigned int offset,
2636+ struct io_context *io)
2637+{
2638+ unsigned int block_size = get_hardsect_size(where->dev);
2639+ unsigned int sblock_size = block_size >> 9;
2640+ sector_t block = where->sector / sblock_size;
2641+ sector_t end_block = (where->sector + where->count) / sblock_size;
2642+
2643+ while (1) {
2644+ if (do_page(where->dev, &block, end_block, block_size,
2645+ page, offset, region, io))
2646+ break;
2647+
2648+ offset = 0; /* only offset the first page */
2649+
2650+ page = list_entry(page->list.next, struct page, list);
2651+ }
2652+}
2653+
2654+static void dispatch_io(unsigned int num_regions, struct io_region *where,
2655+ struct page *pages, unsigned int offset,
2656+ struct io_context *io)
2657+{
2658+ int i;
2659+
2660+ for (i = 0; i < num_regions; i++)
2661+ if (where[i].count)
2662+ do_region(i, where + i, pages, offset, io);
2663+
2664+ /*
2665+ * Drop the extra reference that we were holding to avoid
2666+ * the io being completed too early.
2667+ */
2668+ dec_count(io, 0, 0);
2669+}
2670+
2671+/*
2672+ * Synchronous io
2673+ */
2674+int dm_io_sync(unsigned int num_regions, struct io_region *where,
2675+ int rw, struct page *pages, unsigned int offset,
2676+ unsigned int *error_bits)
2677+{
2678+ struct io_context io;
2679+
2680+ BUG_ON(num_regions > 1 && rw != WRITE);
2681+
2682+ io.rw = rw;
2683+ io.error = 0;
2684+ atomic_set(&io.count, 1); /* see dispatch_io() */
2685+ io.sleeper = current;
2686+
2687+ dispatch_io(num_regions, where, pages, offset, &io);
2688+ run_task_queue(&tq_disk);
2689+
2690+ while (1) {
2691+ set_current_state(TASK_UNINTERRUPTIBLE);
2692+
2693+ if (!atomic_read(&io.count))
2694+ break;
2695+
2696+ schedule();
2697+ }
2698+ set_current_state(TASK_RUNNING);
2699+
2700+ *error_bits = io.error;
2701+ return io.error ? -EIO : 0;
2702+}
2703+
2704+/*
2705+ * Asynchronous io
2706+ */
2707+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
2708+ struct page *pages, unsigned int offset,
2709+ io_notify_fn fn, void *context)
2710+{
2711+ struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
2712+
2713+ io->rw = rw;
2714+ io->error = 0;
2715+ atomic_set(&io->count, 1); /* see dispatch_io() */
2716+ io->sleeper = NULL;
2717+ io->callback = fn;
2718+ io->context = context;
2719+
2720+ dispatch_io(num_regions, where, pages, offset, io);
2721+ return 0;
2722+}
2723+
2724+EXPORT_SYMBOL(dm_io_get);
2725+EXPORT_SYMBOL(dm_io_put);
2726+EXPORT_SYMBOL(dm_io_sync);
2727+EXPORT_SYMBOL(dm_io_async);
2728diff -urN linux-2.4.24.org/drivers/md/dm-ioctl.c linux-2.4.24/drivers/md/dm-ioctl.c
2729--- linux-2.4.24.org/drivers/md/dm-ioctl.c 1970-01-01 01:00:00.000000000 +0100
2730+++ linux-2.4.24/drivers/md/dm-ioctl.c 2004-01-18 15:01:17.790869761 +0100
2731@@ -0,0 +1,1284 @@
2732+/*
2733+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
2734+ *
2735+ * This file is released under the GPL.
2736+ */
2737+
2738+#include "dm.h"
2739+
2740+#include <linux/module.h>
2741+#include <linux/vmalloc.h>
2742+#include <linux/miscdevice.h>
2743+#include <linux/dm-ioctl.h>
2744+#include <linux/init.h>
2745+#include <linux/wait.h>
2746+#include <linux/blk.h>
2747+#include <linux/slab.h>
2748+
2749+#include <asm/uaccess.h>
2750+
2751+#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
2752+
2753+/*-----------------------------------------------------------------
2754+ * The ioctl interface needs to be able to look up devices by
2755+ * name or uuid.
2756+ *---------------------------------------------------------------*/
2757+struct hash_cell {
2758+ struct list_head name_list;
2759+ struct list_head uuid_list;
2760+
2761+ char *name;
2762+ char *uuid;
2763+ struct mapped_device *md;
2764+ struct dm_table *new_map;
2765+
2766+ /* I hate devfs */
2767+ devfs_handle_t devfs_entry;
2768+};
2769+
2770+#define NUM_BUCKETS 64
2771+#define MASK_BUCKETS (NUM_BUCKETS - 1)
2772+static struct list_head _name_buckets[NUM_BUCKETS];
2773+static struct list_head _uuid_buckets[NUM_BUCKETS];
2774+
2775+static devfs_handle_t _dev_dir;
2776+void dm_hash_remove_all(void);
2777+
2778+/*
2779+ * Guards access to both hash tables.
2780+ */
2781+static DECLARE_RWSEM(_hash_lock);
2782+
2783+static void init_buckets(struct list_head *buckets)
2784+{
2785+ unsigned int i;
2786+
2787+ for (i = 0; i < NUM_BUCKETS; i++)
2788+ INIT_LIST_HEAD(buckets + i);
2789+}
2790+
2791+int dm_hash_init(void)
2792+{
2793+ init_buckets(_name_buckets);
2794+ init_buckets(_uuid_buckets);
2795+ _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
2796+ return 0;
2797+}
2798+
2799+void dm_hash_exit(void)
2800+{
2801+ dm_hash_remove_all();
2802+ devfs_unregister(_dev_dir);
2803+}
2804+
2805+/*-----------------------------------------------------------------
2806+ * Hash function:
2807+ * We're not really concerned with the str hash function being
2808+ * fast since it's only used by the ioctl interface.
2809+ *---------------------------------------------------------------*/
2810+static unsigned int hash_str(const char *str)
2811+{
2812+ const unsigned int hash_mult = 2654435387U;
2813+ unsigned int h = 0;
2814+
2815+ while (*str)
2816+ h = (h + (unsigned int) *str++) * hash_mult;
2817+
2818+ return h & MASK_BUCKETS;
2819+}
2820+
2821+/*-----------------------------------------------------------------
2822+ * Code for looking up a device by name
2823+ *---------------------------------------------------------------*/
2824+static struct hash_cell *__get_name_cell(const char *str)
2825+{
2826+ struct list_head *tmp;
2827+ struct hash_cell *hc;
2828+ unsigned int h = hash_str(str);
2829+
2830+ list_for_each (tmp, _name_buckets + h) {
2831+ hc = list_entry(tmp, struct hash_cell, name_list);
2832+ if (!strcmp(hc->name, str))
2833+ return hc;
2834+ }
2835+
2836+ return NULL;
2837+}
2838+
2839+static struct hash_cell *__get_uuid_cell(const char *str)
2840+{
2841+ struct list_head *tmp;
2842+ struct hash_cell *hc;
2843+ unsigned int h = hash_str(str);
2844+
2845+ list_for_each (tmp, _uuid_buckets + h) {
2846+ hc = list_entry(tmp, struct hash_cell, uuid_list);
2847+ if (!strcmp(hc->uuid, str))
2848+ return hc;
2849+ }
2850+
2851+ return NULL;
2852+}
2853+
2854+/*-----------------------------------------------------------------
2855+ * Inserting, removing and renaming a device.
2856+ *---------------------------------------------------------------*/
2857+static inline char *kstrdup(const char *str)
2858+{
2859+ char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
2860+ if (r)
2861+ strcpy(r, str);
2862+ return r;
2863+}
2864+
2865+static struct hash_cell *alloc_cell(const char *name, const char *uuid,
2866+ struct mapped_device *md)
2867+{
2868+ struct hash_cell *hc;
2869+
2870+ hc = kmalloc(sizeof(*hc), GFP_KERNEL);
2871+ if (!hc)
2872+ return NULL;
2873+
2874+ hc->name = kstrdup(name);
2875+ if (!hc->name) {
2876+ kfree(hc);
2877+ return NULL;
2878+ }
2879+
2880+ if (!uuid)
2881+ hc->uuid = NULL;
2882+
2883+ else {
2884+ hc->uuid = kstrdup(uuid);
2885+ if (!hc->uuid) {
2886+ kfree(hc->name);
2887+ kfree(hc);
2888+ return NULL;
2889+ }
2890+ }
2891+
2892+ INIT_LIST_HEAD(&hc->name_list);
2893+ INIT_LIST_HEAD(&hc->uuid_list);
2894+ hc->md = md;
2895+ hc->new_map = NULL;
2896+ return hc;
2897+}
2898+
2899+static void free_cell(struct hash_cell *hc)
2900+{
2901+ if (hc) {
2902+ kfree(hc->name);
2903+ kfree(hc->uuid);
2904+ kfree(hc);
2905+ }
2906+}
2907+
2908+/*
2909+ * devfs stuff.
2910+ */
2911+static int register_with_devfs(struct hash_cell *hc)
2912+{
2913+ kdev_t dev = dm_kdev(hc->md);
2914+
2915+ hc->devfs_entry =
2916+ devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
2917+ major(dev), minor(dev),
2918+ S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
2919+ &dm_blk_dops, NULL);
2920+
2921+ return 0;
2922+}
2923+
2924+static int unregister_with_devfs(struct hash_cell *hc)
2925+{
2926+ devfs_unregister(hc->devfs_entry);
2927+ return 0;
2928+}
2929+
2930+/*
2931+ * The kdev_t and uuid of a device can never change once it is
2932+ * initially inserted.
2933+ */
2934+int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
2935+{
2936+ struct hash_cell *cell;
2937+
2938+ /*
2939+ * Allocate the new cells.
2940+ */
2941+ cell = alloc_cell(name, uuid, md);
2942+ if (!cell)
2943+ return -ENOMEM;
2944+
2945+ /*
2946+ * Insert the cell into both hash tables.
2947+ */
2948+ down_write(&_hash_lock);
2949+ if (__get_name_cell(name))
2950+ goto bad;
2951+
2952+ list_add(&cell->name_list, _name_buckets + hash_str(name));
2953+
2954+ if (uuid) {
2955+ if (__get_uuid_cell(uuid)) {
2956+ list_del(&cell->name_list);
2957+ goto bad;
2958+ }
2959+ list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
2960+ }
2961+ register_with_devfs(cell);
2962+ dm_get(md);
2963+ up_write(&_hash_lock);
2964+
2965+ return 0;
2966+
2967+ bad:
2968+ up_write(&_hash_lock);
2969+ free_cell(cell);
2970+ return -EBUSY;
2971+}
2972+
2973+void __hash_remove(struct hash_cell *hc)
2974+{
2975+ /* remove from the dev hash */
2976+ list_del(&hc->uuid_list);
2977+ list_del(&hc->name_list);
2978+ unregister_with_devfs(hc);
2979+ dm_put(hc->md);
2980+ if (hc->new_map)
2981+ dm_table_put(hc->new_map);
2982+ free_cell(hc);
2983+}
2984+
2985+void dm_hash_remove_all(void)
2986+{
2987+ int i;
2988+ struct hash_cell *hc;
2989+ struct list_head *tmp, *n;
2990+
2991+ down_write(&_hash_lock);
2992+ for (i = 0; i < NUM_BUCKETS; i++) {
2993+ list_for_each_safe (tmp, n, _name_buckets + i) {
2994+ hc = list_entry(tmp, struct hash_cell, name_list);
2995+ __hash_remove(hc);
2996+ }
2997+ }
2998+ up_write(&_hash_lock);
2999+}
3000+
3001+int dm_hash_rename(const char *old, const char *new)
3002+{
3003+ char *new_name, *old_name;
3004+ struct hash_cell *hc;
3005+
3006+ /*
3007+ * duplicate new.
3008+ */
3009+ new_name = kstrdup(new);
3010+ if (!new_name)
3011+ return -ENOMEM;
3012+
3013+ down_write(&_hash_lock);
3014+
3015+ /*
3016+ * Is new free ?
3017+ */
3018+ hc = __get_name_cell(new);
3019+ if (hc) {
3020+ DMWARN("asked to rename to an already existing name %s -> %s",
3021+ old, new);
3022+ up_write(&_hash_lock);
3023+ kfree(new_name);
3024+ return -EBUSY;
3025+ }
3026+
3027+ /*
3028+ * Is there such a device as 'old' ?
3029+ */
3030+ hc = __get_name_cell(old);
3031+ if (!hc) {
3032+ DMWARN("asked to rename a non existent device %s -> %s",
3033+ old, new);
3034+ up_write(&_hash_lock);
3035+ kfree(new_name);
3036+ return -ENXIO;
3037+ }
3038+
3039+ /*
3040+ * rename and move the name cell.
3041+ */
3042+ list_del(&hc->name_list);
3043+ old_name = hc->name;
3044+ hc->name = new_name;
3045+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
3046+
3047+ /* rename the device node in devfs */
3048+ unregister_with_devfs(hc);
3049+ register_with_devfs(hc);
3050+
3051+ up_write(&_hash_lock);
3052+ kfree(old_name);
3053+ return 0;
3054+}
3055+
3056+/*-----------------------------------------------------------------
3057+ * Implementation of the ioctl commands
3058+ *---------------------------------------------------------------*/
3059+/*
3060+ * All the ioctl commands get dispatched to functions with this
3061+ * prototype.
3062+ */
3063+typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
3064+
3065+static int remove_all(struct dm_ioctl *param, size_t param_size)
3066+{
3067+ dm_hash_remove_all();
3068+ param->data_size = 0;
3069+ return 0;
3070+}
3071+
3072+/*
3073+ * Round up the ptr to an 8-byte boundary.
3074+ */
3075+#define ALIGN_MASK 7
3076+static inline void *align_ptr(void *ptr)
3077+{
3078+ return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
3079+}
3080+
3081+/*
3082+ * Retrieves the data payload buffer from an already allocated
3083+ * struct dm_ioctl.
3084+ */
3085+static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
3086+ size_t *len)
3087+{
3088+ param->data_start = align_ptr(param + 1) - (void *) param;
3089+
3090+ if (param->data_start < param_size)
3091+ *len = param_size - param->data_start;
3092+ else
3093+ *len = 0;
3094+
3095+ return ((void *) param) + param->data_start;
3096+}
3097+
3098+static int list_devices(struct dm_ioctl *param, size_t param_size)
3099+{
3100+ unsigned int i;
3101+ struct hash_cell *hc;
3102+ size_t len, needed = 0;
3103+ struct dm_name_list *nl, *old_nl = NULL;
3104+
3105+ down_write(&_hash_lock);
3106+
3107+ /*
3108+ * Loop through all the devices working out how much
3109+ * space we need.
3110+ */
3111+ for (i = 0; i < NUM_BUCKETS; i++) {
3112+ list_for_each_entry (hc, _name_buckets + i, name_list) {
3113+ needed += sizeof(struct dm_name_list);
3114+ needed += strlen(hc->name);
3115+ needed += ALIGN_MASK;
3116+ }
3117+ }
3118+
3119+ /*
3120+ * Grab our output buffer.
3121+ */
3122+ nl = get_result_buffer(param, param_size, &len);
3123+ if (len < needed) {
3124+ param->flags |= DM_BUFFER_FULL_FLAG;
3125+ goto out;
3126+ }
3127+ param->data_size = param->data_start + needed;
3128+
3129+ nl->dev = 0; /* Flags no data */
3130+
3131+ /*
3132+ * Now loop through filling out the names.
3133+ */
3134+ for (i = 0; i < NUM_BUCKETS; i++) {
3135+ list_for_each_entry (hc, _name_buckets + i, name_list) {
3136+ if (old_nl)
3137+ old_nl->next = (uint32_t) ((void *) nl -
3138+ (void *) old_nl);
3139+
3140+ nl->dev = dm_kdev(hc->md);
3141+ nl->next = 0;
3142+ strcpy(nl->name, hc->name);
3143+
3144+ old_nl = nl;
3145+ nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
3146+ }
3147+ }
3148+
3149+ out:
3150+ up_write(&_hash_lock);
3151+ return 0;
3152+}
3153+
3154+static int check_name(const char *name)
3155+{
3156+ if (strchr(name, '/')) {
3157+ DMWARN("invalid device name");
3158+ return -EINVAL;
3159+ }
3160+
3161+ return 0;
3162+}
3163+
3164+/*
3165+ * Fills in a dm_ioctl structure, ready for sending back to
3166+ * userland.
3167+ */
3168+static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
3169+{
3170+ kdev_t dev = dm_kdev(md);
3171+ struct dm_table *table;
3172+ struct block_device *bdev;
3173+
3174+ param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
3175+ DM_ACTIVE_PRESENT_FLAG);
3176+
3177+ if (dm_suspended(md))
3178+ param->flags |= DM_SUSPEND_FLAG;
3179+
3180+ param->dev = kdev_t_to_nr(dev);
3181+
3182+ if (is_read_only(dev))
3183+ param->flags |= DM_READONLY_FLAG;
3184+
3185+ param->event_nr = dm_get_event_nr(md);
3186+
3187+ table = dm_get_table(md);
3188+ if (table) {
3189+ param->flags |= DM_ACTIVE_PRESENT_FLAG;
3190+ param->target_count = dm_table_get_num_targets(table);
3191+ dm_table_put(table);
3192+ } else
3193+ param->target_count = 0;
3194+
3195+ bdev = bdget(param->dev);
3196+ if (!bdev)
3197+ return -ENXIO;
3198+ param->open_count = bdev->bd_openers;
3199+ bdput(bdev);
3200+
3201+ return 0;
3202+}
3203+
3204+static int dev_create(struct dm_ioctl *param, size_t param_size)
3205+{
3206+ int r;
3207+ kdev_t dev = 0;
3208+ struct mapped_device *md;
3209+
3210+ r = check_name(param->name);
3211+ if (r)
3212+ return r;
3213+
3214+ if (param->flags & DM_PERSISTENT_DEV_FLAG)
3215+ dev = to_kdev_t(param->dev);
3216+
3217+ r = dm_create(dev, &md);
3218+ if (r)
3219+ return r;
3220+
3221+ r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
3222+ if (r) {
3223+ dm_put(md);
3224+ return r;
3225+ }
3226+
3227+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3228+
3229+ r = __dev_status(md, param);
3230+ dm_put(md);
3231+
3232+ return r;
3233+}
3234+
3235+/*
3236+ * Always use UUID for lookups if it's present, otherwise use name.
3237+ */
3238+static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
3239+{
3240+ return *param->uuid ?
3241+ __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
3242+}
3243+
3244+static inline struct mapped_device *find_device(struct dm_ioctl *param)
3245+{
3246+ struct hash_cell *hc;
3247+ struct mapped_device *md = NULL;
3248+
3249+ down_read(&_hash_lock);
3250+ hc = __find_device_hash_cell(param);
3251+ if (hc) {
3252+ md = hc->md;
3253+
3254+ /*
3255+ * Sneakily write in both the name and the uuid
3256+ * while we have the cell.
3257+ */
3258+ strncpy(param->name, hc->name, sizeof(param->name));
3259+ if (hc->uuid)
3260+ strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
3261+ else
3262+ param->uuid[0] = '\0';
3263+
3264+ if (hc->new_map)
3265+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
3266+ else
3267+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3268+
3269+ dm_get(md);
3270+ }
3271+ up_read(&_hash_lock);
3272+
3273+ return md;
3274+}
3275+
3276+static int dev_remove(struct dm_ioctl *param, size_t param_size)
3277+{
3278+ struct hash_cell *hc;
3279+
3280+ down_write(&_hash_lock);
3281+ hc = __find_device_hash_cell(param);
3282+
3283+ if (!hc) {
3284+ DMWARN("device doesn't appear to be in the dev hash table.");
3285+ up_write(&_hash_lock);
3286+ return -ENXIO;
3287+ }
3288+
3289+ __hash_remove(hc);
3290+ up_write(&_hash_lock);
3291+ param->data_size = 0;
3292+ return 0;
3293+}
3294+
3295+/*
3296+ * Check a string doesn't overrun the chunk of
3297+ * memory we copied from userland.
3298+ */
3299+static int invalid_str(char *str, void *end)
3300+{
3301+ while ((void *) str < end)
3302+ if (!*str++)
3303+ return 0;
3304+
3305+ return -EINVAL;
3306+}
3307+
3308+static int dev_rename(struct dm_ioctl *param, size_t param_size)
3309+{
3310+ int r;
3311+ char *new_name = (char *) param + param->data_start;
3312+
3313+ if (new_name < (char *) (param + 1) ||
3314+ invalid_str(new_name, (void *) param + param_size)) {
3315+ DMWARN("Invalid new logical volume name supplied.");
3316+ return -EINVAL;
3317+ }
3318+
3319+ r = check_name(new_name);
3320+ if (r)
3321+ return r;
3322+
3323+ param->data_size = 0;
3324+ return dm_hash_rename(param->name, new_name);
3325+}
3326+
3327+static int do_suspend(struct dm_ioctl *param)
3328+{
3329+ int r = 0;
3330+ struct mapped_device *md;
3331+
3332+ md = find_device(param);
3333+ if (!md)
3334+ return -ENXIO;
3335+
3336+ if (!dm_suspended(md))
3337+ r = dm_suspend(md);
3338+
3339+ if (!r)
3340+ r = __dev_status(md, param);
3341+
3342+ dm_put(md);
3343+ return r;
3344+}
3345+
3346+static int do_resume(struct dm_ioctl *param)
3347+{
3348+ int r = 0;
3349+ struct hash_cell *hc;
3350+ struct mapped_device *md;
3351+ struct dm_table *new_map;
3352+
3353+ down_write(&_hash_lock);
3354+
3355+ hc = __find_device_hash_cell(param);
3356+ if (!hc) {
3357+ DMWARN("device doesn't appear to be in the dev hash table.");
3358+ up_write(&_hash_lock);
3359+ return -ENXIO;
3360+ }
3361+
3362+ md = hc->md;
3363+ dm_get(md);
3364+
3365+ new_map = hc->new_map;
3366+ hc->new_map = NULL;
3367+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3368+
3369+ up_write(&_hash_lock);
3370+
3371+ /* Do we need to load a new map ? */
3372+ if (new_map) {
3373+ /* Suspend if it isn't already suspended */
3374+ if (!dm_suspended(md))
3375+ dm_suspend(md);
3376+
3377+ r = dm_swap_table(md, new_map);
3378+ if (r) {
3379+ dm_put(md);
3380+ dm_table_put(new_map);
3381+ return r;
3382+ }
3383+
3384+ if (dm_table_get_mode(new_map) & FMODE_WRITE)
3385+ set_device_ro(dm_kdev(md), 0);
3386+ else
3387+ set_device_ro(dm_kdev(md), 1);
3388+
3389+ dm_table_put(new_map);
3390+ }
3391+
3392+ if (dm_suspended(md))
3393+ r = dm_resume(md);
3394+
3395+ if (!r)
3396+ r = __dev_status(md, param);
3397+
3398+ dm_put(md);
3399+ return r;
3400+}
3401+
3402+/*
3403+ * Set or unset the suspension state of a device.
3404+ * If the device already is in the requested state we just return its status.
3405+ */
3406+static int dev_suspend(struct dm_ioctl *param, size_t param_size)
3407+{
3408+ if (param->flags & DM_SUSPEND_FLAG)
3409+ return do_suspend(param);
3410+
3411+ return do_resume(param);
3412+}
3413+
3414+/*
3415+ * Copies device info back to user space, used by
3416+ * the create and info ioctls.
3417+ */
3418+static int dev_status(struct dm_ioctl *param, size_t param_size)
3419+{
3420+ int r;
3421+ struct mapped_device *md;
3422+
3423+ md = find_device(param);
3424+ if (!md)
3425+ return -ENXIO;
3426+
3427+ r = __dev_status(md, param);
3428+ dm_put(md);
3429+ return r;
3430+}
3431+
3432+/*
3433+ * Build up the status struct for each target
3434+ */
3435+static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
3436+ size_t param_size)
3437+{
3438+ unsigned int i, num_targets;
3439+ struct dm_target_spec *spec;
3440+ char *outbuf, *outptr;
3441+ status_type_t type;
3442+ size_t remaining, len, used = 0;
3443+
3444+ outptr = outbuf = get_result_buffer(param, param_size, &len);
3445+
3446+ if (param->flags & DM_STATUS_TABLE_FLAG)
3447+ type = STATUSTYPE_TABLE;
3448+ else
3449+ type = STATUSTYPE_INFO;
3450+
3451+ /* Get all the target info */
3452+ num_targets = dm_table_get_num_targets(table);
3453+ for (i = 0; i < num_targets; i++) {
3454+ struct dm_target *ti = dm_table_get_target(table, i);
3455+
3456+ remaining = len - (outptr - outbuf);
3457+ if (remaining < sizeof(struct dm_target_spec)) {
3458+ param->flags |= DM_BUFFER_FULL_FLAG;
3459+ break;
3460+ }
3461+
3462+ spec = (struct dm_target_spec *) outptr;
3463+
3464+ spec->status = 0;
3465+ spec->sector_start = ti->begin;
3466+ spec->length = ti->len;
3467+ strncpy(spec->target_type, ti->type->name,
3468+ sizeof(spec->target_type));
3469+
3470+ outptr += sizeof(struct dm_target_spec);
3471+ remaining = len - (outptr - outbuf);
3472+
3473+ /* Get the status/table string from the target driver */
3474+ if (ti->type->status) {
3475+ if (ti->type->status(ti, type, outptr, remaining)) {
3476+ param->flags |= DM_BUFFER_FULL_FLAG;
3477+ break;
3478+ }
3479+ } else
3480+ outptr[0] = '\0';
3481+
3482+ outptr += strlen(outptr) + 1;
3483+ used = param->data_start + (outptr - outbuf);
3484+
3485+ align_ptr(outptr);
3486+ spec->next = outptr - outbuf;
3487+ }
3488+
3489+ if (used)
3490+ param->data_size = used;
3491+
3492+ param->target_count = num_targets;
3493+}
3494+
3495+/*
3496+ * Wait for a device to report an event
3497+ */
3498+static int dev_wait(struct dm_ioctl *param, size_t param_size)
3499+{
3500+ int r;
3501+ struct mapped_device *md;
3502+ struct dm_table *table;
3503+ DECLARE_WAITQUEUE(wq, current);
3504+
3505+ md = find_device(param);
3506+ if (!md)
3507+ return -ENXIO;
3508+
3509+ /*
3510+ * Wait for a notification event
3511+ */
3512+ set_current_state(TASK_INTERRUPTIBLE);
3513+ if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
3514+ schedule();
3515+ dm_remove_wait_queue(md, &wq);
3516+ }
3517+ set_current_state(TASK_RUNNING);
3518+
3519+ /*
3520+ * The userland program is going to want to know what
3521+ * changed to trigger the event, so we may as well tell
3522+ * him and save an ioctl.
3523+ */
3524+ r = __dev_status(md, param);
3525+ if (r)
3526+ goto out;
3527+
3528+ table = dm_get_table(md);
3529+ if (table) {
3530+ retrieve_status(table, param, param_size);
3531+ dm_table_put(table);
3532+ }
3533+
3534+ out:
3535+ dm_put(md);
3536+ return r;
3537+}
3538+
3539+static inline int get_mode(struct dm_ioctl *param)
3540+{
3541+ int mode = FMODE_READ | FMODE_WRITE;
3542+
3543+ if (param->flags & DM_READONLY_FLAG)
3544+ mode = FMODE_READ;
3545+
3546+ return mode;
3547+}
3548+
3549+static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
3550+ struct dm_target_spec **spec, char **target_params)
3551+{
3552+ *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
3553+ *target_params = (char *) (*spec + 1);
3554+
3555+ if (*spec < (last + 1))
3556+ return -EINVAL;
3557+
3558+ return invalid_str(*target_params, end);
3559+}
3560+
3561+static int populate_table(struct dm_table *table, struct dm_ioctl *param,
3562+ size_t param_size)
3563+{
3564+ int r;
3565+ unsigned int i = 0;
3566+ struct dm_target_spec *spec = (struct dm_target_spec *) param;
3567+ uint32_t next = param->data_start;
3568+ void *end = (void *) param + param_size;
3569+ char *target_params;
3570+
3571+ if (!param->target_count) {
3572+ DMWARN("populate_table: no targets specified");
3573+ return -EINVAL;
3574+ }
3575+
3576+ for (i = 0; i < param->target_count; i++) {
3577+
3578+ r = next_target(spec, next, end, &spec, &target_params);
3579+ if (r) {
3580+ DMWARN("unable to find target");
3581+ return r;
3582+ }
3583+
3584+ r = dm_table_add_target(table, spec->target_type,
3585+ (sector_t) spec->sector_start,
3586+ (sector_t) spec->length,
3587+ target_params);
3588+ if (r) {
3589+ DMWARN("error adding target to table");
3590+ return r;
3591+ }
3592+
3593+ next = spec->next;
3594+ }
3595+
3596+ return dm_table_complete(table);
3597+}
3598+
3599+static int table_load(struct dm_ioctl *param, size_t param_size)
3600+{
3601+ int r;
3602+ struct hash_cell *hc;
3603+ struct dm_table *t;
3604+
3605+ r = dm_table_create(&t, get_mode(param), param->target_count);
3606+ if (r)
3607+ return r;
3608+
3609+ r = populate_table(t, param, param_size);
3610+ if (r) {
3611+ dm_table_put(t);
3612+ return r;
3613+ }
3614+
3615+ down_write(&_hash_lock);
3616+ hc = __find_device_hash_cell(param);
3617+ if (!hc) {
3618+ DMWARN("device doesn't appear to be in the dev hash table.");
3619+ up_write(&_hash_lock);
3620+ return -ENXIO;
3621+ }
3622+
3623+ if (hc->new_map)
3624+ dm_table_put(hc->new_map);
3625+ hc->new_map = t;
3626+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
3627+
3628+ r = __dev_status(hc->md, param);
3629+ up_write(&_hash_lock);
3630+ return r;
3631+}
3632+
3633+static int table_clear(struct dm_ioctl *param, size_t param_size)
3634+{
3635+ int r;
3636+ struct hash_cell *hc;
3637+
3638+ down_write(&_hash_lock);
3639+
3640+ hc = __find_device_hash_cell(param);
3641+ if (!hc) {
3642+ DMWARN("device doesn't appear to be in the dev hash table.");
3643+ up_write(&_hash_lock);
3644+ return -ENXIO;
3645+ }
3646+
3647+ if (hc->new_map) {
3648+ dm_table_put(hc->new_map);
3649+ hc->new_map = NULL;
3650+ }
3651+
3652+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3653+
3654+ r = __dev_status(hc->md, param);
3655+ up_write(&_hash_lock);
3656+ return r;
3657+}
3658+
3659+/*
3660+ * Retrieves a list of devices used by a particular dm device.
3661+ */
3662+static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
3663+ size_t param_size)
3664+{
3665+ unsigned int count = 0;
3666+ struct list_head *tmp;
3667+ size_t len, needed;
3668+ struct dm_target_deps *deps;
3669+
3670+ deps = get_result_buffer(param, param_size, &len);
3671+
3672+ /*
3673+ * Count the devices.
3674+ */
3675+ list_for_each(tmp, dm_table_get_devices(table))
3676+ count++;
3677+
3678+ /*
3679+ * Check we have enough space.
3680+ */
3681+ needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
3682+ if (len < needed) {
3683+ param->flags |= DM_BUFFER_FULL_FLAG;
3684+ return;
3685+ }
3686+
3687+ /*
3688+ * Fill in the devices.
3689+ */
3690+ deps->count = count;
3691+ count = 0;
3692+ list_for_each(tmp, dm_table_get_devices(table)) {
3693+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
3694+ deps->dev[count++] = dd->bdev->bd_dev;
3695+ }
3696+
3697+ param->data_size = param->data_start + needed;
3698+}
3699+
3700+static int table_deps(struct dm_ioctl *param, size_t param_size)
3701+{
3702+ int r;
3703+ struct mapped_device *md;
3704+ struct dm_table *table;
3705+
3706+ md = find_device(param);
3707+ if (!md)
3708+ return -ENXIO;
3709+
3710+ r = __dev_status(md, param);
3711+ if (r)
3712+ goto out;
3713+
3714+ table = dm_get_table(md);
3715+ if (table) {
3716+ retrieve_deps(table, param, param_size);
3717+ dm_table_put(table);
3718+ }
3719+
3720+ out:
3721+ dm_put(md);
3722+ return r;
3723+}
3724+
3725+/*
3726+ * Return the status of a device as a text string for each
3727+ * target.
3728+ */
3729+static int table_status(struct dm_ioctl *param, size_t param_size)
3730+{
3731+ int r;
3732+ struct mapped_device *md;
3733+ struct dm_table *table;
3734+
3735+ md = find_device(param);
3736+ if (!md)
3737+ return -ENXIO;
3738+
3739+ r = __dev_status(md, param);
3740+ if (r)
3741+ goto out;
3742+
3743+ table = dm_get_table(md);
3744+ if (table) {
3745+ retrieve_status(table, param, param_size);
3746+ dm_table_put(table);
3747+ }
3748+
3749+ out:
3750+ dm_put(md);
3751+ return r;
3752+}
3753+
3754+/*-----------------------------------------------------------------
3755+ * Implementation of open/close/ioctl on the special char
3756+ * device.
3757+ *---------------------------------------------------------------*/
3758+static ioctl_fn lookup_ioctl(unsigned int cmd)
3759+{
3760+ static struct {
3761+ int cmd;
3762+ ioctl_fn fn;
3763+ } _ioctls[] = {
3764+ {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
3765+ {DM_REMOVE_ALL_CMD, remove_all},
3766+ {DM_LIST_DEVICES_CMD, list_devices},
3767+
3768+ {DM_DEV_CREATE_CMD, dev_create},
3769+ {DM_DEV_REMOVE_CMD, dev_remove},
3770+ {DM_DEV_RENAME_CMD, dev_rename},
3771+ {DM_DEV_SUSPEND_CMD, dev_suspend},
3772+ {DM_DEV_STATUS_CMD, dev_status},
3773+ {DM_DEV_WAIT_CMD, dev_wait},
3774+
3775+ {DM_TABLE_LOAD_CMD, table_load},
3776+ {DM_TABLE_CLEAR_CMD, table_clear},
3777+ {DM_TABLE_DEPS_CMD, table_deps},
3778+ {DM_TABLE_STATUS_CMD, table_status}
3779+ };
3780+
3781+ return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
3782+}
3783+
3784+/*
3785+ * As well as checking the version compatibility this always
3786+ * copies the kernel interface version out.
3787+ */
3788+static int check_version(unsigned int cmd, struct dm_ioctl *user)
3789+{
3790+ uint32_t version[3];
3791+ int r = 0;
3792+
3793+ if (copy_from_user(version, user->version, sizeof(version)))
3794+ return -EFAULT;
3795+
3796+ if ((DM_VERSION_MAJOR != version[0]) ||
3797+ (DM_VERSION_MINOR < version[1])) {
3798+ DMWARN("ioctl interface mismatch: "
3799+ "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
3800+ DM_VERSION_MAJOR, DM_VERSION_MINOR,
3801+ DM_VERSION_PATCHLEVEL,
3802+ version[0], version[1], version[2], cmd);
3803+ r = -EINVAL;
3804+ }
3805+
3806+ /*
3807+ * Fill in the kernel version.
3808+ */
3809+ version[0] = DM_VERSION_MAJOR;
3810+ version[1] = DM_VERSION_MINOR;
3811+ version[2] = DM_VERSION_PATCHLEVEL;
3812+ if (copy_to_user(user->version, version, sizeof(version)))
3813+ return -EFAULT;
3814+
3815+ return r;
3816+}
3817+
3818+static void free_params(struct dm_ioctl *param)
3819+{
3820+ vfree(param);
3821+}
3822+
3823+static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
3824+{
3825+ struct dm_ioctl tmp, *dmi;
3826+
3827+ if (copy_from_user(&tmp, user, sizeof(tmp)))
3828+ return -EFAULT;
3829+
3830+ if (tmp.data_size < sizeof(tmp))
3831+ return -EINVAL;
3832+
3833+ dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
3834+ if (!dmi)
3835+ return -ENOMEM;
3836+
3837+ if (copy_from_user(dmi, user, tmp.data_size)) {
3838+ vfree(dmi);
3839+ return -EFAULT;
3840+ }
3841+
3842+ *param = dmi;
3843+ return 0;
3844+}
3845+
3846+static int validate_params(uint cmd, struct dm_ioctl *param)
3847+{
3848+ /* Always clear this flag */
3849+ param->flags &= ~DM_BUFFER_FULL_FLAG;
3850+
3851+ /* Ignores parameters */
3852+ if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
3853+ return 0;
3854+
3855+ /* Unless creating, either name or uuid but not both */
3856+ if (cmd != DM_DEV_CREATE_CMD) {
3857+ if ((!*param->uuid && !*param->name) ||
3858+ (*param->uuid && *param->name)) {
3859+ DMWARN("one of name or uuid must be supplied, cmd(%u)",
3860+ cmd);
3861+ return -EINVAL;
3862+ }
3863+ }
3864+
3865+ /* Ensure strings are terminated */
3866+ param->name[DM_NAME_LEN - 1] = '\0';
3867+ param->uuid[DM_UUID_LEN - 1] = '\0';
3868+
3869+ return 0;
3870+}
3871+
3872+static int ctl_ioctl(struct inode *inode, struct file *file,
3873+ uint command, ulong u)
3874+{
3875+ int r = 0;
3876+ unsigned int cmd;
3877+ struct dm_ioctl *param;
3878+ struct dm_ioctl *user = (struct dm_ioctl *) u;
3879+ ioctl_fn fn = NULL;
3880+ size_t param_size;
3881+
3882+ /* only root can play with this */
3883+ if (!capable(CAP_SYS_ADMIN))
3884+ return -EACCES;
3885+
3886+ if (_IOC_TYPE(command) != DM_IOCTL)
3887+ return -ENOTTY;
3888+
3889+ cmd = _IOC_NR(command);
3890+
3891+ /*
3892+ * Check the interface version passed in. This also
3893+ * writes out the kernel's interface version.
3894+ */
3895+ r = check_version(cmd, user);
3896+ if (r)
3897+ return r;
3898+
3899+ /*
3900+ * Nothing more to do for the version command.
3901+ */
3902+ if (cmd == DM_VERSION_CMD)
3903+ return 0;
3904+
3905+ fn = lookup_ioctl(cmd);
3906+ if (!fn) {
3907+ DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
3908+ return -ENOTTY;
3909+ }
3910+
3911+ /*
3912+ * FIXME: I don't like this, we're trying to avoid low
3913+ * memory issues when a device is suspended.
3914+ */
3915+ current->flags |= PF_MEMALLOC;
3916+
3917+ /*
3918+ * Copy the parameters into kernel space.
3919+ */
3920+ r = copy_params(user, &param);
3921+ if (r) {
3922+ current->flags &= ~PF_MEMALLOC;
3923+ return r;
3924+ }
3925+
3926+ r = validate_params(cmd, param);
3927+ if (r)
3928+ goto out;
3929+
3930+ param_size = param->data_size;
3931+ param->data_size = sizeof(*param);
3932+ r = fn(param, param_size);
3933+
3934+ /*
3935+ * Copy the results back to userland.
3936+ */
3937+ if (!r && copy_to_user(user, param, param->data_size))
3938+ r = -EFAULT;
3939+
3940+ out:
3941+ free_params(param);
3942+ current->flags &= ~PF_MEMALLOC;
3943+ return r;
3944+}
3945+
3946+static struct file_operations _ctl_fops = {
3947+ .ioctl = ctl_ioctl,
3948+ .owner = THIS_MODULE,
3949+};
3950+
3951+static devfs_handle_t _ctl_handle;
3952+
3953+static struct miscdevice _dm_misc = {
3954+ .minor = MISC_DYNAMIC_MINOR,
3955+ .name = DM_NAME,
3956+ .fops = &_ctl_fops
3957+};
3958+
3959+/*
3960+ * Create misc character device and link to DM_DIR/control.
3961+ */
3962+int __init dm_interface_init(void)
3963+{
3964+ int r;
3965+ char rname[64];
3966+
3967+ r = dm_hash_init();
3968+ if (r)
3969+ return r;
3970+
3971+ r = misc_register(&_dm_misc);
3972+ if (r) {
3973+ DMERR("misc_register failed for control device");
3974+ dm_hash_exit();
3975+ return r;
3976+ }
3977+
3978+ r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
3979+ sizeof rname - 3);
3980+ if (r == -ENOSYS)
3981+ goto done; /* devfs not present */
3982+
3983+ if (r < 0) {
3984+ DMERR("devfs_generate_path failed for control device");
3985+ goto failed;
3986+ }
3987+
3988+ strncpy(rname + r, "../", 3);
3989+ r = devfs_mk_symlink(NULL, DM_DIR "/control",
3990+ DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
3991+ if (r) {
3992+ DMERR("devfs_mk_symlink failed for control device");
3993+ goto failed;
3994+ }
3995+ devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
3996+
3997+ done:
3998+ DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
3999+ DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
4000+ DM_DRIVER_EMAIL);
4001+ return 0;
4002+
4003+ failed:
4004+ misc_deregister(&_dm_misc);
4005+ dm_hash_exit();
4006+ return r;
4007+}
4008+
4009+void dm_interface_exit(void)
4010+{
4011+ if (misc_deregister(&_dm_misc) < 0)
4012+ DMERR("misc_deregister failed for control device");
4013+
4014+ dm_hash_exit();
4015+}
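For orientation, the sketch below shows how a hypothetical userland caller might perform the version handshake that check_version() and copy_params() above enforce. It is illustrative only, not part of the patch: the helper name is invented, and it assumes the <linux/dm-ioctl.h> header added by this patch plus an already-open file descriptor on the control device registered by dm_interface_init().

/* Hypothetical userland helper -- sketch only, not part of the patch. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

static int get_dm_version(int ctl_fd, uint32_t version[3])
{
	struct dm_ioctl dmi;

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* major must match the kernel's */
	dmi.version[1] = DM_VERSION_MINOR;	/* minor must not exceed the kernel's */
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);		/* copy_params() rejects anything smaller */

	if (ioctl(ctl_fd, DM_VERSION, &dmi) < 0)
		return -1;			/* EINVAL on an interface mismatch */

	/* check_version() always copies the kernel's interface version back out. */
	memcpy(version, dmi.version, sizeof(dmi.version));
	return 0;
}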
4016diff -urN linux-2.4.24.org/drivers/md/dm-io.h linux-2.4.24/drivers/md/dm-io.h
4017--- linux-2.4.24.org/drivers/md/dm-io.h 1970-01-01 01:00:00.000000000 +0100
4018+++ linux-2.4.24/drivers/md/dm-io.h 2004-01-18 15:01:25.794190275 +0100
4019@@ -0,0 +1,86 @@
4020+/*
4021+ * Copyright (C) 2003 Sistina Software
4022+ *
4023+ * This file is released under the GPL.
4024+ */
4025+
4026+#ifndef _DM_IO_H
4027+#define _DM_IO_H
4028+
4029+#include "dm.h"
4030+
4031+#include <linux/list.h>
4032+
4033+/* Move these to bitops.h eventually */
4034+/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
4035+/* (c) 2002, D.Phillips and Sistina Software */
4036+/* Licensed under Version 2 of the GPL */
4037+
4038+static unsigned generic_fls8(unsigned n)
4039+{
4040+ return n & 0xf0 ?
4041+ n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5:
4042+ n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
4043+}
4044+
4045+static inline unsigned generic_fls16(unsigned n)
4046+{
4047+ return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n);
4048+}
4049+
4050+static inline unsigned generic_fls32(unsigned n)
4051+{
4052+ return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
4053+}
4054+
4055+/* FIXME make this configurable */
4056+#define DM_MAX_IO_REGIONS 8
4057+
4058+struct io_region {
4059+ kdev_t dev;
4060+ sector_t sector;
4061+ sector_t count;
4062+};
4063+
4064+
4065+/*
4066+ * 'error' is a bitset, with each bit indicating whether an error
4067+ * occurred doing io to the corresponding region.
4068+ */
4069+typedef void (*io_notify_fn)(unsigned int error, void *context);
4070+
4071+
4072+/*
4073+ * Before anyone uses the IO interface they should call
4074+ * dm_io_get(), specifying roughly how many pages they are
4075+ * expecting to perform io on concurrently.
4076+ *
4077+ * This function may block.
4078+ */
4079+int dm_io_get(unsigned int num_pages);
4080+void dm_io_put(unsigned int num_pages);
4081+
4082+
4083+/*
4084+ * Synchronous IO.
4085+ *
4086+ * Please ensure that the rw flag in the next two functions is
4087+ * either READ or WRITE, i.e. we don't take READA. Any
4088+ * regions with a zero count field will be ignored.
4089+ */
4090+int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
4091+ struct page *pages, unsigned int offset,
4092+ unsigned int *error_bits);
4093+
4094+
4095+/*
4096+ * Asynchronous IO.
4097+ *
4098+ * The 'where' array may be safely allocated on the stack since
4099+ * the function takes a copy.
4100+ */
4101+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
4102+ struct page *pages, unsigned int offset,
4103+ io_notify_fn fn, void *context);
4104+
4105+#endif
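A minimal sketch of a caller of the interface declared above, assuming the 2.4 kernel environment this patch targets; the function name and the single-region synchronous read are invented for illustration, and error handling is kept to the bare minimum.

/* Illustrative only: synchronously read one region through dm-io. */
static int read_one_region(kdev_t dev, sector_t start, sector_t count,
			   struct page *pages)
{
	struct io_region where;
	unsigned int error_bits;
	int r;

	r = dm_io_get(1);		/* reserve buffer heads for roughly one page of io */
	if (r)
		return r;

	where.dev = dev;
	where.sector = start;
	where.count = count;		/* a zero count would make this region a no-op */

	r = dm_io_sync(1, &where, READ, pages, 0, &error_bits);

	dm_io_put(1);
	return r;			/* -EIO if any bit in error_bits was set */
}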
4106diff -urN linux-2.4.24.org/drivers/md/dm-linear.c linux-2.4.24/drivers/md/dm-linear.c
4107--- linux-2.4.24.org/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100
4108+++ linux-2.4.24/drivers/md/dm-linear.c 2004-01-18 15:01:13.777712209 +0100
4109@@ -0,0 +1,123 @@
4110+/*
4111+ * Copyright (C) 2001 Sistina Software (UK) Limited.
4112+ *
4113+ * This file is released under the GPL.
4114+ */
4115+
4116+#include "dm.h"
4117+
4118+#include <linux/module.h>
4119+#include <linux/init.h>
4120+#include <linux/blkdev.h>
4121+#include <linux/slab.h>
4122+
4123+/*
4124+ * Linear: maps a linear range of a device.
4125+ */
4126+struct linear_c {
4127+ struct dm_dev *dev;
4128+ sector_t start;
4129+};
4130+
4131+/*
4132+ * Construct a linear mapping: <dev_path> <offset>
4133+ */
4134+static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4135+{
4136+ struct linear_c *lc;
4137+
4138+ if (argc != 2) {
4139+ ti->error = "dm-linear: Invalid argument count";
4140+ return -EINVAL;
4141+ }
4142+
4143+ lc = kmalloc(sizeof(*lc), GFP_KERNEL);
4144+ if (lc == NULL) {
4145+ ti->error = "dm-linear: Cannot allocate linear context";
4146+ return -ENOMEM;
4147+ }
4148+
4149+ if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
4150+ ti->error = "dm-linear: Invalid device sector";
4151+ goto bad;
4152+ }
4153+
4154+ if (dm_get_device(ti, argv[0], lc->start, ti->len,
4155+ dm_table_get_mode(ti->table), &lc->dev)) {
4156+ ti->error = "dm-linear: Device lookup failed";
4157+ goto bad;
4158+ }
4159+
4160+ ti->private = lc;
4161+ return 0;
4162+
4163+ bad:
4164+ kfree(lc);
4165+ return -EINVAL;
4166+}
4167+
4168+static void linear_dtr(struct dm_target *ti)
4169+{
4170+ struct linear_c *lc = (struct linear_c *) ti->private;
4171+
4172+ dm_put_device(ti, lc->dev);
4173+ kfree(lc);
4174+}
4175+
4176+static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
4177+ union map_info *map_context)
4178+{
4179+ struct linear_c *lc = (struct linear_c *) ti->private;
4180+
4181+ bh->b_rdev = lc->dev->dev;
4182+ bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
4183+
4184+ return 1;
4185+}
4186+
4187+static int linear_status(struct dm_target *ti, status_type_t type,
4188+ char *result, unsigned int maxlen)
4189+{
4190+ struct linear_c *lc = (struct linear_c *) ti->private;
4191+ kdev_t kdev;
4192+
4193+ switch (type) {
4194+ case STATUSTYPE_INFO:
4195+ result[0] = '\0';
4196+ break;
4197+
4198+ case STATUSTYPE_TABLE:
4199+ kdev = to_kdev_t(lc->dev->bdev->bd_dev);
4200+ snprintf(result, maxlen, "%s " SECTOR_FORMAT,
4201+ dm_kdevname(kdev), lc->start);
4202+ break;
4203+ }
4204+ return 0;
4205+}
4206+
4207+static struct target_type linear_target = {
4208+ .name = "linear",
4209+ .module = THIS_MODULE,
4210+ .ctr = linear_ctr,
4211+ .dtr = linear_dtr,
4212+ .map = linear_map,
4213+ .status = linear_status,
4214+};
4215+
4216+int __init dm_linear_init(void)
4217+{
4218+ int r = dm_register_target(&linear_target);
4219+
4220+ if (r < 0)
4221+ DMERR("linear: register failed %d", r);
4222+
4223+ return r;
4224+}
4225+
4226+void dm_linear_exit(void)
4227+{
4228+ int r = dm_unregister_target(&linear_target);
4229+
4230+ if (r < 0)
4231+ DMERR("linear: unregister failed %d", r);
4232+}
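As a usage note for the constructor above: with a userland tool such as dmsetup, a one-target linear table takes the form 'start length linear <dev_path> <offset>'. For example (device name purely illustrative), the table line '0 1024 linear /dev/hda1 0' presents sectors 0 through 1023 of the mapped device as the first 1024 sectors of /dev/hda1; linear_ctr() then receives argc = 2 with argv[0] = "/dev/hda1" and argv[1] = "0", while start and length arrive via the enclosing dm_target_spec.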
4233diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
4234--- linux-2.4.24.org/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100
4235+++ linux-2.4.24/drivers/md/dm-snapshot.c 2004-01-18 15:01:29.247465850 +0100
4236@@ -0,0 +1,1235 @@
4237+/*
4238+ * dm-snapshot.c
4239+ *
4240+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4241+ *
4242+ * This file is released under the GPL.
4243+ */
4244+
4245+#include <linux/config.h>
4246+#include <linux/ctype.h>
4247+#include <linux/module.h>
4248+#include <linux/init.h>
4249+#include <linux/slab.h>
4250+#include <linux/list.h>
4251+#include <linux/fs.h>
4252+#include <linux/blkdev.h>
4253+#include <linux/mempool.h>
4254+#include <linux/device-mapper.h>
4255+#include <linux/vmalloc.h>
4256+
4257+#include "dm-snapshot.h"
4258+#include "kcopyd.h"
4259+
4260+/*
4261+ * FIXME: Remove this before release.
4262+ */
4263+#if 0
4264+#define DMDEBUG(x...) DMWARN( ## x)
4265+#else
4266+#define DMDEBUG(x...)
4267+#endif
4268+
4269+/*
4270+ * The percentage increment we will wake up users at
4271+ */
4272+#define WAKE_UP_PERCENT 5
4273+
4274+/*
4275+ * kcopyd priority of snapshot operations
4276+ */
4277+#define SNAPSHOT_COPY_PRIORITY 2
4278+
4279+/*
4280+ * Each snapshot reserves this many pages for io
4281+ * FIXME: calculate this
4282+ */
4283+#define SNAPSHOT_PAGES 256
4284+
4285+struct pending_exception {
4286+ struct exception e;
4287+
4288+ /*
4289+ * Origin buffers waiting for this to complete are held
4290+ * in a list (using b_reqnext).
4291+ */
4292+ struct buffer_head *origin_bhs;
4293+ struct buffer_head *snapshot_bhs;
4294+
4295+ /*
4296+ * Other pending_exceptions that are processing this
4297+ * chunk. When this list is empty, we know we can
4298+ * complete the origins.
4299+ */
4300+ struct list_head siblings;
4301+
4302+ /* Pointer back to snapshot context */
4303+ struct dm_snapshot *snap;
4304+
4305+ /*
4306+ * 1 indicates the exception has already been sent to
4307+ * kcopyd.
4308+ */
4309+ int started;
4310+};
4311+
4312+/*
4313+ * Hash table mapping origin volumes to lists of snapshots and
4314+ * a lock to protect it
4315+ */
4316+static kmem_cache_t *exception_cache;
4317+static kmem_cache_t *pending_cache;
4318+static mempool_t *pending_pool;
4319+
4320+/*
4321+ * One of these per registered origin, held in the snapshot_origins hash
4322+ */
4323+struct origin {
4324+ /* The origin device */
4325+ kdev_t dev;
4326+
4327+ struct list_head hash_list;
4328+
4329+ /* List of snapshots for this origin */
4330+ struct list_head snapshots;
4331+};
4332+
4333+/*
4334+ * Size of the hash table for origin volumes. If we make this
4335+ * the size of the minors list then it should be nearly perfect
4336+ */
4337+#define ORIGIN_HASH_SIZE 256
4338+#define ORIGIN_MASK 0xFF
4339+static struct list_head *_origins;
4340+static struct rw_semaphore _origins_lock;
4341+
4342+static int init_origin_hash(void)
4343+{
4344+ int i;
4345+
4346+ _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4347+ GFP_KERNEL);
4348+ if (!_origins) {
4349+ DMERR("Device mapper: Snapshot: unable to allocate memory");
4350+ return -ENOMEM;
4351+ }
4352+
4353+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4354+ INIT_LIST_HEAD(_origins + i);
4355+ init_rwsem(&_origins_lock);
4356+
4357+ return 0;
4358+}
4359+
4360+static void exit_origin_hash(void)
4361+{
4362+ kfree(_origins);
4363+}
4364+
4365+static inline unsigned int origin_hash(kdev_t dev)
4366+{
4367+ return MINOR(dev) & ORIGIN_MASK;
4368+}
4369+
4370+static struct origin *__lookup_origin(kdev_t origin)
4371+{
4372+ struct list_head *slist;
4373+ struct list_head *ol;
4374+ struct origin *o;
4375+
4376+ ol = &_origins[origin_hash(origin)];
4377+ list_for_each(slist, ol) {
4378+ o = list_entry(slist, struct origin, hash_list);
4379+
4380+ if (o->dev == origin)
4381+ return o;
4382+ }
4383+
4384+ return NULL;
4385+}
4386+
4387+static void __insert_origin(struct origin *o)
4388+{
4389+ struct list_head *sl = &_origins[origin_hash(o->dev)];
4390+ list_add_tail(&o->hash_list, sl);
4391+}
4392+
4393+/*
4394+ * Make a note of the snapshot and its origin so we can look it
4395+ * up when the origin has a write on it.
4396+ */
4397+static int register_snapshot(struct dm_snapshot *snap)
4398+{
4399+ struct origin *o;
4400+ kdev_t dev = snap->origin->dev;
4401+
4402+ down_write(&_origins_lock);
4403+ o = __lookup_origin(dev);
4404+
4405+ if (!o) {
4406+ /* New origin */
4407+ o = kmalloc(sizeof(*o), GFP_KERNEL);
4408+ if (!o) {
4409+ up_write(&_origins_lock);
4410+ return -ENOMEM;
4411+ }
4412+
4413+ /* Initialise the struct */
4414+ INIT_LIST_HEAD(&o->snapshots);
4415+ o->dev = dev;
4416+
4417+ __insert_origin(o);
4418+ }
4419+
4420+ list_add_tail(&snap->list, &o->snapshots);
4421+
4422+ up_write(&_origins_lock);
4423+ return 0;
4424+}
4425+
4426+static void unregister_snapshot(struct dm_snapshot *s)
4427+{
4428+ struct origin *o;
4429+
4430+ down_write(&_origins_lock);
4431+ o = __lookup_origin(s->origin->dev);
4432+
4433+ list_del(&s->list);
4434+ if (list_empty(&o->snapshots)) {
4435+ list_del(&o->hash_list);
4436+ kfree(o);
4437+ }
4438+
4439+ up_write(&_origins_lock);
4440+}
4441+
4442+/*
4443+ * Implementation of the exception hash tables.
4444+ */
4445+static int init_exception_table(struct exception_table *et, uint32_t size)
4446+{
4447+ unsigned int i;
4448+
4449+ et->hash_mask = size - 1;
4450+ et->table = dm_vcalloc(size, sizeof(struct list_head));
4451+ if (!et->table)
4452+ return -ENOMEM;
4453+
4454+ for (i = 0; i < size; i++)
4455+ INIT_LIST_HEAD(et->table + i);
4456+
4457+ return 0;
4458+}
4459+
4460+static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
4461+{
4462+ struct list_head *slot, *entry, *temp;
4463+ struct exception *ex;
4464+ int i, size;
4465+
4466+ size = et->hash_mask + 1;
4467+ for (i = 0; i < size; i++) {
4468+ slot = et->table + i;
4469+
4470+ list_for_each_safe(entry, temp, slot) {
4471+ ex = list_entry(entry, struct exception, hash_list);
4472+ kmem_cache_free(mem, ex);
4473+ }
4474+ }
4475+
4476+ vfree(et->table);
4477+}
4478+
4479+/*
4480+ * FIXME: check how this hash fn is performing.
4481+ */
4482+static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
4483+{
4484+ return chunk & et->hash_mask;
4485+}
4486+
4487+static void insert_exception(struct exception_table *eh, struct exception *e)
4488+{
4489+ struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
4490+ list_add(&e->hash_list, l);
4491+}
4492+
4493+static inline void remove_exception(struct exception *e)
4494+{
4495+ list_del(&e->hash_list);
4496+}
4497+
4498+/*
4499+ * Return the exception data for a sector, or NULL if not
4500+ * remapped.
4501+ */
4502+static struct exception *lookup_exception(struct exception_table *et,
4503+ chunk_t chunk)
4504+{
4505+ struct list_head *slot, *el;
4506+ struct exception *e;
4507+
4508+ slot = &et->table[exception_hash(et, chunk)];
4509+ list_for_each(el, slot) {
4510+ e = list_entry(el, struct exception, hash_list);
4511+ if (e->old_chunk == chunk)
4512+ return e;
4513+ }
4514+
4515+ return NULL;
4516+}
4517+
4518+static inline struct exception *alloc_exception(void)
4519+{
4520+ struct exception *e;
4521+
4522+ e = kmem_cache_alloc(exception_cache, GFP_NOIO);
4523+ if (!e)
4524+ e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
4525+
4526+ return e;
4527+}
4528+
4529+static inline void free_exception(struct exception *e)
4530+{
4531+ kmem_cache_free(exception_cache, e);
4532+}
4533+
4534+static inline struct pending_exception *alloc_pending_exception(void)
4535+{
4536+ return mempool_alloc(pending_pool, GFP_NOIO);
4537+}
4538+
4539+static inline void free_pending_exception(struct pending_exception *pe)
4540+{
4541+ mempool_free(pe, pending_pool);
4542+}
4543+
4544+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
4545+{
4546+ struct exception *e;
4547+
4548+ e = alloc_exception();
4549+ if (!e)
4550+ return -ENOMEM;
4551+
4552+ e->old_chunk = old;
4553+ e->new_chunk = new;
4554+ insert_exception(&s->complete, e);
4555+ return 0;
4556+}
4557+
4558+/*
4559+ * Hard coded magic.
4560+ */
4561+static int calc_max_buckets(void)
4562+{
4563+ unsigned long mem;
4564+
4565+ mem = num_physpages << PAGE_SHIFT;
4566+ mem /= 50;
4567+ mem /= sizeof(struct list_head);
4568+
4569+ return mem;
4570+}
4571+
4572+/*
4573+ * Rounds a number down to a power of 2.
4574+ */
4575+static inline uint32_t round_down(uint32_t n)
4576+{
4577+ while (n & (n - 1))
4578+ n &= (n - 1);
4579+ return n;
4580+}
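+/*
+ * For example, round_down(100) returns 64 and round_down(64) returns 64;
+ * a value that is already a power of 2 comes back unchanged.
+ */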
4581+
4582+/*
4583+ * Allocate room for a suitable hash table.
4584+ */
4585+static int init_hash_tables(struct dm_snapshot *s)
4586+{
4587+ sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
4588+
4589+ /*
4590+ * Calculate based on the size of the original volume or
4591+ * the COW volume...
4592+ */
4593+ cow_dev_size = get_dev_size(s->cow->dev);
4594+ origin_dev_size = get_dev_size(s->origin->dev);
4595+ max_buckets = calc_max_buckets();
4596+
4597+ hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
4598+ hash_size = min(hash_size, max_buckets);
4599+
4600+ /* Round it down to a power of 2 */
4601+ hash_size = round_down(hash_size);
4602+ if (init_exception_table(&s->complete, hash_size))
4603+ return -ENOMEM;
4604+
4605+ /*
4606+ * Allocate hash table for in-flight exceptions
4607+ * Make this smaller than the real hash table
4608+ */
4609+ hash_size >>= 3;
4610+ if (!hash_size)
4611+ hash_size = 64;
4612+
4613+ if (init_exception_table(&s->pending, hash_size)) {
4614+ exit_exception_table(&s->complete, exception_cache);
4615+ return -ENOMEM;
4616+ }
4617+
4618+ return 0;
4619+}
4620+
4621+/*
4622+ * Round a number up to the nearest 'size' boundary. size must
4623+ * be a power of 2.
4624+ */
4625+static inline ulong round_up(ulong n, ulong size)
4626+{
4627+ size--;
4628+ return (n + size) & ~size;
4629+}
4630+
4631+/*
4632+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
4633+ */
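+/*
+ * For illustration (device names here are hypothetical), a table line of
+ * the form:
+ *
+ *   0 8388608 snapshot /dev/vg0/lvol0 /dev/vg0/snapcow P 16
+ *
+ * maps a 4GB snapshot of /dev/vg0/lvol0 onto the COW device
+ * /dev/vg0/snapcow, using persistent metadata and 16-sector (8KB) chunks.
+ */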
4634+static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4635+{
4636+ struct dm_snapshot *s;
4637+ unsigned long chunk_size;
4638+ int r = -EINVAL;
4639+ char persistent;
4640+ char *origin_path;
4641+ char *cow_path;
4642+ char *value;
4643+ int blocksize;
4644+
4645+ if (argc < 4) {
4646+ ti->error = "dm-snapshot: requires exactly 4 arguments";
4647+ r = -EINVAL;
4648+ goto bad1;
4649+ }
4650+
4651+ origin_path = argv[0];
4652+ cow_path = argv[1];
4653+ persistent = toupper(*argv[2]);
4654+
4655+ if (persistent != 'P' && persistent != 'N') {
4656+ ti->error = "Persistent flag is not P or N";
4657+ r = -EINVAL;
4658+ goto bad1;
4659+ }
4660+
4661+ chunk_size = simple_strtoul(argv[3], &value, 10);
4662+ if (chunk_size == 0 || value == NULL) {
4663+ ti->error = "Invalid chunk size";
4664+ r = -EINVAL;
4665+ goto bad1;
4666+ }
4667+
4668+ s = kmalloc(sizeof(*s), GFP_KERNEL);
4669+ if (s == NULL) {
4670+ ti->error = "Cannot allocate snapshot context private "
4671+ "structure";
4672+ r = -ENOMEM;
4673+ goto bad1;
4674+ }
4675+
4676+ r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
4677+ if (r) {
4678+ ti->error = "Cannot get origin device";
4679+ goto bad2;
4680+ }
4681+
4682+ /* FIXME: get cow length */
4683+ r = dm_get_device(ti, cow_path, 0, 0,
4684+ FMODE_READ | FMODE_WRITE, &s->cow);
4685+ if (r) {
4686+ dm_put_device(ti, s->origin);
4687+ ti->error = "Cannot get COW device";
4688+ goto bad2;
4689+ }
4690+
4691+ /*
4692+ * Chunk size must be a multiple of page size. Silently
4693+ * round up if it's not.
4694+ */
4695+ chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
4696+
4697+ /* Validate the chunk size against the device block size */
4698+ blocksize = get_hardsect_size(s->cow->dev);
4699+ if (chunk_size % (blocksize / SECTOR_SIZE)) {
4700+ ti->error = "Chunk size is not a multiple of device blocksize";
4701+ r = -EINVAL;
4702+ goto bad3;
4703+ }
4704+
4705+ /* Check the sizes are small enough to fit in one kiovec */
4706+ if (chunk_size > KIO_MAX_SECTORS) {
4707+ ti->error = "Chunk size is too big";
4708+ r = -EINVAL;
4709+ goto bad3;
4710+ }
4711+
4712+ /* Check chunk_size is a power of 2 */
4713+ if (chunk_size & (chunk_size - 1)) {
4714+ ti->error = "Chunk size is not a power of 2";
4715+ r = -EINVAL;
4716+ goto bad3;
4717+ }
4718+
4719+ s->chunk_size = chunk_size;
4720+ s->chunk_mask = chunk_size - 1;
4721+ s->type = persistent;
4722+ for (s->chunk_shift = 0; chunk_size;
4723+ s->chunk_shift++, chunk_size >>= 1)
4724+ ;
4725+ s->chunk_shift--;
4726+
4727+ s->valid = 1;
4728+ s->have_metadata = 0;
4729+ s->last_percent = 0;
4730+ init_rwsem(&s->lock);
4731+ s->table = ti->table;
4732+
4733+ /* Allocate hash table for COW data */
4734+ if (init_hash_tables(s)) {
4735+ ti->error = "Unable to allocate hash table space";
4736+ r = -ENOMEM;
4737+ goto bad3;
4738+ }
4739+
4740+ /*
4741+ * Check the persistent flag - done here because we need the iobuf
4742+ * to check the LV header
4743+ */
4744+ s->store.snap = s;
4745+
4746+ if (persistent == 'P')
4747+ r = dm_create_persistent(&s->store, s->chunk_size);
4748+ else
4749+ r = dm_create_transient(&s->store, s, blocksize);
4750+
4751+ if (r) {
4752+ ti->error = "Couldn't create exception store";
4753+ r = -EINVAL;
4754+ goto bad4;
4755+ }
4756+
4757+ r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
4758+ if (r) {
4759+ ti->error = "Could not create kcopyd client";
4760+ goto bad5;
4761+ }
4762+
4763+ /* Flush IO to the origin device */
4764+ fsync_dev(s->origin->dev);
4765+
4766+ /* Add snapshot to the list of snapshots for this origin */
4767+ if (register_snapshot(s)) {
4768+ r = -EINVAL;
4769+ ti->error = "Cannot register snapshot origin";
4770+ goto bad6;
4771+ }
4772+
4773+ ti->private = s;
4774+ return 0;
4775+
4776+ bad6:
4777+ kcopyd_client_destroy(s->kcopyd_client);
4778+
4779+ bad5:
4780+ s->store.destroy(&s->store);
4781+
4782+ bad4:
4783+ exit_exception_table(&s->pending, pending_cache);
4784+ exit_exception_table(&s->complete, exception_cache);
4785+
4786+ bad3:
4787+ dm_put_device(ti, s->cow);
4788+ dm_put_device(ti, s->origin);
4789+
4790+ bad2:
4791+ kfree(s);
4792+
4793+ bad1:
4794+ return r;
4795+}
4796+
4797+static void snapshot_dtr(struct dm_target *ti)
4798+{
4799+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
4800+
4801+ dm_table_event(ti->table);
4802+
4803+ unregister_snapshot(s);
4804+
4805+ exit_exception_table(&s->pending, pending_cache);
4806+ exit_exception_table(&s->complete, exception_cache);
4807+
4808+ /* Deallocate memory used */
4809+ s->store.destroy(&s->store);
4810+
4811+ dm_put_device(ti, s->origin);
4812+ dm_put_device(ti, s->cow);
4813+ kcopyd_client_destroy(s->kcopyd_client);
4814+ kfree(s);
4815+}
4816+
4817+/*
4818+ * We hold lists of buffer_heads, using the b_reqnext field.
4819+ */
4820+static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
4821+{
4822+ bh->b_reqnext = *queue;
4823+ *queue = bh;
4824+}
4825+
4826+/*
4827+ * FIXME: inefficient.
4828+ */
4829+static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
4830+{
4831+ while (*queue)
4832+ queue = &((*queue)->b_reqnext);
4833+
4834+ *queue = bhs;
4835+}
4836+
4837+/*
4838+ * Flush a list of buffers.
4839+ */
4840+static void flush_buffers(struct buffer_head *bh)
4841+{
4842+ struct buffer_head *n;
4843+
4844+ DMDEBUG("begin flush");
4845+ while (bh) {
4846+ n = bh->b_reqnext;
4847+ bh->b_reqnext = NULL;
4848+ DMDEBUG("flushing %p", bh);
4849+ generic_make_request(WRITE, bh);
4850+ bh = n;
4851+ }
4852+
4853+ run_task_queue(&tq_disk);
4854+}
4855+
4856+/*
4857+ * Error a list of buffers.
4858+ */
4859+static void error_buffers(struct buffer_head *bh)
4860+{
4861+ struct buffer_head *n;
4862+
4863+ while (bh) {
4864+ n = bh->b_reqnext;
4865+ bh->b_reqnext = NULL;
4866+ buffer_IO_error(bh);
4867+ bh = n;
4868+ }
4869+}
4870+
4871+static struct buffer_head *__flush_bhs(struct pending_exception *pe)
4872+{
4873+ struct pending_exception *sibling;
4874+
4875+ if (list_empty(&pe->siblings))
4876+ return pe->origin_bhs;
4877+
4878+ sibling = list_entry(pe->siblings.next,
4879+ struct pending_exception, siblings);
4880+
4881+ list_del(&pe->siblings);
4882+
4883+ /* FIXME: I think there's a race on SMP machines here, add spin lock */
4884+ queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
4885+
4886+ return NULL;
4887+}
4888+
4889+static void pending_complete(struct pending_exception *pe, int success)
4890+{
4891+ struct exception *e;
4892+ struct dm_snapshot *s = pe->snap;
4893+ struct buffer_head *flush = NULL;
4894+
4895+ if (success) {
4896+ e = alloc_exception();
4897+ if (!e) {
4898+ DMWARN("Unable to allocate exception.");
4899+ down_write(&s->lock);
4900+ s->store.drop_snapshot(&s->store);
4901+ s->valid = 0;
4902+ flush = __flush_bhs(pe);
4903+ up_write(&s->lock);
4904+
4905+ error_buffers(pe->snapshot_bhs);
4906+ goto out;
4907+ }
4908+
4909+ /*
4910+ * Add a proper exception, and remove the
4911+ * in-flight exception from the list.
4912+ */
4913+ down_write(&s->lock);
4914+
4915+ memcpy(e, &pe->e, sizeof(*e));
4916+ insert_exception(&s->complete, e);
4917+ remove_exception(&pe->e);
4918+ flush = __flush_bhs(pe);
4919+
4920+ /* Submit any pending write BHs */
4921+ up_write(&s->lock);
4922+
4923+ flush_buffers(pe->snapshot_bhs);
4924+ DMDEBUG("Exception completed successfully.");
4925+
4926+ /* Notify any interested parties */
4927+ if (s->store.fraction_full) {
4928+ sector_t numerator, denominator;
4929+ int pc;
4930+
4931+ s->store.fraction_full(&s->store, &numerator,
4932+ &denominator);
4933+ pc = numerator * 100 / denominator;
4934+
4935+ if (pc >= s->last_percent + WAKE_UP_PERCENT) {
4936+ dm_table_event(s->table);
4937+ s->last_percent = pc - pc % WAKE_UP_PERCENT;
4938+ }
4939+ }
4940+
4941+ } else {
4942+ /* Read/write error - snapshot is unusable */
4943+ down_write(&s->lock);
4944+ if (s->valid)
4945+ DMERR("Error reading/writing snapshot");
4946+ s->store.drop_snapshot(&s->store);
4947+ s->valid = 0;
4948+ remove_exception(&pe->e);
4949+ flush = __flush_bhs(pe);
4950+ up_write(&s->lock);
4951+
4952+ error_buffers(pe->snapshot_bhs);
4953+
4954+ dm_table_event(s->table);
4955+ DMDEBUG("Exception failed.");
4956+ }
4957+
4958+ out:
4959+ if (flush)
4960+ flush_buffers(flush);
4961+
4962+ free_pending_exception(pe);
4963+}
4964+
4965+static void commit_callback(void *context, int success)
4966+{
4967+ struct pending_exception *pe = (struct pending_exception *) context;
4968+ pending_complete(pe, success);
4969+}
4970+
4971+/*
4972+ * Called when the copy I/O has finished. kcopyd actually runs
4973+ * this code so don't block.
4974+ */
4975+static void copy_callback(int read_err, unsigned int write_err, void *context)
4976+{
4977+ struct pending_exception *pe = (struct pending_exception *) context;
4978+ struct dm_snapshot *s = pe->snap;
4979+
4980+ if (read_err || write_err)
4981+ pending_complete(pe, 0);
4982+
4983+ else
4984+ /* Update the metadata if we are persistent */
4985+ s->store.commit_exception(&s->store, &pe->e, commit_callback,
4986+ pe);
4987+}
4988+
4989+/*
4990+ * Dispatches the copy operation to kcopyd.
4991+ */
4992+static inline void start_copy(struct pending_exception *pe)
4993+{
4994+ struct dm_snapshot *s = pe->snap;
4995+ struct io_region src, dest;
4996+ kdev_t dev = s->origin->dev;
4997+ int *sizes = blk_size[major(dev)];
4998+ sector_t dev_size = (sector_t) -1;
4999+
5000+ if (pe->started)
5001+ return;
5002+
5003+ /* this is protected by snap->lock */
5004+ pe->started = 1;
5005+
5006+ if (sizes && sizes[minor(dev)])
5007+ dev_size = sizes[minor(dev)] << 1;
5008+
5009+ src.dev = dev;
5010+ src.sector = chunk_to_sector(s, pe->e.old_chunk);
5011+ src.count = min(s->chunk_size, dev_size - src.sector);
5012+
5013+ dest.dev = s->cow->dev;
5014+ dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5015+ dest.count = src.count;
5016+
5017+ /* Hand over to kcopyd */
5018+ kcopyd_copy(s->kcopyd_client,
5019+ &src, 1, &dest, 0, copy_callback, pe);
5020+}
5021+
5022+/*
5023+ * Looks to see if this snapshot already has a pending exception
5024+ * for this chunk, otherwise it allocates a new one and inserts
5025+ * it into the pending table.
5026+ */
5027+static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5028+ struct buffer_head *bh)
5029+{
5030+ struct exception *e;
5031+ struct pending_exception *pe;
5032+ chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5033+
5034+ /*
5035+ * Is there a pending exception for this already ?
5036+ */
5037+ e = lookup_exception(&s->pending, chunk);
5038+ if (e) {
5039+ /* cast the exception to a pending exception */
5040+ pe = list_entry(e, struct pending_exception, e);
5041+
5042+ } else {
5043+ /* Create a new pending exception */
5044+ pe = alloc_pending_exception();
5045+ pe->e.old_chunk = chunk;
5046+ pe->origin_bhs = pe->snapshot_bhs = NULL;
5047+ INIT_LIST_HEAD(&pe->siblings);
5048+ pe->snap = s;
5049+ pe->started = 0;
5050+
5051+ if (s->store.prepare_exception(&s->store, &pe->e)) {
5052+ free_pending_exception(pe);
5053+ s->valid = 0;
5054+ return NULL;
5055+ }
5056+
5057+ insert_exception(&s->pending, &pe->e);
5058+ }
5059+
5060+ return pe;
5061+}
5062+
5063+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5064+ struct buffer_head *bh)
5065+{
5066+ bh->b_rdev = s->cow->dev;
5067+ bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5068+ (bh->b_rsector & s->chunk_mask);
5069+}
5070+
5071+static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5072+ union map_info *map_context)
5073+{
5074+ struct exception *e;
5075+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5076+ int r = 1;
5077+ chunk_t chunk;
5078+ struct pending_exception *pe;
5079+
5080+ chunk = sector_to_chunk(s, bh->b_rsector);
5081+
5082+ /* Full snapshots are not usable */
5083+ if (!s->valid)
5084+ return -1;
5085+
5086+ /*
5087+ * Write to snapshot - higher level takes care of RW/RO
5088+ * flags so we should only get this if we are
5089+ * writeable.
5090+ */
5091+ if (rw == WRITE) {
5092+
5093+ down_write(&s->lock);
5094+
5095+ /* If the block is already remapped - use that, else remap it */
5096+ e = lookup_exception(&s->complete, chunk);
5097+ if (e)
5098+ remap_exception(s, e, bh);
5099+
5100+ else {
5101+ pe = find_pending_exception(s, bh);
5102+
5103+ if (!pe) {
5104+ s->store.drop_snapshot(&s->store);
5105+ s->valid = 0;
5106+ r = -EIO;
5107+ } else {
5108+ remap_exception(s, &pe->e, bh);
5109+ queue_buffer(&pe->snapshot_bhs, bh);
5110+ start_copy(pe);
5111+ r = 0;
5112+ }
5113+ }
5114+
5115+ up_write(&s->lock);
5116+
5117+ } else {
5118+ /*
5119+ * FIXME: this read path scares me because we
5120+ * always use the origin when we have a pending
5121+ * exception. However I can't think of a
5122+ * situation where this is wrong - ejt.
5123+ */
5124+
5125+ /* Do reads */
5126+ down_read(&s->lock);
5127+
5128+ /* See if it has been remapped */
5129+ e = lookup_exception(&s->complete, chunk);
5130+ if (e)
5131+ remap_exception(s, e, bh);
5132+ else
5133+ bh->b_rdev = s->origin->dev;
5134+
5135+ up_read(&s->lock);
5136+ }
5137+
5138+ return r;
5139+}
5140+
5141+void snapshot_resume(struct dm_target *ti)
5142+{
5143+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5144+
5145+ if (s->have_metadata)
5146+ return;
5147+
5148+ if (s->store.read_metadata(&s->store)) {
5149+ down_write(&s->lock);
5150+ s->valid = 0;
5151+ up_write(&s->lock);
5152+ }
5153+
5154+ s->have_metadata = 1;
5155+}
5156+
5157+static int snapshot_status(struct dm_target *ti, status_type_t type,
5158+ char *result, unsigned int maxlen)
5159+{
5160+ struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5161+ char cow[16];
5162+ char org[16];
5163+
5164+ switch (type) {
5165+ case STATUSTYPE_INFO:
5166+ if (!snap->valid)
5167+ snprintf(result, maxlen, "Invalid");
5168+ else {
5169+ if (snap->store.fraction_full) {
5170+ sector_t numerator, denominator;
5171+ snap->store.fraction_full(&snap->store,
5172+ &numerator,
5173+ &denominator);
5174+ snprintf(result, maxlen,
5175+ SECTOR_FORMAT "/" SECTOR_FORMAT,
5176+ numerator, denominator);
5177+ }
5178+ else
5179+ snprintf(result, maxlen, "Unknown");
5180+ }
5181+ break;
5182+
5183+ case STATUSTYPE_TABLE:
5184+ /*
5185+ * kdevname returns a static pointer so we need
5186+ * to make private copies if the output is to
5187+ * make sense.
5188+ */
5189+ strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5190+ strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5191+ snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5192+ snap->type, snap->chunk_size);
5193+ break;
5194+ }
5195+
5196+ return 0;
5197+}
5198+
5199+/*-----------------------------------------------------------------
5200+ * Origin methods
5201+ *---------------------------------------------------------------*/
5202+static void list_merge(struct list_head *l1, struct list_head *l2)
5203+{
5204+ struct list_head *l1_n, *l2_p;
5205+
5206+ l1_n = l1->next;
5207+ l2_p = l2->prev;
5208+
5209+ l1->next = l2;
5210+ l2->prev = l1;
5211+
5212+ l2_p->next = l1_n;
5213+ l1_n->prev = l2_p;
5214+}
5215+
5216+static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5217+{
5218+ int r = 1, first = 1;
5219+ struct list_head *sl;
5220+ struct dm_snapshot *snap;
5221+ struct exception *e;
5222+ struct pending_exception *pe, *last = NULL;
5223+ chunk_t chunk;
5224+
5225+ /* Do all the snapshots on this origin */
5226+ list_for_each(sl, snapshots) {
5227+ snap = list_entry(sl, struct dm_snapshot, list);
5228+
5229+ /* Only deal with valid snapshots */
5230+ if (!snap->valid)
5231+ continue;
5232+
5233+ down_write(&snap->lock);
5234+
5235+ /*
5236+ * Remember, different snapshots can have
5237+ * different chunk sizes.
5238+ */
5239+ chunk = sector_to_chunk(snap, bh->b_rsector);
5240+
5241+ /*
5242+ * Check exception table to see if block
5243+ * is already remapped in this snapshot
5244+ * and trigger an exception if not.
5245+ */
5246+ e = lookup_exception(&snap->complete, chunk);
5247+ if (!e) {
5248+ pe = find_pending_exception(snap, bh);
5249+ if (!pe) {
5250+ snap->store.drop_snapshot(&snap->store);
5251+ snap->valid = 0;
5252+
5253+ } else {
5254+ if (last)
5255+ list_merge(&pe->siblings,
5256+ &last->siblings);
5257+
5258+ last = pe;
5259+ r = 0;
5260+ }
5261+ }
5262+
5263+ up_write(&snap->lock);
5264+ }
5265+
5266+ /*
5267+ * Now that we have a complete pe list we can start the copying.
5268+ */
5269+ if (last) {
5270+ pe = last;
5271+ do {
5272+ down_write(&pe->snap->lock);
5273+ if (first)
5274+ queue_buffer(&pe->origin_bhs, bh);
5275+ start_copy(pe);
5276+ up_write(&pe->snap->lock);
5277+ first = 0;
5278+ pe = list_entry(pe->siblings.next,
5279+ struct pending_exception, siblings);
5280+
5281+ } while (pe != last);
5282+ }
5283+
5284+ return r;
5285+}
5286+
5287+/*
5288+ * Called on a write from the origin driver.
5289+ */
5290+int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5291+{
5292+ struct origin *o;
5293+ int r;
5294+
5295+ down_read(&_origins_lock);
5296+ o = __lookup_origin(origin->dev);
5297+ if (!o)
5298+ BUG();
5299+
5300+ r = __origin_write(&o->snapshots, bh);
5301+ up_read(&_origins_lock);
5302+
5303+ return r;
5304+}
5305+
5306+/*
5307+ * Origin: maps a linear range of a device, with hooks for snapshotting.
5308+ */
5309+
5310+/*
5311+ * Construct an origin mapping: <dev_path>
5312+ * The context for an origin is merely a 'struct dm_dev *'
5313+ * pointing to the real device.
5314+ */
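+/*
+ * For illustration (hypothetical device name), an origin table line might
+ * look like:
+ *
+ *   0 8388608 snapshot-origin /dev/vg0/lvol0
+ *
+ * i.e. a linear mapping of the whole device that additionally triggers
+ * copy-outs to any registered snapshots on writes.
+ */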
5315+static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5316+{
5317+ int r;
5318+ struct dm_dev *dev;
5319+
5320+ if (argc != 1) {
5321+ ti->error = "dm-origin: incorrect number of arguments";
5322+ return -EINVAL;
5323+ }
5324+
5325+ r = dm_get_device(ti, argv[0], 0, ti->len,
5326+ dm_table_get_mode(ti->table), &dev);
5327+ if (r) {
5328+ ti->error = "Cannot get target device";
5329+ return r;
5330+ }
5331+
5332+ ti->private = dev;
5333+ return 0;
5334+}
5335+
5336+static void origin_dtr(struct dm_target *ti)
5337+{
5338+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5339+ dm_put_device(ti, dev);
5340+}
5341+
5342+static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5343+ union map_info *map_context)
5344+{
5345+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5346+ bh->b_rdev = dev->dev;
5347+
5348+ /* Only tell snapshots if this is a write */
5349+ return (rw == WRITE) ? do_origin(dev, bh) : 1;
5350+}
5351+
5352+static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5353+ unsigned int maxlen)
5354+{
5355+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5356+
5357+ switch (type) {
5358+ case STATUSTYPE_INFO:
5359+ result[0] = '\0';
5360+ break;
5361+
5362+ case STATUSTYPE_TABLE:
5363+ snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5364+ break;
5365+ }
5366+
5367+ return 0;
5368+}
5369+
5370+static struct target_type origin_target = {
5371+ .name = "snapshot-origin",
5372+ .module = THIS_MODULE,
5373+ .ctr = origin_ctr,
5374+ .dtr = origin_dtr,
5375+ .map = origin_map,
5376+ .status = origin_status,
5377+};
5378+
5379+static struct target_type snapshot_target = {
5380+ .name = "snapshot",
5381+ .module = THIS_MODULE,
5382+ .ctr = snapshot_ctr,
5383+ .dtr = snapshot_dtr,
5384+ .map = snapshot_map,
5385+ .resume = snapshot_resume,
5386+ .status = snapshot_status,
5387+};
5388+
5389+int __init dm_snapshot_init(void)
5390+{
5391+ int r;
5392+
5393+ r = dm_register_target(&snapshot_target);
5394+ if (r) {
5395+ DMERR("snapshot target register failed %d", r);
5396+ return r;
5397+ }
5398+
5399+ r = dm_register_target(&origin_target);
5400+ if (r < 0) {
5401+ DMERR("Device mapper: Origin: register failed %d\n", r);
5402+ goto bad1;
5403+ }
5404+
5405+ r = init_origin_hash();
5406+ if (r) {
5407+ DMERR("init_origin_hash failed.");
5408+ goto bad2;
5409+ }
5410+
5411+ exception_cache = kmem_cache_create("dm-snapshot-ex",
5412+ sizeof(struct exception),
5413+ __alignof__(struct exception),
5414+ 0, NULL, NULL);
5415+ if (!exception_cache) {
5416+ DMERR("Couldn't create exception cache.");
5417+ r = -ENOMEM;
5418+ goto bad3;
5419+ }
5420+
5421+ pending_cache =
5422+ kmem_cache_create("dm-snapshot-in",
5423+ sizeof(struct pending_exception),
5424+ __alignof__(struct pending_exception),
5425+ 0, NULL, NULL);
5426+ if (!pending_cache) {
5427+ DMERR("Couldn't create pending cache.");
5428+ r = -ENOMEM;
5429+ goto bad4;
5430+ }
5431+
5432+ pending_pool = mempool_create(128, mempool_alloc_slab,
5433+ mempool_free_slab, pending_cache);
5434+ if (!pending_pool) {
5435+ DMERR("Couldn't create pending pool.");
5436+ r = -ENOMEM;
5437+ goto bad5;
5438+ }
5439+
5440+ return 0;
5441+
5442+ bad5:
5443+ kmem_cache_destroy(pending_cache);
5444+ bad4:
5445+ kmem_cache_destroy(exception_cache);
5446+ bad3:
5447+ exit_origin_hash();
5448+ bad2:
5449+ dm_unregister_target(&origin_target);
5450+ bad1:
5451+ dm_unregister_target(&snapshot_target);
5452+ return r;
5453+}
5454+
5455+void dm_snapshot_exit(void)
5456+{
5457+ int r;
5458+
5459+ r = dm_unregister_target(&snapshot_target);
5460+ if (r)
5461+ DMERR("snapshot unregister failed %d", r);
5462+
5463+ r = dm_unregister_target(&origin_target);
5464+ if (r)
5465+ DMERR("origin unregister failed %d", r);
5466+
5467+ exit_origin_hash();
5468+ mempool_destroy(pending_pool);
5469+ kmem_cache_destroy(pending_cache);
5470+ kmem_cache_destroy(exception_cache);
5471+}
5472diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.h linux-2.4.24/drivers/md/dm-snapshot.h
5473--- linux-2.4.24.org/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100
5474+++ linux-2.4.24/drivers/md/dm-snapshot.h 2004-01-18 15:01:29.250465221 +0100
5475@@ -0,0 +1,158 @@
5476+/*
5477+ * dm-snapshot.c
5478+ *
5479+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5480+ *
5481+ * This file is released under the GPL.
5482+ */
5483+
5484+#ifndef DM_SNAPSHOT_H
5485+#define DM_SNAPSHOT_H
5486+
5487+#include "dm.h"
5488+#include <linux/blkdev.h>
5489+
5490+struct exception_table {
5491+ uint32_t hash_mask;
5492+ struct list_head *table;
5493+};
5494+
5495+/*
5496+ * The snapshot code deals with largish chunks of the disk at a
5497+ * time. Typically 64k - 256k.
5498+ */
5499+/* FIXME: can we get away with limiting these to a uint32_t ? */
5500+typedef sector_t chunk_t;
5501+
5502+/*
5503+ * An exception is used where an old chunk of data has been
5504+ * replaced by a new one.
5505+ */
5506+struct exception {
5507+ struct list_head hash_list;
5508+
5509+ chunk_t old_chunk;
5510+ chunk_t new_chunk;
5511+};
5512+
5513+/*
5514+ * Abstraction to handle the meta/layout of exception stores (the
5515+ * COW device).
5516+ */
5517+struct exception_store {
5518+
5519+ /*
5520+ * Destroys this object when you've finished with it.
5521+ */
5522+ void (*destroy) (struct exception_store *store);
5523+
5524+ /*
5525+ * The target shouldn't read the COW device until this is
5526+ * called.
5527+ */
5528+ int (*read_metadata) (struct exception_store *store);
5529+
5530+ /*
5531+ * Find somewhere to store the next exception.
5532+ */
5533+ int (*prepare_exception) (struct exception_store *store,
5534+ struct exception *e);
5535+
5536+ /*
5537+ * Update the metadata with this exception.
5538+ */
5539+ void (*commit_exception) (struct exception_store *store,
5540+ struct exception *e,
5541+ void (*callback) (void *, int success),
5542+ void *callback_context);
5543+
5544+ /*
5545+ * The snapshot is invalid, note this in the metadata.
5546+ */
5547+ void (*drop_snapshot) (struct exception_store *store);
5548+
5549+ /*
5550+ * Return how full the snapshot is.
5551+ */
5552+ void (*fraction_full) (struct exception_store *store,
5553+ sector_t *numerator,
5554+ sector_t *denominator);
5555+
5556+ struct dm_snapshot *snap;
5557+ void *context;
5558+};
5559+
5560+struct dm_snapshot {
5561+ struct rw_semaphore lock;
5562+ struct dm_table *table;
5563+
5564+ struct dm_dev *origin;
5565+ struct dm_dev *cow;
5566+
5567+ /* List of snapshots per Origin */
5568+ struct list_head list;
5569+
5570+ /* Size of data blocks saved - must be a power of 2 */
5571+ chunk_t chunk_size;
5572+ chunk_t chunk_mask;
5573+ chunk_t chunk_shift;
5574+
5575+ /* You can't use a snapshot if this is 0 (e.g. if full) */
5576+ int valid;
5577+ int have_metadata;
5578+
5579+ /* Used for display of table */
5580+ char type;
5581+
5582+ /* The last percentage we notified */
5583+ int last_percent;
5584+
5585+ struct exception_table pending;
5586+ struct exception_table complete;
5587+
5588+ /* The on disk metadata handler */
5589+ struct exception_store store;
5590+
5591+ struct kcopyd_client *kcopyd_client;
5592+};
5593+
5594+/*
5595+ * Used by the exception stores to load exceptions when
5596+ * initialising.
5597+ */
5598+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
5599+
5600+/*
5601+ * Constructor and destructor for the default persistent
5602+ * store.
5603+ */
5604+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
5605+
5606+int dm_create_transient(struct exception_store *store,
5607+ struct dm_snapshot *s, int blocksize);
5608+
5609+/*
5610+ * Return the number of sectors in the device.
5611+ */
5612+static inline sector_t get_dev_size(kdev_t dev)
5613+{
5614+ int *sizes;
5615+
5616+ sizes = blk_size[MAJOR(dev)];
5617+ if (sizes)
5618+ return sizes[MINOR(dev)] << 1;
5619+
5620+ return 0;
5621+}
5622+
5623+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
5624+{
5625+ return (sector & ~s->chunk_mask) >> s->chunk_shift;
5626+}
5627+
5628+static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
5629+{
5630+ return chunk << s->chunk_shift;
5631+}
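+/*
+ * For example, with a 16-sector chunk size (chunk_shift == 4,
+ * chunk_mask == 15), sector_to_chunk() maps sector 37 to chunk 2 and
+ * chunk_to_sector() maps chunk 2 back to sector 32, the first sector
+ * of that chunk.
+ */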
5632+
5633+#endif
5634diff -urN linux-2.4.24.org/drivers/md/dm-stripe.c linux-2.4.24/drivers/md/dm-stripe.c
5635--- linux-2.4.24.org/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100
5636+++ linux-2.4.24/drivers/md/dm-stripe.c 2004-01-18 15:01:13.781711369 +0100
5637@@ -0,0 +1,258 @@
5638+/*
5639+ * Copyright (C) 2001 Sistina Software (UK) Limited.
5640+ *
5641+ * This file is released under the GPL.
5642+ */
5643+
5644+#include "dm.h"
5645+
5646+#include <linux/module.h>
5647+#include <linux/init.h>
5648+#include <linux/blkdev.h>
5649+#include <linux/slab.h>
5650+
5651+struct stripe {
5652+ struct dm_dev *dev;
5653+ sector_t physical_start;
5654+};
5655+
5656+struct stripe_c {
5657+ uint32_t stripes;
5658+
5659+ /* The size of this target / num. stripes */
5660+ uint32_t stripe_width;
5661+
5662+ /* stripe chunk size */
5663+ uint32_t chunk_shift;
5664+ sector_t chunk_mask;
5665+
5666+ struct stripe stripe[0];
5667+};
5668+
5669+static inline struct stripe_c *alloc_context(unsigned int stripes)
5670+{
5671+ size_t len;
5672+
5673+ if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
5674+ stripes))
5675+ return NULL;
5676+
5677+ len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
5678+
5679+ return kmalloc(len, GFP_KERNEL);
5680+}
5681+
5682+/*
5683+ * Parse a single <dev> <sector> pair
5684+ */
5685+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
5686+ unsigned int stripe, char **argv)
5687+{
5688+ sector_t start;
5689+
5690+ if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
5691+ return -EINVAL;
5692+
5693+ if (dm_get_device(ti, argv[0], start, sc->stripe_width,
5694+ dm_table_get_mode(ti->table),
5695+ &sc->stripe[stripe].dev))
5696+ return -ENXIO;
5697+
5698+ sc->stripe[stripe].physical_start = start;
5699+ return 0;
5700+}
5701+
5702+/*
5703+ * FIXME: Nasty function, only present because we can't link
5704+ * against __moddi3 and __divdi3.
5705+ *
5706+ * Returns non-zero when a is an exact multiple of b; *n is set to a / b.
5707+ */
5708+static int multiple(sector_t a, sector_t b, sector_t *n)
5709+{
5710+ sector_t acc, prev, i;
5711+
5712+ *n = 0;
5713+ while (a >= b) {
5714+ for (acc = b, prev = 0, i = 1;
5715+ acc <= a;
5716+ prev = acc, acc <<= 1, i <<= 1)
5717+ ;
5718+
5719+ a -= prev;
5720+ *n += i >> 1;
5721+ }
5722+
5723+ return a == 0;
5724+}
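+/*
+ * For example, multiple(96, 32, &n) sets n to 3 and returns 1, while
+ * multiple(100, 30, &n) sets n to 3 and returns 0 because 100 is not
+ * an exact multiple of 30.
+ */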
5725+
5726+/*
5727+ * Construct a striped mapping.
5728+ * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
5729+ */
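+/*
+ * For illustration (hypothetical devices), a two-way stripe with a
+ * 64-sector (32KB) chunk could be specified as:
+ *
+ *   0 4194304 striped 2 64 /dev/sda1 0 /dev/sdb1 0
+ *
+ * which interleaves the 2GB target across /dev/sda1 and /dev/sdb1, both
+ * starting at offset 0.
+ */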
5730+static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5731+{
5732+ struct stripe_c *sc;
5733+ sector_t width;
5734+ uint32_t stripes;
5735+ uint32_t chunk_size;
5736+ char *end;
5737+ int r;
5738+ unsigned int i;
5739+
5740+ if (argc < 2) {
5741+ ti->error = "dm-stripe: Not enough arguments";
5742+ return -EINVAL;
5743+ }
5744+
5745+ stripes = simple_strtoul(argv[0], &end, 10);
5746+ if (*end) {
5747+ ti->error = "dm-stripe: Invalid stripe count";
5748+ return -EINVAL;
5749+ }
5750+
5751+ chunk_size = simple_strtoul(argv[1], &end, 10);
5752+ if (*end) {
5753+ ti->error = "dm-stripe: Invalid chunk_size";
5754+ return -EINVAL;
5755+ }
5756+
5757+ /*
5758+ * chunk_size is a power of two
5759+ */
5760+ if (!chunk_size || (chunk_size & (chunk_size - 1))) {
5761+ ti->error = "dm-stripe: Invalid chunk size";
5762+ return -EINVAL;
5763+ }
5764+
5765+ if (!multiple(ti->len, stripes, &width)) {
5766+ ti->error = "dm-stripe: Target length not divisible by "
5767+ "number of stripes";
5768+ return -EINVAL;
5769+ }
5770+
5771+ /*
5772+ * Do we have enough arguments for that many stripes ?
5773+ */
5774+ if (argc != (2 + 2 * stripes)) {
5775+ ti->error = "dm-stripe: Not enough destinations specified";
5776+ return -EINVAL;
5777+ }
5778+
5779+ sc = alloc_context(stripes);
5780+ if (!sc) {
5781+ ti->error = "dm-stripe: Memory allocation for striped context "
5782+ "failed";
5783+ return -ENOMEM;
5784+ }
5785+
5786+ sc->stripes = stripes;
5787+ sc->stripe_width = width;
5788+
5789+ sc->chunk_mask = ((sector_t) chunk_size) - 1;
5790+ for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
5791+ chunk_size >>= 1;
5792+ sc->chunk_shift--;
5793+
5794+ /*
5795+ * Get the stripe destinations.
5796+ */
5797+ for (i = 0; i < stripes; i++) {
5798+ argv += 2;
5799+
5800+ r = get_stripe(ti, sc, i, argv);
5801+ if (r < 0) {
5802+ ti->error = "dm-stripe: Couldn't parse stripe "
5803+ "destination";
5804+ while (i--)
5805+ dm_put_device(ti, sc->stripe[i].dev);
5806+ kfree(sc);
5807+ return r;
5808+ }
5809+ }
5810+
5811+ ti->private = sc;
5812+ return 0;
5813+}
5814+
5815+static void stripe_dtr(struct dm_target *ti)
5816+{
5817+ unsigned int i;
5818+ struct stripe_c *sc = (struct stripe_c *) ti->private;
5819+
5820+ for (i = 0; i < sc->stripes; i++)
5821+ dm_put_device(ti, sc->stripe[i].dev);
5822+
5823+ kfree(sc);
5824+}
5825+
5826+static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5827+ union map_info *context)
5828+{
5829+ struct stripe_c *sc = (struct stripe_c *) ti->private;
5830+
5831+ sector_t offset = bh->b_rsector - ti->begin;
5832+ uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
5833+ uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
5834+ chunk = chunk / sc->stripes;
5835+
5836+ bh->b_rdev = sc->stripe[stripe].dev->dev;
5837+ bh->b_rsector = sc->stripe[stripe].physical_start +
5838+ (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
5839+ return 1;
5840+}
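+/*
+ * Worked example: with 2 stripes and a 64-sector chunk (chunk_shift == 6,
+ * chunk_mask == 63), an offset of 200 sectors into the target gives
+ * chunk 3, which lands on stripe 1 as that stripe's chunk 1, so the
+ * request is remapped to physical_start + 64 + 8 on that stripe's device.
+ */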
5841+
5842+static int stripe_status(struct dm_target *ti, status_type_t type,
5843+ char *result, unsigned int maxlen)
5844+{
5845+ struct stripe_c *sc = (struct stripe_c *) ti->private;
5846+ int offset;
5847+ unsigned int i;
5848+
5849+ switch (type) {
5850+ case STATUSTYPE_INFO:
5851+ result[0] = '\0';
5852+ break;
5853+
5854+ case STATUSTYPE_TABLE:
5855+ offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
5856+ sc->stripes, sc->chunk_mask + 1);
5857+ for (i = 0; i < sc->stripes; i++) {
5858+ offset +=
5859+ snprintf(result + offset, maxlen - offset,
5860+ " %s " SECTOR_FORMAT,
5861+ dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
5862+ sc->stripe[i].physical_start);
5863+ }
5864+ break;
5865+ }
5866+ return 0;
5867+}
5868+
5869+static struct target_type stripe_target = {
5870+ .name = "striped",
5871+ .module = THIS_MODULE,
5872+ .ctr = stripe_ctr,
5873+ .dtr = stripe_dtr,
5874+ .map = stripe_map,
5875+ .status = stripe_status,
5876+};
5877+
5878+int __init dm_stripe_init(void)
5879+{
5880+ int r;
5881+
5882+ r = dm_register_target(&stripe_target);
5883+ if (r < 0)
5884+ DMWARN("striped target registration failed");
5885+
5886+ return r;
5887+}
5888+
5889+void dm_stripe_exit(void)
5890+{
5891+ if (dm_unregister_target(&stripe_target))
5892+ DMWARN("striped target unregistration failed");
5893+
5894+ return;
5895+}
5896diff -urN linux-2.4.24.org/drivers/md/dm-table.c linux-2.4.24/drivers/md/dm-table.c
5897--- linux-2.4.24.org/drivers/md/dm-table.c 1970-01-01 01:00:00.000000000 +0100
5898+++ linux-2.4.24/drivers/md/dm-table.c 2004-01-18 15:01:13.786710320 +0100
5899@@ -0,0 +1,696 @@
5900+/*
5901+ * Copyright (C) 2001 Sistina Software (UK) Limited.
5902+ *
5903+ * This file is released under the GPL.
5904+ */
5905+
5906+#include "dm.h"
5907+
5908+#include <linux/module.h>
5909+#include <linux/vmalloc.h>
5910+#include <linux/blkdev.h>
5911+#include <linux/ctype.h>
5912+#include <linux/slab.h>
5913+#include <asm/atomic.h>
5914+
5915+#define MAX_DEPTH 16
5916+#define NODE_SIZE L1_CACHE_BYTES
5917+#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
5918+#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
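+/*
+ * Assuming a 64-byte L1 cache line and an 8-byte sector_t, each btree
+ * node holds KEYS_PER_NODE == 8 keys and has CHILDREN_PER_NODE == 9
+ * children; the actual values depend on the architecture's cache line
+ * and sector_t sizes.
+ */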
5919+
5920+struct dm_table {
5921+ atomic_t holders;
5922+
5923+ /* btree table */
5924+ unsigned int depth;
5925+ unsigned int counts[MAX_DEPTH]; /* in nodes */
5926+ sector_t *index[MAX_DEPTH];
5927+
5928+ unsigned int num_targets;
5929+ unsigned int num_allocated;
5930+ sector_t *highs;
5931+ struct dm_target *targets;
5932+
5933+ /*
5934+ * Indicates the rw permissions for the new logical
5935+ * device. This should be a combination of FMODE_READ
5936+ * and FMODE_WRITE.
5937+ */
5938+ int mode;
5939+
5940+ /* a list of devices used by this table */
5941+ struct list_head devices;
5942+
5943+ /* events get handed up using this callback */
5944+ void (*event_fn)(void *);
5945+ void *event_context;
5946+};
5947+
5948+/*
5949+ * Similar to ceiling(log(n) / log(base))
5950+ */
5951+static unsigned int int_log(unsigned long n, unsigned long base)
5952+{
5953+ int result = 0;
5954+
5955+ while (n > 1) {
5956+ n = dm_div_up(n, base);
5957+ result++;
5958+ }
5959+
5960+ return result;
5961+}
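+/*
+ * For example, with base 9 (CHILDREN_PER_NODE when KEYS_PER_NODE is 8),
+ * int_log(200, 9) == 3: 200 -> 23 -> 3 -> 1 takes three rounds of
+ * dm_div_up(), matching the ceiling of log base 9 of 200.
+ */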
5962+
5963+/*
5964+ * Calculate the index of the child node of the n'th node's k'th key.
5965+ */
5966+static inline unsigned int get_child(unsigned int n, unsigned int k)
5967+{
5968+ return (n * CHILDREN_PER_NODE) + k;
5969+}
5970+
5971+/*
5972+ * Return the n'th node of level l from table t.
5973+ */
5974+static inline sector_t *get_node(struct dm_table *t, unsigned int l,
5975+ unsigned int n)
5976+{
5977+ return t->index[l] + (n * KEYS_PER_NODE);
5978+}
5979+
5980+/*
5981+ * Return the highest key that you could lookup from the n'th
5982+ * node on level l of the btree.
5983+ */
5984+static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
5985+{
5986+ for (; l < t->depth - 1; l++)
5987+ n = get_child(n, CHILDREN_PER_NODE - 1);
5988+
5989+ if (n >= t->counts[l])
5990+ return (sector_t) - 1;
5991+
5992+ return get_node(t, l, n)[KEYS_PER_NODE - 1];
5993+}
5994+
5995+/*
5996+ * Fills in a level of the btree based on the highs of the level
5997+ * below it.
5998+ */
5999+static int setup_btree_index(unsigned int l, struct dm_table *t)
6000+{
6001+ unsigned int n, k;
6002+ sector_t *node;
6003+
6004+ for (n = 0U; n < t->counts[l]; n++) {
6005+ node = get_node(t, l, n);
6006+
6007+ for (k = 0U; k < KEYS_PER_NODE; k++)
6008+ node[k] = high(t, l + 1, get_child(n, k));
6009+ }
6010+
6011+ return 0;
6012+}
6013+
6014+void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
6015+{
6016+ unsigned long size;
6017+ void *addr;
6018+
6019+ /*
6020+ * Check that we're not going to overflow.
6021+ */
6022+ if (nmemb > (ULONG_MAX / elem_size))
6023+ return NULL;
6024+
6025+ size = nmemb * elem_size;
6026+ addr = vmalloc(size);
6027+ if (addr)
6028+ memset(addr, 0, size);
6029+
6030+ return addr;
6031+}
6032+
6033+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
6034+{
6035+ struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
6036+
6037+ if (!t)
6038+ return -ENOMEM;
6039+
6040+ memset(t, 0, sizeof(*t));
6041+ INIT_LIST_HEAD(&t->devices);
6042+ atomic_set(&t->holders, 1);
6043+
6044+ num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
6045+
6046+ /* Allocate both the target array and offset array at once. */
6047+ t->highs = (sector_t *) dm_vcalloc(sizeof(struct dm_target) +
6048+ sizeof(sector_t), num_targets);
6049+ if (!t->highs) {
6050+ kfree(t);
6051+ return -ENOMEM;
6052+ }
6053+
6054+ memset(t->highs, -1, sizeof(*t->highs) * num_targets);
6055+
6056+ t->targets = (struct dm_target *) (t->highs + num_targets);
6057+ t->num_allocated = num_targets;
6058+ t->mode = mode;
6059+ *result = t;
6060+ return 0;
6061+}
6062+
6063+static void free_devices(struct list_head *devices)
6064+{
6065+ struct list_head *tmp, *next;
6066+
6067+ for (tmp = devices->next; tmp != devices; tmp = next) {
6068+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6069+ next = tmp->next;
6070+ kfree(dd);
6071+ }
6072+}
6073+
6074+void table_destroy(struct dm_table *t)
6075+{
6076+ unsigned int i;
6077+
6078+ /* free the indexes (see dm_table_complete) */
6079+ if (t->depth >= 2)
6080+ vfree(t->index[t->depth - 2]);
6081+
6082+ /* free the targets */
6083+ for (i = 0; i < t->num_targets; i++) {
6084+ struct dm_target *tgt = t->targets + i;
6085+
6086+ if (tgt->type->dtr)
6087+ tgt->type->dtr(tgt);
6088+
6089+ dm_put_target_type(tgt->type);
6090+ }
6091+
6092+ vfree(t->highs);
6093+
6094+ /* free the device list */
6095+ if (t->devices.next != &t->devices) {
6096+ DMWARN("devices still present during destroy: "
6097+ "dm_table_remove_device calls missing");
6098+
6099+ free_devices(&t->devices);
6100+ }
6101+
6102+ kfree(t);
6103+}
6104+
6105+void dm_table_get(struct dm_table *t)
6106+{
6107+ atomic_inc(&t->holders);
6108+}
6109+
6110+void dm_table_put(struct dm_table *t)
6111+{
6112+ if (atomic_dec_and_test(&t->holders))
6113+ table_destroy(t);
6114+}
6115+
6116+/*
6117+ * Convert a device path to a dev_t.
6118+ */
6119+static int lookup_device(const char *path, kdev_t *dev)
6120+{
6121+ int r;
6122+ struct nameidata nd;
6123+ struct inode *inode;
6124+
6125+ if (!path_init(path, LOOKUP_FOLLOW, &nd))
6126+ return 0;
6127+
6128+ if ((r = path_walk(path, &nd)))
6129+ goto out;
6130+
6131+ inode = nd.dentry->d_inode;
6132+ if (!inode) {
6133+ r = -ENOENT;
6134+ goto out;
6135+ }
6136+
6137+ if (!S_ISBLK(inode->i_mode)) {
6138+ r = -ENOTBLK;
6139+ goto out;
6140+ }
6141+
6142+ *dev = inode->i_rdev;
6143+
6144+ out:
6145+ path_release(&nd);
6146+ return r;
6147+}
6148+
6149+/*
6150+ * See if we've already got a device in the list.
6151+ */
6152+static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6153+{
6154+ struct list_head *tmp;
6155+
6156+ list_for_each(tmp, l) {
6157+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6158+ if (kdev_same(dd->dev, dev))
6159+ return dd;
6160+ }
6161+
6162+ return NULL;
6163+}
6164+
6165+/*
6166+ * Open a device so we can use it as a map destination.
6167+ */
6168+static int open_dev(struct dm_dev *dd)
6169+{
6170+ if (dd->bdev)
6171+ BUG();
6172+
6173+ dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6174+ if (!dd->bdev)
6175+ return -ENOMEM;
6176+
6177+ return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6178+}
6179+
6180+/*
6181+ * Close a device that we've been using.
6182+ */
6183+static void close_dev(struct dm_dev *dd)
6184+{
6185+ if (!dd->bdev)
6186+ return;
6187+
6188+ blkdev_put(dd->bdev, BDEV_RAW);
6189+ dd->bdev = NULL;
6190+}
6191+
6192+/*
6193+ * If possible (i.e. blk_size[major] is set), this checks that an area
6194+ * of a destination device is valid.
6195+ */
6196+static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6197+{
6198+ int *sizes;
6199+ sector_t dev_size;
6200+
6201+ if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6202+ /* we don't know the device details,
6203+ * so give the benefit of the doubt */
6204+ return 1;
6205+
6206+ /* convert to 512-byte sectors */
6207+ dev_size <<= 1;
6208+
6209+ return ((start < dev_size) && (len <= (dev_size - start)));
6210+}
6211+
6212+/*
6213+ * This upgrades the mode on an already open dm_dev. Being
6214+ * careful to leave things as they were if we fail to reopen the
6215+ * device.
6216+ */
6217+static int upgrade_mode(struct dm_dev *dd, int new_mode)
6218+{
6219+ int r;
6220+ struct dm_dev dd_copy;
6221+
6222+ memcpy(&dd_copy, dd, sizeof(dd_copy));
6223+
6224+ dd->mode |= new_mode;
6225+ dd->bdev = NULL;
6226+ r = open_dev(dd);
6227+ if (!r)
6228+ close_dev(&dd_copy);
6229+ else
6230+ memcpy(dd, &dd_copy, sizeof(dd_copy));
6231+
6232+ return r;
6233+}
6234+
6235+/*
6236+ * Add a device to the list, or just increment the usage count if
6237+ * it's already present.
6238+ */
6239+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6240+ sector_t len, int mode, struct dm_dev **result)
6241+{
6242+ int r;
6243+ kdev_t dev;
6244+ struct dm_dev *dd;
6245+ unsigned major, minor;
6246+ struct dm_table *t = ti->table;
6247+
6248+ if (!t)
6249+ BUG();
6250+
6251+ if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6252+ /* Extract the major/minor numbers */
6253+ dev = mk_kdev(major, minor);
6254+ } else {
6255+ /* convert the path to a device */
6256+ if ((r = lookup_device(path, &dev)))
6257+ return r;
6258+ }
6259+
6260+ dd = find_device(&t->devices, dev);
6261+ if (!dd) {
6262+ dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6263+ if (!dd)
6264+ return -ENOMEM;
6265+
6266+ dd->dev = dev;
6267+ dd->mode = mode;
6268+ dd->bdev = NULL;
6269+
6270+ if ((r = open_dev(dd))) {
6271+ kfree(dd);
6272+ return r;
6273+ }
6274+
6275+ atomic_set(&dd->count, 0);
6276+ list_add(&dd->list, &t->devices);
6277+
6278+ } else if (dd->mode != (mode | dd->mode)) {
6279+ r = upgrade_mode(dd, mode);
6280+ if (r)
6281+ return r;
6282+ }
6283+ atomic_inc(&dd->count);
6284+
6285+ if (!check_device_area(dd->dev, start, len)) {
6286+ DMWARN("device %s too small for target", path);
6287+ dm_put_device(ti, dd);
6288+ return -EINVAL;
6289+ }
6290+
6291+ *result = dd;
6292+
6293+ return 0;
6294+}
6295+
6296+/*
6297+ * Decrement a device's use count and remove it if necessary.
6298+ */
6299+void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6300+{
6301+ if (atomic_dec_and_test(&dd->count)) {
6302+ close_dev(dd);
6303+ list_del(&dd->list);
6304+ kfree(dd);
6305+ }
6306+}
6307+
6308+/*
6309+ * Checks to see if the target joins onto the end of the table.
6310+ */
6311+static int adjoin(struct dm_table *table, struct dm_target *ti)
6312+{
6313+ struct dm_target *prev;
6314+
6315+ if (!table->num_targets)
6316+ return !ti->begin;
6317+
6318+ prev = &table->targets[table->num_targets - 1];
6319+ return (ti->begin == (prev->begin + prev->len));
6320+}
6321+
6322+/*
6323+ * Used to dynamically allocate the arg array.
6324+ */
6325+static char **realloc_argv(unsigned *array_size, char **old_argv)
6326+{
6327+ char **argv;
6328+ unsigned new_size;
6329+
6330+ new_size = *array_size ? *array_size * 2 : 64;
6331+ argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
6332+ if (argv) {
6333+ memcpy(argv, old_argv, *array_size * sizeof(*argv));
6334+ *array_size = new_size;
6335+ }
6336+
6337+ kfree(old_argv);
6338+ return argv;
6339+}
6340+
6341+/*
6342+ * Destructively splits up the argument list to pass to ctr.
6343+ */
6344+static int split_args(int *argc, char ***argvp, char *input)
6345+{
6346+ char *start, *end = input, *out, **argv = NULL;
6347+ unsigned array_size = 0;
6348+
6349+ *argc = 0;
6350+ argv = realloc_argv(&array_size, argv);
6351+ if (!argv)
6352+ return -ENOMEM;
6353+
6354+ while (1) {
6355+ start = end;
6356+
6357+ /* Skip whitespace */
6358+ while (*start && isspace(*start))
6359+ start++;
6360+
6361+ if (!*start)
6362+ break; /* success, we hit the end */
6363+
6364+ /* 'out' is used to strip any backslash escapes */
6365+ end = out = start;
6366+ while (*end) {
6367+ /* Everything apart from '\0' can be quoted */
6368+ if (*end == '\\' && *(end + 1)) {
6369+ *out++ = *(end + 1);
6370+ end += 2;
6371+ continue;
6372+ }
6373+
6374+ if (isspace(*end))
6375+ break; /* end of token */
6376+
6377+ *out++ = *end++;
6378+ }
6379+
6380+ /* have we already filled the array ? */
6381+ if ((*argc + 1) > array_size) {
6382+ argv = realloc_argv(&array_size, argv);
6383+ if (!argv)
6384+ return -ENOMEM;
6385+ }
6386+
6387+ /* we know this is whitespace */
6388+ if (*end)
6389+ end++;
6390+
6391+ /* terminate the string and put it in the array */
6392+ *out = '\0';
6393+ argv[*argc] = start;
6394+ (*argc)++;
6395+ }
6396+
6397+ *argvp = argv;
6398+ return 0;
6399+}
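+/*
+ * For example, the (hypothetical) parameter string "0 /dev/hda1 two\ words"
+ * splits into argc == 3 with argv == { "0", "/dev/hda1", "two words" }:
+ * whitespace separates arguments and a backslash escapes the character
+ * that follows it.
+ */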
6400+
6401+int dm_table_add_target(struct dm_table *t, const char *type,
6402+ sector_t start, sector_t len, char *params)
6403+{
6404+ int r = -EINVAL, argc;
6405+ char **argv;
6406+ struct dm_target *tgt;
6407+
6408+ if (t->num_targets >= t->num_allocated)
6409+ return -ENOMEM;
6410+
6411+ tgt = t->targets + t->num_targets;
6412+ memset(tgt, 0, sizeof(*tgt));
6413+
6414+ tgt->type = dm_get_target_type(type);
6415+ if (!tgt->type) {
6416+ tgt->error = "unknown target type";
6417+ return -EINVAL;
6418+ }
6419+
6420+ tgt->table = t;
6421+ tgt->begin = start;
6422+ tgt->len = len;
6423+ tgt->error = "Unknown error";
6424+
6425+ /*
6426+ * Does this target adjoin the previous one ?
6427+ */
6428+ if (!adjoin(t, tgt)) {
6429+ tgt->error = "Gap in table";
6430+ r = -EINVAL;
6431+ goto bad;
6432+ }
6433+
6434+ r = split_args(&argc, &argv, params);
6435+ if (r) {
6436+ tgt->error = "couldn't split parameters (insufficient memory)";
6437+ goto bad;
6438+ }
6439+
6440+ r = tgt->type->ctr(tgt, argc, argv);
6441+ kfree(argv);
6442+ if (r)
6443+ goto bad;
6444+
6445+ t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
6446+ return 0;
6447+
6448+ bad:
6449+ printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
6450+ dm_put_target_type(tgt->type);
6451+ return r;
6452+}
6453+
6454+static int setup_indexes(struct dm_table *t)
6455+{
6456+ int i;
6457+ unsigned int total = 0;
6458+ sector_t *indexes;
6459+
6460+ /* allocate the space for *all* the indexes */
6461+ for (i = t->depth - 2; i >= 0; i--) {
6462+ t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
6463+ total += t->counts[i];
6464+ }
6465+
6466+ indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
6467+ if (!indexes)
6468+ return -ENOMEM;
6469+
6470+ /* set up internal nodes, bottom-up */
6471+ for (i = t->depth - 2, total = 0; i >= 0; i--) {
6472+ t->index[i] = indexes;
6473+ indexes += (KEYS_PER_NODE * t->counts[i]);
6474+ setup_btree_index(i, t);
6475+ }
6476+
6477+ return 0;
6478+}
6479+
6480+/*
6481+ * Builds the btree to index the map.
6482+ */
6483+int dm_table_complete(struct dm_table *t)
6484+{
6485+ int r = 0;
6486+ unsigned int leaf_nodes;
6487+
6488+ /* how many indexes will the btree have ? */
6489+ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
6490+ t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
6491+
6492+ /* leaf layer has already been set up */
6493+ t->counts[t->depth - 1] = leaf_nodes;
6494+ t->index[t->depth - 1] = t->highs;
6495+
6496+ if (t->depth >= 2)
6497+ r = setup_indexes(t);
6498+
6499+ return r;
6500+}
6501+
6502+static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
6503+void dm_table_event_callback(struct dm_table *t,
6504+ void (*fn)(void *), void *context)
6505+{
6506+ spin_lock_irq(&_event_lock);
6507+ t->event_fn = fn;
6508+ t->event_context = context;
6509+ spin_unlock_irq(&_event_lock);
6510+}
6511+
6512+void dm_table_event(struct dm_table *t)
6513+{
6514+ spin_lock(&_event_lock);
6515+ if (t->event_fn)
6516+ t->event_fn(t->event_context);
6517+ spin_unlock(&_event_lock);
6518+}
6519+
6520+sector_t dm_table_get_size(struct dm_table *t)
6521+{
6522+ return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
6523+}
6524+
6525+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
6526+{
6527+ if (index >= t->num_targets)
6528+ return NULL;
6529+
6530+ return t->targets + index;
6531+}
6532+
6533+/*
6534+ * Search the btree for the correct target.
6535+ */
6536+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
6537+{
6538+ unsigned int l, n = 0, k = 0;
6539+ sector_t *node;
6540+
6541+ for (l = 0; l < t->depth; l++) {
6542+ n = get_child(n, k);
6543+ node = get_node(t, l, n);
6544+
6545+ for (k = 0; k < KEYS_PER_NODE; k++)
6546+ if (node[k] >= sector)
6547+ break;
6548+ }
6549+
6550+ return &t->targets[(KEYS_PER_NODE * n) + k];
6551+}
6552+
6553+unsigned int dm_table_get_num_targets(struct dm_table *t)
6554+{
6555+ return t->num_targets;
6556+}
6557+
6558+struct list_head *dm_table_get_devices(struct dm_table *t)
6559+{
6560+ return &t->devices;
6561+}
6562+
6563+int dm_table_get_mode(struct dm_table *t)
6564+{
6565+ return t->mode;
6566+}
6567+
6568+void dm_table_suspend_targets(struct dm_table *t)
6569+{
6570+ int i;
6571+
6572+ for (i = 0; i < t->num_targets; i++) {
6573+ struct dm_target *ti = t->targets + i;
6574+
6575+ if (ti->type->suspend)
6576+ ti->type->suspend(ti);
6577+ }
6578+}
6579+
6580+void dm_table_resume_targets(struct dm_table *t)
6581+{
6582+ int i;
6583+
6584+ for (i = 0; i < t->num_targets; i++) {
6585+ struct dm_target *ti = t->targets + i;
6586+
6587+ if (ti->type->resume)
6588+ ti->type->resume(ti);
6589+ }
6590+}
6591+
6592+EXPORT_SYMBOL(dm_get_device);
6593+EXPORT_SYMBOL(dm_put_device);
6594+EXPORT_SYMBOL(dm_table_event);
6595+EXPORT_SYMBOL(dm_table_get_mode);
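
The btree built by dm_table_complete() above exists only to speed up dm_table_find_target(): conceptually the table is just the sorted t->highs array, where entry i holds the last sector served by target i. The standalone userspace sketch below is an illustration only (not part of the patch; all names and the example geometry are made up) and performs the same mapping with a plain linear scan, whereas the in-kernel version walks one node per level so at most KEYS_PER_NODE keys are compared per level.

/*
 * Illustration of the "highs" mapping used by dm-table.c above.
 * The first entry >= the requested sector identifies the target.
 */
#include <stdio.h>

static unsigned int find_target(const unsigned long *highs,
                                unsigned int num_targets,
                                unsigned long sector)
{
        unsigned int i;

        for (i = 0; i < num_targets; i++)
                if (highs[i] >= sector)
                        break;

        return i;               /* == num_targets if out of range */
}

int main(void)
{
        /* three targets covering sectors 0-99, 100-1023, 1024-4095 */
        unsigned long highs[] = { 99, 1023, 4095 };

        printf("sector 50   -> target %u\n", find_target(highs, 3, 50));
        printf("sector 2048 -> target %u\n", find_target(highs, 3, 2048));
        return 0;
}
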
6596diff -urN linux-2.4.24.org/drivers/md/dm-target.c linux-2.4.24/drivers/md/dm-target.c
6597--- linux-2.4.24.org/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100
6598+++ linux-2.4.24/drivers/md/dm-target.c 2004-01-18 15:01:13.789709690 +0100
6599@@ -0,0 +1,188 @@
6600+/*
6601+ * Copyright (C) 2001 Sistina Software (UK) Limited
6602+ *
6603+ * This file is released under the GPL.
6604+ */
6605+
6606+#include "dm.h"
6607+
6608+#include <linux/module.h>
6609+#include <linux/kmod.h>
6610+#include <linux/slab.h>
6611+
6612+struct tt_internal {
6613+ struct target_type tt;
6614+
6615+ struct list_head list;
6616+ long use;
6617+};
6618+
6619+static LIST_HEAD(_targets);
6620+static DECLARE_RWSEM(_lock);
6621+
6622+#define DM_MOD_NAME_SIZE 32
6623+
6624+static inline struct tt_internal *__find_target_type(const char *name)
6625+{
6626+ struct list_head *tih;
6627+ struct tt_internal *ti;
6628+
6629+ list_for_each(tih, &_targets) {
6630+ ti = list_entry(tih, struct tt_internal, list);
6631+
6632+ if (!strcmp(name, ti->tt.name))
6633+ return ti;
6634+ }
6635+
6636+ return NULL;
6637+}
6638+
6639+static struct tt_internal *get_target_type(const char *name)
6640+{
6641+ struct tt_internal *ti;
6642+
6643+ down_read(&_lock);
6644+ ti = __find_target_type(name);
6645+
6646+ if (ti) {
6647+ if (ti->use == 0 && ti->tt.module)
6648+ __MOD_INC_USE_COUNT(ti->tt.module);
6649+ ti->use++;
6650+ }
6651+ up_read(&_lock);
6652+
6653+ return ti;
6654+}
6655+
6656+static void load_module(const char *name)
6657+{
6658+ char module_name[DM_MOD_NAME_SIZE] = "dm-";
6659+
6660+ /* Length check for strcat() below */
6661+ if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
6662+ return;
6663+
6664+ strcat(module_name, name);
6665+ request_module(module_name);
6666+}
6667+
6668+struct target_type *dm_get_target_type(const char *name)
6669+{
6670+ struct tt_internal *ti = get_target_type(name);
6671+
6672+ if (!ti) {
6673+ load_module(name);
6674+ ti = get_target_type(name);
6675+ }
6676+
6677+ return ti ? &ti->tt : NULL;
6678+}
6679+
6680+void dm_put_target_type(struct target_type *t)
6681+{
6682+ struct tt_internal *ti = (struct tt_internal *) t;
6683+
6684+ down_read(&_lock);
6685+ if (--ti->use == 0 && ti->tt.module)
6686+ __MOD_DEC_USE_COUNT(ti->tt.module);
6687+
6688+ if (ti->use < 0)
6689+ BUG();
6690+ up_read(&_lock);
6691+
6692+ return;
6693+}
6694+
6695+static struct tt_internal *alloc_target(struct target_type *t)
6696+{
6697+ struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
6698+
6699+ if (ti) {
6700+ memset(ti, 0, sizeof(*ti));
6701+ ti->tt = *t;
6702+ }
6703+
6704+ return ti;
6705+}
6706+
6707+int dm_register_target(struct target_type *t)
6708+{
6709+ int rv = 0;
6710+ struct tt_internal *ti = alloc_target(t);
6711+
6712+ if (!ti)
6713+ return -ENOMEM;
6714+
6715+ down_write(&_lock);
6716+ if (__find_target_type(t->name)) {
6717+ kfree(ti);
6718+ rv = -EEXIST;
6719+ } else
6720+ list_add(&ti->list, &_targets);
6721+
6722+ up_write(&_lock);
6723+ return rv;
6724+}
6725+
6726+int dm_unregister_target(struct target_type *t)
6727+{
6728+ struct tt_internal *ti;
6729+
6730+ down_write(&_lock);
6731+ if (!(ti = __find_target_type(t->name))) {
6732+ up_write(&_lock);
6733+ return -EINVAL;
6734+ }
6735+
6736+ if (ti->use) {
6737+ up_write(&_lock);
6738+ return -ETXTBSY;
6739+ }
6740+
6741+ list_del(&ti->list);
6742+ kfree(ti);
6743+
6744+ up_write(&_lock);
6745+ return 0;
6746+}
6747+
6748+/*
6749+ * io-err: always fails an io, useful for bringing
6750+ * up LVs that have holes in them.
6751+ */
6752+static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
6753+{
6754+ return 0;
6755+}
6756+
6757+static void io_err_dtr(struct dm_target *ti)
6758+{
6759+ /* empty */
6760+}
6761+
6762+static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6763+ union map_info *map_context)
6764+{
6765+ return -EIO;
6766+}
6767+
6768+static struct target_type error_target = {
6769+ .name = "error",
6770+ .ctr = io_err_ctr,
6771+ .dtr = io_err_dtr,
6772+ .map = io_err_map,
6773+};
6774+
6775+int dm_target_init(void)
6776+{
6777+ return dm_register_target(&error_target);
6778+}
6779+
6780+void dm_target_exit(void)
6781+{
6782+ if (dm_unregister_target(&error_target))
6783+ DMWARN("error target unregistration failed");
6784+}
6785+
6786+EXPORT_SYMBOL(dm_register_target);
6787+EXPORT_SYMBOL(dm_unregister_target);
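
dm-target.c above registers the built-in "error" target and autoloads other targets by prepending "dm-" to the requested name. A hypothetical out-of-tree target would follow the same pattern; the sketch below is illustrative only (the "myerror" name and module are invented here) and simply mirrors the error target so that the registration boilerplate, including the module naming that load_module() relies on, stands out. The object would need to be built as dm-myerror.o for autoloading to find it.

/*
 * Hypothetical target module, for illustration: behaves like the
 * built-in "error" target but is registered from a loadable module.
 */
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/device-mapper.h>

static int myerror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        return 0;                       /* no arguments to parse */
}

static void myerror_dtr(struct dm_target *ti)
{
}

static int myerror_map(struct dm_target *ti, struct buffer_head *bh,
                       int rw, union map_info *map_context)
{
        return -EIO;                    /* fail every io, like "error" */
}

static struct target_type myerror_target = {
        .name   = "myerror",
        .module = THIS_MODULE,          /* lets get_target_type() pin us */
        .ctr    = myerror_ctr,
        .dtr    = myerror_dtr,
        .map    = myerror_map,
};

static int __init myerror_init(void)
{
        return dm_register_target(&myerror_target);
}

static void __exit myerror_exit(void)
{
        dm_unregister_target(&myerror_target);
}

module_init(myerror_init);
module_exit(myerror_exit);
MODULE_LICENSE("GPL");
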
6788diff -urN linux-2.4.24.org/drivers/md/kcopyd.c linux-2.4.24/drivers/md/kcopyd.c
6789--- linux-2.4.24.org/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100
6790+++ linux-2.4.24/drivers/md/kcopyd.c 2004-01-18 15:01:25.797189646 +0100
6791@@ -0,0 +1,666 @@
6792+/*
6793+ * Copyright (C) 2002 Sistina Software (UK) Limited.
6794+ *
6795+ * This file is released under the GPL.
6796+ */
6797+
6798+#include <asm/atomic.h>
6799+
6800+#include <linux/blkdev.h>
6801+#include <linux/config.h>
6802+#include <linux/device-mapper.h>
6803+#include <linux/fs.h>
6804+#include <linux/init.h>
6805+#include <linux/list.h>
6806+#include <linux/locks.h>
6807+#include <linux/mempool.h>
6808+#include <linux/module.h>
6809+#include <linux/pagemap.h>
6810+#include <linux/slab.h>
6811+#include <linux/vmalloc.h>
6812+
6813+#include "kcopyd.h"
6814+#include "dm-daemon.h"
6815+
6816+/* FIXME: this is only needed for the DMERR macros */
6817+#include "dm.h"
6818+
6819+static struct dm_daemon _kcopyd;
6820+
6821+#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
6822+#define SUB_JOB_SIZE 128
6823+#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE)
6824+#define SUB_JOB_COUNT 8
6825+
6826+/*-----------------------------------------------------------------
6827+ * Each kcopyd client has its own little pool of preallocated
6828+ * pages for kcopyd io.
6829+ *---------------------------------------------------------------*/
6830+struct kcopyd_client {
6831+ struct list_head list;
6832+
6833+ spinlock_t lock;
6834+ struct list_head pages;
6835+ unsigned int nr_pages;
6836+ unsigned int nr_free_pages;
6837+ unsigned int max_split;
6838+};
6839+
6840+static inline void __push_page(struct kcopyd_client *kc, struct page *p)
6841+{
6842+ list_add(&p->list, &kc->pages);
6843+ kc->nr_free_pages++;
6844+}
6845+
6846+static inline struct page *__pop_page(struct kcopyd_client *kc)
6847+{
6848+ struct page *p;
6849+
6850+ p = list_entry(kc->pages.next, struct page, list);
6851+ list_del(&p->list);
6852+ kc->nr_free_pages--;
6853+
6854+ return p;
6855+}
6856+
6857+static int kcopyd_get_pages(struct kcopyd_client *kc,
6858+ unsigned int nr, struct list_head *pages)
6859+{
6860+ struct page *p;
6861+ INIT_LIST_HEAD(pages);
6862+
6863+ spin_lock(&kc->lock);
6864+ if (kc->nr_free_pages < nr) {
6865+ spin_unlock(&kc->lock);
6866+ return -ENOMEM;
6867+ }
6868+
6869+ while (nr--) {
6870+ p = __pop_page(kc);
6871+ list_add(&p->list, pages);
6872+ }
6873+ spin_unlock(&kc->lock);
6874+
6875+ return 0;
6876+}
6877+
6878+static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
6879+{
6880+ struct list_head *tmp, *tmp2;
6881+
6882+ spin_lock(&kc->lock);
6883+ list_for_each_safe (tmp, tmp2, pages)
6884+ __push_page(kc, list_entry(tmp, struct page, list));
6885+ spin_unlock(&kc->lock);
6886+}
6887+
6888+/*
6889+ * These three functions resize the page pool.
6890+ */
6891+static void release_pages(struct list_head *pages)
6892+{
6893+ struct page *p;
6894+ struct list_head *tmp, *tmp2;
6895+
6896+ list_for_each_safe (tmp, tmp2, pages) {
6897+ p = list_entry(tmp, struct page, list);
6898+ UnlockPage(p);
6899+ __free_page(p);
6900+ }
6901+}
6902+
6903+static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
6904+{
6905+ unsigned int i;
6906+ struct page *p;
6907+ LIST_HEAD(new);
6908+
6909+ for (i = 0; i < nr; i++) {
6910+ p = alloc_page(GFP_KERNEL);
6911+ if (!p) {
6912+ release_pages(&new);
6913+ return -ENOMEM;
6914+ }
6915+
6916+ LockPage(p);
6917+ list_add(&p->list, &new);
6918+ }
6919+
6920+ kcopyd_put_pages(kc, &new);
6921+ kc->nr_pages += nr;
6922+ kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB;
6923+ if (kc->max_split > SUB_JOB_COUNT)
6924+ kc->max_split = SUB_JOB_COUNT;
6925+
6926+ return 0;
6927+}
6928+
6929+static void client_free_pages(struct kcopyd_client *kc)
6930+{
6931+ BUG_ON(kc->nr_free_pages != kc->nr_pages);
6932+ release_pages(&kc->pages);
6933+ kc->nr_free_pages = kc->nr_pages = 0;
6934+}
6935+
6936+/*-----------------------------------------------------------------
6937+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
6938+ * for this reason we use a mempool to prevent the client from
6939+ * ever having to do io (which could cause a deadlock).
6940+ *---------------------------------------------------------------*/
6941+struct kcopyd_job {
6942+ struct kcopyd_client *kc;
6943+ struct list_head list;
6944+ unsigned int flags;
6945+
6946+ /*
6947+ * Error state of the job.
6948+ */
6949+ int read_err;
6950+ unsigned int write_err;
6951+
6952+ /*
6953+ * Either READ or WRITE
6954+ */
6955+ int rw;
6956+ struct io_region source;
6957+
6958+ /*
6959+ * The destinations for the transfer.
6960+ */
6961+ unsigned int num_dests;
6962+ struct io_region dests[KCOPYD_MAX_REGIONS];
6963+
6964+ sector_t offset;
6965+ unsigned int nr_pages;
6966+ struct list_head pages;
6967+
6968+ /*
6969+ * Set this to ensure you are notified when the job has
6970+ * completed. 'context' is for callback to use.
6971+ */
6972+ kcopyd_notify_fn fn;
6973+ void *context;
6974+
6975+ /*
6976+ * These fields are only used if the job has been split
6977+ * into more manageable parts.
6978+ */
6979+ struct semaphore lock;
6980+ atomic_t sub_jobs;
6981+ sector_t progress;
6982+};
6983+
6984+/* FIXME: this should scale with the number of pages */
6985+#define MIN_JOBS 512
6986+
6987+static kmem_cache_t *_job_cache;
6988+static mempool_t *_job_pool;
6989+
6990+/*
6991+ * We maintain three lists of jobs:
6992+ *
6993+ * i) jobs waiting for pages
6994+ * ii) jobs that have pages, and are waiting for the io to be issued.
6995+ * iii) jobs that have completed.
6996+ *
6997+ * All three of these are protected by job_lock.
6998+ */
6999+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
7000+
7001+static LIST_HEAD(_complete_jobs);
7002+static LIST_HEAD(_io_jobs);
7003+static LIST_HEAD(_pages_jobs);
7004+
7005+static int jobs_init(void)
7006+{
7007+ INIT_LIST_HEAD(&_complete_jobs);
7008+ INIT_LIST_HEAD(&_io_jobs);
7009+ INIT_LIST_HEAD(&_pages_jobs);
7010+
7011+ _job_cache = kmem_cache_create("kcopyd-jobs",
7012+ sizeof(struct kcopyd_job),
7013+ __alignof__(struct kcopyd_job),
7014+ 0, NULL, NULL);
7015+ if (!_job_cache)
7016+ return -ENOMEM;
7017+
7018+ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
7019+ mempool_free_slab, _job_cache);
7020+ if (!_job_pool) {
7021+ kmem_cache_destroy(_job_cache);
7022+ return -ENOMEM;
7023+ }
7024+
7025+ return 0;
7026+}
7027+
7028+static void jobs_exit(void)
7029+{
7030+ BUG_ON(!list_empty(&_complete_jobs));
7031+ BUG_ON(!list_empty(&_io_jobs));
7032+ BUG_ON(!list_empty(&_pages_jobs));
7033+
7034+ mempool_destroy(_job_pool);
7035+ kmem_cache_destroy(_job_cache);
7036+}
7037+
7038+/*
7039+ * Functions to push and pop a job onto the head of a given job
7040+ * list.
7041+ */
7042+static inline struct kcopyd_job *pop(struct list_head *jobs)
7043+{
7044+ struct kcopyd_job *job = NULL;
7045+ unsigned long flags;
7046+
7047+ spin_lock_irqsave(&_job_lock, flags);
7048+
7049+ if (!list_empty(jobs)) {
7050+ job = list_entry(jobs->next, struct kcopyd_job, list);
7051+ list_del(&job->list);
7052+ }
7053+ spin_unlock_irqrestore(&_job_lock, flags);
7054+
7055+ return job;
7056+}
7057+
7058+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
7059+{
7060+ unsigned long flags;
7061+
7062+ spin_lock_irqsave(&_job_lock, flags);
7063+ list_add_tail(&job->list, jobs);
7064+ spin_unlock_irqrestore(&_job_lock, flags);
7065+}
7066+
7067+/*
7068+ * These three functions process 1 item from the corresponding
7069+ * job list.
7070+ *
7071+ * They return:
7072+ * < 0: error
7073+ * 0: success
7074+ * > 0: can't process yet.
7075+ */
7076+static int run_complete_job(struct kcopyd_job *job)
7077+{
7078+ void *context = job->context;
7079+ int read_err = job->read_err;
7080+ unsigned int write_err = job->write_err;
7081+ kcopyd_notify_fn fn = job->fn;
7082+
7083+ kcopyd_put_pages(job->kc, &job->pages);
7084+ mempool_free(job, _job_pool);
7085+ fn(read_err, write_err, context);
7086+ return 0;
7087+}
7088+
7089+static void complete_io(unsigned int error, void *context)
7090+{
7091+ struct kcopyd_job *job = (struct kcopyd_job *) context;
7092+
7093+ if (error) {
7094+ if (job->rw == WRITE)
7095+ job->write_err |= error;
7096+ else
7097+ job->read_err = 1;
7098+
7099+ if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
7100+ push(&_complete_jobs, job);
7101+ dm_daemon_wake(&_kcopyd);
7102+ return;
7103+ }
7104+ }
7105+
7106+ if (job->rw == WRITE)
7107+ push(&_complete_jobs, job);
7108+
7109+ else {
7110+ job->rw = WRITE;
7111+ push(&_io_jobs, job);
7112+ }
7113+
7114+ dm_daemon_wake(&_kcopyd);
7115+}
7116+
7117+/*
7118+ * Request io on as many buffer heads as we can currently get for
7119+ * a particular job.
7120+ */
7121+static int run_io_job(struct kcopyd_job *job)
7122+{
7123+ int r;
7124+
7125+ if (job->rw == READ)
7126+ r = dm_io_async(1, &job->source, job->rw,
7127+ list_entry(job->pages.next, struct page, list),
7128+ job->offset, complete_io, job);
7129+
7130+ else
7131+ r = dm_io_async(job->num_dests, job->dests, job->rw,
7132+ list_entry(job->pages.next, struct page, list),
7133+ job->offset, complete_io, job);
7134+
7135+ return r;
7136+}
7137+
7138+static int run_pages_job(struct kcopyd_job *job)
7139+{
7140+ int r;
7141+
7142+ job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
7143+ SECTORS_PER_PAGE);
7144+ r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
7145+ if (!r) {
7146+ /* this job is ready for io */
7147+ push(&_io_jobs, job);
7148+ return 0;
7149+ }
7150+
7151+ if (r == -ENOMEM)
7152+ /* can't complete now */
7153+ return 1;
7154+
7155+ return r;
7156+}
7157+
7158+/*
7159+ * Run through a list for as long as possible. Returns the count
7160+ * of successful jobs.
7161+ */
7162+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
7163+{
7164+ struct kcopyd_job *job;
7165+ int r, count = 0;
7166+
7167+ while ((job = pop(jobs))) {
7168+
7169+ r = fn(job);
7170+
7171+ if (r < 0) {
7172+ /* error this rogue job */
7173+ if (job->rw == WRITE)
7174+ job->write_err = (unsigned int) -1;
7175+ else
7176+ job->read_err = 1;
7177+ push(&_complete_jobs, job);
7178+ break;
7179+ }
7180+
7181+ if (r > 0) {
7182+ /*
7183+ * We couldn't service this job ATM, so
7184+ * push this job back onto the list.
7185+ */
7186+ push(jobs, job);
7187+ break;
7188+ }
7189+
7190+ count++;
7191+ }
7192+
7193+ return count;
7194+}
7195+
7196+/*
7197+ * kcopyd does this every time it's woken up.
7198+ */
7199+static void do_work(void)
7200+{
7201+ /*
7202+ * The order that these are called is *very* important.
7203+ * complete jobs can free some pages for pages jobs.
7204+ * Pages jobs when successful will jump onto the io jobs
7205+ * list. io jobs call wake when they complete and it all
7206+ * starts again.
7207+ */
7208+ process_jobs(&_complete_jobs, run_complete_job);
7209+ process_jobs(&_pages_jobs, run_pages_job);
7210+ process_jobs(&_io_jobs, run_io_job);
7211+ run_task_queue(&tq_disk);
7212+}
7213+
7214+/*
7215+ * If we are copying a small region we just dispatch a single job
7216+ * to do the copy, otherwise the io has to be split up into many
7217+ * jobs.
7218+ */
7219+static void dispatch_job(struct kcopyd_job *job)
7220+{
7221+ push(&_pages_jobs, job);
7222+ dm_daemon_wake(&_kcopyd);
7223+}
7224+
7225+static void segment_complete(int read_err,
7226+ unsigned int write_err, void *context)
7227+{
7228+ /* FIXME: tidy this function */
7229+ sector_t progress = 0;
7230+ sector_t count = 0;
7231+ struct kcopyd_job *job = (struct kcopyd_job *) context;
7232+
7233+ down(&job->lock);
7234+
7235+ /* update the error */
7236+ if (read_err)
7237+ job->read_err = 1;
7238+
7239+ if (write_err)
7240+ job->write_err |= write_err;
7241+
7242+ /*
7243+ * Only dispatch more work if there hasn't been an error.
7244+ */
7245+ if ((!job->read_err && !job->write_err) ||
7246+ test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
7247+ /* get the next chunk of work */
7248+ progress = job->progress;
7249+ count = job->source.count - progress;
7250+ if (count) {
7251+ if (count > SUB_JOB_SIZE)
7252+ count = SUB_JOB_SIZE;
7253+
7254+ job->progress += count;
7255+ }
7256+ }
7257+ up(&job->lock);
7258+
7259+ if (count) {
7260+ int i;
7261+ struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
7262+
7263+ memcpy(sub_job, job, sizeof(*job));
7264+ sub_job->source.sector += progress;
7265+ sub_job->source.count = count;
7266+
7267+ for (i = 0; i < job->num_dests; i++) {
7268+ sub_job->dests[i].sector += progress;
7269+ sub_job->dests[i].count = count;
7270+ }
7271+
7272+ sub_job->fn = segment_complete;
7273+ sub_job->context = job;
7274+ dispatch_job(sub_job);
7275+
7276+ } else if (atomic_dec_and_test(&job->sub_jobs)) {
7277+
7278+ /*
7279+ * To avoid a race we must keep the job around
7280+ * until after the notify function has completed.
7281+ * Otherwise the client may try and stop the job
7282+ * after we've completed.
7283+ */
7284+ job->fn(read_err, write_err, job->context);
7285+ mempool_free(job, _job_pool);
7286+ }
7287+}
7288+
7289+/*
7290+ * Split the work into several little jobs that between
7291+ * them do the whole copy.
7292+ */
7293+static void split_job(struct kcopyd_job *job)
7294+{
7295+ int nr;
7296+
7297+ nr = dm_div_up(job->source.count, SUB_JOB_SIZE);
7298+ if (nr > job->kc->max_split)
7299+ nr = job->kc->max_split;
7300+
7301+ atomic_set(&job->sub_jobs, nr);
7302+ while (nr--)
7303+ segment_complete(0, 0u, job);
7304+}
7305+
7306+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
7307+ unsigned int num_dests, struct io_region *dests,
7308+ unsigned int flags, kcopyd_notify_fn fn, void *context)
7309+{
7310+ struct kcopyd_job *job;
7311+
7312+ /*
7313+ * Allocate a new job.
7314+ */
7315+ job = mempool_alloc(_job_pool, GFP_NOIO);
7316+
7317+ /*
7318+ * set up for the read.
7319+ */
7320+ job->kc = kc;
7321+ job->flags = flags;
7322+ job->read_err = 0;
7323+ job->write_err = 0;
7324+ job->rw = READ;
7325+
7326+ memcpy(&job->source, from, sizeof(*from));
7327+
7328+ job->num_dests = num_dests;
7329+ memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
7330+
7331+ job->offset = 0;
7332+ job->nr_pages = 0;
7333+ INIT_LIST_HEAD(&job->pages);
7334+
7335+ job->fn = fn;
7336+ job->context = context;
7337+
7338+ if (job->source.count < SUB_JOB_SIZE)
7339+ dispatch_job(job);
7340+
7341+ else {
7342+ init_MUTEX(&job->lock);
7343+ job->progress = 0;
7344+ split_job(job);
7345+ }
7346+
7347+ return 0;
7348+}
7349+
7350+/*
7351+ * Cancels a kcopyd job, eg. someone might be deactivating a
7352+ * mirror.
7353+ */
7354+int kcopyd_cancel(struct kcopyd_job *job, int block)
7355+{
7356+ /* FIXME: finish */
7357+ return -1;
7358+}
7359+
7360+/*-----------------------------------------------------------------
7361+ * Unit setup
7362+ *---------------------------------------------------------------*/
7363+static DECLARE_MUTEX(_client_lock);
7364+static LIST_HEAD(_clients);
7365+
7366+static int client_add(struct kcopyd_client *kc)
7367+{
7368+ down(&_client_lock);
7369+ list_add(&kc->list, &_clients);
7370+ up(&_client_lock);
7371+ return 0;
7372+}
7373+
7374+static void client_del(struct kcopyd_client *kc)
7375+{
7376+ down(&_client_lock);
7377+ list_del(&kc->list);
7378+ up(&_client_lock);
7379+}
7380+
7381+int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
7382+{
7383+ int r = 0;
7384+ struct kcopyd_client *kc;
7385+
7386+ if (nr_pages * SECTORS_PER_PAGE < SUB_JOB_SIZE) {
7387+ DMERR("kcopyd client requested %u pages: minimum is %lu",
7388+ nr_pages, SUB_JOB_SIZE / SECTORS_PER_PAGE);
7389+ return -ENOMEM;
7390+ }
7391+
7392+ kc = kmalloc(sizeof(*kc), GFP_KERNEL);
7393+ if (!kc)
7394+ return -ENOMEM;
7395+
7396+ kc->lock = SPIN_LOCK_UNLOCKED;
7397+ INIT_LIST_HEAD(&kc->pages);
7398+ kc->nr_pages = kc->nr_free_pages = 0;
7399+ r = client_alloc_pages(kc, nr_pages);
7400+ if (r) {
7401+ kfree(kc);
7402+ return r;
7403+ }
7404+
7405+ r = dm_io_get(nr_pages);
7406+ if (r) {
7407+ client_free_pages(kc);
7408+ kfree(kc);
7409+ return r;
7410+ }
7411+
7412+ r = client_add(kc);
7413+ if (r) {
7414+ dm_io_put(nr_pages);
7415+ client_free_pages(kc);
7416+ kfree(kc);
7417+ return r;
7418+ }
7419+
7420+ *result = kc;
7421+ return 0;
7422+}
7423+
7424+void kcopyd_client_destroy(struct kcopyd_client *kc)
7425+{
7426+ dm_io_put(kc->nr_pages);
7427+ client_free_pages(kc);
7428+ client_del(kc);
7429+ kfree(kc);
7430+}
7431+
7432+
7433+int __init kcopyd_init(void)
7434+{
7435+ int r;
7436+
7437+ r = jobs_init();
7438+ if (r)
7439+ return r;
7440+
7441+ r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
7442+ if (r)
7443+ jobs_exit();
7444+
7445+ return r;
7446+}
7447+
7448+void kcopyd_exit(void)
7449+{
7450+ jobs_exit();
7451+ dm_daemon_stop(&_kcopyd);
7452+}
7453+
7454+EXPORT_SYMBOL(kcopyd_client_create);
7455+EXPORT_SYMBOL(kcopyd_client_destroy);
7456+EXPORT_SYMBOL(kcopyd_copy);
7457+EXPORT_SYMBOL(kcopyd_cancel);
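
split_job() and segment_complete() above never issue one huge io: the source is consumed in SUB_JOB_SIZE-sector chunks, with at most kc->max_split sub-jobs outstanding, and each completing sub-job claims the next chunk via job->progress. The userspace sketch below reproduces just that chunking arithmetic; it is an illustration only, and the 1000-sector count is arbitrary.

/* Illustration of the sub-job chunking performed by kcopyd.c above. */
#include <stdio.h>

#define SUB_JOB_SIZE 128        /* sectors, as in kcopyd.c */

int main(void)
{
        unsigned long count = 1000;     /* sectors to copy (example) */
        unsigned long progress = 0;
        unsigned int chunks = 0;

        while (progress < count) {
                unsigned long c = count - progress;

                if (c > SUB_JOB_SIZE)
                        c = SUB_JOB_SIZE;
                printf("chunk %u: sectors %lu-%lu\n",
                       chunks, progress, progress + c - 1);
                progress += c;
                chunks++;
        }
        printf("%u sub-jobs in total\n", chunks);
        return 0;
}
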
7458diff -urN linux-2.4.24.org/drivers/md/kcopyd.h linux-2.4.24/drivers/md/kcopyd.h
7459--- linux-2.4.24.org/drivers/md/kcopyd.h 1970-01-01 01:00:00.000000000 +0100
7460+++ linux-2.4.24/drivers/md/kcopyd.h 2004-01-18 15:01:25.800189017 +0100
7461@@ -0,0 +1,47 @@
7462+/*
7463+ * Copyright (C) 2001 Sistina Software
7464+ *
7465+ * This file is released under the GPL.
7466+ */
7467+
7468+#ifndef DM_KCOPYD_H
7469+#define DM_KCOPYD_H
7470+
7471+/*
7472+ * Needed for the definition of sector_t.
7473+ */
7474+#include <linux/device-mapper.h>
7475+#include <linux/iobuf.h>
7476+
7477+#include "dm-io.h"
7478+
7479+int kcopyd_init(void);
7480+void kcopyd_exit(void);
7481+
7482+/* FIXME: make this configurable */
7483+#define KCOPYD_MAX_REGIONS 8
7484+
7485+#define KCOPYD_IGNORE_ERROR 1
7486+
7487+/*
7488+ * To use kcopyd you must first create a kcopyd client object.
7489+ */
7490+struct kcopyd_client;
7491+int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
7492+void kcopyd_client_destroy(struct kcopyd_client *kc);
7493+
7494+/*
7495+ * Submit a copy job to kcopyd. This is built on top of the
7496+ * previous three fns.
7497+ *
7498+ * read_err is a boolean,
7499+ * write_err is a bitset, with 1 bit for each destination region
7500+ */
7501+typedef void (*kcopyd_notify_fn)(int read_err,
7502+ unsigned int write_err, void *context);
7503+
7504+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
7505+ unsigned int num_dests, struct io_region *dests,
7506+ unsigned int flags, kcopyd_notify_fn fn, void *context);
7507+
7508+#endif
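
A minimal sketch of how a client might drive this interface (not part of the patch): struct io_region is declared in dm-io.h, which is not shown here, so the dev/sector/count members are assumed from the way kcopyd.c fills job->source and job->dests; the device numbers, page count and helper names are likewise invented.

/* Hypothetical client: copy the first 1024 sectors of one device to another. */
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/completion.h>

#include "dm-io.h"
#include "kcopyd.h"

static void my_copy_done(int read_err, unsigned int write_err, void *context)
{
        if (read_err || write_err)
                printk(KERN_ERR "copy failed\n");
        complete((struct completion *) context);
}

static int my_copy(kdev_t from_dev, kdev_t to_dev)
{
        struct kcopyd_client *kc;
        struct io_region from, to;
        struct completion done;
        int r;

        r = kcopyd_client_create(64, &kc);      /* 64 preallocated pages */
        if (r)
                return r;

        from.dev = from_dev;            /* assumed io_region layout */
        from.sector = 0;
        from.count = 1024;

        to.dev = to_dev;
        to.sector = 0;
        to.count = 1024;

        init_completion(&done);
        r = kcopyd_copy(kc, &from, 1, &to, 0, my_copy_done, &done);
        if (!r)
                wait_for_completion(&done);

        kcopyd_client_destroy(kc);
        return r;
}
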
7509diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
7510--- linux-2.4.24.org/drivers/md/Makefile 2004-01-18 14:58:09.300663064 +0100
7511+++ linux-2.4.24/drivers/md/Makefile 2004-01-18 15:01:29.209473819 +0100
7512@@ -4,24 +4,35 @@
7513
7514 O_TARGET := mddev.o
7515
7516-export-objs := md.o xor.o
7517-list-multi := lvm-mod.o
7518+export-objs := md.o xor.o dm-table.o dm-target.o dm.o dm-daemon.o \
7519+ kcopyd.o dm-io.o
7520+
7521+list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o
7522 lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o
7523+dm-mod-objs := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
7524+ dm-ioctl.o dm-daemon.o kcopyd.o dm-io.o dm-snapshot.o \
7525+ dm-exception-store.o
7526
7527 # Note: link order is important. All raid personalities
7528 # and xor.o must come before md.o, as they each initialise
7529 # themselves, and md.o may use the personalities when it
7530 # auto-initialised.
7531
7532-obj-$(CONFIG_MD_LINEAR) += linear.o
7533-obj-$(CONFIG_MD_RAID0) += raid0.o
7534-obj-$(CONFIG_MD_RAID1) += raid1.o
7535-obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
7536-obj-$(CONFIG_MD_MULTIPATH) += multipath.o
7537-obj-$(CONFIG_BLK_DEV_MD) += md.o
7538-obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
7539+obj-$(CONFIG_MD_LINEAR) += linear.o
7540+obj-$(CONFIG_MD_RAID0) += raid0.o
7541+obj-$(CONFIG_MD_RAID1) += raid1.o
7542+obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
7543+obj-$(CONFIG_MD_MULTIPATH) += multipath.o
7544+obj-$(CONFIG_BLK_DEV_MD) += md.o
7545+
7546+obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
7547+
7548+obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
7549
7550 include $(TOPDIR)/Rules.make
7551
7552 lvm-mod.o: $(lvm-mod-objs)
7553 $(LD) -r -o $@ $(lvm-mod-objs)
7554+
7555+dm-mod.o: $(dm-mod-objs)
7556+ $(LD) -r -o $@ $(dm-mod-objs)
7557diff -urN linux-2.4.24.org/include/linux/device-mapper.h linux-2.4.24/include/linux/device-mapper.h
7558--- linux-2.4.24.org/include/linux/device-mapper.h 1970-01-01 01:00:00.000000000 +0100
7559+++ linux-2.4.24/include/linux/device-mapper.h 2004-01-18 15:01:13.800707381 +0100
7560@@ -0,0 +1,104 @@
7561+/*
7562+ * Copyright (C) 2001 Sistina Software (UK) Limited.
7563+ *
7564+ * This file is released under the LGPL.
7565+ */
7566+
7567+#ifndef _LINUX_DEVICE_MAPPER_H
7568+#define _LINUX_DEVICE_MAPPER_H
7569+
7570+typedef unsigned long sector_t;
7571+
7572+struct dm_target;
7573+struct dm_table;
7574+struct dm_dev;
7575+
7576+typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
7577+
7578+union map_info {
7579+ void *ptr;
7580+ unsigned long long ll;
7581+};
7582+
7583+/*
7584+ * In the constructor the target parameter will already have the
7585+ * table, type, begin and len fields filled in.
7586+ */
7587+typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
7588+ char **argv);
7589+
7590+/*
7591+ * The destructor doesn't need to free the dm_target, just
7592+ * anything hidden ti->private.
7593+ */
7594+typedef void (*dm_dtr_fn) (struct dm_target * ti);
7595+
7596+/*
7597+ * The map function must return:
7598+ * < 0: error
7599+ * = 0: The target will handle the io by resubmitting it later
7600+ * > 0: simple remap complete
7601+ */
7602+typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
7603+ int rw, union map_info *map_context);
7604+
7605+/*
7606+ * Returns:
7607+ * < 0 : error (currently ignored)
7608+ * 0 : ended successfully
7609+ * 1 : for some reason the io has still not completed (eg,
7610+ * multipath target might want to requeue a failed io).
7611+ */
7612+typedef int (*dm_endio_fn) (struct dm_target * ti,
7613+ struct buffer_head * bh, int rw, int error,
7614+ union map_info *map_context);
7615+typedef void (*dm_suspend_fn) (struct dm_target *ti);
7616+typedef void (*dm_resume_fn) (struct dm_target *ti);
7617+typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
7618+ char *result, unsigned int maxlen);
7619+
7620+void dm_error(const char *message);
7621+
7622+/*
7623+ * Constructors should call these functions to ensure destination devices
7624+ * are opened/closed correctly.
7625+ * FIXME: too many arguments.
7626+ */
7627+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
7628+ sector_t len, int mode, struct dm_dev **result);
7629+void dm_put_device(struct dm_target *ti, struct dm_dev *d);
7630+
7631+/*
7632+ * Information about a target type
7633+ */
7634+struct target_type {
7635+ const char *name;
7636+ struct module *module;
7637+ dm_ctr_fn ctr;
7638+ dm_dtr_fn dtr;
7639+ dm_map_fn map;
7640+ dm_endio_fn end_io;
7641+ dm_suspend_fn suspend;
7642+ dm_resume_fn resume;
7643+ dm_status_fn status;
7644+};
7645+
7646+struct dm_target {
7647+ struct dm_table *table;
7648+ struct target_type *type;
7649+
7650+ /* target limits */
7651+ sector_t begin;
7652+ sector_t len;
7653+
7654+ /* target specific data */
7655+ void *private;
7656+
7657+ /* Used to provide an error string from the ctr */
7658+ char *error;
7659+};
7660+
7661+int dm_register_target(struct target_type *t);
7662+int dm_unregister_target(struct target_type *t);
7663+
7664+#endif /* _LINUX_DEVICE_MAPPER_H */
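
To make the ctr/map/dtr contract above concrete, here is a hypothetical stripped-down linear-style target (illustration only, not part of the patch). It uses only interfaces declared in this header plus standard 2.4 buffer_head remapping; the one assumption, marked in the code, is that struct dm_dev exposes the underlying device number as ->dev, since that structure is defined elsewhere. Registration of the target_type would follow the pattern shown after dm-target.c above.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/device-mapper.h>

struct passthrough_c {
        struct dm_dev *dev;
        sector_t start;
};

/* ctr takes a single argument: the path of the underlying device */
static int passthrough_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        struct passthrough_c *pc;

        if (argc != 1) {
                ti->error = "passthrough: exactly one argument required";
                return -EINVAL;
        }

        pc = kmalloc(sizeof(*pc), GFP_KERNEL);
        if (!pc) {
                ti->error = "passthrough: out of memory";
                return -ENOMEM;
        }
        pc->start = 0;          /* a real target would parse an offset */

        if (dm_get_device(ti, argv[0], ti->begin, ti->len,
                          FMODE_READ | FMODE_WRITE, &pc->dev)) {
                ti->error = "passthrough: device lookup failed";
                kfree(pc);
                return -ENXIO;
        }

        ti->private = pc;
        return 0;
}

static void passthrough_dtr(struct dm_target *ti)
{
        struct passthrough_c *pc = (struct passthrough_c *) ti->private;

        dm_put_device(ti, pc->dev);
        kfree(pc);
}

/* map: redirect the buffer head, then report "remap complete" (> 0) */
static int passthrough_map(struct dm_target *ti, struct buffer_head *bh,
                           int rw, union map_info *map_context)
{
        struct passthrough_c *pc = (struct passthrough_c *) ti->private;

        bh->b_rdev = pc->dev->dev;      /* ASSUMED member name */
        bh->b_rsector = pc->start + (bh->b_rsector - ti->begin);
        return 1;
}
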
7665diff -urN linux-2.4.24.org/include/linux/dm-ioctl.h linux-2.4.24/include/linux/dm-ioctl.h
7666--- linux-2.4.24.org/include/linux/dm-ioctl.h 1970-01-01 01:00:00.000000000 +0100
7667+++ linux-2.4.24/include/linux/dm-ioctl.h 2004-01-18 15:01:17.793869131 +0100
7668@@ -0,0 +1,237 @@
7669+/*
7670+ * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
7671+ *
7672+ * This file is released under the LGPL.
7673+ */
7674+
7675+#ifndef _LINUX_DM_IOCTL_H
7676+#define _LINUX_DM_IOCTL_H
7677+
7678+#include <linux/types.h>
7679+
7680+#define DM_DIR "mapper" /* Slashes not supported */
7681+#define DM_MAX_TYPE_NAME 16
7682+#define DM_NAME_LEN 128
7683+#define DM_UUID_LEN 129
7684+
7685+/*
7686+ * A traditional ioctl interface for the device mapper.
7687+ *
7688+ * Each device can have two tables associated with it, an
7689+ * 'active' table which is the one currently used by io passing
7690+ * through the device, and an 'inactive' one which is a table
7691+ * that is being prepared as a replacement for the 'active' one.
7692+ *
7693+ * DM_VERSION:
7694+ * Just get the version information for the ioctl interface.
7695+ *
7696+ * DM_REMOVE_ALL:
7697+ * Remove all dm devices, destroy all tables. Only really used
7698+ * for debug.
7699+ *
7700+ * DM_LIST_DEVICES:
7701+ * Get a list of all the dm device names.
7702+ *
7703+ * DM_DEV_CREATE:
7704+ * Create a new device, neither the 'active' or 'inactive' table
7705+ * slots will be filled. The device will be in suspended state
7706+ * after creation, however any io to the device will get errored
7707+ * since it will be out-of-bounds.
7708+ *
7709+ * DM_DEV_REMOVE:
7710+ * Remove a device, destroy any tables.
7711+ *
7712+ * DM_DEV_RENAME:
7713+ * Rename a device.
7714+ *
7715+ * DM_SUSPEND:
7716+ * This performs both suspend and resume, depending which flag is
7717+ * passed in.
7718+ * Suspend: This command will not return until all pending io to
7719+ * the device has completed. Further io will be deferred until
7720+ * the device is resumed.
7721+ * Resume: It is no longer an error to issue this command on an
7722+ * unsuspended device. If a table is present in the 'inactive'
7723+ * slot, it will be moved to the active slot, then the old table
7724+ * from the active slot will be _destroyed_. Finally the device
7725+ * is resumed.
7726+ *
7727+ * DM_DEV_STATUS:
7728+ * Retrieves the status for the table in the 'active' slot.
7729+ *
7730+ * DM_DEV_WAIT:
7731+ * Wait for a significant event to occur to the device. This
7732+ * could either be caused by an event triggered by one of the
7733+ * targets of the table in the 'active' slot, or a table change.
7734+ *
7735+ * DM_TABLE_LOAD:
7736+ * Load a table into the 'inactive' slot for the device. The
7737+ * device does _not_ need to be suspended prior to this command.
7738+ *
7739+ * DM_TABLE_CLEAR:
7740+ * Destroy any table in the 'inactive' slot (ie. abort).
7741+ *
7742+ * DM_TABLE_DEPS:
7743+ * Return a set of device dependencies for the 'active' table.
7744+ *
7745+ * DM_TABLE_STATUS:
7746+ * Return the targets status for the 'active' table.
7747+ */
7748+
7749+/*
7750+ * All ioctl arguments consist of a single chunk of memory, with
7751+ * this structure at the start. If a uuid is specified any
7752+ * lookup (eg. for a DM_INFO) will be done on that, *not* the
7753+ * name.
7754+ */
7755+struct dm_ioctl {
7756+ /*
7757+ * The version number is made up of three parts:
7758+ * major - no backward or forward compatibility,
7759+ * minor - only backwards compatible,
7760+ * patch - both backwards and forwards compatible.
7761+ *
7762+ * All clients of the ioctl interface should fill in the
7763+ * version number of the interface that they were
7764+ * compiled with.
7765+ *
7766+ * All recognised ioctl commands (ie. those that don't
7767+ * return -ENOTTY) fill out this field, even if the
7768+ * command failed.
7769+ */
7770+ uint32_t version[3]; /* in/out */
7771+ uint32_t data_size; /* total size of data passed in
7772+ * including this struct */
7773+
7774+ uint32_t data_start; /* offset to start of data
7775+ * relative to start of this struct */
7776+
7777+ uint32_t target_count; /* in/out */
7778+ int32_t open_count; /* out */
7779+ uint32_t flags; /* in/out */
7780+ uint32_t event_nr; /* in/out */
7781+ uint32_t padding;
7782+
7783+ uint64_t dev; /* in/out */
7784+
7785+ char name[DM_NAME_LEN]; /* device name */
7786+ char uuid[DM_UUID_LEN]; /* unique identifier for
7787+ * the block device */
7788+};
7789+
7790+/*
7791+ * Used to specify tables. These structures appear after the
7792+ * dm_ioctl.
7793+ */
7794+struct dm_target_spec {
7795+ uint64_t sector_start;
7796+ uint64_t length;
7797+ int32_t status; /* used when reading from kernel only */
7798+
7799+ /*
7800+ * Offset in bytes (from the start of this struct) to
7801+ * next target_spec.
7802+ */
7803+ uint32_t next;
7804+
7805+ char target_type[DM_MAX_TYPE_NAME];
7806+
7807+ /*
7808+ * Parameter string starts immediately after this object.
7809+ * Be careful to add padding after string to ensure correct
7810+ * alignment of subsequent dm_target_spec.
7811+ */
7812+};
7813+
7814+/*
7815+ * Used to retrieve the target dependencies.
7816+ */
7817+struct dm_target_deps {
7818+ uint32_t count; /* Array size */
7819+ uint32_t padding; /* unused */
7820+ uint64_t dev[0]; /* out */
7821+};
7822+
7823+/*
7824+ * Used to get a list of all dm devices.
7825+ */
7826+struct dm_name_list {
7827+ uint64_t dev;
7828+ uint32_t next; /* offset to the next record from
7829+ the _start_ of this */
7830+ char name[0];
7831+};
7832+
7833+/*
7834+ * If you change this make sure you make the corresponding change
7835+ * to dm-ioctl.c:lookup_ioctl()
7836+ */
7837+enum {
7838+ /* Top level cmds */
7839+ DM_VERSION_CMD = 0,
7840+ DM_REMOVE_ALL_CMD,
7841+ DM_LIST_DEVICES_CMD,
7842+
7843+ /* device level cmds */
7844+ DM_DEV_CREATE_CMD,
7845+ DM_DEV_REMOVE_CMD,
7846+ DM_DEV_RENAME_CMD,
7847+ DM_DEV_SUSPEND_CMD,
7848+ DM_DEV_STATUS_CMD,
7849+ DM_DEV_WAIT_CMD,
7850+
7851+ /* Table level cmds */
7852+ DM_TABLE_LOAD_CMD,
7853+ DM_TABLE_CLEAR_CMD,
7854+ DM_TABLE_DEPS_CMD,
7855+ DM_TABLE_STATUS_CMD,
7856+};
7857+
7858+#define DM_IOCTL 0xfd
7859+
7860+#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
7861+#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
7862+#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
7863+
7864+#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
7865+#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
7866+#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
7867+#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
7868+#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
7869+#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
7870+
7871+#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
7872+#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
7873+#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
7874+#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
7875+
7876+#define DM_VERSION_MAJOR 4
7877+#define DM_VERSION_MINOR 0
7878+#define DM_VERSION_PATCHLEVEL 5
7879+#define DM_VERSION_EXTRA "-ioctl (2003-11-18)"
7880+
7881+/* Status bits */
7882+#define DM_READONLY_FLAG (1 << 0) /* In/Out */
7883+#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */
7884+#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
7885+
7886+/*
7887+ * Flag passed into ioctl STATUS command to get table information
7888+ * rather than current status.
7889+ */
7890+#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */
7891+
7892+/*
7893+ * Flags that indicate whether a table is present in either of
7894+ * the two table slots that a device has.
7895+ */
7896+#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */
7897+#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
7898+
7899+/*
7900+ * Indicates that the buffer passed in wasn't big enough for the
7901+ * results.
7902+ */
7903+#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
7904+
7905+#endif /* _LINUX_DM_IOCTL_H */
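
The command descriptions above translate into very little userspace code. The sketch below (an illustration, not part of the patch) issues DM_VERSION, the simplest command: every request begins with a struct dm_ioctl carrying the caller's interface version and the total buffer size. The /dev/mapper/control path is an assumption derived from DM_DIR; it depends on how the control node was created on the system, and the program needs root privileges.

/* Hypothetical userspace query of the dm ioctl interface version. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl io;
        int fd = open("/dev/mapper/control", O_RDWR);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&io, 0, sizeof(io));
        io.version[0] = DM_VERSION_MAJOR;
        io.version[1] = DM_VERSION_MINOR;
        io.version[2] = DM_VERSION_PATCHLEVEL;
        io.data_size = sizeof(io);      /* no payload beyond the header */
        io.data_start = sizeof(io);

        if (ioctl(fd, DM_VERSION, &io) < 0) {
                perror("DM_VERSION");
                close(fd);
                return 1;
        }

        /* the kernel writes back the version it actually speaks */
        printf("dm ioctl interface %u.%u.%u\n",
               io.version[0], io.version[1], io.version[2]);

        close(fd);
        return 0;
}
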
7906diff -urN linux-2.4.24.org/include/linux/mempool.h linux-2.4.24/include/linux/mempool.h
7907--- linux-2.4.24.org/include/linux/mempool.h 1970-01-01 01:00:00.000000000 +0100
7908+++ linux-2.4.24/include/linux/mempool.h 2004-01-18 15:01:09.522605662 +0100
7909@@ -0,0 +1,31 @@
7910+/*
7911+ * memory buffer pool support
7912+ */
7913+#ifndef _LINUX_MEMPOOL_H
7914+#define _LINUX_MEMPOOL_H
7915+
7916+#include <linux/list.h>
7917+#include <linux/wait.h>
7918+
7919+struct mempool_s;
7920+typedef struct mempool_s mempool_t;
7921+
7922+typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
7923+typedef void (mempool_free_t)(void *element, void *pool_data);
7924+
7925+extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
7926+ mempool_free_t *free_fn, void *pool_data);
7927+extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
7928+extern void mempool_destroy(mempool_t *pool);
7929+extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
7930+extern void mempool_free(void *element, mempool_t *pool);
7931+
7932+/*
7933+ * A mempool_alloc_t and mempool_free_t that get the memory from
7934+ * a slab that is passed in through pool_data.
7935+ */
7936+void *mempool_alloc_slab(int gfp_mask, void *pool_data);
7937+void mempool_free_slab(void *element, void *pool_data);
7938+
7939+
7940+#endif /* _LINUX_MEMPOOL_H */
7941diff -urN linux-2.4.24.org/MAINTAINERS linux-2.4.24/MAINTAINERS
7942--- linux-2.4.24.org/MAINTAINERS 2004-01-18 14:59:47.570857618 +0100
7943+++ linux-2.4.24/MAINTAINERS 2004-01-18 15:01:13.766714518 +0100
7944@@ -581,6 +581,13 @@
7945 W: http://www.debian.org/~dz/i8k/
7946 S: Maintained
7947
7948+DEVICE MAPPER
7949+P: Joe Thornber
7950+M: dm@uk.sistina.com
7951+L: linux-LVM@sistina.com
7952+W: http://www.sistina.com/lvm
7953+S: Maintained
7954+
7955 DEVICE NUMBER REGISTRY
7956 P: H. Peter Anvin
7957 M: hpa@zytor.com
7958diff -urN linux-2.4.24.org/mm/Makefile linux-2.4.24/mm/Makefile
7959--- linux-2.4.24.org/mm/Makefile 2004-01-18 14:55:23.909936044 +0100
7960+++ linux-2.4.24/mm/Makefile 2004-01-18 15:01:09.497610911 +0100
7961@@ -9,12 +9,12 @@
7962
7963 O_TARGET := mm.o
7964
7965-export-objs := shmem.o filemap.o memory.o page_alloc.o
7966+export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
7967
7968 obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
7969 vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
7970 page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
7971- shmem.o
7972+ shmem.o mempool.o
7973
7974 obj-$(CONFIG_HIGHMEM) += highmem.o
7975 obj-$(CONFIG_PROC_MM) += proc_mm.o
7976diff -urN linux-2.4.24.org/mm/mempool.c linux-2.4.24/mm/mempool.c
7977--- linux-2.4.24.org/mm/mempool.c 1970-01-01 01:00:00.000000000 +0100
7978+++ linux-2.4.24/mm/mempool.c 2004-01-18 15:01:09.525605032 +0100
7979@@ -0,0 +1,299 @@
7980+/*
7981+ * linux/mm/mempool.c
7982+ *
7983+ * memory buffer pool support. Such pools are mostly used
7984+ * for guaranteed, deadlock-free memory allocations during
7985+ * extreme VM load.
7986+ *
7987+ * started by Ingo Molnar, Copyright (C) 2001
7988+ */
7989+
7990+#include <linux/mm.h>
7991+#include <linux/slab.h>
7992+#include <linux/module.h>
7993+#include <linux/mempool.h>
7994+
7995+struct mempool_s {
7996+ spinlock_t lock;
7997+ int min_nr; /* nr of elements at *elements */
7998+ int curr_nr; /* Current nr of elements at *elements */
7999+ void **elements;
8000+
8001+ void *pool_data;
8002+ mempool_alloc_t *alloc;
8003+ mempool_free_t *free;
8004+ wait_queue_head_t wait;
8005+};
8006+
8007+static void add_element(mempool_t *pool, void *element)
8008+{
8009+ BUG_ON(pool->curr_nr >= pool->min_nr);
8010+ pool->elements[pool->curr_nr++] = element;
8011+}
8012+
8013+static void *remove_element(mempool_t *pool)
8014+{
8015+ BUG_ON(pool->curr_nr <= 0);
8016+ return pool->elements[--pool->curr_nr];
8017+}
8018+
8019+static void free_pool(mempool_t *pool)
8020+{
8021+ while (pool->curr_nr) {
8022+ void *element = remove_element(pool);
8023+ pool->free(element, pool->pool_data);
8024+ }
8025+ kfree(pool->elements);
8026+ kfree(pool);
8027+}
8028+
8029+/**
8030+ * mempool_create - create a memory pool
8031+ * @min_nr: the minimum number of elements guaranteed to be
8032+ * allocated for this pool.
8033+ * @alloc_fn: user-defined element-allocation function.
8034+ * @free_fn: user-defined element-freeing function.
8035+ * @pool_data: optional private data available to the user-defined functions.
8036+ *
8037+ * this function creates and allocates a guaranteed size, preallocated
8038+ * memory pool. The pool can be used from the mempool_alloc and mempool_free
8039+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
8040+ * functions might sleep - as long as the mempool_alloc function is not called
8041+ * from IRQ contexts.
8042+ */
8043+mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
8044+ mempool_free_t *free_fn, void *pool_data)
8045+{
8046+ mempool_t *pool;
8047+
8048+ pool = kmalloc(sizeof(*pool), GFP_KERNEL);
8049+ if (!pool)
8050+ return NULL;
8051+ memset(pool, 0, sizeof(*pool));
8052+ pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
8053+ if (!pool->elements) {
8054+ kfree(pool);
8055+ return NULL;
8056+ }
8057+ spin_lock_init(&pool->lock);
8058+ pool->min_nr = min_nr;
8059+ pool->pool_data = pool_data;
8060+ init_waitqueue_head(&pool->wait);
8061+ pool->alloc = alloc_fn;
8062+ pool->free = free_fn;
8063+
8064+ /*
8065+ * First pre-allocate the guaranteed number of buffers.
8066+ */
8067+ while (pool->curr_nr < pool->min_nr) {
8068+ void *element;
8069+
8070+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
8071+ if (unlikely(!element)) {
8072+ free_pool(pool);
8073+ return NULL;
8074+ }
8075+ add_element(pool, element);
8076+ }
8077+ return pool;
8078+}
8079+
8080+/**
8081+ * mempool_resize - resize an existing memory pool
8082+ * @pool: pointer to the memory pool which was allocated via
8083+ * mempool_create().
8084+ * @new_min_nr: the new minimum number of elements guaranteed to be
8085+ * allocated for this pool.
8086+ * @gfp_mask: the usual allocation bitmask.
8087+ *
8088+ * This function shrinks/grows the pool. In the case of growing,
8089+ * it cannot be guaranteed that the pool will be grown to the new
8090+ * size immediately, but new mempool_free() calls will refill it.
8091+ *
8092+ * Note, the caller must guarantee that no mempool_destroy is called
8093+ * while this function is running. mempool_alloc() & mempool_free()
8094+ * might be called (eg. from IRQ contexts) while this function executes.
8095+ */
8096+int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
8097+{
8098+ void *element;
8099+ void **new_elements;
8100+ unsigned long flags;
8101+
8102+ BUG_ON(new_min_nr <= 0);
8103+
8104+ spin_lock_irqsave(&pool->lock, flags);
8105+ if (new_min_nr < pool->min_nr) {
8106+ while (pool->curr_nr > new_min_nr) {
8107+ element = remove_element(pool);
8108+ spin_unlock_irqrestore(&pool->lock, flags);
8109+ pool->free(element, pool->pool_data);
8110+ spin_lock_irqsave(&pool->lock, flags);
8111+ }
8112+ pool->min_nr = new_min_nr;
8113+ goto out_unlock;
8114+ }
8115+ spin_unlock_irqrestore(&pool->lock, flags);
8116+
8117+ /* Grow the pool */
8118+ new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
8119+ if (!new_elements)
8120+ return -ENOMEM;
8121+
8122+ spin_lock_irqsave(&pool->lock, flags);
8123+ memcpy(new_elements, pool->elements,
8124+ pool->curr_nr * sizeof(*new_elements));
8125+ kfree(pool->elements);
8126+ pool->elements = new_elements;
8127+ pool->min_nr = new_min_nr;
8128+
8129+ while (pool->curr_nr < pool->min_nr) {
8130+ spin_unlock_irqrestore(&pool->lock, flags);
8131+ element = pool->alloc(gfp_mask, pool->pool_data);
8132+ if (!element)
8133+ goto out;
8134+ spin_lock_irqsave(&pool->lock, flags);
8135+ if (pool->curr_nr < pool->min_nr)
8136+ add_element(pool, element);
8137+ else
8138+ kfree(element); /* Raced */
8139+ }
8140+out_unlock:
8141+ spin_unlock_irqrestore(&pool->lock, flags);
8142+out:
8143+ return 0;
8144+}
8145+
8146+/**
8147+ * mempool_destroy - deallocate a memory pool
8148+ * @pool: pointer to the memory pool which was allocated via
8149+ * mempool_create().
8150+ *
8151+ * this function only sleeps if the free_fn() function sleeps. The caller
8152+ * has to guarantee that all elements have been returned to the pool (ie:
8153+ * freed) prior to calling mempool_destroy().
8154+ */
8155+void mempool_destroy(mempool_t *pool)
8156+{
8157+ if (pool->curr_nr != pool->min_nr)
8158+ BUG(); /* There were outstanding elements */
8159+ free_pool(pool);
8160+}
8161+
8162+/**
8163+ * mempool_alloc - allocate an element from a specific memory pool
8164+ * @pool: pointer to the memory pool which was allocated via
8165+ * mempool_create().
8166+ * @gfp_mask: the usual allocation bitmask.
8167+ *
8168+ * this function only sleeps if the alloc_fn function sleeps or
8169+ * returns NULL. Note that due to preallocation, this function
8170+ * *never* fails when called from process contexts. (it might
8171+ * fail if called from an IRQ context.)
8172+ */
8173+void * mempool_alloc(mempool_t *pool, int gfp_mask)
8174+{
8175+ void *element;
8176+ unsigned long flags;
8177+ int curr_nr;
8178+ DECLARE_WAITQUEUE(wait, current);
8179+ int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
8180+
8181+repeat_alloc:
8182+ element = pool->alloc(gfp_nowait, pool->pool_data);
8183+ if (likely(element != NULL))
8184+ return element;
8185+
8186+ /*
8187+ * If the pool is less than 50% full then try harder
8188+ * to allocate an element:
8189+ */
8190+ if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
8191+ element = pool->alloc(gfp_mask, pool->pool_data);
8192+ if (likely(element != NULL))
8193+ return element;
8194+ }
8195+
8196+ /*
8197+ * Kick the VM at this point.
8198+ */
8199+ wakeup_bdflush();
8200+
8201+ spin_lock_irqsave(&pool->lock, flags);
8202+ if (likely(pool->curr_nr)) {
8203+ element = remove_element(pool);
8204+ spin_unlock_irqrestore(&pool->lock, flags);
8205+ return element;
8206+ }
8207+ spin_unlock_irqrestore(&pool->lock, flags);
8208+
8209+ /* We must not sleep in the GFP_ATOMIC case */
8210+ if (gfp_mask == gfp_nowait)
8211+ return NULL;
8212+
8213+ run_task_queue(&tq_disk);
8214+
8215+ add_wait_queue_exclusive(&pool->wait, &wait);
8216+ set_task_state(current, TASK_UNINTERRUPTIBLE);
8217+
8218+ spin_lock_irqsave(&pool->lock, flags);
8219+ curr_nr = pool->curr_nr;
8220+ spin_unlock_irqrestore(&pool->lock, flags);
8221+
8222+ if (!curr_nr)
8223+ schedule();
8224+
8225+ current->state = TASK_RUNNING;
8226+ remove_wait_queue(&pool->wait, &wait);
8227+
8228+ goto repeat_alloc;
8229+}
8230+
8231+/**
8232+ * mempool_free - return an element to the pool.
8233+ * @element: pool element pointer.
8234+ * @pool: pointer to the memory pool which was allocated via
8235+ * mempool_create().
8236+ *
8237+ * this function only sleeps if the free_fn() function sleeps.
8238+ */
8239+void mempool_free(void *element, mempool_t *pool)
8240+{
8241+ unsigned long flags;
8242+
8243+ if (pool->curr_nr < pool->min_nr) {
8244+ spin_lock_irqsave(&pool->lock, flags);
8245+ if (pool->curr_nr < pool->min_nr) {
8246+ add_element(pool, element);
8247+ spin_unlock_irqrestore(&pool->lock, flags);
8248+ wake_up(&pool->wait);
8249+ return;
8250+ }
8251+ spin_unlock_irqrestore(&pool->lock, flags);
8252+ }
8253+ pool->free(element, pool->pool_data);
8254+}
8255+
8256+/*
8257+ * A commonly used alloc and free fn.
8258+ */
8259+void *mempool_alloc_slab(int gfp_mask, void *pool_data)
8260+{
8261+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
8262+ return kmem_cache_alloc(mem, gfp_mask);
8263+}
8264+
8265+void mempool_free_slab(void *element, void *pool_data)
8266+{
8267+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
8268+ kmem_cache_free(mem, element);
8269+}
8270+
8271+
8272+EXPORT_SYMBOL(mempool_create);
8273+EXPORT_SYMBOL(mempool_resize);
8274+EXPORT_SYMBOL(mempool_destroy);
8275+EXPORT_SYMBOL(mempool_alloc);
8276+EXPORT_SYMBOL(mempool_free);
8277+EXPORT_SYMBOL(mempool_alloc_slab);
8278+EXPORT_SYMBOL(mempool_free_slab);
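
For reference, a compact sketch of the whole mempool lifecycle as a client would use it (hypothetical driver code, not part of the patch; the cache name, element type and pool size are invented). kcopyd.c above follows this same pattern for its job structures: a slab cache supplies the elements, allocations on the io path use GFP_NOIO so they block rather than fail in process context, and every element must be freed back before mempool_destroy().

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mempool.h>

struct my_io {
        int something;
};

static kmem_cache_t *my_cache;
static mempool_t *my_pool;

static int my_pool_init(void)
{
        my_cache = kmem_cache_create("my-io", sizeof(struct my_io),
                                     0, 0, NULL, NULL);
        if (!my_cache)
                return -ENOMEM;

        my_pool = mempool_create(16, mempool_alloc_slab,
                                 mempool_free_slab, my_cache);
        if (!my_pool) {
                kmem_cache_destroy(my_cache);
                return -ENOMEM;
        }
        return 0;
}

static void my_pool_exit(void)
{
        mempool_destroy(my_pool);       /* all elements must be back */
        kmem_cache_destroy(my_cache);
}

/* io path: sleeps rather than fails when called from process context */
static struct my_io *my_io_alloc(void)
{
        return mempool_alloc(my_pool, GFP_NOIO);
}

static void my_io_free(struct my_io *io)
{
        mempool_free(io, my_pool);
}
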