combined-linux-2.4.21-devmapper-ioctl.patch (marked obsolete in packages/kernel.git)
1--- linux-2.4.21/Documentation/Configure.help Fri Jun 13 16:32:30 2003
2+++ linux/Documentation/Configure.help Wed Aug 20 14:41:36 2003
3@@ -1839,6 +1839,20 @@
4 want), say M here and read <file:Documentation/modules.txt>. The
5 module will be called lvm-mod.o.
6
7+Device-mapper support
8+CONFIG_BLK_DEV_DM
9+ Device-mapper is a low level volume manager. It works by allowing
10+ people to specify mappings for ranges of logical sectors. Various
11+ mapping types are available, in addition people may write their own
12+ modules containing custom mappings if they wish.
13+
14+ Higher level volume managers such as LVM2 use this driver.
15+
16+ If you want to compile this as a module, say M here and read
17+ <file:Documentation/modules.txt>. The module will be called dm-mod.o.
18+
19+ If unsure, say N.
20+
21 Multiple devices driver support (RAID and LVM)
22 CONFIG_MD
23 Support multiple physical spindles through a single logical device.
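[Editorial note -- not part of the patch.] The help text added above is the one-paragraph description of what the rest of this patch implements: a device is an ordered table of logical-sector ranges, each range remapped onto some target. A minimal C sketch of that idea, using invented names purely for illustration:

	#include <stdint.h>
	#include <stddef.h>

	typedef uint64_t sector_ex_t;		/* 512-byte sectors */

	struct range_map {			/* one line of a mapping table */
		sector_ex_t start, len;		/* logical range [start, start+len) */
		const char *backing_dev;	/* an underlying block device */
		sector_ex_t dest_offset;	/* where the range lands on it */
	};

	/* Resolve a logical sector to (backing device, physical sector). */
	static const struct range_map *remap(const struct range_map *table, size_t n,
					     sector_ex_t sector, sector_ex_t *phys)
	{
		size_t i;

		for (i = 0; i < n; i++) {
			if (sector >= table[i].start &&
			    sector < table[i].start + table[i].len) {
				*phys = table[i].dest_offset + (sector - table[i].start);
				return &table[i];
			}
		}
		return NULL;			/* sector not mapped */
	}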
24--- linux-2.4.21/MAINTAINERS Fri Jun 13 16:32:30 2003
25+++ linux/MAINTAINERS Wed Aug 20 14:41:36 2003
26@@ -476,6 +476,13 @@
27 W: http://www.debian.org/~dz/i8k/
28 S: Maintained
29
30+DEVICE MAPPER
31+P: Joe Thornber
32+M: dm@uk.sistina.com
33+L: linux-LVM@sistina.com
34+W: http://www.sistina.com/lvm
35+S: Maintained
36+
37 DEVICE NUMBER REGISTRY
38 P: H. Peter Anvin
39 M: hpa@zytor.com
40--- linux-2.4.21/arch/mips64/kernel/ioctl32.c Fri Jan 10 16:34:18 2003
41+++ linux/arch/mips64/kernel/ioctl32.c Wed Aug 20 14:41:28 2003
42@@ -33,6 +33,7 @@
43 #include <linux/auto_fs.h>
44 #include <linux/ext2_fs.h>
45 #include <linux/raid/md_u.h>
46+#include <linux/dm-ioctl.h>
47
48 #include <scsi/scsi.h>
49 #undef __KERNEL__ /* This file was born to be ugly ... */
50@@ -914,6 +915,22 @@
51 IOCTL32_DEFAULT(STOP_ARRAY_RO),
52 IOCTL32_DEFAULT(RESTART_ARRAY_RW),
53 #endif /* CONFIG_MD */
54+
55+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
56+ IOCTL32_DEFAULT(DM_VERSION),
57+ IOCTL32_DEFAULT(DM_REMOVE_ALL),
58+ IOCTL32_DEFAULT(DM_DEV_CREATE),
59+ IOCTL32_DEFAULT(DM_DEV_REMOVE),
60+ IOCTL32_DEFAULT(DM_TABLE_LOAD),
61+ IOCTL32_DEFAULT(DM_DEV_SUSPEND),
62+ IOCTL32_DEFAULT(DM_DEV_RENAME),
63+ IOCTL32_DEFAULT(DM_TABLE_DEPS),
64+ IOCTL32_DEFAULT(DM_DEV_STATUS),
65+ IOCTL32_DEFAULT(DM_TABLE_STATUS),
66+ IOCTL32_DEFAULT(DM_DEV_WAIT),
67+ IOCTL32_DEFAULT(DM_LIST_DEVICES),
68+ IOCTL32_DEFAULT(DM_TABLE_CLEAR),
69+#endif /* CONFIG_BLK_DEV_DM */
70
71 IOCTL32_DEFAULT(MTIOCTOP), /* mtio.h ioctls */
72 IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans),
73--- linux-2.4.21/arch/parisc/kernel/ioctl32.c Fri Jun 13 16:32:32 2003
74+++ linux/arch/parisc/kernel/ioctl32.c Wed Aug 20 14:41:28 2003
75@@ -55,6 +55,7 @@
76 #define max max */
77 #include <linux/lvm.h>
78 #endif /* LVM */
79+#include <linux/dm-ioctl.h>
80
81 #include <scsi/scsi.h>
82 /* Ugly hack. */
83@@ -3418,6 +3419,22 @@
84 COMPATIBLE_IOCTL(LV_BMAP)
85 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
86 #endif /* LVM */
87+/* Device-Mapper */
88+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
89+COMPATIBLE_IOCTL(DM_VERSION)
90+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
91+COMPATIBLE_IOCTL(DM_DEV_CREATE)
92+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
93+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
94+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
95+COMPATIBLE_IOCTL(DM_DEV_RENAME)
96+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
97+COMPATIBLE_IOCTL(DM_DEV_STATUS)
98+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
99+COMPATIBLE_IOCTL(DM_DEV_WAIT)
100+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
101+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
102+#endif /* CONFIG_BLK_DEV_DM */
103 #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
104 COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
105 COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
106--- linux-2.4.21/arch/ppc64/kernel/ioctl32.c Fri Jun 13 16:32:33 2003
107+++ linux/arch/ppc64/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
108@@ -66,6 +66,7 @@
109 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
110 #include <linux/lvm.h>
111 #endif /* LVM */
112+#include <linux/dm-ioctl.h>
113
114 #include <scsi/scsi.h>
115 /* Ugly hack. */
116@@ -4423,6 +4424,22 @@
117 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
118 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
119 COMPATIBLE_IOCTL(NBD_DISCONNECT),
120+/* device-mapper */
121+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
122+COMPATIBLE_IOCTL(DM_VERSION),
123+COMPATIBLE_IOCTL(DM_REMOVE_ALL),
124+COMPATIBLE_IOCTL(DM_DEV_CREATE),
125+COMPATIBLE_IOCTL(DM_DEV_REMOVE),
126+COMPATIBLE_IOCTL(DM_TABLE_LOAD),
127+COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
128+COMPATIBLE_IOCTL(DM_DEV_RENAME),
129+COMPATIBLE_IOCTL(DM_TABLE_DEPS),
130+COMPATIBLE_IOCTL(DM_DEV_STATUS),
131+COMPATIBLE_IOCTL(DM_TABLE_STATUS),
132+COMPATIBLE_IOCTL(DM_DEV_WAIT),
133+COMPATIBLE_IOCTL(DM_LIST_DEVICES),
134+COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
135+#endif /* CONFIG_BLK_DEV_DM */
136 /* Remove *PRIVATE in 2.5 */
137 COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
138 COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
139--- linux-2.4.21/arch/s390x/kernel/ioctl32.c Fri Jan 10 16:34:26 2003
140+++ linux/arch/s390x/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
141@@ -25,6 +25,7 @@
142 #include <linux/ext2_fs.h>
143 #include <linux/hdreg.h>
144 #include <linux/if_bonding.h>
145+#include <linux/dm-ioctl.h>
146 #include <asm/types.h>
147 #include <asm/uaccess.h>
148 #include <asm/dasd.h>
149@@ -507,6 +508,20 @@
150 IOCTL32_DEFAULT(VT_UNLOCKSWITCH),
151
152 IOCTL32_DEFAULT(SIOCGSTAMP),
153+
154+ IOCTL32_DEFAULT(DM_VERSION),
155+ IOCTL32_DEFAULT(DM_REMOVE_ALL),
156+ IOCTL32_DEFAULT(DM_DEV_CREATE),
157+ IOCTL32_DEFAULT(DM_DEV_REMOVE),
158+ IOCTL32_DEFAULT(DM_TABLE_LOAD),
159+ IOCTL32_DEFAULT(DM_DEV_SUSPEND),
160+ IOCTL32_DEFAULT(DM_DEV_RENAME),
161+ IOCTL32_DEFAULT(DM_TABLE_DEPS),
162+ IOCTL32_DEFAULT(DM_DEV_STATUS),
163+ IOCTL32_DEFAULT(DM_TABLE_STATUS),
164+ IOCTL32_DEFAULT(DM_DEV_WAIT),
165+ IOCTL32_DEFAULT(DM_LIST_DEVICES),
166+ IOCTL32_DEFAULT(DM_TABLE_CLEAR),
167
168 IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32),
169 IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf),
170--- linux-2.4.21/arch/sparc64/kernel/ioctl32.c Fri Jun 13 16:32:34 2003
171+++ linux/arch/sparc64/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
172@@ -56,6 +56,7 @@
173 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
174 #include <linux/lvm.h>
175 #endif /* LVM */
176+#include <linux/dm-ioctl.h>
177
178 #include <scsi/scsi.h>
179 /* Ugly hack. */
180@@ -5076,6 +5077,22 @@
181 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
182 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
183 COMPATIBLE_IOCTL(NBD_DISCONNECT)
184+/* device-mapper */
185+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
186+COMPATIBLE_IOCTL(DM_VERSION)
187+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
188+COMPATIBLE_IOCTL(DM_DEV_CREATE)
189+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
190+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
191+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
192+COMPATIBLE_IOCTL(DM_DEV_RENAME)
193+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
194+COMPATIBLE_IOCTL(DM_DEV_STATUS)
195+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
196+COMPATIBLE_IOCTL(DM_DEV_WAIT)
197+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
198+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
199+#endif /* CONFIG_BLK_DEV_DM */
200 /* Linux-1394 */
201 #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
202 COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
203--- linux-2.4.21/arch/x86_64/ia32/ia32_ioctl.c Fri Jun 13 16:32:35 2003
204+++ linux/arch/x86_64/ia32/ia32_ioctl.c Wed Aug 20 14:41:29 2003
205@@ -67,6 +67,7 @@
206 #define max max
207 #include <linux/lvm.h>
208 #endif /* LVM */
209+#include <linux/dm-ioctl.h>
210
211 #include <scsi/scsi.h>
212 /* Ugly hack. */
213@@ -4047,6 +4048,22 @@
214 COMPATIBLE_IOCTL(LV_BMAP)
215 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
216 #endif /* LVM */
217+/* Device-Mapper */
218+#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
219+COMPATIBLE_IOCTL(DM_VERSION)
220+COMPATIBLE_IOCTL(DM_REMOVE_ALL)
221+COMPATIBLE_IOCTL(DM_DEV_CREATE)
222+COMPATIBLE_IOCTL(DM_DEV_REMOVE)
223+COMPATIBLE_IOCTL(DM_TABLE_LOAD)
224+COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
225+COMPATIBLE_IOCTL(DM_DEV_RENAME)
226+COMPATIBLE_IOCTL(DM_TABLE_DEPS)
227+COMPATIBLE_IOCTL(DM_DEV_STATUS)
228+COMPATIBLE_IOCTL(DM_TABLE_STATUS)
229+COMPATIBLE_IOCTL(DM_DEV_WAIT)
230+COMPATIBLE_IOCTL(DM_LIST_DEVICES)
231+COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
232+#endif /* CONFIG_BLK_DEV_DM */
233 #ifdef CONFIG_AUTOFS_FS
234 COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
235 COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
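[Editorial note -- not part of the patch.] All six 64-bit architectures above register the DM ioctls as plain pass-throughs (IOCTL32_DEFAULT / COMPATIBLE_IOCTL) rather than giving them translation handlers. That only works if the ioctl argument block has the same layout for 32-bit and 64-bit callers; the linux/dm-ioctl.h this patch adds (not shown in this excerpt) achieves that by using only fixed-width fields, explicit padding, and inline payloads instead of pointers. A sketch of the property being relied on, with illustrative field names:

	#include <stdint.h>

	struct dm_request_hdr_example {
		uint32_t version[3];
		uint32_t data_size;	/* total size of the block, header included */
		uint32_t data_start;	/* offset of the inline payload             */
		uint32_t pad;		/* keeps the next field 8-byte aligned even */
					/* on 32-bit ABIs, so both layouts match    */
		uint64_t dev;		/* fixed width -- never a long or a pointer */
		char     name[128];
	};

	/* 12 + 4 + 4 + 4 + 8 + 128 = 160 bytes on both ABIs, with no hidden
	 * padding, so the 64-bit kernel can take a 32-bit caller's block
	 * verbatim -- hence no per-ioctl translation code is needed.        */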
236--- linux-2.4.21/drivers/md/Config.in Fri Jan 10 16:34:50 2003
237+++ linux/drivers/md/Config.in Wed Aug 20 14:41:36 2003
238@@ -14,5 +14,7 @@
239 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
240
241 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
242+dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
243+dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
244
245 endmenu
246--- linux-2.4.21/drivers/md/Makefile Fri Jan 10 16:34:50 2003
247+++ linux/drivers/md/Makefile Wed Aug 20 14:41:44 2003
248@@ -4,24 +4,41 @@
249
250 O_TARGET := mddev.o
251
252-export-objs := md.o xor.o
253-list-multi := lvm-mod.o
254+export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \
255+ dm-log.o dm-io.o dm.o
256+
257+list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o
258 lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o
259+dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \
260+ dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
261+ kcopyd.o dm-daemon.o dm-io.o
262+dm-mirror-mod-objs := dm-raid1.o dm-log.o
263
264 # Note: link order is important. All raid personalities
265 # and xor.o must come before md.o, as they each initialise
266 # themselves, and md.o may use the personalities when it
267 # auto-initialised.
268
269-obj-$(CONFIG_MD_LINEAR) += linear.o
270-obj-$(CONFIG_MD_RAID0) += raid0.o
271-obj-$(CONFIG_MD_RAID1) += raid1.o
272-obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
273-obj-$(CONFIG_MD_MULTIPATH) += multipath.o
274-obj-$(CONFIG_BLK_DEV_MD) += md.o
275-obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
276+obj-$(CONFIG_MD_LINEAR) += linear.o
277+obj-$(CONFIG_MD_RAID0) += raid0.o
278+obj-$(CONFIG_MD_RAID1) += raid1.o
279+obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
280+obj-$(CONFIG_MD_MULTIPATH) += multipath.o
281+obj-$(CONFIG_BLK_DEV_MD) += md.o
282+
283+obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
284+
285+obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
286+obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
287
288 include $(TOPDIR)/Rules.make
289
290 lvm-mod.o: $(lvm-mod-objs)
291 $(LD) -r -o $@ $(lvm-mod-objs)
292+
293+dm-mod.o: $(dm-mod-objs)
294+ $(LD) -r -o $@ $(dm-mod-objs)
295+
296+dm-mirror.o: $(dm-mirror-mod-objs)
297+ $(LD) -r -o $@ $(dm-mirror-mod-objs)
298+
299--- linux-2.4.21/drivers/md/dm-daemon.c Thu Jan 1 01:00:00 1970
300+++ linux/drivers/md/dm-daemon.c Wed Aug 20 14:41:38 2003
301@@ -0,0 +1,113 @@
302+/*
303+ * Copyright (C) 2003 Sistina Software
304+ *
305+ * This file is released under the LGPL.
306+ */
307+
308+#include "dm.h"
309+#include "dm-daemon.h"
310+
311+#include <linux/module.h>
312+#include <linux/sched.h>
313+
314+static int daemon(void *arg)
315+{
316+ struct dm_daemon *dd = (struct dm_daemon *) arg;
317+ DECLARE_WAITQUEUE(wq, current);
318+
319+ daemonize();
320+ reparent_to_init();
321+
322+ /* block all signals */
323+ spin_lock_irq(&current->sigmask_lock);
324+ sigfillset(&current->blocked);
325+ flush_signals(current);
326+ spin_unlock_irq(&current->sigmask_lock);
327+
328+ strcpy(current->comm, dd->name);
329+ atomic_set(&dd->please_die, 0);
330+
331+ add_wait_queue(&dd->job_queue, &wq);
332+
333+ down(&dd->run_lock);
334+ up(&dd->start_lock);
335+
336+ /*
337+ * dd->fn() could do anything, very likely it will
338+ * suspend. So we can't set the state to
339+ * TASK_INTERRUPTIBLE before calling it. In order to
340+ * prevent a race with a waking thread we do this little
341+ * dance with the dd->woken variable.
342+ */
343+ while (1) {
344+ do {
345+ set_current_state(TASK_RUNNING);
346+
347+ if (atomic_read(&dd->please_die))
348+ goto out;
349+
350+ atomic_set(&dd->woken, 0);
351+ dd->fn();
352+ yield();
353+
354+ set_current_state(TASK_INTERRUPTIBLE);
355+ } while (atomic_read(&dd->woken));
356+
357+ schedule();
358+ }
359+
360+ out:
361+ remove_wait_queue(&dd->job_queue, &wq);
362+ up(&dd->run_lock);
363+ return 0;
364+}
365+
366+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
367+{
368+ pid_t pid = 0;
369+
370+ /*
371+ * Initialise the dm_daemon.
372+ */
373+ dd->fn = fn;
374+ strncpy(dd->name, name, sizeof(dd->name) - 1);
375+ sema_init(&dd->start_lock, 1);
376+ sema_init(&dd->run_lock, 1);
377+ init_waitqueue_head(&dd->job_queue);
378+
379+ /*
380+ * Start the new thread.
381+ */
382+ down(&dd->start_lock);
383+ pid = kernel_thread(daemon, dd, 0);
384+ if (pid <= 0) {
385+		DMERR("Failed to start %s thread", name);
386+ return -EAGAIN;
387+ }
388+
389+ /*
390+ * wait for the daemon to up this mutex.
391+ */
392+ down(&dd->start_lock);
393+ up(&dd->start_lock);
394+
395+ return 0;
396+}
397+
398+void dm_daemon_stop(struct dm_daemon *dd)
399+{
400+ atomic_set(&dd->please_die, 1);
401+ dm_daemon_wake(dd);
402+ down(&dd->run_lock);
403+ up(&dd->run_lock);
404+}
405+
406+void dm_daemon_wake(struct dm_daemon *dd)
407+{
408+ atomic_set(&dd->woken, 1);
409+ wake_up_interruptible(&dd->job_queue);
410+}
411+
412+EXPORT_SYMBOL(dm_daemon_start);
413+EXPORT_SYMBOL(dm_daemon_stop);
414+EXPORT_SYMBOL(dm_daemon_wake);
415--- linux-2.4.21/drivers/md/dm-daemon.h Thu Jan 1 01:00:00 1970
416+++ linux/drivers/md/dm-daemon.h Wed Aug 20 14:41:38 2003
417@@ -0,0 +1,29 @@
418+/*
419+ * Copyright (C) 2003 Sistina Software
420+ *
421+ * This file is released under the LGPL.
422+ */
423+
424+#ifndef DM_DAEMON_H
425+#define DM_DAEMON_H
426+
427+#include <asm/atomic.h>
428+#include <asm/semaphore.h>
429+
430+struct dm_daemon {
431+ void (*fn)(void);
432+ char name[16];
433+ atomic_t please_die;
434+ struct semaphore start_lock;
435+ struct semaphore run_lock;
436+
437+ atomic_t woken;
438+ wait_queue_head_t job_queue;
439+};
440+
441+int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
442+void dm_daemon_stop(struct dm_daemon *dd);
443+void dm_daemon_wake(struct dm_daemon *dd);
444+int dm_daemon_running(struct dm_daemon *dd);
445+
446+#endif
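[Editorial note -- not part of the patch.] dm-daemon.c/h above give DM subsystems a tiny worker-thread abstraction: dm_daemon_start() spawns the thread and returns once it is running, dm_daemon_wake() asks it to run the callback again, and dm_daemon_stop() waits for it to exit. A hedged usage sketch (kernel context assumed; the job-list details are invented, only the dm_daemon_* calls come from the header):

	#include "dm-daemon.h"

	static struct dm_daemon _example_daemon;

	static void process_jobs(void)
	{
		/* Drain the subsystem's own job list here.  This runs in the
		 * daemon thread each time somebody calls dm_daemon_wake().  */
	}

	static int example_init(void)
	{
		/* Blocks until the new kernel thread has started up. */
		return dm_daemon_start(&_example_daemon, "example", process_jobs);
	}

	static void example_queue_work(void)
	{
		/* ...link a job onto the subsystem's list, then poke the thread. */
		dm_daemon_wake(&_example_daemon);
	}

	static void example_exit(void)
	{
		/* Sets 'please_die', wakes the thread and waits for it to finish. */
		dm_daemon_stop(&_example_daemon);
	}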
447--- linux-2.4.21/drivers/md/dm-exception-store.c Thu Jan 1 01:00:00 1970
448+++ linux/drivers/md/dm-exception-store.c Wed Aug 20 14:41:38 2003
449@@ -0,0 +1,673 @@
450+/*
451+ * dm-snapshot.c
452+ *
453+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
454+ *
455+ * This file is released under the GPL.
456+ */
457+
458+#include "dm-snapshot.h"
459+#include "dm-io.h"
460+#include "kcopyd.h"
461+
462+#include <linux/mm.h>
463+#include <linux/pagemap.h>
464+#include <linux/vmalloc.h>
465+#include <linux/slab.h>
466+
467+/*-----------------------------------------------------------------
468+ * Persistent snapshots, by persistent we mean that the snapshot
469+ * will survive a reboot.
470+ *---------------------------------------------------------------*/
471+
472+/*
473+ * We need to store a record of which parts of the origin have
474+ * been copied to the snapshot device. The snapshot code
475+ * requires that we copy exception chunks to chunk aligned areas
476+ * of the COW store. It makes sense therefore, to store the
477+ * metadata in chunk size blocks.
478+ *
479+ * There is no backward or forward compatibility implemented,
480+ * snapshots with different disk versions than the kernel will
481+ * not be usable. It is expected that "lvcreate" will blank out
482+ * the start of a fresh COW device before calling the snapshot
483+ * constructor.
484+ *
485+ * The first chunk of the COW device just contains the header.
486+ * After this there is a chunk filled with exception metadata,
487+ * followed by as many exception chunks as can fit in the
488+ * metadata areas.
489+ *
490+ * All on disk structures are in little-endian format. The end
491+ * of the exceptions info is indicated by an exception with a
492+ * new_chunk of 0, which is invalid since it would point to the
493+ * header chunk.
494+ */
495+
496+/*
497+ * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
498+ */
499+#define SNAP_MAGIC 0x70416e53
500+
501+/*
502+ * The on-disk version of the metadata.
503+ */
504+#define SNAPSHOT_DISK_VERSION 1
505+
506+struct disk_header {
507+ uint32_t magic;
508+
509+ /*
510+ * Is this snapshot valid. There is no way of recovering
511+ * an invalid snapshot.
512+ */
513+ uint32_t valid;
514+
515+ /*
516+ * Simple, incrementing version. no backward
517+ * compatibility.
518+ */
519+ uint32_t version;
520+
521+ /* In sectors */
522+ uint32_t chunk_size;
523+};
524+
525+struct disk_exception {
526+ uint64_t old_chunk;
527+ uint64_t new_chunk;
528+};
529+
530+struct commit_callback {
531+ void (*callback)(void *, int success);
532+ void *context;
533+};
534+
535+/*
536+ * The top level structure for a persistent exception store.
537+ */
538+struct pstore {
539+ struct dm_snapshot *snap; /* up pointer to my snapshot */
540+ int version;
541+ int valid;
542+ uint32_t chunk_size;
543+ uint32_t exceptions_per_area;
544+
545+ /*
546+ * Now that we have an asynchronous kcopyd there is no
547+	 * need for large chunk sizes, so it won't hurt to have a
548+	 * whole chunk's worth of metadata in memory at once.
549+ */
550+ void *area;
551+
552+ /*
553+ * Used to keep track of which metadata area the data in
554+ * 'chunk' refers to.
555+ */
556+ uint32_t current_area;
557+
558+ /*
559+ * The next free chunk for an exception.
560+ */
561+ uint32_t next_free;
562+
563+ /*
564+ * The index of next free exception in the current
565+ * metadata area.
566+ */
567+ uint32_t current_committed;
568+
569+ atomic_t pending_count;
570+ uint32_t callback_count;
571+ struct commit_callback *callbacks;
572+};
573+
574+static inline unsigned int sectors_to_pages(unsigned int sectors)
575+{
576+ return sectors / (PAGE_SIZE / SECTOR_SIZE);
577+}
578+
579+static int alloc_area(struct pstore *ps)
580+{
581+ int r = -ENOMEM;
582+ size_t i, len, nr_pages;
583+ struct page *page, *last = NULL;
584+
585+ len = ps->chunk_size << SECTOR_SHIFT;
586+
587+ /*
588+ * Allocate the chunk_size block of memory that will hold
589+ * a single metadata area.
590+ */
591+ ps->area = vmalloc(len);
592+ if (!ps->area)
593+ return r;
594+
595+ nr_pages = sectors_to_pages(ps->chunk_size);
596+
597+ /*
598+ * We lock the pages for ps->area into memory since
599+ * they'll be doing a lot of io. We also chain them
600+ * together ready for dm-io.
601+ */
602+ for (i = 0; i < nr_pages; i++) {
603+ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
604+ LockPage(page);
605+ if (last)
606+ last->list.next = &page->list;
607+ last = page;
608+ }
609+
610+ return 0;
611+}
612+
613+static void free_area(struct pstore *ps)
614+{
615+ size_t i, nr_pages;
616+ struct page *page;
617+
618+ nr_pages = sectors_to_pages(ps->chunk_size);
619+ for (i = 0; i < nr_pages; i++) {
620+ page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
621+ page->list.next = NULL;
622+ UnlockPage(page);
623+ }
624+
625+ vfree(ps->area);
626+}
627+
628+/*
629+ * Read or write a chunk aligned and sized block of data from a device.
630+ */
631+static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
632+{
633+ struct io_region where;
634+ unsigned int bits;
635+
636+ where.dev = ps->snap->cow->dev;
637+ where.sector = ps->chunk_size * chunk;
638+ where.count = ps->chunk_size;
639+
640+ return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
641+}
642+
643+/*
644+ * Read or write a metadata area. Remembering to skip the first
645+ * chunk which holds the header.
646+ */
647+static int area_io(struct pstore *ps, uint32_t area, int rw)
648+{
649+ int r;
650+ uint32_t chunk;
651+
652+ /* convert a metadata area index to a chunk index */
653+ chunk = 1 + ((ps->exceptions_per_area + 1) * area);
654+
655+ r = chunk_io(ps, chunk, rw);
656+ if (r)
657+ return r;
658+
659+ ps->current_area = area;
660+ return 0;
661+}
662+
663+static int zero_area(struct pstore *ps, uint32_t area)
664+{
665+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
666+ return area_io(ps, area, WRITE);
667+}
668+
669+static int read_header(struct pstore *ps, int *new_snapshot)
670+{
671+ int r;
672+ struct disk_header *dh;
673+
674+ r = chunk_io(ps, 0, READ);
675+ if (r)
676+ return r;
677+
678+ dh = (struct disk_header *) ps->area;
679+
680+ if (le32_to_cpu(dh->magic) == 0) {
681+ *new_snapshot = 1;
682+
683+ } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
684+ *new_snapshot = 0;
685+ ps->valid = le32_to_cpu(dh->valid);
686+ ps->version = le32_to_cpu(dh->version);
687+ ps->chunk_size = le32_to_cpu(dh->chunk_size);
688+
689+ } else {
690+ DMWARN("Invalid/corrupt snapshot");
691+ r = -ENXIO;
692+ }
693+
694+ return r;
695+}
696+
697+static int write_header(struct pstore *ps)
698+{
699+ struct disk_header *dh;
700+
701+ memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
702+
703+ dh = (struct disk_header *) ps->area;
704+ dh->magic = cpu_to_le32(SNAP_MAGIC);
705+ dh->valid = cpu_to_le32(ps->valid);
706+ dh->version = cpu_to_le32(ps->version);
707+ dh->chunk_size = cpu_to_le32(ps->chunk_size);
708+
709+ return chunk_io(ps, 0, WRITE);
710+}
711+
712+/*
713+ * Access functions for the disk exceptions, these do the endian conversions.
714+ */
715+static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
716+{
717+ if (index >= ps->exceptions_per_area)
718+ return NULL;
719+
720+ return ((struct disk_exception *) ps->area) + index;
721+}
722+
723+static int read_exception(struct pstore *ps,
724+ uint32_t index, struct disk_exception *result)
725+{
726+ struct disk_exception *e;
727+
728+ e = get_exception(ps, index);
729+ if (!e)
730+ return -EINVAL;
731+
732+ /* copy it */
733+ result->old_chunk = le64_to_cpu(e->old_chunk);
734+ result->new_chunk = le64_to_cpu(e->new_chunk);
735+
736+ return 0;
737+}
738+
739+static int write_exception(struct pstore *ps,
740+ uint32_t index, struct disk_exception *de)
741+{
742+ struct disk_exception *e;
743+
744+ e = get_exception(ps, index);
745+ if (!e)
746+ return -EINVAL;
747+
748+ /* copy it */
749+ e->old_chunk = cpu_to_le64(de->old_chunk);
750+ e->new_chunk = cpu_to_le64(de->new_chunk);
751+
752+ return 0;
753+}
754+
755+/*
756+ * Registers the exceptions that are present in the current area.
757+ * 'full' is filled in to indicate if the area has been
758+ * filled.
759+ */
760+static int insert_exceptions(struct pstore *ps, int *full)
761+{
762+ int r;
763+ unsigned int i;
764+ struct disk_exception de;
765+
766+ /* presume the area is full */
767+ *full = 1;
768+
769+ for (i = 0; i < ps->exceptions_per_area; i++) {
770+ r = read_exception(ps, i, &de);
771+
772+ if (r)
773+ return r;
774+
775+ /*
776+ * If the new_chunk is pointing at the start of
777+ * the COW device, where the first metadata area
778+ * is we know that we've hit the end of the
779+ * exceptions. Therefore the area is not full.
780+ */
781+ if (de.new_chunk == 0LL) {
782+ ps->current_committed = i;
783+ *full = 0;
784+ break;
785+ }
786+
787+ /*
788+ * Keep track of the start of the free chunks.
789+ */
790+ if (ps->next_free <= de.new_chunk)
791+ ps->next_free = de.new_chunk + 1;
792+
793+ /*
794+ * Otherwise we add the exception to the snapshot.
795+ */
796+ r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
797+ if (r)
798+ return r;
799+ }
800+
801+ return 0;
802+}
803+
804+static int read_exceptions(struct pstore *ps)
805+{
806+ uint32_t area;
807+ int r, full = 1;
808+
809+ /*
810+	 * Keep reading chunks and inserting exceptions until
811+ * we find a partially full area.
812+ */
813+ for (area = 0; full; area++) {
814+ r = area_io(ps, area, READ);
815+ if (r)
816+ return r;
817+
818+ r = insert_exceptions(ps, &full);
819+ if (r)
820+ return r;
821+ }
822+
823+ return 0;
824+}
825+
826+static inline struct pstore *get_info(struct exception_store *store)
827+{
828+ return (struct pstore *) store->context;
829+}
830+
831+static void persistent_fraction_full(struct exception_store *store,
832+ sector_t *numerator, sector_t *denominator)
833+{
834+ *numerator = get_info(store)->next_free * store->snap->chunk_size;
835+ *denominator = get_dev_size(store->snap->cow->dev);
836+}
837+
838+static void persistent_destroy(struct exception_store *store)
839+{
840+ struct pstore *ps = get_info(store);
841+
842+ dm_io_put(sectors_to_pages(ps->chunk_size));
843+ vfree(ps->callbacks);
844+ free_area(ps);
845+ kfree(ps);
846+}
847+
848+static int persistent_read_metadata(struct exception_store *store)
849+{
850+ int r, new_snapshot;
851+ struct pstore *ps = get_info(store);
852+
853+ /*
854+ * Read the snapshot header.
855+ */
856+ r = read_header(ps, &new_snapshot);
857+ if (r)
858+ return r;
859+
860+ /*
861+ * Do we need to setup a new snapshot ?
862+ */
863+ if (new_snapshot) {
864+ r = write_header(ps);
865+ if (r) {
866+ DMWARN("write_header failed");
867+ return r;
868+ }
869+
870+ r = zero_area(ps, 0);
871+ if (r) {
872+ DMWARN("zero_area(0) failed");
873+ return r;
874+ }
875+
876+ } else {
877+ /*
878+ * Sanity checks.
879+ */
880+ if (!ps->valid) {
881+ DMWARN("snapshot is marked invalid");
882+ return -EINVAL;
883+ }
884+
885+ if (ps->version != SNAPSHOT_DISK_VERSION) {
886+ DMWARN("unable to handle snapshot disk version %d",
887+ ps->version);
888+ return -EINVAL;
889+ }
890+
891+ /*
892+ * Read the metadata.
893+ */
894+ r = read_exceptions(ps);
895+ if (r)
896+ return r;
897+ }
898+
899+ return 0;
900+}
901+
902+static int persistent_prepare(struct exception_store *store,
903+ struct exception *e)
904+{
905+ struct pstore *ps = get_info(store);
906+ uint32_t stride;
907+ sector_t size = get_dev_size(store->snap->cow->dev);
908+
909+ /* Is there enough room ? */
910+ if (size < ((ps->next_free + 1) * store->snap->chunk_size))
911+ return -ENOSPC;
912+
913+ e->new_chunk = ps->next_free;
914+
915+ /*
916+ * Move onto the next free pending, making sure to take
917+ * into account the location of the metadata chunks.
918+ */
919+ stride = (ps->exceptions_per_area + 1);
920+ if ((++ps->next_free % stride) == 1)
921+ ps->next_free++;
922+
923+ atomic_inc(&ps->pending_count);
924+ return 0;
925+}
926+
927+static void persistent_commit(struct exception_store *store,
928+ struct exception *e,
929+ void (*callback) (void *, int success),
930+ void *callback_context)
931+{
932+ int r;
933+ unsigned int i;
934+ struct pstore *ps = get_info(store);
935+ struct disk_exception de;
936+ struct commit_callback *cb;
937+
938+ de.old_chunk = e->old_chunk;
939+ de.new_chunk = e->new_chunk;
940+ write_exception(ps, ps->current_committed++, &de);
941+
942+ /*
943+ * Add the callback to the back of the array. This code
944+ * is the only place where the callback array is
945+ * manipulated, and we know that it will never be called
946+ * multiple times concurrently.
947+ */
948+ cb = ps->callbacks + ps->callback_count++;
949+ cb->callback = callback;
950+ cb->context = callback_context;
951+
952+ /*
953+ * If there are no more exceptions in flight, or we have
954+ * filled this metadata area we commit the exceptions to
955+ * disk.
956+ */
957+ if (atomic_dec_and_test(&ps->pending_count) ||
958+ (ps->current_committed == ps->exceptions_per_area)) {
959+ r = area_io(ps, ps->current_area, WRITE);
960+ if (r)
961+ ps->valid = 0;
962+
963+ for (i = 0; i < ps->callback_count; i++) {
964+ cb = ps->callbacks + i;
965+ cb->callback(cb->context, r == 0 ? 1 : 0);
966+ }
967+
968+ ps->callback_count = 0;
969+ }
970+
971+ /*
972+ * Have we completely filled the current area ?
973+ */
974+ if (ps->current_committed == ps->exceptions_per_area) {
975+ ps->current_committed = 0;
976+ r = zero_area(ps, ps->current_area + 1);
977+ if (r)
978+ ps->valid = 0;
979+ }
980+}
981+
982+static void persistent_drop(struct exception_store *store)
983+{
984+ struct pstore *ps = get_info(store);
985+
986+ ps->valid = 0;
987+ if (write_header(ps))
988+ DMWARN("write header failed");
989+}
990+
991+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
992+{
993+ int r;
994+ struct pstore *ps;
995+
996+ r = dm_io_get(sectors_to_pages(chunk_size));
997+ if (r)
998+ return r;
999+
1000+ /* allocate the pstore */
1001+ ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1002+ if (!ps) {
1003+ r = -ENOMEM;
1004+ goto bad;
1005+ }
1006+
1007+ ps->snap = store->snap;
1008+ ps->valid = 1;
1009+ ps->version = SNAPSHOT_DISK_VERSION;
1010+ ps->chunk_size = chunk_size;
1011+ ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
1012+ sizeof(struct disk_exception);
1013+ ps->next_free = 2; /* skipping the header and first area */
1014+ ps->current_committed = 0;
1015+
1016+ r = alloc_area(ps);
1017+ if (r)
1018+ goto bad;
1019+
1020+ /*
1021+ * Allocate space for all the callbacks.
1022+ */
1023+ ps->callback_count = 0;
1024+ atomic_set(&ps->pending_count, 0);
1025+ ps->callbacks = vcalloc(ps->exceptions_per_area,
1026+ sizeof(*ps->callbacks));
1027+
1028+ if (!ps->callbacks) {
1029+ r = -ENOMEM;
1030+ goto bad;
1031+ }
1032+
1033+ store->destroy = persistent_destroy;
1034+ store->read_metadata = persistent_read_metadata;
1035+ store->prepare_exception = persistent_prepare;
1036+ store->commit_exception = persistent_commit;
1037+ store->drop_snapshot = persistent_drop;
1038+ store->fraction_full = persistent_fraction_full;
1039+ store->context = ps;
1040+
1041+ return 0;
1042+
1043+ bad:
1044+ dm_io_put(sectors_to_pages(chunk_size));
1045+ if (ps) {
1046+ if (ps->callbacks)
1047+ vfree(ps->callbacks);
1048+
1049+ kfree(ps);
1050+ }
1051+ return r;
1052+}
1053+
1054+/*-----------------------------------------------------------------
1055+ * Implementation of the store for non-persistent snapshots.
1056+ *---------------------------------------------------------------*/
1057+struct transient_c {
1058+ sector_t next_free;
1059+};
1060+
1061+void transient_destroy(struct exception_store *store)
1062+{
1063+ kfree(store->context);
1064+}
1065+
1066+int transient_read_metadata(struct exception_store *store)
1067+{
1068+ return 0;
1069+}
1070+
1071+int transient_prepare(struct exception_store *store, struct exception *e)
1072+{
1073+ struct transient_c *tc = (struct transient_c *) store->context;
1074+ sector_t size = get_dev_size(store->snap->cow->dev);
1075+
1076+ if (size < (tc->next_free + store->snap->chunk_size))
1077+ return -1;
1078+
1079+ e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
1080+ tc->next_free += store->snap->chunk_size;
1081+
1082+ return 0;
1083+}
1084+
1085+void transient_commit(struct exception_store *store,
1086+ struct exception *e,
1087+ void (*callback) (void *, int success),
1088+ void *callback_context)
1089+{
1090+ /* Just succeed */
1091+ callback(callback_context, 1);
1092+}
1093+
1094+static void transient_fraction_full(struct exception_store *store,
1095+ sector_t *numerator, sector_t *denominator)
1096+{
1097+ *numerator = ((struct transient_c *) store->context)->next_free;
1098+ *denominator = get_dev_size(store->snap->cow->dev);
1099+}
1100+
1101+int dm_create_transient(struct exception_store *store,
1102+ struct dm_snapshot *s, int blocksize)
1103+{
1104+ struct transient_c *tc;
1105+
1106+ memset(store, 0, sizeof(*store));
1107+ store->destroy = transient_destroy;
1108+ store->read_metadata = transient_read_metadata;
1109+ store->prepare_exception = transient_prepare;
1110+ store->commit_exception = transient_commit;
1111+ store->fraction_full = transient_fraction_full;
1112+ store->snap = s;
1113+
1114+ tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
1115+ if (!tc)
1116+ return -ENOMEM;
1117+
1118+ tc->next_free = 0;
1119+ store->context = tc;
1120+
1121+ return 0;
1122+}
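[Editorial note -- not part of the patch.] The on-disk layout described at the top of dm-exception-store.c is easiest to see as arithmetic: chunk 0 holds the header, and every metadata chunk is followed by exceptions_per_area data chunks, which is exactly what area_io() and persistent_prepare() encode. A small user-space sketch of that arithmetic (the 8 KiB chunk size is just an example):

	#include <stdio.h>
	#include <stdint.h>

	#define SECTOR_SHIFT 9
	#define DISK_EXCEPTION_SIZE 16		/* two uint64_t per on-disk exception */

	int main(void)
	{
		uint32_t chunk_size = 16;	/* sectors, i.e. 8 KiB chunks */
		uint32_t per_area = (chunk_size << SECTOR_SHIFT) / DISK_EXCEPTION_SIZE;
		uint32_t stride = per_area + 1;	/* one metadata chunk plus its data chunks */
		uint32_t area;

		/* Same formula as area_io(): the chunk holding metadata area N. */
		for (area = 0; area < 3; area++)
			printf("metadata area %u -> chunk %u\n", area, 1 + stride * area);

		/* persistent_prepare() starts handing out data chunks at 2 and
		 * skips every chunk c with (c % stride) == 1, i.e. it never
		 * gives out the metadata chunks computed above.              */
		return 0;
	}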
1123--- linux-2.4.21/drivers/md/dm-io.c Thu Jan 1 01:00:00 1970
1124+++ linux/drivers/md/dm-io.c Wed Aug 20 14:41:38 2003
1125@@ -0,0 +1,344 @@
1126+/*
1127+ * Copyright (C) 2003 Sistina Software
1128+ *
1129+ * This file is released under the GPL.
1130+ */
1131+
1132+#include "dm-io.h"
1133+
1134+#include <linux/mempool.h>
1135+#include <linux/module.h>
1136+#include <linux/slab.h>
1137+#include <linux/sched.h>
1138+
1139+/* FIXME: can we shrink this ? */
1140+struct io_context {
1141+ int rw;
1142+ unsigned int error;
1143+ atomic_t count;
1144+ struct task_struct *sleeper;
1145+ io_notify_fn callback;
1146+ void *context;
1147+};
1148+
1149+/*
1150+ * We maintain a pool of buffer heads for dispatching the io.
1151+ */
1152+static unsigned int _num_bhs;
1153+static mempool_t *_buffer_pool;
1154+
1155+/*
1156+ * io contexts are only dynamically allocated for asynchronous
1157+ * io. Since async io is likely to be the majority of io we'll
1158+ * have the same number of io contexts as buffer heads ! (FIXME:
1159+ * must reduce this).
1160+ */
1161+mempool_t *_io_pool;
1162+
1163+static void *alloc_bh(int gfp_mask, void *pool_data)
1164+{
1165+ struct buffer_head *bh;
1166+
1167+ bh = kmem_cache_alloc(bh_cachep, gfp_mask);
1168+ if (bh) {
1169+ bh->b_reqnext = NULL;
1170+ init_waitqueue_head(&bh->b_wait);
1171+ INIT_LIST_HEAD(&bh->b_inode_buffers);
1172+ }
1173+
1174+ return bh;
1175+}
1176+
1177+static void *alloc_io(int gfp_mask, void *pool_data)
1178+{
1179+ return kmalloc(sizeof(struct io_context), gfp_mask);
1180+}
1181+
1182+static void free_io(void *element, void *pool_data)
1183+{
1184+ kfree(element);
1185+}
1186+
1187+static unsigned int pages_to_buffers(unsigned int pages)
1188+{
1189+ return 4 * pages; /* too many ? */
1190+}
1191+
1192+static int resize_pool(unsigned int new_bhs)
1193+{
1194+ int r = 0;
1195+
1196+ if (_buffer_pool) {
1197+ if (new_bhs == 0) {
1198+ /* free off the pools */
1199+ mempool_destroy(_buffer_pool);
1200+ mempool_destroy(_io_pool);
1201+ _buffer_pool = _io_pool = NULL;
1202+ } else {
1203+ /* resize the pools */
1204+ r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
1205+ if (!r)
1206+ r = mempool_resize(_io_pool,
1207+ new_bhs, GFP_KERNEL);
1208+ }
1209+ } else {
1210+ /* create new pools */
1211+ _buffer_pool = mempool_create(new_bhs, alloc_bh,
1212+ mempool_free_slab, bh_cachep);
1213+ if (!_buffer_pool)
1214+ r = -ENOMEM;
1215+
1216+ _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
1217+ if (!_io_pool) {
1218+ mempool_destroy(_buffer_pool);
1219+ _buffer_pool = NULL;
1220+ r = -ENOMEM;
1221+ }
1222+ }
1223+
1224+ if (!r)
1225+ _num_bhs = new_bhs;
1226+
1227+ return r;
1228+}
1229+
1230+int dm_io_get(unsigned int num_pages)
1231+{
1232+ return resize_pool(_num_bhs + pages_to_buffers(num_pages));
1233+}
1234+
1235+void dm_io_put(unsigned int num_pages)
1236+{
1237+ resize_pool(_num_bhs - pages_to_buffers(num_pages));
1238+}
1239+
1240+/*-----------------------------------------------------------------
1241+ * We need to keep track of which region a buffer is doing io
1242+ * for. In order to save a memory allocation we store this in an
1243+ * unused field of the buffer head, and provide these access
1244+ * functions.
1245+ *
1246+ * FIXME: add compile time check that an unsigned int can fit
1247+ * into a pointer.
1248+ *
1249+ *---------------------------------------------------------------*/
1250+static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
1251+{
1252+ bh->b_journal_head = (void *) region;
1253+}
1254+
1255+static inline int bh_get_region(struct buffer_head *bh)
1256+{
1257+ return (unsigned int) bh->b_journal_head;
1258+}
1259+
1260+/*-----------------------------------------------------------------
1261+ * We need an io object to keep track of the number of bhs that
1262+ * have been dispatched for a particular io.
1263+ *---------------------------------------------------------------*/
1264+static void dec_count(struct io_context *io, unsigned int region, int error)
1265+{
1266+ if (error)
1267+ set_bit(region, &io->error);
1268+
1269+ if (atomic_dec_and_test(&io->count)) {
1270+ if (io->sleeper)
1271+ wake_up_process(io->sleeper);
1272+
1273+ else {
1274+ int r = io->error;
1275+ io_notify_fn fn = io->callback;
1276+ void *context = io->context;
1277+
1278+ mempool_free(io, _io_pool);
1279+ fn(r, context);
1280+ }
1281+ }
1282+}
1283+
1284+static void endio(struct buffer_head *bh, int uptodate)
1285+{
1286+ struct io_context *io = (struct io_context *) bh->b_private;
1287+
1288+ if (!uptodate && io->rw != WRITE) {
1289+ /*
1290+ * We need to zero this region, otherwise people
1291+ * like kcopyd may write the arbitrary contents
1292+ * of the page.
1293+ */
1294+ memset(bh->b_data, 0, bh->b_size);
1295+ }
1296+
1297+ dec_count((struct io_context *) bh->b_private,
1298+ bh_get_region(bh), !uptodate);
1299+ mempool_free(bh, _buffer_pool);
1300+}
1301+
1302+/*
1303+ * Primitives for alignment calculations.
1304+ */
1305+int fls(unsigned n)
1306+{
1307+ return generic_fls32(n);
1308+}
1309+
1310+static inline int log2_floor(unsigned n)
1311+{
1312+ return ffs(n) - 1;
1313+}
1314+
1315+static inline int log2_align(unsigned n)
1316+{
1317+ return fls(n) - 1;
1318+}
1319+
1320+/*
1321+ * Returns the next block for io.
1322+ */
1323+static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
1324+ unsigned int block_size,
1325+ struct page *p, unsigned int offset,
1326+ unsigned int region, struct io_context *io)
1327+{
1328+ struct buffer_head *bh;
1329+ sector_t b = *block;
1330+ sector_t blocks_per_page = PAGE_SIZE / block_size;
1331+ unsigned int this_size; /* holds the size of the current io */
1332+ unsigned int len;
1333+
1334+ while ((offset < PAGE_SIZE) && (b != end_block)) {
1335+ bh = mempool_alloc(_buffer_pool, GFP_NOIO);
1336+ init_buffer(bh, endio, io);
1337+ bh_set_region(bh, region);
1338+
1339+ /*
1340+ * Block size must be a power of 2 and aligned
1341+ * correctly.
1342+ */
1343+ len = end_block - b;
1344+ this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
1345+ if (this_size > len)
1346+ this_size = 1 << log2_align(len);
1347+
1348+ /*
1349+ * Add in the job offset.
1350+ */
1351+ bh->b_blocknr = (b / this_size);
1352+ bh->b_size = block_size * this_size;
1353+ set_bh_page(bh, p, offset);
1354+ bh->b_this_page = bh;
1355+
1356+ bh->b_dev = dev;
1357+ atomic_set(&bh->b_count, 1);
1358+
1359+ bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
1360+ (1 << BH_Lock));
1361+
1362+ if (io->rw == WRITE)
1363+ clear_bit(BH_Dirty, &bh->b_state);
1364+
1365+ atomic_inc(&io->count);
1366+ submit_bh(io->rw, bh);
1367+
1368+ b += this_size;
1369+ offset += block_size * this_size;
1370+ }
1371+
1372+ *block = b;
1373+ return (b == end_block);
1374+}
1375+
1376+static void do_region(unsigned int region, struct io_region *where,
1377+ struct page *page, unsigned int offset,
1378+ struct io_context *io)
1379+{
1380+ unsigned int block_size = get_hardsect_size(where->dev);
1381+ unsigned int sblock_size = block_size >> 9;
1382+ sector_t block = where->sector / sblock_size;
1383+ sector_t end_block = (where->sector + where->count) / sblock_size;
1384+
1385+ while (1) {
1386+ if (do_page(where->dev, &block, end_block, block_size,
1387+ page, offset, region, io))
1388+ break;
1389+
1390+ offset = 0; /* only offset the first page */
1391+
1392+ page = list_entry(page->list.next, struct page, list);
1393+ }
1394+}
1395+
1396+static void dispatch_io(unsigned int num_regions, struct io_region *where,
1397+ struct page *pages, unsigned int offset,
1398+ struct io_context *io)
1399+{
1400+ int i;
1401+
1402+ for (i = 0; i < num_regions; i++)
1403+ if (where[i].count)
1404+ do_region(i, where + i, pages, offset, io);
1405+
1406+ /*
1407+	 * Drop the extra reference that we were holding to avoid
1408+ * the io being completed too early.
1409+ */
1410+ dec_count(io, 0, 0);
1411+}
1412+
1413+/*
1414+ * Synchronous io
1415+ */
1416+int dm_io_sync(unsigned int num_regions, struct io_region *where,
1417+ int rw, struct page *pages, unsigned int offset,
1418+ unsigned int *error_bits)
1419+{
1420+ struct io_context io;
1421+
1422+ BUG_ON(num_regions > 1 && rw != WRITE);
1423+
1424+ io.rw = rw;
1425+ io.error = 0;
1426+ atomic_set(&io.count, 1); /* see dispatch_io() */
1427+ io.sleeper = current;
1428+
1429+ dispatch_io(num_regions, where, pages, offset, &io);
1430+ run_task_queue(&tq_disk);
1431+
1432+ while (1) {
1433+ set_current_state(TASK_UNINTERRUPTIBLE);
1434+
1435+ if (!atomic_read(&io.count))
1436+ break;
1437+
1438+ schedule();
1439+ }
1440+ set_current_state(TASK_RUNNING);
1441+
1442+ *error_bits = io.error;
1443+ return io.error ? -EIO : 0;
1444+}
1445+
1446+/*
1447+ * Asynchronous io
1448+ */
1449+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1450+ struct page *pages, unsigned int offset,
1451+ io_notify_fn fn, void *context)
1452+{
1453+ struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
1454+
1455+ io->rw = rw;
1456+ io->error = 0;
1457+ atomic_set(&io->count, 1); /* see dispatch_io() */
1458+ io->sleeper = NULL;
1459+ io->callback = fn;
1460+ io->context = context;
1461+
1462+ dispatch_io(num_regions, where, pages, offset, io);
1463+ return 0;
1464+}
1465+
1466+EXPORT_SYMBOL(dm_io_get);
1467+EXPORT_SYMBOL(dm_io_put);
1468+EXPORT_SYMBOL(dm_io_sync);
1469+EXPORT_SYMBOL(dm_io_async);
1470--- linux-2.4.21/drivers/md/dm-io.h Thu Jan 1 01:00:00 1970
1471+++ linux/drivers/md/dm-io.h Wed Aug 20 14:41:38 2003
1472@@ -0,0 +1,86 @@
1473+/*
1474+ * Copyright (C) 2003 Sistina Software
1475+ *
1476+ * This file is released under the GPL.
1477+ */
1478+
1479+#ifndef _DM_IO_H
1480+#define _DM_IO_H
1481+
1482+#include "dm.h"
1483+
1484+#include <linux/list.h>
1485+
1486+/* Move these to bitops.h eventually */
1487+/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
1488+/* (c) 2002, D.Phillips and Sistina Software */
1489+/* Licensed under Version 2 of the GPL */
1490+
1491+static unsigned generic_fls8(unsigned n)
1492+{
1493+ return n & 0xf0 ?
1494+ n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5:
1495+ n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
1496+}
1497+
1498+static inline unsigned generic_fls16(unsigned n)
1499+{
1500+ return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n);
1501+}
1502+
1503+static inline unsigned generic_fls32(unsigned n)
1504+{
1505+ return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
1506+}
1507+
1508+/* FIXME make this configurable */
1509+#define DM_MAX_IO_REGIONS 8
1510+
1511+struct io_region {
1512+ kdev_t dev;
1513+ sector_t sector;
1514+ sector_t count;
1515+};
1516+
1517+
1518+/*
1519+ * 'error' is a bitset, with each bit indicating whether an error
1520+ * occurred doing io to the corresponding region.
1521+ */
1522+typedef void (*io_notify_fn)(unsigned int error, void *context);
1523+
1524+
1525+/*
1526+ * Before anyone uses the IO interface they should call
1527+ * dm_io_get(), specifying roughly how many pages they are
1528+ * expecting to perform io on concurrently.
1529+ *
1530+ * This function may block.
1531+ */
1532+int dm_io_get(unsigned int num_pages);
1533+void dm_io_put(unsigned int num_pages);
1534+
1535+
1536+/*
1537+ * Synchronous IO.
1538+ *
1539+ * Please ensure that the rw flag in the next two functions is
1540+ * either READ or WRITE, ie. we don't take READA. Any
1541+ * regions with a zero count field will be ignored.
1542+ */
1543+int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
1544+ struct page *pages, unsigned int offset,
1545+ unsigned int *error_bits);
1546+
1547+
1548+/*
1549+ * Aynchronous IO.
1550+ *
1551+ * The 'where' array may be safely allocated on the stack since
1552+ * the function takes a copy.
1553+ */
1554+int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1555+ struct page *pages, unsigned int offset,
1556+ io_notify_fn fn, void *context);
1557+
1558+#endif
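[Editorial note -- not part of the patch.] A hedged sketch of how a client is expected to drive the interface declared above: reserve buffer heads with dm_io_get(), describe the target area with an io_region, then issue the transfer with dm_io_sync() (or dm_io_async() with a completion callback). Kernel context is assumed and the page-list setup is elided.

	#include "dm-io.h"

	static int read_chunk(kdev_t dev, sector_t start, sector_t nr_sectors,
			      struct page *pages)
	{
		struct io_region where;
		unsigned int error_bits;
		unsigned int nr_pages = nr_sectors / (PAGE_SIZE >> 9);
		int r;

		/* Tell dm-io roughly how much concurrent io to expect. */
		r = dm_io_get(nr_pages);
		if (r)
			return r;

		where.dev = dev;
		where.sector = start;
		where.count = nr_sectors;

		/* One region, synchronous READ into the chained page list. */
		r = dm_io_sync(1, &where, READ, pages, 0, &error_bits);

		dm_io_put(nr_pages);
		return r;	/* -EIO if any region's error bit was set */
	}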
1559--- linux-2.4.21/drivers/md/dm-ioctl.c Thu Jan 1 01:00:00 1970
1560+++ linux/drivers/md/dm-ioctl.c Wed Aug 20 14:41:38 2003
1561@@ -0,0 +1,1284 @@
1562+/*
1563+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
1564+ *
1565+ * This file is released under the GPL.
1566+ */
1567+
1568+#include "dm.h"
1569+
1570+#include <linux/module.h>
1571+#include <linux/vmalloc.h>
1572+#include <linux/miscdevice.h>
1573+#include <linux/dm-ioctl.h>
1574+#include <linux/init.h>
1575+#include <linux/wait.h>
1576+#include <linux/blk.h>
1577+#include <linux/slab.h>
1578+
1579+#include <asm/uaccess.h>
1580+
1581+#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
1582+
1583+/*-----------------------------------------------------------------
1584+ * The ioctl interface needs to be able to look up devices by
1585+ * name or uuid.
1586+ *---------------------------------------------------------------*/
1587+struct hash_cell {
1588+ struct list_head name_list;
1589+ struct list_head uuid_list;
1590+
1591+ char *name;
1592+ char *uuid;
1593+ struct mapped_device *md;
1594+ struct dm_table *new_map;
1595+
1596+ /* I hate devfs */
1597+ devfs_handle_t devfs_entry;
1598+};
1599+
1600+#define NUM_BUCKETS 64
1601+#define MASK_BUCKETS (NUM_BUCKETS - 1)
1602+static struct list_head _name_buckets[NUM_BUCKETS];
1603+static struct list_head _uuid_buckets[NUM_BUCKETS];
1604+
1605+static devfs_handle_t _dev_dir;
1606+void dm_hash_remove_all(void);
1607+
1608+/*
1609+ * Guards access to both hash tables.
1610+ */
1611+static DECLARE_RWSEM(_hash_lock);
1612+
1613+static void init_buckets(struct list_head *buckets)
1614+{
1615+ unsigned int i;
1616+
1617+ for (i = 0; i < NUM_BUCKETS; i++)
1618+ INIT_LIST_HEAD(buckets + i);
1619+}
1620+
1621+int dm_hash_init(void)
1622+{
1623+ init_buckets(_name_buckets);
1624+ init_buckets(_uuid_buckets);
1625+ _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
1626+ return 0;
1627+}
1628+
1629+void dm_hash_exit(void)
1630+{
1631+ dm_hash_remove_all();
1632+ devfs_unregister(_dev_dir);
1633+}
1634+
1635+/*-----------------------------------------------------------------
1636+ * Hash function:
1637+ * We're not really concerned with the str hash function being
1638+ * fast since it's only used by the ioctl interface.
1639+ *---------------------------------------------------------------*/
1640+static unsigned int hash_str(const char *str)
1641+{
1642+ const unsigned int hash_mult = 2654435387U;
1643+ unsigned int h = 0;
1644+
1645+ while (*str)
1646+ h = (h + (unsigned int) *str++) * hash_mult;
1647+
1648+ return h & MASK_BUCKETS;
1649+}
1650+
1651+/*-----------------------------------------------------------------
1652+ * Code for looking up a device by name
1653+ *---------------------------------------------------------------*/
1654+static struct hash_cell *__get_name_cell(const char *str)
1655+{
1656+ struct list_head *tmp;
1657+ struct hash_cell *hc;
1658+ unsigned int h = hash_str(str);
1659+
1660+ list_for_each (tmp, _name_buckets + h) {
1661+ hc = list_entry(tmp, struct hash_cell, name_list);
1662+ if (!strcmp(hc->name, str))
1663+ return hc;
1664+ }
1665+
1666+ return NULL;
1667+}
1668+
1669+static struct hash_cell *__get_uuid_cell(const char *str)
1670+{
1671+ struct list_head *tmp;
1672+ struct hash_cell *hc;
1673+ unsigned int h = hash_str(str);
1674+
1675+ list_for_each (tmp, _uuid_buckets + h) {
1676+ hc = list_entry(tmp, struct hash_cell, uuid_list);
1677+ if (!strcmp(hc->uuid, str))
1678+ return hc;
1679+ }
1680+
1681+ return NULL;
1682+}
1683+
1684+/*-----------------------------------------------------------------
1685+ * Inserting, removing and renaming a device.
1686+ *---------------------------------------------------------------*/
1687+static inline char *kstrdup(const char *str)
1688+{
1689+ char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
1690+ if (r)
1691+ strcpy(r, str);
1692+ return r;
1693+}
1694+
1695+static struct hash_cell *alloc_cell(const char *name, const char *uuid,
1696+ struct mapped_device *md)
1697+{
1698+ struct hash_cell *hc;
1699+
1700+ hc = kmalloc(sizeof(*hc), GFP_KERNEL);
1701+ if (!hc)
1702+ return NULL;
1703+
1704+ hc->name = kstrdup(name);
1705+ if (!hc->name) {
1706+ kfree(hc);
1707+ return NULL;
1708+ }
1709+
1710+ if (!uuid)
1711+ hc->uuid = NULL;
1712+
1713+ else {
1714+ hc->uuid = kstrdup(uuid);
1715+ if (!hc->uuid) {
1716+ kfree(hc->name);
1717+ kfree(hc);
1718+ return NULL;
1719+ }
1720+ }
1721+
1722+ INIT_LIST_HEAD(&hc->name_list);
1723+ INIT_LIST_HEAD(&hc->uuid_list);
1724+ hc->md = md;
1725+ hc->new_map = NULL;
1726+ return hc;
1727+}
1728+
1729+static void free_cell(struct hash_cell *hc)
1730+{
1731+ if (hc) {
1732+ kfree(hc->name);
1733+ kfree(hc->uuid);
1734+ kfree(hc);
1735+ }
1736+}
1737+
1738+/*
1739+ * devfs stuff.
1740+ */
1741+static int register_with_devfs(struct hash_cell *hc)
1742+{
1743+ kdev_t dev = dm_kdev(hc->md);
1744+
1745+ hc->devfs_entry =
1746+ devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
1747+ major(dev), minor(dev),
1748+ S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
1749+ &dm_blk_dops, NULL);
1750+
1751+ return 0;
1752+}
1753+
1754+static int unregister_with_devfs(struct hash_cell *hc)
1755+{
1756+ devfs_unregister(hc->devfs_entry);
1757+ return 0;
1758+}
1759+
1760+/*
1761+ * The kdev_t and uuid of a device can never change once it is
1762+ * initially inserted.
1763+ */
1764+int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
1765+{
1766+ struct hash_cell *cell;
1767+
1768+ /*
1769+ * Allocate the new cells.
1770+ */
1771+ cell = alloc_cell(name, uuid, md);
1772+ if (!cell)
1773+ return -ENOMEM;
1774+
1775+ /*
1776+ * Insert the cell into both hash tables.
1777+ */
1778+ down_write(&_hash_lock);
1779+ if (__get_name_cell(name))
1780+ goto bad;
1781+
1782+ list_add(&cell->name_list, _name_buckets + hash_str(name));
1783+
1784+ if (uuid) {
1785+ if (__get_uuid_cell(uuid)) {
1786+ list_del(&cell->name_list);
1787+ goto bad;
1788+ }
1789+ list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
1790+ }
1791+ register_with_devfs(cell);
1792+ dm_get(md);
1793+ up_write(&_hash_lock);
1794+
1795+ return 0;
1796+
1797+ bad:
1798+ up_write(&_hash_lock);
1799+ free_cell(cell);
1800+ return -EBUSY;
1801+}
1802+
1803+void __hash_remove(struct hash_cell *hc)
1804+{
1805+ /* remove from the dev hash */
1806+ list_del(&hc->uuid_list);
1807+ list_del(&hc->name_list);
1808+ unregister_with_devfs(hc);
1809+ dm_put(hc->md);
1810+ if (hc->new_map)
1811+ dm_table_put(hc->new_map);
1812+ free_cell(hc);
1813+}
1814+
1815+void dm_hash_remove_all(void)
1816+{
1817+ int i;
1818+ struct hash_cell *hc;
1819+ struct list_head *tmp, *n;
1820+
1821+ down_write(&_hash_lock);
1822+ for (i = 0; i < NUM_BUCKETS; i++) {
1823+ list_for_each_safe (tmp, n, _name_buckets + i) {
1824+ hc = list_entry(tmp, struct hash_cell, name_list);
1825+ __hash_remove(hc);
1826+ }
1827+ }
1828+ up_write(&_hash_lock);
1829+}
1830+
1831+int dm_hash_rename(const char *old, const char *new)
1832+{
1833+ char *new_name, *old_name;
1834+ struct hash_cell *hc;
1835+
1836+ /*
1837+ * duplicate new.
1838+ */
1839+ new_name = kstrdup(new);
1840+ if (!new_name)
1841+ return -ENOMEM;
1842+
1843+ down_write(&_hash_lock);
1844+
1845+ /*
1846+ * Is new free ?
1847+ */
1848+ hc = __get_name_cell(new);
1849+ if (hc) {
1850+ DMWARN("asked to rename to an already existing name %s -> %s",
1851+ old, new);
1852+ up_write(&_hash_lock);
1853+ kfree(new_name);
1854+ return -EBUSY;
1855+ }
1856+
1857+ /*
1858+ * Is there such a device as 'old' ?
1859+ */
1860+ hc = __get_name_cell(old);
1861+ if (!hc) {
1862+ DMWARN("asked to rename a non existent device %s -> %s",
1863+ old, new);
1864+ up_write(&_hash_lock);
1865+ kfree(new_name);
1866+ return -ENXIO;
1867+ }
1868+
1869+ /*
1870+ * rename and move the name cell.
1871+ */
1872+ list_del(&hc->name_list);
1873+ old_name = hc->name;
1874+ hc->name = new_name;
1875+ list_add(&hc->name_list, _name_buckets + hash_str(new_name));
1876+
1877+ /* rename the device node in devfs */
1878+ unregister_with_devfs(hc);
1879+ register_with_devfs(hc);
1880+
1881+ up_write(&_hash_lock);
1882+ kfree(old_name);
1883+ return 0;
1884+}
1885+
1886+/*-----------------------------------------------------------------
1887+ * Implementation of the ioctl commands
1888+ *---------------------------------------------------------------*/
1889+/*
1890+ * All the ioctl commands get dispatched to functions with this
1891+ * prototype.
1892+ */
1893+typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
1894+
1895+static int remove_all(struct dm_ioctl *param, size_t param_size)
1896+{
1897+ dm_hash_remove_all();
1898+ param->data_size = 0;
1899+ return 0;
1900+}
1901+
1902+/*
1903+ * Round up the ptr to an 8-byte boundary.
1904+ */
1905+#define ALIGN_MASK 7
1906+static inline void *align_ptr(void *ptr)
1907+{
1908+ return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
1909+}
1910+
1911+/*
1912+ * Retrieves the data payload buffer from an already allocated
1913+ * struct dm_ioctl.
1914+ */
1915+static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
1916+ size_t *len)
1917+{
1918+ param->data_start = align_ptr(param + 1) - (void *) param;
1919+
1920+ if (param->data_start < param_size)
1921+ *len = param_size - param->data_start;
1922+ else
1923+ *len = 0;
1924+
1925+ return ((void *) param) + param->data_start;
1926+}
1927+
1928+static int list_devices(struct dm_ioctl *param, size_t param_size)
1929+{
1930+ unsigned int i;
1931+ struct hash_cell *hc;
1932+ size_t len, needed = 0;
1933+ struct dm_name_list *nl, *old_nl = NULL;
1934+
1935+ down_write(&_hash_lock);
1936+
1937+ /*
1938+ * Loop through all the devices working out how much
1939+ * space we need.
1940+ */
1941+ for (i = 0; i < NUM_BUCKETS; i++) {
1942+ list_for_each_entry (hc, _name_buckets + i, name_list) {
1943+ needed += sizeof(struct dm_name_list);
1944+ needed += strlen(hc->name);
1945+ needed += ALIGN_MASK;
1946+ }
1947+ }
1948+
1949+ /*
1950+ * Grab our output buffer.
1951+ */
1952+ nl = get_result_buffer(param, param_size, &len);
1953+ if (len < needed) {
1954+ param->flags |= DM_BUFFER_FULL_FLAG;
1955+ goto out;
1956+ }
1957+ param->data_size = param->data_start + needed;
1958+
1959+ nl->dev = 0; /* Flags no data */
1960+
1961+ /*
1962+ * Now loop through filling out the names.
1963+ */
1964+ for (i = 0; i < NUM_BUCKETS; i++) {
1965+ list_for_each_entry (hc, _name_buckets + i, name_list) {
1966+ if (old_nl)
1967+ old_nl->next = (uint32_t) ((void *) nl -
1968+ (void *) old_nl);
1969+
1970+ nl->dev = dm_kdev(hc->md);
1971+ nl->next = 0;
1972+ strcpy(nl->name, hc->name);
1973+
1974+ old_nl = nl;
1975+ nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
1976+ }
1977+ }
1978+
1979+ out:
1980+ up_write(&_hash_lock);
1981+ return 0;
1982+}
1983+
1984+static int check_name(const char *name)
1985+{
1986+ if (strchr(name, '/')) {
1987+ DMWARN("invalid device name");
1988+ return -EINVAL;
1989+ }
1990+
1991+ return 0;
1992+}
1993+
1994+/*
1995+ * Fills in a dm_ioctl structure, ready for sending back to
1996+ * userland.
1997+ */
1998+static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
1999+{
2000+ kdev_t dev = dm_kdev(md);
2001+ struct dm_table *table;
2002+ struct block_device *bdev;
2003+
2004+ param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
2005+ DM_ACTIVE_PRESENT_FLAG);
2006+
2007+ if (dm_suspended(md))
2008+ param->flags |= DM_SUSPEND_FLAG;
2009+
2010+ param->dev = kdev_t_to_nr(dev);
2011+
2012+ if (is_read_only(dev))
2013+ param->flags |= DM_READONLY_FLAG;
2014+
2015+ param->event_nr = dm_get_event_nr(md);
2016+
2017+ table = dm_get_table(md);
2018+ if (table) {
2019+ param->flags |= DM_ACTIVE_PRESENT_FLAG;
2020+ param->target_count = dm_table_get_num_targets(table);
2021+ dm_table_put(table);
2022+ } else
2023+ param->target_count = 0;
2024+
2025+ bdev = bdget(param->dev);
2026+ if (!bdev)
2027+ return -ENXIO;
2028+ param->open_count = bdev->bd_openers;
2029+ bdput(bdev);
2030+
2031+ return 0;
2032+}
2033+
2034+static int dev_create(struct dm_ioctl *param, size_t param_size)
2035+{
2036+ int r;
2037+ kdev_t dev = 0;
2038+ struct mapped_device *md;
2039+
2040+ r = check_name(param->name);
2041+ if (r)
2042+ return r;
2043+
2044+ if (param->flags & DM_PERSISTENT_DEV_FLAG)
2045+ dev = to_kdev_t(param->dev);
2046+
2047+ r = dm_create(dev, &md);
2048+ if (r)
2049+ return r;
2050+
2051+ r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
2052+ if (r) {
2053+ dm_put(md);
2054+ return r;
2055+ }
2056+
2057+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2058+
2059+ r = __dev_status(md, param);
2060+ dm_put(md);
2061+
2062+ return r;
2063+}
2064+
2065+/*
2066+ * Always use UUID for lookups if it's present, otherwise use name.
2067+ */
2068+static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
2069+{
2070+ return *param->uuid ?
2071+ __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
2072+}
2073+
2074+static inline struct mapped_device *find_device(struct dm_ioctl *param)
2075+{
2076+ struct hash_cell *hc;
2077+ struct mapped_device *md = NULL;
2078+
2079+ down_read(&_hash_lock);
2080+ hc = __find_device_hash_cell(param);
2081+ if (hc) {
2082+ md = hc->md;
2083+
2084+ /*
2085+ * Sneakily write in both the name and the uuid
2086+ * while we have the cell.
2087+ */
2088+ strncpy(param->name, hc->name, sizeof(param->name));
2089+ if (hc->uuid)
2090+ strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
2091+ else
2092+ param->uuid[0] = '\0';
2093+
2094+ if (hc->new_map)
2095+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
2096+ else
2097+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2098+
2099+ dm_get(md);
2100+ }
2101+ up_read(&_hash_lock);
2102+
2103+ return md;
2104+}
2105+
2106+static int dev_remove(struct dm_ioctl *param, size_t param_size)
2107+{
2108+ struct hash_cell *hc;
2109+
2110+ down_write(&_hash_lock);
2111+ hc = __find_device_hash_cell(param);
2112+
2113+ if (!hc) {
2114+ DMWARN("device doesn't appear to be in the dev hash table.");
2115+ up_write(&_hash_lock);
2116+ return -ENXIO;
2117+ }
2118+
2119+ __hash_remove(hc);
2120+ up_write(&_hash_lock);
2121+ param->data_size = 0;
2122+ return 0;
2123+}
2124+
2125+/*
2126+ * Check a string doesn't overrun the chunk of
2127+ * memory we copied from userland.
2128+ */
2129+static int invalid_str(char *str, void *end)
2130+{
2131+ while ((void *) str < end)
2132+ if (!*str++)
2133+ return 0;
2134+
2135+ return -EINVAL;
2136+}
2137+
2138+static int dev_rename(struct dm_ioctl *param, size_t param_size)
2139+{
2140+ int r;
2141+ char *new_name = (char *) param + param->data_start;
2142+
2143+ if (new_name < (char *) (param + 1) ||
2144+ invalid_str(new_name, (void *) param + param_size)) {
2145+ DMWARN("Invalid new logical volume name supplied.");
2146+ return -EINVAL;
2147+ }
2148+
2149+ r = check_name(new_name);
2150+ if (r)
2151+ return r;
2152+
2153+ param->data_size = 0;
2154+ return dm_hash_rename(param->name, new_name);
2155+}
2156+
2157+static int do_suspend(struct dm_ioctl *param)
2158+{
2159+ int r = 0;
2160+ struct mapped_device *md;
2161+
2162+ md = find_device(param);
2163+ if (!md)
2164+ return -ENXIO;
2165+
2166+ if (!dm_suspended(md))
2167+ r = dm_suspend(md);
2168+
2169+ if (!r)
2170+ r = __dev_status(md, param);
2171+
2172+ dm_put(md);
2173+ return r;
2174+}
2175+
2176+static int do_resume(struct dm_ioctl *param)
2177+{
2178+ int r = 0;
2179+ struct hash_cell *hc;
2180+ struct mapped_device *md;
2181+ struct dm_table *new_map;
2182+
2183+ down_write(&_hash_lock);
2184+
2185+ hc = __find_device_hash_cell(param);
2186+ if (!hc) {
2187+ DMWARN("device doesn't appear to be in the dev hash table.");
2188+ up_write(&_hash_lock);
2189+ return -ENXIO;
2190+ }
2191+
2192+ md = hc->md;
2193+ dm_get(md);
2194+
2195+ new_map = hc->new_map;
2196+ hc->new_map = NULL;
2197+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2198+
2199+ up_write(&_hash_lock);
2200+
2201+ /* Do we need to load a new map ? */
2202+ if (new_map) {
2203+ /* Suspend if it isn't already suspended */
2204+ if (!dm_suspended(md))
2205+ dm_suspend(md);
2206+
2207+ r = dm_swap_table(md, new_map);
2208+ if (r) {
2209+ dm_put(md);
2210+ dm_table_put(new_map);
2211+ return r;
2212+ }
2213+
2214+ if (dm_table_get_mode(new_map) & FMODE_WRITE)
2215+ set_device_ro(dm_kdev(md), 0);
2216+ else
2217+ set_device_ro(dm_kdev(md), 1);
2218+
2219+ dm_table_put(new_map);
2220+ }
2221+
2222+ if (dm_suspended(md))
2223+ r = dm_resume(md);
2224+
2225+ if (!r)
2226+ r = __dev_status(md, param);
2227+
2228+ dm_put(md);
2229+ return r;
2230+}
2231+
2232+/*
2233+ * Set or unset the suspension state of a device.
2234+ * If the device already is in the requested state we just return its status.
2235+ */
2236+static int dev_suspend(struct dm_ioctl *param, size_t param_size)
2237+{
2238+ if (param->flags & DM_SUSPEND_FLAG)
2239+ return do_suspend(param);
2240+
2241+ return do_resume(param);
2242+}
2243+
2244+/*
2245+ * Copies device info back to user space, used by
2246+ * the create and info ioctls.
2247+ */
2248+static int dev_status(struct dm_ioctl *param, size_t param_size)
2249+{
2250+ int r;
2251+ struct mapped_device *md;
2252+
2253+ md = find_device(param);
2254+ if (!md)
2255+ return -ENXIO;
2256+
2257+ r = __dev_status(md, param);
2258+ dm_put(md);
2259+ return r;
2260+}
2261+
2262+static inline int get_mode(struct dm_ioctl *param)
2263+{
2264+ int mode = FMODE_READ | FMODE_WRITE;
2265+
2266+ if (param->flags & DM_READONLY_FLAG)
2267+ mode = FMODE_READ;
2268+
2269+ return mode;
2270+}
2271+
2272+static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
2273+ struct dm_target_spec **spec, char **target_params)
2274+{
2275+ *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
2276+ *target_params = (char *) (*spec + 1);
2277+
2278+ if (*spec < (last + 1))
2279+ return -EINVAL;
2280+
2281+ return invalid_str(*target_params, end);
2282+}
2283+
2284+static int populate_table(struct dm_table *table, struct dm_ioctl *param,
2285+ size_t param_size)
2286+{
2287+ int r;
2288+ unsigned int i = 0;
2289+ struct dm_target_spec *spec = (struct dm_target_spec *) param;
2290+ uint32_t next = param->data_start;
2291+ void *end = (void *) param + param_size;
2292+ char *target_params;
2293+
2294+ if (!param->target_count) {
2295+ DMWARN("populate_table: no targets specified");
2296+ return -EINVAL;
2297+ }
2298+
2299+ for (i = 0; i < param->target_count; i++) {
2300+
2301+ r = next_target(spec, next, end, &spec, &target_params);
2302+ if (r) {
2303+ DMWARN("unable to find target");
2304+ return r;
2305+ }
2306+
2307+ r = dm_table_add_target(table, spec->target_type,
2308+ (sector_t) spec->sector_start,
2309+ (sector_t) spec->length,
2310+ target_params);
2311+ if (r) {
2312+ DMWARN("error adding target to table");
2313+ return r;
2314+ }
2315+
2316+ next = spec->next;
2317+ }
2318+
2319+ return dm_table_complete(table);
2320+}
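For completeness, the inverse of next_target(): a hedged userspace sketch of packing the target specs that populate_table() walks. The first spec sits data_start bytes into the ioctl buffer, its NUL-terminated parameter string immediately follows it, and spec->next holds the byte offset from that spec to the next one. pack_target() is an illustrative helper only, not part of this patch or of libdevmapper, and the 8-byte rounding is just a safe alignment choice:

#include <stdint.h>
#include <string.h>
#include <linux/dm-ioctl.h>     /* struct dm_target_spec, added by this patch */

/* Append one spec plus its parameter string at 'offset' within the ioctl
 * buffer and return the offset at which the next spec should be packed. */
static size_t pack_target(char *buf, size_t offset, uint64_t start,
                          uint64_t len, const char *type, const char *params)
{
        struct dm_target_spec *spec = (struct dm_target_spec *) (buf + offset);
        size_t used;

        memset(spec, 0, sizeof(*spec));
        spec->sector_start = start;
        spec->length = len;
        strncpy(spec->target_type, type, sizeof(spec->target_type) - 1);
        strcpy((char *) (spec + 1), params);    /* params follow the spec */

        used = sizeof(*spec) + strlen(params) + 1;
        used = (used + 7) & ~(size_t) 7;        /* keep the next spec aligned */
        spec->next = used;                      /* what next_target() adds to 'last' */

        return offset + used;
}

A caller would pack param->target_count such specs, starting at offset param->data_start, before issuing DM_TABLE_LOAD.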
2321+
2322+static int table_load(struct dm_ioctl *param, size_t param_size)
2323+{
2324+ int r;
2325+ struct hash_cell *hc;
2326+ struct dm_table *t;
2327+
2328+ r = dm_table_create(&t, get_mode(param));
2329+ if (r)
2330+ return r;
2331+
2332+ r = populate_table(t, param, param_size);
2333+ if (r) {
2334+ dm_table_put(t);
2335+ return r;
2336+ }
2337+
2338+ down_write(&_hash_lock);
2339+ hc = __find_device_hash_cell(param);
2340+ if (!hc) {
2341+ DMWARN("device doesn't appear to be in the dev hash table.");
2342+ up_write(&_hash_lock);
2343+ return -ENXIO;
2344+ }
2345+
2346+ if (hc->new_map)
2347+ dm_table_put(hc->new_map);
2348+ hc->new_map = t;
2349+ param->flags |= DM_INACTIVE_PRESENT_FLAG;
2350+
2351+ r = __dev_status(hc->md, param);
2352+ up_write(&_hash_lock);
2353+ return r;
2354+}
2355+
2356+static int table_clear(struct dm_ioctl *param, size_t param_size)
2357+{
2358+ int r;
2359+ struct hash_cell *hc;
2360+
2361+ down_write(&_hash_lock);
2362+
2363+ hc = __find_device_hash_cell(param);
2364+ if (!hc) {
2365+ DMWARN("device doesn't appear to be in the dev hash table.");
2366+ up_write(&_hash_lock);
2367+ return -ENXIO;
2368+ }
2369+
2370+ if (hc->new_map) {
2371+ dm_table_put(hc->new_map);
2372+ hc->new_map = NULL;
2373+ }
2374+
2375+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2376+
2377+ r = __dev_status(hc->md, param);
2378+ up_write(&_hash_lock);
2379+ return r;
2380+}
2381+
2382+/*
2383+ * Retrieves a list of devices used by a particular dm device.
2384+ */
2385+static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
2386+ size_t param_size)
2387+{
2388+ unsigned int count = 0;
2389+ struct list_head *tmp;
2390+ size_t len, needed;
2391+ struct dm_target_deps *deps;
2392+
2393+ deps = get_result_buffer(param, param_size, &len);
2394+
2395+ /*
2396+ * Count the devices.
2397+ */
2398+ list_for_each(tmp, dm_table_get_devices(table))
2399+ count++;
2400+
2401+ /*
2402+ * Check we have enough space.
2403+ */
2404+ needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
2405+ if (len < needed) {
2406+ param->flags |= DM_BUFFER_FULL_FLAG;
2407+ return;
2408+ }
2409+
2410+ /*
2411+ * Fill in the devices.
2412+ */
2413+ deps->count = count;
2414+ count = 0;
2415+ list_for_each(tmp, dm_table_get_devices(table)) {
2416+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
2417+ deps->dev[count++] = dd->bdev->bd_dev;
2418+ }
2419+
2420+ param->data_size = param->data_start + needed;
2421+}
2422+
2423+static int table_deps(struct dm_ioctl *param, size_t param_size)
2424+{
2425+ int r;
2426+ struct mapped_device *md;
2427+ struct dm_table *table;
2428+
2429+ md = find_device(param);
2430+ if (!md)
2431+ return -ENXIO;
2432+
2433+ r = __dev_status(md, param);
2434+ if (r)
2435+ goto out;
2436+
2437+ table = dm_get_table(md);
2438+ if (table) {
2439+ retrieve_deps(table, param, param_size);
2440+ dm_table_put(table);
2441+ }
2442+
2443+ out:
2444+ dm_put(md);
2445+ return r;
2446+}
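On the way back to userspace, the buffer filled in by retrieve_deps() is simply a count followed by an array of device numbers. A hedged reader sketch, again reusing the declaration from the dm-ioctl.h added by this patch; print_deps() is an illustrative name:

#include <stdint.h>
#include <stdio.h>
#include <linux/dm-ioctl.h>     /* struct dm_target_deps, added by this patch */

/* outbuf/data_start are the buffer and offset returned by DM_TABLE_DEPS. */
static void print_deps(void *outbuf, uint32_t data_start)
{
        struct dm_target_deps *deps =
                (struct dm_target_deps *) ((char *) outbuf + data_start);
        uint32_t i;

        for (i = 0; i < deps->count; i++)
                printf("dependency %u: 0x%llx\n", i,
                       (unsigned long long) deps->dev[i]);
}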
2447+
2448+/*
2449+ * Build up the status struct for each target
2450+ */
2451+static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
2452+ size_t param_size)
2453+{
2454+ unsigned int i, num_targets;
2455+ struct dm_target_spec *spec;
2456+ char *outbuf, *outptr;
2457+ status_type_t type;
2458+ size_t remaining, len, used = 0;
2459+
2460+ outptr = outbuf = get_result_buffer(param, param_size, &len);
2461+
2462+ if (param->flags & DM_STATUS_TABLE_FLAG)
2463+ type = STATUSTYPE_TABLE;
2464+ else
2465+ type = STATUSTYPE_INFO;
2466+
2467+ /* Get all the target info */
2468+ num_targets = dm_table_get_num_targets(table);
2469+ for (i = 0; i < num_targets; i++) {
2470+ struct dm_target *ti = dm_table_get_target(table, i);
2471+
2472+ remaining = len - (outptr - outbuf);
2473+ if (remaining < sizeof(struct dm_target_spec)) {
2474+ param->flags |= DM_BUFFER_FULL_FLAG;
2475+ break;
2476+ }
2477+
2478+ spec = (struct dm_target_spec *) outptr;
2479+
2480+ spec->status = 0;
2481+ spec->sector_start = ti->begin;
2482+ spec->length = ti->len;
2483+ strncpy(spec->target_type, ti->type->name,
2484+ sizeof(spec->target_type));
2485+
2486+ outptr += sizeof(struct dm_target_spec);
2487+ remaining = len - (outptr - outbuf);
2488+
2489+ /* Get the status/table string from the target driver */
2490+ if (ti->type->status) {
2491+ if (ti->type->status(ti, type, outptr, remaining)) {
2492+ param->flags |= DM_BUFFER_FULL_FLAG;
2493+ break;
2494+ }
2495+ } else
2496+ outptr[0] = '\0';
2497+
2498+ outptr += strlen(outptr) + 1;
2499+ used = param->data_start + (outptr - outbuf);
2500+
2501+ align_ptr(outptr);
2502+ spec->next = outptr - outbuf;
2503+ }
2504+
2505+ if (used)
2506+ param->data_size = used;
2507+
2508+ param->target_count = num_targets;
2509+}
2510+
2511+/*
2512+ * Return the status of a device as a text string for each
2513+ * target.
2514+ */
2515+static int table_status(struct dm_ioctl *param, size_t param_size)
2516+{
2517+ int r;
2518+ struct mapped_device *md;
2519+ struct dm_table *table;
2520+
2521+ md = find_device(param);
2522+ if (!md)
2523+ return -ENXIO;
2524+
2525+ r = __dev_status(md, param);
2526+ if (r)
2527+ goto out;
2528+
2529+ table = dm_get_table(md);
2530+ if (table) {
2531+ retrieve_status(table, param, param_size);
2532+ dm_table_put(table);
2533+ }
2534+
2535+ out:
2536+ dm_put(md);
2537+ return r;
2538+}
2539+
2540+/*
2541+ * Wait for a device to report an event
2542+ */
2543+static int dev_wait(struct dm_ioctl *param, size_t param_size)
2544+{
2545+ int r;
2546+ struct mapped_device *md;
2547+ struct dm_table *table;
2548+ DECLARE_WAITQUEUE(wq, current);
2549+
2550+ md = find_device(param);
2551+ if (!md)
2552+ return -ENXIO;
2553+
2554+ /*
2555+ * Wait for a notification event
2556+ */
2557+ set_current_state(TASK_INTERRUPTIBLE);
2558+ if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
2559+ schedule();
2560+ dm_remove_wait_queue(md, &wq);
2561+ }
2562+ set_current_state(TASK_RUNNING);
2563+
2564+ /*
2565+ * The userland program is going to want to know what
2566+ * changed to trigger the event, so we may as well tell
2567+ * it and save an ioctl.
2568+ */
2569+ r = __dev_status(md, param);
2570+ if (r)
2571+ goto out;
2572+
2573+ table = dm_get_table(md);
2574+ if (table) {
2575+ retrieve_status(table, param, param_size);
2576+ dm_table_put(table);
2577+ }
2578+
2579+ out:
2580+ dm_put(md);
2581+ return r;
2582+}
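Seen from userspace, the event counter is what turns dev_wait() into a poll-free loop: pass back the last event_nr you saw and the ioctl only returns once dm_add_wait_queue() observes a newer event, with __dev_status() refreshing the counter before the copy out. A hedged sketch, with error handling and the optional status payload omitted; wait_for_event() is an illustrative name, not libdevmapper API:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

static int wait_for_event(int ctl_fd, const char *dm_name, uint32_t *last_event)
{
        struct dm_ioctl dmi;

        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;      /* checked by check_version() */
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);            /* required by copy_params() */
        dmi.event_nr = *last_event;             /* compared by dm_add_wait_queue() */
        strncpy(dmi.name, dm_name, DM_NAME_LEN - 1);

        if (ioctl(ctl_fd, DM_DEV_WAIT, &dmi) < 0)
                return -1;

        *last_event = dmi.event_nr;             /* refreshed by __dev_status() */
        return 0;
}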
2583+
2584+/*-----------------------------------------------------------------
2585+ * Implementation of open/close/ioctl on the special char
2586+ * device.
2587+ *---------------------------------------------------------------*/
2588+static ioctl_fn lookup_ioctl(unsigned int cmd)
2589+{
2590+ static struct {
2591+ int cmd;
2592+ ioctl_fn fn;
2593+ } _ioctls[] = {
2594+ {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
2595+ {DM_REMOVE_ALL_CMD, remove_all},
2596+ {DM_LIST_DEVICES_CMD, list_devices},
2597+
2598+ {DM_DEV_CREATE_CMD, dev_create},
2599+ {DM_DEV_REMOVE_CMD, dev_remove},
2600+ {DM_DEV_RENAME_CMD, dev_rename},
2601+ {DM_DEV_SUSPEND_CMD, dev_suspend},
2602+ {DM_DEV_STATUS_CMD, dev_status},
2603+ {DM_DEV_WAIT_CMD, dev_wait},
2604+
2605+ {DM_TABLE_LOAD_CMD, table_load},
2606+ {DM_TABLE_CLEAR_CMD, table_clear},
2607+ {DM_TABLE_DEPS_CMD, table_deps},
2608+ {DM_TABLE_STATUS_CMD, table_status}
2609+ };
2610+
2611+ return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
2612+}
2613+
2614+/*
2615+ * As well as checking the version compatibility, this always
2616+ * copies the kernel interface version out.
2617+ */
2618+static int check_version(unsigned int cmd, struct dm_ioctl *user)
2619+{
2620+ uint32_t version[3];
2621+ int r = 0;
2622+
2623+ if (copy_from_user(version, user->version, sizeof(version)))
2624+ return -EFAULT;
2625+
2626+ if ((DM_VERSION_MAJOR != version[0]) ||
2627+ (DM_VERSION_MINOR < version[1])) {
2628+ DMWARN("ioctl interface mismatch: "
2629+ "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
2630+ DM_VERSION_MAJOR, DM_VERSION_MINOR,
2631+ DM_VERSION_PATCHLEVEL,
2632+ version[0], version[1], version[2], cmd);
2633+ r = -EINVAL;
2634+ }
2635+
2636+ /*
2637+ * Fill in the kernel version.
2638+ */
2639+ version[0] = DM_VERSION_MAJOR;
2640+ version[1] = DM_VERSION_MINOR;
2641+ version[2] = DM_VERSION_PATCHLEVEL;
2642+ if (copy_to_user(user->version, version, sizeof(version)))
2643+ return -EFAULT;
2644+
2645+ return r;
2646+}
2647+
2648+static void free_params(struct dm_ioctl *param)
2649+{
2650+ vfree(param);
2651+}
2652+
2653+static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
2654+{
2655+ struct dm_ioctl tmp, *dmi;
2656+
2657+ if (copy_from_user(&tmp, user, sizeof(tmp)))
2658+ return -EFAULT;
2659+
2660+ if (tmp.data_size < sizeof(tmp))
2661+ return -EINVAL;
2662+
2663+ dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
2664+ if (!dmi)
2665+ return -ENOMEM;
2666+
2667+ if (copy_from_user(dmi, user, tmp.data_size)) {
2668+ vfree(dmi);
2669+ return -EFAULT;
2670+ }
2671+
2672+ *param = dmi;
2673+ return 0;
2674+}
2675+
2676+static int validate_params(uint cmd, struct dm_ioctl *param)
2677+{
2678+ /* Always clear this flag */
2679+ param->flags &= ~DM_BUFFER_FULL_FLAG;
2680+
2681+ /* Ignores parameters */
2682+ if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
2683+ return 0;
2684+
2685+ /* Unless creating, either name or uuid but not both */
2686+ if (cmd != DM_DEV_CREATE_CMD) {
2687+ if ((!*param->uuid && !*param->name) ||
2688+ (*param->uuid && *param->name)) {
2689+ DMWARN("one of name or uuid must be supplied, cmd(%u)",
2690+ cmd);
2691+ return -EINVAL;
2692+ }
2693+ }
2694+
2695+ /* Ensure strings are terminated */
2696+ param->name[DM_NAME_LEN - 1] = '\0';
2697+ param->uuid[DM_UUID_LEN - 1] = '\0';
2698+
2699+ return 0;
2700+}
2701+
2702+static int ctl_ioctl(struct inode *inode, struct file *file,
2703+ uint command, ulong u)
2704+{
2705+ int r = 0;
2706+ unsigned int cmd;
2707+ struct dm_ioctl *param;
2708+ struct dm_ioctl *user = (struct dm_ioctl *) u;
2709+ ioctl_fn fn = NULL;
2710+ size_t param_size;
2711+
2712+ /* only root can play with this */
2713+ if (!capable(CAP_SYS_ADMIN))
2714+ return -EACCES;
2715+
2716+ if (_IOC_TYPE(command) != DM_IOCTL)
2717+ return -ENOTTY;
2718+
2719+ cmd = _IOC_NR(command);
2720+
2721+ /*
2722+ * Check the interface version passed in. This also
2723+ * writes out the kernel's interface version.
2724+ */
2725+ r = check_version(cmd, user);
2726+ if (r)
2727+ return r;
2728+
2729+ /*
2730+ * Nothing more to do for the version command.
2731+ */
2732+ if (cmd == DM_VERSION_CMD)
2733+ return 0;
2734+
2735+ fn = lookup_ioctl(cmd);
2736+ if (!fn) {
2737+ DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
2738+ return -ENOTTY;
2739+ }
2740+
2741+ /*
2742+ * FIXME: I don't like this, we're trying to avoid low
2743+ * memory issues when a device is suspended.
2744+ */
2745+ current->flags |= PF_MEMALLOC;
2746+
2747+ /*
2748+ * Copy the parameters into kernel space.
2749+ */
2750+ r = copy_params(user, &param);
2751+ if (r) {
2752+ current->flags &= ~PF_MEMALLOC;
2753+ return r;
2754+ }
2755+
2756+ r = validate_params(cmd, param);
2757+ if (r)
2758+ goto out;
2759+
2760+ param_size = param->data_size;
2761+ param->data_size = sizeof(*param);
2762+ r = fn(param, param_size);
2763+
2764+ /*
2765+ * Copy the results back to userland.
2766+ */
2767+ if (!r && copy_to_user(user, param, param->data_size))
2768+ r = -EFAULT;
2769+
2770+ out:
2771+ free_params(param);
2772+ current->flags &= ~PF_MEMALLOC;
2773+ return r;
2774+}
2775+
2776+static struct file_operations _ctl_fops = {
2777+ .ioctl = ctl_ioctl,
2778+ .owner = THIS_MODULE,
2779+};
2780+
2781+static devfs_handle_t _ctl_handle;
2782+
2783+static struct miscdevice _dm_misc = {
2784+ .minor = MISC_DYNAMIC_MINOR,
2785+ .name = DM_NAME,
2786+ .fops = &_ctl_fops
2787+};
2788+
2789+/*
2790+ * Create misc character device and link to DM_DIR/control.
2791+ */
2792+int __init dm_interface_init(void)
2793+{
2794+ int r;
2795+ char rname[64];
2796+
2797+ r = dm_hash_init();
2798+ if (r)
2799+ return r;
2800+
2801+ r = misc_register(&_dm_misc);
2802+ if (r) {
2803+ DMERR("misc_register failed for control device");
2804+ dm_hash_exit();
2805+ return r;
2806+ }
2807+
2808+ r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
2809+ sizeof rname - 3);
2810+ if (r == -ENOSYS)
2811+ goto done; /* devfs not present */
2812+
2813+ if (r < 0) {
2814+ DMERR("devfs_generate_path failed for control device");
2815+ goto failed;
2816+ }
2817+
2818+ strncpy(rname + r, "../", 3);
2819+ r = devfs_mk_symlink(NULL, DM_DIR "/control",
2820+ DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
2821+ if (r) {
2822+ DMERR("devfs_mk_symlink failed for control device");
2823+ goto failed;
2824+ }
2825+ devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
2826+
2827+ done:
2828+ DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
2829+ DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
2830+ DM_DRIVER_EMAIL);
2831+ return 0;
2832+
2833+ failed:
2834+ misc_deregister(&_dm_misc);
2835+ dm_hash_exit();
2836+ return r;
2837+}
2838+
2839+void dm_interface_exit(void)
2840+{
2841+ if (misc_deregister(&_dm_misc) < 0)
2842+ DMERR("misc_deregister failed for control device");
2843+
2844+ dm_hash_exit();
2845+}
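To round off the ioctl file, a hedged sketch of the simplest possible userspace caller, DM_VERSION: the caller pre-fills version[], which check_version() validates and then overwrites with the kernel's numbers, and opens the control node, whose path is assumed here to be the conventional /dev/mapper/control created from the DM_DIR "/control" devfs link above:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl dmi;
        int fd = open("/dev/mapper/control", O_RDWR);   /* assumed node path */

        if (fd < 0)
                return 1;

        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;      /* other majors are rejected */
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);            /* copy_params() wants >= sizeof */

        if (ioctl(fd, DM_VERSION, &dmi) < 0) {
                close(fd);
                return 1;
        }

        /* check_version() always copies the kernel's version back out. */
        printf("kernel device-mapper ioctl interface %u.%u.%u\n",
               dmi.version[0], dmi.version[1], dmi.version[2]);
        close(fd);
        return 0;
}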
2846--- linux-2.4.21/drivers/md/dm-linear.c Thu Jan 1 01:00:00 1970
2847+++ linux/drivers/md/dm-linear.c Wed Aug 20 14:41:38 2003
2848@@ -0,0 +1,123 @@
2849+/*
2850+ * Copyright (C) 2001 Sistina Software (UK) Limited.
2851+ *
2852+ * This file is released under the GPL.
2853+ */
2854+
2855+#include "dm.h"
2856+
2857+#include <linux/module.h>
2858+#include <linux/init.h>
2859+#include <linux/blkdev.h>
2860+#include <linux/slab.h>
2861+
2862+/*
2863+ * Linear: maps a linear range of a device.
2864+ */
2865+struct linear_c {
2866+ struct dm_dev *dev;
2867+ sector_t start;
2868+};
2869+
2870+/*
2871+ * Construct a linear mapping: <dev_path> <offset>
2872+ */
2873+static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2874+{
2875+ struct linear_c *lc;
2876+
2877+ if (argc != 2) {
2878+ ti->error = "dm-linear: Invalid argument count";
2879+ return -EINVAL;
2880+ }
2881+
2882+ lc = kmalloc(sizeof(*lc), GFP_KERNEL);
2883+ if (lc == NULL) {
2884+ ti->error = "dm-linear: Cannot allocate linear context";
2885+ return -ENOMEM;
2886+ }
2887+
2888+ if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
2889+ ti->error = "dm-linear: Invalid device sector";
2890+ goto bad;
2891+ }
2892+
2893+ if (dm_get_device(ti, argv[0], lc->start, ti->len,
2894+ dm_table_get_mode(ti->table), &lc->dev)) {
2895+ ti->error = "dm-linear: Device lookup failed";
2896+ goto bad;
2897+ }
2898+
2899+ ti->private = lc;
2900+ return 0;
2901+
2902+ bad:
2903+ kfree(lc);
2904+ return -EINVAL;
2905+}
2906+
2907+static void linear_dtr(struct dm_target *ti)
2908+{
2909+ struct linear_c *lc = (struct linear_c *) ti->private;
2910+
2911+ dm_put_device(ti, lc->dev);
2912+ kfree(lc);
2913+}
2914+
2915+static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
2916+ union map_info *map_context)
2917+{
2918+ struct linear_c *lc = (struct linear_c *) ti->private;
2919+
2920+ bh->b_rdev = lc->dev->dev;
2921+ bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
2922+
2923+ return 1;
2924+}
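To make the two constructor arguments concrete: a table line of the form "<start> <length> linear <dev_path> <offset>", for instance (device name hypothetical)

        0 2048 linear /dev/hda5 384

arrives at linear_ctr(), after the table code has split the parameter string, as argc == 2 with argv[0] = "/dev/hda5" and argv[1] = "384"; linear_map() then remaps sector s of the mapped device to sector 384 + (s - ti->begin) of /dev/hda5.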
2925+
2926+static int linear_status(struct dm_target *ti, status_type_t type,
2927+ char *result, unsigned int maxlen)
2928+{
2929+ struct linear_c *lc = (struct linear_c *) ti->private;
2930+ kdev_t kdev;
2931+
2932+ switch (type) {
2933+ case STATUSTYPE_INFO:
2934+ result[0] = '\0';
2935+ break;
2936+
2937+ case STATUSTYPE_TABLE:
2938+ kdev = to_kdev_t(lc->dev->bdev->bd_dev);
2939+ snprintf(result, maxlen, "%s " SECTOR_FORMAT,
2940+ dm_kdevname(kdev), lc->start);
2941+ break;
2942+ }
2943+ return 0;
2944+}
2945+
2946+static struct target_type linear_target = {
2947+ .name = "linear",
2948+ .module = THIS_MODULE,
2949+ .ctr = linear_ctr,
2950+ .dtr = linear_dtr,
2951+ .map = linear_map,
2952+ .status = linear_status,
2953+};
2954+
2955+int __init dm_linear_init(void)
2956+{
2957+ int r = dm_register_target(&linear_target);
2958+
2959+ if (r < 0)
2960+ DMERR("linear: register failed %d", r);
2961+
2962+ return r;
2963+}
2964+
2965+void dm_linear_exit(void)
2966+{
2967+ int r = dm_unregister_target(&linear_target);
2968+
2969+ if (r < 0)
2970+ DMERR("linear: unregister failed %d", r);
2971+}
2972--- linux-2.4.21/drivers/md/dm-log.c Thu Jan 1 01:00:00 1970
2973+++ linux/drivers/md/dm-log.c Wed Aug 20 14:41:38 2003
2974@@ -0,0 +1,302 @@
2975+/*
2976+ * Copyright (C) 2003 Sistina Software
2977+ *
2978+ * This file is released under the LGPL.
2979+ */
2980+
2981+#include <linux/init.h>
2982+#include <linux/slab.h>
2983+#include <linux/module.h>
2984+#include <linux/vmalloc.h>
2985+
2986+#include "dm-log.h"
2987+#include "dm-io.h"
2988+
2989+static LIST_HEAD(_log_types);
2990+static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
2991+
2992+int dm_register_dirty_log_type(struct dirty_log_type *type)
2993+{
2994+ spin_lock(&_lock);
2995+ type->use_count = 0;
2996+ if (type->module)
2997+ __MOD_INC_USE_COUNT(type->module);
2998+
2999+ list_add(&type->list, &_log_types);
3000+ spin_unlock(&_lock);
3001+
3002+ return 0;
3003+}
3004+
3005+int dm_unregister_dirty_log_type(struct dirty_log_type *type)
3006+{
3007+ spin_lock(&_lock);
3008+
3009+ if (type->use_count)
3010+ DMWARN("Attempt to unregister a log type that is still in use");
3011+ else {
3012+ list_del(&type->list);
3013+ if (type->module)
3014+ __MOD_DEC_USE_COUNT(type->module);
3015+ }
3016+
3017+ spin_unlock(&_lock);
3018+
3019+ return 0;
3020+}
3021+
3022+static struct dirty_log_type *get_type(const char *type_name)
3023+{
3024+ struct dirty_log_type *type;
3025+ struct list_head *tmp;
3026+
3027+ spin_lock(&_lock);
3028+ list_for_each (tmp, &_log_types) {
3029+ type = list_entry(tmp, struct dirty_log_type, list);
3030+ if (!strcmp(type_name, type->name)) {
3031+ type->use_count++;
3032+ spin_unlock(&_lock);
3033+ return type;
3034+ }
3035+ }
3036+
3037+ spin_unlock(&_lock);
3038+ return NULL;
3039+}
3040+
3041+static void put_type(struct dirty_log_type *type)
3042+{
3043+ spin_lock(&_lock);
3044+ type->use_count--;
3045+ spin_unlock(&_lock);
3046+}
3047+
3048+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3049+ unsigned int argc, char **argv)
3050+{
3051+ struct dirty_log_type *type;
3052+ struct dirty_log *log;
3053+
3054+ log = kmalloc(sizeof(*log), GFP_KERNEL);
3055+ if (!log)
3056+ return NULL;
3057+
3058+ type = get_type(type_name);
3059+ if (!type) {
3060+ kfree(log);
3061+ return NULL;
3062+ }
3063+
3064+ log->type = type;
3065+ if (type->ctr(log, dev_size, argc, argv)) {
3066+ kfree(log);
3067+ put_type(type);
3068+ return NULL;
3069+ }
3070+
3071+ return log;
3072+}
3073+
3074+void dm_destroy_dirty_log(struct dirty_log *log)
3075+{
3076+ log->type->dtr(log);
3077+ put_type(log->type);
3078+ kfree(log);
3079+}
3080+
3081+
3082+/*-----------------------------------------------------------------
3083+ * In-core log, i.e. trivial, non-persistent
3084+ *
3085+ * For now we'll keep this simple and just have 2 bitsets, one
3086+ * for clean/dirty, the other for sync/nosync. The sync bitset
3087+ * will be freed when everything is in sync.
3088+ *
3089+ * FIXME: problems with a 64bit sector_t
3090+ *---------------------------------------------------------------*/
3091+struct core_log {
3092+ sector_t region_size;
3093+ unsigned int region_count;
3094+ unsigned long *clean_bits;
3095+ unsigned long *sync_bits;
3096+ unsigned long *recovering_bits; /* FIXME: this seems excessive */
3097+
3098+ int sync_search;
3099+};
3100+
3101+static int core_ctr(struct dirty_log *log, sector_t dev_size,
3102+ unsigned int argc, char **argv)
3103+{
3104+ struct core_log *clog;
3105+ sector_t region_size;
3106+ unsigned int region_count;
3107+ size_t bitset_size;
3108+
3109+ if (argc != 1) {
3110+ DMWARN("wrong number of arguments to core_log");
3111+ return -EINVAL;
3112+ }
3113+
3114+ if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
3115+ DMWARN("invalid region size string");
3116+ return -EINVAL;
3117+ }
3118+
3119+ region_count = dm_div_up(dev_size, region_size);
3120+
3121+ clog = kmalloc(sizeof(*clog), GFP_KERNEL);
3122+ if (!clog) {
3123+ DMWARN("couldn't allocate core log");
3124+ return -ENOMEM;
3125+ }
3126+
3127+ clog->region_size = region_size;
3128+ clog->region_count = region_count;
3129+
3130+ bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
3131+ clog->clean_bits = vmalloc(bitset_size);
3132+ if (!clog->clean_bits) {
3133+ DMWARN("couldn't allocate clean bitset");
3134+ kfree(clog);
3135+ return -ENOMEM;
3136+ }
3137+ memset(clog->clean_bits, -1, bitset_size);
3138+
3139+ clog->sync_bits = vmalloc(bitset_size);
3140+ if (!clog->sync_bits) {
3141+ DMWARN("couldn't allocate sync bitset");
3142+ vfree(clog->clean_bits);
3143+ kfree(clog);
3144+ return -ENOMEM;
3145+ }
3146+ memset(clog->sync_bits, 0, bitset_size);
3147+
3148+ clog->recovering_bits = vmalloc(bitset_size);
3149+ if (!clog->recovering_bits) {
3150+ DMWARN("couldn't allocate sync bitset");
3151+ vfree(clog->sync_bits);
3152+ vfree(clog->clean_bits);
3153+ kfree(clog);
3154+ return -ENOMEM;
3155+ }
3156+ memset(clog->recovering_bits, 0, bitset_size);
3157+ clog->sync_search = 0;
3158+ log->context = clog;
3159+ return 0;
3160+}
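A quick worked example of the sizing arithmetic in core_ctr(): for a 2097152-sector (1 GiB) device with a 1024-sector region size, region_count = dm_div_up(2097152, 1024) = 2048 and bitset_size = dm_round_up(2048 >> 3, sizeof(unsigned long)) = 256 bytes, so the three bitsets (clean, sync, recovering) together occupy well under a kilobyte of vmalloc'd memory.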
3161+
3162+static void core_dtr(struct dirty_log *log)
3163+{
3164+ struct core_log *clog = (struct core_log *) log->context;
3165+ vfree(clog->clean_bits);
3166+ vfree(clog->sync_bits);
3167+ vfree(clog->recovering_bits);
3168+ kfree(clog);
3169+}
3170+
3171+static sector_t core_get_region_size(struct dirty_log *log)
3172+{
3173+ struct core_log *clog = (struct core_log *) log->context;
3174+ return clog->region_size;
3175+}
3176+
3177+static int core_is_clean(struct dirty_log *log, region_t region)
3178+{
3179+ struct core_log *clog = (struct core_log *) log->context;
3180+ return test_bit(region, clog->clean_bits);
3181+}
3182+
3183+static int core_in_sync(struct dirty_log *log, region_t region, int block)
3184+{
3185+ struct core_log *clog = (struct core_log *) log->context;
3186+
3187+ return test_bit(region, clog->sync_bits) ? 1 : 0;
3188+}
3189+
3190+static int core_flush(struct dirty_log *log)
3191+{
3192+ /* no op */
3193+ return 0;
3194+}
3195+
3196+static void core_mark_region(struct dirty_log *log, region_t region)
3197+{
3198+ struct core_log *clog = (struct core_log *) log->context;
3199+ clear_bit(region, clog->clean_bits);
3200+}
3201+
3202+static void core_clear_region(struct dirty_log *log, region_t region)
3203+{
3204+ struct core_log *clog = (struct core_log *) log->context;
3205+ set_bit(region, clog->clean_bits);
3206+}
3207+
3208+static int core_get_resync_work(struct dirty_log *log, region_t *region)
3209+{
3210+ struct core_log *clog = (struct core_log *) log->context;
3211+
3212+ if (clog->sync_search >= clog->region_count)
3213+ return 0;
3214+
3215+ do {
3216+ *region = find_next_zero_bit(clog->sync_bits,
3217+ clog->region_count,
3218+ clog->sync_search);
3219+ clog->sync_search = *region + 1;
3220+
3221+ if (*region == clog->region_count)
3222+ return 0;
3223+
3224+ } while (test_bit(*region, clog->recovering_bits));
3225+
3226+ set_bit(*region, clog->recovering_bits);
3227+ return 1;
3228+}
3229+
3230+static void core_complete_resync_work(struct dirty_log *log, region_t region,
3231+ int success)
3232+{
3233+ struct core_log *clog = (struct core_log *) log->context;
3234+
3235+ clear_bit(region, clog->recovering_bits);
3236+ if (success)
3237+ set_bit(region, clog->sync_bits);
3238+}
3239+
3240+static struct dirty_log_type _core_type = {
3241+ .name = "core",
3242+
3243+ .ctr = core_ctr,
3244+ .dtr = core_dtr,
3245+ .get_region_size = core_get_region_size,
3246+ .is_clean = core_is_clean,
3247+ .in_sync = core_in_sync,
3248+ .flush = core_flush,
3249+ .mark_region = core_mark_region,
3250+ .clear_region = core_clear_region,
3251+ .get_resync_work = core_get_resync_work,
3252+ .complete_resync_work = core_complete_resync_work
3253+};
3254+
3255+__init int dm_dirty_log_init(void)
3256+{
3257+ int r;
3258+
3259+ r = dm_register_dirty_log_type(&_core_type);
3260+ if (r)
3261+ DMWARN("couldn't register core log");
3262+
3263+ return r;
3264+}
3265+
3266+void dm_dirty_log_exit(void)
3267+{
3268+ dm_unregister_dirty_log_type(&_core_type);
3269+}
3270+
3271+EXPORT_SYMBOL(dm_register_dirty_log_type);
3272+EXPORT_SYMBOL(dm_unregister_dirty_log_type);
3273+EXPORT_SYMBOL(dm_dirty_log_init);
3274+EXPORT_SYMBOL(dm_dirty_log_exit);
3275+EXPORT_SYMBOL(dm_create_dirty_log);
3276+EXPORT_SYMBOL(dm_destroy_dirty_log);
3277--- linux-2.4.21/drivers/md/dm-log.h Thu Jan 1 01:00:00 1970
3278+++ linux/drivers/md/dm-log.h Wed Aug 20 14:41:38 2003
3279@@ -0,0 +1,112 @@
3280+/*
3281+ * Copyright (C) 2003 Sistina Software
3282+ *
3283+ * This file is released under the LGPL.
3284+ */
3285+
3286+#ifndef DM_DIRTY_LOG
3287+#define DM_DIRTY_LOG
3288+
3289+#include "dm.h"
3290+
3291+typedef sector_t region_t;
3292+
3293+struct dirty_log_type;
3294+
3295+struct dirty_log {
3296+ struct dirty_log_type *type;
3297+ void *context;
3298+};
3299+
3300+struct dirty_log_type {
3301+ struct list_head list;
3302+ const char *name;
3303+ struct module *module;
3304+ unsigned int use_count;
3305+
3306+ int (*ctr)(struct dirty_log *log, sector_t dev_size,
3307+ unsigned int argc, char **argv);
3308+ void (*dtr)(struct dirty_log *log);
3309+
3310+ /*
3311+ * Retrieves the smallest size of region that the log can
3312+ * deal with.
3313+ */
3314+ sector_t (*get_region_size)(struct dirty_log *log);
3315+
3316+ /*
3317+ * A predicate to say whether a region is clean or not.
3318+ * May block.
3319+ */
3320+ int (*is_clean)(struct dirty_log *log, region_t region);
3321+
3322+ /*
3323+ * Returns: 0, 1, -EWOULDBLOCK, < 0
3324+ *
3325+ * A predicate function to check whether the area given by
3326+ * [sector, sector + len) is in sync.
3327+ *
3328+ * If -EWOULDBLOCK is returned the state of the region is
3329+ * unknown, typically this will result in a read being
3330+ * passed to a daemon to deal with, since a daemon is
3331+ * allowed to block.
3332+ */
3333+ int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
3334+
3335+ /*
3336+ * Flush the current log state (eg, to disk). This
3337+ * function may block.
3338+ */
3339+ int (*flush)(struct dirty_log *log);
3340+
3341+ /*
3342+ * Mark an area as clean or dirty. These functions may
3343+ * block, though for performance reasons blocking should
3344+ * be extremely rare (eg, allocating another chunk of
3345+ * memory for some reason).
3346+ */
3347+ void (*mark_region)(struct dirty_log *log, region_t region);
3348+ void (*clear_region)(struct dirty_log *log, region_t region);
3349+
3350+ /*
3351+ * Returns: <0 (error), 0 (no region), 1 (region)
3352+ *
3353+ * The mirrord will need to perform recovery on regions of
3354+ * the mirror that are in the NOSYNC state. This
3355+ * function asks the log to tell the caller about the
3356+ * next region that this machine should recover.
3357+ *
3358+ * Do not confuse this function with 'in_sync()': one
3359+ * tells you if an area is synchronised, the other
3360+ * assigns recovery work.
3361+ */
3362+ int (*get_resync_work)(struct dirty_log *log, region_t *region);
3363+
3364+ /*
3365+ * This notifies the log that the resync of an area has
3366+ * been completed. The log should then mark this region
3367+ * as CLEAN.
3368+ */
3369+ void (*complete_resync_work)(struct dirty_log *log,
3370+ region_t region, int success);
3371+};
3372+
3373+int dm_register_dirty_log_type(struct dirty_log_type *type);
3374+int dm_unregister_dirty_log_type(struct dirty_log_type *type);
3375+
3376+
3377+/*
3378+ * Make sure you use these two functions, rather than calling
3379+ * type->constructor/destructor() directly.
3380+ */
3381+struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3382+ unsigned int argc, char **argv);
3383+void dm_destroy_dirty_log(struct dirty_log *log);
3384+
3385+/*
3386+ * init/exit functions.
3387+ */
3388+int dm_dirty_log_init(void);
3389+void dm_dirty_log_exit(void);
3390+
3391+#endif
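To make the registration API above concrete, a hedged skeleton of an additional log type built against this header; "nulllog" and its stub behaviour (everything reported clean and in sync, no resync work) are made up for illustration, and .module is left unset exactly as the in-tree "core" type does:

#include <linux/module.h>
#include "dm-log.h"

static int null_ctr(struct dirty_log *log, sector_t dev_size,
                    unsigned int argc, char **argv)
{
        log->context = NULL;
        return 0;
}

static void null_dtr(struct dirty_log *log) {}
static sector_t null_region_size(struct dirty_log *log) { return 8; }
static int null_is_clean(struct dirty_log *log, region_t region) { return 1; }
static int null_in_sync(struct dirty_log *log, region_t region, int block) { return 1; }
static int null_flush(struct dirty_log *log) { return 0; }
static void null_mark(struct dirty_log *log, region_t region) {}
static void null_clear(struct dirty_log *log, region_t region) {}
static int null_resync_work(struct dirty_log *log, region_t *region) { return 0; }
static void null_complete(struct dirty_log *log, region_t region, int success) {}

static struct dirty_log_type _null_type = {
        .name = "nulllog",
        .ctr = null_ctr,
        .dtr = null_dtr,
        .get_region_size = null_region_size,
        .is_clean = null_is_clean,
        .in_sync = null_in_sync,
        .flush = null_flush,
        .mark_region = null_mark,
        .clear_region = null_clear,
        .get_resync_work = null_resync_work,
        .complete_resync_work = null_complete,
};

static int __init null_log_init(void)
{
        return dm_register_dirty_log_type(&_null_type);
}

static void __exit null_log_exit(void)
{
        dm_unregister_dirty_log_type(&_null_type);
}

module_init(null_log_init);
module_exit(null_log_exit);
MODULE_LICENSE("GPL");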
3392--- linux-2.4.21/drivers/md/dm-raid1.c Thu Jan 1 01:00:00 1970
3393+++ linux/drivers/md/dm-raid1.c Wed Aug 20 14:41:38 2003
3394@@ -0,0 +1,1297 @@
3395+/*
3396+ * Copyright (C) 2003 Sistina Software Limited.
3397+ *
3398+ * This file is released under the GPL.
3399+ */
3400+
3401+#include "dm.h"
3402+#include "dm-daemon.h"
3403+#include "dm-io.h"
3404+#include "dm-log.h"
3405+#include "kcopyd.h"
3406+
3407+#include <linux/ctype.h>
3408+#include <linux/init.h>
3409+#include <linux/mempool.h>
3410+#include <linux/module.h>
3411+#include <linux/pagemap.h>
3412+#include <linux/slab.h>
3413+#include <linux/time.h>
3414+#include <linux/vmalloc.h>
3415+
3416+static struct dm_daemon _kmirrord;
3417+
3418+/*-----------------------------------------------------------------
3419+ * buffer lists:
3420+ *
3421+ * We play with singly linked lists of buffers, but we want to be
3422+ * careful to add new buffers to the back of the list, to avoid
3423+ * buffers being starved of attention.
3424+ *---------------------------------------------------------------*/
3425+struct buffer_list {
3426+ struct buffer_head *head;
3427+ struct buffer_head *tail;
3428+};
3429+
3430+static inline void buffer_list_init(struct buffer_list *bl)
3431+{
3432+ bl->head = bl->tail = NULL;
3433+}
3434+
3435+static inline void buffer_list_add(struct buffer_list *bl,
3436+ struct buffer_head *bh)
3437+{
3438+ bh->b_reqnext = NULL;
3439+
3440+ if (bl->tail) {
3441+ bl->tail->b_reqnext = bh;
3442+ bl->tail = bh;
3443+ } else
3444+ bl->head = bl->tail = bh;
3445+}
3446+
3447+static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
3448+{
3449+ struct buffer_head *bh = bl->head;
3450+
3451+ if (bh) {
3452+ bl->head = bl->head->b_reqnext;
3453+ if (!bl->head)
3454+ bl->tail = NULL;
3455+
3456+ bh->b_reqnext = NULL;
3457+ }
3458+
3459+ return bh;
3460+}
3461+
3462+/*-----------------------------------------------------------------
3463+ * Region hash
3464+ *
3465+ * The mirror splits itself up into discrete regions. Each
3466+ * region can be in one of three states: clean, dirty,
3467+ * nosync. There is no need to put clean regions in the hash.
3468+ *
3469+ * In addition to being present in the hash table a region _may_
3470+ * be present on one of three lists.
3471+ *
3472+ * clean_regions: Regions on this list have no io pending to
3473+ * them, they are in sync, we are no longer interested in them,
3474+ * they are dull. rh_update_states() will remove them from the
3475+ * hash table.
3476+ *
3477+ * quiesced_regions: These regions have been spun down, ready
3478+ * for recovery. rh_recovery_start() will remove regions from
3479+ * this list and hand them to kmirrord, which will schedule the
3480+ * recovery io with kcopyd.
3481+ *
3482+ * recovered_regions: Regions that kcopyd has successfully
3483+ * recovered. rh_update_states() will now schedule any delayed
3484+ * io, up the recovery_count, and remove the region from the
3485+ * hash.
3486+ *
3487+ * There are 2 locks:
3488+ * A rw spin lock 'hash_lock' protects just the hash table,
3489+ * this is never held in write mode from interrupt context,
3490+ * which I believe means that we only have to disable irqs when
3491+ * doing a write lock.
3492+ *
3493+ * An ordinary spin lock 'region_lock' that protects the three
3494+ * lists in the region_hash, with the 'state', 'list' and
3495+ * 'bhs_delayed' fields of the regions. This is used from irq
3496+ * context, so all other uses will have to suspend local irqs.
3497+ *---------------------------------------------------------------*/
3498+struct mirror_set;
3499+struct region_hash {
3500+ struct mirror_set *ms;
3501+ sector_t region_size;
3502+
3503+ /* holds persistent region state */
3504+ struct dirty_log *log;
3505+
3506+ /* hash table */
3507+ rwlock_t hash_lock;
3508+ mempool_t *region_pool;
3509+ unsigned int mask;
3510+ unsigned int nr_buckets;
3511+ struct list_head *buckets;
3512+
3513+ spinlock_t region_lock;
3514+ struct semaphore recovery_count;
3515+ struct list_head clean_regions;
3516+ struct list_head quiesced_regions;
3517+ struct list_head recovered_regions;
3518+};
3519+
3520+enum {
3521+ RH_CLEAN,
3522+ RH_DIRTY,
3523+ RH_NOSYNC,
3524+ RH_RECOVERING
3525+};
3526+
3527+struct region {
3528+ struct region_hash *rh; /* FIXME: can we get rid of this ? */
3529+ region_t key;
3530+ int state;
3531+
3532+ struct list_head hash_list;
3533+ struct list_head list;
3534+
3535+ atomic_t pending;
3536+ struct buffer_head *delayed_bhs;
3537+};
3538+
3539+/*
3540+ * Conversion fns
3541+ */
3542+static inline region_t bh_to_region(struct region_hash *rh,
3543+ struct buffer_head *bh)
3544+{
3545+ return bh->b_rsector / rh->region_size;
3546+}
3547+
3548+static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
3549+{
3550+ return region * rh->region_size;
3551+}
3552+
3553+/* FIXME move this */
3554+static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
3555+
3556+static void *region_alloc(int gfp_mask, void *pool_data)
3557+{
3558+ return kmalloc(sizeof(struct region), gfp_mask);
3559+}
3560+
3561+static void region_free(void *element, void *pool_data)
3562+{
3563+ kfree(element);
3564+}
3565+
3566+#define MIN_REGIONS 64
3567+#define MAX_RECOVERY 1
3568+static int rh_init(struct region_hash *rh, struct mirror_set *ms,
3569+ struct dirty_log *log, sector_t region_size,
3570+ region_t nr_regions)
3571+{
3572+ unsigned int nr_buckets, max_buckets;
3573+ size_t i;
3574+
3575+ /*
3576+ * Calculate a suitable number of buckets for our hash
3577+ * table.
3578+ */
3579+ max_buckets = nr_regions >> 6;
3580+ for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
3581+ ;
3582+ nr_buckets >>= 1;
3583+
3584+ rh->ms = ms;
3585+ rh->log = log;
3586+ rh->region_size = region_size;
3587+ rwlock_init(&rh->hash_lock);
3588+ rh->mask = nr_buckets - 1;
3589+ rh->nr_buckets = nr_buckets;
3590+
3591+ rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
3592+ if (!rh->buckets) {
3593+ DMERR("unable to allocate region hash memory");
3594+ return -ENOMEM;
3595+ }
3596+
3597+ for (i = 0; i < nr_buckets; i++)
3598+ INIT_LIST_HEAD(rh->buckets + i);
3599+
3600+ spin_lock_init(&rh->region_lock);
3601+ sema_init(&rh->recovery_count, 0);
3602+ INIT_LIST_HEAD(&rh->clean_regions);
3603+ INIT_LIST_HEAD(&rh->quiesced_regions);
3604+ INIT_LIST_HEAD(&rh->recovered_regions);
3605+
3606+ rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
3607+ region_free, NULL);
3608+ if (!rh->region_pool) {
3609+ vfree(rh->buckets);
3610+ rh->buckets = NULL;
3611+ return -ENOMEM;
3612+ }
3613+
3614+ return 0;
3615+}
3616+
3617+static void rh_exit(struct region_hash *rh)
3618+{
3619+ unsigned int h;
3620+ struct region *reg;
3621+ struct list_head *tmp, *tmp2;
3622+
3623+ BUG_ON(!list_empty(&rh->quiesced_regions));
3624+ for (h = 0; h < rh->nr_buckets; h++) {
3625+ list_for_each_safe (tmp, tmp2, rh->buckets + h) {
3626+ reg = list_entry(tmp, struct region, hash_list);
3627+ BUG_ON(atomic_read(&reg->pending));
3628+ mempool_free(reg, rh->region_pool);
3629+ }
3630+ }
3631+
3632+ if (rh->log)
3633+ dm_destroy_dirty_log(rh->log);
3634+ if (rh->region_pool)
3635+ mempool_destroy(rh->region_pool);
3636+ vfree(rh->buckets);
3637+}
3638+
3639+#define RH_HASH_MULT 2654435387U
3640+
3641+static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
3642+{
3643+ return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
3644+}
3645+
3646+static struct region *__rh_lookup(struct region_hash *rh, region_t region)
3647+{
3648+ struct region *reg;
3649+
3650+ list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
3651+ if (reg->key == region)
3652+ return reg;
3653+
3654+ return NULL;
3655+}
3656+
3657+static void __rh_insert(struct region_hash *rh, struct region *reg)
3658+{
3659+ unsigned int h = rh_hash(rh, reg->key);
3660+ list_add(&reg->hash_list, rh->buckets + h);
3661+}
3662+
3663+static struct region *__rh_alloc(struct region_hash *rh, region_t region)
3664+{
3665+ struct region *reg, *nreg;
3666+
3667+ read_unlock(&rh->hash_lock);
3668+ nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
3669+ nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
3670+ RH_CLEAN : RH_NOSYNC;
3671+ nreg->rh = rh;
3672+ nreg->key = region;
3673+
3674+ INIT_LIST_HEAD(&nreg->list);
3675+
3676+ atomic_set(&nreg->pending, 0);
3677+ nreg->delayed_bhs = NULL;
3678+ write_lock_irq(&rh->hash_lock);
3679+
3680+ reg = __rh_lookup(rh, region);
3681+ if (reg)
3682+ /* we lost the race */
3683+ mempool_free(nreg, rh->region_pool);
3684+
3685+ else {
3686+ __rh_insert(rh, nreg);
3687+ if (nreg->state == RH_CLEAN) {
3688+ spin_lock_irq(&rh->region_lock);
3689+ list_add(&nreg->list, &rh->clean_regions);
3690+ spin_unlock_irq(&rh->region_lock);
3691+ }
3692+ reg = nreg;
3693+ }
3694+ write_unlock_irq(&rh->hash_lock);
3695+ read_lock(&rh->hash_lock);
3696+
3697+ return reg;
3698+}
3699+
3700+static inline struct region *__rh_find(struct region_hash *rh, region_t region)
3701+{
3702+ struct region *reg;
3703+
3704+ reg = __rh_lookup(rh, region);
3705+ if (!reg)
3706+ reg = __rh_alloc(rh, region);
3707+
3708+ return reg;
3709+}
3710+
3711+static int rh_state(struct region_hash *rh, region_t region, int may_block)
3712+{
3713+ int r;
3714+ struct region *reg;
3715+
3716+ read_lock(&rh->hash_lock);
3717+ reg = __rh_lookup(rh, region);
3718+ read_unlock(&rh->hash_lock);
3719+
3720+ if (reg)
3721+ return reg->state;
3722+
3723+ /*
3724+ * The region wasn't in the hash, so we fall back to the
3725+ * dirty log.
3726+ */
3727+ r = rh->log->type->in_sync(rh->log, region, may_block);
3728+
3729+ /*
3730+ * Any error from the dirty log (eg. -EWOULDBLOCK) gets
3731+ * taken as a RH_NOSYNC
3732+ */
3733+ return r == 1 ? RH_CLEAN : RH_NOSYNC;
3734+}
3735+
3736+static inline int rh_in_sync(struct region_hash *rh,
3737+ region_t region, int may_block)
3738+{
3739+ int state = rh_state(rh, region, may_block);
3740+ return state == RH_CLEAN || state == RH_DIRTY;
3741+}
3742+
3743+static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
3744+{
3745+ struct buffer_head *nbh;
3746+
3747+ while (bh) {
3748+ nbh = bh->b_reqnext;
3749+ queue_bh(ms, bh, WRITE);
3750+ bh = nbh;
3751+ }
3752+}
3753+
3754+static void rh_update_states(struct region_hash *rh)
3755+{
3756+ struct list_head *tmp, *tmp2;
3757+ struct region *reg;
3758+
3759+ LIST_HEAD(clean);
3760+ LIST_HEAD(recovered);
3761+
3762+ /*
3763+ * Quickly grab the lists.
3764+ */
3765+ write_lock_irq(&rh->hash_lock);
3766+ spin_lock(&rh->region_lock);
3767+ if (!list_empty(&rh->clean_regions)) {
3768+ list_splice(&rh->clean_regions, &clean);
3769+ INIT_LIST_HEAD(&rh->clean_regions);
3770+
3771+ list_for_each_entry (reg, &clean, list) {
3772+ rh->log->type->clear_region(rh->log, reg->key);
3773+ list_del(&reg->hash_list);
3774+ }
3775+ }
3776+
3777+ if (!list_empty(&rh->recovered_regions)) {
3778+ list_splice(&rh->recovered_regions, &recovered);
3779+ INIT_LIST_HEAD(&rh->recovered_regions);
3780+
3781+ list_for_each_entry (reg, &recovered, list)
3782+ list_del(&reg->hash_list);
3783+ }
3784+ spin_unlock(&rh->region_lock);
3785+ write_unlock_irq(&rh->hash_lock);
3786+
3787+ /*
3788+ * All the regions on the recovered and clean lists have
3789+ * now been pulled out of the system, so no need to do
3790+ * any more locking.
3791+ */
3792+ list_for_each_safe (tmp, tmp2, &recovered) {
3793+ reg = list_entry(tmp, struct region, list);
3794+
3795+ rh->log->type->complete_resync_work(rh->log, reg->key, 1);
3796+ dispatch_buffers(rh->ms, reg->delayed_bhs);
3797+ up(&rh->recovery_count);
3798+ mempool_free(reg, rh->region_pool);
3799+ }
3800+
3801+ list_for_each_safe (tmp, tmp2, &clean) {
3802+ reg = list_entry(tmp, struct region, list);
3803+ mempool_free(reg, rh->region_pool);
3804+ }
3805+}
3806+
3807+static void rh_inc(struct region_hash *rh, region_t region)
3808+{
3809+ struct region *reg;
3810+
3811+ read_lock(&rh->hash_lock);
3812+ reg = __rh_find(rh, region);
3813+ if (reg->state == RH_CLEAN) {
3814+ rh->log->type->mark_region(rh->log, reg->key);
3815+
3816+ spin_lock_irq(&rh->region_lock);
3817+ reg->state = RH_DIRTY;
3818+ list_del_init(&reg->list); /* take off the clean list */
3819+ spin_unlock_irq(&rh->region_lock);
3820+ }
3821+
3822+ atomic_inc(&reg->pending);
3823+ read_unlock(&rh->hash_lock);
3824+}
3825+
3826+static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
3827+{
3828+ struct buffer_head *bh;
3829+
3830+ for (bh = buffers->head; bh; bh = bh->b_reqnext)
3831+ rh_inc(rh, bh_to_region(rh, bh));
3832+}
3833+
3834+static void rh_dec(struct region_hash *rh, region_t region)
3835+{
3836+ unsigned long flags;
3837+ struct region *reg;
3838+ int wake = 0;
3839+
3840+ read_lock(&rh->hash_lock);
3841+ reg = __rh_lookup(rh, region);
3842+ read_unlock(&rh->hash_lock);
3843+
3844+ if (atomic_dec_and_test(&reg->pending)) {
3845+ spin_lock_irqsave(&rh->region_lock, flags);
3846+ if (reg->state == RH_RECOVERING) {
3847+ list_add_tail(&reg->list, &rh->quiesced_regions);
3848+ } else {
3849+ reg->state = RH_CLEAN;
3850+ list_add(&reg->list, &rh->clean_regions);
3851+ }
3852+ spin_unlock_irqrestore(&rh->region_lock, flags);
3853+ wake = 1;
3854+ }
3855+
3856+ if (wake)
3857+ dm_daemon_wake(&_kmirrord);
3858+}
3859+
3860+/*
3861+ * Starts quiescing a region in preparation for recovery.
3862+ */
3863+static int __rh_recovery_prepare(struct region_hash *rh)
3864+{
3865+ int r;
3866+ struct region *reg;
3867+ region_t region;
3868+
3869+ /*
3870+ * Ask the dirty log what's next.
3871+ */
3872+ r = rh->log->type->get_resync_work(rh->log, &region);
3873+ if (r <= 0)
3874+ return r;
3875+
3876+ /*
3877+ * Get this region, and start it quiescing by setting the
3878+ * recovering flag.
3879+ */
3880+ read_lock(&rh->hash_lock);
3881+ reg = __rh_find(rh, region);
3882+ read_unlock(&rh->hash_lock);
3883+
3884+ spin_lock_irq(&rh->region_lock);
3885+ reg->state = RH_RECOVERING;
3886+
3887+ /* Already quiesced ? */
3888+ if (atomic_read(&reg->pending))
3889+ list_del_init(&reg->list);
3890+
3891+ else {
3892+ list_del_init(&reg->list);
3893+ list_add(&reg->list, &rh->quiesced_regions);
3894+ }
3895+ spin_unlock_irq(&rh->region_lock);
3896+
3897+ return 1;
3898+}
3899+
3900+static void rh_recovery_prepare(struct region_hash *rh)
3901+{
3902+ while (!down_trylock(&rh->recovery_count))
3903+ if (__rh_recovery_prepare(rh) <= 0) {
3904+ up(&rh->recovery_count);
3905+ break;
3906+ }
3907+}
3908+
3909+/*
3910+ * Returns any quiesced regions.
3911+ */
3912+static struct region *rh_recovery_start(struct region_hash *rh)
3913+{
3914+ struct region *reg = NULL;
3915+
3916+ spin_lock_irq(&rh->region_lock);
3917+ if (!list_empty(&rh->quiesced_regions)) {
3918+ reg = list_entry(rh->quiesced_regions.next,
3919+ struct region, list);
3920+ list_del_init(&reg->list); /* remove from the quiesced list */
3921+ }
3922+ spin_unlock_irq(&rh->region_lock);
3923+
3924+ return reg;
3925+}
3926+
3927+/* FIXME: success ignored for now */
3928+static void rh_recovery_end(struct region *reg, int success)
3929+{
3930+ struct region_hash *rh = reg->rh;
3931+
3932+ spin_lock_irq(&rh->region_lock);
3933+ list_add(&reg->list, &reg->rh->recovered_regions);
3934+ spin_unlock_irq(&rh->region_lock);
3935+
3936+ dm_daemon_wake(&_kmirrord);
3937+}
3938+
3939+static void rh_flush(struct region_hash *rh)
3940+{
3941+ rh->log->type->flush(rh->log);
3942+}
3943+
3944+static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
3945+{
3946+ struct region *reg;
3947+
3948+ read_lock(&rh->hash_lock);
3949+ reg = __rh_find(rh, bh_to_region(rh, bh));
3950+ bh->b_reqnext = reg->delayed_bhs;
3951+ reg->delayed_bhs = bh;
3952+ read_unlock(&rh->hash_lock);
3953+}
3954+
3955+static void rh_stop_recovery(struct region_hash *rh)
3956+{
3957+ int i;
3958+
3959+ /* wait for any recovering regions */
3960+ for (i = 0; i < MAX_RECOVERY; i++)
3961+ down(&rh->recovery_count);
3962+}
3963+
3964+static void rh_start_recovery(struct region_hash *rh)
3965+{
3966+ int i;
3967+
3968+ for (i = 0; i < MAX_RECOVERY; i++)
3969+ up(&rh->recovery_count);
3970+
3971+ dm_daemon_wake(&_kmirrord);
3972+}
3973+
3974+/*-----------------------------------------------------------------
3975+ * Mirror set structures.
3976+ *---------------------------------------------------------------*/
3977+struct mirror {
3978+ atomic_t error_count;
3979+ struct dm_dev *dev;
3980+ sector_t offset;
3981+};
3982+
3983+struct mirror_set {
3984+ struct dm_target *ti;
3985+ struct list_head list;
3986+ struct region_hash rh;
3987+ struct kcopyd_client *kcopyd_client;
3988+
3989+ spinlock_t lock; /* protects the next two lists */
3990+ struct buffer_list reads;
3991+ struct buffer_list writes;
3992+
3993+ /* recovery */
3994+ region_t nr_regions;
3995+ region_t sync_count;
3996+
3997+ unsigned int nr_mirrors;
3998+ struct mirror mirror[0];
3999+};
4000+
4001+/*
4002+ * Every mirror should look like this one.
4003+ */
4004+#define DEFAULT_MIRROR 0
4005+
4006+/*
4007+ * This is yucky. We squirrel the mirror_set struct away inside
4008+ * b_reqnext for write buffers. This is safe since the bh
4009+ * doesn't get submitted to the lower levels of block layer.
4010+ */
4011+static struct mirror_set *bh_get_ms(struct buffer_head *bh)
4012+{
4013+ return (struct mirror_set *) bh->b_reqnext;
4014+}
4015+
4016+static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
4017+{
4018+ bh->b_reqnext = (struct buffer_head *) ms;
4019+}
4020+
4021+/*-----------------------------------------------------------------
4022+ * Recovery.
4023+ *
4024+ * When a mirror is first activated we may find that some regions
4025+ * are in the no-sync state. We have to recover these by
4026+ * recopying from the default mirror to all the others.
4027+ *---------------------------------------------------------------*/
4028+static void recovery_complete(int read_err, unsigned int write_err,
4029+ void *context)
4030+{
4031+ struct region *reg = (struct region *) context;
4032+ struct mirror_set *ms = reg->rh->ms;
4033+
4034+ /* FIXME: better error handling */
4035+ rh_recovery_end(reg, read_err || write_err);
4036+ if (++ms->sync_count == ms->nr_regions)
4037+ /* the sync is complete */
4038+ dm_table_event(ms->ti->table);
4039+}
4040+
4041+static int recover(struct mirror_set *ms, struct region *reg)
4042+{
4043+ int r;
4044+ unsigned int i;
4045+ struct io_region from, to[ms->nr_mirrors - 1], *dest;
4046+ struct mirror *m;
4047+ unsigned int flags = 0;
4048+
4049+ /* fill in the source */
4050+ m = ms->mirror + DEFAULT_MIRROR;
4051+ from.dev = m->dev->dev;
4052+ from.sector = m->offset + region_to_sector(reg->rh, reg->key);
4053+ if (reg->key == (ms->nr_regions - 1)) {
4054+ /*
4055+ * The final region may be smaller than
4056+ * region_size.
4057+ */
4058+ from.count = ms->ti->len & (reg->rh->region_size - 1);
4059+ if (!from.count)
4060+ from.count = reg->rh->region_size;
4061+ } else
4062+ from.count = reg->rh->region_size;
4063+
4064+ /* fill in the destinations */
4065+ for (i = 1; i < ms->nr_mirrors; i++) {
4066+ m = ms->mirror + i;
4067+ dest = to + (i - 1);
4068+
4069+ dest->dev = m->dev->dev;
4070+ dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
4071+ dest->count = from.count;
4072+ }
4073+
4074+ /* hand to kcopyd */
4075+ set_bit(KCOPYD_IGNORE_ERROR, &flags);
4076+ r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
4077+ recovery_complete, reg);
4078+
4079+ return r;
4080+}
4081+
4082+static void do_recovery(struct mirror_set *ms)
4083+{
4084+ int r;
4085+ struct region *reg;
4086+
4087+ /*
4088+ * Start quiescing some regions.
4089+ */
4090+ rh_recovery_prepare(&ms->rh);
4091+
4092+ /*
4093+ * Copy any already quiesced regions.
4094+ */
4095+ while ((reg = rh_recovery_start(&ms->rh))) {
4096+ r = recover(ms, reg);
4097+ if (r)
4098+ rh_recovery_end(reg, 0);
4099+ }
4100+}
4101+
4102+/*-----------------------------------------------------------------
4103+ * Reads
4104+ *---------------------------------------------------------------*/
4105+static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
4106+{
4107+ /* FIXME: add read balancing */
4108+ return ms->mirror + DEFAULT_MIRROR;
4109+}
4110+
4111+/*
4112+ * remap a buffer to a particular mirror.
4113+ */
4114+static void map_buffer(struct mirror_set *ms,
4115+ struct mirror *m, struct buffer_head *bh)
4116+{
4117+ sector_t bsize = bh->b_size >> 9;
4118+ sector_t rsector = bh->b_blocknr * bsize;
4119+
4120+ bh->b_rdev = m->dev->dev;
4121+ bh->b_rsector = m->offset + (rsector - ms->ti->begin);
4122+}
4123+
4124+static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
4125+{
4126+ region_t region;
4127+ struct buffer_head *bh;
4128+ struct mirror *m;
4129+
4130+ while ((bh = buffer_list_pop(reads))) {
4131+ region = bh_to_region(&ms->rh, bh);
4132+
4133+ /*
4134+ * We can only read balance if the region is in sync.
4135+ */
4136+ if (rh_in_sync(&ms->rh, region, 0))
4137+ m = choose_mirror(ms, bh->b_rsector);
4138+ else
4139+ m = ms->mirror + DEFAULT_MIRROR;
4140+
4141+ map_buffer(ms, m, bh);
4142+ generic_make_request(READ, bh);
4143+ }
4144+}
4145+
4146+/*-----------------------------------------------------------------
4147+ * Writes.
4148+ *
4149+ * We do different things with the write io depending on the
4150+ * state of the region that it's in:
4151+ *
4152+ * SYNC: increment pending, use dm-io to write to *all* mirrors
4153+ * RECOVERING: delay the io until recovery completes
4154+ * NOSYNC: increment pending, just write to the default mirror
4155+ *---------------------------------------------------------------*/
4156+static void write_callback(unsigned int error, void *context)
4157+{
4158+ unsigned int i;
4159+ int uptodate = 1;
4160+ struct buffer_head *bh = (struct buffer_head *) context;
4161+ struct mirror_set *ms;
4162+
4163+ ms = bh_get_ms(bh);
4164+ bh_set_ms(bh, NULL);
4165+
4166+ /*
4167+ * NOTE: We don't decrement the pending count here,
4168+ * instead it is done by the target's endio function.
4169+ * This way we handle both writes to SYNC and NOSYNC
4170+ * regions with the same code.
4171+ */
4172+
4173+ if (error) {
4174+ /*
4175+ * only error the io if all mirrors failed.
4176+ * FIXME: bogus
4177+ */
4178+ uptodate = 0;
4179+ for (i = 0; i < ms->nr_mirrors; i++)
4180+ if (!test_bit(i, &error)) {
4181+ uptodate = 1;
4182+ break;
4183+ }
4184+ }
4185+ bh->b_end_io(bh, uptodate);
4186+}
4187+
4188+static void do_write(struct mirror_set *ms, struct buffer_head *bh)
4189+{
4190+ unsigned int i;
4191+ struct io_region io[ms->nr_mirrors];
4192+ struct mirror *m;
4193+
4194+ for (i = 0; i < ms->nr_mirrors; i++) {
4195+ m = ms->mirror + i;
4196+
4197+ io[i].dev = m->dev->dev;
4198+ io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin);
4199+ io[i].count = bh->b_size >> 9;
4200+ }
4201+
4202+ bh_set_ms(bh, ms);
4203+ dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page,
4204+ (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh);
4205+}
4206+
4207+static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
4208+{
4209+ int state;
4210+ struct buffer_head *bh;
4211+ struct buffer_list sync, nosync, recover, *this_list = NULL;
4212+
4213+ if (!writes->head)
4214+ return;
4215+
4216+ /*
4217+ * Classify each write.
4218+ */
4219+ buffer_list_init(&sync);
4220+ buffer_list_init(&nosync);
4221+ buffer_list_init(&recover);
4222+
4223+ while ((bh = buffer_list_pop(writes))) {
4224+ state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1);
4225+ switch (state) {
4226+ case RH_CLEAN:
4227+ case RH_DIRTY:
4228+ this_list = &sync;
4229+ break;
4230+
4231+ case RH_NOSYNC:
4232+ this_list = &nosync;
4233+ break;
4234+
4235+ case RH_RECOVERING:
4236+ this_list = &recover;
4237+ break;
4238+ }
4239+
4240+ buffer_list_add(this_list, bh);
4241+ }
4242+
4243+ /*
4244+ * Increment the pending counts for any regions that will
4245+ * be written to (writes to recover regions are going to
4246+ * be delayed).
4247+ */
4248+ rh_inc_pending(&ms->rh, &sync);
4249+ rh_inc_pending(&ms->rh, &nosync);
4250+ rh_flush(&ms->rh);
4251+
4252+ /*
4253+ * Dispatch io.
4254+ */
4255+ while ((bh = buffer_list_pop(&sync)))
4256+ do_write(ms, bh);
4257+
4258+ while ((bh = buffer_list_pop(&recover)))
4259+ rh_delay(&ms->rh, bh);
4260+
4261+ while ((bh = buffer_list_pop(&nosync))) {
4262+ map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh);
4263+ generic_make_request(WRITE, bh);
4264+ }
4265+}
4266+
4267+/*-----------------------------------------------------------------
4268+ * kmirrord
4269+ *---------------------------------------------------------------*/
4270+static LIST_HEAD(_mirror_sets);
4271+static DECLARE_RWSEM(_mirror_sets_lock);
4272+
4273+static void do_mirror(struct mirror_set *ms)
4274+{
4275+ struct buffer_list reads, writes;
4276+
4277+ spin_lock(&ms->lock);
4278+ memcpy(&reads, &ms->reads, sizeof(reads));
4279+ buffer_list_init(&ms->reads);
4280+ memcpy(&writes, &ms->writes, sizeof(writes));
4281+ buffer_list_init(&ms->writes);
4282+ spin_unlock(&ms->lock);
4283+
4284+ rh_update_states(&ms->rh);
4285+ do_recovery(ms);
4286+ do_reads(ms, &reads);
4287+ do_writes(ms, &writes);
4288+ run_task_queue(&tq_disk);
4289+}
4290+
4291+static void do_work(void)
4292+{
4293+ struct mirror_set *ms;
4294+
4295+ down_read(&_mirror_sets_lock);
4296+ list_for_each_entry (ms, &_mirror_sets, list)
4297+ do_mirror(ms);
4298+ up_read(&_mirror_sets_lock);
4299+}
4300+
4301+/*-----------------------------------------------------------------
4302+ * Target functions
4303+ *---------------------------------------------------------------*/
4304+static struct mirror_set *alloc_context(unsigned int nr_mirrors,
4305+ sector_t region_size,
4306+ struct dm_target *ti,
4307+ struct dirty_log *dl)
4308+{
4309+ size_t len;
4310+ struct mirror_set *ms = NULL;
4311+
4312+ if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
4313+ return NULL;
4314+
4315+ len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
4316+
4317+ ms = kmalloc(len, GFP_KERNEL);
4318+ if (!ms) {
4319+ ti->error = "dm-mirror: Cannot allocate mirror context";
4320+ return NULL;
4321+ }
4322+
4323+ memset(ms, 0, len);
4324+ spin_lock_init(&ms->lock);
4325+
4326+ ms->ti = ti;
4327+ ms->nr_mirrors = nr_mirrors;
4328+ ms->nr_regions = dm_div_up(ti->len, region_size);
4329+
4330+ if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
4331+ ti->error = "dm-mirror: Error creating dirty region hash";
4332+ kfree(ms);
4333+ return NULL;
4334+ }
4335+
4336+ return ms;
4337+}
4338+
4339+static void free_context(struct mirror_set *ms, struct dm_target *ti,
4340+ unsigned int m)
4341+{
4342+ while (m--)
4343+ dm_put_device(ti, ms->mirror[m].dev);
4344+
4345+ rh_exit(&ms->rh);
4346+ kfree(ms);
4347+}
4348+
4349+static inline int _check_region_size(struct dm_target *ti, sector_t size)
4350+{
4351+ return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
4352+ size > ti->len);
4353+}
4354+
4355+static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
4356+ unsigned int mirror, char **argv)
4357+{
4358+ sector_t offset;
4359+
4360+ if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
4361+ ti->error = "dm-mirror: Invalid offset";
4362+ return -EINVAL;
4363+ }
4364+
4365+ if (dm_get_device(ti, argv[0], offset, ti->len,
4366+ dm_table_get_mode(ti->table),
4367+ &ms->mirror[mirror].dev)) {
4368+ ti->error = "dm-mirror: Device lookup failure";
4369+ return -ENXIO;
4370+ }
4371+
4372+ ms->mirror[mirror].offset = offset;
4373+
4374+ return 0;
4375+}
4376+
4377+static int add_mirror_set(struct mirror_set *ms)
4378+{
4379+ down_write(&_mirror_sets_lock);
4380+ list_add_tail(&ms->list, &_mirror_sets);
4381+ up_write(&_mirror_sets_lock);
4382+ dm_daemon_wake(&_kmirrord);
4383+
4384+ return 0;
4385+}
4386+
4387+static void del_mirror_set(struct mirror_set *ms)
4388+{
4389+ down_write(&_mirror_sets_lock);
4390+ list_del(&ms->list);
4391+ up_write(&_mirror_sets_lock);
4392+}
4393+
4394+/*
4395+ * Create dirty log: log_type #log_params <log_params>
4396+ */
4397+static struct dirty_log *create_dirty_log(struct dm_target *ti,
4398+ unsigned int argc, char **argv,
4399+ unsigned int *args_used)
4400+{
4401+ unsigned int param_count;
4402+ struct dirty_log *dl;
4403+
4404+ if (argc < 2) {
4405+ ti->error = "dm-mirror: Insufficient mirror log arguments";
4406+ return NULL;
4407+ }
4408+
4409+ if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
4410+ ti->error = "dm-mirror: Invalid mirror log argument count";
4411+ return NULL;
4412+ }
4413+
4414+ *args_used = 2 + param_count;
4415+
4416+ if (argc < *args_used) {
4417+ ti->error = "dm-mirror: Insufficient mirror log arguments";
4418+ return NULL;
4419+ }
4420+
4421+ dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
4422+ if (!dl) {
4423+ ti->error = "dm-mirror: Error creating mirror dirty log";
4424+ return NULL;
4425+ }
4426+
4427+ if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
4428+ ti->error = "dm-mirror: Invalid region size";
4429+ dm_destroy_dirty_log(dl);
4430+ return NULL;
4431+ }
4432+
4433+ return dl;
4434+}
4435+
4436+/*
4437+ * Construct a mirror mapping:
4438+ *
4439+ * log_type #log_params <log_params>
4440+ * #mirrors [mirror_path offset]{2,}
4441+ *
4442+ * For now, #log_params = 1, log_type = "core"
4443+ *
4444+ */
4445+#define DM_IO_PAGES 64
4446+static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4447+{
4448+ int r;
4449+ unsigned int nr_mirrors, m, args_used;
4450+ struct mirror_set *ms;
4451+ struct dirty_log *dl;
4452+
4453+ dl = create_dirty_log(ti, argc, argv, &args_used);
4454+ if (!dl)
4455+ return -EINVAL;
4456+
4457+ argv += args_used;
4458+ argc -= args_used;
4459+
4460+ if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
4461+ nr_mirrors < 2) {
4462+ ti->error = "dm-mirror: Invalid number of mirrors";
4463+ dm_destroy_dirty_log(dl);
4464+ return -EINVAL;
4465+ }
4466+
4467+ argv++, argc--;
4468+
4469+ if (argc != nr_mirrors * 2) {
4470+ ti->error = "dm-mirror: Wrong number of mirror arguments";
4471+ dm_destroy_dirty_log(dl);
4472+ return -EINVAL;
4473+ }
4474+
4475+ ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
4476+ if (!ms) {
4477+ dm_destroy_dirty_log(dl);
4478+ return -ENOMEM;
4479+ }
4480+
4481+ /* Get the mirror parameter sets */
4482+ for (m = 0; m < nr_mirrors; m++) {
4483+ r = get_mirror(ms, ti, m, argv);
4484+ if (r) {
4485+ free_context(ms, ti, m);
4486+ return r;
4487+ }
4488+ argv += 2;
4489+ argc -= 2;
4490+ }
4491+
4492+ ti->private = ms;
4493+
4494+ r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
4495+ if (r) {
4496+ free_context(ms, ti, ms->nr_mirrors);
4497+ return r;
4498+ }
4499+
4500+ add_mirror_set(ms);
4501+ return 0;
4502+}
4503+
4504+static void mirror_dtr(struct dm_target *ti)
4505+{
4506+ struct mirror_set *ms = (struct mirror_set *) ti->private;
4507+
4508+ del_mirror_set(ms);
4509+ kcopyd_client_destroy(ms->kcopyd_client);
4510+ free_context(ms, ti, ms->nr_mirrors);
4511+}
4512+
4513+static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
4514+{
4515+ int wake = 0;
4516+ struct buffer_list *bl;
4517+
4518+ bl = (rw == WRITE) ? &ms->writes : &ms->reads;
4519+ spin_lock(&ms->lock);
4520+ wake = !(bl->head);
4521+ buffer_list_add(bl, bh);
4522+ spin_unlock(&ms->lock);
4523+
4524+ if (wake)
4525+ dm_daemon_wake(&_kmirrord);
4526+}
4527+
4528+/*
4529+ * Mirror mapping function
4530+ */
4531+static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
4532+ int rw, union map_info *map_context)
4533+{
4534+ int r;
4535+ struct mirror *m;
4536+ struct mirror_set *ms = ti->private;
4537+
4538+ /* FIXME: nasty hack, 32 bit sector_t only */
4539+ map_context->ll = bh->b_rsector / ms->rh.region_size;
4540+
4541+ if (rw == WRITE) {
4542+ queue_bh(ms, bh, rw);
4543+ return 0;
4544+ }
4545+
4546+ r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0);
4547+ if (r < 0 && r != -EWOULDBLOCK)
4548+ return r;
4549+
4550+ if (r == -EWOULDBLOCK) /* FIXME: ugly */
4551+ r = 0;
4552+
4553+ /*
4554+ * We don't want to fast track a recovery just for a read
4555+ * ahead. So we just let it silently fail.
4556+ * FIXME: get rid of this.
4557+ */
4558+ if (!r && rw == READA)
4559+ return -EIO;
4560+
4561+ if (!r) {
4562+ /* Pass this io over to the daemon */
4563+ queue_bh(ms, bh, rw);
4564+ return 0;
4565+ }
4566+
4567+ m = choose_mirror(ms, bh->b_rsector);
4568+ if (!m)
4569+ return -EIO;
4570+
4571+ map_buffer(ms, m, bh);
4572+ return 1;
4573+}
4574+
4575+static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
4576+ int rw, int error, union map_info *map_context)
4577+{
4578+ struct mirror_set *ms = (struct mirror_set *) ti->private;
4579+ region_t region = map_context->ll;
4580+
4581+ /*
4582+ * We need to dec pending if this was a write.
4583+ */
4584+ if (rw == WRITE)
4585+ rh_dec(&ms->rh, region);
4586+
4587+ return 0;
4588+}
4589+
4590+static void mirror_suspend(struct dm_target *ti)
4591+{
4592+ struct mirror_set *ms = (struct mirror_set *) ti->private;
4593+ rh_stop_recovery(&ms->rh);
4594+}
4595+
4596+static void mirror_resume(struct dm_target *ti)
4597+{
4598+ struct mirror_set *ms = (struct mirror_set *) ti->private;
4599+ rh_start_recovery(&ms->rh);
4600+}
4601+
4602+static int mirror_status(struct dm_target *ti, status_type_t type,
4603+ char *result, unsigned int maxlen)
4604+{
4605+ unsigned int m, sz = 0;
4606+ struct mirror_set *ms = (struct mirror_set *) ti->private;
4607+
4608+ switch (type) {
4609+ case STATUSTYPE_INFO:
4610+ sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
4611+
4612+ for (m = 0; m < ms->nr_mirrors; m++)
4613+ sz += snprintf(result + sz, maxlen - sz, "%s ",
4614+ dm_kdevname(ms->mirror[m].dev->dev));
4615+
4616+ sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
4617+ ms->sync_count, ms->nr_regions);
4618+ break;
4619+
4620+ case STATUSTYPE_TABLE:
4621+ sz += snprintf(result + sz, maxlen - sz,
4622+ "%s 1 " SECTOR_FORMAT " %d ",
4623+ ms->rh.log->type->name, ms->rh.region_size,
4624+ ms->nr_mirrors);
4625+
4626+ for (m = 0; m < ms->nr_mirrors; m++)
4627+ sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
4628+ dm_kdevname(ms->mirror[m].dev->dev),
4629+ ms->mirror[m].offset);
4630+ }
4631+
4632+ return 0;
4633+}
4634+
4635+static struct target_type mirror_target = {
4636+ .name = "mirror",
4637+ .module = THIS_MODULE,
4638+ .ctr = mirror_ctr,
4639+ .dtr = mirror_dtr,
4640+ .map = mirror_map,
4641+ .end_io = mirror_end_io,
4642+ .suspend = mirror_suspend,
4643+ .resume = mirror_resume,
4644+ .status = mirror_status,
4645+};
4646+
4647+static int __init dm_mirror_init(void)
4648+{
4649+ int r;
4650+
4651+ r = dm_dirty_log_init();
4652+ if (r)
4653+ return r;
4654+
4655+ r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
4656+ if (r) {
4657+ DMERR("couldn't start kmirrord");
4658+ dm_dirty_log_exit();
4659+ return r;
4660+ }
4661+
4662+ r = dm_register_target(&mirror_target);
4663+ if (r < 0) {
4664+ DMERR("%s: Failed to register mirror target",
4665+ mirror_target.name);
4666+ dm_dirty_log_exit();
4667+ dm_daemon_stop(&_kmirrord);
4668+ }
4669+
4670+ return r;
4671+}
4672+
4673+static void __exit dm_mirror_exit(void)
4674+{
4675+ int r;
4676+
4677+ r = dm_unregister_target(&mirror_target);
4678+ if (r < 0)
4679+ DMERR("%s: unregister failed %d", mirror_target.name, r);
4680+
4681+ dm_daemon_stop(&_kmirrord);
4682+ dm_dirty_log_exit();
4683+}
4684+
4685+/* Module hooks */
4686+module_init(dm_mirror_init);
4687+module_exit(dm_mirror_exit);
4688+
4689+MODULE_DESCRIPTION(DM_NAME " mirror target");
4690+MODULE_AUTHOR("Heinz Mauelshagen <mge@sistina.com>");
4691+MODULE_LICENSE("GPL");
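The recovery path at the top of dm-mirror.c above sizes the copy for the final region with ti->len & (region_size - 1), which is valid only because the dirty-log region size is checked to be a power of two; when the target length is an exact multiple of the region size the mask yields zero and the code falls back to a full region. A minimal standalone sketch of that arithmetic, using made-up lengths rather than anything from the patch:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Length of the copy issued for a given region, mirroring recover(). */
static sector_t recovery_count(sector_t ti_len, sector_t region_size,
			       sector_t region, sector_t nr_regions)
{
	sector_t count;

	if (region == nr_regions - 1) {
		/* Final region may be partial: ti_len % region_size via mask. */
		count = ti_len & (region_size - 1);
		if (!count)
			count = region_size;
	} else
		count = region_size;

	return count;
}

int main(void)
{
	/* Hypothetical 1000-sector target with 128-sector regions: 8 regions,
	 * the last one covering only 1000 - 7 * 128 = 104 sectors. */
	sector_t len = 1000, rs = 128, nr = (len + rs - 1) / rs;

	assert(recovery_count(len, rs, 0, nr) == 128);
	assert(recovery_count(len, rs, nr - 1, nr) == 104);
	printf("last region: %llu sectors\n",
	       recovery_count(len, rs, nr - 1, nr));
	return 0;
}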
339dbf15
AM
4692--- linux-2.4.21/drivers/md/dm-snapshot.c Thu Jan 1 01:00:00 1970
4693+++ linux/drivers/md/dm-snapshot.c Wed Aug 20 14:41:38 2003
2ac564b8
AM
4694@@ -0,0 +1,1235 @@
4695+/*
4696+ * dm-snapshot.c
4697+ *
4698+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4699+ *
4700+ * This file is released under the GPL.
4701+ */
4702+
4703+#include <linux/config.h>
4704+#include <linux/ctype.h>
4705+#include <linux/module.h>
4706+#include <linux/init.h>
4707+#include <linux/slab.h>
4708+#include <linux/list.h>
4709+#include <linux/fs.h>
4710+#include <linux/blkdev.h>
4711+#include <linux/mempool.h>
4712+#include <linux/device-mapper.h>
4713+#include <linux/vmalloc.h>
4714+
4715+#include "dm-snapshot.h"
4716+#include "kcopyd.h"
4717+
4718+/*
4719+ * FIXME: Remove this before release.
4720+ */
4721+#if 0
4722+#define DMDEBUG(x...) DMWARN( ## x)
4723+#else
4724+#define DMDEBUG(x...)
4725+#endif
4726+
4727+/*
4728+ * The percentage increment we will wake up users at
4729+ */
4730+#define WAKE_UP_PERCENT 5
4731+
4732+/*
4733+ * kcopyd priority of snapshot operations
4734+ */
4735+#define SNAPSHOT_COPY_PRIORITY 2
4736+
4737+/*
4738+ * Each snapshot reserves this many pages for io
4739+ * FIXME: calculate this
4740+ */
4741+#define SNAPSHOT_PAGES 256
4742+
4743+struct pending_exception {
4744+ struct exception e;
4745+
4746+ /*
4747+ * Origin buffers waiting for this to complete are held
4748+ * in a list (using b_reqnext).
4749+ */
4750+ struct buffer_head *origin_bhs;
4751+ struct buffer_head *snapshot_bhs;
4752+
4753+ /*
4754+ * Other pending_exceptions that are processing this
4755+ * chunk. When this list is empty, we know we can
4756+ * complete the origins.
4757+ */
4758+ struct list_head siblings;
4759+
4760+ /* Pointer back to snapshot context */
4761+ struct dm_snapshot *snap;
4762+
4763+ /*
4764+ * 1 indicates the exception has already been sent to
4765+ * kcopyd.
4766+ */
4767+ int started;
4768+};
4769+
4770+/*
4771+ * Hash table mapping origin volumes to lists of snapshots and
4772+ * a lock to protect it
4773+ */
4774+static kmem_cache_t *exception_cache;
4775+static kmem_cache_t *pending_cache;
4776+static mempool_t *pending_pool;
4777+
4778+/*
4779+ * One of these per registered origin, held in the snapshot_origins hash
4780+ */
4781+struct origin {
4782+ /* The origin device */
4783+ kdev_t dev;
4784+
4785+ struct list_head hash_list;
4786+
4787+ /* List of snapshots for this origin */
4788+ struct list_head snapshots;
4789+};
4790+
4791+/*
4792+ * Size of the hash table for origin volumes. If we make this
4793+ * the size of the minors list then it should be nearly perfect
4794+ */
4795+#define ORIGIN_HASH_SIZE 256
4796+#define ORIGIN_MASK 0xFF
4797+static struct list_head *_origins;
4798+static struct rw_semaphore _origins_lock;
4799+
4800+static int init_origin_hash(void)
4801+{
4802+ int i;
4803+
4804+ _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4805+ GFP_KERNEL);
4806+ if (!_origins) {
4807+ DMERR("Device mapper: Snapshot: unable to allocate memory");
4808+ return -ENOMEM;
4809+ }
4810+
4811+ for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4812+ INIT_LIST_HEAD(_origins + i);
4813+ init_rwsem(&_origins_lock);
4814+
4815+ return 0;
4816+}
4817+
4818+static void exit_origin_hash(void)
4819+{
4820+ kfree(_origins);
4821+}
4822+
4823+static inline unsigned int origin_hash(kdev_t dev)
4824+{
4825+ return MINOR(dev) & ORIGIN_MASK;
4826+}
4827+
4828+static struct origin *__lookup_origin(kdev_t origin)
4829+{
4830+ struct list_head *slist;
4831+ struct list_head *ol;
4832+ struct origin *o;
4833+
4834+ ol = &_origins[origin_hash(origin)];
4835+ list_for_each(slist, ol) {
4836+ o = list_entry(slist, struct origin, hash_list);
4837+
4838+ if (o->dev == origin)
4839+ return o;
4840+ }
4841+
4842+ return NULL;
4843+}
4844+
4845+static void __insert_origin(struct origin *o)
4846+{
4847+ struct list_head *sl = &_origins[origin_hash(o->dev)];
4848+ list_add_tail(&o->hash_list, sl);
4849+}
4850+
4851+/*
4852+ * Make a note of the snapshot and its origin so we can look it
4853+ * up when the origin has a write on it.
4854+ */
4855+static int register_snapshot(struct dm_snapshot *snap)
4856+{
4857+ struct origin *o;
4858+ kdev_t dev = snap->origin->dev;
4859+
4860+ down_write(&_origins_lock);
4861+ o = __lookup_origin(dev);
4862+
4863+ if (!o) {
4864+ /* New origin */
4865+ o = kmalloc(sizeof(*o), GFP_KERNEL);
4866+ if (!o) {
4867+ up_write(&_origins_lock);
4868+ return -ENOMEM;
4869+ }
4870+
4871+ /* Initialise the struct */
4872+ INIT_LIST_HEAD(&o->snapshots);
4873+ o->dev = dev;
4874+
4875+ __insert_origin(o);
4876+ }
4877+
4878+ list_add_tail(&snap->list, &o->snapshots);
4879+
4880+ up_write(&_origins_lock);
4881+ return 0;
4882+}
4883+
4884+static void unregister_snapshot(struct dm_snapshot *s)
4885+{
4886+ struct origin *o;
4887+
4888+ down_write(&_origins_lock);
4889+ o = __lookup_origin(s->origin->dev);
4890+
4891+ list_del(&s->list);
4892+ if (list_empty(&o->snapshots)) {
4893+ list_del(&o->hash_list);
4894+ kfree(o);
4895+ }
4896+
4897+ up_write(&_origins_lock);
4898+}
4899+
4900+/*
4901+ * Implementation of the exception hash tables.
4902+ */
4903+static int init_exception_table(struct exception_table *et, uint32_t size)
4904+{
4905+ unsigned int i;
4906+
4907+ et->hash_mask = size - 1;
4908+ et->table = vcalloc(size, sizeof(struct list_head));
4909+ if (!et->table)
4910+ return -ENOMEM;
4911+
4912+ for (i = 0; i < size; i++)
4913+ INIT_LIST_HEAD(et->table + i);
4914+
4915+ return 0;
4916+}
4917+
4918+static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
4919+{
4920+ struct list_head *slot, *entry, *temp;
4921+ struct exception *ex;
4922+ int i, size;
4923+
4924+ size = et->hash_mask + 1;
4925+ for (i = 0; i < size; i++) {
4926+ slot = et->table + i;
4927+
4928+ list_for_each_safe(entry, temp, slot) {
4929+ ex = list_entry(entry, struct exception, hash_list);
4930+ kmem_cache_free(mem, ex);
4931+ }
4932+ }
4933+
4934+ vfree(et->table);
4935+}
4936+
4937+/*
4938+ * FIXME: check how this hash fn is performing.
4939+ */
4940+static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
4941+{
4942+ return chunk & et->hash_mask;
4943+}
4944+
4945+static void insert_exception(struct exception_table *eh, struct exception *e)
4946+{
4947+ struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
4948+ list_add(&e->hash_list, l);
4949+}
4950+
4951+static inline void remove_exception(struct exception *e)
4952+{
4953+ list_del(&e->hash_list);
4954+}
4955+
4956+/*
4957+ * Return the exception data for a sector, or NULL if not
4958+ * remapped.
4959+ */
4960+static struct exception *lookup_exception(struct exception_table *et,
4961+ chunk_t chunk)
4962+{
4963+ struct list_head *slot, *el;
4964+ struct exception *e;
4965+
4966+ slot = &et->table[exception_hash(et, chunk)];
4967+ list_for_each(el, slot) {
4968+ e = list_entry(el, struct exception, hash_list);
4969+ if (e->old_chunk == chunk)
4970+ return e;
4971+ }
4972+
4973+ return NULL;
4974+}
4975+
4976+static inline struct exception *alloc_exception(void)
4977+{
4978+ struct exception *e;
4979+
4980+ e = kmem_cache_alloc(exception_cache, GFP_NOIO);
4981+ if (!e)
4982+ e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
4983+
4984+ return e;
4985+}
4986+
4987+static inline void free_exception(struct exception *e)
4988+{
4989+ kmem_cache_free(exception_cache, e);
4990+}
4991+
4992+static inline struct pending_exception *alloc_pending_exception(void)
4993+{
4994+ return mempool_alloc(pending_pool, GFP_NOIO);
4995+}
4996+
4997+static inline void free_pending_exception(struct pending_exception *pe)
4998+{
4999+ mempool_free(pe, pending_pool);
5000+}
5001+
5002+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
5003+{
5004+ struct exception *e;
5005+
5006+ e = alloc_exception();
5007+ if (!e)
5008+ return -ENOMEM;
5009+
5010+ e->old_chunk = old;
5011+ e->new_chunk = new;
5012+ insert_exception(&s->complete, e);
5013+ return 0;
5014+}
5015+
5016+/*
5017+ * Hard coded magic.
5018+ */
5019+static int calc_max_buckets(void)
5020+{
5021+ unsigned long mem;
5022+
5023+ mem = num_physpages << PAGE_SHIFT;
5024+ mem /= 50;
5025+ mem /= sizeof(struct list_head);
5026+
5027+ return mem;
5028+}
5029+
5030+/*
5031+ * Rounds a number down to a power of 2.
5032+ */
5033+static inline uint32_t round_down(uint32_t n)
5034+{
5035+ while (n & (n - 1))
5036+ n &= (n - 1);
5037+ return n;
5038+}
5039+
5040+/*
5041+ * Allocate room for a suitable hash table.
5042+ */
5043+static int init_hash_tables(struct dm_snapshot *s)
5044+{
5045+ sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
5046+
5047+ /*
5048+ * Calculate based on the size of the original volume or
5049+ * the COW volume...
5050+ */
5051+ cow_dev_size = get_dev_size(s->cow->dev);
5052+ origin_dev_size = get_dev_size(s->origin->dev);
5053+ max_buckets = calc_max_buckets();
5054+
5055+ hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
5056+ hash_size = min(hash_size, max_buckets);
5057+
5058+ /* Round it down to a power of 2 */
5059+ hash_size = round_down(hash_size);
5060+ if (init_exception_table(&s->complete, hash_size))
5061+ return -ENOMEM;
5062+
5063+ /*
5064+ * Allocate hash table for in-flight exceptions
5065+ * Make this smaller than the real hash table
5066+ */
5067+ hash_size >>= 3;
5068+ if (!hash_size)
5069+ hash_size = 64;
5070+
5071+ if (init_exception_table(&s->pending, hash_size)) {
5072+ exit_exception_table(&s->complete, exception_cache);
5073+ return -ENOMEM;
5074+ }
5075+
5076+ return 0;
5077+}
5078+
5079+/*
5080+ * Round a number up to the nearest 'size' boundary. size must
5081+ * be a power of 2.
5082+ */
5083+static inline ulong round_up(ulong n, ulong size)
5084+{
5085+ size--;
5086+ return (n + size) & ~size;
5087+}
5088+
5089+/*
5090+ * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
5091+ */
5092+static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5093+{
5094+ struct dm_snapshot *s;
5095+ unsigned long chunk_size;
5096+ int r = -EINVAL;
5097+ char persistent;
5098+ char *origin_path;
5099+ char *cow_path;
5100+ char *value;
5101+ int blocksize;
5102+
5103+ if (argc < 4) {
5104+ ti->error = "dm-snapshot: requires exactly 4 arguments";
5105+ r = -EINVAL;
5106+ goto bad1;
5107+ }
5108+
5109+ origin_path = argv[0];
5110+ cow_path = argv[1];
5111+ persistent = toupper(*argv[2]);
5112+
5113+ if (persistent != 'P' && persistent != 'N') {
5114+ ti->error = "Persistent flag is not P or N";
5115+ r = -EINVAL;
5116+ goto bad1;
5117+ }
5118+
5119+ chunk_size = simple_strtoul(argv[3], &value, 10);
5120+ if (chunk_size == 0 || value == NULL) {
5121+ ti->error = "Invalid chunk size";
5122+ r = -EINVAL;
5123+ goto bad1;
5124+ }
5125+
5126+ s = kmalloc(sizeof(*s), GFP_KERNEL);
5127+ if (s == NULL) {
5128+ ti->error = "Cannot allocate snapshot context private "
5129+ "structure";
5130+ r = -ENOMEM;
5131+ goto bad1;
5132+ }
5133+
5134+ r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
5135+ if (r) {
5136+ ti->error = "Cannot get origin device";
5137+ goto bad2;
5138+ }
5139+
5140+ /* FIXME: get cow length */
5141+ r = dm_get_device(ti, cow_path, 0, 0,
5142+ FMODE_READ | FMODE_WRITE, &s->cow);
5143+ if (r) {
5144+ dm_put_device(ti, s->origin);
5145+ ti->error = "Cannot get COW device";
5146+ goto bad2;
5147+ }
5148+
5149+ /*
5150+	 * Chunk size must be a multiple of page size. Silently
5151+ * round up if it's not.
5152+ */
5153+ chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
5154+
5155+ /* Validate the chunk size against the device block size */
5156+ blocksize = get_hardsect_size(s->cow->dev);
5157+ if (chunk_size % (blocksize / SECTOR_SIZE)) {
5158+ ti->error = "Chunk size is not a multiple of device blocksize";
5159+ r = -EINVAL;
5160+ goto bad3;
5161+ }
5162+
5163+ /* Check the sizes are small enough to fit in one kiovec */
5164+ if (chunk_size > KIO_MAX_SECTORS) {
5165+ ti->error = "Chunk size is too big";
5166+ r = -EINVAL;
5167+ goto bad3;
5168+ }
5169+
5170+ /* Check chunk_size is a power of 2 */
5171+ if (chunk_size & (chunk_size - 1)) {
5172+ ti->error = "Chunk size is not a power of 2";
5173+ r = -EINVAL;
5174+ goto bad3;
5175+ }
5176+
5177+ s->chunk_size = chunk_size;
5178+ s->chunk_mask = chunk_size - 1;
5179+ s->type = persistent;
5180+ for (s->chunk_shift = 0; chunk_size;
5181+ s->chunk_shift++, chunk_size >>= 1)
5182+ ;
5183+ s->chunk_shift--;
5184+
5185+ s->valid = 1;
5186+ s->have_metadata = 0;
5187+ s->last_percent = 0;
5188+ init_rwsem(&s->lock);
5189+ s->table = ti->table;
5190+
5191+ /* Allocate hash table for COW data */
5192+ if (init_hash_tables(s)) {
5193+ ti->error = "Unable to allocate hash table space";
5194+ r = -ENOMEM;
5195+ goto bad3;
5196+ }
5197+
5198+ /*
5199+ * Check the persistent flag - done here because we need the iobuf
5200+ * to check the LV header
5201+ */
5202+ s->store.snap = s;
5203+
5204+ if (persistent == 'P')
5205+ r = dm_create_persistent(&s->store, s->chunk_size);
5206+ else
5207+ r = dm_create_transient(&s->store, s, blocksize);
5208+
5209+ if (r) {
5210+ ti->error = "Couldn't create exception store";
5211+ r = -EINVAL;
5212+ goto bad4;
5213+ }
5214+
5215+ r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
5216+ if (r) {
5217+ ti->error = "Could not create kcopyd client";
5218+ goto bad5;
5219+ }
5220+
5221+ /* Flush IO to the origin device */
5222+ fsync_dev(s->origin->dev);
5223+
5224+ /* Add snapshot to the list of snapshots for this origin */
5225+ if (register_snapshot(s)) {
5226+ r = -EINVAL;
5227+ ti->error = "Cannot register snapshot origin";
5228+ goto bad6;
5229+ }
5230+
5231+ ti->private = s;
5232+ return 0;
5233+
5234+ bad6:
5235+ kcopyd_client_destroy(s->kcopyd_client);
5236+
5237+ bad5:
5238+ s->store.destroy(&s->store);
5239+
5240+ bad4:
5241+ exit_exception_table(&s->pending, pending_cache);
5242+ exit_exception_table(&s->complete, exception_cache);
5243+
5244+ bad3:
5245+ dm_put_device(ti, s->cow);
5246+ dm_put_device(ti, s->origin);
5247+
5248+ bad2:
5249+ kfree(s);
5250+
5251+ bad1:
5252+ return r;
5253+}
5254+
5255+static void snapshot_dtr(struct dm_target *ti)
5256+{
5257+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5258+
5259+ dm_table_event(ti->table);
5260+
5261+ unregister_snapshot(s);
5262+
5263+ exit_exception_table(&s->pending, pending_cache);
5264+ exit_exception_table(&s->complete, exception_cache);
5265+
5266+ /* Deallocate memory used */
5267+ s->store.destroy(&s->store);
5268+
5269+ dm_put_device(ti, s->origin);
5270+ dm_put_device(ti, s->cow);
5271+ kcopyd_client_destroy(s->kcopyd_client);
5272+ kfree(s);
5273+}
5274+
5275+/*
5276+ * We hold lists of buffer_heads, using the b_reqnext field.
5277+ */
5278+static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
5279+{
5280+ bh->b_reqnext = *queue;
5281+ *queue = bh;
5282+}
5283+
5284+/*
5285+ * FIXME: inefficient.
5286+ */
5287+static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
5288+{
5289+ while (*queue)
5290+ queue = &((*queue)->b_reqnext);
5291+
5292+ *queue = bhs;
5293+}
5294+
5295+/*
5296+ * Flush a list of buffers.
5297+ */
5298+static void flush_buffers(struct buffer_head *bh)
5299+{
5300+ struct buffer_head *n;
5301+
5302+ DMDEBUG("begin flush");
5303+ while (bh) {
5304+ n = bh->b_reqnext;
5305+ bh->b_reqnext = NULL;
5306+ DMDEBUG("flushing %p", bh);
5307+ generic_make_request(WRITE, bh);
5308+ bh = n;
5309+ }
5310+
5311+ run_task_queue(&tq_disk);
5312+}
5313+
5314+/*
5315+ * Error a list of buffers.
5316+ */
5317+static void error_buffers(struct buffer_head *bh)
5318+{
5319+ struct buffer_head *n;
5320+
5321+ while (bh) {
5322+ n = bh->b_reqnext;
5323+ bh->b_reqnext = NULL;
5324+ buffer_IO_error(bh);
5325+ bh = n;
5326+ }
5327+}
5328+
5329+static struct buffer_head *__flush_bhs(struct pending_exception *pe)
5330+{
5331+ struct pending_exception *sibling;
5332+
5333+ if (list_empty(&pe->siblings))
5334+ return pe->origin_bhs;
5335+
5336+ sibling = list_entry(pe->siblings.next,
5337+ struct pending_exception, siblings);
5338+
5339+ list_del(&pe->siblings);
5340+
5341+ /* FIXME: I think there's a race on SMP machines here, add spin lock */
5342+ queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
5343+
5344+ return NULL;
5345+}
5346+
5347+static void pending_complete(struct pending_exception *pe, int success)
5348+{
5349+ struct exception *e;
5350+ struct dm_snapshot *s = pe->snap;
5351+ struct buffer_head *flush = NULL;
5352+
5353+ if (success) {
5354+ e = alloc_exception();
5355+ if (!e) {
5356+ DMWARN("Unable to allocate exception.");
5357+ down_write(&s->lock);
5358+ s->store.drop_snapshot(&s->store);
5359+ s->valid = 0;
5360+ flush = __flush_bhs(pe);
5361+ up_write(&s->lock);
5362+
5363+ error_buffers(pe->snapshot_bhs);
5364+ goto out;
5365+ }
5366+
5367+ /*
5368+ * Add a proper exception, and remove the
5369+ * in-flight exception from the list.
5370+ */
5371+ down_write(&s->lock);
5372+
5373+ memcpy(e, &pe->e, sizeof(*e));
5374+ insert_exception(&s->complete, e);
5375+ remove_exception(&pe->e);
5376+ flush = __flush_bhs(pe);
5377+
5378+ /* Submit any pending write BHs */
5379+ up_write(&s->lock);
5380+
5381+ flush_buffers(pe->snapshot_bhs);
5382+ DMDEBUG("Exception completed successfully.");
5383+
5384+ /* Notify any interested parties */
5385+ if (s->store.fraction_full) {
5386+ sector_t numerator, denominator;
5387+ int pc;
5388+
5389+ s->store.fraction_full(&s->store, &numerator,
5390+ &denominator);
5391+ pc = numerator * 100 / denominator;
5392+
5393+ if (pc >= s->last_percent + WAKE_UP_PERCENT) {
5394+ dm_table_event(s->table);
5395+ s->last_percent = pc - pc % WAKE_UP_PERCENT;
5396+ }
5397+ }
5398+
5399+ } else {
5400+ /* Read/write error - snapshot is unusable */
5401+ down_write(&s->lock);
5402+ if (s->valid)
5403+ DMERR("Error reading/writing snapshot");
5404+ s->store.drop_snapshot(&s->store);
5405+ s->valid = 0;
5406+ remove_exception(&pe->e);
5407+ flush = __flush_bhs(pe);
5408+ up_write(&s->lock);
5409+
5410+ error_buffers(pe->snapshot_bhs);
5411+
5412+ dm_table_event(s->table);
5413+ DMDEBUG("Exception failed.");
5414+ }
5415+
5416+ out:
5417+ if (flush)
5418+ flush_buffers(flush);
5419+
5420+ free_pending_exception(pe);
5421+}
5422+
5423+static void commit_callback(void *context, int success)
5424+{
5425+ struct pending_exception *pe = (struct pending_exception *) context;
5426+ pending_complete(pe, success);
5427+}
5428+
5429+/*
5430+ * Called when the copy I/O has finished. kcopyd actually runs
5431+ * this code so don't block.
5432+ */
5433+static void copy_callback(int read_err, unsigned int write_err, void *context)
5434+{
5435+ struct pending_exception *pe = (struct pending_exception *) context;
5436+ struct dm_snapshot *s = pe->snap;
5437+
5438+ if (read_err || write_err)
5439+ pending_complete(pe, 0);
5440+
5441+ else
5442+ /* Update the metadata if we are persistent */
5443+ s->store.commit_exception(&s->store, &pe->e, commit_callback,
5444+ pe);
5445+}
5446+
5447+/*
5448+ * Dispatches the copy operation to kcopyd.
5449+ */
5450+static inline void start_copy(struct pending_exception *pe)
5451+{
5452+ struct dm_snapshot *s = pe->snap;
5453+ struct io_region src, dest;
5454+ kdev_t dev = s->origin->dev;
5455+ int *sizes = blk_size[major(dev)];
5456+ sector_t dev_size = (sector_t) -1;
5457+
5458+ if (pe->started)
5459+ return;
5460+
5461+ /* this is protected by snap->lock */
5462+ pe->started = 1;
5463+
5464+ if (sizes && sizes[minor(dev)])
5465+ dev_size = sizes[minor(dev)] << 1;
5466+
5467+ src.dev = dev;
5468+ src.sector = chunk_to_sector(s, pe->e.old_chunk);
5469+ src.count = min(s->chunk_size, dev_size - src.sector);
5470+
5471+ dest.dev = s->cow->dev;
5472+ dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5473+ dest.count = src.count;
5474+
5475+ /* Hand over to kcopyd */
5476+ kcopyd_copy(s->kcopyd_client,
5477+ &src, 1, &dest, 0, copy_callback, pe);
5478+}
5479+
5480+/*
5481+ * Looks to see if this snapshot already has a pending exception
5482+ * for this chunk, otherwise it allocates a new one and inserts
5483+ * it into the pending table.
5484+ */
5485+static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5486+ struct buffer_head *bh)
5487+{
5488+ struct exception *e;
5489+ struct pending_exception *pe;
5490+ chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5491+
5492+ /*
5493+ * Is there a pending exception for this already ?
5494+ */
5495+ e = lookup_exception(&s->pending, chunk);
5496+ if (e) {
5497+ /* cast the exception to a pending exception */
5498+ pe = list_entry(e, struct pending_exception, e);
5499+
5500+ } else {
5501+ /* Create a new pending exception */
5502+ pe = alloc_pending_exception();
5503+ pe->e.old_chunk = chunk;
5504+ pe->origin_bhs = pe->snapshot_bhs = NULL;
5505+ INIT_LIST_HEAD(&pe->siblings);
5506+ pe->snap = s;
5507+ pe->started = 0;
5508+
5509+ if (s->store.prepare_exception(&s->store, &pe->e)) {
5510+ free_pending_exception(pe);
5511+ s->valid = 0;
5512+ return NULL;
5513+ }
5514+
5515+ insert_exception(&s->pending, &pe->e);
5516+ }
5517+
5518+ return pe;
5519+}
5520+
5521+static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5522+ struct buffer_head *bh)
5523+{
5524+ bh->b_rdev = s->cow->dev;
5525+ bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5526+ (bh->b_rsector & s->chunk_mask);
5527+}
5528+
5529+static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5530+ union map_info *map_context)
5531+{
5532+ struct exception *e;
5533+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5534+ int r = 1;
5535+ chunk_t chunk;
5536+ struct pending_exception *pe;
5537+
5538+ chunk = sector_to_chunk(s, bh->b_rsector);
5539+
5540+ /* Full snapshots are not usable */
5541+ if (!s->valid)
5542+ return -1;
5543+
5544+ /*
5545+ * Write to snapshot - higher level takes care of RW/RO
5546+ * flags so we should only get this if we are
5547+ * writeable.
5548+ */
5549+ if (rw == WRITE) {
5550+
5551+ down_write(&s->lock);
5552+
5553+ /* If the block is already remapped - use that, else remap it */
5554+ e = lookup_exception(&s->complete, chunk);
5555+ if (e)
5556+ remap_exception(s, e, bh);
5557+
5558+ else {
5559+ pe = find_pending_exception(s, bh);
5560+
5561+ if (!pe) {
5562+ s->store.drop_snapshot(&s->store);
5563+ s->valid = 0;
5564+ r = -EIO;
5565+ } else {
5566+ remap_exception(s, &pe->e, bh);
5567+ queue_buffer(&pe->snapshot_bhs, bh);
5568+ start_copy(pe);
5569+ r = 0;
5570+ }
5571+ }
5572+
5573+ up_write(&s->lock);
5574+
5575+ } else {
5576+ /*
5577+ * FIXME: this read path scares me because we
5578+ * always use the origin when we have a pending
5579+ * exception. However I can't think of a
5580+ * situation where this is wrong - ejt.
5581+ */
5582+
5583+ /* Do reads */
5584+ down_read(&s->lock);
5585+
5586+		/* See if it has been remapped */
5587+ e = lookup_exception(&s->complete, chunk);
5588+ if (e)
5589+ remap_exception(s, e, bh);
5590+ else
5591+ bh->b_rdev = s->origin->dev;
5592+
5593+ up_read(&s->lock);
5594+ }
5595+
5596+ return r;
5597+}
5598+
5599+void snapshot_resume(struct dm_target *ti)
5600+{
5601+ struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5602+
5603+ if (s->have_metadata)
5604+ return;
5605+
5606+ if (s->store.read_metadata(&s->store)) {
5607+ down_write(&s->lock);
5608+ s->valid = 0;
5609+ up_write(&s->lock);
5610+ }
5611+
5612+ s->have_metadata = 1;
5613+}
5614+
5615+static int snapshot_status(struct dm_target *ti, status_type_t type,
5616+ char *result, unsigned int maxlen)
5617+{
5618+ struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5619+ char cow[16];
5620+ char org[16];
5621+
5622+ switch (type) {
5623+ case STATUSTYPE_INFO:
5624+ if (!snap->valid)
5625+ snprintf(result, maxlen, "Invalid");
5626+ else {
5627+ if (snap->store.fraction_full) {
5628+ sector_t numerator, denominator;
5629+ snap->store.fraction_full(&snap->store,
5630+ &numerator,
5631+ &denominator);
5632+ snprintf(result, maxlen,
5633+ SECTOR_FORMAT "/" SECTOR_FORMAT,
5634+ numerator, denominator);
5635+ }
5636+ else
5637+ snprintf(result, maxlen, "Unknown");
5638+ }
5639+ break;
5640+
5641+ case STATUSTYPE_TABLE:
5642+ /*
5643+ * kdevname returns a static pointer so we need
5644+ * to make private copies if the output is to
5645+ * make sense.
5646+ */
5647+ strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5648+ strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5649+ snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5650+ snap->type, snap->chunk_size);
5651+ break;
5652+ }
5653+
5654+ return 0;
5655+}
5656+
5657+/*-----------------------------------------------------------------
5658+ * Origin methods
5659+ *---------------------------------------------------------------*/
5660+static void list_merge(struct list_head *l1, struct list_head *l2)
5661+{
5662+ struct list_head *l1_n, *l2_p;
5663+
5664+ l1_n = l1->next;
5665+ l2_p = l2->prev;
5666+
5667+ l1->next = l2;
5668+ l2->prev = l1;
5669+
5670+ l2_p->next = l1_n;
5671+ l1_n->prev = l2_p;
5672+}
5673+
5674+static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5675+{
5676+ int r = 1, first = 1;
5677+ struct list_head *sl;
5678+ struct dm_snapshot *snap;
5679+ struct exception *e;
5680+ struct pending_exception *pe, *last = NULL;
5681+ chunk_t chunk;
5682+
5683+ /* Do all the snapshots on this origin */
5684+ list_for_each(sl, snapshots) {
5685+ snap = list_entry(sl, struct dm_snapshot, list);
5686+
5687+ /* Only deal with valid snapshots */
5688+ if (!snap->valid)
5689+ continue;
5690+
5691+ down_write(&snap->lock);
5692+
5693+ /*
5694+ * Remember, different snapshots can have
5695+ * different chunk sizes.
5696+ */
5697+ chunk = sector_to_chunk(snap, bh->b_rsector);
5698+
5699+ /*
5700+ * Check exception table to see if block
5701+ * is already remapped in this snapshot
5702+ * and trigger an exception if not.
5703+ */
5704+ e = lookup_exception(&snap->complete, chunk);
5705+ if (!e) {
5706+ pe = find_pending_exception(snap, bh);
5707+ if (!pe) {
5708+ snap->store.drop_snapshot(&snap->store);
5709+ snap->valid = 0;
5710+
5711+ } else {
5712+ if (last)
5713+ list_merge(&pe->siblings,
5714+ &last->siblings);
5715+
5716+ last = pe;
5717+ r = 0;
5718+ }
5719+ }
5720+
5721+ up_write(&snap->lock);
5722+ }
5723+
5724+ /*
5725+ * Now that we have a complete pe list we can start the copying.
5726+ */
5727+ if (last) {
5728+ pe = last;
5729+ do {
5730+ down_write(&pe->snap->lock);
5731+ if (first)
5732+ queue_buffer(&pe->origin_bhs, bh);
5733+ start_copy(pe);
5734+ up_write(&pe->snap->lock);
5735+ first = 0;
5736+ pe = list_entry(pe->siblings.next,
5737+ struct pending_exception, siblings);
5738+
5739+ } while (pe != last);
5740+ }
5741+
5742+ return r;
5743+}
5744+
5745+/*
5746+ * Called on a write from the origin driver.
5747+ */
5748+int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5749+{
5750+ struct origin *o;
5751+ int r;
5752+
5753+ down_read(&_origins_lock);
5754+ o = __lookup_origin(origin->dev);
5755+ if (!o)
5756+ BUG();
5757+
5758+ r = __origin_write(&o->snapshots, bh);
5759+ up_read(&_origins_lock);
5760+
5761+ return r;
5762+}
5763+
5764+/*
5765+ * Origin: maps a linear range of a device, with hooks for snapshotting.
5766+ */
5767+
5768+/*
5769+ * Construct an origin mapping: <dev_path>
5770+ * The context for an origin is merely a 'struct dm_dev *'
5771+ * pointing to the real device.
5772+ */
5773+static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5774+{
5775+ int r;
5776+ struct dm_dev *dev;
5777+
5778+ if (argc != 1) {
5779+ ti->error = "dm-origin: incorrect number of arguments";
5780+ return -EINVAL;
5781+ }
5782+
5783+ r = dm_get_device(ti, argv[0], 0, ti->len,
5784+ dm_table_get_mode(ti->table), &dev);
5785+ if (r) {
5786+ ti->error = "Cannot get target device";
5787+ return r;
5788+ }
5789+
5790+ ti->private = dev;
5791+ return 0;
5792+}
5793+
5794+static void origin_dtr(struct dm_target *ti)
5795+{
5796+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5797+ dm_put_device(ti, dev);
5798+}
5799+
5800+static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5801+ union map_info *map_context)
5802+{
5803+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5804+ bh->b_rdev = dev->dev;
5805+
5806+ /* Only tell snapshots if this is a write */
5807+ return (rw == WRITE) ? do_origin(dev, bh) : 1;
5808+}
5809+
5810+static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5811+ unsigned int maxlen)
5812+{
5813+ struct dm_dev *dev = (struct dm_dev *) ti->private;
5814+
5815+ switch (type) {
5816+ case STATUSTYPE_INFO:
5817+ result[0] = '\0';
5818+ break;
5819+
5820+ case STATUSTYPE_TABLE:
5821+ snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5822+ break;
5823+ }
5824+
5825+ return 0;
5826+}
5827+
5828+static struct target_type origin_target = {
5829+ name: "snapshot-origin",
5830+ module: THIS_MODULE,
5831+ ctr: origin_ctr,
5832+ dtr: origin_dtr,
5833+ map: origin_map,
5834+ status: origin_status,
5835+};
5836+
5837+static struct target_type snapshot_target = {
5838+ name: "snapshot",
5839+ module: THIS_MODULE,
5840+ ctr: snapshot_ctr,
5841+ dtr: snapshot_dtr,
5842+ map: snapshot_map,
5843+ resume: snapshot_resume,
5844+ status: snapshot_status,
5845+};
5846+
5847+int __init dm_snapshot_init(void)
5848+{
5849+ int r;
5850+
5851+ r = dm_register_target(&snapshot_target);
5852+ if (r) {
5853+ DMERR("snapshot target register failed %d", r);
5854+ return r;
5855+ }
5856+
5857+ r = dm_register_target(&origin_target);
5858+ if (r < 0) {
5859+ DMERR("Device mapper: Origin: register failed %d\n", r);
5860+ goto bad1;
5861+ }
5862+
5863+ r = init_origin_hash();
5864+ if (r) {
5865+ DMERR("init_origin_hash failed.");
5866+ goto bad2;
5867+ }
5868+
5869+ exception_cache = kmem_cache_create("dm-snapshot-ex",
5870+ sizeof(struct exception),
5871+ __alignof__(struct exception),
5872+ 0, NULL, NULL);
5873+ if (!exception_cache) {
5874+ DMERR("Couldn't create exception cache.");
5875+ r = -ENOMEM;
5876+ goto bad3;
5877+ }
5878+
5879+ pending_cache =
5880+ kmem_cache_create("dm-snapshot-in",
5881+ sizeof(struct pending_exception),
5882+ __alignof__(struct pending_exception),
5883+ 0, NULL, NULL);
5884+ if (!pending_cache) {
5885+ DMERR("Couldn't create pending cache.");
5886+ r = -ENOMEM;
5887+ goto bad4;
5888+ }
5889+
5890+ pending_pool = mempool_create(128, mempool_alloc_slab,
5891+ mempool_free_slab, pending_cache);
5892+ if (!pending_pool) {
5893+ DMERR("Couldn't create pending pool.");
5894+ r = -ENOMEM;
5895+ goto bad5;
5896+ }
5897+
5898+ return 0;
5899+
5900+ bad5:
5901+ kmem_cache_destroy(pending_cache);
5902+ bad4:
5903+ kmem_cache_destroy(exception_cache);
5904+ bad3:
5905+ exit_origin_hash();
5906+ bad2:
5907+ dm_unregister_target(&origin_target);
5908+ bad1:
5909+ dm_unregister_target(&snapshot_target);
5910+ return r;
5911+}
5912+
5913+void dm_snapshot_exit(void)
5914+{
5915+ int r;
5916+
5917+ r = dm_unregister_target(&snapshot_target);
5918+ if (r)
5919+ DMERR("snapshot unregister failed %d", r);
5920+
5921+ r = dm_unregister_target(&origin_target);
5922+ if (r)
5923+ DMERR("origin unregister failed %d", r);
5924+
5925+ exit_origin_hash();
5926+ mempool_destroy(pending_pool);
5927+ kmem_cache_destroy(pending_cache);
5928+ kmem_cache_destroy(exception_cache);
5929+}
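pending_complete() in dm-snapshot.c above throttles dm_table_event() with WAKE_UP_PERCENT: an event is raised only once the snapshot fill percentage has advanced by at least five points, and last_percent is then snapped down to a multiple of five. A small standalone sketch of that throttling, with purely illustrative chunk counts:

#include <stdio.h>

#define WAKE_UP_PERCENT 5

/* Returns 1 if a table event would be raised for this fill level,
 * updating *last_percent the same way pending_complete() does. */
static int notify(unsigned long long numerator, unsigned long long denominator,
		  int *last_percent)
{
	int pc = numerator * 100 / denominator;

	if (pc >= *last_percent + WAKE_UP_PERCENT) {
		*last_percent = pc - pc % WAKE_UP_PERCENT;
		return 1;
	}
	return 0;
}

int main(void)
{
	int last = 0;
	unsigned long long used, total = 1000;	/* hypothetical chunk counts */

	for (used = 0; used <= total; used += 37)
		if (notify(used, total, &last))
			printf("event at %llu/%llu, last_percent now %d\n",
			       used, total, last);
	return 0;
}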
339dbf15
AM
5930--- linux-2.4.21/drivers/md/dm-snapshot.h Thu Jan 1 01:00:00 1970
5931+++ linux/drivers/md/dm-snapshot.h Wed Aug 20 14:41:38 2003
2ac564b8
AM
5932@@ -0,0 +1,158 @@
5933+/*
5934+ * dm-snapshot.c
5935+ *
5936+ * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5937+ *
5938+ * This file is released under the GPL.
5939+ */
5940+
5941+#ifndef DM_SNAPSHOT_H
5942+#define DM_SNAPSHOT_H
5943+
5944+#include "dm.h"
5945+#include <linux/blkdev.h>
5946+
5947+struct exception_table {
5948+ uint32_t hash_mask;
5949+ struct list_head *table;
5950+};
5951+
5952+/*
5953+ * The snapshot code deals with largish chunks of the disk at a
5954+ * time. Typically 64k - 256k.
5955+ */
5956+/* FIXME: can we get away with limiting these to a uint32_t ? */
5957+typedef sector_t chunk_t;
5958+
5959+/*
5960+ * An exception is used where an old chunk of data has been
5961+ * replaced by a new one.
5962+ */
5963+struct exception {
5964+ struct list_head hash_list;
5965+
5966+ chunk_t old_chunk;
5967+ chunk_t new_chunk;
5968+};
5969+
5970+/*
5971+ * Abstraction to handle the meta/layout of exception stores (the
5972+ * COW device).
5973+ */
5974+struct exception_store {
5975+
5976+ /*
5977+ * Destroys this object when you've finished with it.
5978+ */
5979+ void (*destroy) (struct exception_store *store);
5980+
5981+ /*
5982+ * The target shouldn't read the COW device until this is
5983+ * called.
5984+ */
5985+ int (*read_metadata) (struct exception_store *store);
5986+
5987+ /*
5988+ * Find somewhere to store the next exception.
5989+ */
5990+ int (*prepare_exception) (struct exception_store *store,
5991+ struct exception *e);
5992+
5993+ /*
5994+ * Update the metadata with this exception.
5995+ */
5996+ void (*commit_exception) (struct exception_store *store,
5997+ struct exception *e,
5998+ void (*callback) (void *, int success),
5999+ void *callback_context);
6000+
6001+ /*
6002+ * The snapshot is invalid, note this in the metadata.
6003+ */
6004+ void (*drop_snapshot) (struct exception_store *store);
6005+
6006+ /*
6007+ * Return how full the snapshot is.
6008+ */
6009+ void (*fraction_full) (struct exception_store *store,
6010+ sector_t *numerator,
6011+ sector_t *denominator);
6012+
6013+ struct dm_snapshot *snap;
6014+ void *context;
6015+};
6016+
6017+struct dm_snapshot {
6018+ struct rw_semaphore lock;
6019+ struct dm_table *table;
6020+
6021+ struct dm_dev *origin;
6022+ struct dm_dev *cow;
6023+
6024+ /* List of snapshots per Origin */
6025+ struct list_head list;
6026+
6027+ /* Size of data blocks saved - must be a power of 2 */
6028+ chunk_t chunk_size;
6029+ chunk_t chunk_mask;
6030+ chunk_t chunk_shift;
6031+
6032+ /* You can't use a snapshot if this is 0 (e.g. if full) */
6033+ int valid;
6034+ int have_metadata;
6035+
6036+ /* Used for display of table */
6037+ char type;
6038+
6039+ /* The last percentage we notified */
6040+ int last_percent;
6041+
6042+ struct exception_table pending;
6043+ struct exception_table complete;
6044+
6045+ /* The on disk metadata handler */
6046+ struct exception_store store;
6047+
6048+ struct kcopyd_client *kcopyd_client;
6049+};
6050+
6051+/*
6052+ * Used by the exception stores to load exceptions when
6053+ * initialising.
6054+ */
6055+int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
6056+
6057+/*
6058+ * Constructor and destructor for the default persistent
6059+ * store.
6060+ */
6061+int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
6062+
6063+int dm_create_transient(struct exception_store *store,
6064+ struct dm_snapshot *s, int blocksize);
6065+
6066+/*
6067+ * Return the number of sectors in the device.
6068+ */
6069+static inline sector_t get_dev_size(kdev_t dev)
6070+{
6071+ int *sizes;
6072+
6073+ sizes = blk_size[MAJOR(dev)];
6074+ if (sizes)
6075+ return sizes[MINOR(dev)] << 1;
6076+
6077+ return 0;
6078+}
6079+
6080+static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
6081+{
6082+ return (sector & ~s->chunk_mask) >> s->chunk_shift;
6083+}
6084+
6085+static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
6086+{
6087+ return chunk << s->chunk_shift;
6088+}
6089+
6090+#endif
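The inline helpers at the end of dm-snapshot.h reduce sector/chunk conversion to shifts and masks, which is why snapshot_ctr() insists on a power-of-two chunk size. The sketch below reproduces that conversion and the remap done by remap_exception() in userspace; the chunk size and sector numbers are hypothetical:

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;
typedef sector_t chunk_t;

struct snap_geom {
	chunk_t chunk_size;	/* sectors per chunk, power of two */
	chunk_t chunk_mask;	/* chunk_size - 1 */
	chunk_t chunk_shift;	/* log2(chunk_size) */
};

/* Same shift/mask setup as snapshot_ctr(). */
static void init_geom(struct snap_geom *s, chunk_t chunk_size)
{
	s->chunk_size = chunk_size;
	s->chunk_mask = chunk_size - 1;
	for (s->chunk_shift = 0; chunk_size;
	     s->chunk_shift++, chunk_size >>= 1)
		;
	s->chunk_shift--;
}

static chunk_t sector_to_chunk(struct snap_geom *s, sector_t sector)
{
	return (sector & ~s->chunk_mask) >> s->chunk_shift;
}

static sector_t chunk_to_sector(struct snap_geom *s, chunk_t chunk)
{
	return chunk << s->chunk_shift;
}

int main(void)
{
	struct snap_geom s;
	sector_t origin_sector = 1234;	/* arbitrary origin sector */
	chunk_t new_chunk = 7;		/* where the COW store put the copy */

	init_geom(&s, 16);		/* 16 sectors = 8 KiB chunks */
	assert(s.chunk_shift == 4 && s.chunk_mask == 15);

	/* remap_exception(): same offset within the chunk, new chunk base. */
	printf("origin sector %llu -> chunk %llu -> cow sector %llu\n",
	       origin_sector, sector_to_chunk(&s, origin_sector),
	       chunk_to_sector(&s, new_chunk) + (origin_sector & s.chunk_mask));
	return 0;
}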
339dbf15
AM
6091--- linux-2.4.21/drivers/md/dm-stripe.c Thu Jan 1 01:00:00 1970
6092+++ linux/drivers/md/dm-stripe.c Wed Aug 20 14:41:38 2003
2ac564b8
AM
6093@@ -0,0 +1,258 @@
6094+/*
6095+ * Copyright (C) 2001 Sistina Software (UK) Limited.
6096+ *
6097+ * This file is released under the GPL.
6098+ */
6099+
6100+#include "dm.h"
6101+
6102+#include <linux/module.h>
6103+#include <linux/init.h>
6104+#include <linux/blkdev.h>
6105+#include <linux/slab.h>
6106+
6107+struct stripe {
6108+ struct dm_dev *dev;
6109+ sector_t physical_start;
6110+};
6111+
6112+struct stripe_c {
6113+ uint32_t stripes;
6114+
6115+ /* The size of this target / num. stripes */
6116+ uint32_t stripe_width;
6117+
6118+ /* stripe chunk size */
6119+ uint32_t chunk_shift;
6120+ sector_t chunk_mask;
6121+
6122+ struct stripe stripe[0];
6123+};
6124+
6125+static inline struct stripe_c *alloc_context(unsigned int stripes)
6126+{
6127+ size_t len;
6128+
6129+ if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
6130+ stripes))
6131+ return NULL;
6132+
6133+ len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
6134+
6135+ return kmalloc(len, GFP_KERNEL);
6136+}
6137+
6138+/*
6139+ * Parse a single <dev> <sector> pair
6140+ */
6141+static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
6142+ unsigned int stripe, char **argv)
6143+{
6144+ sector_t start;
6145+
6146+ if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
6147+ return -EINVAL;
6148+
6149+ if (dm_get_device(ti, argv[0], start, sc->stripe_width,
6150+ dm_table_get_mode(ti->table),
6151+ &sc->stripe[stripe].dev))
6152+ return -ENXIO;
6153+
6154+ sc->stripe[stripe].physical_start = start;
6155+ return 0;
6156+}
6157+
6158+/*
6159+ * FIXME: Nasty function, only present because we can't link
6160+ * against __moddi3 and __divdi3.
6161+ *
6162+ * returns a == b * n
6163+ */
6164+static int multiple(sector_t a, sector_t b, sector_t *n)
6165+{
6166+ sector_t acc, prev, i;
6167+
6168+ *n = 0;
6169+ while (a >= b) {
6170+ for (acc = b, prev = 0, i = 1;
6171+ acc <= a;
6172+ prev = acc, acc <<= 1, i <<= 1)
6173+ ;
6174+
6175+ a -= prev;
6176+ *n += i >> 1;
6177+ }
6178+
6179+ return a == 0;
6180+}
6181+
6182+/*
6183+ * Construct a striped mapping.
6184+ * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
6185+ */
6186+static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
6187+{
6188+ struct stripe_c *sc;
6189+ sector_t width;
6190+ uint32_t stripes;
6191+ uint32_t chunk_size;
6192+ char *end;
6193+ int r;
6194+ unsigned int i;
6195+
6196+ if (argc < 2) {
6197+ ti->error = "dm-stripe: Not enough arguments";
6198+ return -EINVAL;
6199+ }
6200+
6201+ stripes = simple_strtoul(argv[0], &end, 10);
6202+ if (*end) {
6203+ ti->error = "dm-stripe: Invalid stripe count";
6204+ return -EINVAL;
6205+ }
6206+
6207+ chunk_size = simple_strtoul(argv[1], &end, 10);
6208+ if (*end) {
6209+ ti->error = "dm-stripe: Invalid chunk_size";
6210+ return -EINVAL;
6211+ }
6212+
6213+ /*
6214+ * chunk_size is a power of two
6215+ */
6216+ if (!chunk_size || (chunk_size & (chunk_size - 1))) {
6217+ ti->error = "dm-stripe: Invalid chunk size";
6218+ return -EINVAL;
6219+ }
6220+
6221+ if (!multiple(ti->len, stripes, &width)) {
6222+		ti->error = "dm-stripe: Target length not divisible by "
6223+ "number of stripes";
6224+ return -EINVAL;
6225+ }
6226+
6227+ /*
6228+ * Do we have enough arguments for that many stripes ?
6229+ */
6230+ if (argc != (2 + 2 * stripes)) {
6231+ ti->error = "dm-stripe: Not enough destinations specified";
6232+ return -EINVAL;
6233+ }
6234+
6235+ sc = alloc_context(stripes);
6236+ if (!sc) {
6237+ ti->error = "dm-stripe: Memory allocation for striped context "
6238+ "failed";
6239+ return -ENOMEM;
6240+ }
6241+
6242+ sc->stripes = stripes;
6243+ sc->stripe_width = width;
6244+
6245+ sc->chunk_mask = ((sector_t) chunk_size) - 1;
6246+ for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
6247+ chunk_size >>= 1;
6248+ sc->chunk_shift--;
6249+
6250+ /*
6251+ * Get the stripe destinations.
6252+ */
6253+ for (i = 0; i < stripes; i++) {
6254+ argv += 2;
6255+
6256+ r = get_stripe(ti, sc, i, argv);
6257+ if (r < 0) {
6258+ ti->error = "dm-stripe: Couldn't parse stripe "
6259+ "destination";
6260+ while (i--)
6261+ dm_put_device(ti, sc->stripe[i].dev);
6262+ kfree(sc);
6263+ return r;
6264+ }
6265+ }
6266+
6267+ ti->private = sc;
6268+ return 0;
6269+}
6270+
6271+static void stripe_dtr(struct dm_target *ti)
6272+{
6273+ unsigned int i;
6274+ struct stripe_c *sc = (struct stripe_c *) ti->private;
6275+
6276+ for (i = 0; i < sc->stripes; i++)
6277+ dm_put_device(ti, sc->stripe[i].dev);
6278+
6279+ kfree(sc);
6280+}
6281+
6282+static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6283+ union map_info *context)
6284+{
6285+ struct stripe_c *sc = (struct stripe_c *) ti->private;
6286+
6287+ sector_t offset = bh->b_rsector - ti->begin;
6288+ uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
6289+ uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
6290+ chunk = chunk / sc->stripes;
6291+
6292+ bh->b_rdev = sc->stripe[stripe].dev->dev;
6293+ bh->b_rsector = sc->stripe[stripe].physical_start +
6294+ (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
6295+ return 1;
6296+}
6297+
6298+static int stripe_status(struct dm_target *ti, status_type_t type,
6299+ char *result, unsigned int maxlen)
6300+{
6301+ struct stripe_c *sc = (struct stripe_c *) ti->private;
6302+ int offset;
6303+ unsigned int i;
6304+
6305+ switch (type) {
6306+ case STATUSTYPE_INFO:
6307+ result[0] = '\0';
6308+ break;
6309+
6310+ case STATUSTYPE_TABLE:
6311+ offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
6312+ sc->stripes, sc->chunk_mask + 1);
6313+ for (i = 0; i < sc->stripes; i++) {
6314+ offset +=
6315+ snprintf(result + offset, maxlen - offset,
6316+ " %s " SECTOR_FORMAT,
6317+ dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
6318+ sc->stripe[i].physical_start);
6319+ }
6320+ break;
6321+ }
6322+ return 0;
6323+}
6324+
6325+static struct target_type stripe_target = {
6326+ .name = "striped",
6327+ .module = THIS_MODULE,
6328+ .ctr = stripe_ctr,
6329+ .dtr = stripe_dtr,
6330+ .map = stripe_map,
6331+ .status = stripe_status,
6332+};
6333+
6334+int __init dm_stripe_init(void)
6335+{
6336+ int r;
6337+
6338+ r = dm_register_target(&stripe_target);
6339+ if (r < 0)
6340+ DMWARN("striped target registration failed");
6341+
6342+ return r;
6343+}
6344+
6345+void dm_stripe_exit(void)
6346+{
6347+ if (dm_unregister_target(&stripe_target))
6348+ DMWARN("striped target unregistration failed");
6349+
6350+ return;
6351+}
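multiple() in dm-stripe.c above divides a 64-bit target length by the stripe count using only shifts and subtraction, since, as its comment notes, the module cannot link against __divdi3/__moddi3; it returns non-zero only when the division is exact. A standalone copy of that routine, checked against ordinary division (which is available in userspace):

#include <assert.h>
#include <stdio.h>

typedef unsigned long long sector_t;

/* Shift-and-subtract division: sets *n = a / b, returns (a % b == 0). */
static int multiple(sector_t a, sector_t b, sector_t *n)
{
	sector_t acc, prev, i;

	*n = 0;
	while (a >= b) {
		for (acc = b, prev = 0, i = 1; acc <= a;
		     prev = acc, acc <<= 1, i <<= 1)
			;
		a -= prev;
		*n += i >> 1;
	}
	return a == 0;
}

int main(void)
{
	sector_t len, width;

	/* Hypothetical target lengths, always 4 stripes. */
	for (len = 1; len < 10000; len += 997) {
		int exact = multiple(len, 4, &width);
		assert(width == len / 4 && exact == (len % 4 == 0));
	}

	multiple(8192, 4, &width);
	printf("8192 sectors over 4 stripes -> stripe width %llu\n", width);
	return 0;
}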
339dbf15
AM
6352--- linux-2.4.21/drivers/md/dm-table.c Thu Jan 1 01:00:00 1970
6353+++ linux/drivers/md/dm-table.c Wed Aug 20 14:41:38 2003
6354@@ -0,0 +1,708 @@
2ac564b8
AM
6355+/*
6356+ * Copyright (C) 2001 Sistina Software (UK) Limited.
6357+ *
6358+ * This file is released under the GPL.
6359+ */
6360+
6361+#include "dm.h"
6362+
6363+#include <linux/module.h>
6364+#include <linux/vmalloc.h>
6365+#include <linux/blkdev.h>
6366+#include <linux/ctype.h>
6367+#include <linux/slab.h>
6368+#include <asm/atomic.h>
6369+
6370+#define MAX_DEPTH 16
6371+#define NODE_SIZE L1_CACHE_BYTES
6372+#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
6373+#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
339dbf15
AM
6374+#define MAX_TARGET_ARGS 64
2ac564b8
AM
6375+
6376+struct dm_table {
6377+ atomic_t holders;
6378+
6379+ /* btree table */
6380+ unsigned int depth;
6381+ unsigned int counts[MAX_DEPTH]; /* in nodes */
6382+ sector_t *index[MAX_DEPTH];
6383+
6384+ unsigned int num_targets;
6385+ unsigned int num_allocated;
6386+ sector_t *highs;
6387+ struct dm_target *targets;
6388+
6389+ /*
6390+ * Indicates the rw permissions for the new logical
6391+ * device. This should be a combination of FMODE_READ
6392+ * and FMODE_WRITE.
6393+ */
6394+ int mode;
6395+
6396+ /* a list of devices used by this table */
6397+ struct list_head devices;
6398+
6399+ /* events get handed up using this callback */
6400+ void (*event_fn)(void *);
6401+ void *event_context;
6402+};
6403+
6404+/*
 6405+ * Similar to ceiling(log_base(n))
6406+ */
6407+static unsigned int int_log(unsigned long n, unsigned long base)
6408+{
6409+ int result = 0;
6410+
6411+ while (n > 1) {
6412+ n = dm_div_up(n, base);
6413+ result++;
6414+ }
6415+
6416+ return result;
6417+}
6418+
6419+/*
 6420+ * Calculate the index of the child node for the n'th node's k'th key.
6421+ */
6422+static inline unsigned int get_child(unsigned int n, unsigned int k)
6423+{
6424+ return (n * CHILDREN_PER_NODE) + k;
6425+}
6426+
6427+/*
6428+ * Return the n'th node of level l from table t.
6429+ */
6430+static inline sector_t *get_node(struct dm_table *t, unsigned int l,
6431+ unsigned int n)
6432+{
6433+ return t->index[l] + (n * KEYS_PER_NODE);
6434+}
6435+
6436+/*
 6437+ * Return the highest key that you could look up from the n'th
6438+ * node on level l of the btree.
6439+ */
6440+static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
6441+{
6442+ for (; l < t->depth - 1; l++)
6443+ n = get_child(n, CHILDREN_PER_NODE - 1);
6444+
6445+ if (n >= t->counts[l])
6446+ return (sector_t) - 1;
6447+
6448+ return get_node(t, l, n)[KEYS_PER_NODE - 1];
6449+}
6450+
6451+/*
6452+ * Fills in a level of the btree based on the highs of the level
6453+ * below it.
6454+ */
6455+static int setup_btree_index(unsigned int l, struct dm_table *t)
6456+{
6457+ unsigned int n, k;
6458+ sector_t *node;
6459+
6460+ for (n = 0U; n < t->counts[l]; n++) {
6461+ node = get_node(t, l, n);
6462+
6463+ for (k = 0U; k < KEYS_PER_NODE; k++)
6464+ node[k] = high(t, l + 1, get_child(n, k));
6465+ }
6466+
6467+ return 0;
6468+}
6469+
6470+/*
 6471+ * highs and targets are managed as dynamic arrays during a
6472+ * table load.
6473+ */
6474+static int alloc_targets(struct dm_table *t, unsigned int num)
6475+{
6476+ sector_t *n_highs;
6477+ struct dm_target *n_targets;
6478+ int n = t->num_targets;
6479+
6480+ /*
6481+ * Allocate both the target array and offset array at once.
6482+ */
6483+ n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
6484+ sizeof(sector_t), num);
6485+ if (!n_highs)
6486+ return -ENOMEM;
6487+
6488+ n_targets = (struct dm_target *) (n_highs + num);
6489+
6490+ if (n) {
6491+ memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
6492+ memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
6493+ }
6494+
6495+ memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
6496+ vfree(t->highs);
6497+
6498+ t->num_allocated = num;
6499+ t->highs = n_highs;
6500+ t->targets = n_targets;
6501+
6502+ return 0;
6503+}
6504+
6505+int dm_table_create(struct dm_table **result, int mode)
6506+{
6507+ struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
6508+
6509+ if (!t)
6510+ return -ENOMEM;
6511+
6512+ memset(t, 0, sizeof(*t));
6513+ INIT_LIST_HEAD(&t->devices);
6514+ atomic_set(&t->holders, 1);
6515+
6516+ /* allocate a single nodes worth of targets to begin with */
6517+ if (alloc_targets(t, KEYS_PER_NODE)) {
6518+ kfree(t);
6519+ t = NULL;
6520+ return -ENOMEM;
6521+ }
6522+
6523+ t->mode = mode;
6524+ *result = t;
6525+ return 0;
6526+}
6527+
6528+static void free_devices(struct list_head *devices)
6529+{
6530+ struct list_head *tmp, *next;
6531+
6532+ for (tmp = devices->next; tmp != devices; tmp = next) {
6533+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6534+ next = tmp->next;
6535+ kfree(dd);
6536+ }
6537+}
6538+
6539+void table_destroy(struct dm_table *t)
6540+{
6541+ unsigned int i;
6542+
6543+ /* free the indexes (see dm_table_complete) */
6544+ if (t->depth >= 2)
6545+ vfree(t->index[t->depth - 2]);
6546+
6547+ /* free the targets */
6548+ for (i = 0; i < t->num_targets; i++) {
6549+ struct dm_target *tgt = t->targets + i;
6550+
6551+ if (tgt->type->dtr)
6552+ tgt->type->dtr(tgt);
6553+
6554+ dm_put_target_type(tgt->type);
6555+ }
6556+
6557+ vfree(t->highs);
6558+
6559+ /* free the device list */
6560+ if (t->devices.next != &t->devices) {
6561+ DMWARN("devices still present during destroy: "
6562+ "dm_table_remove_device calls missing");
6563+
6564+ free_devices(&t->devices);
6565+ }
6566+
6567+ kfree(t);
6568+}
6569+
6570+void dm_table_get(struct dm_table *t)
6571+{
6572+ atomic_inc(&t->holders);
6573+}
6574+
6575+void dm_table_put(struct dm_table *t)
6576+{
6577+ if (atomic_dec_and_test(&t->holders))
6578+ table_destroy(t);
6579+}
6580+
6581+/*
6582+ * Checks to see if we need to extend highs or targets.
6583+ */
6584+static inline int check_space(struct dm_table *t)
6585+{
6586+ if (t->num_targets >= t->num_allocated)
6587+ return alloc_targets(t, t->num_allocated * 2);
6588+
6589+ return 0;
6590+}
6591+
6592+/*
6593+ * Convert a device path to a dev_t.
6594+ */
6595+static int lookup_device(const char *path, kdev_t *dev)
6596+{
6597+ int r;
6598+ struct nameidata nd;
6599+ struct inode *inode;
6600+
6601+ if (!path_init(path, LOOKUP_FOLLOW, &nd))
6602+ return 0;
6603+
6604+ if ((r = path_walk(path, &nd)))
6605+ goto out;
6606+
6607+ inode = nd.dentry->d_inode;
6608+ if (!inode) {
6609+ r = -ENOENT;
6610+ goto out;
6611+ }
6612+
6613+ if (!S_ISBLK(inode->i_mode)) {
6614+ r = -ENOTBLK;
6615+ goto out;
6616+ }
6617+
6618+ *dev = inode->i_rdev;
6619+
6620+ out:
6621+ path_release(&nd);
6622+ return r;
6623+}
6624+
6625+/*
6626+ * See if we've already got a device in the list.
6627+ */
6628+static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6629+{
6630+ struct list_head *tmp;
6631+
6632+ list_for_each(tmp, l) {
6633+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6634+ if (kdev_same(dd->dev, dev))
6635+ return dd;
6636+ }
6637+
6638+ return NULL;
6639+}
6640+
6641+/*
6642+ * Open a device so we can use it as a map destination.
6643+ */
6644+static int open_dev(struct dm_dev *dd)
6645+{
6646+ if (dd->bdev)
6647+ BUG();
6648+
6649+ dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6650+ if (!dd->bdev)
6651+ return -ENOMEM;
6652+
6653+ return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6654+}
6655+
6656+/*
6657+ * Close a device that we've been using.
6658+ */
6659+static void close_dev(struct dm_dev *dd)
6660+{
6661+ if (!dd->bdev)
6662+ return;
6663+
6664+ blkdev_put(dd->bdev, BDEV_RAW);
6665+ dd->bdev = NULL;
6666+}
6667+
6668+/*
 6669+ * If possible (i.e. blk_size[major] is set), this checks that an area
 6670+ * of the destination device is valid.
6671+ */
6672+static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6673+{
6674+ int *sizes;
6675+ sector_t dev_size;
6676+
6677+ if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6678+ /* we don't know the device details,
6679+ * so give the benefit of the doubt */
6680+ return 1;
6681+
6682+ /* convert to 512-byte sectors */
6683+ dev_size <<= 1;
6684+
6685+ return ((start < dev_size) && (len <= (dev_size - start)));
6686+}
6687+
6688+/*
 6689+ * This upgrades the mode on an already open dm_dev, taking
 6690+ * care to leave things as they were if we fail to reopen the
 6691+ * device.
6692+ */
6693+static int upgrade_mode(struct dm_dev *dd, int new_mode)
6694+{
6695+ int r;
6696+ struct dm_dev dd_copy;
6697+
6698+ memcpy(&dd_copy, dd, sizeof(dd_copy));
6699+
6700+ dd->mode |= new_mode;
6701+ dd->bdev = NULL;
6702+ r = open_dev(dd);
6703+ if (!r)
6704+ close_dev(&dd_copy);
6705+ else
6706+ memcpy(dd, &dd_copy, sizeof(dd_copy));
6707+
6708+ return r;
6709+}
6710+
6711+/*
6712+ * Add a device to the list, or just increment the usage count if
6713+ * it's already present.
6714+ */
6715+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6716+ sector_t len, int mode, struct dm_dev **result)
6717+{
6718+ int r;
6719+ kdev_t dev;
6720+ struct dm_dev *dd;
6721+ unsigned major, minor;
6722+ struct dm_table *t = ti->table;
6723+
6724+ if (!t)
6725+ BUG();
6726+
6727+ if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6728+ /* Extract the major/minor numbers */
6729+ dev = mk_kdev(major, minor);
6730+ } else {
6731+ /* convert the path to a device */
6732+ if ((r = lookup_device(path, &dev)))
6733+ return r;
6734+ }
6735+
6736+ dd = find_device(&t->devices, dev);
6737+ if (!dd) {
6738+ dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6739+ if (!dd)
6740+ return -ENOMEM;
6741+
6742+ dd->dev = dev;
6743+ dd->mode = mode;
6744+ dd->bdev = NULL;
6745+
6746+ if ((r = open_dev(dd))) {
6747+ kfree(dd);
6748+ return r;
6749+ }
6750+
6751+ atomic_set(&dd->count, 0);
6752+ list_add(&dd->list, &t->devices);
6753+
6754+ } else if (dd->mode != (mode | dd->mode)) {
6755+ r = upgrade_mode(dd, mode);
6756+ if (r)
6757+ return r;
6758+ }
6759+ atomic_inc(&dd->count);
6760+
6761+ if (!check_device_area(dd->dev, start, len)) {
6762+ DMWARN("device %s too small for target", path);
6763+ dm_put_device(ti, dd);
6764+ return -EINVAL;
6765+ }
6766+
6767+ *result = dd;
6768+
6769+ return 0;
6770+}
6771+
6772+/*
 6773+ * Decrement a device's use count and remove it if necessary.
6774+ */
6775+void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6776+{
6777+ if (atomic_dec_and_test(&dd->count)) {
6778+ close_dev(dd);
6779+ list_del(&dd->list);
6780+ kfree(dd);
6781+ }
6782+}
6783+
6784+/*
6785+ * Checks to see if the target joins onto the end of the table.
6786+ */
6787+static int adjoin(struct dm_table *table, struct dm_target *ti)
6788+{
6789+ struct dm_target *prev;
6790+
6791+ if (!table->num_targets)
6792+ return !ti->begin;
6793+
6794+ prev = &table->targets[table->num_targets - 1];
6795+ return (ti->begin == (prev->begin + prev->len));
6796+}
6797+
6798+/*
6799+ * Destructively splits up the argument list to pass to ctr.
6800+ */
 6801+static int split_args(int *argc, char ***argvp, char *input)
6802+{
6803+ char *start, *end = input, *out;
6804+ char **argv;
6805+ int max_args = MAX_TARGET_ARGS;
6806+
 6807+ *argc = 0;
6808+ argv = kmalloc(sizeof(*argv) * max_args, GFP_NOIO);
6809+ if (!argv)
6810+ return -ENOMEM;
6811+
6812+ while (1) {
6813+ start = end;
6814+
6815+ /* Skip whitespace */
6816+ while (*start && isspace(*start))
6817+ start++;
6818+
6819+ if (!*start)
6820+ break; /* success, we hit the end */
6821+
 6822+ /* 'out' is used to strip the backslash escapes */
6823+ end = out = start;
6824+ while (*end) {
6825+ /* Everything apart from '\0' can be quoted */
6826+ if (*end == '\\' && *(end + 1)) {
6827+ *out++ = *(end + 1);
6828+ end += 2;
6829+ continue;
6830+ }
6831+
6832+ if (isspace(*end))
6833+ break; /* end of token */
6834+
6835+ *out++ = *end++;
6836+ }
6837+
6838+ /* have we already filled the array ? */
6839+ if ((*argc + 1) > max_args) {
6840+ char **argv2;
6841+
6842+ max_args *= 2;
6843+ argv2 = kmalloc(sizeof(*argv2) * max_args, GFP_NOIO);
6844+ if (!argv2) {
6845+ kfree(argv);
6846+ return -ENOMEM;
6847+ }
6848+
6849+ memcpy(argv2, argv, sizeof(*argv) * *argc);
6850+ kfree(argv);
6851+ argv = argv2;
6852+ }
6853+
6854+ /* we know this is whitespace */
6855+ if (*end)
6856+ end++;
6857+
6858+ /* terminate the string and put it in the array */
6859+ *out = '\0';
6860+ argv[*argc] = start;
6861+ (*argc)++;
6862+ }
6863+
 6864+ *argvp = argv;
6865+ return 0;
6866+}
6867+
6868+int dm_table_add_target(struct dm_table *t, const char *type,
6869+ sector_t start, sector_t len, char *params)
6870+{
6871+ int r = -EINVAL, argc;
 6872+ char **argv;
6873+ struct dm_target *tgt;
6874+
6875+ if ((r = check_space(t)))
6876+ return r;
6877+
6878+ tgt = t->targets + t->num_targets;
6879+ memset(tgt, 0, sizeof(*tgt));
6880+
6881+ tgt->type = dm_get_target_type(type);
6882+ if (!tgt->type) {
6883+ tgt->error = "unknown target type";
6884+ return -EINVAL;
6885+ }
6886+
6887+ tgt->table = t;
6888+ tgt->begin = start;
6889+ tgt->len = len;
6890+ tgt->error = "Unknown error";
6891+
6892+ /*
6893+ * Does this target adjoin the previous one ?
6894+ */
6895+ if (!adjoin(t, tgt)) {
6896+ tgt->error = "Gap in table";
6897+ r = -EINVAL;
6898+ goto bad;
6899+ }
6900+
 6901+ r = split_args(&argc, &argv, params);
 6902+ if (r) {
 6903+ tgt->error = "couldn't split parameters (insufficient memory)";
6904+ goto bad;
6905+ }
6906+
6907+ r = tgt->type->ctr(tgt, argc, argv);
 6908+ kfree(argv);
6909+ if (r)
6910+ goto bad;
6911+
6912+ t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
6913+ return 0;
6914+
6915+ bad:
6916+ printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
6917+ dm_put_target_type(tgt->type);
6918+ return r;
6919+}
6920+
6921+static int setup_indexes(struct dm_table *t)
6922+{
6923+ int i;
6924+ unsigned int total = 0;
6925+ sector_t *indexes;
6926+
6927+ /* allocate the space for *all* the indexes */
6928+ for (i = t->depth - 2; i >= 0; i--) {
6929+ t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
6930+ total += t->counts[i];
6931+ }
6932+
6933+ indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
6934+ if (!indexes)
6935+ return -ENOMEM;
6936+
6937+ /* set up internal nodes, bottom-up */
6938+ for (i = t->depth - 2, total = 0; i >= 0; i--) {
6939+ t->index[i] = indexes;
6940+ indexes += (KEYS_PER_NODE * t->counts[i]);
6941+ setup_btree_index(i, t);
6942+ }
6943+
6944+ return 0;
6945+}
6946+
6947+/*
6948+ * Builds the btree to index the map.
6949+ */
6950+int dm_table_complete(struct dm_table *t)
6951+{
6952+ int r = 0;
6953+ unsigned int leaf_nodes;
6954+
6955+ /* how many indexes will the btree have ? */
6956+ leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
6957+ t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
6958+
6959+ /* leaf layer has already been set up */
6960+ t->counts[t->depth - 1] = leaf_nodes;
6961+ t->index[t->depth - 1] = t->highs;
6962+
6963+ if (t->depth >= 2)
6964+ r = setup_indexes(t);
6965+
6966+ return r;
6967+}
6968+
6969+static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
6970+void dm_table_event_callback(struct dm_table *t,
6971+ void (*fn)(void *), void *context)
6972+{
6973+ spin_lock_irq(&_event_lock);
6974+ t->event_fn = fn;
6975+ t->event_context = context;
6976+ spin_unlock_irq(&_event_lock);
6977+}
6978+
6979+void dm_table_event(struct dm_table *t)
6980+{
6981+ spin_lock(&_event_lock);
6982+ if (t->event_fn)
6983+ t->event_fn(t->event_context);
6984+ spin_unlock(&_event_lock);
6985+}
6986+
6987+sector_t dm_table_get_size(struct dm_table *t)
6988+{
6989+ return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
6990+}
6991+
6992+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
6993+{
 6994+ if (index >= t->num_targets)
6995+ return NULL;
6996+
6997+ return t->targets + index;
6998+}
6999+
7000+/*
7001+ * Search the btree for the correct target.
7002+ */
7003+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
7004+{
7005+ unsigned int l, n = 0, k = 0;
7006+ sector_t *node;
7007+
7008+ for (l = 0; l < t->depth; l++) {
7009+ n = get_child(n, k);
7010+ node = get_node(t, l, n);
7011+
7012+ for (k = 0; k < KEYS_PER_NODE; k++)
7013+ if (node[k] >= sector)
7014+ break;
7015+ }
7016+
7017+ return &t->targets[(KEYS_PER_NODE * n) + k];
7018+}
7019+
7020+unsigned int dm_table_get_num_targets(struct dm_table *t)
7021+{
7022+ return t->num_targets;
7023+}
7024+
7025+struct list_head *dm_table_get_devices(struct dm_table *t)
7026+{
7027+ return &t->devices;
7028+}
7029+
7030+int dm_table_get_mode(struct dm_table *t)
7031+{
7032+ return t->mode;
7033+}
7034+
7035+void dm_table_suspend_targets(struct dm_table *t)
7036+{
7037+ int i;
7038+
7039+ for (i = 0; i < t->num_targets; i++) {
7040+ struct dm_target *ti = t->targets + i;
7041+
7042+ if (ti->type->suspend)
7043+ ti->type->suspend(ti);
7044+ }
7045+}
7046+
7047+void dm_table_resume_targets(struct dm_table *t)
7048+{
7049+ int i;
7050+
7051+ for (i = 0; i < t->num_targets; i++) {
7052+ struct dm_target *ti = t->targets + i;
7053+
7054+ if (ti->type->resume)
7055+ ti->type->resume(ti);
7056+ }
7057+}
7058+
7059+EXPORT_SYMBOL(dm_get_device);
7060+EXPORT_SYMBOL(dm_put_device);
7061+EXPORT_SYMBOL(dm_table_event);
7062+EXPORT_SYMBOL(dm_table_get_mode);
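To make the table-building calls above concrete, here is a minimal in-kernel sketch of how a caller (such as the ioctl layer added elsewhere in this patch) might assemble and index a one-target table; the "linear" target line, the 8:1 device number and the helper name are illustrative, and error handling is kept terse.

/* Sketch only: build a single-target table with the API above. */
static int build_example_table(struct dm_table **result)
{
	struct dm_table *t;
	char params[] = "8:1 0";	/* mutable: split_args() edits it in place */
	int r;

	r = dm_table_create(&t, FMODE_READ | FMODE_WRITE);
	if (r)
		return r;

	/* one "linear" target covering sectors 0..1023 of device 8:1 */
	r = dm_table_add_target(t, "linear", 0, 1024, params);
	if (!r)
		r = dm_table_complete(t);	/* builds the btree index */

	if (r) {
		dm_table_put(t);	/* drops the initial holder, destroying t */
		return r;
	}

	*result = t;
	return 0;
}

Note that dm_table_add_target() splits its params argument destructively, which is why a writable buffer is passed, and that dm_table_create() leaves the caller holding the table's first reference.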
7063--- linux-2.4.21/drivers/md/dm-target.c Thu Jan 1 01:00:00 1970
7064+++ linux/drivers/md/dm-target.c Wed Aug 20 14:41:38 2003
7065@@ -0,0 +1,188 @@
7066+/*
7067+ * Copyright (C) 2001 Sistina Software (UK) Limited
7068+ *
7069+ * This file is released under the GPL.
7070+ */
7071+
7072+#include "dm.h"
7073+
7074+#include <linux/module.h>
7075+#include <linux/kmod.h>
7076+#include <linux/slab.h>
7077+
7078+struct tt_internal {
7079+ struct target_type tt;
7080+
7081+ struct list_head list;
7082+ long use;
7083+};
7084+
7085+static LIST_HEAD(_targets);
7086+static DECLARE_RWSEM(_lock);
7087+
7088+#define DM_MOD_NAME_SIZE 32
7089+
7090+static inline struct tt_internal *__find_target_type(const char *name)
7091+{
7092+ struct list_head *tih;
7093+ struct tt_internal *ti;
7094+
7095+ list_for_each(tih, &_targets) {
7096+ ti = list_entry(tih, struct tt_internal, list);
7097+
7098+ if (!strcmp(name, ti->tt.name))
7099+ return ti;
7100+ }
7101+
7102+ return NULL;
7103+}
7104+
7105+static struct tt_internal *get_target_type(const char *name)
7106+{
7107+ struct tt_internal *ti;
7108+
7109+ down_read(&_lock);
7110+ ti = __find_target_type(name);
7111+
7112+ if (ti) {
7113+ if (ti->use == 0 && ti->tt.module)
7114+ __MOD_INC_USE_COUNT(ti->tt.module);
7115+ ti->use++;
7116+ }
7117+ up_read(&_lock);
7118+
7119+ return ti;
7120+}
7121+
7122+static void load_module(const char *name)
7123+{
7124+ char module_name[DM_MOD_NAME_SIZE] = "dm-";
7125+
7126+ /* Length check for strcat() below */
7127+ if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
7128+ return;
7129+
7130+ strcat(module_name, name);
7131+ request_module(module_name);
7132+}
7133+
7134+struct target_type *dm_get_target_type(const char *name)
7135+{
7136+ struct tt_internal *ti = get_target_type(name);
7137+
7138+ if (!ti) {
7139+ load_module(name);
7140+ ti = get_target_type(name);
7141+ }
7142+
7143+ return ti ? &ti->tt : NULL;
7144+}
7145+
7146+void dm_put_target_type(struct target_type *t)
7147+{
7148+ struct tt_internal *ti = (struct tt_internal *) t;
7149+
7150+ down_read(&_lock);
7151+ if (--ti->use == 0 && ti->tt.module)
7152+ __MOD_DEC_USE_COUNT(ti->tt.module);
7153+
7154+ if (ti->use < 0)
7155+ BUG();
7156+ up_read(&_lock);
7157+
7158+ return;
7159+}
7160+
7161+static struct tt_internal *alloc_target(struct target_type *t)
7162+{
7163+ struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
7164+
7165+ if (ti) {
7166+ memset(ti, 0, sizeof(*ti));
7167+ ti->tt = *t;
7168+ }
7169+
7170+ return ti;
7171+}
7172+
7173+int dm_register_target(struct target_type *t)
7174+{
7175+ int rv = 0;
7176+ struct tt_internal *ti = alloc_target(t);
7177+
7178+ if (!ti)
7179+ return -ENOMEM;
7180+
7181+ down_write(&_lock);
7182+ if (__find_target_type(t->name)) {
7183+ kfree(ti);
7184+ rv = -EEXIST;
7185+ } else
7186+ list_add(&ti->list, &_targets);
7187+
7188+ up_write(&_lock);
7189+ return rv;
7190+}
7191+
7192+int dm_unregister_target(struct target_type *t)
7193+{
7194+ struct tt_internal *ti;
7195+
7196+ down_write(&_lock);
7197+ if (!(ti = __find_target_type(t->name))) {
7198+ up_write(&_lock);
7199+ return -EINVAL;
7200+ }
7201+
7202+ if (ti->use) {
7203+ up_write(&_lock);
7204+ return -ETXTBSY;
7205+ }
7206+
7207+ list_del(&ti->list);
7208+ kfree(ti);
7209+
7210+ up_write(&_lock);
7211+ return 0;
7212+}
7213+
7214+/*
7215+ * io-err: always fails an io, useful for bringing
7216+ * up LVs that have holes in them.
7217+ */
7218+static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
7219+{
7220+ return 0;
7221+}
7222+
7223+static void io_err_dtr(struct dm_target *ti)
7224+{
7225+ /* empty */
7226+}
7227+
7228+static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
7229+ union map_info *map_context)
7230+{
7231+ return -EIO;
7232+}
7233+
7234+static struct target_type error_target = {
7235+ .name = "error",
7236+ .ctr = io_err_ctr,
7237+ .dtr = io_err_dtr,
7238+ .map = io_err_map,
7239+};
7240+
7241+int dm_target_init(void)
7242+{
7243+ return dm_register_target(&error_target);
7244+}
7245+
7246+void dm_target_exit(void)
7247+{
7248+ if (dm_unregister_target(&error_target))
7249+ DMWARN("error target unregistration failed");
7250+}
7251+
7252+EXPORT_SYMBOL(dm_register_target);
7253+EXPORT_SYMBOL(dm_unregister_target);
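The registry above is all an out-of-tree mapping module needs to hook into. The sketch below shows the shape of such a module using a hypothetical "example" target that simply fails I/O like the built-in error target; because load_module() prepends "dm-", the module would have to be called dm-example for on-demand loading to find it.

/* Sketch of a hypothetical target module registering with the code above. */
#include "dm.h"
#include <linux/init.h>
#include <linux/module.h>

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	if (argc) {
		ti->error = "example: no arguments expected";
		return -EINVAL;
	}
	return 0;
}

static void example_dtr(struct dm_target *ti)
{
	/* nothing to release */
}

static int example_map(struct dm_target *ti, struct buffer_head *bh, int rw,
		       union map_info *map_context)
{
	return -EIO;	/* behave like the built-in "error" target */
}

static struct target_type example_target = {
	.name	= "example",
	.module	= THIS_MODULE,
	.ctr	= example_ctr,
	.dtr	= example_dtr,
	.map	= example_map,
};

static int __init dm_example_init(void)
{
	int r = dm_register_target(&example_target);

	if (r < 0)
		DMWARN("example target registration failed");
	return r;
}

static void __exit dm_example_exit(void)
{
	if (dm_unregister_target(&example_target))
		DMWARN("example target unregistration failed");
}

module_init(dm_example_init);
module_exit(dm_example_exit);
MODULE_LICENSE("GPL");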
7254--- linux-2.4.21/drivers/md/dm.c Thu Jan 1 01:00:00 1970
7255+++ linux/drivers/md/dm.c Wed Aug 20 14:41:38 2003
7256@@ -0,0 +1,1115 @@
7257+/*
7258+ * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
7259+ *
7260+ * This file is released under the GPL.
7261+ */
7262+
7263+#include "dm.h"
7264+#include "kcopyd.h"
7265+
7266+#include <linux/init.h>
7267+#include <linux/module.h>
7268+#include <linux/blk.h>
7269+#include <linux/blkpg.h>
7270+#include <linux/mempool.h>
7271+#include <linux/slab.h>
7272+#include <linux/major.h>
7273+#include <linux/kdev_t.h>
7274+#include <linux/lvm.h>
7275+
7276+#include <asm/uaccess.h>
7277+
7278+static const char *_name = DM_NAME;
7279+#define DEFAULT_READ_AHEAD 64
7280+
7281+struct dm_io {
7282+ struct mapped_device *md;
7283+
7284+ struct dm_target *ti;
7285+ int rw;
7286+ union map_info map_context;
7287+ void (*end_io) (struct buffer_head * bh, int uptodate);
7288+ void *context;
7289+};
7290+
7291+struct deferred_io {
7292+ int rw;
7293+ struct buffer_head *bh;
7294+ struct deferred_io *next;
7295+};
7296+
7297+/*
7298+ * Bits for the md->flags field.
7299+ */
7300+#define DMF_BLOCK_IO 0
7301+#define DMF_SUSPENDED 1
7302+
7303+struct mapped_device {
7304+ struct rw_semaphore lock;
7305+ atomic_t holders;
7306+
7307+ kdev_t dev;
7308+ unsigned long flags;
7309+
7310+ /*
7311+ * A list of ios that arrived while we were suspended.
7312+ */
7313+ atomic_t pending;
7314+ wait_queue_head_t wait;
7315+ struct deferred_io *deferred;
7316+
7317+ /*
7318+ * The current mapping.
7319+ */
7320+ struct dm_table *map;
7321+
7322+ /*
7323+ * io objects are allocated from here.
7324+ */
7325+ mempool_t *io_pool;
7326+
7327+ /*
7328+ * Event handling.
7329+ */
7330+ uint32_t event_nr;
7331+ wait_queue_head_t eventq;
7332+};
7333+
7334+#define MIN_IOS 256
7335+static kmem_cache_t *_io_cache;
7336+
7337+static struct mapped_device *get_kdev(kdev_t dev);
7338+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
7339+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
7340+
7341+/*-----------------------------------------------------------------
7342+ * In order to avoid the 256 minor number limit we are going to
 7343+ * register more major numbers as necessary.
7344+ *---------------------------------------------------------------*/
7345+#define MAX_MINORS (1 << MINORBITS)
7346+
7347+struct major_details {
7348+ unsigned int major;
7349+
7350+ int transient;
7351+ struct list_head transient_list;
7352+
7353+ unsigned int first_free_minor;
7354+ int nr_free_minors;
7355+
7356+ struct mapped_device *mds[MAX_MINORS];
7357+ int blk_size[MAX_MINORS];
7358+ int blksize_size[MAX_MINORS];
7359+ int hardsect_size[MAX_MINORS];
7360+};
7361+
7362+static struct rw_semaphore _dev_lock;
7363+static struct major_details *_majors[MAX_BLKDEV];
7364+
7365+/*
7366+ * This holds a list of majors that non-specified device numbers
7367+ * may be allocated from. Only majors with free minors appear on
7368+ * this list.
7369+ */
7370+static LIST_HEAD(_transients_free);
7371+
7372+static int __alloc_major(unsigned int major, struct major_details **result)
7373+{
7374+ int r;
7375+ unsigned int transient = !major;
7376+ struct major_details *maj;
7377+
7378+ /* Major already allocated? */
7379+ if (major && _majors[major])
7380+ return 0;
7381+
7382+ maj = kmalloc(sizeof(*maj), GFP_KERNEL);
7383+ if (!maj)
7384+ return -ENOMEM;
7385+
7386+ memset(maj, 0, sizeof(*maj));
7387+ INIT_LIST_HEAD(&maj->transient_list);
7388+
7389+ maj->nr_free_minors = MAX_MINORS;
7390+
7391+ r = register_blkdev(major, _name, &dm_blk_dops);
7392+ if (r < 0) {
7393+ DMERR("register_blkdev failed for %d", major);
7394+ kfree(maj);
7395+ return r;
7396+ }
7397+ if (r > 0)
7398+ major = r;
7399+
7400+ maj->major = major;
7401+
7402+ if (transient) {
7403+ maj->transient = transient;
7404+ list_add_tail(&maj->transient_list, &_transients_free);
7405+ }
7406+
7407+ _majors[major] = maj;
7408+
7409+ blk_size[major] = maj->blk_size;
7410+ blksize_size[major] = maj->blksize_size;
7411+ hardsect_size[major] = maj->hardsect_size;
7412+ read_ahead[major] = DEFAULT_READ_AHEAD;
7413+
7414+ blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
7415+
7416+ *result = maj;
7417+ return 0;
7418+}
7419+
7420+static void __free_major(struct major_details *maj)
7421+{
7422+ unsigned int major = maj->major;
7423+
7424+ list_del(&maj->transient_list);
7425+
7426+ read_ahead[major] = 0;
7427+ blk_size[major] = NULL;
7428+ blksize_size[major] = NULL;
7429+ hardsect_size[major] = NULL;
7430+
7431+ _majors[major] = NULL;
7432+ kfree(maj);
7433+
7434+ if (unregister_blkdev(major, _name) < 0)
 7435+ DMERR("unregister_blkdev failed");
7436+}
7437+
7438+static void free_all_majors(void)
7439+{
7440+ unsigned int major = ARRAY_SIZE(_majors);
7441+
7442+ down_write(&_dev_lock);
7443+
7444+ while (major--)
7445+ if (_majors[major])
7446+ __free_major(_majors[major]);
7447+
7448+ up_write(&_dev_lock);
7449+}
7450+
7451+static void free_dev(kdev_t dev)
7452+{
7453+ unsigned int major = major(dev);
7454+ unsigned int minor = minor(dev);
7455+ struct major_details *maj;
7456+
7457+ down_write(&_dev_lock);
7458+
7459+ maj = _majors[major];
7460+ if (!maj)
7461+ goto out;
7462+
7463+ maj->mds[minor] = NULL;
7464+ maj->nr_free_minors++;
7465+
7466+ if (maj->nr_free_minors == MAX_MINORS) {
7467+ __free_major(maj);
7468+ goto out;
7469+ }
7470+
7471+ if (!maj->transient)
7472+ goto out;
7473+
7474+ if (maj->nr_free_minors == 1)
7475+ list_add_tail(&maj->transient_list, &_transients_free);
7476+
7477+ if (minor < maj->first_free_minor)
7478+ maj->first_free_minor = minor;
7479+
7480+ out:
7481+ up_write(&_dev_lock);
7482+}
7483+
7484+static void __alloc_minor(struct major_details *maj, unsigned int minor,
7485+ struct mapped_device *md)
7486+{
7487+ maj->mds[minor] = md;
7488+ md->dev = mk_kdev(maj->major, minor);
7489+ maj->nr_free_minors--;
7490+
7491+ if (maj->transient && !maj->nr_free_minors)
7492+ list_del_init(&maj->transient_list);
7493+}
7494+
7495+/*
7496+ * See if requested kdev_t is available.
7497+ */
7498+static int specific_dev(kdev_t dev, struct mapped_device *md)
7499+{
7500+ int r = 0;
7501+ unsigned int major = major(dev);
7502+ unsigned int minor = minor(dev);
7503+ struct major_details *maj;
7504+
7505+ if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
7506+ DMWARN("device number requested out of range (%d, %d)",
7507+ major, minor);
7508+ return -EINVAL;
7509+ }
7510+
7511+ down_write(&_dev_lock);
7512+ maj = _majors[major];
7513+
7514+ /* Register requested major? */
7515+ if (!maj) {
7516+ r = __alloc_major(major, &maj);
7517+ if (r)
7518+ goto out;
7519+
7520+ major = maj->major;
7521+ }
7522+
7523+ if (maj->mds[minor]) {
7524+ r = -EBUSY;
7525+ goto out;
7526+ }
7527+
7528+ __alloc_minor(maj, minor, md);
7529+
7530+ out:
7531+ up_write(&_dev_lock);
7532+
7533+ return r;
7534+}
7535+
7536+/*
7537+ * Find first unused device number, requesting a new major number if required.
7538+ */
7539+static int first_free_dev(struct mapped_device *md)
7540+{
7541+ int r = 0;
7542+ struct major_details *maj;
7543+
7544+ down_write(&_dev_lock);
7545+
7546+ if (list_empty(&_transients_free)) {
7547+ r = __alloc_major(0, &maj);
7548+ if (r)
7549+ goto out;
7550+ } else
7551+ maj = list_entry(_transients_free.next, struct major_details,
7552+ transient_list);
7553+
7554+ while (maj->mds[maj->first_free_minor++])
7555+ ;
7556+
7557+ __alloc_minor(maj, maj->first_free_minor - 1, md);
7558+
7559+ out:
7560+ up_write(&_dev_lock);
7561+
7562+ return r;
7563+}
7564+
7565+static struct mapped_device *get_kdev(kdev_t dev)
7566+{
7567+ struct mapped_device *md;
7568+ struct major_details *maj;
7569+
7570+ down_read(&_dev_lock);
7571+ maj = _majors[major(dev)];
7572+ if (!maj) {
7573+ md = NULL;
7574+ goto out;
7575+ }
7576+ md = maj->mds[minor(dev)];
7577+ if (md)
7578+ dm_get(md);
7579+ out:
7580+ up_read(&_dev_lock);
7581+
7582+ return md;
7583+}
7584+
7585+/*-----------------------------------------------------------------
7586+ * init/exit code
7587+ *---------------------------------------------------------------*/
7588+
7589+static __init int local_init(void)
7590+{
7591+ init_rwsem(&_dev_lock);
7592+
7593+ /* allocate a slab for the dm_ios */
7594+ _io_cache = kmem_cache_create("dm io",
7595+ sizeof(struct dm_io), 0, 0, NULL, NULL);
7596+
7597+ if (!_io_cache)
7598+ return -ENOMEM;
7599+
7600+ return 0;
7601+}
7602+
7603+static void local_exit(void)
7604+{
7605+ kmem_cache_destroy(_io_cache);
7606+ free_all_majors();
7607+
7608+ DMINFO("cleaned up");
7609+}
7610+
7611+/*
7612+ * We have a lot of init/exit functions, so it seems easier to
7613+ * store them in an array. The disposable macro 'xx'
7614+ * expands a prefix into a pair of function names.
7615+ */
7616+static struct {
7617+ int (*init) (void);
7618+ void (*exit) (void);
7619+
7620+} _inits[] = {
7621+#define xx(n) {n ## _init, n ## _exit},
7622+ xx(local)
7623+ xx(kcopyd)
7624+ xx(dm_target)
7625+ xx(dm_linear)
7626+ xx(dm_stripe)
7627+ xx(dm_snapshot)
7628+ xx(dm_interface)
7629+#undef xx
7630+};
7631+
7632+static int __init dm_init(void)
7633+{
7634+ const int count = ARRAY_SIZE(_inits);
7635+
7636+ int r, i;
7637+
7638+ for (i = 0; i < count; i++) {
7639+ r = _inits[i].init();
7640+ if (r)
7641+ goto bad;
7642+ }
7643+
7644+ return 0;
7645+
7646+ bad:
7647+ while (i--)
7648+ _inits[i].exit();
7649+
7650+ return r;
7651+}
7652+
7653+static void __exit dm_exit(void)
7654+{
7655+ int i = ARRAY_SIZE(_inits);
7656+
7657+ while (i--)
7658+ _inits[i].exit();
7659+}
7660+
7661+/*
7662+ * Block device functions
7663+ */
7664+static int dm_blk_open(struct inode *inode, struct file *file)
7665+{
7666+ struct mapped_device *md;
7667+
7668+ md = get_kdev(inode->i_rdev);
7669+ if (!md)
7670+ return -ENXIO;
7671+
7672+ return 0;
7673+}
7674+
7675+static int dm_blk_close(struct inode *inode, struct file *file)
7676+{
7677+ struct mapped_device *md;
7678+
7679+ md = get_kdev(inode->i_rdev);
7680+ dm_put(md); /* put the reference gained by dm_blk_open */
7681+ dm_put(md);
7682+ return 0;
7683+}
7684+
7685+static inline struct dm_io *alloc_io(struct mapped_device *md)
7686+{
7687+ return mempool_alloc(md->io_pool, GFP_NOIO);
7688+}
7689+
7690+static inline void free_io(struct mapped_device *md, struct dm_io *io)
7691+{
7692+ mempool_free(io, md->io_pool);
7693+}
7694+
7695+static inline struct deferred_io *alloc_deferred(void)
7696+{
7697+ return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
7698+}
7699+
7700+static inline void free_deferred(struct deferred_io *di)
7701+{
7702+ kfree(di);
7703+}
7704+
7705+static inline sector_t volume_size(kdev_t dev)
7706+{
7707+ return blk_size[major(dev)][minor(dev)] << 1;
7708+}
7709+
7710+/* FIXME: check this */
7711+static int dm_blk_ioctl(struct inode *inode, struct file *file,
7712+ unsigned int command, unsigned long a)
7713+{
7714+ kdev_t dev = inode->i_rdev;
7715+ long size;
7716+
7717+ switch (command) {
7718+ case BLKROSET:
7719+ case BLKROGET:
7720+ case BLKRASET:
7721+ case BLKRAGET:
7722+ case BLKFLSBUF:
7723+ case BLKSSZGET:
7724+ //case BLKRRPART: /* Re-read partition tables */
7725+ //case BLKPG:
7726+ case BLKELVGET:
7727+ case BLKELVSET:
7728+ case BLKBSZGET:
7729+ case BLKBSZSET:
7730+ return blk_ioctl(dev, command, a);
7731+ break;
7732+
7733+ case BLKGETSIZE:
7734+ size = volume_size(dev);
7735+ if (copy_to_user((void *) a, &size, sizeof(long)))
7736+ return -EFAULT;
7737+ break;
7738+
7739+ case BLKGETSIZE64:
7740+ size = volume_size(dev);
7741+ if (put_user((u64) ((u64) size) << 9, (u64 *) a))
7742+ return -EFAULT;
7743+ break;
7744+
7745+ case BLKRRPART:
7746+ return -ENOTTY;
7747+
7748+ case LV_BMAP:
7749+ return dm_user_bmap(inode, (struct lv_bmap *) a);
7750+
7751+ default:
7752+ DMWARN("unknown block ioctl 0x%x", command);
7753+ return -ENOTTY;
7754+ }
7755+
7756+ return 0;
7757+}
7758+
7759+/*
7760+ * Add the buffer to the list of deferred io.
7761+ */
7762+static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
7763+{
7764+ struct deferred_io *di;
7765+
7766+ di = alloc_deferred();
7767+ if (!di)
7768+ return -ENOMEM;
7769+
7770+ down_write(&md->lock);
7771+
7772+ if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
7773+ up_write(&md->lock);
7774+ free_deferred(di);
7775+ return 1;
7776+ }
7777+
7778+ di->bh = bh;
7779+ di->rw = rw;
7780+ di->next = md->deferred;
7781+ md->deferred = di;
7782+
7783+ up_write(&md->lock);
7784+ return 0; /* deferred successfully */
7785+}
7786+
7787+/*
7788+ * bh->b_end_io routine that decrements the pending count
7789+ * and then calls the original bh->b_end_io fn.
7790+ */
7791+static void dec_pending(struct buffer_head *bh, int uptodate)
7792+{
7793+ int r;
7794+ struct dm_io *io = bh->b_private;
7795+ dm_endio_fn endio = io->ti->type->end_io;
7796+
7797+ if (endio) {
7798+ r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
7799+ &io->map_context);
7800+ if (r < 0)
7801+ uptodate = 0;
7802+
7803+ else if (r > 0)
7804+ /* the target wants another shot at the io */
7805+ return;
7806+ }
7807+
7808+ if (atomic_dec_and_test(&io->md->pending))
7809+ /* nudge anyone waiting on suspend queue */
7810+ wake_up(&io->md->wait);
7811+
7812+ bh->b_end_io = io->end_io;
7813+ bh->b_private = io->context;
7814+ free_io(io->md, io);
7815+
7816+ bh->b_end_io(bh, uptodate);
7817+}
7818+
7819+/*
7820+ * Do the bh mapping for a given leaf
7821+ */
7822+static inline int __map_buffer(struct mapped_device *md, int rw,
7823+ struct buffer_head *bh, struct dm_io *io)
7824+{
7825+ struct dm_target *ti;
7826+
7827+ if (!md->map)
7828+ return -EINVAL;
7829+
7830+ ti = dm_table_find_target(md->map, bh->b_rsector);
7831+ if (!ti->type)
7832+ return -EINVAL;
7833+
7834+ /* hook the end io request fn */
7835+ atomic_inc(&md->pending);
7836+ io->md = md;
7837+ io->ti = ti;
7838+ io->rw = rw;
7839+ io->end_io = bh->b_end_io;
7840+ io->context = bh->b_private;
7841+ bh->b_end_io = dec_pending;
7842+ bh->b_private = io;
7843+
7844+ return ti->type->map(ti, bh, rw, &io->map_context);
7845+}
7846+
7847+/*
 7848+ * Checks to see if we should be deferring io; if so, it queues it
7849+ * and returns 1.
7850+ */
7851+static inline int __deferring(struct mapped_device *md, int rw,
7852+ struct buffer_head *bh)
7853+{
7854+ int r;
7855+
7856+ /*
7857+ * If we're suspended we have to queue this io for later.
7858+ */
7859+ while (test_bit(DMF_BLOCK_IO, &md->flags)) {
7860+ up_read(&md->lock);
7861+
7862+ /*
7863+ * There's no point deferring a read ahead
7864+ * request, just drop it.
7865+ */
7866+ if (rw == READA) {
7867+ down_read(&md->lock);
7868+ return -EIO;
7869+ }
7870+
7871+ r = queue_io(md, bh, rw);
7872+ down_read(&md->lock);
7873+
7874+ if (r < 0)
7875+ return r;
7876+
7877+ if (r == 0)
7878+ return 1; /* deferred successfully */
7879+
7880+ }
7881+
7882+ return 0;
7883+}
7884+
7885+static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
7886+{
7887+ int r;
7888+ struct dm_io *io;
7889+ struct mapped_device *md;
7890+
7891+ md = get_kdev(bh->b_rdev);
7892+ if (!md) {
7893+ buffer_IO_error(bh);
7894+ return 0;
7895+ }
7896+
7897+ io = alloc_io(md);
7898+ down_read(&md->lock);
7899+
7900+ r = __deferring(md, rw, bh);
7901+ if (r < 0)
7902+ goto bad;
7903+
7904+ else if (!r) {
7905+ /* not deferring */
7906+ r = __map_buffer(md, rw, bh, io);
7907+ if (r < 0)
7908+ goto bad;
7909+ } else
7910+ r = 0;
7911+
7912+ up_read(&md->lock);
7913+ dm_put(md);
7914+ return r;
7915+
7916+ bad:
7917+ buffer_IO_error(bh);
7918+ up_read(&md->lock);
7919+ dm_put(md);
7920+ return 0;
7921+}
7922+
7923+static int check_dev_size(kdev_t dev, unsigned long block)
7924+{
7925+ unsigned int major = major(dev);
7926+ unsigned int minor = minor(dev);
7927+
7928+ /* FIXME: check this */
7929+ unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
7930+ unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
7931+
7932+ return (sector > max_sector) ? 0 : 1;
7933+}
7934+
7935+/*
7936+ * Creates a dummy buffer head and maps it (for lilo).
7937+ */
7938+static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
7939+ kdev_t *r_dev, unsigned long *r_block)
7940+{
7941+ struct buffer_head bh;
7942+ struct dm_target *ti;
7943+ union map_info map_context;
7944+ int r;
7945+
7946+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
7947+ return -EPERM;
7948+ }
7949+
7950+ if (!check_dev_size(dev, block)) {
7951+ return -EINVAL;
7952+ }
7953+
7954+ if (!md->map)
7955+ return -EINVAL;
7956+
7957+ /* setup dummy bh */
7958+ memset(&bh, 0, sizeof(bh));
7959+ bh.b_blocknr = block;
7960+ bh.b_dev = bh.b_rdev = dev;
7961+ bh.b_size = blksize_size[major(dev)][minor(dev)];
7962+ bh.b_rsector = block * (bh.b_size >> 9);
7963+
7964+ /* find target */
7965+ ti = dm_table_find_target(md->map, bh.b_rsector);
7966+
7967+ /* do the mapping */
7968+ r = ti->type->map(ti, &bh, READ, &map_context);
7969+ ti->type->end_io(ti, &bh, READ, 0, &map_context);
7970+
7971+ if (!r) {
7972+ *r_dev = bh.b_rdev;
7973+ *r_block = bh.b_rsector / (bh.b_size >> 9);
7974+ }
7975+
7976+ return r;
7977+}
7978+
7979+/*
7980+ * Marshals arguments and results between user and kernel space.
7981+ */
7982+static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
7983+{
7984+ struct mapped_device *md;
7985+ unsigned long block, r_block;
7986+ kdev_t r_dev;
7987+ int r;
7988+
7989+ if (get_user(block, &lvb->lv_block))
7990+ return -EFAULT;
7991+
7992+ md = get_kdev(inode->i_rdev);
7993+ if (!md)
7994+ return -ENXIO;
7995+
7996+ down_read(&md->lock);
7997+ r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
7998+ up_read(&md->lock);
7999+ dm_put(md);
8000+
8001+ if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
8002+ put_user(r_block, &lvb->lv_block)))
8003+ r = -EFAULT;
8004+
8005+ return r;
8006+}
8007+
8008+static void free_md(struct mapped_device *md)
8009+{
8010+ free_dev(md->dev);
8011+ mempool_destroy(md->io_pool);
8012+ kfree(md);
8013+}
8014+
8015+/*
8016+ * Allocate and initialise a blank device with a given minor.
8017+ */
8018+static struct mapped_device *alloc_md(kdev_t dev)
8019+{
8020+ int r;
8021+ struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
8022+
8023+ if (!md) {
8024+ DMWARN("unable to allocate device, out of memory.");
8025+ return NULL;
8026+ }
8027+
8028+ memset(md, 0, sizeof(*md));
8029+
8030+ /* Allocate suitable device number */
8031+ if (!dev)
8032+ r = first_free_dev(md);
8033+ else
8034+ r = specific_dev(dev, md);
8035+
8036+ if (r) {
8037+ kfree(md);
8038+ return NULL;
8039+ }
8040+
8041+ md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
8042+ mempool_free_slab, _io_cache);
8043+ if (!md->io_pool) {
 8044+ free_dev(md->dev);
8045+ kfree(md);
8046+ return NULL;
8047+ }
8048+
8049+ init_rwsem(&md->lock);
8050+ atomic_set(&md->holders, 1);
8051+ atomic_set(&md->pending, 0);
8052+ init_waitqueue_head(&md->wait);
8053+ init_waitqueue_head(&md->eventq);
8054+
8055+ return md;
8056+}
8057+
8058+/*
8059+ * The hardsect size for a mapped device is the largest hardsect size
8060+ * from the devices it maps onto.
8061+ */
8062+static int __find_hardsect_size(struct list_head *devices)
8063+{
8064+ int result = 512, size;
8065+ struct list_head *tmp;
8066+
8067+ list_for_each (tmp, devices) {
8068+ struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
8069+ size = get_hardsect_size(dd->dev);
8070+ if (size > result)
8071+ result = size;
8072+ }
8073+
8074+ return result;
8075+}
8076+
8077+/*
8078+ * Bind a table to the device.
8079+ */
8080+static void event_callback(void *context)
8081+{
8082+ struct mapped_device *md = (struct mapped_device *) context;
8083+
8084+ down_write(&md->lock);
8085+ md->event_nr++;
8086+ wake_up_interruptible(&md->eventq);
8087+ up_write(&md->lock);
8088+}
8089+
8090+static int __bind(struct mapped_device *md, struct dm_table *t)
8091+{
8092+ unsigned int minor = minor(md->dev);
8093+ unsigned int major = major(md->dev);
8094+ md->map = t;
8095+
8096+ /* in k */
8097+ blk_size[major][minor] = dm_table_get_size(t) >> 1;
8098+ blksize_size[major][minor] = BLOCK_SIZE;
8099+ hardsect_size[major][minor] =
8100+ __find_hardsect_size(dm_table_get_devices(t));
8101+ register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
8102+
8103+ dm_table_event_callback(md->map, event_callback, md);
8104+ dm_table_get(t);
8105+ return 0;
8106+}
8107+
8108+static void __unbind(struct mapped_device *md)
8109+{
8110+ unsigned int minor = minor(md->dev);
8111+ unsigned int major = major(md->dev);
8112+
8113+ if (md->map) {
8114+ dm_table_event_callback(md->map, NULL, NULL);
8115+ dm_table_put(md->map);
8116+ md->map = NULL;
8117+
8118+ }
8119+
8120+ blk_size[major][minor] = 0;
8121+ blksize_size[major][minor] = 0;
8122+ hardsect_size[major][minor] = 0;
8123+}
8124+
8125+/*
8126+ * Constructor for a new device.
8127+ */
8128+int dm_create(kdev_t dev, struct mapped_device **result)
8129+{
8130+ struct mapped_device *md;
8131+
8132+ md = alloc_md(dev);
8133+ if (!md)
8134+ return -ENXIO;
8135+
8136+ __unbind(md); /* Ensure zero device size */
8137+
8138+ *result = md;
8139+ return 0;
8140+}
8141+
8142+void dm_get(struct mapped_device *md)
8143+{
8144+ atomic_inc(&md->holders);
8145+}
8146+
8147+void dm_put(struct mapped_device *md)
8148+{
8149+ if (atomic_dec_and_test(&md->holders)) {
8150+ if (md->map)
8151+ dm_table_suspend_targets(md->map);
8152+ __unbind(md);
8153+ free_md(md);
8154+ }
8155+}
8156+
8157+/*
8158+ * Requeue the deferred io by calling generic_make_request.
8159+ */
8160+static void flush_deferred_io(struct deferred_io *c)
8161+{
8162+ struct deferred_io *n;
8163+
8164+ while (c) {
8165+ n = c->next;
8166+ generic_make_request(c->rw, c->bh);
8167+ free_deferred(c);
8168+ c = n;
8169+ }
8170+}
8171+
8172+/*
8173+ * Swap in a new table (destroying old one).
8174+ */
8175+int dm_swap_table(struct mapped_device *md, struct dm_table *table)
8176+{
8177+ int r;
8178+
8179+ down_write(&md->lock);
8180+
8181+ /*
8182+ * The device must be suspended, or have no table bound yet.
8183+ */
8184+ if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
8185+ up_write(&md->lock);
8186+ return -EPERM;
8187+ }
8188+
8189+ __unbind(md);
 8190+ r = __bind(md, table);
 8191+ if (r) {
 8192+ up_write(&md->lock);
 8193+ return r;
 8194+ }
 8195+ up_write(&md->lock);
 8196+ return 0;
 8197+}
8198+/*
8199+ * We need to be able to change a mapping table under a mounted
8200+ * filesystem. For example we might want to move some data in
8201+ * the background. Before the table can be swapped with
 8202+ * dm_swap_table, dm_suspend must be called to flush any in
8203+ * flight io and ensure that any further io gets deferred.
8204+ */
8205+int dm_suspend(struct mapped_device *md)
8206+{
8207+ int r = 0;
8208+ DECLARE_WAITQUEUE(wait, current);
8209+
8210+ down_write(&md->lock);
8211+
8212+ /*
8213+ * First we set the BLOCK_IO flag so no more ios will be
8214+ * mapped.
8215+ */
8216+ if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8217+ up_write(&md->lock);
8218+ return -EINVAL;
8219+ }
8220+
8221+ set_bit(DMF_BLOCK_IO, &md->flags);
8222+ add_wait_queue(&md->wait, &wait);
8223+ up_write(&md->lock);
8224+
8225+ /*
8226+ * Then we wait for the already mapped ios to
8227+ * complete.
8228+ */
8229+ run_task_queue(&tq_disk);
8230+ while (1) {
8231+ set_current_state(TASK_INTERRUPTIBLE);
8232+
8233+ if (!atomic_read(&md->pending) || signal_pending(current))
8234+ break;
8235+
8236+ schedule();
8237+ }
8238+ set_current_state(TASK_RUNNING);
8239+
8240+ down_write(&md->lock);
8241+ remove_wait_queue(&md->wait, &wait);
8242+
8243+ /* did we flush everything ? */
8244+ if (atomic_read(&md->pending)) {
8245+ clear_bit(DMF_BLOCK_IO, &md->flags);
8246+ r = -EINTR;
8247+ } else {
8248+ set_bit(DMF_SUSPENDED, &md->flags);
8249+ if (md->map)
8250+ dm_table_suspend_targets(md->map);
8251+ }
8252+ up_write(&md->lock);
8253+
8254+ return r;
8255+}
8256+
8257+int dm_resume(struct mapped_device *md)
8258+{
8259+ struct deferred_io *def;
8260+
8261+ down_write(&md->lock);
8262+ if (!test_bit(DMF_SUSPENDED, &md->flags)) {
8263+ up_write(&md->lock);
8264+ return -EINVAL;
8265+ }
8266+
8267+ if (md->map)
8268+ dm_table_resume_targets(md->map);
8269+
8270+ clear_bit(DMF_SUSPENDED, &md->flags);
8271+ clear_bit(DMF_BLOCK_IO, &md->flags);
8272+ def = md->deferred;
8273+ md->deferred = NULL;
8274+ up_write(&md->lock);
8275+
8276+ flush_deferred_io(def);
8277+ run_task_queue(&tq_disk);
8278+
8279+ return 0;
8280+}
8281+
8282+struct dm_table *dm_get_table(struct mapped_device *md)
8283+{
8284+ struct dm_table *t;
8285+
8286+ down_read(&md->lock);
8287+ t = md->map;
8288+ if (t)
8289+ dm_table_get(t);
8290+ up_read(&md->lock);
8291+
8292+ return t;
8293+}
8294+
8295+/*-----------------------------------------------------------------
8296+ * Event notification.
8297+ *---------------------------------------------------------------*/
8298+uint32_t dm_get_event_nr(struct mapped_device *md)
8299+{
8300+ uint32_t r;
8301+
8302+ down_read(&md->lock);
8303+ r = md->event_nr;
8304+ up_read(&md->lock);
8305+
8306+ return r;
8307+}
8308+
8309+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8310+ uint32_t event_nr)
8311+{
8312+ down_write(&md->lock);
8313+ if (event_nr != md->event_nr) {
8314+ up_write(&md->lock);
8315+ return 1;
8316+ }
8317+
8318+ add_wait_queue(&md->eventq, wq);
8319+ up_write(&md->lock);
8320+
8321+ return 0;
8322+}
8323+
8324+const char *dm_kdevname(kdev_t dev)
8325+{
8326+ static char buffer[32];
8327+ sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
8328+ return buffer;
8329+}
8330+
8331+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
8332+{
8333+ down_write(&md->lock);
8334+ remove_wait_queue(&md->eventq, wq);
8335+ up_write(&md->lock);
8336+}
8337+
8338+kdev_t dm_kdev(struct mapped_device *md)
8339+{
8340+ kdev_t dev;
8341+
8342+ down_read(&md->lock);
8343+ dev = md->dev;
8344+ up_read(&md->lock);
8345+
8346+ return dev;
8347+}
8348+
8349+int dm_suspended(struct mapped_device *md)
8350+{
8351+ return test_bit(DMF_SUSPENDED, &md->flags);
8352+}
8353+
8354+struct block_device_operations dm_blk_dops = {
8355+ .open = dm_blk_open,
8356+ .release = dm_blk_close,
8357+ .ioctl = dm_blk_ioctl,
8358+ .owner = THIS_MODULE
8359+};
8360+
8361+/*
8362+ * module hooks
8363+ */
8364+module_init(dm_init);
8365+module_exit(dm_exit);
8366+
8367+MODULE_DESCRIPTION(DM_NAME " driver");
8368+MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
8369+MODULE_LICENSE("GPL");
8370+
8371+EXPORT_SYMBOL(dm_kdevname);
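For orientation, the sketch below strings the exported suspend/swap/resume calls together the way the ioctl interface (added elsewhere in this patch) drives them when a new table is loaded onto a live device; the helper name is made up and error handling is abbreviated.

/* Sketch only: replace the active table of a live mapped_device. */
static int replace_table(struct mapped_device *md, struct dm_table *new_map)
{
	int r;

	r = dm_suspend(md);		/* defer new io, drain in-flight io */
	if (r)
		return r;

	r = dm_swap_table(md, new_map);	/* md takes its own table reference */
	if (r) {
		dm_resume(md);		/* put the old table back in service */
		return r;
	}

	return dm_resume(md);		/* replay deferred io against the new map */
}

The caller keeps its own reference on new_map and would drop it with dm_table_put() afterwards, mirroring the get/put that __bind() and __unbind() do internally.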
8372--- linux-2.4.21/drivers/md/dm.h Thu Jan 1 01:00:00 1970
8373+++ linux/drivers/md/dm.h Wed Aug 20 14:41:38 2003
8374@@ -0,0 +1,175 @@
8375+/*
8376+ * Internal header file for device mapper
8377+ *
8378+ * Copyright (C) 2001, 2002 Sistina Software
8379+ *
8380+ * This file is released under the LGPL.
8381+ */
8382+
8383+#ifndef DM_INTERNAL_H
8384+#define DM_INTERNAL_H
8385+
8386+#include <linux/fs.h>
8387+#include <linux/device-mapper.h>
8388+#include <linux/list.h>
8389+#include <linux/blkdev.h>
8390+
8391+#define DM_NAME "device-mapper"
8392+#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
8393+#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
8394+#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
8395+
8396+/*
8397+ * FIXME: I think this should be with the definition of sector_t
8398+ * in types.h.
8399+ */
8400+#ifdef CONFIG_LBD
8401+#define SECTOR_FORMAT "%Lu"
8402+#else
8403+#define SECTOR_FORMAT "%lu"
8404+#endif
8405+
8406+#define SECTOR_SHIFT 9
8407+#define SECTOR_SIZE (1 << SECTOR_SHIFT)
8408+
8409+extern struct block_device_operations dm_blk_dops;
8410+
8411+/*
8412+ * List of devices that a metadevice uses and should open/close.
8413+ */
8414+struct dm_dev {
8415+ struct list_head list;
8416+
8417+ atomic_t count;
8418+ int mode;
8419+ kdev_t dev;
8420+ struct block_device *bdev;
8421+};
8422+
8423+struct dm_table;
8424+struct mapped_device;
8425+
8426+/*-----------------------------------------------------------------
8427+ * Functions for manipulating a struct mapped_device.
8428+ * Drop the reference with dm_put when you finish with the object.
8429+ *---------------------------------------------------------------*/
8430+int dm_create(kdev_t dev, struct mapped_device **md);
8431+
8432+/*
8433+ * Reference counting for md.
8434+ */
8435+void dm_get(struct mapped_device *md);
8436+void dm_put(struct mapped_device *md);
8437+
8438+/*
8439+ * A device can still be used while suspended, but I/O is deferred.
8440+ */
8441+int dm_suspend(struct mapped_device *md);
8442+int dm_resume(struct mapped_device *md);
8443+
8444+/*
8445+ * The device must be suspended before calling this method.
8446+ */
8447+int dm_swap_table(struct mapped_device *md, struct dm_table *t);
8448+
8449+/*
8450+ * Drop a reference on the table when you've finished with the
8451+ * result.
8452+ */
8453+struct dm_table *dm_get_table(struct mapped_device *md);
8454+
8455+/*
8456+ * Event functions.
8457+ */
8458+uint32_t dm_get_event_nr(struct mapped_device *md);
8459+int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8460+ uint32_t event_nr);
8461+void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
8462+
8463+/*
8464+ * Info functions.
8465+ */
8466+kdev_t dm_kdev(struct mapped_device *md);
8467+int dm_suspended(struct mapped_device *md);
8468+
8469+/*-----------------------------------------------------------------
8470+ * Functions for manipulating a table. Tables are also reference
8471+ * counted.
8472+ *---------------------------------------------------------------*/
8473+int dm_table_create(struct dm_table **result, int mode);
8474+
8475+void dm_table_get(struct dm_table *t);
8476+void dm_table_put(struct dm_table *t);
8477+
8478+int dm_table_add_target(struct dm_table *t, const char *type,
8479+ sector_t start, sector_t len, char *params);
8480+int dm_table_complete(struct dm_table *t);
8481+void dm_table_event_callback(struct dm_table *t,
8482+ void (*fn)(void *), void *context);
8483+void dm_table_event(struct dm_table *t);
8484+sector_t dm_table_get_size(struct dm_table *t);
8485+struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
8486+struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
8487+unsigned int dm_table_get_num_targets(struct dm_table *t);
8488+struct list_head *dm_table_get_devices(struct dm_table *t);
8489+int dm_table_get_mode(struct dm_table *t);
8490+void dm_table_suspend_targets(struct dm_table *t);
8491+void dm_table_resume_targets(struct dm_table *t);
8492+
8493+/*-----------------------------------------------------------------
8494+ * A registry of target types.
8495+ *---------------------------------------------------------------*/
8496+int dm_target_init(void);
8497+void dm_target_exit(void);
8498+struct target_type *dm_get_target_type(const char *name);
8499+void dm_put_target_type(struct target_type *t);
8500+
8501+
8502+/*-----------------------------------------------------------------
8503+ * Useful inlines.
8504+ *---------------------------------------------------------------*/
8505+static inline int array_too_big(unsigned long fixed, unsigned long obj,
8506+ unsigned long num)
8507+{
8508+ return (num > (ULONG_MAX - fixed) / obj);
8509+}
8510+
8511+/*
8512+ * ceiling(n / size) * size
8513+ */
8514+static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
8515+{
8516+ unsigned long r = n % size;
8517+ return n + (r ? (size - r) : 0);
8518+}
8519+
8520+/*
8521+ * Ceiling(n / size)
8522+ */
8523+static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
8524+{
8525+ return dm_round_up(n, size) / size;
8526+}
8527+
8528+const char *dm_kdevname(kdev_t dev);
8529+
8530+/*
 8531+ * The device-mapper can be driven through one of two interfaces:
 8532+ * ioctl or filesystem, depending on which patch you have applied.
8533+ */
8534+int dm_interface_init(void);
8535+void dm_interface_exit(void);
8536+
8537+/*
8538+ * Targets for linear and striped mappings
8539+ */
8540+int dm_linear_init(void);
8541+void dm_linear_exit(void);
8542+
8543+int dm_stripe_init(void);
8544+void dm_stripe_exit(void);
8545+
8546+int dm_snapshot_init(void);
8547+void dm_snapshot_exit(void);
8548+
8549+#endif
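The rounding helpers above feed the btree sizing in dm-table.c; a small user-space check of the arithmetic, assuming NODE_SIZE is 64 and sector_t is 8 bytes so that KEYS_PER_NODE works out to 8, could look like this.

/* User-space sketch of dm_round_up()/dm_div_up(); keys_per_node = 8
 * is an assumption (64-byte node / 8-byte sector_t). */
#include <stdio.h>

static unsigned long dm_round_up(unsigned long n, unsigned long size)
{
	unsigned long r = n % size;
	return n + (r ? (size - r) : 0);
}

static unsigned long dm_div_up(unsigned long n, unsigned long size)
{
	return dm_round_up(n, size) / size;
}

int main(void)
{
	unsigned long keys_per_node = 8;
	unsigned long targets = 20;

	/* 20 targets round up to 24 key slots, i.e. ceil(20/8) = 3 leaf nodes */
	printf("rounded = %lu, leaf nodes = %lu\n",
	       dm_round_up(targets, keys_per_node),
	       dm_div_up(targets, keys_per_node));
	return 0;
}

With 3 leaf nodes and CHILDREN_PER_NODE = 9, int_log(3, 9) is 1, so dm_table_complete() ends up with a two-level btree for this example.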
8550--- linux-2.4.21/drivers/md/kcopyd.c Thu Jan 1 01:00:00 1970
8551+++ linux/drivers/md/kcopyd.c Wed Aug 20 14:41:38 2003
8552@@ -0,0 +1,650 @@
8553+/*
8554+ * Copyright (C) 2002 Sistina Software (UK) Limited.
8555+ *
8556+ * This file is released under the GPL.
8557+ */
8558+
8559+#include <asm/atomic.h>
8560+
8561+#include <linux/blkdev.h>
8562+#include <linux/config.h>
8563+#include <linux/device-mapper.h>
8564+#include <linux/fs.h>
8565+#include <linux/init.h>
8566+#include <linux/list.h>
8567+#include <linux/locks.h>
8568+#include <linux/mempool.h>
8569+#include <linux/module.h>
8570+#include <linux/pagemap.h>
8571+#include <linux/slab.h>
8572+#include <linux/vmalloc.h>
8573+
8574+#include "kcopyd.h"
8575+#include "dm-daemon.h"
8576+
8577+/* FIXME: this is only needed for the DMERR macros */
8578+#include "dm.h"
8579+
8580+static struct dm_daemon _kcopyd;
8581+
8582+/*-----------------------------------------------------------------
8583+ * Each kcopyd client has its own little pool of preallocated
8584+ * pages for kcopyd io.
8585+ *---------------------------------------------------------------*/
8586+struct kcopyd_client {
8587+ struct list_head list;
8588+
8589+ spinlock_t lock;
8590+ struct list_head pages;
8591+ unsigned int nr_pages;
8592+ unsigned int nr_free_pages;
8593+};
8594+
8595+static inline void __push_page(struct kcopyd_client *kc, struct page *p)
8596+{
8597+ list_add(&p->list, &kc->pages);
8598+ kc->nr_free_pages++;
8599+}
8600+
8601+static inline struct page *__pop_page(struct kcopyd_client *kc)
8602+{
8603+ struct page *p;
8604+
8605+ p = list_entry(kc->pages.next, struct page, list);
8606+ list_del(&p->list);
8607+ kc->nr_free_pages--;
8608+
8609+ return p;
8610+}
8611+
8612+static int kcopyd_get_pages(struct kcopyd_client *kc,
8613+ unsigned int nr, struct list_head *pages)
8614+{
8615+ struct page *p;
8616+ INIT_LIST_HEAD(pages);
8617+
8618+ spin_lock(&kc->lock);
8619+ if (kc->nr_free_pages < nr) {
8620+ spin_unlock(&kc->lock);
8621+ return -ENOMEM;
8622+ }
8623+
8624+ while (nr--) {
8625+ p = __pop_page(kc);
8626+ list_add(&p->list, pages);
8627+ }
8628+ spin_unlock(&kc->lock);
8629+
8630+ return 0;
8631+}
8632+
8633+static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
8634+{
8635+ struct list_head *tmp, *tmp2;
8636+
8637+ spin_lock(&kc->lock);
8638+ list_for_each_safe (tmp, tmp2, pages)
8639+ __push_page(kc, list_entry(tmp, struct page, list));
8640+ spin_unlock(&kc->lock);
8641+}
8642+
8643+/*
8644+ * These three functions resize the page pool.
8645+ */
8646+static void release_pages(struct list_head *pages)
8647+{
8648+ struct page *p;
8649+ struct list_head *tmp, *tmp2;
8650+
8651+ list_for_each_safe (tmp, tmp2, pages) {
8652+ p = list_entry(tmp, struct page, list);
8653+ UnlockPage(p);
8654+ __free_page(p);
8655+ }
8656+}
8657+
8658+static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
8659+{
8660+ unsigned int i;
8661+ struct page *p;
8662+ LIST_HEAD(new);
8663+
8664+ for (i = 0; i < nr; i++) {
8665+ p = alloc_page(GFP_KERNEL);
8666+ if (!p) {
8667+ release_pages(&new);
8668+ return -ENOMEM;
8669+ }
8670+
8671+ LockPage(p);
8672+ list_add(&p->list, &new);
8673+ }
8674+
8675+ kcopyd_put_pages(kc, &new);
8676+ kc->nr_pages += nr;
8677+ return 0;
8678+}
8679+
8680+static void client_free_pages(struct kcopyd_client *kc)
8681+{
8682+ BUG_ON(kc->nr_free_pages != kc->nr_pages);
8683+ release_pages(&kc->pages);
8684+ kc->nr_free_pages = kc->nr_pages = 0;
8685+}
8686+
8687+/*-----------------------------------------------------------------
 8688+ * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
8689+ * for this reason we use a mempool to prevent the client from
8690+ * ever having to do io (which could cause a deadlock).
8691+ *---------------------------------------------------------------*/
8692+struct kcopyd_job {
8693+ struct kcopyd_client *kc;
8694+ struct list_head list;
8695+ unsigned int flags;
8696+
8697+ /*
8698+ * Error state of the job.
8699+ */
8700+ int read_err;
8701+ unsigned int write_err;
8702+
8703+ /*
8704+ * Either READ or WRITE
8705+ */
8706+ int rw;
8707+ struct io_region source;
8708+
8709+ /*
8710+ * The destinations for the transfer.
8711+ */
8712+ unsigned int num_dests;
8713+ struct io_region dests[KCOPYD_MAX_REGIONS];
8714+
8715+ sector_t offset;
8716+ unsigned int nr_pages;
8717+ struct list_head pages;
8718+
8719+ /*
8720+ * Set this to ensure you are notified when the job has
8721+ * completed. 'context' is for the callback to use.
8722+ */
8723+ kcopyd_notify_fn fn;
8724+ void *context;
8725+
8726+ /*
8727+ * These fields are only used if the job has been split
8728+ * into more manageable parts.
8729+ */
8730+ struct semaphore lock;
8731+ atomic_t sub_jobs;
8732+ sector_t progress;
8733+};
8734+
8735+/* FIXME: this should scale with the number of pages */
8736+#define MIN_JOBS 512
8737+
8738+static kmem_cache_t *_job_cache;
8739+static mempool_t *_job_pool;
8740+
8741+/*
8742+ * We maintain three lists of jobs:
8743+ *
8744+ * i) jobs waiting for pages
8745+ * ii) jobs that have pages, and are waiting for the io to be issued.
8746+ * iii) jobs that have completed.
8747+ *
8748+ * All three of these are protected by job_lock.
8749+ */
8750+static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
8751+
8752+static LIST_HEAD(_complete_jobs);
8753+static LIST_HEAD(_io_jobs);
8754+static LIST_HEAD(_pages_jobs);
8755+
8756+static int jobs_init(void)
8757+{
8758+ INIT_LIST_HEAD(&_complete_jobs);
8759+ INIT_LIST_HEAD(&_io_jobs);
8760+ INIT_LIST_HEAD(&_pages_jobs);
8761+
8762+ _job_cache = kmem_cache_create("kcopyd-jobs",
8763+ sizeof(struct kcopyd_job),
8764+ __alignof__(struct kcopyd_job),
8765+ 0, NULL, NULL);
8766+ if (!_job_cache)
8767+ return -ENOMEM;
8768+
8769+ _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
8770+ mempool_free_slab, _job_cache);
8771+ if (!_job_pool) {
8772+ kmem_cache_destroy(_job_cache);
8773+ return -ENOMEM;
8774+ }
8775+
8776+ return 0;
8777+}
8778+
8779+static void jobs_exit(void)
8780+{
8781+ BUG_ON(!list_empty(&_complete_jobs));
8782+ BUG_ON(!list_empty(&_io_jobs));
8783+ BUG_ON(!list_empty(&_pages_jobs));
8784+
8785+ mempool_destroy(_job_pool);
8786+ kmem_cache_destroy(_job_cache);
8787+}
8788+
8789+/*
8790+ * Functions to push and pop a job onto the head of a given job
8791+ * list.
8792+ */
8793+static inline struct kcopyd_job *pop(struct list_head *jobs)
8794+{
8795+ struct kcopyd_job *job = NULL;
8796+ unsigned long flags;
8797+
8798+ spin_lock_irqsave(&_job_lock, flags);
8799+
8800+ if (!list_empty(jobs)) {
8801+ job = list_entry(jobs->next, struct kcopyd_job, list);
8802+ list_del(&job->list);
8803+ }
8804+ spin_unlock_irqrestore(&_job_lock, flags);
8805+
8806+ return job;
8807+}
8808+
8809+static inline void push(struct list_head *jobs, struct kcopyd_job *job)
8810+{
8811+ unsigned long flags;
8812+
8813+ spin_lock_irqsave(&_job_lock, flags);
8814+ list_add_tail(&job->list, jobs);
8815+ spin_unlock_irqrestore(&_job_lock, flags);
8816+}
8817+
8818+/*
8819+ * These three functions process 1 item from the corresponding
8820+ * job list.
8821+ *
8822+ * They return:
8823+ * < 0: error
8824+ * 0: success
8825+ * > 0: can't process yet.
8826+ */
8827+static int run_complete_job(struct kcopyd_job *job)
8828+{
8829+ void *context = job->context;
8830+ int read_err = job->read_err;
8831+ unsigned int write_err = job->write_err;
8832+ kcopyd_notify_fn fn = job->fn;
8833+
8834+ kcopyd_put_pages(job->kc, &job->pages);
8835+ mempool_free(job, _job_pool);
8836+ fn(read_err, write_err, context);
8837+ return 0;
8838+}
8839+
8840+static void complete_io(unsigned int error, void *context)
8841+{
8842+ struct kcopyd_job *job = (struct kcopyd_job *) context;
8843+
8844+ if (error) {
8845+ if (job->rw == WRITE)
8846+ job->write_err |= error;
8847+ else
8848+ job->read_err = 1;
8849+
8850+ if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
8851+ push(&_complete_jobs, job);
8852+ dm_daemon_wake(&_kcopyd);
8853+ return;
8854+ }
8855+ }
8856+
8857+ if (job->rw == WRITE)
8858+ push(&_complete_jobs, job);
8859+
8860+ else {
8861+ job->rw = WRITE;
8862+ push(&_io_jobs, job);
8863+ }
8864+
8865+ dm_daemon_wake(&_kcopyd);
8866+}
8867+
8868+/*
8869+ * Request io on as many buffer heads as we can currently get for
8870+ * a particular job.
8871+ */
8872+static int run_io_job(struct kcopyd_job *job)
8873+{
8874+ int r;
8875+
8876+ if (job->rw == READ)
8877+ r = dm_io_async(1, &job->source, job->rw,
8878+ list_entry(job->pages.next, struct page, list),
8879+ job->offset, complete_io, job);
8880+
8881+ else
8882+ r = dm_io_async(job->num_dests, job->dests, job->rw,
8883+ list_entry(job->pages.next, struct page, list),
8884+ job->offset, complete_io, job);
8885+
8886+ return r;
8887+}
8888+
8889+#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
8890+static int run_pages_job(struct kcopyd_job *job)
8891+{
8892+ int r;
8893+
8894+ job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
8895+ SECTORS_PER_PAGE);
8896+ r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
8897+ if (!r) {
8898+ /* this job is ready for io */
8899+ push(&_io_jobs, job);
8900+ return 0;
8901+ }
8902+
8903+ if (r == -ENOMEM)
8904+ /* can't complete now */
8905+ return 1;
8906+
8907+ return r;
8908+}
8909+
8910+/*
8911+ * Run through a list for as long as possible. Returns the count
8912+ * of successful jobs.
8913+ */
8914+static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
8915+{
8916+ struct kcopyd_job *job;
8917+ int r, count = 0;
8918+
8919+ while ((job = pop(jobs))) {
8920+
8921+ r = fn(job);
8922+
8923+ if (r < 0) {
8924+ /* error this rogue job */
8925+ if (job->rw == WRITE)
8926+ job->write_err = (unsigned int) -1;
8927+ else
8928+ job->read_err = 1;
8929+ push(&_complete_jobs, job);
8930+ break;
8931+ }
8932+
8933+ if (r > 0) {
8934+ /*
8935+ * We couldn't service this job ATM, so
8936+ * push this job back onto the list.
8937+ */
8938+ push(jobs, job);
8939+ break;
8940+ }
8941+
8942+ count++;
8943+ }
8944+
8945+ return count;
8946+}
8947+
8948+/*
8949+ * kcopyd does this every time it's woken up.
8950+ */
8951+static void do_work(void)
8952+{
8953+ /*
8954+ * The order in which these are called is *very* important:
8955+ * completed jobs can free pages for the pages jobs; pages
8956+ * jobs, when successful, move onto the io jobs list; io
8957+ * jobs call wake when they complete, and the whole cycle
8958+ * starts again.
8959+ */
8960+ process_jobs(&_complete_jobs, run_complete_job);
8961+ process_jobs(&_pages_jobs, run_pages_job);
8962+ process_jobs(&_io_jobs, run_io_job);
8963+ run_task_queue(&tq_disk);
8964+}
8965+
8966+/*
8967+ * If we are copying a small region we just dispatch a single job
8968+ * to do the copy; otherwise the io has to be split up into many
8969+ * jobs.
8970+ */
8971+static void dispatch_job(struct kcopyd_job *job)
8972+{
8973+ push(&_pages_jobs, job);
8974+ dm_daemon_wake(&_kcopyd);
8975+}
8976+
8977+#define SUB_JOB_SIZE 128
8978+static void segment_complete(int read_err,
8979+ unsigned int write_err, void *context)
8980+{
8981+ /* FIXME: tidy this function */
8982+ sector_t progress = 0;
8983+ sector_t count = 0;
8984+ struct kcopyd_job *job = (struct kcopyd_job *) context;
8985+
8986+ down(&job->lock);
8987+
8988+ /* update the error */
8989+ if (read_err)
8990+ job->read_err = 1;
8991+
8992+ if (write_err)
8993+ job->write_err |= write_err;
8994+
8995+ /*
8996+ * Only dispatch more work if there hasn't been an error.
8997+ */
8998+ if ((!job->read_err && !job->write_err) ||
8999+ test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
9000+ /* get the next chunk of work */
9001+ progress = job->progress;
9002+ count = job->source.count - progress;
9003+ if (count) {
9004+ if (count > SUB_JOB_SIZE)
9005+ count = SUB_JOB_SIZE;
9006+
9007+ job->progress += count;
9008+ }
9009+ }
9010+ up(&job->lock);
9011+
9012+ if (count) {
9013+ int i;
9014+ struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
9015+
9016+ memcpy(sub_job, job, sizeof(*job));
9017+ sub_job->source.sector += progress;
9018+ sub_job->source.count = count;
9019+
9020+ for (i = 0; i < job->num_dests; i++) {
9021+ sub_job->dests[i].sector += progress;
9022+ sub_job->dests[i].count = count;
9023+ }
9024+
9025+ sub_job->fn = segment_complete;
9026+ sub_job->context = job;
9027+ dispatch_job(sub_job);
9028+
9029+ } else if (atomic_dec_and_test(&job->sub_jobs)) {
9030+
9031+ /*
9032+ * To avoid a race we must keep the job around
9033+ * until after the notify function has completed.
9034+ * Otherwise the client may try to stop the job
9035+ * after we've completed.
9036+ */
9037+ job->fn(read_err, write_err, job->context);
9038+ mempool_free(job, _job_pool);
9039+ }
9040+}
9041+
9042+/*
9043+ * Split the work into a number of smaller sub-jobs that
9044+ * between them perform the whole copy.
9045+ */
9046+#define SPLIT_COUNT 8
9047+static void split_job(struct kcopyd_job *job)
9048+{
9049+ int i;
9050+
9051+ atomic_set(&job->sub_jobs, SPLIT_COUNT);
9052+ for (i = 0; i < SPLIT_COUNT; i++)
9053+ segment_complete(0, 0u, job);
9054+}
9055+
9056+#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
9057+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9058+ unsigned int num_dests, struct io_region *dests,
9059+ unsigned int flags, kcopyd_notify_fn fn, void *context)
9060+{
9061+ struct kcopyd_job *job;
9062+
9063+ /*
9064+ * Allocate a new job.
9065+ */
9066+ job = mempool_alloc(_job_pool, GFP_NOIO);
9067+
9068+ /*
9069+ * set up for the read.
9070+ */
9071+ job->kc = kc;
9072+ job->flags = flags;
9073+ job->read_err = 0;
9074+ job->write_err = 0;
9075+ job->rw = READ;
9076+
9077+ memcpy(&job->source, from, sizeof(*from));
9078+
9079+ job->num_dests = num_dests;
9080+ memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
9081+
9082+ job->offset = 0;
9083+ job->nr_pages = 0;
9084+ INIT_LIST_HEAD(&job->pages);
9085+
9086+ job->fn = fn;
9087+ job->context = context;
9088+
9089+ if (job->source.count < SUB_JOB_THRESHOLD)
9090+ dispatch_job(job);
9091+
9092+ else {
9093+ init_MUTEX(&job->lock);
9094+ job->progress = 0;
9095+ split_job(job);
9096+ }
9097+
9098+ return 0;
9099+}
9100+
9101+/*
9102+ * Cancels a kcopyd job, e.g. because someone is deactivating
9103+ * a mirror.
9104+ */
9105+int kcopyd_cancel(struct kcopyd_job *job, int block)
9106+{
9107+ /* FIXME: finish */
9108+ return -1;
9109+}
9110+
9111+/*-----------------------------------------------------------------
9112+ * Unit setup
9113+ *---------------------------------------------------------------*/
9114+static DECLARE_MUTEX(_client_lock);
9115+static LIST_HEAD(_clients);
9116+
9117+static int client_add(struct kcopyd_client *kc)
9118+{
9119+ down(&_client_lock);
9120+ list_add(&kc->list, &_clients);
9121+ up(&_client_lock);
9122+ return 0;
9123+}
9124+
9125+static void client_del(struct kcopyd_client *kc)
9126+{
9127+ down(&_client_lock);
9128+ list_del(&kc->list);
9129+ up(&_client_lock);
9130+}
9131+
9132+int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
9133+{
9134+ int r = 0;
9135+ struct kcopyd_client *kc;
9136+
9137+ kc = kmalloc(sizeof(*kc), GFP_KERNEL);
9138+ if (!kc)
9139+ return -ENOMEM;
9140+
9141+ kc->lock = SPIN_LOCK_UNLOCKED;
9142+ INIT_LIST_HEAD(&kc->pages);
9143+ kc->nr_pages = kc->nr_free_pages = 0;
9144+ r = client_alloc_pages(kc, nr_pages);
9145+ if (r) {
9146+ kfree(kc);
9147+ return r;
9148+ }
9149+
9150+ r = dm_io_get(nr_pages);
9151+ if (r) {
9152+ client_free_pages(kc);
9153+ kfree(kc);
9154+ return r;
9155+ }
9156+
9157+ r = client_add(kc);
9158+ if (r) {
9159+ dm_io_put(nr_pages);
9160+ client_free_pages(kc);
9161+ kfree(kc);
9162+ return r;
9163+ }
9164+
9165+ *result = kc;
9166+ return 0;
9167+}
9168+
9169+void kcopyd_client_destroy(struct kcopyd_client *kc)
9170+{
9171+ dm_io_put(kc->nr_pages);
9172+ client_free_pages(kc);
9173+ client_del(kc);
9174+ kfree(kc);
9175+}
9176+
9177+
9178+int __init kcopyd_init(void)
9179+{
9180+ int r;
9181+
9182+ r = jobs_init();
9183+ if (r)
9184+ return r;
9185+
9186+ r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
9187+ if (r)
9188+ jobs_exit();
9189+
9190+ return r;
9191+}
9192+
9193+void kcopyd_exit(void)
9194+{
9195+ jobs_exit();
9196+ dm_daemon_stop(&_kcopyd);
9197+}
9198+
9199+EXPORT_SYMBOL(kcopyd_client_create);
9200+EXPORT_SYMBOL(kcopyd_client_destroy);
9201+EXPORT_SYMBOL(kcopyd_copy);
9202+EXPORT_SYMBOL(kcopyd_cancel);
9203--- linux-2.4.21/drivers/md/kcopyd.h Thu Jan 1 01:00:00 1970
9204+++ linux/drivers/md/kcopyd.h Wed Aug 20 14:41:38 2003
9205@@ -0,0 +1,47 @@
9206+/*
9207+ * Copyright (C) 2001 Sistina Software
9208+ *
9209+ * This file is released under the GPL.
9210+ */
9211+
9212+#ifndef DM_KCOPYD_H
9213+#define DM_KCOPYD_H
9214+
9215+/*
9216+ * Needed for the definition of sector_t.
9217+ */
9218+#include <linux/device-mapper.h>
9219+#include <linux/iobuf.h>
9220+
9221+#include "dm-io.h"
9222+
9223+int kcopyd_init(void);
9224+void kcopyd_exit(void);
9225+
9226+/* FIXME: make this configurable */
9227+#define KCOPYD_MAX_REGIONS 8
9228+
9229+#define KCOPYD_IGNORE_ERROR 1
9230+
9231+/*
9232+ * To use kcopyd you must first create a kcopyd client object.
9233+ */
9234+struct kcopyd_client;
9235+int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
9236+void kcopyd_client_destroy(struct kcopyd_client *kc);
9237+
9238+/*
9239+ * Submit a copy job to kcopyd. This is built on top of the
9240+ * client functions declared above.
9241+ *
9242+ * read_err is a boolean,
9243+ * write_err is a bitset, with 1 bit for each destination region
9244+ */
9245+typedef void (*kcopyd_notify_fn)(int read_err,
9246+ unsigned int write_err, void *context);
9247+
9248+int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9249+ unsigned int num_dests, struct io_region *dests,
9250+ unsigned int flags, kcopyd_notify_fn fn, void *context);
9251+
9252+#endif
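
As a usage illustration of the client interface above, a minimal sketch of a hypothetical in-kernel caller follows. The copy_one_region() helper, the 32-page pool size and the io_region field names (dev/sector/count, as used by dm-io.h) are assumptions for illustration only, not something defined by this patch:

#include <linux/completion.h>
#include <linux/kdev_t.h>

#include "dm-io.h"
#include "kcopyd.h"

/*
 * Completion callback: read_err is a boolean, write_err has one
 * bit per destination region.
 */
static void copy_done(int read_err, unsigned int write_err, void *context)
{
	complete((struct completion *) context);
}

static int copy_one_region(kdev_t src, kdev_t dst,
			   sector_t start, sector_t len)
{
	struct kcopyd_client *kc;
	struct io_region from, to;
	struct completion done;
	int r;

	/* each client reserves its own little pool of pages */
	r = kcopyd_client_create(32, &kc);
	if (r)
		return r;

	from.dev = src;
	from.sector = start;
	from.count = len;

	to.dev = dst;
	to.sector = start;
	to.count = len;

	init_completion(&done);
	r = kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
	if (!r)
		wait_for_completion(&done);

	kcopyd_client_destroy(kc);
	return r;
}
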
9253--- linux-2.4.21/fs/buffer.c Fri Jun 13 16:32:48 2003
9254+++ linux/fs/buffer.c Wed Aug 20 14:41:32 2003
9255@@ -735,6 +735,7 @@
9256 bh->b_list = BUF_CLEAN;
9257 bh->b_end_io = handler;
9258 bh->b_private = private;
9259+ bh->b_journal_head = NULL;
9260 }
9261
9262 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
9263--- linux-2.4.21/fs/jbd/journal.c Fri Jun 13 16:32:48 2003
9264+++ linux/fs/jbd/journal.c Wed Aug 20 14:41:32 2003
9265@@ -1802,9 +1802,9 @@
9266
9267 if (buffer_jbd(bh)) {
9268 /* Someone did it for us! */
9269- J_ASSERT_BH(bh, bh->b_private != NULL);
9270+ J_ASSERT_BH(bh, bh->b_journal_head != NULL);
9271 journal_free_journal_head(jh);
9272- jh = bh->b_private;
9273+ jh = bh->b_journal_head;
9274 } else {
9275 /*
9276 * We actually don't need jh_splice_lock when
9277@@ -1812,7 +1812,7 @@
9278 */
9279 spin_lock(&jh_splice_lock);
9280 set_bit(BH_JBD, &bh->b_state);
9281- bh->b_private = jh;
9282+ bh->b_journal_head = jh;
9283 jh->b_bh = bh;
9284 atomic_inc(&bh->b_count);
9285 spin_unlock(&jh_splice_lock);
9286@@ -1821,7 +1821,7 @@
9287 }
9288 jh->b_jcount++;
9289 spin_unlock(&journal_datalist_lock);
9290- return bh->b_private;
9291+ return bh->b_journal_head;
9292 }
9293
9294 /*
9295@@ -1854,7 +1854,7 @@
9296 J_ASSERT_BH(bh, jh2bh(jh) == bh);
9297 BUFFER_TRACE(bh, "remove journal_head");
9298 spin_lock(&jh_splice_lock);
9299- bh->b_private = NULL;
9300+ bh->b_journal_head = NULL;
9301 jh->b_bh = NULL; /* debug, really */
9302 clear_bit(BH_JBD, &bh->b_state);
9303 __brelse(bh);
9304--- linux-2.4.21/include/linux/device-mapper.h Thu Jan 1 01:00:00 1970
9305+++ linux/include/linux/device-mapper.h Wed Aug 20 14:41:38 2003
9306@@ -0,0 +1,104 @@
9307+/*
9308+ * Copyright (C) 2001 Sistina Software (UK) Limited.
9309+ *
9310+ * This file is released under the LGPL.
9311+ */
9312+
9313+#ifndef _LINUX_DEVICE_MAPPER_H
9314+#define _LINUX_DEVICE_MAPPER_H
9315+
9316+typedef unsigned long sector_t;
9317+
9318+struct dm_target;
9319+struct dm_table;
9320+struct dm_dev;
9321+
9322+typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
9323+
9324+union map_info {
9325+ void *ptr;
9326+ unsigned long long ll;
9327+};
9328+
9329+/*
9330+ * In the constructor the target parameter will already have the
9331+ * table, type, begin and len fields filled in.
9332+ */
9333+typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
9334+ char **argv);
9335+
9336+/*
9337+ * The destructor doesn't need to free the dm_target, just
9338+ * anything hidden ti->private.
9339+ */
9340+typedef void (*dm_dtr_fn) (struct dm_target * ti);
9341+
9342+/*
9343+ * The map function must return:
9344+ * < 0: error
9345+ * = 0: The target will handle the io by resubmitting it later
9346+ * > 0: simple remap complete
9347+ */
9348+typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
9349+ int rw, union map_info *map_context);
9350+
9351+/*
9352+ * Returns:
9353+ * < 0 : error (currently ignored)
9354+ * 0 : ended successfully
9355+ * 1 : for some reason the io has still not completed (eg,
9356+ * multipath target might want to requeue a failed io).
9357+ */
9358+typedef int (*dm_endio_fn) (struct dm_target * ti,
9359+ struct buffer_head * bh, int rw, int error,
9360+ union map_info *map_context);
9361+typedef void (*dm_suspend_fn) (struct dm_target *ti);
9362+typedef void (*dm_resume_fn) (struct dm_target *ti);
9363+typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
9364+ char *result, unsigned int maxlen);
9365+
9366+void dm_error(const char *message);
9367+
9368+/*
9369+ * Constructors should call these functions to ensure destination devices
9370+ * are opened/closed correctly.
9371+ * FIXME: too many arguments.
9372+ */
9373+int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
9374+ sector_t len, int mode, struct dm_dev **result);
9375+void dm_put_device(struct dm_target *ti, struct dm_dev *d);
9376+
9377+/*
9378+ * Information about a target type
9379+ */
9380+struct target_type {
9381+ const char *name;
9382+ struct module *module;
9383+ dm_ctr_fn ctr;
9384+ dm_dtr_fn dtr;
9385+ dm_map_fn map;
9386+ dm_endio_fn end_io;
9387+ dm_suspend_fn suspend;
9388+ dm_resume_fn resume;
9389+ dm_status_fn status;
9390+};
9391+
9392+struct dm_target {
9393+ struct dm_table *table;
9394+ struct target_type *type;
9395+
9396+ /* target limits */
9397+ sector_t begin;
9398+ sector_t len;
9399+
9400+ /* target specific data */
9401+ void *private;
9402+
9403+ /* Used to provide an error string from the ctr */
9404+ char *error;
9405+};
9406+
9407+int dm_register_target(struct target_type *t);
9408+int dm_unregister_target(struct target_type *t);
9409+
9410+#endif /* _LINUX_DEVICE_MAPPER_H */
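
To show how the hooks above fit together, here is a minimal sketch of a target that simply fails every io mapped to it, following the documented map() return convention. The "example" target name and function names are illustrative and are not provided by this patch:

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/device-mapper.h>

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	/* ti->table, type, begin and len are already filled in here */
	if (argc) {
		ti->error = "example: no arguments expected";
		return -EINVAL;
	}
	ti->private = NULL;
	return 0;
}

static void example_dtr(struct dm_target *ti)
{
	/* nothing was hidden in ti->private */
}

static int example_map(struct dm_target *ti, struct buffer_head *bh,
		       int rw, union map_info *map_context)
{
	return -EIO;		/* < 0: error every io in this range */
}

static struct target_type example_target = {
	name:	"example",
	module:	THIS_MODULE,
	ctr:	example_ctr,
	dtr:	example_dtr,
	map:	example_map,
};

/*
 * Register from module init with dm_register_target(&example_target)
 * and remove with dm_unregister_target(&example_target) on exit.
 */
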
9411--- linux-2.4.21/include/linux/dm-ioctl.h Thu Jan 1 01:00:00 1970
9412+++ linux/include/linux/dm-ioctl.h Wed Aug 20 14:41:38 2003
9413@@ -0,0 +1,237 @@
9414+/*
9415+ * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
9416+ *
9417+ * This file is released under the LGPL.
9418+ */
9419+
9420+#ifndef _LINUX_DM_IOCTL_H
9421+#define _LINUX_DM_IOCTL_H
9422+
9423+#include <linux/types.h>
9424+
9425+#define DM_DIR "mapper" /* Slashes not supported */
9426+#define DM_MAX_TYPE_NAME 16
9427+#define DM_NAME_LEN 128
9428+#define DM_UUID_LEN 129
9429+
9430+/*
9431+ * A traditional ioctl interface for the device mapper.
9432+ *
9433+ * Each device can have two tables associated with it, an
9434+ * 'active' table which is the one currently used by io passing
9435+ * through the device, and an 'inactive' one which is a table
9436+ * that is being prepared as a replacement for the 'active' one.
9437+ *
9438+ * DM_VERSION:
9439+ * Just get the version information for the ioctl interface.
9440+ *
9441+ * DM_REMOVE_ALL:
9442+ * Remove all dm devices, destroy all tables. Only really used
9443+ * for debug.
9444+ *
9445+ * DM_LIST_DEVICES:
9446+ * Get a list of all the dm device names.
9447+ *
9448+ * DM_DEV_CREATE:
9449+ * Create a new device, neither the 'active' or 'inactive' table
9450+ * slots will be filled. The device will be in suspended state
9451+ * after creation, however any io to the device will get errored
9452+ * since it will be out-of-bounds.
9453+ *
9454+ * DM_DEV_REMOVE:
9455+ * Remove a device, destroy any tables.
9456+ *
9457+ * DM_DEV_RENAME:
9458+ * Rename a device.
9459+ *
9460+ * DM_DEV_SUSPEND:
9461+ * This performs both suspend and resume, depending which flag is
9462+ * passed in.
9463+ * Suspend: This command will not return until all pending io to
9464+ * the device has completed. Further io will be deferred until
9465+ * the device is resumed.
9466+ * Resume: It is no longer an error to issue this command on an
9467+ * unsuspended device. If a table is present in the 'inactive'
9468+ * slot, it will be moved to the active slot, then the old table
9469+ * from the active slot will be _destroyed_. Finally the device
9470+ * is resumed.
9471+ *
9472+ * DM_DEV_STATUS:
9473+ * Retrieves the status for the table in the 'active' slot.
9474+ *
9475+ * DM_DEV_WAIT:
9476+ * Wait for a significant event to occur to the device. This
9477+ * could either be caused by an event triggered by one of the
9478+ * targets of the table in the 'active' slot, or a table change.
9479+ *
9480+ * DM_TABLE_LOAD:
9481+ * Load a table into the 'inactive' slot for the device. The
9482+ * device does _not_ need to be suspended prior to this command.
9483+ *
9484+ * DM_TABLE_CLEAR:
9485+ * Destroy any table in the 'inactive' slot (ie. abort).
9486+ *
9487+ * DM_TABLE_DEPS:
9488+ * Return a set of device dependencies for the 'active' table.
9489+ *
9490+ * DM_TABLE_STATUS:
9491+ * Return the targets status for the 'active' table.
9492+ */
9493+
9494+/*
9495+ * All ioctl arguments consist of a single chunk of memory, with
9495+ * this structure at the start. If a uuid is specified, any
9496+ * lookup (e.g. for a DM_DEV_STATUS) will be done on that, *not* the
9498+ * name.
9499+ */
9500+struct dm_ioctl {
9501+ /*
9502+ * The version number is made up of three parts:
9503+ * major - no backward or forward compatibility,
9504+ * minor - only backwards compatible,
9505+ * patch - both backwards and forwards compatible.
9506+ *
9507+ * All clients of the ioctl interface should fill in the
9508+ * version number of the interface that they were
9509+ * compiled with.
9510+ *
9511+ * All recognised ioctl commands (ie. those that don't
9512+ * return -ENOTTY) fill out this field, even if the
9513+ * command failed.
9514+ */
9515+ uint32_t version[3]; /* in/out */
9516+ uint32_t data_size; /* total size of data passed in
9517+ * including this struct */
9518+
9519+ uint32_t data_start; /* offset to start of data
9520+ * relative to start of this struct */
9521+
9522+ uint32_t target_count; /* in/out */
9523+ int32_t open_count; /* out */
9524+ uint32_t flags; /* in/out */
9525+ uint32_t event_nr; /* in/out */
9526+ uint32_t padding;
9527+
9528+ uint64_t dev; /* in/out */
9529+
9530+ char name[DM_NAME_LEN]; /* device name */
9531+ char uuid[DM_UUID_LEN]; /* unique identifier for
9532+ * the block device */
9533+};
9534+
9535+/*
9536+ * Used to specify tables. These structures appear after the
9537+ * dm_ioctl.
9538+ */
9539+struct dm_target_spec {
9540+ uint64_t sector_start;
9541+ uint64_t length;
9542+ int32_t status; /* used when reading from kernel only */
9543+
9544+ /*
9545+ * Offset in bytes (from the start of this struct) to
9546+ * next target_spec.
9547+ */
9548+ uint32_t next;
9549+
9550+ char target_type[DM_MAX_TYPE_NAME];
9551+
9552+ /*
9553+ * Parameter string starts immediately after this object.
9554+ * Be careful to add padding after string to ensure correct
9555+ * alignment of subsequent dm_target_spec.
9556+ */
9557+};
9558+
9559+/*
9560+ * Used to retrieve the target dependencies.
9561+ */
9562+struct dm_target_deps {
9563+ uint32_t count; /* Array size */
9564+ uint32_t padding; /* unused */
9565+ uint64_t dev[0]; /* out */
9566+};
9567+
9568+/*
9569+ * Used to get a list of all dm devices.
9570+ */
9571+struct dm_name_list {
9572+ uint64_t dev;
9573+ uint32_t next; /* offset to the next record from
9574+ the _start_ of this */
9575+ char name[0];
9576+};
9577+
9578+/*
9579+ * If you change this make sure you make the corresponding change
9580+ * to dm-ioctl.c:lookup_ioctl()
9581+ */
9582+enum {
9583+ /* Top level cmds */
9584+ DM_VERSION_CMD = 0,
9585+ DM_REMOVE_ALL_CMD,
9586+ DM_LIST_DEVICES_CMD,
9587+
9588+ /* device level cmds */
9589+ DM_DEV_CREATE_CMD,
9590+ DM_DEV_REMOVE_CMD,
9591+ DM_DEV_RENAME_CMD,
9592+ DM_DEV_SUSPEND_CMD,
9593+ DM_DEV_STATUS_CMD,
9594+ DM_DEV_WAIT_CMD,
9595+
9596+ /* Table level cmds */
9597+ DM_TABLE_LOAD_CMD,
9598+ DM_TABLE_CLEAR_CMD,
9599+ DM_TABLE_DEPS_CMD,
9600+ DM_TABLE_STATUS_CMD,
9601+};
9602+
9603+#define DM_IOCTL 0xfd
9604+
9605+#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
9606+#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
9607+#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
9608+
9609+#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
9610+#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
9611+#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
9612+#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
9613+#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
9614+#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
9615+
9616+#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
9617+#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
9618+#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
9619+#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
9620+
9621+#define DM_VERSION_MAJOR 4
9622+#define DM_VERSION_MINOR 0
9623+#define DM_VERSION_PATCHLEVEL 3
9624+#define DM_VERSION_EXTRA "-ioctl (2003-08-22)"
9625+
9626+/* Status bits */
9627+#define DM_READONLY_FLAG (1 << 0) /* In/Out */
9628+#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */
9629+#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
9630+
9631+/*
9632+ * Flag passed into ioctl STATUS command to get table information
9633+ * rather than current status.
9634+ */
9635+#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */
9636+
9637+/*
9638+ * Flags that indicate whether a table is present in either of
9639+ * the two table slots that a device has.
9640+ */
9641+#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */
9642+#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
9643+
9644+/*
9645+ * Indicates that the buffer passed in wasn't big enough for the
9646+ * results.
9647+ */
9648+#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
9649+
9650+#endif /* _LINUX_DM_IOCTL_H */
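
A minimal userspace sketch of driving this interface follows, assuming the usual /dev/mapper/control node for the control device (the node path is an assumption here, not something this header defines):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl dmi;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0)
		return 1;

	memset(&dmi, 0, sizeof(dmi));

	/* announce the interface version this program was compiled against */
	dmi.version[0] = DM_VERSION_MAJOR;
	dmi.version[1] = DM_VERSION_MINOR;
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);

	if (ioctl(fd, DM_VERSION, &dmi) < 0) {
		close(fd);
		return 1;
	}

	/* the kernel fills version[] back in with its own interface version */
	printf("dm-ioctl interface %u.%u.%u\n",
	       dmi.version[0], dmi.version[1], dmi.version[2]);

	close(fd);
	return 0;
}
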
9651--- linux-2.4.21/include/linux/fs.h Fri Jun 13 16:32:51 2003
9652+++ linux/include/linux/fs.h Wed Aug 20 14:41:32 2003
9653@@ -263,7 +263,7 @@
9654 struct page *b_page; /* the page this bh is mapped to */
9655 void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
9656 void *b_private; /* reserved for b_end_io */
9657-
9658+ void *b_journal_head; /* ext3 journal_heads */
9659 unsigned long b_rsector; /* Real buffer location on disk */
9660 wait_queue_head_t b_wait;
9661
9662--- linux-2.4.21/include/linux/jbd.h Fri Jun 13 16:32:51 2003
9663+++ linux/include/linux/jbd.h Wed Aug 20 14:41:32 2003
9664@@ -311,7 +311,7 @@
9665
9666 static inline struct journal_head *bh2jh(struct buffer_head *bh)
9667 {
9668- return bh->b_private;
9669+ return bh->b_journal_head;
9670 }
9671
9672 #define HAVE_JOURNAL_CALLBACK_STATUS
9673--- linux-2.4.21/include/linux/mempool.h Thu Jan 1 01:00:00 1970
9674+++ linux/include/linux/mempool.h Wed Aug 20 14:41:48 2003
9675@@ -0,0 +1,31 @@
9676+/*
9677+ * memory buffer pool support
9678+ */
9679+#ifndef _LINUX_MEMPOOL_H
9680+#define _LINUX_MEMPOOL_H
9681+
9682+#include <linux/list.h>
9683+#include <linux/wait.h>
9684+
9685+struct mempool_s;
9686+typedef struct mempool_s mempool_t;
9687+
9688+typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
9689+typedef void (mempool_free_t)(void *element, void *pool_data);
9690+
9691+extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9692+ mempool_free_t *free_fn, void *pool_data);
9693+extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
9694+extern void mempool_destroy(mempool_t *pool);
9695+extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
9696+extern void mempool_free(void *element, mempool_t *pool);
9697+
9698+/*
9699+ * A mempool_alloc_t and mempool_free_t that get the memory from
9700+ * a slab that is passed in through pool_data.
9701+ */
9702+void *mempool_alloc_slab(int gfp_mask, void *pool_data);
9703+void mempool_free_slab(void *element, void *pool_data);
9704+
9705+
9706+#endif /* _LINUX_MEMPOOL_H */
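
The pattern used by kcopyd above is the typical way to use this interface: back the pool with a slab cache so a minimum number of elements is always available under memory pressure. A minimal sketch, in which the "foo" names and the pool size of 16 are purely illustrative:

#include <linux/errno.h>
#include <linux/mempool.h>
#include <linux/slab.h>

struct foo {
	int value;
};

static kmem_cache_t *_foo_cache;
static mempool_t *_foo_pool;

static int foo_pool_init(void)
{
	_foo_cache = kmem_cache_create("foo", sizeof(struct foo),
				       __alignof__(struct foo), 0, NULL, NULL);
	if (!_foo_cache)
		return -ENOMEM;

	/* guarantee that 16 elements are always available */
	_foo_pool = mempool_create(16, mempool_alloc_slab,
				   mempool_free_slab, _foo_cache);
	if (!_foo_pool) {
		kmem_cache_destroy(_foo_cache);
		return -ENOMEM;
	}

	return 0;
}

static void foo_pool_exit(void)
{
	mempool_destroy(_foo_pool);
	kmem_cache_destroy(_foo_cache);
}

/*
 * In process context an allocation from the pool never fails:
 *
 *	struct foo *f = mempool_alloc(_foo_pool, GFP_NOIO);
 *	...
 *	mempool_free(f, _foo_pool);
 */
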
9707--- linux-2.4.21/include/linux/vmalloc.h Fri Jan 10 16:35:58 2003
9708+++ linux/include/linux/vmalloc.h Wed Aug 20 14:41:57 2003
9709@@ -26,6 +26,7 @@
9710 extern void vmfree_area_pages(unsigned long address, unsigned long size);
9711 extern int vmalloc_area_pages(unsigned long address, unsigned long size,
9712 int gfp_mask, pgprot_t prot);
9713+extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
9714
9715 /*
9716 * Allocate any pages
9717--- linux-2.4.21/kernel/ksyms.c Fri Jun 13 16:32:52 2003
9718+++ linux/kernel/ksyms.c Wed Aug 20 14:41:57 2003
9719@@ -112,6 +112,7 @@
9720 EXPORT_SYMBOL(vfree);
9721 EXPORT_SYMBOL(__vmalloc);
9722 EXPORT_SYMBOL(vmalloc_to_page);
9723+EXPORT_SYMBOL(vcalloc);
9724 EXPORT_SYMBOL(mem_map);
9725 EXPORT_SYMBOL(remap_page_range);
9726 EXPORT_SYMBOL(max_mapnr);
9727--- linux-2.4.21/mm/Makefile Fri Jan 10 16:36:02 2003
9728+++ linux/mm/Makefile Wed Aug 20 14:41:48 2003
9729@@ -9,12 +9,12 @@
9730
9731 O_TARGET := mm.o
9732
9733-export-objs := shmem.o filemap.o memory.o page_alloc.o
9734+export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
9735
9736 obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
9737 vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
9738 page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
9739- shmem.o
9740+ shmem.o mempool.o
9741
9742 obj-$(CONFIG_HIGHMEM) += highmem.o
9743
9744--- linux-2.4.21/mm/filemap.c Fri Jun 13 16:33:25 2003
9745+++ linux/mm/filemap.c Wed Aug 20 14:41:53 2003
9746@@ -1704,8 +1704,10 @@
9747 retval = generic_file_direct_IO(READ, filp, buf, count, pos);
9748 if (retval > 0)
9749 *ppos = pos + retval;
9750+
9751 }
9752- UPDATE_ATIME(filp->f_dentry->d_inode);
9753+ if (!S_ISBLK(inode->i_mode))
9754+ UPDATE_ATIME(filp->f_dentry->d_inode);
9755 goto out;
9756 }
9757 }
9758--- linux-2.4.21/mm/mempool.c Thu Jan 1 01:00:00 1970
9759+++ linux/mm/mempool.c Wed Aug 20 14:41:48 2003
9760@@ -0,0 +1,299 @@
9761+/*
9762+ * linux/mm/mempool.c
9763+ *
9764+ * memory buffer pool support. Such pools are mostly used
9765+ * for guaranteed, deadlock-free memory allocations during
9766+ * extreme VM load.
9767+ *
9768+ * started by Ingo Molnar, Copyright (C) 2001
9769+ */
9770+
9771+#include <linux/mm.h>
9772+#include <linux/slab.h>
9773+#include <linux/module.h>
9774+#include <linux/mempool.h>
9775+
9776+struct mempool_s {
9777+ spinlock_t lock;
9778+ int min_nr; /* nr of elements at *elements */
9779+ int curr_nr; /* Current nr of elements at *elements */
9780+ void **elements;
9781+
9782+ void *pool_data;
9783+ mempool_alloc_t *alloc;
9784+ mempool_free_t *free;
9785+ wait_queue_head_t wait;
9786+};
9787+
9788+static void add_element(mempool_t *pool, void *element)
9789+{
9790+ BUG_ON(pool->curr_nr >= pool->min_nr);
9791+ pool->elements[pool->curr_nr++] = element;
9792+}
9793+
9794+static void *remove_element(mempool_t *pool)
9795+{
9796+ BUG_ON(pool->curr_nr <= 0);
9797+ return pool->elements[--pool->curr_nr];
9798+}
9799+
9800+static void free_pool(mempool_t *pool)
9801+{
9802+ while (pool->curr_nr) {
9803+ void *element = remove_element(pool);
9804+ pool->free(element, pool->pool_data);
9805+ }
9806+ kfree(pool->elements);
9807+ kfree(pool);
9808+}
9809+
9810+/**
9811+ * mempool_create - create a memory pool
9812+ * @min_nr: the minimum number of elements guaranteed to be
9813+ * allocated for this pool.
9814+ * @alloc_fn: user-defined element-allocation function.
9815+ * @free_fn: user-defined element-freeing function.
9816+ * @pool_data: optional private data available to the user-defined functions.
9817+ *
9818+ * this function creates and allocates a guaranteed size, preallocated
9819+ * memory pool. The pool can be used from the mempool_alloc and mempool_free
9820+ * functions. This function might sleep. Both the alloc_fn() and the free_fn()
9821+ * functions might sleep - as long as the mempool_alloc function is not called
9822+ * from IRQ contexts.
9823+ */
9824+mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9825+ mempool_free_t *free_fn, void *pool_data)
9826+{
9827+ mempool_t *pool;
9828+
9829+ pool = kmalloc(sizeof(*pool), GFP_KERNEL);
9830+ if (!pool)
9831+ return NULL;
9832+ memset(pool, 0, sizeof(*pool));
9833+ pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
9834+ if (!pool->elements) {
9835+ kfree(pool);
9836+ return NULL;
9837+ }
9838+ spin_lock_init(&pool->lock);
9839+ pool->min_nr = min_nr;
9840+ pool->pool_data = pool_data;
9841+ init_waitqueue_head(&pool->wait);
9842+ pool->alloc = alloc_fn;
9843+ pool->free = free_fn;
9844+
9845+ /*
9846+ * First pre-allocate the guaranteed number of buffers.
9847+ */
9848+ while (pool->curr_nr < pool->min_nr) {
9849+ void *element;
9850+
9851+ element = pool->alloc(GFP_KERNEL, pool->pool_data);
9852+ if (unlikely(!element)) {
9853+ free_pool(pool);
9854+ return NULL;
9855+ }
9856+ add_element(pool, element);
9857+ }
9858+ return pool;
9859+}
9860+
9861+/**
9862+ * mempool_resize - resize an existing memory pool
9863+ * @pool: pointer to the memory pool which was allocated via
9864+ * mempool_create().
9865+ * @new_min_nr: the new minimum number of elements guaranteed to be
9866+ * allocated for this pool.
9867+ * @gfp_mask: the usual allocation bitmask.
9868+ *
9869+ * This function shrinks/grows the pool. In the case of growing,
9870+ * it cannot be guaranteed that the pool will be grown to the new
9871+ * size immediately, but new mempool_free() calls will refill it.
9872+ *
9873+ * Note, the caller must guarantee that no mempool_destroy is called
9874+ * while this function is running. mempool_alloc() & mempool_free()
9875+ * might be called (eg. from IRQ contexts) while this function executes.
9876+ */
9877+int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
9878+{
9879+ void *element;
9880+ void **new_elements;
9881+ unsigned long flags;
9882+
9883+ BUG_ON(new_min_nr <= 0);
9884+
9885+ spin_lock_irqsave(&pool->lock, flags);
9886+ if (new_min_nr < pool->min_nr) {
9887+ while (pool->curr_nr > new_min_nr) {
9888+ element = remove_element(pool);
9889+ spin_unlock_irqrestore(&pool->lock, flags);
9890+ pool->free(element, pool->pool_data);
9891+ spin_lock_irqsave(&pool->lock, flags);
9892+ }
9893+ pool->min_nr = new_min_nr;
9894+ goto out_unlock;
9895+ }
9896+ spin_unlock_irqrestore(&pool->lock, flags);
9897+
9898+ /* Grow the pool */
9899+ new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
9900+ if (!new_elements)
9901+ return -ENOMEM;
9902+
9903+ spin_lock_irqsave(&pool->lock, flags);
9904+ memcpy(new_elements, pool->elements,
9905+ pool->curr_nr * sizeof(*new_elements));
9906+ kfree(pool->elements);
9907+ pool->elements = new_elements;
9908+ pool->min_nr = new_min_nr;
9909+
9910+ while (pool->curr_nr < pool->min_nr) {
9911+ spin_unlock_irqrestore(&pool->lock, flags);
9912+ element = pool->alloc(gfp_mask, pool->pool_data);
9913+ if (!element)
9914+ goto out;
9915+ spin_lock_irqsave(&pool->lock, flags);
9916+ if (pool->curr_nr < pool->min_nr)
9917+ add_element(pool, element);
9918+ else
9919+ kfree(element); /* Raced */
9920+ }
9921+out_unlock:
9922+ spin_unlock_irqrestore(&pool->lock, flags);
9923+out:
9924+ return 0;
9925+}
9926+
9927+/**
9928+ * mempool_destroy - deallocate a memory pool
9929+ * @pool: pointer to the memory pool which was allocated via
9930+ * mempool_create().
9931+ *
9932+ * this function only sleeps if the free_fn() function sleeps. The caller
9933+ * has to guarantee that all elements have been returned to the pool (ie:
9934+ * freed) prior to calling mempool_destroy().
9935+ */
9936+void mempool_destroy(mempool_t *pool)
9937+{
9938+ if (pool->curr_nr != pool->min_nr)
9939+ BUG(); /* There were outstanding elements */
9940+ free_pool(pool);
9941+}
9942+
9943+/**
9944+ * mempool_alloc - allocate an element from a specific memory pool
9945+ * @pool: pointer to the memory pool which was allocated via
9946+ * mempool_create().
9947+ * @gfp_mask: the usual allocation bitmask.
9948+ *
9949+ * this function only sleeps if the alloc_fn function sleeps or
9950+ * returns NULL. Note that due to preallocation, this function
9951+ * *never* fails when called from process contexts. (it might
9952+ * fail if called from an IRQ context.)
9953+ */
9954+void * mempool_alloc(mempool_t *pool, int gfp_mask)
9955+{
9956+ void *element;
9957+ unsigned long flags;
9958+ int curr_nr;
9959+ DECLARE_WAITQUEUE(wait, current);
9960+ int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
9961+
9962+repeat_alloc:
9963+ element = pool->alloc(gfp_nowait, pool->pool_data);
9964+ if (likely(element != NULL))
9965+ return element;
9966+
9967+ /*
9968+ * If the pool is less than 50% full then try harder
9969+ * to allocate an element:
9970+ */
9971+ if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
9972+ element = pool->alloc(gfp_mask, pool->pool_data);
9973+ if (likely(element != NULL))
9974+ return element;
9975+ }
9976+
9977+ /*
9978+ * Kick the VM at this point.
9979+ */
9980+ wakeup_bdflush();
9981+
9982+ spin_lock_irqsave(&pool->lock, flags);
9983+ if (likely(pool->curr_nr)) {
9984+ element = remove_element(pool);
9985+ spin_unlock_irqrestore(&pool->lock, flags);
9986+ return element;
9987+ }
9988+ spin_unlock_irqrestore(&pool->lock, flags);
9989+
9990+ /* We must not sleep in the GFP_ATOMIC case */
9991+ if (gfp_mask == gfp_nowait)
9992+ return NULL;
9993+
9994+ run_task_queue(&tq_disk);
9995+
9996+ add_wait_queue_exclusive(&pool->wait, &wait);
9997+ set_task_state(current, TASK_UNINTERRUPTIBLE);
9998+
9999+ spin_lock_irqsave(&pool->lock, flags);
10000+ curr_nr = pool->curr_nr;
10001+ spin_unlock_irqrestore(&pool->lock, flags);
10002+
10003+ if (!curr_nr)
10004+ schedule();
10005+
10006+ current->state = TASK_RUNNING;
10007+ remove_wait_queue(&pool->wait, &wait);
10008+
10009+ goto repeat_alloc;
10010+}
10011+
10012+/**
10013+ * mempool_free - return an element to the pool.
10014+ * @element: pool element pointer.
10015+ * @pool: pointer to the memory pool which was allocated via
10016+ * mempool_create().
10017+ *
10018+ * this function only sleeps if the free_fn() function sleeps.
10019+ */
10020+void mempool_free(void *element, mempool_t *pool)
10021+{
10022+ unsigned long flags;
10023+
10024+ if (pool->curr_nr < pool->min_nr) {
10025+ spin_lock_irqsave(&pool->lock, flags);
10026+ if (pool->curr_nr < pool->min_nr) {
10027+ add_element(pool, element);
10028+ spin_unlock_irqrestore(&pool->lock, flags);
10029+ wake_up(&pool->wait);
10030+ return;
10031+ }
10032+ spin_unlock_irqrestore(&pool->lock, flags);
10033+ }
10034+ pool->free(element, pool->pool_data);
10035+}
10036+
10037+/*
10038+ * A commonly used alloc and free fn.
10039+ */
10040+void *mempool_alloc_slab(int gfp_mask, void *pool_data)
10041+{
10042+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10043+ return kmem_cache_alloc(mem, gfp_mask);
10044+}
10045+
10046+void mempool_free_slab(void *element, void *pool_data)
10047+{
10048+ kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10049+ kmem_cache_free(mem, element);
10050+}
10051+
10052+
10053+EXPORT_SYMBOL(mempool_create);
10054+EXPORT_SYMBOL(mempool_resize);
10055+EXPORT_SYMBOL(mempool_destroy);
10056+EXPORT_SYMBOL(mempool_alloc);
10057+EXPORT_SYMBOL(mempool_free);
10058+EXPORT_SYMBOL(mempool_alloc_slab);
10059+EXPORT_SYMBOL(mempool_free_slab);
10060--- linux-2.4.21/mm/vmalloc.c Fri Jun 13 16:33:25 2003
10061+++ linux/mm/vmalloc.c Wed Aug 20 14:41:57 2003
10062@@ -327,3 +327,22 @@
10063 read_unlock(&vmlist_lock);
10064 return buf - buf_start;
10065 }
10066+
10067+void *vcalloc(unsigned long nmemb, unsigned long elem_size)
10068+{
10069+ unsigned long size;
10070+ void *addr;
10071+
10072+ /*
10073+ * Check that we're not going to overflow.
10074+ */
10075+ if (nmemb > (ULONG_MAX / elem_size))
10076+ return NULL;
10077+
10078+ size = nmemb * elem_size;
10079+ addr = vmalloc(size);
10080+ if (addr)
10081+ memset(addr, 0, size);
10082+
10083+ return addr;
10084+}
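
For illustration, the intended use is a zeroed array that may be too large for kmalloc; the struct entry type and num_entries count below are hypothetical:

	struct entry *table = vcalloc(num_entries, sizeof(*table));
	if (!table)
		return -ENOMEM;
	/* ... use table[0 .. num_entries - 1], all fields start out zeroed ... */
	vfree(table);
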
10085Supply #targets when creating a table to avoid needing to extend it later.
10086--- linux-2.4.21/drivers/md/dm-ioctl.c Mon Aug 18 21:24:26 2003
10087+++ linux/drivers/md/dm-ioctl.c Fri Aug 22 13:49:01 2003
10088@@ -764,7 +764,7 @@
10089 struct hash_cell *hc;
10090 struct dm_table *t;
10091
10092- r = dm_table_create(&t, get_mode(param));
10093+ r = dm_table_create(&t, get_mode(param), param->target_count);
10094 if (r)
10095 return r;
10096
10097--- linux-2.4.21/drivers/md/dm-table.c Tue Aug 19 15:43:50 2003
10098+++ linux/drivers/md/dm-table.c Fri Aug 22 14:48:50 2003
10099@@ -148,7 +148,7 @@
10100 return 0;
10101 }
10102
10103-int dm_table_create(struct dm_table **result, int mode)
10104+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
10105 {
10106 struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
10107
10108@@ -159,8 +159,10 @@
10109 INIT_LIST_HEAD(&t->devices);
10110 atomic_set(&t->holders, 1);
10111
10112- /* allocate a single nodes worth of targets to begin with */
10113- if (alloc_targets(t, KEYS_PER_NODE)) {
10114+ if (!num_targets)
10115+ num_targets = KEYS_PER_NODE;
10116+
10117+ if (alloc_targets(t, num_targets)) {
10118 kfree(t);
10119 t = NULL;
10120 return -ENOMEM;
10121--- linux-2.4.21/drivers/md/dm.h Sat Jul 12 17:06:52 2003
10122+++ linux/drivers/md/dm.h Fri Aug 22 13:50:19 2003
10123@@ -96,7 +96,7 @@
10124 * Functions for manipulating a table. Tables are also reference
10125 * counted.
10126 *---------------------------------------------------------------*/
10127-int dm_table_create(struct dm_table **result, int mode);
10128+int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
10129
10130 void dm_table_get(struct dm_table *t);
10131 void dm_table_put(struct dm_table *t);