1 --- linux-2.4.21/Documentation/Configure.help Fri Jun 13 16:32:30 2003
2 +++ linux/Documentation/Configure.help Wed Aug 20 14:41:36 2003
4 want), say M here and read <file:Documentation/modules.txt>. The
5 module will be called lvm-mod.o.
9 + Device-mapper is a low-level volume manager. It works by allowing
10 + people to specify mappings for ranges of logical sectors. Various
11 + mapping types are available; in addition, people may write their own
12 + modules containing custom mappings if they wish.
14 + Higher level volume managers such as LVM2 use this driver.
16 + If you want to compile this as a module, say M here and read
17 + <file:Documentation/modules.txt>. The module will be called dm-mod.o.
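  A rough illustration of the "mappings for ranges of logical sectors"
  mentioned above (devices and sizes are hypothetical, and the exact
  invocation depends on the device-mapper userspace installed, e.g. LVM2's
  dmsetup): a mapping table is a list of "start length target args" lines,
  counted in 512-byte sectors.

      # concatenate two 1GiB disks into one 2GiB logical device
      0       2097152 linear /dev/sda 0
      2097152 2097152 linear /dev/sdb 0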
21 Multiple devices driver support (RAID and LVM)
23 Support multiple physical spindles through a single logical device.
24 --- linux-2.4.21/MAINTAINERS Fri Jun 13 16:32:30 2003
25 +++ linux/MAINTAINERS Wed Aug 20 14:41:36 2003
27 W: http://www.debian.org/~dz/i8k/
33 +L: linux-LVM@sistina.com
34 +W: http://www.sistina.com/lvm
37 DEVICE NUMBER REGISTRY
40 --- linux-2.4.21/arch/mips64/kernel/ioctl32.c Fri Jan 10 16:34:18 2003
41 +++ linux/arch/mips64/kernel/ioctl32.c Wed Aug 20 14:41:28 2003
43 #include <linux/auto_fs.h>
44 #include <linux/ext2_fs.h>
45 #include <linux/raid/md_u.h>
46 +#include <linux/dm-ioctl.h>
48 #include <scsi/scsi.h>
49 #undef __KERNEL__ /* This file was born to be ugly ... */
51 IOCTL32_DEFAULT(STOP_ARRAY_RO),
52 IOCTL32_DEFAULT(RESTART_ARRAY_RW),
53 #endif /* CONFIG_MD */
55 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
56 + IOCTL32_DEFAULT(DM_VERSION),
57 + IOCTL32_DEFAULT(DM_REMOVE_ALL),
58 + IOCTL32_DEFAULT(DM_DEV_CREATE),
59 + IOCTL32_DEFAULT(DM_DEV_REMOVE),
60 + IOCTL32_DEFAULT(DM_TABLE_LOAD),
61 + IOCTL32_DEFAULT(DM_DEV_SUSPEND),
62 + IOCTL32_DEFAULT(DM_DEV_RENAME),
63 + IOCTL32_DEFAULT(DM_TABLE_DEPS),
64 + IOCTL32_DEFAULT(DM_DEV_STATUS),
65 + IOCTL32_DEFAULT(DM_TABLE_STATUS),
66 + IOCTL32_DEFAULT(DM_DEV_WAIT),
67 + IOCTL32_DEFAULT(DM_LIST_DEVICES),
68 + IOCTL32_DEFAULT(DM_TABLE_CLEAR),
69 +#endif /* CONFIG_BLK_DEV_DM */
71 IOCTL32_DEFAULT(MTIOCTOP), /* mtio.h ioctls */
72 IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans),
73 --- linux-2.4.21/arch/parisc/kernel/ioctl32.c Fri Jun 13 16:32:32 2003
74 +++ linux/arch/parisc/kernel/ioctl32.c Wed Aug 20 14:41:28 2003
77 #include <linux/lvm.h>
79 +#include <linux/dm-ioctl.h>
81 #include <scsi/scsi.h>
83 @@ -3418,6 +3419,22 @@
84 COMPATIBLE_IOCTL(LV_BMAP)
85 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
88 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
89 +COMPATIBLE_IOCTL(DM_VERSION)
90 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
91 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
92 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
93 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
94 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
95 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
96 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
97 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
98 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
99 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
100 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
101 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
102 +#endif /* CONFIG_BLK_DEV_DM */
103 #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
104 COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
105 COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
106 --- linux-2.4.21/arch/ppc64/kernel/ioctl32.c Fri Jun 13 16:32:33 2003
107 +++ linux/arch/ppc64/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
109 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
110 #include <linux/lvm.h>
112 +#include <linux/dm-ioctl.h>
114 #include <scsi/scsi.h>
116 @@ -4423,6 +4424,22 @@
117 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
118 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
119 COMPATIBLE_IOCTL(NBD_DISCONNECT),
121 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
122 +COMPATIBLE_IOCTL(DM_VERSION),
123 +COMPATIBLE_IOCTL(DM_REMOVE_ALL),
124 +COMPATIBLE_IOCTL(DM_DEV_CREATE),
125 +COMPATIBLE_IOCTL(DM_DEV_REMOVE),
126 +COMPATIBLE_IOCTL(DM_TABLE_LOAD),
127 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
128 +COMPATIBLE_IOCTL(DM_DEV_RENAME),
129 +COMPATIBLE_IOCTL(DM_TABLE_DEPS),
130 +COMPATIBLE_IOCTL(DM_DEV_STATUS),
131 +COMPATIBLE_IOCTL(DM_TABLE_STATUS),
132 +COMPATIBLE_IOCTL(DM_DEV_WAIT),
133 +COMPATIBLE_IOCTL(DM_LIST_DEVICES),
134 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
135 +#endif /* CONFIG_BLK_DEV_DM */
136 /* Remove *PRIVATE in 2.5 */
137 COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
138 COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
139 --- linux-2.4.21/arch/s390x/kernel/ioctl32.c Fri Jan 10 16:34:26 2003
140 +++ linux/arch/s390x/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
142 #include <linux/ext2_fs.h>
143 #include <linux/hdreg.h>
144 #include <linux/if_bonding.h>
145 +#include <linux/dm-ioctl.h>
146 #include <asm/types.h>
147 #include <asm/uaccess.h>
148 #include <asm/dasd.h>
150 IOCTL32_DEFAULT(VT_UNLOCKSWITCH),
152 IOCTL32_DEFAULT(SIOCGSTAMP),
154 + IOCTL32_DEFAULT(DM_VERSION),
155 + IOCTL32_DEFAULT(DM_REMOVE_ALL),
156 + IOCTL32_DEFAULT(DM_DEV_CREATE),
157 + IOCTL32_DEFAULT(DM_DEV_REMOVE),
158 + IOCTL32_DEFAULT(DM_TABLE_LOAD),
159 + IOCTL32_DEFAULT(DM_DEV_SUSPEND),
160 + IOCTL32_DEFAULT(DM_DEV_RENAME),
161 + IOCTL32_DEFAULT(DM_TABLE_DEPS),
162 + IOCTL32_DEFAULT(DM_DEV_STATUS),
163 + IOCTL32_DEFAULT(DM_TABLE_STATUS),
164 + IOCTL32_DEFAULT(DM_DEV_WAIT),
165 + IOCTL32_DEFAULT(DM_LIST_DEVICES),
166 + IOCTL32_DEFAULT(DM_TABLE_CLEAR),
168 IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32),
169 IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf),
170 --- linux-2.4.21/arch/sparc64/kernel/ioctl32.c Fri Jun 13 16:32:34 2003
171 +++ linux/arch/sparc64/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
173 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
174 #include <linux/lvm.h>
176 +#include <linux/dm-ioctl.h>
178 #include <scsi/scsi.h>
180 @@ -5076,6 +5077,22 @@
181 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
182 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
183 COMPATIBLE_IOCTL(NBD_DISCONNECT)
185 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
186 +COMPATIBLE_IOCTL(DM_VERSION)
187 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
188 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
189 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
190 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
191 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
192 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
193 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
194 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
195 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
196 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
197 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
198 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
199 +#endif /* CONFIG_BLK_DEV_DM */
201 #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
202 COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
203 --- linux-2.4.21/arch/x86_64/ia32/ia32_ioctl.c Fri Jun 13 16:32:35 2003
204 +++ linux/arch/x86_64/ia32/ia32_ioctl.c Wed Aug 20 14:41:29 2003
207 #include <linux/lvm.h>
209 +#include <linux/dm-ioctl.h>
211 #include <scsi/scsi.h>
213 @@ -4047,6 +4048,22 @@
214 COMPATIBLE_IOCTL(LV_BMAP)
215 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
218 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
219 +COMPATIBLE_IOCTL(DM_VERSION)
220 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
221 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
222 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
223 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
224 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
225 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
226 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
227 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
228 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
229 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
230 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
231 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
232 +#endif /* CONFIG_BLK_DEV_DM */
233 #ifdef CONFIG_AUTOFS_FS
234 COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
235 COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
236 --- linux-2.4.21/drivers/md/Config.in Fri Jan 10 16:34:50 2003
237 +++ linux/drivers/md/Config.in Wed Aug 20 14:41:36 2003
239 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
241 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
242 +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
243 +dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
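  With these entries, a kernel configuration enabling device-mapper as
  modules would contain something like the following (values illustrative);
  dep_tristate ensures CONFIG_BLK_DEV_DM_MIRROR can only be set if
  CONFIG_BLK_DEV_DM is, and at most to 'm' when dm itself is modular:

      CONFIG_BLK_DEV_DM=m
      CONFIG_BLK_DEV_DM_MIRROR=m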
246 --- linux-2.4.21/drivers/md/Makefile Fri Jan 10 16:34:50 2003
247 +++ linux/drivers/md/Makefile Wed Aug 20 14:41:44 2003
252 -export-objs := md.o xor.o
253 -list-multi := lvm-mod.o
254 +export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \
255 + dm-log.o dm-io.o dm.o
257 +list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o
258 lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o
259 +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \
260 + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
261 + kcopyd.o dm-daemon.o dm-io.o
262 +dm-mirror-mod-objs := dm-raid1.o dm-log.o
264 # Note: link order is important. All raid personalities
265 # and xor.o must come before md.o, as they each initialise
266 # themselves, and md.o may use the personalities when it
269 -obj-$(CONFIG_MD_LINEAR) += linear.o
270 -obj-$(CONFIG_MD_RAID0) += raid0.o
271 -obj-$(CONFIG_MD_RAID1) += raid1.o
272 -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
273 -obj-$(CONFIG_MD_MULTIPATH) += multipath.o
274 -obj-$(CONFIG_BLK_DEV_MD) += md.o
275 -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
276 +obj-$(CONFIG_MD_LINEAR) += linear.o
277 +obj-$(CONFIG_MD_RAID0) += raid0.o
278 +obj-$(CONFIG_MD_RAID1) += raid1.o
279 +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
280 +obj-$(CONFIG_MD_MULTIPATH) += multipath.o
281 +obj-$(CONFIG_BLK_DEV_MD) += md.o
283 +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
285 +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
286 +obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
288 include $(TOPDIR)/Rules.make
290 lvm-mod.o: $(lvm-mod-objs)
291 $(LD) -r -o $@ $(lvm-mod-objs)
293 +dm-mod.o: $(dm-mod-objs)
294 + $(LD) -r -o $@ $(dm-mod-objs)
296 +dm-mirror.o: $(dm-mirror-mod-objs)
297 + $(LD) -r -o $@ $(dm-mirror-mod-objs)
299 --- linux-2.4.21/drivers/md/dm-daemon.c Thu Jan 1 01:00:00 1970
300 +++ linux/drivers/md/dm-daemon.c Wed Aug 20 14:41:38 2003
303 + * Copyright (C) 2003 Sistina Software
305 + * This file is released under the LGPL.
309 +#include "dm-daemon.h"
311 +#include <linux/module.h>
312 +#include <linux/sched.h>
314 +static int daemon(void *arg)
316 + struct dm_daemon *dd = (struct dm_daemon *) arg;
317 + DECLARE_WAITQUEUE(wq, current);
320 + reparent_to_init();
322 + /* block all signals */
323 + spin_lock_irq(&current->sigmask_lock);
324 + sigfillset(&current->blocked);
325 + flush_signals(current);
326 + spin_unlock_irq(&current->sigmask_lock);
328 + strcpy(current->comm, dd->name);
329 + atomic_set(&dd->please_die, 0);
331 + add_wait_queue(&dd->job_queue, &wq);
333 + down(&dd->run_lock);
334 + up(&dd->start_lock);
337 + * dd->fn() could do anything; very likely it will
338 + * suspend. So we can't set the state to
339 + * TASK_INTERRUPTIBLE before calling it. In order to
340 + * prevent a race with a waking thread we do this little
341 + * dance with the dd->woken variable.
345 + set_current_state(TASK_RUNNING);
347 + if (atomic_read(&dd->please_die))
350 + atomic_set(&dd->woken, 0);
354 + set_current_state(TASK_INTERRUPTIBLE);
355 + } while (atomic_read(&dd->woken));
361 + remove_wait_queue(&dd->job_queue, &wq);
366 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
371 + * Initialise the dm_daemon.
374 + strncpy(dd->name, name, sizeof(dd->name) - 1);
375 + sema_init(&dd->start_lock, 1);
376 + sema_init(&dd->run_lock, 1);
377 + init_waitqueue_head(&dd->job_queue);
380 + * Start the new thread.
382 + down(&dd->start_lock);
383 + pid = kernel_thread(daemon, dd, 0);
385 + DMERR("Failed to start %s thread", name);
390 + * wait for the daemon to up this mutex.
392 + down(&dd->start_lock);
393 + up(&dd->start_lock);
398 +void dm_daemon_stop(struct dm_daemon *dd)
400 + atomic_set(&dd->please_die, 1);
401 + dm_daemon_wake(dd);
402 + down(&dd->run_lock);
406 +void dm_daemon_wake(struct dm_daemon *dd)
408 + atomic_set(&dd->woken, 1);
409 + wake_up_interruptible(&dd->job_queue);
412 +EXPORT_SYMBOL(dm_daemon_start);
413 +EXPORT_SYMBOL(dm_daemon_stop);
414 +EXPORT_SYMBOL(dm_daemon_wake);
415 --- linux-2.4.21/drivers/md/dm-daemon.h Thu Jan 1 01:00:00 1970
416 +++ linux/drivers/md/dm-daemon.h Wed Aug 20 14:41:38 2003
419 + * Copyright (C) 2003 Sistina Software
421 + * This file is released under the LGPL.
427 +#include <asm/atomic.h>
428 +#include <asm/semaphore.h>
433 + atomic_t please_die;
434 + struct semaphore start_lock;
435 + struct semaphore run_lock;
438 + wait_queue_head_t job_queue;
441 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
442 +void dm_daemon_stop(struct dm_daemon *dd);
443 +void dm_daemon_wake(struct dm_daemon *dd);
444 +int dm_daemon_running(struct dm_daemon *dd);
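  A minimal usage sketch of this interface, based only on the prototypes
  above (the daemon name and work function are hypothetical; the work
  function is expected to drain whatever queue its callers fill before
  waking it):

      static struct dm_daemon _example_daemon;

      static void do_work(void)
      {
              /* process all jobs queued so far; may block */
      }

      /* module init */
      dm_daemon_start(&_example_daemon, "exampled", do_work);

      /* producer side, after queueing a job */
      dm_daemon_wake(&_example_daemon);

      /* module exit */
      dm_daemon_stop(&_example_daemon);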
447 --- linux-2.4.21/drivers/md/dm-exception-store.c Thu Jan 1 01:00:00 1970
448 +++ linux/drivers/md/dm-exception-store.c Wed Aug 20 14:41:38 2003
453 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
455 + * This file is released under the GPL.
458 +#include "dm-snapshot.h"
462 +#include <linux/mm.h>
463 +#include <linux/pagemap.h>
464 +#include <linux/vmalloc.h>
465 +#include <linux/slab.h>
467 +/*-----------------------------------------------------------------
468 + * Persistent snapshots; by persistent we mean that the snapshot
469 + * will survive a reboot.
470 + *---------------------------------------------------------------*/
473 + * We need to store a record of which parts of the origin have
474 + * been copied to the snapshot device. The snapshot code
475 + * requires that we copy exception chunks to chunk aligned areas
476 + * of the COW store. It makes sense therefore, to store the
477 + * metadata in chunk size blocks.
479 + * There is no backward or forward compatibility implemented,
480 + * snapshots with different disk versions than the kernel will
481 + * not be usable. It is expected that "lvcreate" will blank out
482 + * the start of a fresh COW device before calling the snapshot
485 + * The first chunk of the COW device just contains the header.
486 + * After this there is a chunk filled with exception metadata,
487 + * followed by as many exception chunks as can fit in the
490 + * All on disk structures are in little-endian format. The end
491 + * of the exceptions info is indicated by an exception with a
492 + * new_chunk of 0, which is invalid since it would point to the
497 + * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
499 +#define SNAP_MAGIC 0x70416e53
502 + * The on-disk version of the metadata.
504 +#define SNAPSHOT_DISK_VERSION 1
506 +struct disk_header {
510 + * Is this snapshot valid? There is no way of recovering
511 + * an invalid snapshot.
516 + * Simple, incrementing version. No backward
522 + uint32_t chunk_size;
525 +struct disk_exception {
526 + uint64_t old_chunk;
527 + uint64_t new_chunk;
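  A worked example of the layout described above, using a hypothetical
  64-sector (32KiB) chunk size and the 16-byte struct disk_exception just
  defined:

      exceptions_per_area = (64 << SECTOR_SHIFT) / 16 = 32768 / 16 = 2048

      chunk 0            header (struct disk_header)
      chunk 1            metadata area 0
      chunks 2..2049     the 2048 exception copies described by area 0
      chunk 2050         metadata area 1, and so on

  This is what area_io() below computes with
  chunk = 1 + (exceptions_per_area + 1) * area, and why a freshly created
  store starts allocating at next_free = 2 (header plus first metadata area
  skipped).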
530 +struct commit_callback {
531 + void (*callback)(void *, int success);
536 + * The top level structure for a persistent exception store.
539 + struct dm_snapshot *snap; /* up pointer to my snapshot */
542 + uint32_t chunk_size;
543 + uint32_t exceptions_per_area;
546 + * Now that we have an asynchronous kcopyd there is no
547 + * need for large chunk sizes, so it won't hurt to have a
548 + * whole chunk's worth of metadata in memory at once.
553 + * Used to keep track of which metadata area the data in
554 + * 'chunk' refers to.
556 + uint32_t current_area;
559 + * The next free chunk for an exception.
561 + uint32_t next_free;
564 + * The index of next free exception in the current
567 + uint32_t current_committed;
569 + atomic_t pending_count;
570 + uint32_t callback_count;
571 + struct commit_callback *callbacks;
574 +static inline unsigned int sectors_to_pages(unsigned int sectors)
576 + return sectors / (PAGE_SIZE / SECTOR_SIZE);
579 +static int alloc_area(struct pstore *ps)
582 + size_t i, len, nr_pages;
583 + struct page *page, *last = NULL;
585 + len = ps->chunk_size << SECTOR_SHIFT;
588 + * Allocate the chunk_size block of memory that will hold
589 + * a single metadata area.
591 + ps->area = vmalloc(len);
595 + nr_pages = sectors_to_pages(ps->chunk_size);
598 + * We lock the pages for ps->area into memory since
599 + * they'll be doing a lot of io. We also chain them
600 + * together ready for dm-io.
602 + for (i = 0; i < nr_pages; i++) {
603 + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
606 + last->list.next = &page->list;
613 +static void free_area(struct pstore *ps)
615 + size_t i, nr_pages;
618 + nr_pages = sectors_to_pages(ps->chunk_size);
619 + for (i = 0; i < nr_pages; i++) {
620 + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
621 + page->list.next = NULL;
629 + * Read or write a chunk aligned and sized block of data from a device.
631 +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
633 + struct io_region where;
636 + where.dev = ps->snap->cow->dev;
637 + where.sector = ps->chunk_size * chunk;
638 + where.count = ps->chunk_size;
640 + return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
644 + * Read or write a metadata area, remembering to skip the first
645 + * chunk, which holds the header.
647 +static int area_io(struct pstore *ps, uint32_t area, int rw)
652 + /* convert a metadata area index to a chunk index */
653 + chunk = 1 + ((ps->exceptions_per_area + 1) * area);
655 + r = chunk_io(ps, chunk, rw);
659 + ps->current_area = area;
663 +static int zero_area(struct pstore *ps, uint32_t area)
665 + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
666 + return area_io(ps, area, WRITE);
669 +static int read_header(struct pstore *ps, int *new_snapshot)
672 + struct disk_header *dh;
674 + r = chunk_io(ps, 0, READ);
678 + dh = (struct disk_header *) ps->area;
680 + if (le32_to_cpu(dh->magic) == 0) {
683 + } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
685 + ps->valid = le32_to_cpu(dh->valid);
686 + ps->version = le32_to_cpu(dh->version);
687 + ps->chunk_size = le32_to_cpu(dh->chunk_size);
690 + DMWARN("Invalid/corrupt snapshot");
697 +static int write_header(struct pstore *ps)
699 + struct disk_header *dh;
701 + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
703 + dh = (struct disk_header *) ps->area;
704 + dh->magic = cpu_to_le32(SNAP_MAGIC);
705 + dh->valid = cpu_to_le32(ps->valid);
706 + dh->version = cpu_to_le32(ps->version);
707 + dh->chunk_size = cpu_to_le32(ps->chunk_size);
709 + return chunk_io(ps, 0, WRITE);
713 + * Access functions for the disk exceptions, these do the endian conversions.
715 +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
717 + if (index >= ps->exceptions_per_area)
720 + return ((struct disk_exception *) ps->area) + index;
723 +static int read_exception(struct pstore *ps,
724 + uint32_t index, struct disk_exception *result)
726 + struct disk_exception *e;
728 + e = get_exception(ps, index);
733 + result->old_chunk = le64_to_cpu(e->old_chunk);
734 + result->new_chunk = le64_to_cpu(e->new_chunk);
739 +static int write_exception(struct pstore *ps,
740 + uint32_t index, struct disk_exception *de)
742 + struct disk_exception *e;
744 + e = get_exception(ps, index);
749 + e->old_chunk = cpu_to_le64(de->old_chunk);
750 + e->new_chunk = cpu_to_le64(de->new_chunk);
756 + * Registers the exceptions that are present in the current area.
757 + * 'full' is filled in to indicate if the area has been
760 +static int insert_exceptions(struct pstore *ps, int *full)
764 + struct disk_exception de;
766 + /* presume the area is full */
769 + for (i = 0; i < ps->exceptions_per_area; i++) {
770 + r = read_exception(ps, i, &de);
776 + * If the new_chunk is pointing at the start of
777 + * the COW device, where the first metadata area
778 + * is, we know that we've hit the end of the
779 + * exceptions. Therefore the area is not full.
781 + if (de.new_chunk == 0LL) {
782 + ps->current_committed = i;
788 + * Keep track of the start of the free chunks.
790 + if (ps->next_free <= de.new_chunk)
791 + ps->next_free = de.new_chunk + 1;
794 + * Otherwise we add the exception to the snapshot.
796 + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
804 +static int read_exceptions(struct pstore *ps)
810 + * Keep reading chunks and inserting exceptions until
811 + * we find a partially full area.
813 + for (area = 0; full; area++) {
814 + r = area_io(ps, area, READ);
818 + r = insert_exceptions(ps, &full);
826 +static inline struct pstore *get_info(struct exception_store *store)
828 + return (struct pstore *) store->context;
831 +static void persistent_fraction_full(struct exception_store *store,
832 + sector_t *numerator, sector_t *denominator)
834 + *numerator = get_info(store)->next_free * store->snap->chunk_size;
835 + *denominator = get_dev_size(store->snap->cow->dev);
838 +static void persistent_destroy(struct exception_store *store)
840 + struct pstore *ps = get_info(store);
842 + dm_io_put(sectors_to_pages(ps->chunk_size));
843 + vfree(ps->callbacks);
848 +static int persistent_read_metadata(struct exception_store *store)
850 + int r, new_snapshot;
851 + struct pstore *ps = get_info(store);
854 + * Read the snapshot header.
856 + r = read_header(ps, &new_snapshot);
861 + * Do we need to setup a new snapshot ?
863 + if (new_snapshot) {
864 + r = write_header(ps);
866 + DMWARN("write_header failed");
870 + r = zero_area(ps, 0);
872 + DMWARN("zero_area(0) failed");
881 + DMWARN("snapshot is marked invalid");
885 + if (ps->version != SNAPSHOT_DISK_VERSION) {
886 + DMWARN("unable to handle snapshot disk version %d",
892 + * Read the metadata.
894 + r = read_exceptions(ps);
902 +static int persistent_prepare(struct exception_store *store,
903 + struct exception *e)
905 + struct pstore *ps = get_info(store);
907 + sector_t size = get_dev_size(store->snap->cow->dev);
909 + /* Is there enough room ? */
910 + if (size < ((ps->next_free + 1) * store->snap->chunk_size))
913 + e->new_chunk = ps->next_free;
916 + * Move on to the next free pending, making sure to take
917 + * into account the location of the metadata chunks.
919 + stride = (ps->exceptions_per_area + 1);
920 + if ((++ps->next_free % stride) == 1)
923 + atomic_inc(&ps->pending_count);
927 +static void persistent_commit(struct exception_store *store,
928 + struct exception *e,
929 + void (*callback) (void *, int success),
930 + void *callback_context)
934 + struct pstore *ps = get_info(store);
935 + struct disk_exception de;
936 + struct commit_callback *cb;
938 + de.old_chunk = e->old_chunk;
939 + de.new_chunk = e->new_chunk;
940 + write_exception(ps, ps->current_committed++, &de);
943 + * Add the callback to the back of the array. This code
944 + * is the only place where the callback array is
945 + * manipulated, and we know that it will never be called
946 + * multiple times concurrently.
948 + cb = ps->callbacks + ps->callback_count++;
949 + cb->callback = callback;
950 + cb->context = callback_context;
953 + * If there are no more exceptions in flight, or we have
954 + * filled this metadata area we commit the exceptions to
957 + if (atomic_dec_and_test(&ps->pending_count) ||
958 + (ps->current_committed == ps->exceptions_per_area)) {
959 + r = area_io(ps, ps->current_area, WRITE);
963 + for (i = 0; i < ps->callback_count; i++) {
964 + cb = ps->callbacks + i;
965 + cb->callback(cb->context, r == 0 ? 1 : 0);
968 + ps->callback_count = 0;
972 + * Have we completely filled the current area ?
974 + if (ps->current_committed == ps->exceptions_per_area) {
975 + ps->current_committed = 0;
976 + r = zero_area(ps, ps->current_area + 1);
982 +static void persistent_drop(struct exception_store *store)
984 + struct pstore *ps = get_info(store);
987 + if (write_header(ps))
988 + DMWARN("write header failed");
991 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
996 + r = dm_io_get(sectors_to_pages(chunk_size));
1000 + /* allocate the pstore */
1001 + ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1007 + ps->snap = store->snap;
1009 + ps->version = SNAPSHOT_DISK_VERSION;
1010 + ps->chunk_size = chunk_size;
1011 + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
1012 + sizeof(struct disk_exception);
1013 + ps->next_free = 2; /* skipping the header and first area */
1014 + ps->current_committed = 0;
1016 + r = alloc_area(ps);
1021 + * Allocate space for all the callbacks.
1023 + ps->callback_count = 0;
1024 + atomic_set(&ps->pending_count, 0);
1025 + ps->callbacks = vcalloc(ps->exceptions_per_area,
1026 + sizeof(*ps->callbacks));
1028 + if (!ps->callbacks) {
1033 + store->destroy = persistent_destroy;
1034 + store->read_metadata = persistent_read_metadata;
1035 + store->prepare_exception = persistent_prepare;
1036 + store->commit_exception = persistent_commit;
1037 + store->drop_snapshot = persistent_drop;
1038 + store->fraction_full = persistent_fraction_full;
1039 + store->context = ps;
1044 + dm_io_put(sectors_to_pages(chunk_size));
1046 + if (ps->callbacks)
1047 + vfree(ps->callbacks);
1054 +/*-----------------------------------------------------------------
1055 + * Implementation of the store for non-persistent snapshots.
1056 + *---------------------------------------------------------------*/
1057 +struct transient_c {
1058 + sector_t next_free;
1061 +void transient_destroy(struct exception_store *store)
1063 + kfree(store->context);
1066 +int transient_read_metadata(struct exception_store *store)
1071 +int transient_prepare(struct exception_store *store, struct exception *e)
1073 + struct transient_c *tc = (struct transient_c *) store->context;
1074 + sector_t size = get_dev_size(store->snap->cow->dev);
1076 + if (size < (tc->next_free + store->snap->chunk_size))
1079 + e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
1080 + tc->next_free += store->snap->chunk_size;
1085 +void transient_commit(struct exception_store *store,
1086 + struct exception *e,
1087 + void (*callback) (void *, int success),
1088 + void *callback_context)
1090 + /* Just succeed */
1091 + callback(callback_context, 1);
1094 +static void transient_fraction_full(struct exception_store *store,
1095 + sector_t *numerator, sector_t *denominator)
1097 + *numerator = ((struct transient_c *) store->context)->next_free;
1098 + *denominator = get_dev_size(store->snap->cow->dev);
1101 +int dm_create_transient(struct exception_store *store,
1102 + struct dm_snapshot *s, int blocksize)
1104 + struct transient_c *tc;
1106 + memset(store, 0, sizeof(*store));
1107 + store->destroy = transient_destroy;
1108 + store->read_metadata = transient_read_metadata;
1109 + store->prepare_exception = transient_prepare;
1110 + store->commit_exception = transient_commit;
1111 + store->fraction_full = transient_fraction_full;
1114 + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
1118 + tc->next_free = 0;
1119 + store->context = tc;
1123 --- linux-2.4.21/drivers/md/dm-io.c Thu Jan 1 01:00:00 1970
1124 +++ linux/drivers/md/dm-io.c Wed Aug 20 14:41:38 2003
1127 + * Copyright (C) 2003 Sistina Software
1129 + * This file is released under the GPL.
1134 +#include <linux/mempool.h>
1135 +#include <linux/module.h>
1136 +#include <linux/slab.h>
1137 +#include <linux/sched.h>
1139 +/* FIXME: can we shrink this ? */
1140 +struct io_context {
1142 + unsigned int error;
1144 + struct task_struct *sleeper;
1145 + io_notify_fn callback;
1150 + * We maintain a pool of buffer heads for dispatching the io.
1152 +static unsigned int _num_bhs;
1153 +static mempool_t *_buffer_pool;
1156 + * io contexts are only dynamically allocated for asynchronous
1157 + * io. Since async io is likely to be the majority of io we'll
1158 + * have the same number of io contexts as buffer heads ! (FIXME:
1159 + * must reduce this).
1161 +mempool_t *_io_pool;
1163 +static void *alloc_bh(int gfp_mask, void *pool_data)
1165 + struct buffer_head *bh;
1167 + bh = kmem_cache_alloc(bh_cachep, gfp_mask);
1169 + bh->b_reqnext = NULL;
1170 + init_waitqueue_head(&bh->b_wait);
1171 + INIT_LIST_HEAD(&bh->b_inode_buffers);
1177 +static void *alloc_io(int gfp_mask, void *pool_data)
1179 + return kmalloc(sizeof(struct io_context), gfp_mask);
1182 +static void free_io(void *element, void *pool_data)
1187 +static unsigned int pages_to_buffers(unsigned int pages)
1189 + return 4 * pages; /* too many ? */
1192 +static int resize_pool(unsigned int new_bhs)
1196 + if (_buffer_pool) {
1197 + if (new_bhs == 0) {
1198 + /* free off the pools */
1199 + mempool_destroy(_buffer_pool);
1200 + mempool_destroy(_io_pool);
1201 + _buffer_pool = _io_pool = NULL;
1203 + /* resize the pools */
1204 + r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
1206 + r = mempool_resize(_io_pool,
1207 + new_bhs, GFP_KERNEL);
1210 + /* create new pools */
1211 + _buffer_pool = mempool_create(new_bhs, alloc_bh,
1212 + mempool_free_slab, bh_cachep);
1213 + if (!_buffer_pool)
1216 + _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
1218 + mempool_destroy(_buffer_pool);
1219 + _buffer_pool = NULL;
1225 + _num_bhs = new_bhs;
1230 +int dm_io_get(unsigned int num_pages)
1232 + return resize_pool(_num_bhs + pages_to_buffers(num_pages));
1235 +void dm_io_put(unsigned int num_pages)
1237 + resize_pool(_num_bhs - pages_to_buffers(num_pages));
1240 +/*-----------------------------------------------------------------
1241 + * We need to keep track of which region a buffer is doing io
1242 + * for. In order to save a memory allocation we store this in an
1243 + * unused field of the buffer head, and provide these access
1246 + * FIXME: add compile time check that an unsigned int can fit
1249 + *---------------------------------------------------------------*/
1250 +static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
1252 + bh->b_journal_head = (void *) region;
1255 +static inline int bh_get_region(struct buffer_head *bh)
1257 + return (unsigned int) bh->b_journal_head;
1260 +/*-----------------------------------------------------------------
1261 + * We need an io object to keep track of the number of bhs that
1262 + * have been dispatched for a particular io.
1263 + *---------------------------------------------------------------*/
1264 +static void dec_count(struct io_context *io, unsigned int region, int error)
1267 + set_bit(region, &io->error);
1269 + if (atomic_dec_and_test(&io->count)) {
1271 + wake_up_process(io->sleeper);
1274 + int r = io->error;
1275 + io_notify_fn fn = io->callback;
1276 + void *context = io->context;
1278 + mempool_free(io, _io_pool);
1284 +static void endio(struct buffer_head *bh, int uptodate)
1286 + struct io_context *io = (struct io_context *) bh->b_private;
1288 + if (!uptodate && io->rw != WRITE) {
1290 + * We need to zero this region, otherwise people
1291 + * like kcopyd may write the arbitrary contents
1294 + memset(bh->b_data, 0, bh->b_size);
1297 + dec_count((struct io_context *) bh->b_private,
1298 + bh_get_region(bh), !uptodate);
1299 + mempool_free(bh, _buffer_pool);
1303 + * Primitives for alignment calculations.
1305 +int fls(unsigned n)
1307 + return generic_fls32(n);
1310 +static inline int log2_floor(unsigned n)
1312 + return ffs(n) - 1;
1315 +static inline int log2_align(unsigned n)
1317 + return fls(n) - 1;
1321 + * Returns the next block for io.
1323 +static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
1324 + unsigned int block_size,
1325 + struct page *p, unsigned int offset,
1326 + unsigned int region, struct io_context *io)
1328 + struct buffer_head *bh;
1329 + sector_t b = *block;
1330 + sector_t blocks_per_page = PAGE_SIZE / block_size;
1331 + unsigned int this_size; /* holds the size of the current io */
1334 + while ((offset < PAGE_SIZE) && (b != end_block)) {
1335 + bh = mempool_alloc(_buffer_pool, GFP_NOIO);
1336 + init_buffer(bh, endio, io);
1337 + bh_set_region(bh, region);
1340 + * Block size must be a power of 2 and aligned
1343 + len = end_block - b;
1344 + this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
1345 + if (this_size > len)
1346 + this_size = 1 << log2_align(len);
1349 + * Add in the job offset.
1351 + bh->b_blocknr = (b / this_size);
1352 + bh->b_size = block_size * this_size;
1353 + set_bh_page(bh, p, offset);
1354 + bh->b_this_page = bh;
1357 + atomic_set(&bh->b_count, 1);
1359 + bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
1362 + if (io->rw == WRITE)
1363 + clear_bit(BH_Dirty, &bh->b_state);
1365 + atomic_inc(&io->count);
1366 + submit_bh(io->rw, bh);
1369 + offset += block_size * this_size;
1373 + return (b == end_block);
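  A worked example of the alignment logic above, with hypothetical numbers
  (512-byte hardware sector size, so blocks_per_page = 8). Starting at
  block b = 8 with len = 5 blocks left in the region:

      1 << log2_floor(8) = 8    alignment alone would allow an 8-block bh
      8 > len (5)               but only 5 blocks remain
      1 << log2_align(5) = 4    so a 4-block buffer head is submitted

  The loop then continues at b = 12 with one block left, which
  log2_align(1) reduces to a single-block buffer head.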
1376 +static void do_region(unsigned int region, struct io_region *where,
1377 + struct page *page, unsigned int offset,
1378 + struct io_context *io)
1380 + unsigned int block_size = get_hardsect_size(where->dev);
1381 + unsigned int sblock_size = block_size >> 9;
1382 + sector_t block = where->sector / sblock_size;
1383 + sector_t end_block = (where->sector + where->count) / sblock_size;
1386 + if (do_page(where->dev, &block, end_block, block_size,
1387 + page, offset, region, io))
1390 + offset = 0; /* only offset the first page */
1392 + page = list_entry(page->list.next, struct page, list);
1396 +static void dispatch_io(unsigned int num_regions, struct io_region *where,
1397 + struct page *pages, unsigned int offset,
1398 + struct io_context *io)
1402 + for (i = 0; i < num_regions; i++)
1403 + if (where[i].count)
1404 + do_region(i, where + i, pages, offset, io);
1407 + * Drop the extra reference that we were holding to avoid
1408 + * the io being completed too early.
1410 + dec_count(io, 0, 0);
1416 +int dm_io_sync(unsigned int num_regions, struct io_region *where,
1417 + int rw, struct page *pages, unsigned int offset,
1418 + unsigned int *error_bits)
1420 + struct io_context io;
1422 + BUG_ON(num_regions > 1 && rw != WRITE);
1426 + atomic_set(&io.count, 1); /* see dispatch_io() */
1427 + io.sleeper = current;
1429 + dispatch_io(num_regions, where, pages, offset, &io);
1430 + run_task_queue(&tq_disk);
1433 + set_current_state(TASK_UNINTERRUPTIBLE);
1435 + if (!atomic_read(&io.count))
1440 + set_current_state(TASK_RUNNING);
1442 + *error_bits = io.error;
1443 + return io.error ? -EIO : 0;
1449 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1450 + struct page *pages, unsigned int offset,
1451 + io_notify_fn fn, void *context)
1453 + struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
1457 + atomic_set(&io->count, 1); /* see dispatch_io() */
1458 + io->sleeper = NULL;
1459 + io->callback = fn;
1460 + io->context = context;
1462 + dispatch_io(num_regions, where, pages, offset, io);
1466 +EXPORT_SYMBOL(dm_io_get);
1467 +EXPORT_SYMBOL(dm_io_put);
1468 +EXPORT_SYMBOL(dm_io_sync);
1469 +EXPORT_SYMBOL(dm_io_async);
1470 --- linux-2.4.21/drivers/md/dm-io.h Thu Jan 1 01:00:00 1970
1471 +++ linux/drivers/md/dm-io.h Wed Aug 20 14:41:38 2003
1474 + * Copyright (C) 2003 Sistina Software
1476 + * This file is released under the GPL.
1484 +#include <linux/list.h>
1486 +/* Move these to bitops.h eventually */
1487 +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
1488 +/* (c) 2002, D.Phillips and Sistina Software */
1489 +/* Licensed under Version 2 of the GPL */
1491 +static unsigned generic_fls8(unsigned n)
1494 + n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5:
1495 + n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
1498 +static inline unsigned generic_fls16(unsigned n)
1500 + return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n);
1503 +static inline unsigned generic_fls32(unsigned n)
1505 + return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
1508 +/* FIXME make this configurable */
1509 +#define DM_MAX_IO_REGIONS 8
1519 + * 'error' is a bitset, with each bit indicating whether an error
1520 + * occurred doing io to the corresponding region.
1522 +typedef void (*io_notify_fn)(unsigned int error, void *context);
1526 + * Before anyone uses the IO interface they should call
1527 + * dm_io_get(), specifying roughly how many pages they are
1528 + * expecting to perform io on concurrently.
1530 + * This function may block.
1532 +int dm_io_get(unsigned int num_pages);
1533 +void dm_io_put(unsigned int num_pages);
1539 + * Please ensure that the rw flag in the next two functions is
1540 + * either READ or WRITE, i.e. we don't take READA. Any
1541 + * regions with a zero count field will be ignored.
1543 +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
1544 + struct page *pages, unsigned int offset,
1545 + unsigned int *error_bits);
1551 + * The 'where' array may be safely allocated on the stack since
1552 + * the function takes a copy.
1554 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1555 + struct page *pages, unsigned int offset,
1556 + io_notify_fn fn, void *context);
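  A minimal sketch of driving the synchronous interface above from inside
  the kernel (all values hypothetical; 'page' is assumed to be a page that
  has already been allocated and locked, 'dev' a valid kdev_t):

      struct io_region where;
      unsigned int error_bits;
      int r;

      dm_io_get(1);                   /* budget buffer heads for ~1 page of io */

      where.dev = dev;
      where.sector = 0;
      where.count = PAGE_SIZE >> 9;   /* one page worth of 512-byte sectors */

      r = dm_io_sync(1, &where, READ, page, 0, &error_bits);
      /* on failure, bit 0 of error_bits marks an error in region 0 */

      dm_io_put(1);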
1559 --- linux-2.4.21/drivers/md/dm-ioctl.c Thu Jan 1 01:00:00 1970
1560 +++ linux/drivers/md/dm-ioctl.c Wed Aug 20 14:41:38 2003
1563 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
1565 + * This file is released under the GPL.
1570 +#include <linux/module.h>
1571 +#include <linux/vmalloc.h>
1572 +#include <linux/miscdevice.h>
1573 +#include <linux/dm-ioctl.h>
1574 +#include <linux/init.h>
1575 +#include <linux/wait.h>
1576 +#include <linux/blk.h>
1577 +#include <linux/slab.h>
1579 +#include <asm/uaccess.h>
1581 +#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
1583 +/*-----------------------------------------------------------------
1584 + * The ioctl interface needs to be able to look up devices by
1586 + *---------------------------------------------------------------*/
1588 + struct list_head name_list;
1589 + struct list_head uuid_list;
1593 + struct mapped_device *md;
1594 + struct dm_table *new_map;
1596 + /* I hate devfs */
1597 + devfs_handle_t devfs_entry;
1600 +#define NUM_BUCKETS 64
1601 +#define MASK_BUCKETS (NUM_BUCKETS - 1)
1602 +static struct list_head _name_buckets[NUM_BUCKETS];
1603 +static struct list_head _uuid_buckets[NUM_BUCKETS];
1605 +static devfs_handle_t _dev_dir;
1606 +void dm_hash_remove_all(void);
1609 + * Guards access to both hash tables.
1611 +static DECLARE_RWSEM(_hash_lock);
1613 +static void init_buckets(struct list_head *buckets)
1617 + for (i = 0; i < NUM_BUCKETS; i++)
1618 + INIT_LIST_HEAD(buckets + i);
1621 +int dm_hash_init(void)
1623 + init_buckets(_name_buckets);
1624 + init_buckets(_uuid_buckets);
1625 + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
1629 +void dm_hash_exit(void)
1631 + dm_hash_remove_all();
1632 + devfs_unregister(_dev_dir);
1635 +/*-----------------------------------------------------------------
1637 + * We're not really concerned with the str hash function being
1638 + * fast since it's only used by the ioctl interface.
1639 + *---------------------------------------------------------------*/
1640 +static unsigned int hash_str(const char *str)
1642 + const unsigned int hash_mult = 2654435387U;
1643 + unsigned int h = 0;
1646 + h = (h + (unsigned int) *str++) * hash_mult;
1648 + return h & MASK_BUCKETS;
1651 +/*-----------------------------------------------------------------
1652 + * Code for looking up a device by name
1653 + *---------------------------------------------------------------*/
1654 +static struct hash_cell *__get_name_cell(const char *str)
1656 + struct list_head *tmp;
1657 + struct hash_cell *hc;
1658 + unsigned int h = hash_str(str);
1660 + list_for_each (tmp, _name_buckets + h) {
1661 + hc = list_entry(tmp, struct hash_cell, name_list);
1662 + if (!strcmp(hc->name, str))
1669 +static struct hash_cell *__get_uuid_cell(const char *str)
1671 + struct list_head *tmp;
1672 + struct hash_cell *hc;
1673 + unsigned int h = hash_str(str);
1675 + list_for_each (tmp, _uuid_buckets + h) {
1676 + hc = list_entry(tmp, struct hash_cell, uuid_list);
1677 + if (!strcmp(hc->uuid, str))
1684 +/*-----------------------------------------------------------------
1685 + * Inserting, removing and renaming a device.
1686 + *---------------------------------------------------------------*/
1687 +static inline char *kstrdup(const char *str)
1689 + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
1695 +static struct hash_cell *alloc_cell(const char *name, const char *uuid,
1696 + struct mapped_device *md)
1698 + struct hash_cell *hc;
1700 + hc = kmalloc(sizeof(*hc), GFP_KERNEL);
1704 + hc->name = kstrdup(name);
1714 + hc->uuid = kstrdup(uuid);
1722 + INIT_LIST_HEAD(&hc->name_list);
1723 + INIT_LIST_HEAD(&hc->uuid_list);
1725 + hc->new_map = NULL;
1729 +static void free_cell(struct hash_cell *hc)
1741 +static int register_with_devfs(struct hash_cell *hc)
1743 + kdev_t dev = dm_kdev(hc->md);
1746 + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
1747 + major(dev), minor(dev),
1748 + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
1749 + &dm_blk_dops, NULL);
1754 +static int unregister_with_devfs(struct hash_cell *hc)
1756 + devfs_unregister(hc->devfs_entry);
1761 + * The kdev_t and uuid of a device can never change once it is
1762 + * initially inserted.
1764 +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
1766 + struct hash_cell *cell;
1769 + * Allocate the new cells.
1771 + cell = alloc_cell(name, uuid, md);
1776 + * Insert the cell into both hash tables.
1778 + down_write(&_hash_lock);
1779 + if (__get_name_cell(name))
1782 + list_add(&cell->name_list, _name_buckets + hash_str(name));
1785 + if (__get_uuid_cell(uuid)) {
1786 + list_del(&cell->name_list);
1789 + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
1791 + register_with_devfs(cell);
1793 + up_write(&_hash_lock);
1798 + up_write(&_hash_lock);
1803 +void __hash_remove(struct hash_cell *hc)
1805 + /* remove from the dev hash */
1806 + list_del(&hc->uuid_list);
1807 + list_del(&hc->name_list);
1808 + unregister_with_devfs(hc);
1811 + dm_table_put(hc->new_map);
1815 +void dm_hash_remove_all(void)
1818 + struct hash_cell *hc;
1819 + struct list_head *tmp, *n;
1821 + down_write(&_hash_lock);
1822 + for (i = 0; i < NUM_BUCKETS; i++) {
1823 + list_for_each_safe (tmp, n, _name_buckets + i) {
1824 + hc = list_entry(tmp, struct hash_cell, name_list);
1825 + __hash_remove(hc);
1828 + up_write(&_hash_lock);
1831 +int dm_hash_rename(const char *old, const char *new)
1833 + char *new_name, *old_name;
1834 + struct hash_cell *hc;
1839 + new_name = kstrdup(new);
1843 + down_write(&_hash_lock);
1848 + hc = __get_name_cell(new);
1850 + DMWARN("asked to rename to an already existing name %s -> %s",
1852 + up_write(&_hash_lock);
1858 + * Is there such a device as 'old' ?
1860 + hc = __get_name_cell(old);
1862 + DMWARN("asked to rename a non existent device %s -> %s",
1864 + up_write(&_hash_lock);
1870 + * rename and move the name cell.
1872 + list_del(&hc->name_list);
1873 + old_name = hc->name;
1874 + hc->name = new_name;
1875 + list_add(&hc->name_list, _name_buckets + hash_str(new_name));
1877 + /* rename the device node in devfs */
1878 + unregister_with_devfs(hc);
1879 + register_with_devfs(hc);
1881 + up_write(&_hash_lock);
1886 +/*-----------------------------------------------------------------
1887 + * Implementation of the ioctl commands
1888 + *---------------------------------------------------------------*/
1890 + * All the ioctl commands get dispatched to functions with this
1893 +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
1895 +static int remove_all(struct dm_ioctl *param, size_t param_size)
1897 + dm_hash_remove_all();
1898 + param->data_size = 0;
1903 + * Round up the ptr to an 8-byte boundary.
1905 +#define ALIGN_MASK 7
1906 +static inline void *align_ptr(void *ptr)
1908 + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
1912 + * Retrieves the data payload buffer from an already allocated
1913 + * struct dm_ioctl.
1915 +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
1918 + param->data_start = align_ptr(param + 1) - (void *) param;
1920 + if (param->data_start < param_size)
1921 + *len = param_size - param->data_start;
1925 + return ((void *) param) + param->data_start;
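  For reference, a hedged sketch of the caller's side of this protocol
  (the control node path is illustrative, see the DM_DIR/control link
  created at the end of this file; the real definitions come from
  <linux/dm-ioctl.h>):

      struct dm_ioctl io;
      int fd = open("/dev/mapper/control", O_RDWR);

      memset(&io, 0, sizeof(io));
      io.version[0] = DM_VERSION_MAJOR;   /* checked by check_version() below */
      io.version[1] = DM_VERSION_MINOR;
      io.version[2] = DM_VERSION_PATCHLEVEL;
      io.data_size  = sizeof(io);         /* whole buffer, header included */
      io.data_start = sizeof(io);         /* any payload follows the header */

      ioctl(fd, DM_VERSION, &io);         /* kernel writes its version back */

  Commands that return data (DM_LIST_DEVICES, DM_TABLE_STATUS, ...) are
  given a larger buffer and read the result at data_start; if the buffer is
  too small the kernel sets DM_BUFFER_FULL_FLAG instead.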
1928 +static int list_devices(struct dm_ioctl *param, size_t param_size)
1931 + struct hash_cell *hc;
1932 + size_t len, needed = 0;
1933 + struct dm_name_list *nl, *old_nl = NULL;
1935 + down_write(&_hash_lock);
1938 + * Loop through all the devices working out how much
1941 + for (i = 0; i < NUM_BUCKETS; i++) {
1942 + list_for_each_entry (hc, _name_buckets + i, name_list) {
1943 + needed += sizeof(struct dm_name_list);
1944 + needed += strlen(hc->name);
1945 + needed += ALIGN_MASK;
1950 + * Grab our output buffer.
1952 + nl = get_result_buffer(param, param_size, &len);
1953 + if (len < needed) {
1954 + param->flags |= DM_BUFFER_FULL_FLAG;
1957 + param->data_size = param->data_start + needed;
1959 + nl->dev = 0; /* Flags no data */
1962 + * Now loop through filling out the names.
1964 + for (i = 0; i < NUM_BUCKETS; i++) {
1965 + list_for_each_entry (hc, _name_buckets + i, name_list) {
1967 + old_nl->next = (uint32_t) ((void *) nl -
1970 + nl->dev = dm_kdev(hc->md);
1972 + strcpy(nl->name, hc->name);
1975 + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
1980 + up_write(&_hash_lock);
1984 +static int check_name(const char *name)
1986 + if (strchr(name, '/')) {
1987 + DMWARN("invalid device name");
1995 + * Fills in a dm_ioctl structure, ready for sending back to
1998 +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
2000 + kdev_t dev = dm_kdev(md);
2001 + struct dm_table *table;
2002 + struct block_device *bdev;
2004 + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
2005 + DM_ACTIVE_PRESENT_FLAG);
2007 + if (dm_suspended(md))
2008 + param->flags |= DM_SUSPEND_FLAG;
2010 + param->dev = kdev_t_to_nr(dev);
2012 + if (is_read_only(dev))
2013 + param->flags |= DM_READONLY_FLAG;
2015 + param->event_nr = dm_get_event_nr(md);
2017 + table = dm_get_table(md);
2019 + param->flags |= DM_ACTIVE_PRESENT_FLAG;
2020 + param->target_count = dm_table_get_num_targets(table);
2021 + dm_table_put(table);
2023 + param->target_count = 0;
2025 + bdev = bdget(param->dev);
2028 + param->open_count = bdev->bd_openers;
2034 +static int dev_create(struct dm_ioctl *param, size_t param_size)
2038 + struct mapped_device *md;
2040 + r = check_name(param->name);
2044 + if (param->flags & DM_PERSISTENT_DEV_FLAG)
2045 + dev = to_kdev_t(param->dev);
2047 + r = dm_create(dev, &md);
2051 + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
2057 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2059 + r = __dev_status(md, param);
2066 + * Always use UUID for lookups if it's present, otherwise use name.
2068 +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
2070 + return *param->uuid ?
2071 + __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
2074 +static inline struct mapped_device *find_device(struct dm_ioctl *param)
2076 + struct hash_cell *hc;
2077 + struct mapped_device *md = NULL;
2079 + down_read(&_hash_lock);
2080 + hc = __find_device_hash_cell(param);
2085 + * Sneakily write in both the name and the uuid
2086 + * while we have the cell.
2088 + strncpy(param->name, hc->name, sizeof(param->name));
2090 + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
2092 + param->uuid[0] = '\0';
2095 + param->flags |= DM_INACTIVE_PRESENT_FLAG;
2097 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2101 + up_read(&_hash_lock);
2106 +static int dev_remove(struct dm_ioctl *param, size_t param_size)
2108 + struct hash_cell *hc;
2110 + down_write(&_hash_lock);
2111 + hc = __find_device_hash_cell(param);
2114 + DMWARN("device doesn't appear to be in the dev hash table.");
2115 + up_write(&_hash_lock);
2119 + __hash_remove(hc);
2120 + up_write(&_hash_lock);
2121 + param->data_size = 0;
2126 + * Check a string doesn't overrun the chunk of
2127 + * memory we copied from userland.
2129 +static int invalid_str(char *str, void *end)
2131 + while ((void *) str < end)
2138 +static int dev_rename(struct dm_ioctl *param, size_t param_size)
2141 + char *new_name = (char *) param + param->data_start;
2143 + if (new_name < (char *) (param + 1) ||
2144 + invalid_str(new_name, (void *) param + param_size)) {
2145 + DMWARN("Invalid new logical volume name supplied.");
2149 + r = check_name(new_name);
2153 + param->data_size = 0;
2154 + return dm_hash_rename(param->name, new_name);
2157 +static int do_suspend(struct dm_ioctl *param)
2160 + struct mapped_device *md;
2162 + md = find_device(param);
2166 + if (!dm_suspended(md))
2167 + r = dm_suspend(md);
2170 + r = __dev_status(md, param);
2176 +static int do_resume(struct dm_ioctl *param)
2179 + struct hash_cell *hc;
2180 + struct mapped_device *md;
2181 + struct dm_table *new_map;
2183 + down_write(&_hash_lock);
2185 + hc = __find_device_hash_cell(param);
2187 + DMWARN("device doesn't appear to be in the dev hash table.");
2188 + up_write(&_hash_lock);
2195 + new_map = hc->new_map;
2196 + hc->new_map = NULL;
2197 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2199 + up_write(&_hash_lock);
2201 + /* Do we need to load a new map ? */
2203 + /* Suspend if it isn't already suspended */
2204 + if (!dm_suspended(md))
2207 + r = dm_swap_table(md, new_map);
2210 + dm_table_put(new_map);
2214 + if (dm_table_get_mode(new_map) & FMODE_WRITE)
2215 + set_device_ro(dm_kdev(md), 0);
2217 + set_device_ro(dm_kdev(md), 1);
2219 + dm_table_put(new_map);
2222 + if (dm_suspended(md))
2223 + r = dm_resume(md);
2226 + r = __dev_status(md, param);
2233 + * Set or unset the suspension state of a device.
2234 + * If the device already is in the requested state we just return its status.
2236 +static int dev_suspend(struct dm_ioctl *param, size_t param_size)
2238 + if (param->flags & DM_SUSPEND_FLAG)
2239 + return do_suspend(param);
2241 + return do_resume(param);
2245 + * Copies device info back to user space, used by
2246 + * the create and info ioctls.
2248 +static int dev_status(struct dm_ioctl *param, size_t param_size)
2251 + struct mapped_device *md;
2253 + md = find_device(param);
2257 + r = __dev_status(md, param);
2262 +static inline int get_mode(struct dm_ioctl *param)
2264 + int mode = FMODE_READ | FMODE_WRITE;
2266 + if (param->flags & DM_READONLY_FLAG)
2267 + mode = FMODE_READ;
2272 +static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
2273 + struct dm_target_spec **spec, char **target_params)
2275 + *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
2276 + *target_params = (char *) (*spec + 1);
2278 + if (*spec < (last + 1))
2281 + return invalid_str(*target_params, end);
2284 +static int populate_table(struct dm_table *table, struct dm_ioctl *param,
2285 + size_t param_size)
2288 + unsigned int i = 0;
2289 + struct dm_target_spec *spec = (struct dm_target_spec *) param;
2290 + uint32_t next = param->data_start;
2291 + void *end = (void *) param + param_size;
2292 + char *target_params;
2294 + if (!param->target_count) {
2295 + DMWARN("populate_table: no targets specified");
2299 + for (i = 0; i < param->target_count; i++) {
2301 + r = next_target(spec, next, end, &spec, &target_params);
2303 + DMWARN("unable to find target");
2307 + r = dm_table_add_target(table, spec->target_type,
2308 + (sector_t) spec->sector_start,
2309 + (sector_t) spec->length,
2312 + DMWARN("error adding target to table");
2316 + next = spec->next;
2319 + return dm_table_complete(table);
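  For reference, the buffer populate_table() walks is packed by the caller
  roughly as follows (a hedged sketch; the field names come from
  struct dm_target_spec in <linux/dm-ioctl.h>, the device and sizes are
  hypothetical):

      /* a single 'linear' target of 204800 sectors, placed after the header */
      struct dm_target_spec *spec = (void *) param + param->data_start;
      char *args = (char *) (spec + 1);     /* parameter string follows the spec */

      spec->sector_start = 0;
      spec->length = 204800;
      strcpy(spec->target_type, "linear");
      strcpy(args, "/dev/sda1 0");          /* target-specific parameters */
      spec->next = 0;                       /* byte offset from this spec to the
                                               next one; only read when another
                                               target follows */
      param->target_count = 1;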
2322 +static int table_load(struct dm_ioctl *param, size_t param_size)
2325 + struct hash_cell *hc;
2326 + struct dm_table *t;
2328 + r = dm_table_create(&t, get_mode(param));
2332 + r = populate_table(t, param, param_size);
2338 + down_write(&_hash_lock);
2339 + hc = __find_device_hash_cell(param);
2341 + DMWARN("device doesn't appear to be in the dev hash table.");
2342 + up_write(&_hash_lock);
2347 + dm_table_put(hc->new_map);
2349 + param->flags |= DM_INACTIVE_PRESENT_FLAG;
2351 + r = __dev_status(hc->md, param);
2352 + up_write(&_hash_lock);
2356 +static int table_clear(struct dm_ioctl *param, size_t param_size)
2359 + struct hash_cell *hc;
2361 + down_write(&_hash_lock);
2363 + hc = __find_device_hash_cell(param);
2365 + DMWARN("device doesn't appear to be in the dev hash table.");
2366 + up_write(&_hash_lock);
2370 + if (hc->new_map) {
2371 + dm_table_put(hc->new_map);
2372 + hc->new_map = NULL;
2375 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2377 + r = __dev_status(hc->md, param);
2378 + up_write(&_hash_lock);
2383 + * Retrieves a list of devices used by a particular dm device.
2385 +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
2386 + size_t param_size)
2388 + unsigned int count = 0;
2389 + struct list_head *tmp;
2390 + size_t len, needed;
2391 + struct dm_target_deps *deps;
2393 + deps = get_result_buffer(param, param_size, &len);
2396 + * Count the devices.
2398 + list_for_each(tmp, dm_table_get_devices(table))
2402 + * Check we have enough space.
2404 + needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
2405 + if (len < needed) {
2406 + param->flags |= DM_BUFFER_FULL_FLAG;
2411 + * Fill in the devices.
2413 + deps->count = count;
2415 + list_for_each(tmp, dm_table_get_devices(table)) {
2416 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
2417 + deps->dev[count++] = dd->bdev->bd_dev;
2420 + param->data_size = param->data_start + needed;
2423 +static int table_deps(struct dm_ioctl *param, size_t param_size)
2426 + struct mapped_device *md;
2427 + struct dm_table *table;
2429 + md = find_device(param);
2433 + r = __dev_status(md, param);
2437 + table = dm_get_table(md);
2439 + retrieve_deps(table, param, param_size);
2440 + dm_table_put(table);
2449 + * Build up the status struct for each target
2451 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
2452 + size_t param_size)
2454 + unsigned int i, num_targets;
2455 + struct dm_target_spec *spec;
2456 + char *outbuf, *outptr;
2457 + status_type_t type;
2458 + size_t remaining, len, used = 0;
2460 + outptr = outbuf = get_result_buffer(param, param_size, &len);
2462 + if (param->flags & DM_STATUS_TABLE_FLAG)
2463 + type = STATUSTYPE_TABLE;
2465 + type = STATUSTYPE_INFO;
2467 + /* Get all the target info */
2468 + num_targets = dm_table_get_num_targets(table);
2469 + for (i = 0; i < num_targets; i++) {
2470 + struct dm_target *ti = dm_table_get_target(table, i);
2472 + remaining = len - (outptr - outbuf);
2473 + if (remaining < sizeof(struct dm_target_spec)) {
2474 + param->flags |= DM_BUFFER_FULL_FLAG;
2478 + spec = (struct dm_target_spec *) outptr;
2481 + spec->sector_start = ti->begin;
2482 + spec->length = ti->len;
2483 + strncpy(spec->target_type, ti->type->name,
2484 + sizeof(spec->target_type));
2486 + outptr += sizeof(struct dm_target_spec);
2487 + remaining = len - (outptr - outbuf);
2489 + /* Get the status/table string from the target driver */
2490 + if (ti->type->status) {
2491 + if (ti->type->status(ti, type, outptr, remaining)) {
2492 + param->flags |= DM_BUFFER_FULL_FLAG;
2498 + outptr += strlen(outptr) + 1;
2499 + used = param->data_start + (outptr - outbuf);
2501 + align_ptr(outptr);
2502 + spec->next = outptr - outbuf;
2506 + param->data_size = used;
2508 + param->target_count = num_targets;
2512 + * Return the status of a device as a text string for each
2515 +static int table_status(struct dm_ioctl *param, size_t param_size)
2518 + struct mapped_device *md;
2519 + struct dm_table *table;
2521 + md = find_device(param);
2525 + r = __dev_status(md, param);
2529 + table = dm_get_table(md);
2531 + retrieve_status(table, param, param_size);
2532 + dm_table_put(table);
2541 + * Wait for a device to report an event
2543 +static int dev_wait(struct dm_ioctl *param, size_t param_size)
2546 + struct mapped_device *md;
2547 + struct dm_table *table;
2548 + DECLARE_WAITQUEUE(wq, current);
2550 + md = find_device(param);
2555 + * Wait for a notification event
2557 + set_current_state(TASK_INTERRUPTIBLE);
2558 + if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
2560 + dm_remove_wait_queue(md, &wq);
2562 + set_current_state(TASK_RUNNING);
2565 + * The userland program is going to want to know what
2566 + * changed to trigger the event, so we may as well tell
2567 + * him and save an ioctl.
2569 + r = __dev_status(md, param);
2573 + table = dm_get_table(md);
2575 + retrieve_status(table, param, param_size);
2576 + dm_table_put(table);
2584 +/*-----------------------------------------------------------------
2585 + * Implementation of open/close/ioctl on the special char
2587 + *---------------------------------------------------------------*/
2588 +static ioctl_fn lookup_ioctl(unsigned int cmd)
2594 + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
2595 + {DM_REMOVE_ALL_CMD, remove_all},
2596 + {DM_LIST_DEVICES_CMD, list_devices},
2598 + {DM_DEV_CREATE_CMD, dev_create},
2599 + {DM_DEV_REMOVE_CMD, dev_remove},
2600 + {DM_DEV_RENAME_CMD, dev_rename},
2601 + {DM_DEV_SUSPEND_CMD, dev_suspend},
2602 + {DM_DEV_STATUS_CMD, dev_status},
2603 + {DM_DEV_WAIT_CMD, dev_wait},
2605 + {DM_TABLE_LOAD_CMD, table_load},
2606 + {DM_TABLE_CLEAR_CMD, table_clear},
2607 + {DM_TABLE_DEPS_CMD, table_deps},
2608 + {DM_TABLE_STATUS_CMD, table_status}
2611 + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
2615 + * As well as checking the version compatibility, this always
2616 + * copies the kernel's interface version out.
2618 +static int check_version(unsigned int cmd, struct dm_ioctl *user)
2620 + uint32_t version[3];
2623 + if (copy_from_user(version, user->version, sizeof(version)))
2626 + if ((DM_VERSION_MAJOR != version[0]) ||
2627 + (DM_VERSION_MINOR < version[1])) {
2628 + DMWARN("ioctl interface mismatch: "
2629 + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
2630 + DM_VERSION_MAJOR, DM_VERSION_MINOR,
2631 + DM_VERSION_PATCHLEVEL,
2632 + version[0], version[1], version[2], cmd);
2637 + * Fill in the kernel version.
2639 + version[0] = DM_VERSION_MAJOR;
2640 + version[1] = DM_VERSION_MINOR;
2641 + version[2] = DM_VERSION_PATCHLEVEL;
2642 + if (copy_to_user(user->version, version, sizeof(version)))
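The compatibility rule above is that the major numbers must match exactly and the caller's minor must not be newer than the kernel's. A minimal userspace sketch that queries the interface version, assuming the control node is reachable at /dev/mapper/control (the exact path depends on how the node was created):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl dmi;
	int fd = open("/dev/mapper/control", O_RDWR);	/* needs CAP_SYS_ADMIN */

	if (fd < 0)
		return 1;

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;	/* must equal the kernel's major */
	dmi.version[1] = DM_VERSION_MINOR;	/* must not exceed the kernel's minor */
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);		/* no payload beyond the header */

	if (ioctl(fd, DM_VERSION, &dmi) < 0) {
		close(fd);
		return 1;
	}

	/* check_version() always copies the kernel's version back out */
	printf("device-mapper ioctl interface %u.%u.%u\n",
	       dmi.version[0], dmi.version[1], dmi.version[2]);
	close(fd);
	return 0;
}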
2648 +static void free_params(struct dm_ioctl *param)
2653 +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
2655 + struct dm_ioctl tmp, *dmi;
2657 + if (copy_from_user(&tmp, user, sizeof(tmp)))
2660 + if (tmp.data_size < sizeof(tmp))
2663 + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
2667 + if (copy_from_user(dmi, user, tmp.data_size)) {
2676 +static int validate_params(uint cmd, struct dm_ioctl *param)
2678 + /* Always clear this flag */
2679 + param->flags &= ~DM_BUFFER_FULL_FLAG;
2681 + /* Ignores parameters */
2682 + if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
2685 + /* Unless creating, either name or uuid but not both */
2686 + if (cmd != DM_DEV_CREATE_CMD) {
2687 + if ((!*param->uuid && !*param->name) ||
2688 + (*param->uuid && *param->name)) {
2689 + DMWARN("one of name or uuid must be supplied, cmd(%u)",
2695 + /* Ensure strings are terminated */
2696 + param->name[DM_NAME_LEN - 1] = '\0';
2697 + param->uuid[DM_UUID_LEN - 1] = '\0';
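In other words, every command except DM_DEV_CREATE addresses an existing device by name or by uuid, but never both; creation supplies the name (and optionally a uuid) for the new device. A minimal sketch, assuming an already open control fd; the device name passed in is made up:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

static int create_device(int ctl_fd, const char *name)
{
	struct dm_ioctl dmi;

	memset(&dmi, 0, sizeof(dmi));
	dmi.version[0] = DM_VERSION_MAJOR;
	dmi.version[1] = DM_VERSION_MINOR;
	dmi.version[2] = DM_VERSION_PATCHLEVEL;
	dmi.data_size = sizeof(dmi);

	/* name only; the uuid stays empty, which is what validate_params()
	 * requires of every command other than DM_DEV_CREATE */
	strncpy(dmi.name, name, DM_NAME_LEN - 1);

	return ioctl(ctl_fd, DM_DEV_CREATE, &dmi);
}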
2702 +static int ctl_ioctl(struct inode *inode, struct file *file,
2703 + uint command, ulong u)
2707 + struct dm_ioctl *param;
2708 + struct dm_ioctl *user = (struct dm_ioctl *) u;
2709 + ioctl_fn fn = NULL;
2710 + size_t param_size;
2712 + /* only root can play with this */
2713 + if (!capable(CAP_SYS_ADMIN))
2716 + if (_IOC_TYPE(command) != DM_IOCTL)
2719 + cmd = _IOC_NR(command);
2722 + * Check the interface version passed in. This also
2723 + * writes out the kernel's interface version.
2725 + r = check_version(cmd, user);
2730 + * Nothing more to do for the version command.
2732 + if (cmd == DM_VERSION_CMD)
2735 + fn = lookup_ioctl(cmd);
2737 + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
2742 + * FIXME: I don't like this, we're trying to avoid low
2743 + * memory issues when a device is suspended.
2745 + current->flags |= PF_MEMALLOC;
2748 + * Copy the parameters into kernel space.
2750 + r = copy_params(user, &param);
2752 + current->flags &= ~PF_MEMALLOC;
2756 + r = validate_params(cmd, param);
2760 + param_size = param->data_size;
2761 + param->data_size = sizeof(*param);
2762 + r = fn(param, param_size);
2765 + * Copy the results back to userland.
2767 + if (!r && copy_to_user(user, param, param->data_size))
2771 + free_params(param);
2772 + current->flags &= ~PF_MEMALLOC;
2776 +static struct file_operations _ctl_fops = {
2777 + .ioctl = ctl_ioctl,
2778 + .owner = THIS_MODULE,
2781 +static devfs_handle_t _ctl_handle;
2783 +static struct miscdevice _dm_misc = {
2784 + .minor = MISC_DYNAMIC_MINOR,
2786 + .fops = &_ctl_fops
2790 + * Create misc character device and link to DM_DIR/control.
2792 +int __init dm_interface_init(void)
2797 + r = dm_hash_init();
2801 + r = misc_register(&_dm_misc);
2803 + DMERR("misc_register failed for control device");
2808 + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
2809 + sizeof rname - 3);
2811 + goto done; /* devfs not present */
2814 + DMERR("devfs_generate_path failed for control device");
2818 + strncpy(rname + r, "../", 3);
2819 + r = devfs_mk_symlink(NULL, DM_DIR "/control",
2820 + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
2822 + DMERR("devfs_mk_symlink failed for control device");
2825 + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
2828 + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
2829 + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
2834 + misc_deregister(&_dm_misc);
2839 +void dm_interface_exit(void)
2841 + if (misc_deregister(&_dm_misc) < 0)
2842 + DMERR("misc_deregister failed for control device");
2846 --- linux-2.4.21/drivers/md/dm-linear.c Thu Jan 1 01:00:00 1970
2847 +++ linux/drivers/md/dm-linear.c Wed Aug 20 14:41:38 2003
2850 + * Copyright (C) 2001 Sistina Software (UK) Limited.
2852 + * This file is released under the GPL.
2857 +#include <linux/module.h>
2858 +#include <linux/init.h>
2859 +#include <linux/blkdev.h>
2860 +#include <linux/slab.h>
2863 + * Linear: maps a linear range of a device.
2866 + struct dm_dev *dev;
2871 + * Construct a linear mapping: <dev_path> <offset>
2873 +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2875 + struct linear_c *lc;
2878 + ti->error = "dm-linear: Invalid argument count";
2882 + lc = kmalloc(sizeof(*lc), GFP_KERNEL);
2884 + ti->error = "dm-linear: Cannot allocate linear context";
2888 + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
2889 + ti->error = "dm-linear: Invalid device sector";
2893 + if (dm_get_device(ti, argv[0], lc->start, ti->len,
2894 + dm_table_get_mode(ti->table), &lc->dev)) {
2895 + ti->error = "dm-linear: Device lookup failed";
2907 +static void linear_dtr(struct dm_target *ti)
2909 + struct linear_c *lc = (struct linear_c *) ti->private;
2911 + dm_put_device(ti, lc->dev);
2915 +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
2916 + union map_info *map_context)
2918 + struct linear_c *lc = (struct linear_c *) ti->private;
2920 + bh->b_rdev = lc->dev->dev;
2921 + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
2926 +static int linear_status(struct dm_target *ti, status_type_t type,
2927 + char *result, unsigned int maxlen)
2929 + struct linear_c *lc = (struct linear_c *) ti->private;
2933 + case STATUSTYPE_INFO:
2937 + case STATUSTYPE_TABLE:
2938 + kdev = to_kdev_t(lc->dev->bdev->bd_dev);
2939 + snprintf(result, maxlen, "%s " SECTOR_FORMAT,
2940 + dm_kdevname(kdev), lc->start);
2946 +static struct target_type linear_target = {
2948 + .module = THIS_MODULE,
2949 + .ctr = linear_ctr,
2950 + .dtr = linear_dtr,
2951 + .map = linear_map,
2952 + .status = linear_status,
2955 +int __init dm_linear_init(void)
2957 + int r = dm_register_target(&linear_target);
2960 + DMERR("linear: register failed %d", r);
2965 +void dm_linear_exit(void)
2967 + int r = dm_unregister_target(&linear_target);
2970 + DMERR("linear: unregister failed %d", r);
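A linear target takes <dev_path> <offset>, and linear_map() simply shifts each sector by the difference between the target's start and that offset. As a hypothetical example, a dmsetup-style table line of the form "0 204800 linear /dev/hda1 9216" would give ti->begin = 0, ti->len = 204800 and lc->start = 9216; the remapping arithmetic, pulled out into a standalone sketch:

#include <stdio.h>

typedef unsigned long long sector_t;

/* sector on the underlying device for a sector of the mapped device */
static sector_t linear_remap(sector_t target_begin, sector_t dev_offset,
			     sector_t sector)
{
	return dev_offset + (sector - target_begin);
}

int main(void)
{
	/* numbers from the hypothetical table line above */
	printf("%llu\n", linear_remap(0, 9216, 100));	/* prints 9316 */
	return 0;
}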
2972 --- linux-2.4.21/drivers/md/dm-log.c Thu Jan 1 01:00:00 1970
2973 +++ linux/drivers/md/dm-log.c Wed Aug 20 14:41:38 2003
2976 + * Copyright (C) 2003 Sistina Software
2978 + * This file is released under the LGPL.
2981 +#include <linux/init.h>
2982 +#include <linux/slab.h>
2983 +#include <linux/module.h>
2984 +#include <linux/vmalloc.h>
2986 +#include "dm-log.h"
2989 +static LIST_HEAD(_log_types);
2990 +static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
2992 +int dm_register_dirty_log_type(struct dirty_log_type *type)
2994 + spin_lock(&_lock);
2995 + type->use_count = 0;
2997 + __MOD_INC_USE_COUNT(type->module);
2999 + list_add(&type->list, &_log_types);
3000 + spin_unlock(&_lock);
3005 +int dm_unregister_dirty_log_type(struct dirty_log_type *type)
3007 + spin_lock(&_lock);
3009 + if (type->use_count)
3010 + DMWARN("Attempt to unregister a log type that is still in use");
3012 + list_del(&type->list);
3014 + __MOD_DEC_USE_COUNT(type->module);
3017 + spin_unlock(&_lock);
3022 +static struct dirty_log_type *get_type(const char *type_name)
3024 + struct dirty_log_type *type;
3025 + struct list_head *tmp;
3027 + spin_lock(&_lock);
3028 + list_for_each (tmp, &_log_types) {
3029 + type = list_entry(tmp, struct dirty_log_type, list);
3030 + if (!strcmp(type_name, type->name)) {
3031 + type->use_count++;
3032 + spin_unlock(&_lock);
3037 + spin_unlock(&_lock);
3041 +static void put_type(struct dirty_log_type *type)
3043 + spin_lock(&_lock);
3044 + type->use_count--;
3045 + spin_unlock(&_lock);
3048 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3049 + unsigned int argc, char **argv)
3051 + struct dirty_log_type *type;
3052 + struct dirty_log *log;
3054 + log = kmalloc(sizeof(*log), GFP_KERNEL);
3058 + type = get_type(type_name);
3065 + if (type->ctr(log, dev_size, argc, argv)) {
3074 +void dm_destroy_dirty_log(struct dirty_log *log)
3076 + log->type->dtr(log);
3077 + put_type(log->type);
3082 +/*-----------------------------------------------------------------
3083 + * In-core log, i.e. trivial, non-persistent
3085 + * For now we'll keep this simple and just have 2 bitsets, one
3086 + * for clean/dirty, the other for sync/nosync. The sync bitset
3087 + * will be freed when everything is in sync.
3089 + * FIXME: problems with a 64bit sector_t
3090 + *---------------------------------------------------------------*/
3092 + sector_t region_size;
3093 + unsigned int region_count;
3094 + unsigned long *clean_bits;
3095 + unsigned long *sync_bits;
3096 + unsigned long *recovering_bits; /* FIXME: this seems excessive */
3101 +static int core_ctr(struct dirty_log *log, sector_t dev_size,
3102 + unsigned int argc, char **argv)
3104 + struct core_log *clog;
3105 + sector_t region_size;
3106 + unsigned int region_count;
3107 + size_t bitset_size;
3110 + DMWARN("wrong number of arguments to core_log");
3114 + if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
3115 + DMWARN("invalid region size string");
3119 + region_count = dm_div_up(dev_size, region_size);
3121 + clog = kmalloc(sizeof(*clog), GFP_KERNEL);
3123 + DMWARN("couldn't allocate core log");
3127 + clog->region_size = region_size;
3128 + clog->region_count = region_count;
3130 + bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
3131 + clog->clean_bits = vmalloc(bitset_size);
3132 + if (!clog->clean_bits) {
3133 + DMWARN("couldn't allocate clean bitset");
3137 + memset(clog->clean_bits, -1, bitset_size);
3139 + clog->sync_bits = vmalloc(bitset_size);
3140 + if (!clog->sync_bits) {
3141 + DMWARN("couldn't allocate sync bitset");
3142 + vfree(clog->clean_bits);
3146 + memset(clog->sync_bits, 0, bitset_size);
3148 + clog->recovering_bits = vmalloc(bitset_size);
3149 + if (!clog->recovering_bits) {
3150 + DMWARN("couldn't allocate recovering bitset");
3151 + vfree(clog->sync_bits);
3152 + vfree(clog->clean_bits);
3156 + memset(clog->recovering_bits, 0, bitset_size);
3157 + clog->sync_search = 0;
3158 + log->context = clog;
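The in-core log keeps one bit per region, so the bitsets are sized from the region count rounded up to whole words. A standalone sketch of the same arithmetic with hypothetical numbers:

#include <stdio.h>

#define DIV_UP(n, d)	(((n) + (d) - 1) / (d))
#define ROUND_UP(n, m)	(DIV_UP((n), (m)) * (m))

int main(void)
{
	unsigned long dev_size = 4194304;	/* sectors: a 2GiB device */
	unsigned long region_size = 1024;	/* sectors per region */
	unsigned long region_count = DIV_UP(dev_size, region_size);

	/* one bit per region, rounded up to a whole unsigned long,
	 * mirroring dm_round_up(region_count >> 3, sizeof(long)) above */
	unsigned long bitset_size = ROUND_UP(region_count / 8,
					     sizeof(unsigned long));

	printf("%lu regions, %lu bytes per bitset\n", region_count, bitset_size);
	return 0;
}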
3162 +static void core_dtr(struct dirty_log *log)
3164 + struct core_log *clog = (struct core_log *) log->context;
3165 + vfree(clog->clean_bits);
3166 + vfree(clog->sync_bits);
3167 + vfree(clog->recovering_bits);
3171 +static sector_t core_get_region_size(struct dirty_log *log)
3173 + struct core_log *clog = (struct core_log *) log->context;
3174 + return clog->region_size;
3177 +static int core_is_clean(struct dirty_log *log, region_t region)
3179 + struct core_log *clog = (struct core_log *) log->context;
3180 + return test_bit(region, clog->clean_bits);
3183 +static int core_in_sync(struct dirty_log *log, region_t region, int block)
3185 + struct core_log *clog = (struct core_log *) log->context;
3187 + return test_bit(region, clog->sync_bits) ? 1 : 0;
3190 +static int core_flush(struct dirty_log *log)
3196 +static void core_mark_region(struct dirty_log *log, region_t region)
3198 + struct core_log *clog = (struct core_log *) log->context;
3199 + clear_bit(region, clog->clean_bits);
3202 +static void core_clear_region(struct dirty_log *log, region_t region)
3204 + struct core_log *clog = (struct core_log *) log->context;
3205 + set_bit(region, clog->clean_bits);
3208 +static int core_get_resync_work(struct dirty_log *log, region_t *region)
3210 + struct core_log *clog = (struct core_log *) log->context;
3212 + if (clog->sync_search >= clog->region_count)
3216 + *region = find_next_zero_bit(clog->sync_bits,
3217 + clog->region_count,
3218 + clog->sync_search);
3219 + clog->sync_search = *region + 1;
3221 + if (*region == clog->region_count)
3224 + } while (test_bit(*region, clog->recovering_bits));
3226 + set_bit(*region, clog->recovering_bits);
3230 +static void core_complete_resync_work(struct dirty_log *log, region_t region,
3233 + struct core_log *clog = (struct core_log *) log->context;
3235 + clear_bit(region, clog->recovering_bits);
3237 + set_bit(region, clog->sync_bits);
3240 +static struct dirty_log_type _core_type = {
3245 + .get_region_size = core_get_region_size,
3246 + .is_clean = core_is_clean,
3247 + .in_sync = core_in_sync,
3248 + .flush = core_flush,
3249 + .mark_region = core_mark_region,
3250 + .clear_region = core_clear_region,
3251 + .get_resync_work = core_get_resync_work,
3252 + .complete_resync_work = core_complete_resync_work
3255 +__init int dm_dirty_log_init(void)
3259 + r = dm_register_dirty_log_type(&_core_type);
3261 + DMWARN("couldn't register core log");
3266 +void dm_dirty_log_exit(void)
3268 + dm_unregister_dirty_log_type(&_core_type);
3271 +EXPORT_SYMBOL(dm_register_dirty_log_type);
3272 +EXPORT_SYMBOL(dm_unregister_dirty_log_type);
3273 +EXPORT_SYMBOL(dm_dirty_log_init);
3274 +EXPORT_SYMBOL(dm_dirty_log_exit);
3275 +EXPORT_SYMBOL(dm_create_dirty_log);
3276 +EXPORT_SYMBOL(dm_destroy_dirty_log);
3277 --- linux-2.4.21/drivers/md/dm-log.h Thu Jan 1 01:00:00 1970
3278 +++ linux/drivers/md/dm-log.h Wed Aug 20 14:41:38 2003
3281 + * Copyright (C) 2003 Sistina Software
3283 + * This file is released under the LGPL.
3286 +#ifndef DM_DIRTY_LOG
3287 +#define DM_DIRTY_LOG
3291 +typedef sector_t region_t;
3293 +struct dirty_log_type;
3296 + struct dirty_log_type *type;
3300 +struct dirty_log_type {
3301 + struct list_head list;
3303 + struct module *module;
3304 + unsigned int use_count;
3306 + int (*ctr)(struct dirty_log *log, sector_t dev_size,
3307 + unsigned int argc, char **argv);
3308 + void (*dtr)(struct dirty_log *log);
3311 + * Retrieves the smallest size of region that the log can
3314 + sector_t (*get_region_size)(struct dirty_log *log);
3317 + * A predicate to say whether a region is clean or not.
3320 + int (*is_clean)(struct dirty_log *log, region_t region);
3323 + * Returns: 0, 1, -EWOULDBLOCK, < 0
3325 + * A predicate function to check whether the given region
3326 + * is in sync.
3328 + * If -EWOULDBLOCK is returned the state of the region is
3329 + * unknown, typically this will result in a read being
3330 + * passed to a daemon to deal with, since a daemon is
3331 + * allowed to block.
3333 + int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
3336 + * Flush the current log state (eg, to disk). This
3337 + * function may block.
3339 + int (*flush)(struct dirty_log *log);
3342 + * Mark an area as clean or dirty. These functions may
3343 + * block, though for performance reasons blocking should
3344 + * be extremely rare (eg, allocating another chunk of
3345 + * memory for some reason).
3347 + void (*mark_region)(struct dirty_log *log, region_t region);
3348 + void (*clear_region)(struct dirty_log *log, region_t region);
3351 + * Returns: <0 (error), 0 (no region), 1 (region)
3353 + * The mirrord will need to perform recovery on regions of
3354 + * the mirror that are in the NOSYNC state. This
3355 + * function asks the log to tell the caller about the
3356 + * next region that this machine should recover.
3358 + * Do not confuse this function with 'in_sync()', one
3359 + * tells you if an area is synchronised, the other
3360 + * assigns recovery work.
3362 + int (*get_resync_work)(struct dirty_log *log, region_t *region);
3365 + * This notifies the log that the resync of an area has
3366 + * been completed. The log should then mark this region
3369 + void (*complete_resync_work)(struct dirty_log *log,
3370 + region_t region, int success);
3373 +int dm_register_dirty_log_type(struct dirty_log_type *type);
3374 +int dm_unregister_dirty_log_type(struct dirty_log_type *type);
3378 + * Make sure you use these two functions, rather than calling
3379 + * type->constructor/destructor() directly.
3381 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3382 + unsigned int argc, char **argv);
3383 +void dm_destroy_dirty_log(struct dirty_log *log);
3386 + * init/exit functions.
3388 +int dm_dirty_log_init(void);
3389 +void dm_dirty_log_exit(void);
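A sketch of how a client of this interface is expected to drive recovery: ask the log for work with get_resync_work(), do the copy, then report the outcome with complete_resync_work(). The helper below is hypothetical; it assumes 'log' came from dm_create_dirty_log() and that copy_region() returns 0 on success:

#include "dm-log.h"

static void resync_one_region(struct dirty_log *log,
			      int (*copy_region)(region_t region))
{
	region_t region;

	/* 1 means a region was assigned; 0 means nothing left to recover */
	if (log->type->get_resync_work(log, &region) != 1)
		return;

	/* recover the region, then tell the log whether it is now in sync */
	log->type->complete_resync_work(log, region, copy_region(region) == 0);
}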
3392 --- linux-2.4.21/drivers/md/dm-raid1.c Thu Jan 1 01:00:00 1970
3393 +++ linux/drivers/md/dm-raid1.c Wed Aug 20 14:41:38 2003
3396 + * Copyright (C) 2003 Sistina Software Limited.
3398 + * This file is released under the GPL.
3402 +#include "dm-daemon.h"
3404 +#include "dm-log.h"
3405 +#include "kcopyd.h"
3407 +#include <linux/ctype.h>
3408 +#include <linux/init.h>
3409 +#include <linux/mempool.h>
3410 +#include <linux/module.h>
3411 +#include <linux/pagemap.h>
3412 +#include <linux/slab.h>
3413 +#include <linux/time.h>
3414 +#include <linux/vmalloc.h>
3416 +static struct dm_daemon _kmirrord;
3418 +/*-----------------------------------------------------------------
3421 + * We play with singly linked lists of buffers, but we want to be
3422 + * careful to add new buffers to the back of the list, to avoid
3423 + * buffers being starved of attention.
3424 + *---------------------------------------------------------------*/
3425 +struct buffer_list {
3426 + struct buffer_head *head;
3427 + struct buffer_head *tail;
3430 +static inline void buffer_list_init(struct buffer_list *bl)
3432 + bl->head = bl->tail = NULL;
3435 +static inline void buffer_list_add(struct buffer_list *bl,
3436 + struct buffer_head *bh)
3438 + bh->b_reqnext = NULL;
3441 + bl->tail->b_reqnext = bh;
3444 + bl->head = bl->tail = bh;
3447 +static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
3449 + struct buffer_head *bh = bl->head;
3452 + bl->head = bl->head->b_reqnext;
3456 + bh->b_reqnext = NULL;
3462 +/*-----------------------------------------------------------------
3465 + * The mirror splits itself up into discrete regions. Each
3466 + * region can be in one of three states: clean, dirty,
3467 + * nosync. There is no need to put clean regions in the hash.
3469 + * In addition to being present in the hash table a region _may_
3470 + * be present on one of three lists.
3472 + * clean_regions: Regions on this list have no io pending to
3473 + * them, they are in sync, we are no longer interested in them,
3474 + * they are dull. rh_update_states() will remove them from the
3477 + * quiesced_regions: These regions have been spun down, ready
3478 + * for recovery. rh_recovery_start() will remove regions from
3479 + * this list and hand them to kmirrord, which will schedule the
3480 + * recovery io with kcopyd.
3482 + * recovered_regions: Regions that kcopyd has successfully
3483 + * recovered. rh_update_states() will now schedule any delayed
3484 + * io, up the recovery_count, and remove the region from the
3487 + * There are 2 locks:
3488 + * A rw spin lock 'hash_lock' protects just the hash table,
3489 + * this is never held in write mode from interrupt context,
3490 + * which I believe means that we only have to disable irqs when
3491 + * doing a write lock.
3493 + * An ordinary spin lock 'region_lock' that protects the three
3494 + * lists in the region_hash, with the 'state', 'list' and
3495 + * 'bhs_delayed' fields of the regions. This is used from irq
3496 + * context, so all other uses will have to suspend local irqs.
3497 + *---------------------------------------------------------------*/
3499 +struct region_hash {
3500 + struct mirror_set *ms;
3501 + sector_t region_size;
3503 + /* holds persistent region state */
3504 + struct dirty_log *log;
3507 + rwlock_t hash_lock;
3508 + mempool_t *region_pool;
3509 + unsigned int mask;
3510 + unsigned int nr_buckets;
3511 + struct list_head *buckets;
3513 + spinlock_t region_lock;
3514 + struct semaphore recovery_count;
3515 + struct list_head clean_regions;
3516 + struct list_head quiesced_regions;
3517 + struct list_head recovered_regions;
3528 + struct region_hash *rh; /* FIXME: can we get rid of this ? */
3532 + struct list_head hash_list;
3533 + struct list_head list;
3536 + struct buffer_head *delayed_bhs;
3542 +static inline region_t bh_to_region(struct region_hash *rh,
3543 + struct buffer_head *bh)
3545 + return bh->b_rsector / rh->region_size;
3548 +static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
3550 + return region * rh->region_size;
3553 +/* FIXME move this */
3554 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
3556 +static void *region_alloc(int gfp_mask, void *pool_data)
3558 + return kmalloc(sizeof(struct region), gfp_mask);
3561 +static void region_free(void *element, void *pool_data)
3566 +#define MIN_REGIONS 64
3567 +#define MAX_RECOVERY 1
3568 +static int rh_init(struct region_hash *rh, struct mirror_set *ms,
3569 + struct dirty_log *log, sector_t region_size,
3570 + region_t nr_regions)
3572 + unsigned int nr_buckets, max_buckets;
3576 + * Calculate a suitable number of buckets for our hash
3579 + max_buckets = nr_regions >> 6;
3580 + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
3586 + rh->region_size = region_size;
3587 + rwlock_init(&rh->hash_lock);
3588 + rh->mask = nr_buckets - 1;
3589 + rh->nr_buckets = nr_buckets;
3591 + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
3592 + if (!rh->buckets) {
3593 + DMERR("unable to allocate region hash memory");
3597 + for (i = 0; i < nr_buckets; i++)
3598 + INIT_LIST_HEAD(rh->buckets + i);
3600 + spin_lock_init(&rh->region_lock);
3601 + sema_init(&rh->recovery_count, 0);
3602 + INIT_LIST_HEAD(&rh->clean_regions);
3603 + INIT_LIST_HEAD(&rh->quiesced_regions);
3604 + INIT_LIST_HEAD(&rh->recovered_regions);
3606 + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
3607 + region_free, NULL);
3608 + if (!rh->region_pool) {
3609 + vfree(rh->buckets);
3610 + rh->buckets = NULL;
3617 +static void rh_exit(struct region_hash *rh)
3620 + struct region *reg;
3621 + struct list_head *tmp, *tmp2;
3623 + BUG_ON(!list_empty(&rh->quiesced_regions));
3624 + for (h = 0; h < rh->nr_buckets; h++) {
3625 + list_for_each_safe (tmp, tmp2, rh->buckets + h) {
3626 + reg = list_entry(tmp, struct region, hash_list);
3627 + BUG_ON(atomic_read(&reg->pending));
3628 + mempool_free(reg, rh->region_pool);
3633 + dm_destroy_dirty_log(rh->log);
3634 + if (rh->region_pool)
3635 + mempool_destroy(rh->region_pool);
3636 + vfree(rh->buckets);
3639 +#define RH_HASH_MULT 2654435387U
3641 +static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
3643 + return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
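The bucket index is a Knuth-style multiplicative hash of the region number, masked down to the power-of-two bucket count picked in rh_init(). A standalone sketch of the same computation:

#include <stdio.h>

#define RH_HASH_MULT 2654435387U

static unsigned int bucket_of(unsigned long long region, unsigned int mask)
{
	return (unsigned int) ((region * RH_HASH_MULT) >> 12) & mask;
}

int main(void)
{
	unsigned int mask = 128 - 1;	/* 128 is the minimum nr_buckets above */
	unsigned long long region;

	for (region = 0; region < 4; region++)
		printf("region %llu -> bucket %u\n",
		       region, bucket_of(region, mask));
	return 0;
}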
3646 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
3648 + struct region *reg;
3650 + list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
3651 + if (reg->key == region)
3657 +static void __rh_insert(struct region_hash *rh, struct region *reg)
3659 + unsigned int h = rh_hash(rh, reg->key);
3660 + list_add(&reg->hash_list, rh->buckets + h);
3663 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
3665 + struct region *reg, *nreg;
3667 + read_unlock(&rh->hash_lock);
3668 + nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
3669 + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
3670 + RH_CLEAN : RH_NOSYNC;
3672 + nreg->key = region;
3674 + INIT_LIST_HEAD(&nreg->list);
3676 + atomic_set(&nreg->pending, 0);
3677 + nreg->delayed_bhs = NULL;
3678 + write_lock_irq(&rh->hash_lock);
3680 + reg = __rh_lookup(rh, region);
3682 + /* we lost the race */
3683 + mempool_free(nreg, rh->region_pool);
3686 + __rh_insert(rh, nreg);
3687 + if (nreg->state == RH_CLEAN) {
3688 + spin_lock_irq(&rh->region_lock);
3689 + list_add(&nreg->list, &rh->clean_regions);
3690 + spin_unlock_irq(&rh->region_lock);
3694 + write_unlock_irq(&rh->hash_lock);
3695 + read_lock(&rh->hash_lock);
3700 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
3702 + struct region *reg;
3704 + reg = __rh_lookup(rh, region);
3706 + reg = __rh_alloc(rh, region);
3711 +static int rh_state(struct region_hash *rh, region_t region, int may_block)
3714 + struct region *reg;
3716 + read_lock(&rh->hash_lock);
3717 + reg = __rh_lookup(rh, region);
3718 + read_unlock(&rh->hash_lock);
3721 + return reg->state;
3724 + * The region wasn't in the hash, so we fall back to the
3727 + r = rh->log->type->in_sync(rh->log, region, may_block);
3730 + * Any error from the dirty log (eg. -EWOULDBLOCK) gets
3731 + * taken as a RH_NOSYNC
3733 + return r == 1 ? RH_CLEAN : RH_NOSYNC;
3736 +static inline int rh_in_sync(struct region_hash *rh,
3737 + region_t region, int may_block)
3739 + int state = rh_state(rh, region, may_block);
3740 + return state == RH_CLEAN || state == RH_DIRTY;
3743 +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
3745 + struct buffer_head *nbh;
3748 + nbh = bh->b_reqnext;
3749 + queue_bh(ms, bh, WRITE);
3754 +static void rh_update_states(struct region_hash *rh)
3756 + struct list_head *tmp, *tmp2;
3757 + struct region *reg;
3760 + LIST_HEAD(recovered);
3763 + * Quickly grab the lists.
3765 + write_lock_irq(&rh->hash_lock);
3766 + spin_lock(&rh->region_lock);
3767 + if (!list_empty(&rh->clean_regions)) {
3768 + list_splice(&rh->clean_regions, &clean);
3769 + INIT_LIST_HEAD(&rh->clean_regions);
3771 + list_for_each_entry (reg, &clean, list) {
3772 + rh->log->type->clear_region(rh->log, reg->key);
3773 + list_del(&reg->hash_list);
3777 + if (!list_empty(&rh->recovered_regions)) {
3778 + list_splice(&rh->recovered_regions, &recovered);
3779 + INIT_LIST_HEAD(&rh->recovered_regions);
3781 + list_for_each_entry (reg, &recovered, list)
3782 + list_del(&reg->hash_list);
3784 + spin_unlock(&rh->region_lock);
3785 + write_unlock_irq(&rh->hash_lock);
3788 + * All the regions on the recovered and clean lists have
3789 + * now been pulled out of the system, so no need to do
3790 + * any more locking.
3792 + list_for_each_safe (tmp, tmp2, &recovered) {
3793 + reg = list_entry(tmp, struct region, list);
3795 + rh->log->type->complete_resync_work(rh->log, reg->key, 1);
3796 + dispatch_buffers(rh->ms, reg->delayed_bhs);
3797 + up(&rh->recovery_count);
3798 + mempool_free(reg, rh->region_pool);
3801 + list_for_each_safe (tmp, tmp2, &clean) {
3802 + reg = list_entry(tmp, struct region, list);
3803 + mempool_free(reg, rh->region_pool);
3807 +static void rh_inc(struct region_hash *rh, region_t region)
3809 + struct region *reg;
3811 + read_lock(&rh->hash_lock);
3812 + reg = __rh_find(rh, region);
3813 + if (reg->state == RH_CLEAN) {
3814 + rh->log->type->mark_region(rh->log, reg->key);
3816 + spin_lock_irq(&rh->region_lock);
3817 + reg->state = RH_DIRTY;
3818 + list_del_init(&reg->list); /* take off the clean list */
3819 + spin_unlock_irq(&rh->region_lock);
3822 + atomic_inc(&reg->pending);
3823 + read_unlock(&rh->hash_lock);
3826 +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
3828 + struct buffer_head *bh;
3830 + for (bh = buffers->head; bh; bh = bh->b_reqnext)
3831 + rh_inc(rh, bh_to_region(rh, bh));
3834 +static void rh_dec(struct region_hash *rh, region_t region)
3836 + unsigned long flags;
3837 + struct region *reg;
3840 + read_lock(&rh->hash_lock);
3841 + reg = __rh_lookup(rh, region);
3842 + read_unlock(&rh->hash_lock);
3844 + if (atomic_dec_and_test(&reg->pending)) {
3845 + spin_lock_irqsave(&rh->region_lock, flags);
3846 + if (reg->state == RH_RECOVERING) {
3847 + list_add_tail(&reg->list, &rh->quiesced_regions);
3849 + reg->state = RH_CLEAN;
3850 + list_add(&reg->list, &rh->clean_regions);
3852 + spin_unlock_irqrestore(&rh->region_lock, flags);
3857 + dm_daemon_wake(&_kmirrord);
3861 + * Starts quiescing a region in preparation for recovery.
3863 +static int __rh_recovery_prepare(struct region_hash *rh)
3866 + struct region *reg;
3870 + * Ask the dirty log what's next.
3872 + r = rh->log->type->get_resync_work(rh->log, &region);
3877 + * Get this region, and start it quiescing by setting the
3878 + * recovering flag.
3880 + read_lock(&rh->hash_lock);
3881 + reg = __rh_find(rh, region);
3882 + read_unlock(&rh->hash_lock);
3884 + spin_lock_irq(&rh->region_lock);
3885 + reg->state = RH_RECOVERING;
3887 + /* Already quiesced ? */
3888 + if (atomic_read(&reg->pending))
3889 + list_del_init(&reg->list);
3892 + list_del_init(&reg->list);
3893 + list_add(&reg->list, &rh->quiesced_regions);
3895 + spin_unlock_irq(&rh->region_lock);
3900 +static void rh_recovery_prepare(struct region_hash *rh)
3902 + while (!down_trylock(&rh->recovery_count))
3903 + if (__rh_recovery_prepare(rh) <= 0) {
3904 + up(&rh->recovery_count);
3910 + * Returns a quiesced region, if one is available.
3912 +static struct region *rh_recovery_start(struct region_hash *rh)
3914 + struct region *reg = NULL;
3916 + spin_lock_irq(&rh->region_lock);
3917 + if (!list_empty(&rh->quiesced_regions)) {
3918 + reg = list_entry(rh->quiesced_regions.next,
3919 + struct region, list);
3920 + list_del_init(&reg->list); /* remove from the quiesced list */
3922 + spin_unlock_irq(&rh->region_lock);
3927 +/* FIXME: success ignored for now */
3928 +static void rh_recovery_end(struct region *reg, int success)
3930 + struct region_hash *rh = reg->rh;
3932 + spin_lock_irq(&rh->region_lock);
3933 + list_add(&reg->list, &reg->rh->recovered_regions);
3934 + spin_unlock_irq(&rh->region_lock);
3936 + dm_daemon_wake(&_kmirrord);
3939 +static void rh_flush(struct region_hash *rh)
3941 + rh->log->type->flush(rh->log);
3944 +static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
3946 + struct region *reg;
3948 + read_lock(&rh->hash_lock);
3949 + reg = __rh_find(rh, bh_to_region(rh, bh));
3950 + bh->b_reqnext = reg->delayed_bhs;
3951 + reg->delayed_bhs = bh;
3952 + read_unlock(&rh->hash_lock);
3955 +static void rh_stop_recovery(struct region_hash *rh)
3959 + /* wait for any recovering regions */
3960 + for (i = 0; i < MAX_RECOVERY; i++)
3961 + down(&rh->recovery_count);
3964 +static void rh_start_recovery(struct region_hash *rh)
3968 + for (i = 0; i < MAX_RECOVERY; i++)
3969 + up(&rh->recovery_count);
3971 + dm_daemon_wake(&_kmirrord);
3974 +/*-----------------------------------------------------------------
3975 + * Mirror set structures.
3976 + *---------------------------------------------------------------*/
3978 + atomic_t error_count;
3979 + struct dm_dev *dev;
3983 +struct mirror_set {
3984 + struct dm_target *ti;
3985 + struct list_head list;
3986 + struct region_hash rh;
3987 + struct kcopyd_client *kcopyd_client;
3989 + spinlock_t lock; /* protects the next two lists */
3990 + struct buffer_list reads;
3991 + struct buffer_list writes;
3994 + region_t nr_regions;
3995 + region_t sync_count;
3997 + unsigned int nr_mirrors;
3998 + struct mirror mirror[0];
4002 + * Every mirror should look like this one.
4004 +#define DEFAULT_MIRROR 0
4007 + * This is yucky. We squirrel the mirror_set struct away inside
4008 + * b_reqnext for write buffers. This is safe since the bh
4009 + * doesn't get submitted to the lower levels of the block layer.
4011 +static struct mirror_set *bh_get_ms(struct buffer_head *bh)
4013 + return (struct mirror_set *) bh->b_reqnext;
4016 +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
4018 + bh->b_reqnext = (struct buffer_head *) ms;
4021 +/*-----------------------------------------------------------------
4024 + * When a mirror is first activated we may find that some regions
4025 + * are in the no-sync state. We have to recover these by
4026 + * recopying from the default mirror to all the others.
4027 + *---------------------------------------------------------------*/
4028 +static void recovery_complete(int read_err, unsigned int write_err,
4031 + struct region *reg = (struct region *) context;
4032 + struct mirror_set *ms = reg->rh->ms;
4034 + /* FIXME: better error handling */
4035 + rh_recovery_end(reg, read_err || write_err);
4036 + if (++ms->sync_count == ms->nr_regions)
4037 + /* the sync is complete */
4038 + dm_table_event(ms->ti->table);
4041 +static int recover(struct mirror_set *ms, struct region *reg)
4045 + struct io_region from, to[ms->nr_mirrors - 1], *dest;
4047 + unsigned int flags = 0;
4049 + /* fill in the source */
4050 + m = ms->mirror + DEFAULT_MIRROR;
4051 + from.dev = m->dev->dev;
4052 + from.sector = m->offset + region_to_sector(reg->rh, reg->key);
4053 + if (reg->key == (ms->nr_regions - 1)) {
4055 + * The final region may be smaller than
4058 + from.count = ms->ti->len & (reg->rh->region_size - 1);
4060 + from.count = reg->rh->region_size;
4062 + from.count = reg->rh->region_size;
4064 + /* fill in the destinations */
4065 + for (i = 1; i < ms->nr_mirrors; i++) {
4066 + m = ms->mirror + i;
4067 + dest = to + (i - 1);
4069 + dest->dev = m->dev->dev;
4070 + dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
4071 + dest->count = from.count;
4074 + /* hand to kcopyd */
4075 + set_bit(KCOPYD_IGNORE_ERROR, &flags);
4076 + r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
4077 + recovery_complete, reg);
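The final region of a target may be shorter than region_size, and because region_size is a power of two the remainder can be taken with a mask. A standalone sketch of that computation with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	unsigned long region_size = 1024;	/* sectors, power of 2 */
	unsigned long ti_len = 10000;		/* hypothetical target length */
	unsigned long tail = ti_len & (region_size - 1);

	/* 10000 = 9 * 1024 + 784, so the last region copies 784 sectors;
	 * a zero tail would mean the last region is a full one */
	printf("final region: %lu sectors\n", tail ? tail : region_size);
	return 0;
}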
4082 +static void do_recovery(struct mirror_set *ms)
4085 + struct region *reg;
4088 + * Start quiescing some regions.
4090 + rh_recovery_prepare(&ms->rh);
4093 + * Copy any already quiesced regions.
4095 + while ((reg = rh_recovery_start(&ms->rh))) {
4096 + r = recover(ms, reg);
4098 + rh_recovery_end(reg, 0);
4102 +/*-----------------------------------------------------------------
4104 + *---------------------------------------------------------------*/
4105 +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
4107 + /* FIXME: add read balancing */
4108 + return ms->mirror + DEFAULT_MIRROR;
4112 + * remap a buffer to a particular mirror.
4114 +static void map_buffer(struct mirror_set *ms,
4115 + struct mirror *m, struct buffer_head *bh)
4117 + sector_t bsize = bh->b_size >> 9;
4118 + sector_t rsector = bh->b_blocknr * bsize;
4120 + bh->b_rdev = m->dev->dev;
4121 + bh->b_rsector = m->offset + (rsector - ms->ti->begin);
4124 +static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
4127 + struct buffer_head *bh;
4130 + while ((bh = buffer_list_pop(reads))) {
4131 + region = bh_to_region(&ms->rh, bh);
4134 + * We can only read balance if the region is in sync.
4136 + if (rh_in_sync(&ms->rh, region, 0))
4137 + m = choose_mirror(ms, bh->b_rsector);
4139 + m = ms->mirror + DEFAULT_MIRROR;
4141 + map_buffer(ms, m, bh);
4142 + generic_make_request(READ, bh);
4146 +/*-----------------------------------------------------------------
4149 + * We do different things with the write io depending on the
4150 + * state of the region that it's in:
4152 + * SYNC: increment pending, use kcopyd to write to *all* mirrors
4153 + * RECOVERING: delay the io until recovery completes
4154 + * NOSYNC: increment pending, just write to the default mirror
4155 + *---------------------------------------------------------------*/
4156 +static void write_callback(unsigned int error, void *context)
4160 + struct buffer_head *bh = (struct buffer_head *) context;
4161 + struct mirror_set *ms;
4163 + ms = bh_get_ms(bh);
4164 + bh_set_ms(bh, NULL);
4167 + * NOTE: We don't decrement the pending count here,
4168 + * instead it is done by the targets endio function.
4169 + * This way we handle both writes to SYNC and NOSYNC
4170 + * regions with the same code.
4175 + * only error the io if all mirrors failed.
4179 + for (i = 0; i < ms->nr_mirrors; i++)
4180 + if (!test_bit(i, &error)) {
4185 + bh->b_end_io(bh, uptodate);
4188 +static void do_write(struct mirror_set *ms, struct buffer_head *bh)
4191 + struct io_region io[ms->nr_mirrors];
4194 + for (i = 0; i < ms->nr_mirrors; i++) {
4195 + m = ms->mirror + i;
4197 + io[i].dev = m->dev->dev;
4198 + io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin);
4199 + io[i].count = bh->b_size >> 9;
4202 + bh_set_ms(bh, ms);
4203 + dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page,
4204 + (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh);
4207 +static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
4210 + struct buffer_head *bh;
4211 + struct buffer_list sync, nosync, recover, *this_list = NULL;
4213 + if (!writes->head)
4217 + * Classify each write.
4219 + buffer_list_init(&sync);
4220 + buffer_list_init(&nosync);
4221 + buffer_list_init(&recover);
4223 + while ((bh = buffer_list_pop(writes))) {
4224 + state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1);
4228 + this_list = &sync;
4232 + this_list = &nosync;
4235 + case RH_RECOVERING:
4236 + this_list = &recover;
4240 + buffer_list_add(this_list, bh);
4244 + * Increment the pending counts for any regions that will
4245 + * be written to (writes to recover regions are going to
4248 + rh_inc_pending(&ms->rh, &sync);
4249 + rh_inc_pending(&ms->rh, &nosync);
4250 + rh_flush(&ms->rh);
4255 + while ((bh = buffer_list_pop(&sync)))
4258 + while ((bh = buffer_list_pop(&recover)))
4259 + rh_delay(&ms->rh, bh);
4261 + while ((bh = buffer_list_pop(&nosync))) {
4262 + map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh);
4263 + generic_make_request(WRITE, bh);
4267 +/*-----------------------------------------------------------------
4269 + *---------------------------------------------------------------*/
4270 +static LIST_HEAD(_mirror_sets);
4271 +static DECLARE_RWSEM(_mirror_sets_lock);
4273 +static void do_mirror(struct mirror_set *ms)
4275 + struct buffer_list reads, writes;
4277 + spin_lock(&ms->lock);
4278 + memcpy(&reads, &ms->reads, sizeof(reads));
4279 + buffer_list_init(&ms->reads);
4280 + memcpy(&writes, &ms->writes, sizeof(writes));
4281 + buffer_list_init(&ms->writes);
4282 + spin_unlock(&ms->lock);
4284 + rh_update_states(&ms->rh);
4286 + do_reads(ms, &reads);
4287 + do_writes(ms, &writes);
4288 + run_task_queue(&tq_disk);
4291 +static void do_work(void)
4293 + struct mirror_set *ms;
4295 + down_read(&_mirror_sets_lock);
4296 + list_for_each_entry (ms, &_mirror_sets, list)
4298 + up_read(&_mirror_sets_lock);
4301 +/*-----------------------------------------------------------------
4302 + * Target functions
4303 + *---------------------------------------------------------------*/
4304 +static struct mirror_set *alloc_context(unsigned int nr_mirrors,
4305 + sector_t region_size,
4306 + struct dm_target *ti,
4307 + struct dirty_log *dl)
4310 + struct mirror_set *ms = NULL;
4312 + if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
4315 + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
4317 + ms = kmalloc(len, GFP_KERNEL);
4319 + ti->error = "dm-mirror: Cannot allocate mirror context";
4323 + memset(ms, 0, len);
4324 + spin_lock_init(&ms->lock);
4327 + ms->nr_mirrors = nr_mirrors;
4328 + ms->nr_regions = dm_div_up(ti->len, region_size);
4330 + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
4331 + ti->error = "dm-mirror: Error creating dirty region hash";
4339 +static void free_context(struct mirror_set *ms, struct dm_target *ti,
4343 + dm_put_device(ti, ms->mirror[m].dev);
4349 +static inline int _check_region_size(struct dm_target *ti, sector_t size)
4351 + return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
4355 +static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
4356 + unsigned int mirror, char **argv)
4360 + if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
4361 + ti->error = "dm-mirror: Invalid offset";
4365 + if (dm_get_device(ti, argv[0], offset, ti->len,
4366 + dm_table_get_mode(ti->table),
4367 + &ms->mirror[mirror].dev)) {
4368 + ti->error = "dm-mirror: Device lookup failure";
4372 + ms->mirror[mirror].offset = offset;
4377 +static int add_mirror_set(struct mirror_set *ms)
4379 + down_write(&_mirror_sets_lock);
4380 + list_add_tail(&ms->list, &_mirror_sets);
4381 + up_write(&_mirror_sets_lock);
4382 + dm_daemon_wake(&_kmirrord);
4387 +static void del_mirror_set(struct mirror_set *ms)
4389 + down_write(&_mirror_sets_lock);
4390 + list_del(&ms->list);
4391 + up_write(&_mirror_sets_lock);
4395 + * Create dirty log: log_type #log_params <log_params>
4397 +static struct dirty_log *create_dirty_log(struct dm_target *ti,
4398 + unsigned int argc, char **argv,
4399 + unsigned int *args_used)
4401 + unsigned int param_count;
4402 + struct dirty_log *dl;
4405 + ti->error = "dm-mirror: Insufficient mirror log arguments";
4409 + if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
4410 + ti->error = "dm-mirror: Invalid mirror log argument count";
4414 + *args_used = 2 + param_count;
4416 + if (argc < *args_used) {
4417 + ti->error = "dm-mirror: Insufficient mirror log arguments";
4421 + dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
4423 + ti->error = "dm-mirror: Error creating mirror dirty log";
4427 + if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
4428 + ti->error = "dm-mirror: Invalid region size";
4429 + dm_destroy_dirty_log(dl);
4437 + * Construct a mirror mapping:
4439 + * log_type #log_params <log_params>
4440 + * #mirrors [mirror_path offset]{2,}
4442 + * For now, #log_params = 1, log_type = "core"
4445 +#define DM_IO_PAGES 64
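Putting the pieces together, a hypothetical dmsetup-style table line for a two-way mirror with the in-core log and a 1024-sector region size might look like this (assuming the target registers under the name "mirror"; the device paths and length are made up):

    0 2097152 mirror core 1 1024 2 /dev/hda5 0 /dev/hdb5 0

i.e. log_type "core", one log parameter (the region size), then the mirror count followed by a <path> <offset> pair per mirror leg.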
4446 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4449 + unsigned int nr_mirrors, m, args_used;
4450 + struct mirror_set *ms;
4451 + struct dirty_log *dl;
4453 + dl = create_dirty_log(ti, argc, argv, &args_used);
4457 + argv += args_used;
4458 + argc -= args_used;
4460 + if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
4462 + ti->error = "dm-mirror: Invalid number of mirrors";
4463 + dm_destroy_dirty_log(dl);
4469 + if (argc != nr_mirrors * 2) {
4470 + ti->error = "dm-mirror: Wrong number of mirror arguments";
4471 + dm_destroy_dirty_log(dl);
4475 + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
4477 + dm_destroy_dirty_log(dl);
4481 + /* Get the mirror parameter sets */
4482 + for (m = 0; m < nr_mirrors; m++) {
4483 + r = get_mirror(ms, ti, m, argv);
4485 + free_context(ms, ti, m);
4494 + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
4496 + free_context(ms, ti, ms->nr_mirrors);
4500 + add_mirror_set(ms);
4504 +static void mirror_dtr(struct dm_target *ti)
4506 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4508 + del_mirror_set(ms);
4509 + kcopyd_client_destroy(ms->kcopyd_client);
4510 + free_context(ms, ti, ms->nr_mirrors);
4513 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
4516 + struct buffer_list *bl;
4518 + bl = (rw == WRITE) ? &ms->writes : &ms->reads;
4519 + spin_lock(&ms->lock);
4520 + wake = !(bl->head);
4521 + buffer_list_add(bl, bh);
4522 + spin_unlock(&ms->lock);
4525 + dm_daemon_wake(&_kmirrord);
4529 + * Mirror mapping function
4531 +static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
4532 + int rw, union map_info *map_context)
4536 + struct mirror_set *ms = ti->private;
4538 + /* FIXME: nasty hack, 32 bit sector_t only */
4539 + map_context->ll = bh->b_rsector / ms->rh.region_size;
4541 + if (rw == WRITE) {
4542 + queue_bh(ms, bh, rw);
4546 + r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0);
4547 + if (r < 0 && r != -EWOULDBLOCK)
4550 + if (r == -EWOULDBLOCK) /* FIXME: ugly */
4554 + * We don't want to fast-track a recovery just for a
4555 + * read-ahead, so we just let it silently fail.
4556 + * FIXME: get rid of this.
4558 + if (!r && rw == READA)
4562 + /* Pass this io over to the daemon */
4563 + queue_bh(ms, bh, rw);
4567 + m = choose_mirror(ms, bh->b_rsector);
4571 + map_buffer(ms, m, bh);
4575 +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
4576 + int rw, int error, union map_info *map_context)
4578 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4579 + region_t region = map_context->ll;
4582 + * We need to dec pending if this was a write.
4585 + rh_dec(&ms->rh, region);
4590 +static void mirror_suspend(struct dm_target *ti)
4592 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4593 + rh_stop_recovery(&ms->rh);
4596 +static void mirror_resume(struct dm_target *ti)
4598 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4599 + rh_start_recovery(&ms->rh);
4602 +static int mirror_status(struct dm_target *ti, status_type_t type,
4603 + char *result, unsigned int maxlen)
4605 + unsigned int m, sz = 0;
4606 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4609 + case STATUSTYPE_INFO:
4610 + sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
4612 + for (m = 0; m < ms->nr_mirrors; m++)
4613 + sz += snprintf(result + sz, maxlen - sz, "%s ",
4614 + dm_kdevname(ms->mirror[m].dev->dev));
4616 + sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
4617 + ms->sync_count, ms->nr_regions);
4620 + case STATUSTYPE_TABLE:
4621 + sz += snprintf(result + sz, maxlen - sz,
4622 + "%s 1 " SECTOR_FORMAT " %d ",
4623 + ms->rh.log->type->name, ms->rh.region_size,
4626 + for (m = 0; m < ms->nr_mirrors; m++)
4627 + sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
4628 + dm_kdevname(ms->mirror[m].dev->dev),
4629 + ms->mirror[m].offset);
4635 +static struct target_type mirror_target = {
4637 + .module = THIS_MODULE,
4638 + .ctr = mirror_ctr,
4639 + .dtr = mirror_dtr,
4640 + .map = mirror_map,
4641 + .end_io = mirror_end_io,
4642 + .suspend = mirror_suspend,
4643 + .resume = mirror_resume,
4644 + .status = mirror_status,
4647 +static int __init dm_mirror_init(void)
4651 + r = dm_dirty_log_init();
4655 + r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
4657 + DMERR("couldn't start kmirrord");
4658 + dm_dirty_log_exit();
4662 + r = dm_register_target(&mirror_target);
4664 + DMERR("%s: Failed to register mirror target",
4665 + mirror_target.name);
4666 + dm_dirty_log_exit();
4667 + dm_daemon_stop(&_kmirrord);
4673 +static void __exit dm_mirror_exit(void)
4677 + r = dm_unregister_target(&mirror_target);
4679 + DMERR("%s: unregister failed %d", mirror_target.name, r);
4681 + dm_daemon_stop(&_kmirrord);
4682 + dm_dirty_log_exit();
4686 +module_init(dm_mirror_init);
4687 +module_exit(dm_mirror_exit);
4689 +MODULE_DESCRIPTION(DM_NAME " mirror target");
4690 +MODULE_AUTHOR("Heinz Mauelshagen <mge@sistina.com>");
4691 +MODULE_LICENSE("GPL");
4692 --- linux-2.4.21/drivers/md/dm-snapshot.c Thu Jan 1 01:00:00 1970
4693 +++ linux/drivers/md/dm-snapshot.c Wed Aug 20 14:41:38 2003
4698 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4700 + * This file is released under the GPL.
4703 +#include <linux/config.h>
4704 +#include <linux/ctype.h>
4705 +#include <linux/module.h>
4706 +#include <linux/init.h>
4707 +#include <linux/slab.h>
4708 +#include <linux/list.h>
4709 +#include <linux/fs.h>
4710 +#include <linux/blkdev.h>
4711 +#include <linux/mempool.h>
4712 +#include <linux/device-mapper.h>
4713 +#include <linux/vmalloc.h>
4715 +#include "dm-snapshot.h"
4716 +#include "kcopyd.h"
4719 + * FIXME: Remove this before release.
4722 +#define DMDEBUG(x...) DMWARN( ## x)
4724 +#define DMDEBUG(x...)
4728 + * The percentage increment we will wake up users at
4730 +#define WAKE_UP_PERCENT 5
4733 + * kcopyd priority of snapshot operations
4735 +#define SNAPSHOT_COPY_PRIORITY 2
4738 + * Each snapshot reserves this many pages for io
4739 + * FIXME: calculate this
4741 +#define SNAPSHOT_PAGES 256
4743 +struct pending_exception {
4744 + struct exception e;
4747 + * Origin buffers waiting for this to complete are held
4748 + * in a list (using b_reqnext).
4750 + struct buffer_head *origin_bhs;
4751 + struct buffer_head *snapshot_bhs;
4754 + * Other pending_exceptions that are processing this
4755 + * chunk. When this list is empty, we know we can
4756 + * complete the origins.
4758 + struct list_head siblings;
4760 + /* Pointer back to snapshot context */
4761 + struct dm_snapshot *snap;
4764 + * 1 indicates the exception has already been sent to
4771 + * Hash table mapping origin volumes to lists of snapshots and
4772 + * a lock to protect it
4774 +static kmem_cache_t *exception_cache;
4775 +static kmem_cache_t *pending_cache;
4776 +static mempool_t *pending_pool;
4779 + * One of these per registered origin, held in the snapshot_origins hash
4782 + /* The origin device */
4785 + struct list_head hash_list;
4787 + /* List of snapshots for this origin */
4788 + struct list_head snapshots;
4792 + * Size of the hash table for origin volumes. If we make this
4793 + * the size of the minors list then it should be nearly perfect
4795 +#define ORIGIN_HASH_SIZE 256
4796 +#define ORIGIN_MASK 0xFF
4797 +static struct list_head *_origins;
4798 +static struct rw_semaphore _origins_lock;
4800 +static int init_origin_hash(void)
4804 + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4807 + DMERR("Device mapper: Snapshot: unable to allocate memory");
4811 + for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4812 + INIT_LIST_HEAD(_origins + i);
4813 + init_rwsem(&_origins_lock);
4818 +static void exit_origin_hash(void)
4823 +static inline unsigned int origin_hash(kdev_t dev)
4825 + return MINOR(dev) & ORIGIN_MASK;
4828 +static struct origin *__lookup_origin(kdev_t origin)
4830 + struct list_head *slist;
4831 + struct list_head *ol;
4834 + ol = &_origins[origin_hash(origin)];
4835 + list_for_each(slist, ol) {
4836 + o = list_entry(slist, struct origin, hash_list);
4838 + if (o->dev == origin)
4845 +static void __insert_origin(struct origin *o)
4847 + struct list_head *sl = &_origins[origin_hash(o->dev)];
4848 + list_add_tail(&o->hash_list, sl);
4852 + * Make a note of the snapshot and its origin so we can look it
4853 + * up when the origin has a write on it.
4855 +static int register_snapshot(struct dm_snapshot *snap)
4858 + kdev_t dev = snap->origin->dev;
4860 + down_write(&_origins_lock);
4861 + o = __lookup_origin(dev);
4865 + o = kmalloc(sizeof(*o), GFP_KERNEL);
4867 + up_write(&_origins_lock);
4871 + /* Initialise the struct */
4872 + INIT_LIST_HEAD(&o->snapshots);
4875 + __insert_origin(o);
4878 + list_add_tail(&snap->list, &o->snapshots);
4880 + up_write(&_origins_lock);
4884 +static void unregister_snapshot(struct dm_snapshot *s)
4888 + down_write(&_origins_lock);
4889 + o = __lookup_origin(s->origin->dev);
4891 + list_del(&s->list);
4892 + if (list_empty(&o->snapshots)) {
4893 + list_del(&o->hash_list);
4897 + up_write(&_origins_lock);
4901 + * Implementation of the exception hash tables.
4903 +static int init_exception_table(struct exception_table *et, uint32_t size)
4907 + et->hash_mask = size - 1;
4908 + et->table = vcalloc(size, sizeof(struct list_head));
4912 + for (i = 0; i < size; i++)
4913 + INIT_LIST_HEAD(et->table + i);
4918 +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
4920 + struct list_head *slot, *entry, *temp;
4921 + struct exception *ex;
4924 + size = et->hash_mask + 1;
4925 + for (i = 0; i < size; i++) {
4926 + slot = et->table + i;
4928 + list_for_each_safe(entry, temp, slot) {
4929 + ex = list_entry(entry, struct exception, hash_list);
4930 + kmem_cache_free(mem, ex);
4938 + * FIXME: check how this hash fn is performing.
4940 +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
4942 + return chunk & et->hash_mask;
4945 +static void insert_exception(struct exception_table *eh, struct exception *e)
4947 + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
4948 + list_add(&e->hash_list, l);
4951 +static inline void remove_exception(struct exception *e)
4953 + list_del(&e->hash_list);
4957 + * Return the exception data for a sector, or NULL if not
4960 +static struct exception *lookup_exception(struct exception_table *et,
4963 + struct list_head *slot, *el;
4964 + struct exception *e;
4966 + slot = &et->table[exception_hash(et, chunk)];
4967 + list_for_each(el, slot) {
4968 + e = list_entry(el, struct exception, hash_list);
4969 + if (e->old_chunk == chunk)
4976 +static inline struct exception *alloc_exception(void)
4978 + struct exception *e;
4980 + e = kmem_cache_alloc(exception_cache, GFP_NOIO);
4982 + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
4987 +static inline void free_exception(struct exception *e)
4989 + kmem_cache_free(exception_cache, e);
4992 +static inline struct pending_exception *alloc_pending_exception(void)
4994 + return mempool_alloc(pending_pool, GFP_NOIO);
4997 +static inline void free_pending_exception(struct pending_exception *pe)
4999 + mempool_free(pe, pending_pool);
5002 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
5004 + struct exception *e;
5006 + e = alloc_exception();
5010 + e->old_chunk = old;
5011 + e->new_chunk = new;
5012 + insert_exception(&s->complete, e);
5017 + * Hard coded magic.
5019 +static int calc_max_buckets(void)
5021 + unsigned long mem;
5023 + mem = num_physpages << PAGE_SHIFT;
5025 + mem /= sizeof(struct list_head);
5031 + * Rounds a number down to a power of 2.
5033 +static inline uint32_t round_down(uint32_t n)
5035 + while (n & (n - 1))
5041 + * Allocate room for a suitable hash table.
5043 +static int init_hash_tables(struct dm_snapshot *s)
5045 + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
5048 + * Calculate based on the size of the original volume or
5049 + * the COW volume...
5051 + cow_dev_size = get_dev_size(s->cow->dev);
5052 + origin_dev_size = get_dev_size(s->origin->dev);
5053 + max_buckets = calc_max_buckets();
5055 + hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
5056 + hash_size = min(hash_size, max_buckets);
5058 + /* Round it down to a power of 2 */
5059 + hash_size = round_down(hash_size);
5060 + if (init_exception_table(&s->complete, hash_size))
5064 + * Allocate hash table for in-flight exceptions
5065 + * Make this smaller than the real hash table
5071 + if (init_exception_table(&s->pending, hash_size)) {
5072 + exit_exception_table(&s->complete, exception_cache);
5080 + * Round a number up to the nearest 'size' boundary. size must
5081 + * be a power of 2.
5083 +static inline ulong round_up(ulong n, ulong size)
5086 + return (n + size) & ~size;
5090 + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
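A hypothetical dmsetup-style table line for such a mapping (assuming the target registers under the name "snapshot"; the devices and length are made up) might be:

    0 409600 snapshot /dev/hda1 /dev/hda2 P 16

i.e. the origin device, the COW device, 'P' for a persistent (or 'N' for a non-persistent) exception store, and a 16-sector (8KiB) chunk size, which satisfies the power-of-two and page-multiple checks the constructor performs below.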
5092 +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5094 + struct dm_snapshot *s;
5095 + unsigned long chunk_size;
5098 + char *origin_path;
5104 + ti->error = "dm-snapshot: requires exactly 4 arguments";
5109 + origin_path = argv[0];
5110 + cow_path = argv[1];
5111 + persistent = toupper(*argv[2]);
5113 + if (persistent != 'P' && persistent != 'N') {
5114 + ti->error = "Persistent flag is not P or N";
5119 + chunk_size = simple_strtoul(argv[3], &value, 10);
5120 + if (chunk_size == 0 || value == NULL) {
5121 + ti->error = "Invalid chunk size";
5126 + s = kmalloc(sizeof(*s), GFP_KERNEL);
5128 + ti->error = "Cannot allocate snapshot context private "
5134 + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
5136 + ti->error = "Cannot get origin device";
5140 + /* FIXME: get cow length */
5141 + r = dm_get_device(ti, cow_path, 0, 0,
5142 + FMODE_READ | FMODE_WRITE, &s->cow);
5144 + dm_put_device(ti, s->origin);
5145 + ti->error = "Cannot get COW device";
5150 + * Chunk size must be a multiple of page size. Silently
5151 + * round up if it's not.
5153 + chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
5155 + /* Validate the chunk size against the device block size */
5156 + blocksize = get_hardsect_size(s->cow->dev);
5157 + if (chunk_size % (blocksize / SECTOR_SIZE)) {
5158 + ti->error = "Chunk size is not a multiple of device blocksize";
5163 + /* Check the sizes are small enough to fit in one kiovec */
5164 + if (chunk_size > KIO_MAX_SECTORS) {
5165 + ti->error = "Chunk size is too big";
5170 + /* Check chunk_size is a power of 2 */
5171 + if (chunk_size & (chunk_size - 1)) {
5172 + ti->error = "Chunk size is not a power of 2";
5177 + s->chunk_size = chunk_size;
5178 + s->chunk_mask = chunk_size - 1;
5179 + s->type = persistent;
5180 + for (s->chunk_shift = 0; chunk_size;
5181 + s->chunk_shift++, chunk_size >>= 1)
5186 + s->have_metadata = 0;
5187 + s->last_percent = 0;
5188 + init_rwsem(&s->lock);
5189 + s->table = ti->table;
5191 + /* Allocate hash table for COW data */
5192 + if (init_hash_tables(s)) {
5193 + ti->error = "Unable to allocate hash table space";
5199 + * Check the persistent flag - done here because we need the iobuf
5200 + * to check the LV header
5202 + s->store.snap = s;
5204 + if (persistent == 'P')
5205 + r = dm_create_persistent(&s->store, s->chunk_size);
5207 + r = dm_create_transient(&s->store, s, blocksize);
5210 + ti->error = "Couldn't create exception store";
5215 + r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
5217 + ti->error = "Could not create kcopyd client";
5221 + /* Flush IO to the origin device */
5222 + fsync_dev(s->origin->dev);
5224 + /* Add snapshot to the list of snapshots for this origin */
5225 + if (register_snapshot(s)) {
5227 + ti->error = "Cannot register snapshot origin";
5235 + kcopyd_client_destroy(s->kcopyd_client);
5238 + s->store.destroy(&s->store);
5241 + exit_exception_table(&s->pending, pending_cache);
5242 + exit_exception_table(&s->complete, exception_cache);
5245 + dm_put_device(ti, s->cow);
5246 + dm_put_device(ti, s->origin);
5255 +static void snapshot_dtr(struct dm_target *ti)
5257 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5259 + dm_table_event(ti->table);
5261 + unregister_snapshot(s);
5263 + exit_exception_table(&s->pending, pending_cache);
5264 + exit_exception_table(&s->complete, exception_cache);
5266 + /* Deallocate memory used */
5267 + s->store.destroy(&s->store);
5269 + dm_put_device(ti, s->origin);
5270 + dm_put_device(ti, s->cow);
5271 + kcopyd_client_destroy(s->kcopyd_client);
5276 + * We hold lists of buffer_heads, using the b_reqnext field.
5278 +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
5280 + bh->b_reqnext = *queue;
5285 + * FIXME: inefficient.
5287 +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
5290 + queue = &((*queue)->b_reqnext);
5296 + * Flush a list of buffers.
5298 +static void flush_buffers(struct buffer_head *bh)
5300 + struct buffer_head *n;
5302 + DMDEBUG("begin flush");
5304 + n = bh->b_reqnext;
5305 + bh->b_reqnext = NULL;
5306 + DMDEBUG("flushing %p", bh);
5307 + generic_make_request(WRITE, bh);
5311 + run_task_queue(&tq_disk);
5315 + * Error a list of buffers.
5317 +static void error_buffers(struct buffer_head *bh)
5319 + struct buffer_head *n;
5322 + n = bh->b_reqnext;
5323 + bh->b_reqnext = NULL;
5324 + buffer_IO_error(bh);
5329 +static struct buffer_head *__flush_bhs(struct pending_exception *pe)
5331 + struct pending_exception *sibling;
5333 + if (list_empty(&pe->siblings))
5334 + return pe->origin_bhs;
5336 + sibling = list_entry(pe->siblings.next,
5337 + struct pending_exception, siblings);
5339 + list_del(&pe->siblings);
5341 + /* FIXME: I think there's a race on SMP machines here, add spin lock */
5342 + queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
5347 +static void pending_complete(struct pending_exception *pe, int success)
5349 + struct exception *e;
5350 + struct dm_snapshot *s = pe->snap;
5351 + struct buffer_head *flush = NULL;
5354 + e = alloc_exception();
5356 + DMWARN("Unable to allocate exception.");
5357 + down_write(&s->lock);
5358 + s->store.drop_snapshot(&s->store);
5360 + flush = __flush_bhs(pe);
5361 + up_write(&s->lock);
5363 + error_buffers(pe->snapshot_bhs);
5368 + * Add a proper exception, and remove the
5369 + * in-flight exception from the list.
5371 + down_write(&s->lock);
5373 + memcpy(e, &pe->e, sizeof(*e));
5374 + insert_exception(&s->complete, e);
5375 + remove_exception(&pe->e);
5376 + flush = __flush_bhs(pe);
5378 + /* Submit any pending write BHs */
5379 + up_write(&s->lock);
5381 + flush_buffers(pe->snapshot_bhs);
5382 + DMDEBUG("Exception completed successfully.");
5384 + /* Notify any interested parties */
5385 + if (s->store.fraction_full) {
5386 + sector_t numerator, denominator;
5389 + s->store.fraction_full(&s->store, &numerator,
5391 + pc = numerator * 100 / denominator;
5393 + if (pc >= s->last_percent + WAKE_UP_PERCENT) {
5394 + dm_table_event(s->table);
5395 + s->last_percent = pc - pc % WAKE_UP_PERCENT;
5400 + /* Read/write error - snapshot is unusable */
5401 + down_write(&s->lock);
5403 + DMERR("Error reading/writing snapshot");
5404 + s->store.drop_snapshot(&s->store);
5406 + remove_exception(&pe->e);
5407 + flush = __flush_bhs(pe);
5408 + up_write(&s->lock);
5410 + error_buffers(pe->snapshot_bhs);
5412 + dm_table_event(s->table);
5413 + DMDEBUG("Exception failed.");
5418 + flush_buffers(flush);
5420 + free_pending_exception(pe);
5423 +static void commit_callback(void *context, int success)
5425 + struct pending_exception *pe = (struct pending_exception *) context;
5426 + pending_complete(pe, success);
5430 + * Called when the copy I/O has finished. kcopyd actually runs
5431 + * this code so don't block.
5433 +static void copy_callback(int read_err, unsigned int write_err, void *context)
5435 + struct pending_exception *pe = (struct pending_exception *) context;
5436 + struct dm_snapshot *s = pe->snap;
5438 + if (read_err || write_err)
5439 + pending_complete(pe, 0);
5442 + /* Update the metadata if we are persistent */
5443 + s->store.commit_exception(&s->store, &pe->e, commit_callback,
5448 + * Dispatches the copy operation to kcopyd.
5450 +static inline void start_copy(struct pending_exception *pe)
5452 + struct dm_snapshot *s = pe->snap;
5453 + struct io_region src, dest;
5454 + kdev_t dev = s->origin->dev;
5455 + int *sizes = blk_size[major(dev)];
5456 + sector_t dev_size = (sector_t) -1;
5461 + /* this is protected by snap->lock */
5464 + if (sizes && sizes[minor(dev)])
5465 + dev_size = sizes[minor(dev)] << 1;
5468 + src.sector = chunk_to_sector(s, pe->e.old_chunk);
5469 + src.count = min(s->chunk_size, dev_size - src.sector);
5471 + dest.dev = s->cow->dev;
5472 + dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5473 + dest.count = src.count;
5475 + /* Hand over to kcopyd */
5476 + kcopyd_copy(s->kcopyd_client,
5477 + &src, 1, &dest, 0, copy_callback, pe);
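/*
 * Write-path sketch (a summary of the code above, not new behaviour):
 * the first write to a chunk goes
 *
 *   snapshot_map() / __origin_write()
 *     -> find_pending_exception()  (prepare_exception() picks a COW chunk)
 *     -> start_copy()              (kcopyd copies the origin chunk)
 *       -> copy_callback()
 *         -> store.commit_exception()  (updates metadata if persistent)
 *           -> commit_callback() -> pending_complete()
 *              (the exception moves to the 'complete' table and the
 *               queued buffer_heads are flushed or errored)
 */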
5481 + * Looks to see if this snapshot already has a pending exception
5482 + * for this chunk, otherwise it allocates a new one and inserts
5483 + * it into the pending table.
5485 +static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5486 + struct buffer_head *bh)
5488 + struct exception *e;
5489 + struct pending_exception *pe;
5490 + chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5493 + * Is there a pending exception for this already ?
5495 + e = lookup_exception(&s->pending, chunk);
5497 + /* cast the exception to a pending exception */
5498 + pe = list_entry(e, struct pending_exception, e);
5501 + /* Create a new pending exception */
5502 + pe = alloc_pending_exception();
5503 + pe->e.old_chunk = chunk;
5504 + pe->origin_bhs = pe->snapshot_bhs = NULL;
5505 + INIT_LIST_HEAD(&pe->siblings);
5509 + if (s->store.prepare_exception(&s->store, &pe->e)) {
5510 + free_pending_exception(pe);
5515 + insert_exception(&s->pending, &pe->e);
5521 +static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5522 + struct buffer_head *bh)
5524 + bh->b_rdev = s->cow->dev;
5525 + bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5526 + (bh->b_rsector & s->chunk_mask);
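/*
 * Worked example (illustrative parameters): with chunk_size = 16
 * sectors (chunk_mask = 15, chunk_shift = 4), a request for origin
 * sector 1000 whose chunk has been remapped to new_chunk = 42 is
 * redirected to the COW device at sector
 * (42 << 4) + (1000 & 15) = 672 + 8 = 680.
 */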
5529 +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5530 + union map_info *map_context)
5532 + struct exception *e;
5533 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5536 + struct pending_exception *pe;
5538 + chunk = sector_to_chunk(s, bh->b_rsector);
5540 + /* Full snapshots are not usable */
5545 + * Write to snapshot - higher level takes care of RW/RO
5546 + * flags so we should only get this if we are
5549 + if (rw == WRITE) {
5551 + down_write(&s->lock);
5553 + /* If the block is already remapped - use that, else remap it */
5554 + e = lookup_exception(&s->complete, chunk);
5556 + remap_exception(s, e, bh);
5559 + pe = find_pending_exception(s, bh);
5562 + s->store.drop_snapshot(&s->store);
5566 + remap_exception(s, &pe->e, bh);
5567 + queue_buffer(&pe->snapshot_bhs, bh);
5573 + up_write(&s->lock);
5577 + * FIXME: this read path scares me because we
5578 + * always use the origin when we have a pending
5579 + * exception. However I can't think of a
5580 + * situation where this is wrong - ejt.
5584 + down_read(&s->lock);
5586 + /* See if it has been remapped */
5587 + e = lookup_exception(&s->complete, chunk);
5589 + remap_exception(s, e, bh);
5591 + bh->b_rdev = s->origin->dev;
5593 + up_read(&s->lock);
5599 +void snapshot_resume(struct dm_target *ti)
5601 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5603 + if (s->have_metadata)
5606 + if (s->store.read_metadata(&s->store)) {
5607 + down_write(&s->lock);
5609 + up_write(&s->lock);
5612 + s->have_metadata = 1;
5615 +static int snapshot_status(struct dm_target *ti, status_type_t type,
5616 + char *result, unsigned int maxlen)
5618 + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5623 + case STATUSTYPE_INFO:
5625 + snprintf(result, maxlen, "Invalid");
5627 + if (snap->store.fraction_full) {
5628 + sector_t numerator, denominator;
5629 + snap->store.fraction_full(&snap->store,
5632 + snprintf(result, maxlen,
5633 + SECTOR_FORMAT "/" SECTOR_FORMAT,
5634 + numerator, denominator);
5637 + snprintf(result, maxlen, "Unknown");
5641 + case STATUSTYPE_TABLE:
5643 + * dm_kdevname returns a static pointer so we need
5644 + * to make private copies if the output is to
5647 + strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5648 + strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5649 + snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5650 + snap->type, snap->chunk_size);
5657 +/*-----------------------------------------------------------------
5659 + *---------------------------------------------------------------*/
5660 +static void list_merge(struct list_head *l1, struct list_head *l2)
5662 + struct list_head *l1_n, *l2_p;
5670 + l2_p->next = l1_n;
5671 + l1_n->prev = l2_p;
5674 +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5676 + int r = 1, first = 1;
5677 + struct list_head *sl;
5678 + struct dm_snapshot *snap;
5679 + struct exception *e;
5680 + struct pending_exception *pe, *last = NULL;
5683 + /* Do all the snapshots on this origin */
5684 + list_for_each(sl, snapshots) {
5685 + snap = list_entry(sl, struct dm_snapshot, list);
5687 + /* Only deal with valid snapshots */
5691 + down_write(&snap->lock);
5694 + * Remember, different snapshots can have
5695 + * different chunk sizes.
5697 + chunk = sector_to_chunk(snap, bh->b_rsector);
5700 + * Check exception table to see if block
5701 + * is already remapped in this snapshot
5702 + * and trigger an exception if not.
5704 + e = lookup_exception(&snap->complete, chunk);
5706 + pe = find_pending_exception(snap, bh);
5708 + snap->store.drop_snapshot(&snap->store);
5713 + list_merge(&pe->siblings,
5721 + up_write(&snap->lock);
5725 + * Now that we have a complete pe list we can start the copying.
5730 + down_write(&pe->snap->lock);
5732 + queue_buffer(&pe->origin_bhs, bh);
5734 + up_write(&pe->snap->lock);
5736 + pe = list_entry(pe->siblings.next,
5737 + struct pending_exception, siblings);
5739 + } while (pe != last);
5746 + * Called on a write from the origin driver.
5748 +int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5753 + down_read(&_origins_lock);
5754 + o = __lookup_origin(origin->dev);
5758 + r = __origin_write(&o->snapshots, bh);
5759 + up_read(&_origins_lock);
5765 + * Origin: maps a linear range of a device, with hooks for snapshotting.
5769 + * Construct an origin mapping: <dev_path>
5770 + * The context for an origin is merely a 'struct dm_dev *'
5771 + * pointing to the real device.
5773 +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5776 + struct dm_dev *dev;
5779 + ti->error = "dm-origin: incorrect number of arguments";
5783 + r = dm_get_device(ti, argv[0], 0, ti->len,
5784 + dm_table_get_mode(ti->table), &dev);
5786 + ti->error = "Cannot get target device";
5790 + ti->private = dev;
5794 +static void origin_dtr(struct dm_target *ti)
5796 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5797 + dm_put_device(ti, dev);
5800 +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5801 + union map_info *map_context)
5803 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5804 + bh->b_rdev = dev->dev;
5806 + /* Only tell snapshots if this is a write */
5807 + return (rw == WRITE) ? do_origin(dev, bh) : 1;
5810 +static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5811 + unsigned int maxlen)
5813 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5816 + case STATUSTYPE_INFO:
5820 + case STATUSTYPE_TABLE:
5821 + snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5828 +static struct target_type origin_target = {
5829 + .name = "snapshot-origin",
5830 + .module = THIS_MODULE,
5834 + .status = origin_status,
5837 +static struct target_type snapshot_target = {
5839 + .module = THIS_MODULE,
5840 + .ctr = snapshot_ctr,
5841 + .dtr = snapshot_dtr,
5842 + .map = snapshot_map,
5843 + .resume = snapshot_resume,
5844 + .status = snapshot_status,
5847 +int __init dm_snapshot_init(void)
5851 + r = dm_register_target(&snapshot_target);
5853 + DMERR("snapshot target register failed %d", r);
5857 + r = dm_register_target(&origin_target);
5859 + DMERR("Device mapper: Origin: register failed %d\n", r);
5863 + r = init_origin_hash();
5865 + DMERR("init_origin_hash failed.");
5869 + exception_cache = kmem_cache_create("dm-snapshot-ex",
5870 + sizeof(struct exception),
5871 + __alignof__(struct exception),
5873 + if (!exception_cache) {
5874 + DMERR("Couldn't create exception cache.");
5880 + kmem_cache_create("dm-snapshot-in",
5881 + sizeof(struct pending_exception),
5882 + __alignof__(struct pending_exception),
5884 + if (!pending_cache) {
5885 + DMERR("Couldn't create pending cache.");
5890 + pending_pool = mempool_create(128, mempool_alloc_slab,
5891 + mempool_free_slab, pending_cache);
5892 + if (!pending_pool) {
5893 + DMERR("Couldn't create pending pool.");
5901 + kmem_cache_destroy(pending_cache);
5903 + kmem_cache_destroy(exception_cache);
5905 + exit_origin_hash();
5907 + dm_unregister_target(&origin_target);
5909 + dm_unregister_target(&snapshot_target);
5913 +void dm_snapshot_exit(void)
5917 + r = dm_unregister_target(&snapshot_target);
5919 + DMERR("snapshot unregister failed %d", r);
5921 + r = dm_unregister_target(&origin_target);
5923 + DMERR("origin unregister failed %d", r);
5925 + exit_origin_hash();
5926 + mempool_destroy(pending_pool);
5927 + kmem_cache_destroy(pending_cache);
5928 + kmem_cache_destroy(exception_cache);
5930 --- linux-2.4.21/drivers/md/dm-snapshot.h Thu Jan 1 01:00:00 1970
5931 +++ linux/drivers/md/dm-snapshot.h Wed Aug 20 14:41:38 2003
5936 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5938 + * This file is released under the GPL.
5941 +#ifndef DM_SNAPSHOT_H
5942 +#define DM_SNAPSHOT_H
5945 +#include <linux/blkdev.h>
5947 +struct exception_table {
5948 + uint32_t hash_mask;
5949 + struct list_head *table;
5953 + * The snapshot code deals with largish chunks of the disk at a
5954 + * time. Typically 64k - 256k.
5956 +/* FIXME: can we get away with limiting these to a uint32_t ? */
5957 +typedef sector_t chunk_t;
5960 + * An exception is used where an old chunk of data has been
5961 + * replaced by a new one.
5964 + struct list_head hash_list;
5966 + chunk_t old_chunk;
5967 + chunk_t new_chunk;
5971 + * Abstraction to handle the meta/layout of exception stores (the
5974 +struct exception_store {
5977 + * Destroys this object when you've finished with it.
5979 + void (*destroy) (struct exception_store *store);
5982 + * The target shouldn't read the COW device until this is
5985 + int (*read_metadata) (struct exception_store *store);
5988 + * Find somewhere to store the next exception.
5990 + int (*prepare_exception) (struct exception_store *store,
5991 + struct exception *e);
5994 + * Update the metadata with this exception.
5996 + void (*commit_exception) (struct exception_store *store,
5997 + struct exception *e,
5998 + void (*callback) (void *, int success),
5999 + void *callback_context);
6002 + * The snapshot is invalid; note this in the metadata.
6004 + void (*drop_snapshot) (struct exception_store *store);
6007 + * Return how full the snapshot is.
6009 + void (*fraction_full) (struct exception_store *store,
6010 + sector_t *numerator,
6011 + sector_t *denominator);
6013 + struct dm_snapshot *snap;
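/*
 * For reference, where dm-snapshot.c drives these hooks: destroy()
 * from snapshot_dtr() and the ctr error path, read_metadata() from
 * snapshot_resume(), prepare_exception() from find_pending_exception(),
 * commit_exception() from copy_callback(), drop_snapshot() wherever a
 * store operation fails and the snapshot is invalidated, and
 * fraction_full() from snapshot_status() and pending_complete().
 */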
6017 +struct dm_snapshot {
6018 + struct rw_semaphore lock;
6019 + struct dm_table *table;
6021 + struct dm_dev *origin;
6022 + struct dm_dev *cow;
6024 + /* List of snapshots per Origin */
6025 + struct list_head list;
6027 + /* Size of data blocks saved - must be a power of 2 */
6028 + chunk_t chunk_size;
6029 + chunk_t chunk_mask;
6030 + chunk_t chunk_shift;
6032 + /* You can't use a snapshot if this is 0 (e.g. if full) */
6034 + int have_metadata;
6036 + /* Used for display of table */
6039 + /* The last percentage we notified */
6042 + struct exception_table pending;
6043 + struct exception_table complete;
6045 + /* The on disk metadata handler */
6046 + struct exception_store store;
6048 + struct kcopyd_client *kcopyd_client;
6052 + * Used by the exception stores to load exceptions when
6055 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
6058 + * Constructor and destructor for the default persistent
6061 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
6063 +int dm_create_transient(struct exception_store *store,
6064 + struct dm_snapshot *s, int blocksize);
6067 + * Return the number of sectors in the device.
6069 +static inline sector_t get_dev_size(kdev_t dev)
6073 + sizes = blk_size[MAJOR(dev)];
6075 + return sizes[MINOR(dev)] << 1;
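/*
 * blk_size[] holds device sizes in 1 KiB blocks (the 2.4 convention),
 * so the shift by one converts to 512-byte sectors; e.g. a 4 GiB
 * origin shows up as 4194304 KiB and get_dev_size() returns 8388608.
 */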
6080 +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
6082 + return (sector & ~s->chunk_mask) >> s->chunk_shift;
6085 +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
6087 + return chunk << s->chunk_shift;
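/*
 * Worked example (illustrative parameters): with chunk_size = 16,
 * chunk_mask = 15 and chunk_shift = 4,
 * sector_to_chunk(s, 1000) = (1000 & ~15) >> 4 = 62 and
 * chunk_to_sector(s, 62) = 62 << 4 = 992, the first sector of the
 * chunk containing sector 1000.
 */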
6091 --- linux-2.4.21/drivers/md/dm-stripe.c Thu Jan 1 01:00:00 1970
6092 +++ linux/drivers/md/dm-stripe.c Wed Aug 20 14:41:38 2003
6095 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6097 + * This file is released under the GPL.
6102 +#include <linux/module.h>
6103 +#include <linux/init.h>
6104 +#include <linux/blkdev.h>
6105 +#include <linux/slab.h>
6108 + struct dm_dev *dev;
6109 + sector_t physical_start;
6115 + /* The size of this target / num. stripes */
6116 + uint32_t stripe_width;
6118 + /* stripe chunk size */
6119 + uint32_t chunk_shift;
6120 + sector_t chunk_mask;
6122 + struct stripe stripe[0];
6125 +static inline struct stripe_c *alloc_context(unsigned int stripes)
6129 + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
6133 + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
6135 + return kmalloc(len, GFP_KERNEL);
6139 + * Parse a single <dev> <sector> pair
6141 +static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
6142 + unsigned int stripe, char **argv)
6146 + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
6149 + if (dm_get_device(ti, argv[0], start, sc->stripe_width,
6150 + dm_table_get_mode(ti->table),
6151 + &sc->stripe[stripe].dev))
6154 + sc->stripe[stripe].physical_start = start;
6159 + * FIXME: Nasty function, only present because we can't link
6160 + * against __moddi3 and __divdi3.
6162 + * returns whether a == b * n (i.e. a is an exact multiple of b)
6164 +static int multiple(sector_t a, sector_t b, sector_t *n)
6166 + sector_t acc, prev, i;
6170 + for (acc = b, prev = 0, i = 1;
6172 + prev = acc, acc <<= 1, i <<= 1)
6183 + * Construct a striped mapping.
6184 + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
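/*
 * Example (illustrative only; the device names are made up): a two-way
 * stripe over two 1 GiB devices with a 64-sector (32 KiB) chunk:
 *
 *   0 4194304 striped 2 64 /dev/sdb1 0 /dev/sdc1 0
 *
 * giving stripe_ctr() argc == 6: "2", "64", then one <dev> <offset>
 * pair per stripe.
 */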
6186 +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
6188 + struct stripe_c *sc;
6191 + uint32_t chunk_size;
6197 + ti->error = "dm-stripe: Not enough arguments";
6201 + stripes = simple_strtoul(argv[0], &end, 10);
6203 + ti->error = "dm-stripe: Invalid stripe count";
6207 + chunk_size = simple_strtoul(argv[1], &end, 10);
6209 + ti->error = "dm-stripe: Invalid chunk_size";
6214 + * chunk_size is a power of two
6216 + if (!chunk_size || (chunk_size & (chunk_size - 1))) {
6217 + ti->error = "dm-stripe: Invalid chunk size";
6221 + if (!multiple(ti->len, stripes, &width)) {
6222 + ti->error = "dm-stripe: Target length not divisable by "
6223 + "number of stripes";
6228 + * Do we have enough arguments for that many stripes ?
6230 + if (argc != (2 + 2 * stripes)) {
6231 + ti->error = "dm-stripe: Not enough destinations specified";
6235 + sc = alloc_context(stripes);
6237 + ti->error = "dm-stripe: Memory allocation for striped context "
6242 + sc->stripes = stripes;
6243 + sc->stripe_width = width;
6245 + sc->chunk_mask = ((sector_t) chunk_size) - 1;
6246 + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
6248 + sc->chunk_shift--;
6251 + * Get the stripe destinations.
6253 + for (i = 0; i < stripes; i++) {
6256 + r = get_stripe(ti, sc, i, argv);
6258 + ti->error = "dm-stripe: Couldn't parse stripe "
6261 + dm_put_device(ti, sc->stripe[i].dev);
6271 +static void stripe_dtr(struct dm_target *ti)
6274 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6276 + for (i = 0; i < sc->stripes; i++)
6277 + dm_put_device(ti, sc->stripe[i].dev);
6282 +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6283 + union map_info *context)
6285 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6287 + sector_t offset = bh->b_rsector - ti->begin;
6288 + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
6289 + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
6290 + chunk = chunk / sc->stripes;
6292 + bh->b_rdev = sc->stripe[stripe].dev->dev;
6293 + bh->b_rsector = sc->stripe[stripe].physical_start +
6294 + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
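/*
 * Worked example (illustrative parameters): 3 stripes, chunk_shift = 3
 * (8-sector chunks), offset = 100 relative to the target:
 *   chunk = 100 >> 3 = 12, stripe = 12 % 3 = 0, chunk /= 3 -> 4,
 *   rsector = physical_start[0] + (4 << 3) + (100 & 7)
 *           = physical_start[0] + 36.
 */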
6298 +static int stripe_status(struct dm_target *ti, status_type_t type,
6299 + char *result, unsigned int maxlen)
6301 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6306 + case STATUSTYPE_INFO:
6310 + case STATUSTYPE_TABLE:
6311 + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
6312 + sc->stripes, sc->chunk_mask + 1);
6313 + for (i = 0; i < sc->stripes; i++) {
6315 + snprintf(result + offset, maxlen - offset,
6316 + " %s " SECTOR_FORMAT,
6317 + dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
6318 + sc->stripe[i].physical_start);
6325 +static struct target_type stripe_target = {
6326 + .name = "striped",
6327 + .module = THIS_MODULE,
6328 + .ctr = stripe_ctr,
6329 + .dtr = stripe_dtr,
6330 + .map = stripe_map,
6331 + .status = stripe_status,
6334 +int __init dm_stripe_init(void)
6338 + r = dm_register_target(&stripe_target);
6340 + DMWARN("striped target registration failed");
6345 +void dm_stripe_exit(void)
6347 + if (dm_unregister_target(&stripe_target))
6348 + DMWARN("striped target unregistration failed");
6352 --- linux-2.4.21/drivers/md/dm-table.c Thu Jan 1 01:00:00 1970
6353 +++ linux/drivers/md/dm-table.c Wed Aug 20 14:41:38 2003
6356 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6358 + * This file is released under the GPL.
6363 +#include <linux/module.h>
6364 +#include <linux/vmalloc.h>
6365 +#include <linux/blkdev.h>
6366 +#include <linux/ctype.h>
6367 +#include <linux/slab.h>
6368 +#include <asm/atomic.h>
6370 +#define MAX_DEPTH 16
6371 +#define NODE_SIZE L1_CACHE_BYTES
6372 +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
6373 +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
6374 +#define MAX_TARGET_ARGS 64
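/*
 * Illustrative sizing (assuming a 64-byte L1 cache line and an 8-byte
 * sector_t, so KEYS_PER_NODE = 8 and CHILDREN_PER_NODE = 9): a table
 * with 100 targets needs ceil(100 / 8) = 13 leaf nodes and a btree of
 * depth 1 + ceil(log9(13)) = 3.
 */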
6380 + unsigned int depth;
6381 + unsigned int counts[MAX_DEPTH]; /* in nodes */
6382 + sector_t *index[MAX_DEPTH];
6384 + unsigned int num_targets;
6385 + unsigned int num_allocated;
6387 + struct dm_target *targets;
6390 + * Indicates the rw permissions for the new logical
6391 + * device. This should be a combination of FMODE_READ
6392 + * and FMODE_WRITE.
6396 + /* a list of devices used by this table */
6397 + struct list_head devices;
6399 + /* events get handed up using this callback */
6400 + void (*event_fn)(void *);
6401 + void *event_context;
6405 + * Similar to ceiling(log_base(n))
6407 +static unsigned int int_log(unsigned long n, unsigned long base)
6412 + n = dm_div_up(n, base);
6420 + * Calculate the index of the child node of the n'th node's k'th key.
6422 +static inline unsigned int get_child(unsigned int n, unsigned int k)
6424 + return (n * CHILDREN_PER_NODE) + k;
6428 + * Return the n'th node of level l from table t.
6430 +static inline sector_t *get_node(struct dm_table *t, unsigned int l,
6433 + return t->index[l] + (n * KEYS_PER_NODE);
6437 + * Return the highest key that you could look up from the n'th
6438 + * node on level l of the btree.
6440 +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
6442 + for (; l < t->depth - 1; l++)
6443 + n = get_child(n, CHILDREN_PER_NODE - 1);
6445 + if (n >= t->counts[l])
6446 + return (sector_t) - 1;
6448 + return get_node(t, l, n)[KEYS_PER_NODE - 1];
6452 + * Fills in a level of the btree based on the highs of the level
6455 +static int setup_btree_index(unsigned int l, struct dm_table *t)
6457 + unsigned int n, k;
6460 + for (n = 0U; n < t->counts[l]; n++) {
6461 + node = get_node(t, l, n);
6463 + for (k = 0U; k < KEYS_PER_NODE; k++)
6464 + node[k] = high(t, l + 1, get_child(n, k));
6471 + * highs, and targets are managed as dynamic arrays during a
6474 +static int alloc_targets(struct dm_table *t, unsigned int num)
6476 + sector_t *n_highs;
6477 + struct dm_target *n_targets;
6478 + int n = t->num_targets;
6481 + * Allocate both the target array and offset array at once.
6483 + n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
6484 + sizeof(sector_t), num);
6488 + n_targets = (struct dm_target *) (n_highs + num);
6491 + memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
6492 + memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
6495 + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
6498 + t->num_allocated = num;
6499 + t->highs = n_highs;
6500 + t->targets = n_targets;
6505 +int dm_table_create(struct dm_table **result, int mode)
6507 + struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
6512 + memset(t, 0, sizeof(*t));
6513 + INIT_LIST_HEAD(&t->devices);
6514 + atomic_set(&t->holders, 1);
6516 + /* allocate a single node's worth of targets to begin with */
6517 + if (alloc_targets(t, KEYS_PER_NODE)) {
6528 +static void free_devices(struct list_head *devices)
6530 + struct list_head *tmp, *next;
6532 + for (tmp = devices->next; tmp != devices; tmp = next) {
6533 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6539 +void table_destroy(struct dm_table *t)
6543 + /* free the indexes (see dm_table_complete) */
6544 + if (t->depth >= 2)
6545 + vfree(t->index[t->depth - 2]);
6547 + /* free the targets */
6548 + for (i = 0; i < t->num_targets; i++) {
6549 + struct dm_target *tgt = t->targets + i;
6551 + if (tgt->type->dtr)
6552 + tgt->type->dtr(tgt);
6554 + dm_put_target_type(tgt->type);
6559 + /* free the device list */
6560 + if (t->devices.next != &t->devices) {
6561 + DMWARN("devices still present during destroy: "
6562 + "dm_table_remove_device calls missing");
6564 + free_devices(&t->devices);
6570 +void dm_table_get(struct dm_table *t)
6572 + atomic_inc(&t->holders);
6575 +void dm_table_put(struct dm_table *t)
6577 + if (atomic_dec_and_test(&t->holders))
6582 + * Checks to see if we need to extend highs or targets.
6584 +static inline int check_space(struct dm_table *t)
6586 + if (t->num_targets >= t->num_allocated)
6587 + return alloc_targets(t, t->num_allocated * 2);
6593 + * Convert a device path to a kdev_t.
6595 +static int lookup_device(const char *path, kdev_t *dev)
6598 + struct nameidata nd;
6599 + struct inode *inode;
6601 + if (!path_init(path, LOOKUP_FOLLOW, &nd))
6604 + if ((r = path_walk(path, &nd)))
6607 + inode = nd.dentry->d_inode;
6613 + if (!S_ISBLK(inode->i_mode)) {
6618 + *dev = inode->i_rdev;
6621 + path_release(&nd);
6626 + * See if we've already got a device in the list.
6628 +static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6630 + struct list_head *tmp;
6632 + list_for_each(tmp, l) {
6633 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6634 + if (kdev_same(dd->dev, dev))
6642 + * Open a device so we can use it as a map destination.
6644 +static int open_dev(struct dm_dev *dd)
6649 + dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6653 + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6657 + * Close a device that we've been using.
6659 +static void close_dev(struct dm_dev *dd)
6664 + blkdev_put(dd->bdev, BDEV_RAW);
6669 + * If possible (i.e. blk_size[major] is set), this checks that an area
6670 + * of a destination device is valid.
6672 +static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6675 + sector_t dev_size;
6677 + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6678 + /* we don't know the device details,
6679 + * so give the benefit of the doubt */
6682 + /* convert to 512-byte sectors */
6685 + return ((start < dev_size) && (len <= (dev_size - start)));
6689 + * This upgrades the mode on an already open dm_dev, being
6690 + * careful to leave things as they were if we fail to reopen the
6693 +static int upgrade_mode(struct dm_dev *dd, int new_mode)
6696 + struct dm_dev dd_copy;
6698 + memcpy(&dd_copy, dd, sizeof(dd_copy));
6700 + dd->mode |= new_mode;
6704 + close_dev(&dd_copy);
6706 + memcpy(dd, &dd_copy, sizeof(dd_copy));
6712 + * Add a device to the list, or just increment the usage count if
6713 + * it's already present.
6715 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6716 + sector_t len, int mode, struct dm_dev **result)
6720 + struct dm_dev *dd;
6721 + unsigned major, minor;
6722 + struct dm_table *t = ti->table;
6727 + if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6728 + /* Extract the major/minor numbers */
6729 + dev = mk_kdev(major, minor);
6731 + /* convert the path to a device */
6732 + if ((r = lookup_device(path, &dev)))
6736 + dd = find_device(&t->devices, dev);
6738 + dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6746 + if ((r = open_dev(dd))) {
6751 + atomic_set(&dd->count, 0);
6752 + list_add(&dd->list, &t->devices);
6754 + } else if (dd->mode != (mode | dd->mode)) {
6755 + r = upgrade_mode(dd, mode);
6759 + atomic_inc(&dd->count);
6761 + if (!check_device_area(dd->dev, start, len)) {
6762 + DMWARN("device %s too small for target", path);
6763 + dm_put_device(ti, dd);
6773 + * Decrement a device's use count and remove it if necessary.
6775 +void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6777 + if (atomic_dec_and_test(&dd->count)) {
6779 + list_del(&dd->list);
6785 + * Checks to see if the target joins onto the end of the table.
6787 +static int adjoin(struct dm_table *table, struct dm_target *ti)
6789 + struct dm_target *prev;
6791 + if (!table->num_targets)
6792 + return !ti->begin;
6794 + prev = &table->targets[table->num_targets - 1];
6795 + return (ti->begin == (prev->begin + prev->len));
6799 + * Destructively splits up the argument list to pass to ctr.
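/*
 * Example (illustrative, made-up paths): params = "/dev/hda1 0" becomes
 * argc = 2, argv = { "/dev/hda1", "0" }, while a backslash quotes the
 * next character, so "/dev/my\ disk 0" yields the single argument
 * "/dev/my disk" followed by "0". The input string is modified in place.
 */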
6801 +static int split_args(int *argc, char ***argvp, char *input)
6803 + char *start, *end = input, *out;
6805 + int max_args = MAX_TARGET_ARGS;
6808 + argv = kmalloc(sizeof(*argv) * max_args, GFP_NOIO);
6815 + /* Skip whitespace */
6816 + while (*start && isspace(*start))
6820 + break; /* success, we hit the end */
6822 + /* 'out' is used to remove any quoting backslashes */
6823 + end = out = start;
6825 + /* Everything apart from '\0' can be quoted */
6826 + if (*end == '\\' && *(end + 1)) {
6827 + *out++ = *(end + 1);
6832 + if (isspace(*end))
6833 + break; /* end of token */
6838 + /* have we already filled the array ? */
6839 + if ((*argc + 1) > max_args) {
6843 + argv2 = kmalloc(sizeof(*argv2) * max_args, GFP_NOIO);
6849 + memcpy(argv2, argv, sizeof(*argv) * *argc);
6854 + /* we know this is whitespace */
6858 + /* terminate the string and put it in the array */
6860 + argv[*argc] = start;
6868 +int dm_table_add_target(struct dm_table *t, const char *type,
6869 + sector_t start, sector_t len, char *params)
6871 + int r = -EINVAL, argc;
6873 + struct dm_target *tgt;
6875 + if ((r = check_space(t)))
6878 + tgt = t->targets + t->num_targets;
6879 + memset(tgt, 0, sizeof(*tgt));
6881 + tgt->type = dm_get_target_type(type);
6883 + tgt->error = "unknown target type";
6888 + tgt->begin = start;
6890 + tgt->error = "Unknown error";
6893 + * Does this target adjoin the previous one ?
6895 + if (!adjoin(t, tgt)) {
6896 + tgt->error = "Gap in table";
6901 + r = split_args(&argc, &argv, params);
6903 + tgt->error = "couldn't split parameters (insufficient memory)";
6907 + r = tgt->type->ctr(tgt, argc, argv);
6912 + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
6916 + printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
6917 + dm_put_target_type(tgt->type);
6921 +static int setup_indexes(struct dm_table *t)
6924 + unsigned int total = 0;
6925 + sector_t *indexes;
6927 + /* allocate the space for *all* the indexes */
6928 + for (i = t->depth - 2; i >= 0; i--) {
6929 + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
6930 + total += t->counts[i];
6933 + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
6937 + /* set up internal nodes, bottom-up */
6938 + for (i = t->depth - 2, total = 0; i >= 0; i--) {
6939 + t->index[i] = indexes;
6940 + indexes += (KEYS_PER_NODE * t->counts[i]);
6941 + setup_btree_index(i, t);
6948 + * Builds the btree to index the map.
6950 +int dm_table_complete(struct dm_table *t)
6953 + unsigned int leaf_nodes;
6955 + /* how many indexes will the btree have ? */
6956 + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
6957 + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
6959 + /* leaf layer has already been set up */
6960 + t->counts[t->depth - 1] = leaf_nodes;
6961 + t->index[t->depth - 1] = t->highs;
6963 + if (t->depth >= 2)
6964 + r = setup_indexes(t);
6969 +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
6970 +void dm_table_event_callback(struct dm_table *t,
6971 + void (*fn)(void *), void *context)
6973 + spin_lock_irq(&_event_lock);
6975 + t->event_context = context;
6976 + spin_unlock_irq(&_event_lock);
6979 +void dm_table_event(struct dm_table *t)
6981 + spin_lock(&_event_lock);
6983 + t->event_fn(t->event_context);
6984 + spin_unlock(&_event_lock);
6987 +sector_t dm_table_get_size(struct dm_table *t)
6989 + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
6992 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
6994 + if (index >= t->num_targets)
6997 + return t->targets + index;
7001 + * Search the btree for the correct target.
7003 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
7005 + unsigned int l, n = 0, k = 0;
7008 + for (l = 0; l < t->depth; l++) {
7009 + n = get_child(n, k);
7010 + node = get_node(t, l, n);
7012 + for (k = 0; k < KEYS_PER_NODE; k++)
7013 + if (node[k] >= sector)
7017 + return &t->targets[(KEYS_PER_NODE * n) + k];
7020 +unsigned int dm_table_get_num_targets(struct dm_table *t)
7022 + return t->num_targets;
7025 +struct list_head *dm_table_get_devices(struct dm_table *t)
7027 + return &t->devices;
7030 +int dm_table_get_mode(struct dm_table *t)
7035 +void dm_table_suspend_targets(struct dm_table *t)
7039 + for (i = 0; i < t->num_targets; i++) {
7040 + struct dm_target *ti = t->targets + i;
7042 + if (ti->type->suspend)
7043 + ti->type->suspend(ti);
7047 +void dm_table_resume_targets(struct dm_table *t)
7051 + for (i = 0; i < t->num_targets; i++) {
7052 + struct dm_target *ti = t->targets + i;
7054 + if (ti->type->resume)
7055 + ti->type->resume(ti);
7059 +EXPORT_SYMBOL(dm_get_device);
7060 +EXPORT_SYMBOL(dm_put_device);
7061 +EXPORT_SYMBOL(dm_table_event);
7062 +EXPORT_SYMBOL(dm_table_get_mode);
7063 --- linux-2.4.21/drivers/md/dm-target.c Thu Jan 1 01:00:00 1970
7064 +++ linux/drivers/md/dm-target.c Wed Aug 20 14:41:38 2003
7067 + * Copyright (C) 2001 Sistina Software (UK) Limited
7069 + * This file is released under the GPL.
7074 +#include <linux/module.h>
7075 +#include <linux/kmod.h>
7076 +#include <linux/slab.h>
7078 +struct tt_internal {
7079 + struct target_type tt;
7081 + struct list_head list;
7085 +static LIST_HEAD(_targets);
7086 +static DECLARE_RWSEM(_lock);
7088 +#define DM_MOD_NAME_SIZE 32
7090 +static inline struct tt_internal *__find_target_type(const char *name)
7092 + struct list_head *tih;
7093 + struct tt_internal *ti;
7095 + list_for_each(tih, &_targets) {
7096 + ti = list_entry(tih, struct tt_internal, list);
7098 + if (!strcmp(name, ti->tt.name))
7105 +static struct tt_internal *get_target_type(const char *name)
7107 + struct tt_internal *ti;
7109 + down_read(&_lock);
7110 + ti = __find_target_type(name);
7113 + if (ti->use == 0 && ti->tt.module)
7114 + __MOD_INC_USE_COUNT(ti->tt.module);
7122 +static void load_module(const char *name)
7124 + char module_name[DM_MOD_NAME_SIZE] = "dm-";
7126 + /* Length check for strcat() below */
7127 + if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
7130 + strcat(module_name, name);
7131 + request_module(module_name);
7134 +struct target_type *dm_get_target_type(const char *name)
7136 + struct tt_internal *ti = get_target_type(name);
7139 + load_module(name);
7140 + ti = get_target_type(name);
7143 + return ti ? &ti->tt : NULL;
7146 +void dm_put_target_type(struct target_type *t)
7148 + struct tt_internal *ti = (struct tt_internal *) t;
7150 + down_read(&_lock);
7151 + if (--ti->use == 0 && ti->tt.module)
7152 + __MOD_DEC_USE_COUNT(ti->tt.module);
7161 +static struct tt_internal *alloc_target(struct target_type *t)
7163 + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
7166 + memset(ti, 0, sizeof(*ti));
7173 +int dm_register_target(struct target_type *t)
7176 + struct tt_internal *ti = alloc_target(t);
7181 + down_write(&_lock);
7182 + if (__find_target_type(t->name)) {
7186 + list_add(&ti->list, &_targets);
7192 +int dm_unregister_target(struct target_type *t)
7194 + struct tt_internal *ti;
7196 + down_write(&_lock);
7197 + if (!(ti = __find_target_type(t->name))) {
7207 + list_del(&ti->list);
7215 + * io-err: always fails an io, useful for bringing
7216 + * up LVs that have holes in them.
7218 +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
7223 +static void io_err_dtr(struct dm_target *ti)
7228 +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
7229 + union map_info *map_context)
7234 +static struct target_type error_target = {
7236 + .ctr = io_err_ctr,
7237 + .dtr = io_err_dtr,
7238 + .map = io_err_map,
7241 +int dm_target_init(void)
7243 + return dm_register_target(&error_target);
7246 +void dm_target_exit(void)
7248 + if (dm_unregister_target(&error_target))
7249 + DMWARN("error target unregistration failed");
7252 +EXPORT_SYMBOL(dm_register_target);
7253 +EXPORT_SYMBOL(dm_unregister_target);
7254 --- linux-2.4.21/drivers/md/dm.c Thu Jan 1 01:00:00 1970
7255 +++ linux/drivers/md/dm.c Wed Aug 20 14:41:38 2003
7258 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
7260 + * This file is released under the GPL.
7264 +#include "kcopyd.h"
7266 +#include <linux/init.h>
7267 +#include <linux/module.h>
7268 +#include <linux/blk.h>
7269 +#include <linux/blkpg.h>
7270 +#include <linux/mempool.h>
7271 +#include <linux/slab.h>
7272 +#include <linux/major.h>
7273 +#include <linux/kdev_t.h>
7274 +#include <linux/lvm.h>
7276 +#include <asm/uaccess.h>
7278 +static const char *_name = DM_NAME;
7279 +#define DEFAULT_READ_AHEAD 64
7282 + struct mapped_device *md;
7284 + struct dm_target *ti;
7286 + union map_info map_context;
7287 + void (*end_io) (struct buffer_head * bh, int uptodate);
7291 +struct deferred_io {
7293 + struct buffer_head *bh;
7294 + struct deferred_io *next;
7298 + * Bits for the md->flags field.
7300 +#define DMF_BLOCK_IO 0
7301 +#define DMF_SUSPENDED 1
7303 +struct mapped_device {
7304 + struct rw_semaphore lock;
7308 + unsigned long flags;
7311 + * A list of ios that arrived while we were suspended.
7314 + wait_queue_head_t wait;
7315 + struct deferred_io *deferred;
7318 + * The current mapping.
7320 + struct dm_table *map;
7323 + * io objects are allocated from here.
7325 + mempool_t *io_pool;
7330 + uint32_t event_nr;
7331 + wait_queue_head_t eventq;
7334 +#define MIN_IOS 256
7335 +static kmem_cache_t *_io_cache;
7337 +static struct mapped_device *get_kdev(kdev_t dev);
7338 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
7339 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
7341 +/*-----------------------------------------------------------------
7342 + * In order to avoid the 256 minor number limit we are going to
7343 + * register more major numbers as necessary.
7344 + *---------------------------------------------------------------*/
7345 +#define MAX_MINORS (1 << MINORBITS)
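/*
 * With MINORBITS = 8 on 2.4 this is 256, i.e. the per-major limit
 * mentioned above; every additional major registered below buys
 * another 256 minors.
 */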
7347 +struct major_details {
7348 + unsigned int major;
7351 + struct list_head transient_list;
7353 + unsigned int first_free_minor;
7354 + int nr_free_minors;
7356 + struct mapped_device *mds[MAX_MINORS];
7357 + int blk_size[MAX_MINORS];
7358 + int blksize_size[MAX_MINORS];
7359 + int hardsect_size[MAX_MINORS];
7362 +static struct rw_semaphore _dev_lock;
7363 +static struct major_details *_majors[MAX_BLKDEV];
7366 + * This holds a list of majors that non-specified device numbers
7367 + * may be allocated from. Only majors with free minors appear on
7370 +static LIST_HEAD(_transients_free);
7372 +static int __alloc_major(unsigned int major, struct major_details **result)
7375 + unsigned int transient = !major;
7376 + struct major_details *maj;
7378 + /* Major already allocated? */
7379 + if (major && _majors[major])
7382 + maj = kmalloc(sizeof(*maj), GFP_KERNEL);
7386 + memset(maj, 0, sizeof(*maj));
7387 + INIT_LIST_HEAD(&maj->transient_list);
7389 + maj->nr_free_minors = MAX_MINORS;
7391 + r = register_blkdev(major, _name, &dm_blk_dops);
7393 + DMERR("register_blkdev failed for %d", major);
7400 + maj->major = major;
7403 + maj->transient = transient;
7404 + list_add_tail(&maj->transient_list, &_transients_free);
7407 + _majors[major] = maj;
7409 + blk_size[major] = maj->blk_size;
7410 + blksize_size[major] = maj->blksize_size;
7411 + hardsect_size[major] = maj->hardsect_size;
7412 + read_ahead[major] = DEFAULT_READ_AHEAD;
7414 + blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
7420 +static void __free_major(struct major_details *maj)
7422 + unsigned int major = maj->major;
7424 + list_del(&maj->transient_list);
7426 + read_ahead[major] = 0;
7427 + blk_size[major] = NULL;
7428 + blksize_size[major] = NULL;
7429 + hardsect_size[major] = NULL;
7431 + _majors[major] = NULL;
7434 + if (unregister_blkdev(major, _name) < 0)
7435 + DMERR("devfs_unregister_blkdev failed");
7438 +static void free_all_majors(void)
7440 + unsigned int major = ARRAY_SIZE(_majors);
7442 + down_write(&_dev_lock);
7445 + if (_majors[major])
7446 + __free_major(_majors[major]);
7448 + up_write(&_dev_lock);
7451 +static void free_dev(kdev_t dev)
7453 + unsigned int major = major(dev);
7454 + unsigned int minor = minor(dev);
7455 + struct major_details *maj;
7457 + down_write(&_dev_lock);
7459 + maj = _majors[major];
7463 + maj->mds[minor] = NULL;
7464 + maj->nr_free_minors++;
7466 + if (maj->nr_free_minors == MAX_MINORS) {
7467 + __free_major(maj);
7471 + if (!maj->transient)
7474 + if (maj->nr_free_minors == 1)
7475 + list_add_tail(&maj->transient_list, &_transients_free);
7477 + if (minor < maj->first_free_minor)
7478 + maj->first_free_minor = minor;
7481 + up_write(&_dev_lock);
7484 +static void __alloc_minor(struct major_details *maj, unsigned int minor,
7485 + struct mapped_device *md)
7487 + maj->mds[minor] = md;
7488 + md->dev = mk_kdev(maj->major, minor);
7489 + maj->nr_free_minors--;
7491 + if (maj->transient && !maj->nr_free_minors)
7492 + list_del_init(&maj->transient_list);
7496 + * See if requested kdev_t is available.
7498 +static int specific_dev(kdev_t dev, struct mapped_device *md)
7501 + unsigned int major = major(dev);
7502 + unsigned int minor = minor(dev);
7503 + struct major_details *maj;
7505 + if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
7506 + DMWARN("device number requested out of range (%d, %d)",
7511 + down_write(&_dev_lock);
7512 + maj = _majors[major];
7514 + /* Register requested major? */
7516 + r = __alloc_major(major, &maj);
7520 + major = maj->major;
7523 + if (maj->mds[minor]) {
7528 + __alloc_minor(maj, minor, md);
7531 + up_write(&_dev_lock);
7537 + * Find first unused device number, requesting a new major number if required.
7539 +static int first_free_dev(struct mapped_device *md)
7542 + struct major_details *maj;
7544 + down_write(&_dev_lock);
7546 + if (list_empty(&_transients_free)) {
7547 + r = __alloc_major(0, &maj);
7551 + maj = list_entry(_transients_free.next, struct major_details,
7554 + while (maj->mds[maj->first_free_minor++])
7557 + __alloc_minor(maj, maj->first_free_minor - 1, md);
7560 + up_write(&_dev_lock);
7565 +static struct mapped_device *get_kdev(kdev_t dev)
7567 + struct mapped_device *md;
7568 + struct major_details *maj;
7570 + down_read(&_dev_lock);
7571 + maj = _majors[major(dev)];
7576 + md = maj->mds[minor(dev)];
7580 + up_read(&_dev_lock);
7585 +/*-----------------------------------------------------------------
7587 + *---------------------------------------------------------------*/
7589 +static __init int local_init(void)
7591 + init_rwsem(&_dev_lock);
7593 + /* allocate a slab for the dm_ios */
7594 + _io_cache = kmem_cache_create("dm io",
7595 + sizeof(struct dm_io), 0, 0, NULL, NULL);
7603 +static void local_exit(void)
7605 + kmem_cache_destroy(_io_cache);
7606 + free_all_majors();
7608 + DMINFO("cleaned up");
7612 + * We have a lot of init/exit functions, so it seems easier to
7613 + * store them in an array. The disposable macro 'xx'
7614 + * expands a prefix into a pair of function names.
7617 + int (*init) (void);
7618 + void (*exit) (void);
7621 +#define xx(n) {n ## _init, n ## _exit},
7632 +static int __init dm_init(void)
7634 + const int count = ARRAY_SIZE(_inits);
7638 + for (i = 0; i < count; i++) {
7639 + r = _inits[i].init();
7653 +static void __exit dm_exit(void)
7655 + int i = ARRAY_SIZE(_inits);
7662 + * Block device functions
7664 +static int dm_blk_open(struct inode *inode, struct file *file)
7666 + struct mapped_device *md;
7668 + md = get_kdev(inode->i_rdev);
7675 +static int dm_blk_close(struct inode *inode, struct file *file)
7677 + struct mapped_device *md;
7679 + md = get_kdev(inode->i_rdev);
7680 + dm_put(md); /* put the reference gained by dm_blk_open */
7685 +static inline struct dm_io *alloc_io(struct mapped_device *md)
7687 + return mempool_alloc(md->io_pool, GFP_NOIO);
7690 +static inline void free_io(struct mapped_device *md, struct dm_io *io)
7692 + mempool_free(io, md->io_pool);
7695 +static inline struct deferred_io *alloc_deferred(void)
7697 + return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
7700 +static inline void free_deferred(struct deferred_io *di)
7705 +static inline sector_t volume_size(kdev_t dev)
7707 + return blk_size[major(dev)][minor(dev)] << 1;
7710 +/* FIXME: check this */
7711 +static int dm_blk_ioctl(struct inode *inode, struct file *file,
7712 + unsigned int command, unsigned long a)
7714 + kdev_t dev = inode->i_rdev;
7717 + switch (command) {
7724 + //case BLKRRPART: /* Re-read partition tables */
7730 + return blk_ioctl(dev, command, a);
7734 + size = volume_size(dev);
7735 + if (copy_to_user((void *) a, &size, sizeof(long)))
7739 + case BLKGETSIZE64:
7740 + size = volume_size(dev);
7741 + if (put_user((u64) ((u64) size) << 9, (u64 *) a))
7749 + return dm_user_bmap(inode, (struct lv_bmap *) a);
7752 + DMWARN("unknown block ioctl 0x%x", command);
7760 + * Add the buffer to the list of deferred io.
7762 +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
7764 + struct deferred_io *di;
7766 + di = alloc_deferred();
7770 + down_write(&md->lock);
7772 + if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
7773 + up_write(&md->lock);
7774 + free_deferred(di);
7780 + di->next = md->deferred;
7781 + md->deferred = di;
7783 + up_write(&md->lock);
7784 + return 0; /* deferred successfully */
7788 + * bh->b_end_io routine that decrements the pending count
7789 + * and then calls the original bh->b_end_io fn.
7791 +static void dec_pending(struct buffer_head *bh, int uptodate)
7794 + struct dm_io *io = bh->b_private;
7795 + dm_endio_fn endio = io->ti->type->end_io;
7798 + r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
7799 + &io->map_context);
7804 + /* the target wants another shot at the io */
7808 + if (atomic_dec_and_test(&io->md->pending))
7809 + /* nudge anyone waiting on suspend queue */
7810 + wake_up(&io->md->wait);
7812 + bh->b_end_io = io->end_io;
7813 + bh->b_private = io->context;
7814 + free_io(io->md, io);
7816 + bh->b_end_io(bh, uptodate);
7820 + * Do the bh mapping for a given leaf
7822 +static inline int __map_buffer(struct mapped_device *md, int rw,
7823 + struct buffer_head *bh, struct dm_io *io)
7825 + struct dm_target *ti;
7830 + ti = dm_table_find_target(md->map, bh->b_rsector);
7834 + /* hook the end io request fn */
7835 + atomic_inc(&md->pending);
7839 + io->end_io = bh->b_end_io;
7840 + io->context = bh->b_private;
7841 + bh->b_end_io = dec_pending;
7842 + bh->b_private = io;
7844 + return ti->type->map(ti, bh, rw, &io->map_context);
7848 + * Checks to see if we should be deferring io; if so it queues it
7851 +static inline int __deferring(struct mapped_device *md, int rw,
7852 + struct buffer_head *bh)
7857 + * If we're suspended we have to queue this io for later.
7859 + while (test_bit(DMF_BLOCK_IO, &md->flags)) {
7860 + up_read(&md->lock);
7863 + * There's no point deferring a read-ahead
7864 + * request; just drop it.
7866 + if (rw == READA) {
7867 + down_read(&md->lock);
7871 + r = queue_io(md, bh, rw);
7872 + down_read(&md->lock);
7878 + return 1; /* deferred successfully */
7885 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
7889 + struct mapped_device *md;
7891 + md = get_kdev(bh->b_rdev);
7893 + buffer_IO_error(bh);
7897 + io = alloc_io(md);
7898 + down_read(&md->lock);
7900 + r = __deferring(md, rw, bh);
7905 + /* not deferring */
7906 + r = __map_buffer(md, rw, bh, io);
7912 + up_read(&md->lock);
7917 + buffer_IO_error(bh);
7918 + up_read(&md->lock);
7923 +static int check_dev_size(kdev_t dev, unsigned long block)
7925 + unsigned int major = major(dev);
7926 + unsigned int minor = minor(dev);
7928 + /* FIXME: check this */
7929 + unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
7930 + unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
7932 + return (sector > max_sector) ? 0 : 1;
7936 + * Creates a dummy buffer head and maps it (for lilo).
7938 +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
7939 + kdev_t *r_dev, unsigned long *r_block)
7941 + struct buffer_head bh;
7942 + struct dm_target *ti;
7943 + union map_info map_context;
7946 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
7950 + if (!check_dev_size(dev, block)) {
7957 + /* setup dummy bh */
7958 + memset(&bh, 0, sizeof(bh));
7959 + bh.b_blocknr = block;
7960 + bh.b_dev = bh.b_rdev = dev;
7961 + bh.b_size = blksize_size[major(dev)][minor(dev)];
7962 + bh.b_rsector = block * (bh.b_size >> 9);
7965 + ti = dm_table_find_target(md->map, bh.b_rsector);
7967 + /* do the mapping */
7968 + r = ti->type->map(ti, &bh, READ, &map_context);
7969 + ti->type->end_io(ti, &bh, READ, 0, &map_context);
7972 + *r_dev = bh.b_rdev;
7973 + *r_block = bh.b_rsector / (bh.b_size >> 9);
7980 + * Marshals arguments and results between user and kernel space.
7982 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
7984 + struct mapped_device *md;
7985 + unsigned long block, r_block;
7989 + if (get_user(block, &lvb->lv_block))
7992 + md = get_kdev(inode->i_rdev);
7996 + down_read(&md->lock);
7997 + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
7998 + up_read(&md->lock);
8001 + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
8002 + put_user(r_block, &lvb->lv_block)))
8008 +static void free_md(struct mapped_device *md)
8010 + free_dev(md->dev);
8011 + mempool_destroy(md->io_pool);
8016 + * Allocate and initialise a blank device with a given minor.
8018 +static struct mapped_device *alloc_md(kdev_t dev)
8021 + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
8024 + DMWARN("unable to allocate device, out of memory.");
8028 + memset(md, 0, sizeof(*md));
8030 + /* Allocate suitable device number */
8032 + r = first_free_dev(md);
8034 + r = specific_dev(dev, md);
8041 + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
8042 + mempool_free_slab, _io_cache);
8043 + if (!md->io_pool) {
8049 + init_rwsem(&md->lock);
8050 + atomic_set(&md->holders, 1);
8051 + atomic_set(&md->pending, 0);
8052 + init_waitqueue_head(&md->wait);
8053 + init_waitqueue_head(&md->eventq);
8059 + * The hardsect size for a mapped device is the largest hardsect size
8060 + * from the devices it maps onto.
8062 +static int __find_hardsect_size(struct list_head *devices)
8064 + int result = 512, size;
8065 + struct list_head *tmp;
8067 + list_for_each (tmp, devices) {
8068 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
8069 + size = get_hardsect_size(dd->dev);
8070 + if (size > result)
8078 + * Bind a table to the device.
8080 +static void event_callback(void *context)
8082 + struct mapped_device *md = (struct mapped_device *) context;
8084 + down_write(&md->lock);
8086 + wake_up_interruptible(&md->eventq);
8087 + up_write(&md->lock);
8090 +static int __bind(struct mapped_device *md, struct dm_table *t)
8092 + unsigned int minor = minor(md->dev);
8093 + unsigned int major = major(md->dev);
8097 + blk_size[major][minor] = dm_table_get_size(t) >> 1;
8098 + blksize_size[major][minor] = BLOCK_SIZE;
8099 + hardsect_size[major][minor] =
8100 + __find_hardsect_size(dm_table_get_devices(t));
8101 + register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
8103 + dm_table_event_callback(md->map, event_callback, md);
8108 +static void __unbind(struct mapped_device *md)
8110 + unsigned int minor = minor(md->dev);
8111 + unsigned int major = major(md->dev);
8114 + dm_table_event_callback(md->map, NULL, NULL);
8115 + dm_table_put(md->map);
8120 + blk_size[major][minor] = 0;
8121 + blksize_size[major][minor] = 0;
8122 + hardsect_size[major][minor] = 0;
8126 + * Constructor for a new device.
8128 +int dm_create(kdev_t dev, struct mapped_device **result)
8130 + struct mapped_device *md;
8132 + md = alloc_md(dev);
8136 + __unbind(md); /* Ensure zero device size */
8142 +void dm_get(struct mapped_device *md)
8144 + atomic_inc(&md->holders);
8147 +void dm_put(struct mapped_device *md)
8149 + if (atomic_dec_and_test(&md->holders)) {
8151 + dm_table_suspend_targets(md->map);
8158 + * Requeue the deferred io by calling generic_make_request.
8160 +static void flush_deferred_io(struct deferred_io *c)
8162 + struct deferred_io *n;
8166 + generic_make_request(c->rw, c->bh);
8173 + * Swap in a new table (destroying old one).
8175 +int dm_swap_table(struct mapped_device *md, struct dm_table *table)
8179 + down_write(&md->lock);
8182 + * The device must be suspended, or have no table bound yet.
8184 + if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
8185 + up_write(&md->lock);
8190 + r = __bind(md, table);
8194 + up_write(&md->lock);
8199 + * We need to be able to change a mapping table under a mounted
8200 + * filesystem. For example, we might want to move some data in
8201 + * the background. Before the table can be swapped with
8202 + * dm_swap_table(), dm_suspend() must be called to flush any
8203 + * in-flight io and ensure that any further io gets deferred.
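/*
 * Typical caller sequence (illustrative sketch only):
 *
 *	dm_suspend(md);                 block new io, wait for pending io
 *	dm_swap_table(md, new_table);   bind the new mapping
 *	dm_resume(md);                  replay any deferred io onto it
 */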
8205 +int dm_suspend(struct mapped_device *md)
8208 + DECLARE_WAITQUEUE(wait, current);
8210 + down_write(&md->lock);
8213 + * First we set the BLOCK_IO flag so no more ios will be
8216 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8217 + up_write(&md->lock);
8221 + set_bit(DMF_BLOCK_IO, &md->flags);
8222 + add_wait_queue(&md->wait, &wait);
8223 + up_write(&md->lock);
8226 + * Then we wait for the already mapped ios to
8229 + run_task_queue(&tq_disk);
8231 + set_current_state(TASK_INTERRUPTIBLE);
8233 + if (!atomic_read(&md->pending) || signal_pending(current))
8238 + set_current_state(TASK_RUNNING);
8240 + down_write(&md->lock);
8241 + remove_wait_queue(&md->wait, &wait);
8243 + /* did we flush everything ? */
8244 + if (atomic_read(&md->pending)) {
8245 + clear_bit(DMF_BLOCK_IO, &md->flags);
8248 + set_bit(DMF_SUSPENDED, &md->flags);
8250 + dm_table_suspend_targets(md->map);
8252 + up_write(&md->lock);
8257 +int dm_resume(struct mapped_device *md)
8259 + struct deferred_io *def;
8261 + down_write(&md->lock);
8262 + if (!test_bit(DMF_SUSPENDED, &md->flags)) {
8263 + up_write(&md->lock);
8268 + dm_table_resume_targets(md->map);
8270 + clear_bit(DMF_SUSPENDED, &md->flags);
8271 + clear_bit(DMF_BLOCK_IO, &md->flags);
8272 + def = md->deferred;
8273 + md->deferred = NULL;
8274 + up_write(&md->lock);
8276 + flush_deferred_io(def);
8277 + run_task_queue(&tq_disk);
8282 +struct dm_table *dm_get_table(struct mapped_device *md)
8284 + struct dm_table *t;
8286 + down_read(&md->lock);
8290 + up_read(&md->lock);
8295 +/*-----------------------------------------------------------------
8296 + * Event notification.
8297 + *---------------------------------------------------------------*/
8298 +uint32_t dm_get_event_nr(struct mapped_device *md)
8302 + down_read(&md->lock);
8304 + up_read(&md->lock);
8309 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8310 + uint32_t event_nr)
8312 + down_write(&md->lock);
8313 + if (event_nr != md->event_nr) {
8314 + up_write(&md->lock);
8318 + add_wait_queue(&md->eventq, wq);
8319 + up_write(&md->lock);
8324 +const char *dm_kdevname(kdev_t dev)
8326 + static char buffer[32];
8327 + sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
8331 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
8333 + down_write(&md->lock);
8334 + remove_wait_queue(&md->eventq, wq);
8335 + up_write(&md->lock);
8338 +kdev_t dm_kdev(struct mapped_device *md)
8342 + down_read(&md->lock);
8344 + up_read(&md->lock);
8349 +int dm_suspended(struct mapped_device *md)
8351 + return test_bit(DMF_SUSPENDED, &md->flags);
8354 +struct block_device_operations dm_blk_dops = {
8355 + .open = dm_blk_open,
8356 + .release = dm_blk_close,
8357 + .ioctl = dm_blk_ioctl,
8358 + .owner = THIS_MODULE
8364 +module_init(dm_init);
8365 +module_exit(dm_exit);
8367 +MODULE_DESCRIPTION(DM_NAME " driver");
8368 +MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
8369 +MODULE_LICENSE("GPL");
8371 +EXPORT_SYMBOL(dm_kdevname);
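
For reference, a caller of the md-layer functions above replaces a table with a
suspend / swap / resume sequence. This is a sketch only (error paths simplified;
replace_table is an illustrative helper, not a function from this patch, and the
real dm-ioctl.c drives the same calls from separate ioctl commands):

	/* Sketch: swap a new, already-completed table into a mapped_device. */
	static int replace_table(struct mapped_device *md, struct dm_table *t)
	{
		int r;

		/* Block further io and wait for in-flight io to drain. */
		r = dm_suspend(md);
		if (r)
			return r;

		/* Bind the new table; only legal while suspended (or unbound). */
		r = dm_swap_table(md, t);
		if (r) {
			dm_resume(md);
			return r;
		}

		/* Clear the suspend/block flags and re-issue deferred io. */
		return dm_resume(md);
	}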
8372 --- linux-2.4.21/drivers/md/dm.h Thu Jan 1 01:00:00 1970
8373 +++ linux/drivers/md/dm.h Wed Aug 20 14:41:38 2003
8376 + * Internal header file for device mapper
8378 + * Copyright (C) 2001, 2002 Sistina Software
8380 + * This file is released under the LGPL.
8383 +#ifndef DM_INTERNAL_H
8384 +#define DM_INTERNAL_H
8386 +#include <linux/fs.h>
8387 +#include <linux/device-mapper.h>
8388 +#include <linux/list.h>
8389 +#include <linux/blkdev.h>
8391 +#define DM_NAME "device-mapper"
8392 +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
8393 +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
8394 +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
8397 + * FIXME: I think this should be with the definition of sector_t
8401 +#define SECTOR_FORMAT "%Lu"
8403 +#define SECTOR_FORMAT "%lu"
8406 +#define SECTOR_SHIFT 9
8407 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
8409 +extern struct block_device_operations dm_blk_dops;
8412 + * List of devices that a metadevice uses and should open/close.
8415 + struct list_head list;
8420 + struct block_device *bdev;
8424 +struct mapped_device;
8426 +/*-----------------------------------------------------------------
8427 + * Functions for manipulating a struct mapped_device.
8428 + * Drop the reference with dm_put when you finish with the object.
8429 + *---------------------------------------------------------------*/
8430 +int dm_create(kdev_t dev, struct mapped_device **md);
8433 + * Reference counting for md.
8435 +void dm_get(struct mapped_device *md);
8436 +void dm_put(struct mapped_device *md);
8439 + * A device can still be used while suspended, but I/O is deferred.
8441 +int dm_suspend(struct mapped_device *md);
8442 +int dm_resume(struct mapped_device *md);
8445 + * The device must be suspended before calling this method.
8447 +int dm_swap_table(struct mapped_device *md, struct dm_table *t);
8450 + * Drop a reference on the table when you've finished with the
8453 +struct dm_table *dm_get_table(struct mapped_device *md);
8456 + * Event functions.
8458 +uint32_t dm_get_event_nr(struct mapped_device *md);
8459 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8460 + uint32_t event_nr);
8461 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
8466 +kdev_t dm_kdev(struct mapped_device *md);
8467 +int dm_suspended(struct mapped_device *md);
8469 +/*-----------------------------------------------------------------
8470 + * Functions for manipulating a table. Tables are also reference
8472 + *---------------------------------------------------------------*/
8473 +int dm_table_create(struct dm_table **result, int mode);
8475 +void dm_table_get(struct dm_table *t);
8476 +void dm_table_put(struct dm_table *t);
8478 +int dm_table_add_target(struct dm_table *t, const char *type,
8479 + sector_t start, sector_t len, char *params);
8480 +int dm_table_complete(struct dm_table *t);
8481 +void dm_table_event_callback(struct dm_table *t,
8482 + void (*fn)(void *), void *context);
8483 +void dm_table_event(struct dm_table *t);
8484 +sector_t dm_table_get_size(struct dm_table *t);
8485 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
8486 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
8487 +unsigned int dm_table_get_num_targets(struct dm_table *t);
8488 +struct list_head *dm_table_get_devices(struct dm_table *t);
8489 +int dm_table_get_mode(struct dm_table *t);
8490 +void dm_table_suspend_targets(struct dm_table *t);
8491 +void dm_table_resume_targets(struct dm_table *t);
8493 +/*-----------------------------------------------------------------
8494 + * A registry of target types.
8495 + *---------------------------------------------------------------*/
8496 +int dm_target_init(void);
8497 +void dm_target_exit(void);
8498 +struct target_type *dm_get_target_type(const char *name);
8499 +void dm_put_target_type(struct target_type *t);
8502 +/*-----------------------------------------------------------------
8504 + *---------------------------------------------------------------*/
8505 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
8506 + unsigned long num)
8508 + return (num > (ULONG_MAX - fixed) / obj);
8512 + * ceiling(n / size) * size
8514 +static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
8516 + unsigned long r = n % size;
8517 + return n + (r ? (size - r) : 0);
8521 + * ceiling(n / size)
8523 +static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
8525 + return dm_round_up(n, size) / size;
8528 +const char *dm_kdevname(kdev_t dev);
8531 + * The device-mapper can be driven through one of two interfaces:
8532 + * ioctl or filesystem, depending on which patch you have applied.
8534 +int dm_interface_init(void);
8535 +void dm_interface_exit(void);
8538 + * Targets for linear and striped mappings
8540 +int dm_linear_init(void);
8541 +void dm_linear_exit(void);
8543 +int dm_stripe_init(void);
8544 +void dm_stripe_exit(void);
8546 +int dm_snapshot_init(void);
8547 +void dm_snapshot_exit(void);
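
The table functions declared above are used in a create / add-targets / complete
sequence before the result is handed to dm_swap_table. A minimal sketch (the
"linear" target name and the "8:1 0" parameter string are illustrative; real
callers parse them out of the ioctl payload, and the follow-up patch at the end
of this series adds a num_targets argument to dm_table_create):

	/* Sketch only: build a one-target table covering sectors 0..1023. */
	static int build_example_table(struct dm_table **result)
	{
		struct dm_table *t;
		char params[] = "8:1 0";	/* "<major:minor> <offset>", illustrative */
		int r;

		r = dm_table_create(&t, FMODE_READ | FMODE_WRITE);
		if (r)
			return r;

		r = dm_table_add_target(t, "linear", 0, 1024, params);
		if (!r)
			r = dm_table_complete(t);	/* build the b-tree index */

		if (r) {
			dm_table_put(t);		/* drop the initial reference */
			return r;
		}

		*result = t;
		return 0;
	}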
8550 --- linux-2.4.21/drivers/md/kcopyd.c Thu Jan 1 01:00:00 1970
8551 +++ linux/drivers/md/kcopyd.c Wed Aug 20 14:41:38 2003
8554 + * Copyright (C) 2002 Sistina Software (UK) Limited.
8556 + * This file is released under the GPL.
8559 +#include <asm/atomic.h>
8561 +#include <linux/blkdev.h>
8562 +#include <linux/config.h>
8563 +#include <linux/device-mapper.h>
8564 +#include <linux/fs.h>
8565 +#include <linux/init.h>
8566 +#include <linux/list.h>
8567 +#include <linux/locks.h>
8568 +#include <linux/mempool.h>
8569 +#include <linux/module.h>
8570 +#include <linux/pagemap.h>
8571 +#include <linux/slab.h>
8572 +#include <linux/vmalloc.h>
8574 +#include "kcopyd.h"
8575 +#include "dm-daemon.h"
8577 +/* FIXME: this is only needed for the DMERR macros */
8580 +static struct dm_daemon _kcopyd;
8582 +/*-----------------------------------------------------------------
8583 + * Each kcopyd client has its own little pool of preallocated
8584 + * pages for kcopyd io.
8585 + *---------------------------------------------------------------*/
8586 +struct kcopyd_client {
8587 + struct list_head list;
8590 + struct list_head pages;
8591 + unsigned int nr_pages;
8592 + unsigned int nr_free_pages;
8595 +static inline void __push_page(struct kcopyd_client *kc, struct page *p)
8597 + list_add(&p->list, &kc->pages);
8598 + kc->nr_free_pages++;
8601 +static inline struct page *__pop_page(struct kcopyd_client *kc)
8605 + p = list_entry(kc->pages.next, struct page, list);
8606 + list_del(&p->list);
8607 + kc->nr_free_pages--;
8612 +static int kcopyd_get_pages(struct kcopyd_client *kc,
8613 + unsigned int nr, struct list_head *pages)
8616 + INIT_LIST_HEAD(pages);
8618 + spin_lock(&kc->lock);
8619 + if (kc->nr_free_pages < nr) {
8620 + spin_unlock(&kc->lock);
8625 + p = __pop_page(kc);
8626 + list_add(&p->list, pages);
8628 + spin_unlock(&kc->lock);
8633 +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
8635 + struct list_head *tmp, *tmp2;
8637 + spin_lock(&kc->lock);
8638 + list_for_each_safe (tmp, tmp2, pages)
8639 + __push_page(kc, list_entry(tmp, struct page, list));
8640 + spin_unlock(&kc->lock);
8644 + * These three functions resize the page pool.
8646 +static void release_pages(struct list_head *pages)
8649 + struct list_head *tmp, *tmp2;
8651 + list_for_each_safe (tmp, tmp2, pages) {
8652 + p = list_entry(tmp, struct page, list);
8658 +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
8664 + for (i = 0; i < nr; i++) {
8665 + p = alloc_page(GFP_KERNEL);
8667 + release_pages(&new);
8672 + list_add(&p->list, &new);
8675 + kcopyd_put_pages(kc, &new);
8676 + kc->nr_pages += nr;
8680 +static void client_free_pages(struct kcopyd_client *kc)
8682 + BUG_ON(kc->nr_free_pages != kc->nr_pages);
8683 + release_pages(&kc->pages);
8684 + kc->nr_free_pages = kc->nr_pages = 0;
8687 +/*-----------------------------------------------------------------
8688 + * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
8689 + * for this reason we use a mempool to prevent the client from
8690 + * ever having to do io (which could cause a deadlock).
8691 + *---------------------------------------------------------------*/
8692 +struct kcopyd_job {
8693 + struct kcopyd_client *kc;
8694 + struct list_head list;
8695 + unsigned int flags;
8698 + * Error state of the job.
8701 + unsigned int write_err;
8704 + * Either READ or WRITE
8707 + struct io_region source;
8710 + * The destinations for the transfer.
8712 + unsigned int num_dests;
8713 + struct io_region dests[KCOPYD_MAX_REGIONS];
8716 + unsigned int nr_pages;
8717 + struct list_head pages;
8720 + * Set this to ensure you are notified when the job has
8721 + * completed. 'context' is for the callback to use.
8723 + kcopyd_notify_fn fn;
8727 + * These fields are only used if the job has been split
8728 + * into more manageable parts.
8730 + struct semaphore lock;
8731 + atomic_t sub_jobs;
8732 + sector_t progress;
8735 +/* FIXME: this should scale with the number of pages */
8736 +#define MIN_JOBS 512
8738 +static kmem_cache_t *_job_cache;
8739 +static mempool_t *_job_pool;
8742 + * We maintain three lists of jobs:
8744 + * i) jobs waiting for pages
8745 + * ii) jobs that have pages, and are waiting for the io to be issued.
8746 + * iii) jobs that have completed.
8748 + * All three of these are protected by _job_lock.
8750 +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
8752 +static LIST_HEAD(_complete_jobs);
8753 +static LIST_HEAD(_io_jobs);
8754 +static LIST_HEAD(_pages_jobs);
8756 +static int jobs_init(void)
8758 + INIT_LIST_HEAD(&_complete_jobs);
8759 + INIT_LIST_HEAD(&_io_jobs);
8760 + INIT_LIST_HEAD(&_pages_jobs);
8762 + _job_cache = kmem_cache_create("kcopyd-jobs",
8763 + sizeof(struct kcopyd_job),
8764 + __alignof__(struct kcopyd_job),
8769 + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
8770 + mempool_free_slab, _job_cache);
8772 + kmem_cache_destroy(_job_cache);
8779 +static void jobs_exit(void)
8781 + BUG_ON(!list_empty(&_complete_jobs));
8782 + BUG_ON(!list_empty(&_io_jobs));
8783 + BUG_ON(!list_empty(&_pages_jobs));
8785 + mempool_destroy(_job_pool);
8786 + kmem_cache_destroy(_job_cache);
8790 + * Functions to push and pop a job onto the head of a given job
8793 +static inline struct kcopyd_job *pop(struct list_head *jobs)
8795 + struct kcopyd_job *job = NULL;
8796 + unsigned long flags;
8798 + spin_lock_irqsave(&_job_lock, flags);
8800 + if (!list_empty(jobs)) {
8801 + job = list_entry(jobs->next, struct kcopyd_job, list);
8802 + list_del(&job->list);
8804 + spin_unlock_irqrestore(&_job_lock, flags);
8809 +static inline void push(struct list_head *jobs, struct kcopyd_job *job)
8811 + unsigned long flags;
8813 + spin_lock_irqsave(&_job_lock, flags);
8814 + list_add_tail(&job->list, jobs);
8815 + spin_unlock_irqrestore(&_job_lock, flags);
8819 + * These three functions process 1 item from the corresponding
8825 + * > 0: can't process yet.
8827 +static int run_complete_job(struct kcopyd_job *job)
8829 + void *context = job->context;
8830 + int read_err = job->read_err;
8831 + unsigned int write_err = job->write_err;
8832 + kcopyd_notify_fn fn = job->fn;
8834 + kcopyd_put_pages(job->kc, &job->pages);
8835 + mempool_free(job, _job_pool);
8836 + fn(read_err, write_err, context);
8840 +static void complete_io(unsigned int error, void *context)
8842 + struct kcopyd_job *job = (struct kcopyd_job *) context;
8845 + if (job->rw == WRITE)
8846 + job->write_err |= error;
8848 + job->read_err = 1;
8850 + if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
8851 + push(&_complete_jobs, job);
8852 + dm_daemon_wake(&_kcopyd);
8857 + if (job->rw == WRITE)
8858 + push(&_complete_jobs, job);
8862 + push(&_io_jobs, job);
8865 + dm_daemon_wake(&_kcopyd);
8869 + * Request io on as many buffer heads as we can currently get for
8870 + * a particular job.
8872 +static int run_io_job(struct kcopyd_job *job)
8876 + if (job->rw == READ)
8877 + r = dm_io_async(1, &job->source, job->rw,
8878 + list_entry(job->pages.next, struct page, list),
8879 + job->offset, complete_io, job);
8882 + r = dm_io_async(job->num_dests, job->dests, job->rw,
8883 + list_entry(job->pages.next, struct page, list),
8884 + job->offset, complete_io, job);
8889 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
8890 +static int run_pages_job(struct kcopyd_job *job)
8894 + job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
8895 + SECTORS_PER_PAGE);
8896 + r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
8898 + /* this job is ready for io */
8899 + push(&_io_jobs, job);
8904 + /* can't complete now */
8911 + * Run through a list for as long as possible. Returns the count
8912 + * of successful jobs.
8914 +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
8916 + struct kcopyd_job *job;
8919 + while ((job = pop(jobs))) {
8924 + /* error this rogue job */
8925 + if (job->rw == WRITE)
8926 + job->write_err = (unsigned int) -1;
8928 + job->read_err = 1;
8929 + push(&_complete_jobs, job);
8935 + * We couldn't service this job ATM, so
8936 + * push this job back onto the list.
8949 + * kcopyd does this every time it's woken up.
8951 +static void do_work(void)
8954 + * The order in which these are called is *very* important:
8955 + * complete jobs can free some pages for pages jobs.
8956 + * Pages jobs, when successful, will jump onto the io jobs
8957 + * list. io jobs call wake when they complete and it all
8960 + process_jobs(&_complete_jobs, run_complete_job);
8961 + process_jobs(&_pages_jobs, run_pages_job);
8962 + process_jobs(&_io_jobs, run_io_job);
8963 + run_task_queue(&tq_disk);
8967 + * If we are copying a small region we just dispatch a single job
8968 + * to do the copy; otherwise the io has to be split up into many
8971 +static void dispatch_job(struct kcopyd_job *job)
8973 + push(&_pages_jobs, job);
8974 + dm_daemon_wake(&_kcopyd);
8977 +#define SUB_JOB_SIZE 128
8978 +static void segment_complete(int read_err,
8979 + unsigned int write_err, void *context)
8981 + /* FIXME: tidy this function */
8982 + sector_t progress = 0;
8983 + sector_t count = 0;
8984 + struct kcopyd_job *job = (struct kcopyd_job *) context;
8988 + /* update the error */
8990 + job->read_err = 1;
8993 + job->write_err |= write_err;
8996 + * Only dispatch more work if there hasn't been an error.
8998 + if ((!job->read_err && !job->write_err) ||
8999 + test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
9000 + /* get the next chunk of work */
9001 + progress = job->progress;
9002 + count = job->source.count - progress;
9004 + if (count > SUB_JOB_SIZE)
9005 + count = SUB_JOB_SIZE;
9007 + job->progress += count;
9014 + struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
9016 + memcpy(sub_job, job, sizeof(*job));
9017 + sub_job->source.sector += progress;
9018 + sub_job->source.count = count;
9020 + for (i = 0; i < job->num_dests; i++) {
9021 + sub_job->dests[i].sector += progress;
9022 + sub_job->dests[i].count = count;
9025 + sub_job->fn = segment_complete;
9026 + sub_job->context = job;
9027 + dispatch_job(sub_job);
9029 + } else if (atomic_dec_and_test(&job->sub_jobs)) {
9032 + * To avoid a race we must keep the job around
9033 + * until after the notify function has completed.
9034 + * Otherwise the client may try to stop the job
9035 + * after we've completed.
9037 + job->fn(read_err, write_err, job->context);
9038 + mempool_free(job, _job_pool);
9043 + * Create some little jobs that will do the move between
9046 +#define SPLIT_COUNT 8
9047 +static void split_job(struct kcopyd_job *job)
9051 + atomic_set(&job->sub_jobs, SPLIT_COUNT);
9052 + for (i = 0; i < SPLIT_COUNT; i++)
9053 + segment_complete(0, 0u, job);
9056 +#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
9057 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9058 + unsigned int num_dests, struct io_region *dests,
9059 + unsigned int flags, kcopyd_notify_fn fn, void *context)
9061 + struct kcopyd_job *job;
9064 + * Allocate a new job.
9066 + job = mempool_alloc(_job_pool, GFP_NOIO);
9069 + * set up for the read.
9072 + job->flags = flags;
9073 + job->read_err = 0;
9074 + job->write_err = 0;
9077 + memcpy(&job->source, from, sizeof(*from));
9079 + job->num_dests = num_dests;
9080 + memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
9083 + job->nr_pages = 0;
9084 + INIT_LIST_HEAD(&job->pages);
9087 + job->context = context;
9089 + if (job->source.count < SUB_JOB_THRESHOLD)
9090 + dispatch_job(job);
9093 + init_MUTEX(&job->lock);
9094 + job->progress = 0;
9102 + * Cancels a kcopyd job, eg. someone might be deactivating a
9105 +int kcopyd_cancel(struct kcopyd_job *job, int block)
9107 + /* FIXME: finish */
9111 +/*-----------------------------------------------------------------
9113 + *---------------------------------------------------------------*/
9114 +static DECLARE_MUTEX(_client_lock);
9115 +static LIST_HEAD(_clients);
9117 +static int client_add(struct kcopyd_client *kc)
9119 + down(&_client_lock);
9120 + list_add(&kc->list, &_clients);
9121 + up(&_client_lock);
9125 +static void client_del(struct kcopyd_client *kc)
9127 + down(&_client_lock);
9128 + list_del(&kc->list);
9129 + up(&_client_lock);
9132 +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
9135 + struct kcopyd_client *kc;
9137 + kc = kmalloc(sizeof(*kc), GFP_KERNEL);
9141 + kc->lock = SPIN_LOCK_UNLOCKED;
9142 + INIT_LIST_HEAD(&kc->pages);
9143 + kc->nr_pages = kc->nr_free_pages = 0;
9144 + r = client_alloc_pages(kc, nr_pages);
9150 + r = dm_io_get(nr_pages);
9152 + client_free_pages(kc);
9157 + r = client_add(kc);
9159 + dm_io_put(nr_pages);
9160 + client_free_pages(kc);
9169 +void kcopyd_client_destroy(struct kcopyd_client *kc)
9171 + dm_io_put(kc->nr_pages);
9172 + client_free_pages(kc);
9178 +int __init kcopyd_init(void)
9186 + r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
9193 +void kcopyd_exit(void)
9196 + dm_daemon_stop(&_kcopyd);
9199 +EXPORT_SYMBOL(kcopyd_client_create);
9200 +EXPORT_SYMBOL(kcopyd_client_destroy);
9201 +EXPORT_SYMBOL(kcopyd_copy);
9202 +EXPORT_SYMBOL(kcopyd_cancel);
9203 --- linux-2.4.21/drivers/md/kcopyd.h Thu Jan 1 01:00:00 1970
9204 +++ linux/drivers/md/kcopyd.h Wed Aug 20 14:41:38 2003
9207 + * Copyright (C) 2001 Sistina Software
9209 + * This file is released under the GPL.
9212 +#ifndef DM_KCOPYD_H
9213 +#define DM_KCOPYD_H
9216 + * Needed for the definition of sector_t.
9218 +#include <linux/device-mapper.h>
9219 +#include <linux/iobuf.h>
9223 +int kcopyd_init(void);
9224 +void kcopyd_exit(void);
9226 +/* FIXME: make this configurable */
9227 +#define KCOPYD_MAX_REGIONS 8
9229 +#define KCOPYD_IGNORE_ERROR 1
9232 + * To use kcopyd you must first create a kcopyd client object.
9234 +struct kcopyd_client;
9235 +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
9236 +void kcopyd_client_destroy(struct kcopyd_client *kc);
9239 + * Submit a copy job to kcopyd. This is built on top of the
9240 + * previous three fns.
9242 + * read_err is a boolean,
9243 + * write_err is a bitset, with 1 bit for each destination region
9245 +typedef void (*kcopyd_notify_fn)(int read_err,
9246 + unsigned int write_err, void *context);
9248 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9249 + unsigned int num_dests, struct io_region *dests,
9250 + unsigned int flags, kcopyd_notify_fn fn, void *context);
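
A rough usage sketch of the client interface above. struct io_region comes from
dm-io.h, which is not shown in this hunk; the dev/sector/count fields, the device
numbers and the helper names here are assumptions for illustration only:

	#include <linux/completion.h>
	#include "kcopyd.h"		/* assumes the sketch lives in drivers/md */

	static void copy_done(int read_err, unsigned int write_err, void *context)
	{
		/* read_err is a boolean, write_err has one bit per destination. */
		complete((struct completion *) context);
	}

	/* Sketch: synchronously copy the first 1MB of one device to another. */
	static int copy_example(void)
	{
		struct kcopyd_client *kc;
		struct io_region from, to;	/* assumed: dev, sector, count fields */
		struct completion done;
		int r;

		r = kcopyd_client_create(16, &kc);	/* 16 preallocated pages */
		if (r)
			return r;

		from.dev = MKDEV(8, 1);			/* illustrative source */
		from.sector = 0;
		from.count = 2048;			/* 1MB in 512-byte sectors */

		to = from;
		to.dev = MKDEV(8, 17);			/* illustrative destination */

		init_completion(&done);
		r = kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
		if (!r)
			wait_for_completion(&done);

		kcopyd_client_destroy(kc);
		return r;
	}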
9253 --- linux-2.4.21/fs/buffer.c Fri Jun 13 16:32:48 2003
9254 +++ linux/fs/buffer.c Wed Aug 20 14:41:32 2003
9256 bh->b_list = BUF_CLEAN;
9257 bh->b_end_io = handler;
9258 bh->b_private = private;
9259 + bh->b_journal_head = NULL;
9262 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
9263 --- linux-2.4.21/fs/jbd/journal.c Fri Jun 13 16:32:48 2003
9264 +++ linux/fs/jbd/journal.c Wed Aug 20 14:41:32 2003
9265 @@ -1802,9 +1802,9 @@
9267 if (buffer_jbd(bh)) {
9268 /* Someone did it for us! */
9269 - J_ASSERT_BH(bh, bh->b_private != NULL);
9270 + J_ASSERT_BH(bh, bh->b_journal_head != NULL);
9271 journal_free_journal_head(jh);
9272 - jh = bh->b_private;
9273 + jh = bh->b_journal_head;
9276 * We actually don't need jh_splice_lock when
9277 @@ -1812,7 +1812,7 @@
9279 spin_lock(&jh_splice_lock);
9280 set_bit(BH_JBD, &bh->b_state);
9281 - bh->b_private = jh;
9282 + bh->b_journal_head = jh;
9284 atomic_inc(&bh->b_count);
9285 spin_unlock(&jh_splice_lock);
9286 @@ -1821,7 +1821,7 @@
9289 spin_unlock(&journal_datalist_lock);
9290 - return bh->b_private;
9291 + return bh->b_journal_head;
9295 @@ -1854,7 +1854,7 @@
9296 J_ASSERT_BH(bh, jh2bh(jh) == bh);
9297 BUFFER_TRACE(bh, "remove journal_head");
9298 spin_lock(&jh_splice_lock);
9299 - bh->b_private = NULL;
9300 + bh->b_journal_head = NULL;
9301 jh->b_bh = NULL; /* debug, really */
9302 clear_bit(BH_JBD, &bh->b_state);
9304 --- linux-2.4.21/include/linux/device-mapper.h Thu Jan 1 01:00:00 1970
9305 +++ linux/include/linux/device-mapper.h Wed Aug 20 14:41:38 2003
9308 + * Copyright (C) 2001 Sistina Software (UK) Limited.
9310 + * This file is released under the LGPL.
9313 +#ifndef _LINUX_DEVICE_MAPPER_H
9314 +#define _LINUX_DEVICE_MAPPER_H
9316 +typedef unsigned long sector_t;
9322 +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
9326 + unsigned long long ll;
9330 + * In the constructor the target parameter will already have the
9331 + * table, type, begin and len fields filled in.
9333 +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
9337 + * The destructor doesn't need to free the dm_target, just
9338 + * anything hidden in ti->private.
9340 +typedef void (*dm_dtr_fn) (struct dm_target * ti);
9343 + * The map function must return:
9345 + * = 0: The target will handle the io by resubmitting it later
9346 + * > 0: simple remap complete
9348 +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
9349 + int rw, union map_info *map_context);
9353 + * < 0 : error (currently ignored)
9354 + * 0 : ended successfully
9355 + * 1 : for some reason the io has still not completed (eg,
9356 + * multipath target might want to requeue a failed io).
9358 +typedef int (*dm_endio_fn) (struct dm_target * ti,
9359 + struct buffer_head * bh, int rw, int error,
9360 + union map_info *map_context);
9361 +typedef void (*dm_suspend_fn) (struct dm_target *ti);
9362 +typedef void (*dm_resume_fn) (struct dm_target *ti);
9363 +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
9364 + char *result, unsigned int maxlen);
9366 +void dm_error(const char *message);
9369 + * Constructors should call these functions to ensure destination devices
9370 + * are opened/closed correctly.
9371 + * FIXME: too many arguments.
9373 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
9374 + sector_t len, int mode, struct dm_dev **result);
9375 +void dm_put_device(struct dm_target *ti, struct dm_dev *d);
9378 + * Information about a target type
9380 +struct target_type {
9382 + struct module *module;
9386 + dm_endio_fn end_io;
9387 + dm_suspend_fn suspend;
9388 + dm_resume_fn resume;
9389 + dm_status_fn status;
9393 + struct dm_table *table;
9394 + struct target_type *type;
9396 + /* target limits */
9400 + /* target specific data */
9403 + /* Used to provide an error string from the ctr */
9407 +int dm_register_target(struct target_type *t);
9408 +int dm_unregister_target(struct target_type *t);
9410 +#endif /* _LINUX_DEVICE_MAPPER_H */
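
To show how the hooks above fit together, here is a skeletal target registration.
It is a sketch: the .name/.ctr/.dtr/.map field names are assumed to match the
in-tree linear and striped targets (they are not all visible in this hunk), and
"example" is a made-up target name:

	#include <linux/module.h>
	#include <linux/init.h>
	#include <linux/errno.h>
	#include <linux/fs.h>
	#include <linux/device-mapper.h>

	static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	{
		return 0;			/* nothing to parse or open */
	}

	static void example_dtr(struct dm_target *ti)
	{
	}

	static int example_map(struct dm_target *ti, struct buffer_head *bh, int rw,
			       union map_info *map_context)
	{
		return -EIO;			/* fail all io; a real target remaps bh */
	}

	static struct target_type example_target = {
		.name   = "example",		/* assumed field name */
		.module = THIS_MODULE,
		.ctr    = example_ctr,
		.dtr    = example_dtr,
		.map    = example_map,
	};

	static int __init example_init(void)
	{
		return dm_register_target(&example_target);
	}

	static void __exit example_exit(void)
	{
		dm_unregister_target(&example_target);
	}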
9411 --- linux-2.4.21/include/linux/dm-ioctl.h Thu Jan 1 01:00:00 1970
9412 +++ linux/include/linux/dm-ioctl.h Wed Aug 20 14:41:38 2003
9415 + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
9417 + * This file is released under the LGPL.
9420 +#ifndef _LINUX_DM_IOCTL_H
9421 +#define _LINUX_DM_IOCTL_H
9423 +#include <linux/types.h>
9425 +#define DM_DIR "mapper" /* Slashes not supported */
9426 +#define DM_MAX_TYPE_NAME 16
9427 +#define DM_NAME_LEN 128
9428 +#define DM_UUID_LEN 129
9431 + * A traditional ioctl interface for the device mapper.
9433 + * Each device can have two tables associated with it, an
9434 + * 'active' table which is the one currently used by io passing
9435 + * through the device, and an 'inactive' one which is a table
9436 + * that is being prepared as a replacement for the 'active' one.
9439 + * Just get the version information for the ioctl interface.
9442 + * Remove all dm devices, destroy all tables. Only really used
9445 + * DM_LIST_DEVICES:
9446 + * Get a list of all the dm device names.
9449 + * Create a new device; neither the 'active' nor the 'inactive' table
9450 + * slot will be filled. The device will be in suspended state
9451 + * after creation; however, any io to the device will get errored
9452 + * since it will be out-of-bounds.
9455 + * Remove a device, destroy any tables.
9458 + * Rename a device.
9461 + * This performs both suspend and resume, depending on which flag is
9463 + * Suspend: This command will not return until all pending io to
9464 + * the device has completed. Further io will be deferred until
9465 + * the device is resumed.
9466 + * Resume: It is no longer an error to issue this command on an
9467 + * unsuspended device. If a table is present in the 'inactive'
9468 + * slot, it will be moved to the active slot, then the old table
9469 + * from the active slot will be _destroyed_. Finally the device
9473 + * Retrieves the status for the table in the 'active' slot.
9476 + * Wait for a significant event to occur to the device. This
9477 + * could either be caused by an event triggered by one of the
9478 + * targets of the table in the 'active' slot, or a table change.
9481 + * Load a table into the 'inactive' slot for the device. The
9482 + * device does _not_ need to be suspended prior to this command.
9485 + * Destroy any table in the 'inactive' slot (ie. abort).
9488 + * Return a set of device dependencies for the 'active' table.
9490 + * DM_TABLE_STATUS:
9491 + * Return the targets' status for the 'active' table.
9495 + * All ioctl arguments consist of a single chunk of memory, with
9496 + * this structure at the start. If a uuid is specified any
9497 + * lookup (eg. for a DM_INFO) will be done on that, *not* the
9502 + * The version number is made up of three parts:
9503 + * major - no backward or forward compatibility,
9504 + * minor - only backwards compatible,
9505 + * patch - both backwards and forwards compatible.
9507 + * All clients of the ioctl interface should fill in the
9508 + * version number of the interface that they were
9511 + * All recognised ioctl commands (ie. those that don't
9512 + * return -ENOTTY) fill out this field, even if the
9515 + uint32_t version[3]; /* in/out */
9516 + uint32_t data_size; /* total size of data passed in
9517 + * including this struct */
9519 + uint32_t data_start; /* offset to start of data
9520 + * relative to start of this struct */
9522 + uint32_t target_count; /* in/out */
9523 + int32_t open_count; /* out */
9524 + uint32_t flags; /* in/out */
9525 + uint32_t event_nr; /* in/out */
9528 + uint64_t dev; /* in/out */
9530 + char name[DM_NAME_LEN]; /* device name */
9531 + char uuid[DM_UUID_LEN]; /* unique identifier for
9532 + * the block device */
9536 + * Used to specify tables. These structures appear after the
9539 +struct dm_target_spec {
9540 + uint64_t sector_start;
9542 + int32_t status; /* used when reading from kernel only */
9545 + * Offset in bytes (from the start of this struct) to
9546 + * next target_spec.
9550 + char target_type[DM_MAX_TYPE_NAME];
9553 + * Parameter string starts immediately after this object.
9554 + * Be careful to add padding after the string to ensure correct
9555 + * alignment of subsequent dm_target_spec.
9560 + * Used to retrieve the target dependencies.
9562 +struct dm_target_deps {
9563 + uint32_t count; /* Array size */
9564 + uint32_t padding; /* unused */
9565 + uint64_t dev[0]; /* out */
9569 + * Used to get a list of all dm devices.
9571 +struct dm_name_list {
9573 + uint32_t next; /* offset to the next record from
9574 + the _start_ of this */
9579 + * If you change this make sure you make the corresponding change
9580 + * to dm-ioctl.c:lookup_ioctl()
9583 + /* Top level cmds */
9584 + DM_VERSION_CMD = 0,
9585 + DM_REMOVE_ALL_CMD,
9586 + DM_LIST_DEVICES_CMD,
9588 + /* device level cmds */
9589 + DM_DEV_CREATE_CMD,
9590 + DM_DEV_REMOVE_CMD,
9591 + DM_DEV_RENAME_CMD,
9592 + DM_DEV_SUSPEND_CMD,
9593 + DM_DEV_STATUS_CMD,
9596 + /* Table level cmds */
9597 + DM_TABLE_LOAD_CMD,
9598 + DM_TABLE_CLEAR_CMD,
9599 + DM_TABLE_DEPS_CMD,
9600 + DM_TABLE_STATUS_CMD,
9603 +#define DM_IOCTL 0xfd
9605 +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
9606 +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
9607 +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
9609 +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
9610 +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
9611 +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
9612 +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
9613 +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
9614 +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
9616 +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
9617 +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
9618 +#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
9619 +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
9621 +#define DM_VERSION_MAJOR 4
9622 +#define DM_VERSION_MINOR 0
9623 +#define DM_VERSION_PATCHLEVEL 3
9624 +#define DM_VERSION_EXTRA "-ioctl (2003-08-22)"
9627 +#define DM_READONLY_FLAG (1 << 0) /* In/Out */
9628 +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */
9629 +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
9632 + * Flag passed into ioctl STATUS command to get table information
9633 + * rather than current status.
9635 +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */
9638 + * Flags that indicate whether a table is present in either of
9639 + * the two table slots that a device has.
9641 +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */
9642 +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
9645 + * Indicates that the buffer passed in wasn't big enough for the
9648 +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
9650 +#endif /* _LINUX_DM_IOCTL_H */
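
As a concrete illustration of the header above, a userspace caller queries the
interface version like this. This is a sketch; it assumes the control node has
been created as /dev/mapper/control and that the header is usable from userspace:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/dm-ioctl.h>

	int main(void)
	{
		struct dm_ioctl dmi;
		int fd = open("/dev/mapper/control", O_RDWR);	/* path assumed */

		if (fd < 0)
			return 1;

		memset(&dmi, 0, sizeof(dmi));
		dmi.version[0] = DM_VERSION_MAJOR;	/* tell the kernel which */
		dmi.version[1] = DM_VERSION_MINOR;	/* interface we speak */
		dmi.version[2] = DM_VERSION_PATCHLEVEL;
		dmi.data_size = sizeof(dmi);		/* no payload after the header */

		if (ioctl(fd, DM_VERSION, &dmi) < 0) {
			perror("DM_VERSION");
			close(fd);
			return 1;
		}

		printf("dm-ioctl interface %u.%u.%u\n",
		       dmi.version[0], dmi.version[1], dmi.version[2]);
		close(fd);
		return 0;
	}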
9651 --- linux-2.4.21/include/linux/fs.h Fri Jun 13 16:32:51 2003
9652 +++ linux/include/linux/fs.h Wed Aug 20 14:41:32 2003
9654 struct page *b_page; /* the page this bh is mapped to */
9655 void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
9656 void *b_private; /* reserved for b_end_io */
9658 + void *b_journal_head; /* ext3 journal_heads */
9659 unsigned long b_rsector; /* Real buffer location on disk */
9660 wait_queue_head_t b_wait;
9662 --- linux-2.4.21/include/linux/jbd.h Fri Jun 13 16:32:51 2003
9663 +++ linux/include/linux/jbd.h Wed Aug 20 14:41:32 2003
9666 static inline struct journal_head *bh2jh(struct buffer_head *bh)
9668 - return bh->b_private;
9669 + return bh->b_journal_head;
9672 #define HAVE_JOURNAL_CALLBACK_STATUS
9673 --- linux-2.4.21/include/linux/mempool.h Thu Jan 1 01:00:00 1970
9674 +++ linux/include/linux/mempool.h Wed Aug 20 14:41:48 2003
9677 + * memory buffer pool support
9679 +#ifndef _LINUX_MEMPOOL_H
9680 +#define _LINUX_MEMPOOL_H
9682 +#include <linux/list.h>
9683 +#include <linux/wait.h>
9686 +typedef struct mempool_s mempool_t;
9688 +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
9689 +typedef void (mempool_free_t)(void *element, void *pool_data);
9691 +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9692 + mempool_free_t *free_fn, void *pool_data);
9693 +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
9694 +extern void mempool_destroy(mempool_t *pool);
9695 +extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
9696 +extern void mempool_free(void *element, mempool_t *pool);
9699 + * A mempool_alloc_t and mempool_free_t that get the memory from
9700 + * a slab that is passed in through pool_data.
9702 +void *mempool_alloc_slab(int gfp_mask, void *pool_data);
9703 +void mempool_free_slab(void *element, void *pool_data);
9706 +#endif /* _LINUX_MEMPOOL_H */
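
The slab-backed helpers at the bottom are the common pattern; kcopyd's job pool
above uses exactly this. A condensed sketch with made-up names:

	#include <linux/errno.h>
	#include <linux/list.h>
	#include <linux/slab.h>
	#include <linux/mempool.h>

	struct foo {
		struct list_head list;
		int data;
	};

	static kmem_cache_t *foo_cache;
	static mempool_t *foo_pool;

	static int foo_pool_init(void)
	{
		foo_cache = kmem_cache_create("foo-objects", sizeof(struct foo),
					      __alignof__(struct foo), 0, NULL, NULL);
		if (!foo_cache)
			return -ENOMEM;

		/* Guarantee 64 objects even when the slab allocator fails. */
		foo_pool = mempool_create(64, mempool_alloc_slab,
					  mempool_free_slab, foo_cache);
		if (!foo_pool) {
			kmem_cache_destroy(foo_cache);
			return -ENOMEM;
		}
		return 0;
	}

	static void foo_pool_exit(void)
	{
		mempool_destroy(foo_pool);	/* all elements must be freed first */
		kmem_cache_destroy(foo_cache);
	}

In the io path, mempool_alloc(foo_pool, GFP_NOIO) then either allocates normally,
takes an element from the preallocated reserve, or sleeps on the pool's wait queue
until mempool_free() returns one, so it never fails in process context.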
9707 --- linux-2.4.21/include/linux/vmalloc.h Fri Jan 10 16:35:58 2003
9708 +++ linux/include/linux/vmalloc.h Wed Aug 20 14:41:57 2003
9710 extern void vmfree_area_pages(unsigned long address, unsigned long size);
9711 extern int vmalloc_area_pages(unsigned long address, unsigned long size,
9712 int gfp_mask, pgprot_t prot);
9713 +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
9716 * Allocate any pages
9717 --- linux-2.4.21/kernel/ksyms.c Fri Jun 13 16:32:52 2003
9718 +++ linux/kernel/ksyms.c Wed Aug 20 14:41:57 2003
9720 EXPORT_SYMBOL(vfree);
9721 EXPORT_SYMBOL(__vmalloc);
9722 EXPORT_SYMBOL(vmalloc_to_page);
9723 +EXPORT_SYMBOL(vcalloc);
9724 EXPORT_SYMBOL(mem_map);
9725 EXPORT_SYMBOL(remap_page_range);
9726 EXPORT_SYMBOL(max_mapnr);
9727 --- linux-2.4.21/mm/Makefile Fri Jan 10 16:36:02 2003
9728 +++ linux/mm/Makefile Wed Aug 20 14:41:48 2003
9733 -export-objs := shmem.o filemap.o memory.o page_alloc.o
9734 +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
9736 obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
9737 vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
9738 page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
9742 obj-$(CONFIG_HIGHMEM) += highmem.o
9744 --- linux-2.4.21/mm/filemap.c Fri Jun 13 16:33:25 2003
9745 +++ linux/mm/filemap.c Wed Aug 20 14:41:53 2003
9746 @@ -1704,8 +1704,10 @@
9747 retval = generic_file_direct_IO(READ, filp, buf, count, pos);
9749 *ppos = pos + retval;
9752 - UPDATE_ATIME(filp->f_dentry->d_inode);
9753 + if (!S_ISBLK(inode->i_mode))
9754 + UPDATE_ATIME(filp->f_dentry->d_inode);
9758 --- linux-2.4.21/mm/mempool.c Thu Jan 1 01:00:00 1970
9759 +++ linux/mm/mempool.c Wed Aug 20 14:41:48 2003
9762 + * linux/mm/mempool.c
9764 + * memory buffer pool support. Such pools are mostly used
9765 + * for guaranteed, deadlock-free memory allocations during
9766 + * extreme VM load.
9768 + * started by Ingo Molnar, Copyright (C) 2001
9771 +#include <linux/mm.h>
9772 +#include <linux/slab.h>
9773 +#include <linux/module.h>
9774 +#include <linux/mempool.h>
9778 + int min_nr; /* nr of elements at *elements */
9779 + int curr_nr; /* Current nr of elements at *elements */
9783 + mempool_alloc_t *alloc;
9784 + mempool_free_t *free;
9785 + wait_queue_head_t wait;
9788 +static void add_element(mempool_t *pool, void *element)
9790 + BUG_ON(pool->curr_nr >= pool->min_nr);
9791 + pool->elements[pool->curr_nr++] = element;
9794 +static void *remove_element(mempool_t *pool)
9796 + BUG_ON(pool->curr_nr <= 0);
9797 + return pool->elements[--pool->curr_nr];
9800 +static void free_pool(mempool_t *pool)
9802 + while (pool->curr_nr) {
9803 + void *element = remove_element(pool);
9804 + pool->free(element, pool->pool_data);
9806 + kfree(pool->elements);
9811 + * mempool_create - create a memory pool
9812 + * @min_nr: the minimum number of elements guaranteed to be
9813 + * allocated for this pool.
9814 + * @alloc_fn: user-defined element-allocation function.
9815 + * @free_fn: user-defined element-freeing function.
9816 + * @pool_data: optional private data available to the user-defined functions.
9818 + * this function creates and allocates a guaranteed size, preallocated
9819 + * memory pool. The pool can be used from the mempool_alloc and mempool_free
9820 + * functions. This function might sleep. Both the alloc_fn() and the free_fn()
9821 + * functions might sleep - as long as the mempool_alloc function is not called
9822 + * from IRQ contexts.
9824 +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9825 + mempool_free_t *free_fn, void *pool_data)
9829 + pool = kmalloc(sizeof(*pool), GFP_KERNEL);
9832 + memset(pool, 0, sizeof(*pool));
9833 + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
9834 + if (!pool->elements) {
9838 + spin_lock_init(&pool->lock);
9839 + pool->min_nr = min_nr;
9840 + pool->pool_data = pool_data;
9841 + init_waitqueue_head(&pool->wait);
9842 + pool->alloc = alloc_fn;
9843 + pool->free = free_fn;
9846 + * First pre-allocate the guaranteed number of buffers.
9848 + while (pool->curr_nr < pool->min_nr) {
9851 + element = pool->alloc(GFP_KERNEL, pool->pool_data);
9852 + if (unlikely(!element)) {
9856 + add_element(pool, element);
9862 + * mempool_resize - resize an existing memory pool
9863 + * @pool: pointer to the memory pool which was allocated via
9864 + * mempool_create().
9865 + * @new_min_nr: the new minimum number of elements guaranteed to be
9866 + * allocated for this pool.
9867 + * @gfp_mask: the usual allocation bitmask.
9869 + * This function shrinks/grows the pool. In the case of growing,
9870 + * it cannot be guaranteed that the pool will be grown to the new
9871 + * size immediately, but new mempool_free() calls will refill it.
9873 + * Note, the caller must guarantee that no mempool_destroy is called
9874 + * while this function is running. mempool_alloc() & mempool_free()
9875 + * might be called (eg. from IRQ contexts) while this function executes.
9877 +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
9880 + void **new_elements;
9881 + unsigned long flags;
9883 + BUG_ON(new_min_nr <= 0);
9885 + spin_lock_irqsave(&pool->lock, flags);
9886 + if (new_min_nr < pool->min_nr) {
9887 + while (pool->curr_nr > new_min_nr) {
9888 + element = remove_element(pool);
9889 + spin_unlock_irqrestore(&pool->lock, flags);
9890 + pool->free(element, pool->pool_data);
9891 + spin_lock_irqsave(&pool->lock, flags);
9893 + pool->min_nr = new_min_nr;
9896 + spin_unlock_irqrestore(&pool->lock, flags);
9898 + /* Grow the pool */
9899 + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
9900 + if (!new_elements)
9903 + spin_lock_irqsave(&pool->lock, flags);
9904 + memcpy(new_elements, pool->elements,
9905 + pool->curr_nr * sizeof(*new_elements));
9906 + kfree(pool->elements);
9907 + pool->elements = new_elements;
9908 + pool->min_nr = new_min_nr;
9910 + while (pool->curr_nr < pool->min_nr) {
9911 + spin_unlock_irqrestore(&pool->lock, flags);
9912 + element = pool->alloc(gfp_mask, pool->pool_data);
9915 + spin_lock_irqsave(&pool->lock, flags);
9916 + if (pool->curr_nr < pool->min_nr)
9917 + add_element(pool, element);
9919 + kfree(element); /* Raced */
9922 + spin_unlock_irqrestore(&pool->lock, flags);
9928 + * mempool_destroy - deallocate a memory pool
9929 + * @pool: pointer to the memory pool which was allocated via
9930 + * mempool_create().
9932 + * this function only sleeps if the free_fn() function sleeps. The caller
9933 + * has to guarantee that all elements have been returned to the pool (ie:
9934 + * freed) prior to calling mempool_destroy().
9936 +void mempool_destroy(mempool_t *pool)
9938 + if (pool->curr_nr != pool->min_nr)
9939 + BUG(); /* There were outstanding elements */
9944 + * mempool_alloc - allocate an element from a specific memory pool
9945 + * @pool: pointer to the memory pool which was allocated via
9946 + * mempool_create().
9947 + * @gfp_mask: the usual allocation bitmask.
9949 + * this function only sleeps if the alloc_fn function sleeps or
9950 + * returns NULL. Note that due to preallocation, this function
9951 + * *never* fails when called from process contexts. (it might
9952 + * fail if called from an IRQ context.)
9954 +void * mempool_alloc(mempool_t *pool, int gfp_mask)
9957 + unsigned long flags;
9959 + DECLARE_WAITQUEUE(wait, current);
9960 + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
9963 + element = pool->alloc(gfp_nowait, pool->pool_data);
9964 + if (likely(element != NULL))
9968 + * If the pool is less than 50% full then try harder
9969 + * to allocate an element:
9971 + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
9972 + element = pool->alloc(gfp_mask, pool->pool_data);
9973 + if (likely(element != NULL))
9978 + * Kick the VM at this point.
9982 + spin_lock_irqsave(&pool->lock, flags);
9983 + if (likely(pool->curr_nr)) {
9984 + element = remove_element(pool);
9985 + spin_unlock_irqrestore(&pool->lock, flags);
9988 + spin_unlock_irqrestore(&pool->lock, flags);
9990 + /* We must not sleep in the GFP_ATOMIC case */
9991 + if (gfp_mask == gfp_nowait)
9994 + run_task_queue(&tq_disk);
9996 + add_wait_queue_exclusive(&pool->wait, &wait);
9997 + set_task_state(current, TASK_UNINTERRUPTIBLE);
9999 + spin_lock_irqsave(&pool->lock, flags);
10000 + curr_nr = pool->curr_nr;
10001 + spin_unlock_irqrestore(&pool->lock, flags);
10006 + current->state = TASK_RUNNING;
10007 + remove_wait_queue(&pool->wait, &wait);
10009 + goto repeat_alloc;
10013 + * mempool_free - return an element to the pool.
10014 + * @element: pool element pointer.
10015 + * @pool: pointer to the memory pool which was allocated via
10016 + * mempool_create().
10018 + * this function only sleeps if the free_fn() function sleeps.
10020 +void mempool_free(void *element, mempool_t *pool)
10022 + unsigned long flags;
10024 + if (pool->curr_nr < pool->min_nr) {
10025 + spin_lock_irqsave(&pool->lock, flags);
10026 + if (pool->curr_nr < pool->min_nr) {
10027 + add_element(pool, element);
10028 + spin_unlock_irqrestore(&pool->lock, flags);
10029 + wake_up(&pool->wait);
10032 + spin_unlock_irqrestore(&pool->lock, flags);
10034 + pool->free(element, pool->pool_data);
10038 + * A commonly used alloc and free fn.
10040 +void *mempool_alloc_slab(int gfp_mask, void *pool_data)
10042 + kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10043 + return kmem_cache_alloc(mem, gfp_mask);
10046 +void mempool_free_slab(void *element, void *pool_data)
10048 + kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10049 + kmem_cache_free(mem, element);
10053 +EXPORT_SYMBOL(mempool_create);
10054 +EXPORT_SYMBOL(mempool_resize);
10055 +EXPORT_SYMBOL(mempool_destroy);
10056 +EXPORT_SYMBOL(mempool_alloc);
10057 +EXPORT_SYMBOL(mempool_free);
10058 +EXPORT_SYMBOL(mempool_alloc_slab);
10059 +EXPORT_SYMBOL(mempool_free_slab);
10060 --- linux-2.4.21/mm/vmalloc.c Fri Jun 13 16:33:25 2003
10061 +++ linux/mm/vmalloc.c Wed Aug 20 14:41:57 2003
10062 @@ -327,3 +327,22 @@
10063 read_unlock(&vmlist_lock);
10064 return buf - buf_start;
10067 +void *vcalloc(unsigned long nmemb, unsigned long elem_size)
10069 + unsigned long size;
10073 + * Check that we're not going to overflow.
10075 + if (nmemb > (ULONG_MAX / elem_size))
10078 + size = nmemb * elem_size;
10079 + addr = vmalloc(size);
10081 + memset(addr, 0, size);
10085 Supply the number of targets when creating a table, to avoid needing to extend it later.
10086 --- linux-2.4.21/drivers/md/dm-ioctl.c Mon Aug 18 21:24:26 2003
10087 +++ linux/drivers/md/dm-ioctl.c Fri Aug 22 13:49:01 2003
10088 @@ -764,7 +764,7 @@
10089 struct hash_cell *hc;
10090 struct dm_table *t;
10092 - r = dm_table_create(&t, get_mode(param));
10093 + r = dm_table_create(&t, get_mode(param), param->target_count);
10097 --- linux-2.4.21/drivers/md/dm-table.c Tue Aug 19 15:43:50 2003
10098 +++ linux/drivers/md/dm-table.c Fri Aug 22 14:48:50 2003
10099 @@ -148,7 +148,7 @@
10103 -int dm_table_create(struct dm_table **result, int mode)
10104 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
10106 struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
10108 @@ -159,8 +159,10 @@
10109 INIT_LIST_HEAD(&t->devices);
10110 atomic_set(&t->holders, 1);
10112 - /* allocate a single nodes worth of targets to begin with */
10113 - if (alloc_targets(t, KEYS_PER_NODE)) {
10114 + if (!num_targets)
10115 + num_targets = KEYS_PER_NODE;
10117 + if (alloc_targets(t, num_targets)) {
10121 --- linux-2.4.21/drivers/md/dm.h Sat Jul 12 17:06:52 2003
10122 +++ linux/drivers/md/dm.h Fri Aug 22 13:50:19 2003
10124 * Functions for manipulating a table. Tables are also reference
10126 *---------------------------------------------------------------*/
10127 -int dm_table_create(struct dm_table **result, int mode);
10128 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
10130 void dm_table_get(struct dm_table *t);
10131 void dm_table_put(struct dm_table *t);