2 --- diff/Documentation/Configure.help 2003-10-10 23:39:03.000000000 +0100
3 +++ source/Documentation/Configure.help 2003-10-16 10:44:23.000000000 +0100
5 want), say M here and read <file:Documentation/modules.txt>. The
6 module will be called lvm-mod.o.
10 + Device-mapper is a low level volume manager. It works by allowing
11 + people to specify mappings for ranges of logical sectors. Various
12 + mapping types are available, in addition people may write their own
13 + modules containing custom mappings if they wish.
15 + Higher level volume managers such as LVM2 use this driver.
17 + If you want to compile this as a module, say M here and read
18 + <file:Documentation/modules.txt>. The module will be called dm-mod.o.
22 Multiple devices driver support (RAID and LVM)
24 Support multiple physical spindles through a single logical device.
25 --- diff/MAINTAINERS 2003-10-10 23:39:03.000000000 +0100
26 +++ source/MAINTAINERS 2003-10-16 10:44:23.000000000 +0100
28 W: http://www.debian.org/~dz/i8k/
34 +L: linux-LVM@sistina.com
35 +W: http://www.sistina.com/lvm
38 DEVICE NUMBER REGISTRY
41 --- diff/arch/mips64/kernel/ioctl32.c 2003-08-26 13:50:03.000000000 +0100
42 +++ source/arch/mips64/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
44 #include <linux/auto_fs4.h>
45 #include <linux/ext2_fs.h>
46 #include <linux/raid/md_u.h>
47 +#include <linux/dm-ioctl.h>
48 #include <linux/serial.h>
50 #include <scsi/scsi.h>
51 @@ -1228,6 +1229,22 @@
52 IOCTL32_DEFAULT(SBPROF_ZBWAITFULL),
53 #endif /* CONFIG_SIBYTE_TBPROF */
55 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
56 + IOCTL32_DEFAULT(DM_VERSION),
57 + IOCTL32_DEFAULT(DM_REMOVE_ALL),
58 + IOCTL32_DEFAULT(DM_DEV_CREATE),
59 + IOCTL32_DEFAULT(DM_DEV_REMOVE),
60 + IOCTL32_DEFAULT(DM_TABLE_LOAD),
61 + IOCTL32_DEFAULT(DM_DEV_SUSPEND),
62 + IOCTL32_DEFAULT(DM_DEV_RENAME),
63 + IOCTL32_DEFAULT(DM_TABLE_DEPS),
64 + IOCTL32_DEFAULT(DM_DEV_STATUS),
65 + IOCTL32_DEFAULT(DM_TABLE_STATUS),
66 + IOCTL32_DEFAULT(DM_DEV_WAIT),
67 + IOCTL32_DEFAULT(DM_LIST_DEVICES),
68 + IOCTL32_DEFAULT(DM_TABLE_CLEAR),
69 +#endif /* CONFIG_BLK_DEV_DM */
71 IOCTL32_DEFAULT(MTIOCTOP), /* mtio.h ioctls */
72 IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans),
73 IOCTL32_HANDLER(MTIOCPOS32, mt_ioctl_trans),
74 --- diff/arch/parisc/kernel/ioctl32.c 2003-08-26 13:50:03.000000000 +0100
75 +++ source/arch/parisc/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
78 #include <linux/lvm.h>
80 +#include <linux/dm-ioctl.h>
82 #include <scsi/scsi.h>
84 @@ -3423,6 +3424,22 @@
85 COMPATIBLE_IOCTL(LV_BMAP)
86 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
89 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
90 +COMPATIBLE_IOCTL(DM_VERSION)
91 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
92 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
93 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
94 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
95 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
96 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
97 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
98 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
99 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
100 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
101 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
102 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
103 +#endif /* CONFIG_BLK_DEV_DM */
104 #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
105 COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
106 COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
107 --- diff/arch/ppc64/kernel/ioctl32.c 2003-08-26 13:50:04.000000000 +0100
108 +++ source/arch/ppc64/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
110 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
111 #include <linux/lvm.h>
113 +#include <linux/dm-ioctl.h>
115 #include <scsi/scsi.h>
117 @@ -4435,6 +4436,22 @@
118 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
119 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
120 COMPATIBLE_IOCTL(NBD_DISCONNECT),
122 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
123 +COMPATIBLE_IOCTL(DM_VERSION),
124 +COMPATIBLE_IOCTL(DM_REMOVE_ALL),
125 +COMPATIBLE_IOCTL(DM_DEV_CREATE),
126 +COMPATIBLE_IOCTL(DM_DEV_REMOVE),
127 +COMPATIBLE_IOCTL(DM_TABLE_LOAD),
128 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
129 +COMPATIBLE_IOCTL(DM_DEV_RENAME),
130 +COMPATIBLE_IOCTL(DM_TABLE_DEPS),
131 +COMPATIBLE_IOCTL(DM_DEV_STATUS),
132 +COMPATIBLE_IOCTL(DM_TABLE_STATUS),
133 +COMPATIBLE_IOCTL(DM_DEV_WAIT),
134 +COMPATIBLE_IOCTL(DM_LIST_DEVICES),
135 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
136 +#endif /* CONFIG_BLK_DEV_DM */
137 /* Remove *PRIVATE in 2.5 */
138 COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
139 COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
140 --- diff/arch/s390x/kernel/ioctl32.c 2003-08-26 13:50:04.000000000 +0100
141 +++ source/arch/s390x/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
143 #include <linux/blk.h>
144 #include <linux/elevator.h>
145 #include <linux/raw.h>
146 +#include <linux/dm-ioctl.h>
147 #include <asm/types.h>
148 #include <asm/uaccess.h>
149 #include <asm/dasd.h>
152 IOCTL32_DEFAULT(SIOCGSTAMP),
154 + IOCTL32_DEFAULT(DM_VERSION),
155 + IOCTL32_DEFAULT(DM_REMOVE_ALL),
156 + IOCTL32_DEFAULT(DM_DEV_CREATE),
157 + IOCTL32_DEFAULT(DM_DEV_REMOVE),
158 + IOCTL32_DEFAULT(DM_TABLE_LOAD),
159 + IOCTL32_DEFAULT(DM_DEV_SUSPEND),
160 + IOCTL32_DEFAULT(DM_DEV_RENAME),
161 + IOCTL32_DEFAULT(DM_TABLE_DEPS),
162 + IOCTL32_DEFAULT(DM_DEV_STATUS),
163 + IOCTL32_DEFAULT(DM_TABLE_STATUS),
164 + IOCTL32_DEFAULT(DM_DEV_WAIT),
165 + IOCTL32_DEFAULT(DM_LIST_DEVICES),
166 + IOCTL32_DEFAULT(DM_TABLE_CLEAR),
168 IOCTL32_DEFAULT(LOOP_SET_FD),
169 IOCTL32_DEFAULT(LOOP_CLR_FD),
171 --- diff/arch/sparc64/kernel/ioctl32.c 2003-10-10 23:39:05.000000000 +0100
172 +++ source/arch/sparc64/kernel/ioctl32.c 2003-10-16 10:44:23.000000000 +0100
174 #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
175 #include <linux/lvm.h>
177 +#include <linux/dm-ioctl.h>
179 #include <scsi/scsi.h>
181 @@ -5086,6 +5087,22 @@
182 COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
183 COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
184 COMPATIBLE_IOCTL(NBD_DISCONNECT)
186 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
187 +COMPATIBLE_IOCTL(DM_VERSION)
188 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
189 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
190 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
191 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
192 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
193 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
194 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
195 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
196 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
197 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
198 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
199 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
200 +#endif /* CONFIG_BLK_DEV_DM */
202 #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
203 COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
204 --- diff/arch/x86_64/ia32/ia32_ioctl.c 2003-10-10 23:39:05.000000000 +0100
205 +++ source/arch/x86_64/ia32/ia32_ioctl.c 2003-10-16 10:44:23.000000000 +0100
208 #include <linux/lvm.h>
210 +#include <linux/dm-ioctl.h>
212 #include <scsi/scsi.h>
214 @@ -4051,6 +4052,22 @@
215 COMPATIBLE_IOCTL(LV_BMAP)
216 COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
219 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
220 +COMPATIBLE_IOCTL(DM_VERSION)
221 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
222 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
223 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
224 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
225 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
226 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
227 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
228 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
229 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
230 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
231 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
232 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
233 +#endif /* CONFIG_BLK_DEV_DM */
234 #ifdef CONFIG_AUTOFS_FS
235 COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
236 COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
237 --- diff/drivers/md/Config.in 2001-09-26 16:15:05.000000000 +0100
238 +++ source/drivers/md/Config.in 2003-10-16 10:44:23.000000000 +0100
240 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
242 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
243 +if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
244 + dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
245 + dep_tristate ' Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
249 --- diff/drivers/md/Makefile 2002-01-17 10:07:52.000000000 +0000
250 +++ source/drivers/md/Makefile 2003-10-16 10:44:23.000000000 +0100
255 -export-objs := md.o xor.o
256 -list-multi := lvm-mod.o
257 +export-objs := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \
258 + dm-log.o dm-io.o dm.o
260 +list-multi := lvm-mod.o dm-mod.o dm-mirror-mod.o
261 lvm-mod-objs := lvm.o lvm-snap.o lvm-fs.o
262 +dm-mod-objs := dm.o dm-table.o dm-target.o dm-ioctl.o \
263 + dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
264 + kcopyd.o dm-daemon.o dm-io.o
265 +dm-mirror-mod-objs := dm-raid1.o dm-log.o
267 # Note: link order is important. All raid personalities
268 # and xor.o must come before md.o, as they each initialise
269 # themselves, and md.o may use the personalities when it
272 -obj-$(CONFIG_MD_LINEAR) += linear.o
273 -obj-$(CONFIG_MD_RAID0) += raid0.o
274 -obj-$(CONFIG_MD_RAID1) += raid1.o
275 -obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
276 -obj-$(CONFIG_MD_MULTIPATH) += multipath.o
277 -obj-$(CONFIG_BLK_DEV_MD) += md.o
278 -obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
279 +obj-$(CONFIG_MD_LINEAR) += linear.o
280 +obj-$(CONFIG_MD_RAID0) += raid0.o
281 +obj-$(CONFIG_MD_RAID1) += raid1.o
282 +obj-$(CONFIG_MD_RAID5) += raid5.o xor.o
283 +obj-$(CONFIG_MD_MULTIPATH) += multipath.o
284 +obj-$(CONFIG_BLK_DEV_MD) += md.o
286 +obj-$(CONFIG_BLK_DEV_LVM) += lvm-mod.o
288 +obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
289 +obj-$(CONFIG_BLK_DEV_DM_MIRROR) += dm-mirror.o
291 include $(TOPDIR)/Rules.make
293 lvm-mod.o: $(lvm-mod-objs)
294 $(LD) -r -o $@ $(lvm-mod-objs)
296 +dm-mod.o: $(dm-mod-objs)
297 + $(LD) -r -o $@ $(dm-mod-objs)
299 +dm-mirror.o: $(dm-mirror-mod-objs)
300 + $(LD) -r -o $@ $(dm-mirror-mod-objs)
302 --- diff/fs/buffer.c 2003-10-10 23:39:08.000000000 +0100
303 +++ source/fs/buffer.c 2003-10-16 10:44:23.000000000 +0100
305 bh->b_list = BUF_CLEAN;
306 bh->b_end_io = handler;
307 bh->b_private = private;
308 + bh->b_journal_head = NULL;
311 static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
312 --- diff/fs/jbd/journal.c 2003-10-10 23:39:08.000000000 +0100
313 +++ source/fs/jbd/journal.c 2003-10-16 10:44:23.000000000 +0100
314 @@ -1802,9 +1802,9 @@
316 if (buffer_jbd(bh)) {
317 /* Someone did it for us! */
318 - J_ASSERT_BH(bh, bh->b_private != NULL);
319 + J_ASSERT_BH(bh, bh->b_journal_head != NULL);
320 journal_free_journal_head(jh);
321 - jh = bh->b_private;
322 + jh = bh->b_journal_head;
325 * We actually don't need jh_splice_lock when
326 @@ -1812,7 +1812,7 @@
328 spin_lock(&jh_splice_lock);
329 set_bit(BH_JBD, &bh->b_state);
330 - bh->b_private = jh;
331 + bh->b_journal_head = jh;
333 atomic_inc(&bh->b_count);
334 spin_unlock(&jh_splice_lock);
335 @@ -1821,7 +1821,7 @@
338 spin_unlock(&journal_datalist_lock);
339 - return bh->b_private;
340 + return bh->b_journal_head;
344 @@ -1854,7 +1854,7 @@
345 J_ASSERT_BH(bh, jh2bh(jh) == bh);
346 BUFFER_TRACE(bh, "remove journal_head");
347 spin_lock(&jh_splice_lock);
348 - bh->b_private = NULL;
349 + bh->b_journal_head = NULL;
350 jh->b_bh = NULL; /* debug, really */
351 clear_bit(BH_JBD, &bh->b_state);
353 --- diff/include/linux/fs.h 2003-10-10 23:39:08.000000000 +0100
354 +++ source/include/linux/fs.h 2003-10-16 10:44:23.000000000 +0100
356 struct page *b_page; /* the page this bh is mapped to */
357 void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
358 void *b_private; /* reserved for b_end_io */
360 + void *b_journal_head; /* ext3 journal_heads */
361 unsigned long b_rsector; /* Real buffer location on disk */
362 wait_queue_head_t b_wait;
364 --- diff/include/linux/jbd.h 2003-06-16 09:56:12.000000000 +0100
365 +++ source/include/linux/jbd.h 2003-10-16 10:44:23.000000000 +0100
368 static inline struct journal_head *bh2jh(struct buffer_head *bh)
370 - return bh->b_private;
371 + return bh->b_journal_head;
374 #define HAVE_JOURNAL_CALLBACK_STATUS
375 --- diff/include/linux/vmalloc.h 2003-08-26 13:50:14.000000000 +0100
376 +++ source/include/linux/vmalloc.h 2003-10-16 10:44:23.000000000 +0100
378 extern void vmfree_area_pages(unsigned long address, unsigned long size);
379 extern int vmalloc_area_pages(unsigned long address, unsigned long size,
380 int gfp_mask, pgprot_t prot);
381 +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
385 --- diff/kernel/ksyms.c 2003-10-10 23:39:08.000000000 +0100
386 +++ source/kernel/ksyms.c 2003-10-16 10:44:23.000000000 +0100
388 EXPORT_SYMBOL(__vmalloc);
390 EXPORT_SYMBOL(vmalloc_to_page);
391 +EXPORT_SYMBOL(vcalloc);
392 EXPORT_SYMBOL(mem_map);
393 EXPORT_SYMBOL(remap_page_range);
394 EXPORT_SYMBOL(max_mapnr);
395 --- diff/mm/Makefile 2002-08-05 14:57:44.000000000 +0100
396 +++ source/mm/Makefile 2003-10-16 10:44:23.000000000 +0100
401 -export-objs := shmem.o filemap.o memory.o page_alloc.o
402 +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
404 obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
405 vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
406 page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
410 obj-$(CONFIG_HIGHMEM) += highmem.o
412 --- diff/mm/filemap.c 2003-10-10 23:39:08.000000000 +0100
413 +++ source/mm/filemap.c 2003-10-16 10:44:23.000000000 +0100
414 @@ -1753,7 +1753,8 @@
417 up_read(&inode->i_alloc_sem);
418 - UPDATE_ATIME(filp->f_dentry->d_inode);
419 + if (!S_ISBLK(inode->i_mode))
420 + UPDATE_ATIME(filp->f_dentry->d_inode);
424 @@ -3131,8 +3132,12 @@
428 - inode->i_ctime = inode->i_mtime = CURRENT_TIME;
429 - mark_inode_dirty_sync(inode);
431 + /* Don't update times for block devices using O_DIRECT */
432 + if (!(file->f_flags & O_DIRECT) || !S_ISBLK(inode->i_mode)) {
433 + inode->i_ctime = inode->i_mtime = CURRENT_TIME;
434 + mark_inode_dirty_sync(inode);
438 unsigned long index, offset;
439 --- diff/mm/vmalloc.c 2003-08-26 13:50:14.000000000 +0100
440 +++ source/mm/vmalloc.c 2003-10-16 10:44:23.000000000 +0100
442 read_unlock(&vmlist_lock);
443 return buf - buf_start;
446 +void *vcalloc(unsigned long nmemb, unsigned long elem_size)
448 + unsigned long size;
452 + * Check that we're not going to overflow.
454 + if (nmemb > (ULONG_MAX / elem_size))
457 + size = nmemb * elem_size;
458 + addr = vmalloc(size);
460 + memset(addr, 0, size);
464 --- diff/drivers/md/dm-daemon.c 1970-01-01 01:00:00.000000000 +0100
465 +++ source/drivers/md/dm-daemon.c 2003-10-16 10:44:23.000000000 +0100
468 + * Copyright (C) 2003 Sistina Software
470 + * This file is released under the LGPL.
474 +#include "dm-daemon.h"
476 +#include <linux/module.h>
477 +#include <linux/sched.h>
479 +static int daemon(void *arg)
481 + struct dm_daemon *dd = (struct dm_daemon *) arg;
482 + DECLARE_WAITQUEUE(wq, current);
485 + reparent_to_init();
487 + /* block all signals */
488 + spin_lock_irq(&current->sigmask_lock);
489 + sigfillset(&current->blocked);
490 + flush_signals(current);
491 + spin_unlock_irq(&current->sigmask_lock);
493 + strcpy(current->comm, dd->name);
494 + atomic_set(&dd->please_die, 0);
496 + add_wait_queue(&dd->job_queue, &wq);
498 + down(&dd->run_lock);
499 + up(&dd->start_lock);
502 + * dd->fn() could do anything, very likely it will
503 + * suspend. So we can't set the state to
504 + * TASK_INTERRUPTIBLE before calling it. In order to
505 + * prevent a race with a waking thread we do this little
506 + * dance with the dd->woken variable.
510 + set_current_state(TASK_RUNNING);
512 + if (atomic_read(&dd->please_die))
515 + atomic_set(&dd->woken, 0);
519 + set_current_state(TASK_INTERRUPTIBLE);
520 + } while (atomic_read(&dd->woken));
526 + remove_wait_queue(&dd->job_queue, &wq);
531 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
536 + * Initialise the dm_daemon.
539 + strncpy(dd->name, name, sizeof(dd->name) - 1);
540 + sema_init(&dd->start_lock, 1);
541 + sema_init(&dd->run_lock, 1);
542 + init_waitqueue_head(&dd->job_queue);
545 + * Start the new thread.
547 + down(&dd->start_lock);
548 + pid = kernel_thread(daemon, dd, 0);
550 + DMERR("Failed to start kcopyd thread");
555 + * wait for the daemon to up this mutex.
557 + down(&dd->start_lock);
558 + up(&dd->start_lock);
563 +void dm_daemon_stop(struct dm_daemon *dd)
565 + atomic_set(&dd->please_die, 1);
566 + dm_daemon_wake(dd);
567 + down(&dd->run_lock);
571 +void dm_daemon_wake(struct dm_daemon *dd)
573 + atomic_set(&dd->woken, 1);
574 + wake_up_interruptible(&dd->job_queue);
577 +EXPORT_SYMBOL(dm_daemon_start);
578 +EXPORT_SYMBOL(dm_daemon_stop);
579 +EXPORT_SYMBOL(dm_daemon_wake);
580 --- diff/drivers/md/dm-daemon.h 1970-01-01 01:00:00.000000000 +0100
581 +++ source/drivers/md/dm-daemon.h 2003-10-16 10:44:23.000000000 +0100
584 + * Copyright (C) 2003 Sistina Software
586 + * This file is released under the LGPL.
592 +#include <asm/atomic.h>
593 +#include <asm/semaphore.h>
598 + atomic_t please_die;
599 + struct semaphore start_lock;
600 + struct semaphore run_lock;
603 + wait_queue_head_t job_queue;
606 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
607 +void dm_daemon_stop(struct dm_daemon *dd);
608 +void dm_daemon_wake(struct dm_daemon *dd);
609 +int dm_daemon_running(struct dm_daemon *dd);
612 --- diff/drivers/md/dm-exception-store.c 1970-01-01 01:00:00.000000000 +0100
613 +++ source/drivers/md/dm-exception-store.c 2003-10-16 10:44:23.000000000 +0100
618 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
620 + * This file is released under the GPL.
623 +#include "dm-snapshot.h"
627 +#include <linux/mm.h>
628 +#include <linux/pagemap.h>
629 +#include <linux/vmalloc.h>
630 +#include <linux/slab.h>
632 +/*-----------------------------------------------------------------
633 + * Persistent snapshots, by persistent we mean that the snapshot
634 + * will survive a reboot.
635 + *---------------------------------------------------------------*/
638 + * We need to store a record of which parts of the origin have
639 + * been copied to the snapshot device. The snapshot code
640 + * requires that we copy exception chunks to chunk aligned areas
641 + * of the COW store. It makes sense therefore, to store the
642 + * metadata in chunk size blocks.
644 + * There is no backward or forward compatibility implemented,
645 + * snapshots with different disk versions than the kernel will
646 + * not be usable. It is expected that "lvcreate" will blank out
647 + * the start of a fresh COW device before calling the snapshot
650 + * The first chunk of the COW device just contains the header.
651 + * After this there is a chunk filled with exception metadata,
652 + * followed by as many exception chunks as can fit in the
655 + * All on disk structures are in little-endian format. The end
656 + * of the exceptions info is indicated by an exception with a
657 + * new_chunk of 0, which is invalid since it would point to the
662 + * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
664 +#define SNAP_MAGIC 0x70416e53
667 + * The on-disk version of the metadata.
669 +#define SNAPSHOT_DISK_VERSION 1
671 +struct disk_header {
675 + * Is this snapshot valid. There is no way of recovering
676 + * an invalid snapshot.
681 + * Simple, incrementing version. no backward
687 + uint32_t chunk_size;
690 +struct disk_exception {
691 + uint64_t old_chunk;
692 + uint64_t new_chunk;
695 +struct commit_callback {
696 + void (*callback)(void *, int success);
701 + * The top level structure for a persistent exception store.
704 + struct dm_snapshot *snap; /* up pointer to my snapshot */
707 + uint32_t chunk_size;
708 + uint32_t exceptions_per_area;
711 + * Now that we have an asynchronous kcopyd there is no
712 + * need for large chunk sizes, so it won't hurt to have a
713 + * whole chunks worth of metadata in memory at once.
718 + * Used to keep track of which metadata area the data in
719 + * 'chunk' refers to.
721 + uint32_t current_area;
724 + * The next free chunk for an exception.
726 + uint32_t next_free;
729 + * The index of next free exception in the current
732 + uint32_t current_committed;
734 + atomic_t pending_count;
735 + uint32_t callback_count;
736 + struct commit_callback *callbacks;
739 +static inline unsigned int sectors_to_pages(unsigned int sectors)
741 + return sectors / (PAGE_SIZE / SECTOR_SIZE);
744 +static int alloc_area(struct pstore *ps)
747 + size_t i, len, nr_pages;
748 + struct page *page, *last = NULL;
750 + len = ps->chunk_size << SECTOR_SHIFT;
753 + * Allocate the chunk_size block of memory that will hold
754 + * a single metadata area.
756 + ps->area = vmalloc(len);
760 + nr_pages = sectors_to_pages(ps->chunk_size);
763 + * We lock the pages for ps->area into memory since
764 + * they'll be doing a lot of io. We also chain them
765 + * together ready for dm-io.
767 + for (i = 0; i < nr_pages; i++) {
768 + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
771 + last->list.next = &page->list;
778 +static void free_area(struct pstore *ps)
780 + size_t i, nr_pages;
783 + nr_pages = sectors_to_pages(ps->chunk_size);
784 + for (i = 0; i < nr_pages; i++) {
785 + page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
786 + page->list.next = NULL;
794 + * Read or write a chunk aligned and sized block of data from a device.
796 +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
798 + struct io_region where;
801 + where.dev = ps->snap->cow->dev;
802 + where.sector = ps->chunk_size * chunk;
803 + where.count = ps->chunk_size;
805 + return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
809 + * Read or write a metadata area. Remembering to skip the first
810 + * chunk which holds the header.
812 +static int area_io(struct pstore *ps, uint32_t area, int rw)
817 + /* convert a metadata area index to a chunk index */
818 + chunk = 1 + ((ps->exceptions_per_area + 1) * area);
820 + r = chunk_io(ps, chunk, rw);
824 + ps->current_area = area;
828 +static int zero_area(struct pstore *ps, uint32_t area)
830 + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
831 + return area_io(ps, area, WRITE);
834 +static int read_header(struct pstore *ps, int *new_snapshot)
837 + struct disk_header *dh;
839 + r = chunk_io(ps, 0, READ);
843 + dh = (struct disk_header *) ps->area;
845 + if (le32_to_cpu(dh->magic) == 0) {
848 + } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
850 + ps->valid = le32_to_cpu(dh->valid);
851 + ps->version = le32_to_cpu(dh->version);
852 + ps->chunk_size = le32_to_cpu(dh->chunk_size);
855 + DMWARN("Invalid/corrupt snapshot");
862 +static int write_header(struct pstore *ps)
864 + struct disk_header *dh;
866 + memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
868 + dh = (struct disk_header *) ps->area;
869 + dh->magic = cpu_to_le32(SNAP_MAGIC);
870 + dh->valid = cpu_to_le32(ps->valid);
871 + dh->version = cpu_to_le32(ps->version);
872 + dh->chunk_size = cpu_to_le32(ps->chunk_size);
874 + return chunk_io(ps, 0, WRITE);
878 + * Access functions for the disk exceptions, these do the endian conversions.
880 +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
882 + if (index >= ps->exceptions_per_area)
885 + return ((struct disk_exception *) ps->area) + index;
888 +static int read_exception(struct pstore *ps,
889 + uint32_t index, struct disk_exception *result)
891 + struct disk_exception *e;
893 + e = get_exception(ps, index);
898 + result->old_chunk = le64_to_cpu(e->old_chunk);
899 + result->new_chunk = le64_to_cpu(e->new_chunk);
904 +static int write_exception(struct pstore *ps,
905 + uint32_t index, struct disk_exception *de)
907 + struct disk_exception *e;
909 + e = get_exception(ps, index);
914 + e->old_chunk = cpu_to_le64(de->old_chunk);
915 + e->new_chunk = cpu_to_le64(de->new_chunk);
921 + * Registers the exceptions that are present in the current area.
922 + * 'full' is filled in to indicate if the area has been
925 +static int insert_exceptions(struct pstore *ps, int *full)
929 + struct disk_exception de;
931 + /* presume the area is full */
934 + for (i = 0; i < ps->exceptions_per_area; i++) {
935 + r = read_exception(ps, i, &de);
941 + * If the new_chunk is pointing at the start of
942 + * the COW device, where the first metadata area
943 + * is we know that we've hit the end of the
944 + * exceptions. Therefore the area is not full.
946 + if (de.new_chunk == 0LL) {
947 + ps->current_committed = i;
953 + * Keep track of the start of the free chunks.
955 + if (ps->next_free <= de.new_chunk)
956 + ps->next_free = de.new_chunk + 1;
959 + * Otherwise we add the exception to the snapshot.
961 + r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
969 +static int read_exceptions(struct pstore *ps)
975 + * Keep reading chunks and inserting exceptions until
976 + * we find a partially full area.
978 + for (area = 0; full; area++) {
979 + r = area_io(ps, area, READ);
983 + r = insert_exceptions(ps, &full);
993 +static inline struct pstore *get_info(struct exception_store *store)
995 + return (struct pstore *) store->context;
998 +static void persistent_fraction_full(struct exception_store *store,
999 + sector_t *numerator, sector_t *denominator)
1001 + *numerator = get_info(store)->next_free * store->snap->chunk_size;
1002 + *denominator = get_dev_size(store->snap->cow->dev);
1005 +static void persistent_destroy(struct exception_store *store)
1007 + struct pstore *ps = get_info(store);
1009 + dm_io_put(sectors_to_pages(ps->chunk_size));
1010 + vfree(ps->callbacks);
1015 +static int persistent_read_metadata(struct exception_store *store)
1017 + int r, new_snapshot;
1018 + struct pstore *ps = get_info(store);
1021 + * Read the snapshot header.
1023 + r = read_header(ps, &new_snapshot);
1028 + * Do we need to setup a new snapshot ?
1030 + if (new_snapshot) {
1031 + r = write_header(ps);
1033 + DMWARN("write_header failed");
1037 + r = zero_area(ps, 0);
1039 + DMWARN("zero_area(0) failed");
1048 + DMWARN("snapshot is marked invalid");
1052 + if (ps->version != SNAPSHOT_DISK_VERSION) {
1053 + DMWARN("unable to handle snapshot disk version %d",
1059 + * Read the metadata.
1061 + r = read_exceptions(ps);
1069 +static int persistent_prepare(struct exception_store *store,
1070 + struct exception *e)
1072 + struct pstore *ps = get_info(store);
1074 + sector_t size = get_dev_size(store->snap->cow->dev);
1076 + /* Is there enough room ? */
1077 + if (size < ((ps->next_free + 1) * store->snap->chunk_size))
1080 + e->new_chunk = ps->next_free;
1083 + * Move onto the next free pending, making sure to take
1084 + * into account the location of the metadata chunks.
1086 + stride = (ps->exceptions_per_area + 1);
1087 + if ((++ps->next_free % stride) == 1)
1090 + atomic_inc(&ps->pending_count);
1094 +static void persistent_commit(struct exception_store *store,
1095 + struct exception *e,
1096 + void (*callback) (void *, int success),
1097 + void *callback_context)
1101 + struct pstore *ps = get_info(store);
1102 + struct disk_exception de;
1103 + struct commit_callback *cb;
1105 + de.old_chunk = e->old_chunk;
1106 + de.new_chunk = e->new_chunk;
1107 + write_exception(ps, ps->current_committed++, &de);
1110 + * Add the callback to the back of the array. This code
1111 + * is the only place where the callback array is
1112 + * manipulated, and we know that it will never be called
1113 + * multiple times concurrently.
1115 + cb = ps->callbacks + ps->callback_count++;
1116 + cb->callback = callback;
1117 + cb->context = callback_context;
1120 + * If there are no more exceptions in flight, or we have
1121 + * filled this metadata area we commit the exceptions to
1124 + if (atomic_dec_and_test(&ps->pending_count) ||
1125 + (ps->current_committed == ps->exceptions_per_area)) {
1126 + r = area_io(ps, ps->current_area, WRITE);
1130 + for (i = 0; i < ps->callback_count; i++) {
1131 + cb = ps->callbacks + i;
1132 + cb->callback(cb->context, r == 0 ? 1 : 0);
1135 + ps->callback_count = 0;
1139 + * Have we completely filled the current area ?
1141 + if (ps->current_committed == ps->exceptions_per_area) {
1142 + ps->current_committed = 0;
1143 + r = zero_area(ps, ps->current_area + 1);
1149 +static void persistent_drop(struct exception_store *store)
1151 + struct pstore *ps = get_info(store);
1154 + if (write_header(ps))
1155 + DMWARN("write header failed");
1158 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
1161 + struct pstore *ps;
1163 + r = dm_io_get(sectors_to_pages(chunk_size));
1167 + /* allocate the pstore */
1168 + ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1174 + ps->snap = store->snap;
1176 + ps->version = SNAPSHOT_DISK_VERSION;
1177 + ps->chunk_size = chunk_size;
1178 + ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
1179 + sizeof(struct disk_exception);
1180 + ps->next_free = 2; /* skipping the header and first area */
1181 + ps->current_committed = 0;
1183 + r = alloc_area(ps);
1188 + * Allocate space for all the callbacks.
1190 + ps->callback_count = 0;
1191 + atomic_set(&ps->pending_count, 0);
1192 + ps->callbacks = vcalloc(ps->exceptions_per_area,
1193 + sizeof(*ps->callbacks));
1195 + if (!ps->callbacks) {
1200 + store->destroy = persistent_destroy;
1201 + store->read_metadata = persistent_read_metadata;
1202 + store->prepare_exception = persistent_prepare;
1203 + store->commit_exception = persistent_commit;
1204 + store->drop_snapshot = persistent_drop;
1205 + store->fraction_full = persistent_fraction_full;
1206 + store->context = ps;
1211 + dm_io_put(sectors_to_pages(chunk_size));
1213 + if (ps->callbacks)
1214 + vfree(ps->callbacks);
1221 +/*-----------------------------------------------------------------
1222 + * Implementation of the store for non-persistent snapshots.
1223 + *---------------------------------------------------------------*/
1224 +struct transient_c {
1225 + sector_t next_free;
1228 +void transient_destroy(struct exception_store *store)
1230 + kfree(store->context);
1233 +int transient_read_metadata(struct exception_store *store)
1238 +int transient_prepare(struct exception_store *store, struct exception *e)
1240 + struct transient_c *tc = (struct transient_c *) store->context;
1241 + sector_t size = get_dev_size(store->snap->cow->dev);
1243 + if (size < (tc->next_free + store->snap->chunk_size))
1246 + e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
1247 + tc->next_free += store->snap->chunk_size;
1252 +void transient_commit(struct exception_store *store,
1253 + struct exception *e,
1254 + void (*callback) (void *, int success),
1255 + void *callback_context)
1257 + /* Just succeed */
1258 + callback(callback_context, 1);
1261 +static void transient_fraction_full(struct exception_store *store,
1262 + sector_t *numerator, sector_t *denominator)
1264 + *numerator = ((struct transient_c *) store->context)->next_free;
1265 + *denominator = get_dev_size(store->snap->cow->dev);
1268 +int dm_create_transient(struct exception_store *store,
1269 + struct dm_snapshot *s, int blocksize)
1271 + struct transient_c *tc;
1273 + memset(store, 0, sizeof(*store));
1274 + store->destroy = transient_destroy;
1275 + store->read_metadata = transient_read_metadata;
1276 + store->prepare_exception = transient_prepare;
1277 + store->commit_exception = transient_commit;
1278 + store->fraction_full = transient_fraction_full;
1281 + tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
1285 + tc->next_free = 0;
1286 + store->context = tc;
1290 --- diff/drivers/md/dm-io.c 1970-01-01 01:00:00.000000000 +0100
1291 +++ source/drivers/md/dm-io.c 2003-10-16 10:44:23.000000000 +0100
1294 + * Copyright (C) 2003 Sistina Software
1296 + * This file is released under the GPL.
1301 +#include <linux/mempool.h>
1302 +#include <linux/module.h>
1303 +#include <linux/slab.h>
1304 +#include <linux/sched.h>
1306 +/* FIXME: can we shrink this ? */
1307 +struct io_context {
1309 + unsigned int error;
1311 + struct task_struct *sleeper;
1312 + io_notify_fn callback;
1317 + * We maintain a pool of buffer heads for dispatching the io.
1319 +static unsigned int _num_bhs;
1320 +static mempool_t *_buffer_pool;
1323 + * io contexts are only dynamically allocated for asynchronous
1324 + * io. Since async io is likely to be the majority of io we'll
1325 + * have the same number of io contexts as buffer heads ! (FIXME:
1326 + * must reduce this).
1328 +mempool_t *_io_pool;
1330 +static void *alloc_bh(int gfp_mask, void *pool_data)
1332 + struct buffer_head *bh;
1334 + bh = kmem_cache_alloc(bh_cachep, gfp_mask);
1336 + bh->b_reqnext = NULL;
1337 + init_waitqueue_head(&bh->b_wait);
1338 + INIT_LIST_HEAD(&bh->b_inode_buffers);
1344 +static void *alloc_io(int gfp_mask, void *pool_data)
1346 + return kmalloc(sizeof(struct io_context), gfp_mask);
1349 +static void free_io(void *element, void *pool_data)
1354 +static unsigned int pages_to_buffers(unsigned int pages)
1356 + return 4 * pages; /* too many ? */
1359 +static int resize_pool(unsigned int new_bhs)
1363 + if (_buffer_pool) {
1364 + if (new_bhs == 0) {
1365 + /* free off the pools */
1366 + mempool_destroy(_buffer_pool);
1367 + mempool_destroy(_io_pool);
1368 + _buffer_pool = _io_pool = NULL;
1370 + /* resize the pools */
1371 + r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
1373 + r = mempool_resize(_io_pool,
1374 + new_bhs, GFP_KERNEL);
1377 + /* create new pools */
1378 + _buffer_pool = mempool_create(new_bhs, alloc_bh,
1379 + mempool_free_slab, bh_cachep);
1380 + if (!_buffer_pool)
1383 + _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
1385 + mempool_destroy(_buffer_pool);
1386 + _buffer_pool = NULL;
1392 + _num_bhs = new_bhs;
1397 +int dm_io_get(unsigned int num_pages)
1399 + return resize_pool(_num_bhs + pages_to_buffers(num_pages));
1402 +void dm_io_put(unsigned int num_pages)
1404 + resize_pool(_num_bhs - pages_to_buffers(num_pages));
1407 +/*-----------------------------------------------------------------
1408 + * We need to keep track of which region a buffer is doing io
1409 + * for. In order to save a memory allocation we store this in an
1410 + * unused field of the buffer head, and provide these access
1413 + * FIXME: add compile time check that an unsigned int can fit
1416 + *---------------------------------------------------------------*/
1417 +static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
1419 + bh->b_journal_head = (void *) region;
1422 +static inline int bh_get_region(struct buffer_head *bh)
1424 + return (unsigned int) bh->b_journal_head;
1427 +/*-----------------------------------------------------------------
1428 + * We need an io object to keep track of the number of bhs that
1429 + * have been dispatched for a particular io.
1430 + *---------------------------------------------------------------*/
1431 +static void dec_count(struct io_context *io, unsigned int region, int error)
1434 + set_bit(region, &io->error);
1436 + if (atomic_dec_and_test(&io->count)) {
1438 + wake_up_process(io->sleeper);
1441 + int r = io->error;
1442 + io_notify_fn fn = io->callback;
1443 + void *context = io->context;
1445 + mempool_free(io, _io_pool);
1451 +static void endio(struct buffer_head *bh, int uptodate)
1453 + struct io_context *io = (struct io_context *) bh->b_private;
1455 + if (!uptodate && io->rw != WRITE) {
1457 + * We need to zero this region, otherwise people
 1458 + * like kcopyd may write arbitrary contents
1461 + memset(bh->b_data, 0, bh->b_size);
1464 + dec_count((struct io_context *) bh->b_private,
1465 + bh_get_region(bh), !uptodate);
1466 + mempool_free(bh, _buffer_pool);
1470 + * Primitives for alignment calculations.
1472 +int fls(unsigned n)
1474 + return generic_fls32(n);
1477 +static inline int log2_floor(unsigned n)
1479 + return ffs(n) - 1;
1482 +static inline int log2_align(unsigned n)
1484 + return fls(n) - 1;
1488 + * Returns the next block for io.
1490 +static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
1491 + unsigned int block_size,
1492 + struct page *p, unsigned int offset,
1493 + unsigned int region, struct io_context *io)
1495 + struct buffer_head *bh;
1496 + sector_t b = *block;
1497 + sector_t blocks_per_page = PAGE_SIZE / block_size;
1498 + unsigned int this_size; /* holds the size of the current io */
1501 + while ((offset < PAGE_SIZE) && (b != end_block)) {
1502 + bh = mempool_alloc(_buffer_pool, GFP_NOIO);
1503 + init_buffer(bh, endio, io);
1504 + bh_set_region(bh, region);
1507 + * Block size must be a power of 2 and aligned
1510 + len = end_block - b;
1511 + this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
1512 + if (this_size > len)
1513 + this_size = 1 << log2_align(len);
1516 + * Add in the job offset.
1518 + bh->b_blocknr = (b / this_size);
1519 + bh->b_size = block_size * this_size;
1520 + set_bh_page(bh, p, offset);
1521 + bh->b_this_page = bh;
1524 + atomic_set(&bh->b_count, 1);
1526 + bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
1529 + if (io->rw == WRITE)
1530 + clear_bit(BH_Dirty, &bh->b_state);
1532 + atomic_inc(&io->count);
1533 + submit_bh(io->rw, bh);
1536 + offset += block_size * this_size;
1540 + return (b == end_block);
1543 +static void do_region(unsigned int region, struct io_region *where,
1544 + struct page *page, unsigned int offset,
1545 + struct io_context *io)
1547 + unsigned int block_size = get_hardsect_size(where->dev);
1548 + unsigned int sblock_size = block_size >> 9;
1549 + sector_t block = where->sector / sblock_size;
1550 + sector_t end_block = (where->sector + where->count) / sblock_size;
1553 + if (do_page(where->dev, &block, end_block, block_size,
1554 + page, offset, region, io))
1557 + offset = 0; /* only offset the first page */
1559 + page = list_entry(page->list.next, struct page, list);
1563 +static void dispatch_io(unsigned int num_regions, struct io_region *where,
1564 + struct page *pages, unsigned int offset,
1565 + struct io_context *io)
1569 + for (i = 0; i < num_regions; i++)
1570 + if (where[i].count)
1571 + do_region(i, where + i, pages, offset, io);
 1574 + * Drop the extra reference that we were holding to avoid
1575 + * the io being completed too early.
1577 + dec_count(io, 0, 0);
1583 +int dm_io_sync(unsigned int num_regions, struct io_region *where,
1584 + int rw, struct page *pages, unsigned int offset,
1585 + unsigned int *error_bits)
1587 + struct io_context io;
1589 + BUG_ON(num_regions > 1 && rw != WRITE);
1593 + atomic_set(&io.count, 1); /* see dispatch_io() */
1594 + io.sleeper = current;
1596 + dispatch_io(num_regions, where, pages, offset, &io);
1597 + run_task_queue(&tq_disk);
1600 + set_current_state(TASK_UNINTERRUPTIBLE);
1602 + if (!atomic_read(&io.count))
1607 + set_current_state(TASK_RUNNING);
1609 + *error_bits = io.error;
1610 + return io.error ? -EIO : 0;
1616 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1617 + struct page *pages, unsigned int offset,
1618 + io_notify_fn fn, void *context)
1620 + struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
1624 + atomic_set(&io->count, 1); /* see dispatch_io() */
1625 + io->sleeper = NULL;
1626 + io->callback = fn;
1627 + io->context = context;
1629 + dispatch_io(num_regions, where, pages, offset, io);
1633 +EXPORT_SYMBOL(dm_io_get);
1634 +EXPORT_SYMBOL(dm_io_put);
1635 +EXPORT_SYMBOL(dm_io_sync);
1636 +EXPORT_SYMBOL(dm_io_async);
1637 --- diff/drivers/md/dm-io.h 1970-01-01 01:00:00.000000000 +0100
1638 +++ source/drivers/md/dm-io.h 2003-10-16 10:44:23.000000000 +0100
1641 + * Copyright (C) 2003 Sistina Software
1643 + * This file is released under the GPL.
1651 +#include <linux/list.h>
1653 +/* Move these to bitops.h eventually */
1654 +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
1655 +/* (c) 2002, D.Phillips and Sistina Software */
1656 +/* Licensed under Version 2 of the GPL */
1658 +static unsigned generic_fls8(unsigned n)
1661 + n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5:
1662 + n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
1665 +static inline unsigned generic_fls16(unsigned n)
1667 + return n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n);
1670 +static inline unsigned generic_fls32(unsigned n)
1672 + return n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
1675 +/* FIXME make this configurable */
1676 +#define DM_MAX_IO_REGIONS 8
1686 + * 'error' is a bitset, with each bit indicating whether an error
1687 + * occurred doing io to the corresponding region.
1689 +typedef void (*io_notify_fn)(unsigned int error, void *context);
1693 + * Before anyone uses the IO interface they should call
1694 + * dm_io_get(), specifying roughly how many pages they are
1695 + * expecting to perform io on concurrently.
1697 + * This function may block.
1699 +int dm_io_get(unsigned int num_pages);
1700 +void dm_io_put(unsigned int num_pages);
1706 + * Please ensure that the rw flag in the next two functions is
1707 + * either READ or WRITE, ie. we don't take READA. Any
1708 + * regions with a zero count field will be ignored.
1710 +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
1711 + struct page *pages, unsigned int offset,
1712 + unsigned int *error_bits);
1718 + * The 'where' array may be safely allocated on the stack since
1719 + * the function takes a copy.
1721 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1722 + struct page *pages, unsigned int offset,
1723 + io_notify_fn fn, void *context);
1726 --- diff/drivers/md/dm-ioctl.c 1970-01-01 01:00:00.000000000 +0100
1727 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:44:23.000000000 +0100
1730 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
1732 + * This file is released under the GPL.
1737 +#include <linux/module.h>
1738 +#include <linux/vmalloc.h>
1739 +#include <linux/miscdevice.h>
1740 +#include <linux/dm-ioctl.h>
1741 +#include <linux/init.h>
1742 +#include <linux/wait.h>
1743 +#include <linux/blk.h>
1744 +#include <linux/slab.h>
1746 +#include <asm/uaccess.h>
1748 +#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
1750 +/*-----------------------------------------------------------------
1751 + * The ioctl interface needs to be able to look up devices by
1753 + *---------------------------------------------------------------*/
1755 + struct list_head name_list;
1756 + struct list_head uuid_list;
1760 + struct mapped_device *md;
1761 + struct dm_table *new_map;
1763 + /* I hate devfs */
1764 + devfs_handle_t devfs_entry;
1767 +#define NUM_BUCKETS 64
1768 +#define MASK_BUCKETS (NUM_BUCKETS - 1)
1769 +static struct list_head _name_buckets[NUM_BUCKETS];
1770 +static struct list_head _uuid_buckets[NUM_BUCKETS];
1772 +static devfs_handle_t _dev_dir;
1773 +void dm_hash_remove_all(void);
1776 + * Guards access to both hash tables.
1778 +static DECLARE_RWSEM(_hash_lock);
1780 +static void init_buckets(struct list_head *buckets)
1784 + for (i = 0; i < NUM_BUCKETS; i++)
1785 + INIT_LIST_HEAD(buckets + i);
1788 +int dm_hash_init(void)
1790 + init_buckets(_name_buckets);
1791 + init_buckets(_uuid_buckets);
1792 + _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
1796 +void dm_hash_exit(void)
1798 + dm_hash_remove_all();
1799 + devfs_unregister(_dev_dir);
1802 +/*-----------------------------------------------------------------
1804 + * We're not really concerned with the str hash function being
1805 + * fast since it's only used by the ioctl interface.
1806 + *---------------------------------------------------------------*/
1807 +static unsigned int hash_str(const char *str)
1809 + const unsigned int hash_mult = 2654435387U;
1810 + unsigned int h = 0;
1813 + h = (h + (unsigned int) *str++) * hash_mult;
1815 + return h & MASK_BUCKETS;
1818 +/*-----------------------------------------------------------------
1819 + * Code for looking up a device by name
1820 + *---------------------------------------------------------------*/
1821 +static struct hash_cell *__get_name_cell(const char *str)
1823 + struct list_head *tmp;
1824 + struct hash_cell *hc;
1825 + unsigned int h = hash_str(str);
1827 + list_for_each (tmp, _name_buckets + h) {
1828 + hc = list_entry(tmp, struct hash_cell, name_list);
1829 + if (!strcmp(hc->name, str))
1836 +static struct hash_cell *__get_uuid_cell(const char *str)
1838 + struct list_head *tmp;
1839 + struct hash_cell *hc;
1840 + unsigned int h = hash_str(str);
1842 + list_for_each (tmp, _uuid_buckets + h) {
1843 + hc = list_entry(tmp, struct hash_cell, uuid_list);
1844 + if (!strcmp(hc->uuid, str))
1851 +/*-----------------------------------------------------------------
1852 + * Inserting, removing and renaming a device.
1853 + *---------------------------------------------------------------*/
1854 +static inline char *kstrdup(const char *str)
1856 + char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
1862 +static struct hash_cell *alloc_cell(const char *name, const char *uuid,
1863 + struct mapped_device *md)
1865 + struct hash_cell *hc;
1867 + hc = kmalloc(sizeof(*hc), GFP_KERNEL);
1871 + hc->name = kstrdup(name);
1881 + hc->uuid = kstrdup(uuid);
1889 + INIT_LIST_HEAD(&hc->name_list);
1890 + INIT_LIST_HEAD(&hc->uuid_list);
1892 + hc->new_map = NULL;
1896 +static void free_cell(struct hash_cell *hc)
1908 +static int register_with_devfs(struct hash_cell *hc)
1910 + kdev_t dev = dm_kdev(hc->md);
1913 + devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
1914 + major(dev), minor(dev),
1915 + S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
1916 + &dm_blk_dops, NULL);
1921 +static int unregister_with_devfs(struct hash_cell *hc)
1923 + devfs_unregister(hc->devfs_entry);
1928 + * The kdev_t and uuid of a device can never change once it is
1929 + * initially inserted.
1931 +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
1933 + struct hash_cell *cell;
1936 + * Allocate the new cells.
1938 + cell = alloc_cell(name, uuid, md);
1943 + * Insert the cell into both hash tables.
1945 + down_write(&_hash_lock);
1946 + if (__get_name_cell(name))
1949 + list_add(&cell->name_list, _name_buckets + hash_str(name));
1952 + if (__get_uuid_cell(uuid)) {
1953 + list_del(&cell->name_list);
1956 + list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
1958 + register_with_devfs(cell);
1960 + up_write(&_hash_lock);
1965 + up_write(&_hash_lock);
1970 +void __hash_remove(struct hash_cell *hc)
1972 + /* remove from the dev hash */
1973 + list_del(&hc->uuid_list);
1974 + list_del(&hc->name_list);
1975 + unregister_with_devfs(hc);
1978 + dm_table_put(hc->new_map);
1982 +void dm_hash_remove_all(void)
1985 + struct hash_cell *hc;
1986 + struct list_head *tmp, *n;
1988 + down_write(&_hash_lock);
1989 + for (i = 0; i < NUM_BUCKETS; i++) {
1990 + list_for_each_safe (tmp, n, _name_buckets + i) {
1991 + hc = list_entry(tmp, struct hash_cell, name_list);
1992 + __hash_remove(hc);
1995 + up_write(&_hash_lock);
1998 +int dm_hash_rename(const char *old, const char *new)
2000 + char *new_name, *old_name;
2001 + struct hash_cell *hc;
2006 + new_name = kstrdup(new);
2010 + down_write(&_hash_lock);
2015 + hc = __get_name_cell(new);
2017 + DMWARN("asked to rename to an already existing name %s -> %s",
2019 + up_write(&_hash_lock);
2025 + * Is there such a device as 'old' ?
2027 + hc = __get_name_cell(old);
2029 + DMWARN("asked to rename a non existent device %s -> %s",
2031 + up_write(&_hash_lock);
2037 + * rename and move the name cell.
2039 + list_del(&hc->name_list);
2040 + old_name = hc->name;
2041 + hc->name = new_name;
2042 + list_add(&hc->name_list, _name_buckets + hash_str(new_name));
2044 + /* rename the device node in devfs */
2045 + unregister_with_devfs(hc);
2046 + register_with_devfs(hc);
2048 + up_write(&_hash_lock);
2053 +/*-----------------------------------------------------------------
2054 + * Implementation of the ioctl commands
2055 + *---------------------------------------------------------------*/
2057 + * All the ioctl commands get dispatched to functions with this
2060 +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
2062 +static int remove_all(struct dm_ioctl *param, size_t param_size)
2064 + dm_hash_remove_all();
2065 + param->data_size = 0;
2070 + * Round up the ptr to an 8-byte boundary.
2072 +#define ALIGN_MASK 7
2073 +static inline void *align_ptr(void *ptr)
2075 + return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
2079 + * Retrieves the data payload buffer from an already allocated
2080 + * struct dm_ioctl.
2082 +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
2085 + param->data_start = align_ptr(param + 1) - (void *) param;
2087 + if (param->data_start < param_size)
2088 + *len = param_size - param->data_start;
2092 + return ((void *) param) + param->data_start;
2095 +static int list_devices(struct dm_ioctl *param, size_t param_size)
2098 + struct hash_cell *hc;
2099 + size_t len, needed = 0;
2100 + struct dm_name_list *nl, *old_nl = NULL;
2102 + down_write(&_hash_lock);
2105 + * Loop through all the devices working out how much
2108 + for (i = 0; i < NUM_BUCKETS; i++) {
2109 + list_for_each_entry (hc, _name_buckets + i, name_list) {
2110 + needed += sizeof(struct dm_name_list);
2111 + needed += strlen(hc->name);
2112 + needed += ALIGN_MASK;
2117 + * Grab our output buffer.
2119 + nl = get_result_buffer(param, param_size, &len);
2120 + if (len < needed) {
2121 + param->flags |= DM_BUFFER_FULL_FLAG;
2124 + param->data_size = param->data_start + needed;
2126 + nl->dev = 0; /* Flags no data */
2129 + * Now loop through filling out the names.
2131 + for (i = 0; i < NUM_BUCKETS; i++) {
2132 + list_for_each_entry (hc, _name_buckets + i, name_list) {
2134 + old_nl->next = (uint32_t) ((void *) nl -
2137 + nl->dev = dm_kdev(hc->md);
2139 + strcpy(nl->name, hc->name);
2142 + nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
2147 + up_write(&_hash_lock);
2151 +static int check_name(const char *name)
2153 + if (strchr(name, '/')) {
2154 + DMWARN("invalid device name");
2162 + * Fills in a dm_ioctl structure, ready for sending back to
2165 +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
2167 + kdev_t dev = dm_kdev(md);
2168 + struct dm_table *table;
2169 + struct block_device *bdev;
2171 + param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
2172 + DM_ACTIVE_PRESENT_FLAG);
2174 + if (dm_suspended(md))
2175 + param->flags |= DM_SUSPEND_FLAG;
2177 + param->dev = kdev_t_to_nr(dev);
2179 + if (is_read_only(dev))
2180 + param->flags |= DM_READONLY_FLAG;
2182 + param->event_nr = dm_get_event_nr(md);
2184 + table = dm_get_table(md);
2186 + param->flags |= DM_ACTIVE_PRESENT_FLAG;
2187 + param->target_count = dm_table_get_num_targets(table);
2188 + dm_table_put(table);
2190 + param->target_count = 0;
2192 + bdev = bdget(param->dev);
2195 + param->open_count = bdev->bd_openers;
2201 +static int dev_create(struct dm_ioctl *param, size_t param_size)
2205 + struct mapped_device *md;
2207 + r = check_name(param->name);
2211 + if (param->flags & DM_PERSISTENT_DEV_FLAG)
2212 + dev = to_kdev_t(param->dev);
2214 + r = dm_create(dev, &md);
2218 + r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
2224 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2226 + r = __dev_status(md, param);
2233 + * Always use UUID for lookups if it's present, otherwise use name.
2235 +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
2237 + return *param->uuid ?
2238 + __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
2241 +static inline struct mapped_device *find_device(struct dm_ioctl *param)
2243 + struct hash_cell *hc;
2244 + struct mapped_device *md = NULL;
2246 + down_read(&_hash_lock);
2247 + hc = __find_device_hash_cell(param);
2252 + * Sneakily write in both the name and the uuid
2253 + * while we have the cell.
2255 + strncpy(param->name, hc->name, sizeof(param->name));
2257 + strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
2259 + param->uuid[0] = '\0';
2262 + param->flags |= DM_INACTIVE_PRESENT_FLAG;
2264 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2268 + up_read(&_hash_lock);
2273 +static int dev_remove(struct dm_ioctl *param, size_t param_size)
2275 + struct hash_cell *hc;
2277 + down_write(&_hash_lock);
2278 + hc = __find_device_hash_cell(param);
2281 + DMWARN("device doesn't appear to be in the dev hash table.");
2282 + up_write(&_hash_lock);
2286 + __hash_remove(hc);
2287 + up_write(&_hash_lock);
2288 + param->data_size = 0;
2293 + * Check a string doesn't overrun the chunk of
2294 + * memory we copied from userland.
2296 +static int invalid_str(char *str, void *end)
2298 + while ((void *) str < end)
2305 +static int dev_rename(struct dm_ioctl *param, size_t param_size)
2308 + char *new_name = (char *) param + param->data_start;
2310 + if (new_name < (char *) (param + 1) ||
2311 + invalid_str(new_name, (void *) param + param_size)) {
2312 + DMWARN("Invalid new logical volume name supplied.");
2316 + r = check_name(new_name);
2320 + param->data_size = 0;
2321 + return dm_hash_rename(param->name, new_name);
2324 +static int suspend(struct dm_ioctl *param)
2327 + struct mapped_device *md;
2329 + md = find_device(param);
2333 + if (!dm_suspended(md))
2334 + r = dm_suspend(md);
2337 + r = __dev_status(md, param);
2343 +static int resume(struct dm_ioctl *param)
2346 + struct hash_cell *hc;
2347 + struct mapped_device *md;
2348 + struct dm_table *new_map;
2350 + down_write(&_hash_lock);
2352 + hc = __find_device_hash_cell(param);
2354 + DMWARN("device doesn't appear to be in the dev hash table.");
2355 + up_write(&_hash_lock);
2362 + new_map = hc->new_map;
2363 + hc->new_map = NULL;
2364 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2366 + up_write(&_hash_lock);
2368 + /* Do we need to load a new map ? */
2370 + /* Suspend if it isn't already suspended */
2371 + if (!dm_suspended(md))
2374 + r = dm_swap_table(md, new_map);
2377 + dm_table_put(new_map);
2381 + if (dm_table_get_mode(new_map) & FMODE_WRITE)
2382 + set_device_ro(dm_kdev(md), 0);
2384 + set_device_ro(dm_kdev(md), 1);
2386 + dm_table_put(new_map);
2389 + if (dm_suspended(md))
2390 + r = dm_resume(md);
2393 + r = __dev_status(md, param);
2400 + * Set or unset the suspension state of a device.
2401 + * If the device already is in the requested state we just return its status.
2403 +static int dev_suspend(struct dm_ioctl *param, size_t param_size)
2405 + if (param->flags & DM_SUSPEND_FLAG)
2406 + return suspend(param);
2408 + return resume(param);
2412 + * Copies device info back to user space, used by
2413 + * the create and info ioctls.
2415 +static int dev_status(struct dm_ioctl *param, size_t param_size)
2418 + struct mapped_device *md;
2420 + md = find_device(param);
2424 + r = __dev_status(md, param);
2430 + * Wait for a device to report an event
2432 +static int dev_wait(struct dm_ioctl *param, size_t param_size)
2435 + struct mapped_device *md;
2436 + DECLARE_WAITQUEUE(wq, current);
2438 + md = find_device(param);
2443 + * Wait for a notification event
2445 + set_current_state(TASK_INTERRUPTIBLE);
2446 + if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
2448 + dm_remove_wait_queue(md, &wq);
2450 + set_current_state(TASK_RUNNING);
2453 + * The userland program is going to want to know what
2454 + * changed to trigger the event, so we may as well tell
2455 + * him and save an ioctl.
2457 + r = __dev_status(md, param);
2463 +static inline int get_mode(struct dm_ioctl *param)
2465 + int mode = FMODE_READ | FMODE_WRITE;
2467 + if (param->flags & DM_READONLY_FLAG)
2468 + mode = FMODE_READ;
2473 +static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
2474 + struct dm_target_spec **spec, char **target_params)
2476 + *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
2477 + *target_params = (char *) (*spec + 1);
2479 + if (*spec < (last + 1))
2482 + return invalid_str(*target_params, end);
2485 +static int populate_table(struct dm_table *table, struct dm_ioctl *param,
2486 + size_t param_size)
2489 + unsigned int i = 0;
2490 + struct dm_target_spec *spec = (struct dm_target_spec *) param;
2491 + uint32_t next = param->data_start;
2492 + void *end = (void *) param + param_size;
2493 + char *target_params;
2495 + if (!param->target_count) {
2496 + DMWARN("populate_table: no targets specified");
2500 + for (i = 0; i < param->target_count; i++) {
2502 + r = next_target(spec, next, end, &spec, &target_params);
2504 + DMWARN("unable to find target");
2508 + r = dm_table_add_target(table, spec->target_type,
2509 + (sector_t) spec->sector_start,
2510 + (sector_t) spec->length,
2513 + DMWARN("error adding target to table");
2517 + next = spec->next;
2520 + return dm_table_complete(table);
2523 +static int table_load(struct dm_ioctl *param, size_t param_size)
2526 + struct hash_cell *hc;
2527 + struct dm_table *t;
2529 + r = dm_table_create(&t, get_mode(param));
2533 + r = populate_table(t, param, param_size);
2539 + down_write(&_hash_lock);
2540 + hc = __find_device_hash_cell(param);
2542 + DMWARN("device doesn't appear to be in the dev hash table.");
2543 + up_write(&_hash_lock);
2548 + param->flags |= DM_INACTIVE_PRESENT_FLAG;
2550 + r = __dev_status(hc->md, param);
2551 + up_write(&_hash_lock);
2555 +static int table_clear(struct dm_ioctl *param, size_t param_size)
2558 + struct hash_cell *hc;
2560 + down_write(&_hash_lock);
2562 + hc = __find_device_hash_cell(param);
2564 + DMWARN("device doesn't appear to be in the dev hash table.");
2565 + up_write(&_hash_lock);
2569 + if (hc->new_map) {
2570 + dm_table_put(hc->new_map);
2571 + hc->new_map = NULL;
2574 + param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2576 + r = __dev_status(hc->md, param);
2577 + up_write(&_hash_lock);
2582 + * Retrieves a list of devices used by a particular dm device.
2584 +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
2585 + size_t param_size)
2587 + unsigned int count = 0;
2588 + struct list_head *tmp;
2589 + size_t len, needed;
2590 + struct dm_target_deps *deps;
2592 + deps = get_result_buffer(param, param_size, &len);
2595 + * Count the devices.
2597 + list_for_each(tmp, dm_table_get_devices(table))
2601 + * Check we have enough space.
2603 + needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
2604 + if (len < needed) {
2605 + param->flags |= DM_BUFFER_FULL_FLAG;
2610 + * Fill in the devices.
2612 + deps->count = count;
2614 + list_for_each(tmp, dm_table_get_devices(table)) {
2615 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
2616 + deps->dev[count++] = dd->bdev->bd_dev;
2619 + param->data_size = param->data_start + needed;
2622 +static int table_deps(struct dm_ioctl *param, size_t param_size)
2625 + struct mapped_device *md;
2626 + struct dm_table *table;
2628 + md = find_device(param);
2632 + r = __dev_status(md, param);
2636 + table = dm_get_table(md);
2638 + retrieve_deps(table, param, param_size);
2639 + dm_table_put(table);
2648 + * Build up the status struct for each target
2650 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
2651 + size_t param_size)
2653 + unsigned int i, num_targets;
2654 + struct dm_target_spec *spec;
2655 + char *outbuf, *outptr;
2656 + status_type_t type;
2657 + size_t remaining, len, used = 0;
2659 + outptr = outbuf = get_result_buffer(param, param_size, &len);
2661 + if (param->flags & DM_STATUS_TABLE_FLAG)
2662 + type = STATUSTYPE_TABLE;
2664 + type = STATUSTYPE_INFO;
2666 + /* Get all the target info */
2667 + num_targets = dm_table_get_num_targets(table);
2668 + for (i = 0; i < num_targets; i++) {
2669 + struct dm_target *ti = dm_table_get_target(table, i);
2671 + remaining = len - (outptr - outbuf);
2672 + if (remaining < sizeof(struct dm_target_spec)) {
2673 + param->flags |= DM_BUFFER_FULL_FLAG;
2677 + spec = (struct dm_target_spec *) outptr;
2680 + spec->sector_start = ti->begin;
2681 + spec->length = ti->len;
2682 + strncpy(spec->target_type, ti->type->name,
2683 + sizeof(spec->target_type));
2685 + outptr += sizeof(struct dm_target_spec);
2686 + remaining = len - (outptr - outbuf);
2688 + /* Get the status/table string from the target driver */
2689 + if (ti->type->status) {
2690 + if (ti->type->status(ti, type, outptr, remaining)) {
2691 + param->flags |= DM_BUFFER_FULL_FLAG;
2697 + outptr += strlen(outptr) + 1;
2698 + used = param->data_start + (outptr - outbuf);
2700 + align_ptr(outptr);
2701 + spec->next = outptr - outbuf;
2705 + param->data_size = used;
2707 + param->target_count = num_targets;
2711 + * Return the status of a device as a text string for each
2714 +static int table_status(struct dm_ioctl *param, size_t param_size)
2717 + struct mapped_device *md;
2718 + struct dm_table *table;
2720 + md = find_device(param);
2724 + r = __dev_status(md, param);
2728 + table = dm_get_table(md);
2730 + retrieve_status(table, param, param_size);
2731 + dm_table_put(table);
2739 +/*-----------------------------------------------------------------
2740 + * Implementation of open/close/ioctl on the special char
2742 + *---------------------------------------------------------------*/
2743 +static ioctl_fn lookup_ioctl(unsigned int cmd)
2749 + {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
2750 + {DM_REMOVE_ALL_CMD, remove_all},
2751 + {DM_LIST_DEVICES_CMD, list_devices},
2753 + {DM_DEV_CREATE_CMD, dev_create},
2754 + {DM_DEV_REMOVE_CMD, dev_remove},
2755 + {DM_DEV_RENAME_CMD, dev_rename},
2756 + {DM_DEV_SUSPEND_CMD, dev_suspend},
2757 + {DM_DEV_STATUS_CMD, dev_status},
2758 + {DM_DEV_WAIT_CMD, dev_wait},
2760 + {DM_TABLE_LOAD_CMD, table_load},
2761 + {DM_TABLE_CLEAR_CMD, table_clear},
2762 + {DM_TABLE_DEPS_CMD, table_deps},
2763 + {DM_TABLE_STATUS_CMD, table_status}
2766 + return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
2770 + * As well as checking the version compatibility this always
2771 + * copies the kernel interface version out.
2773 +static int check_version(unsigned int cmd, struct dm_ioctl *user)
2775 + uint32_t version[3];
2778 + if (copy_from_user(version, user->version, sizeof(version)))
2781 + if ((DM_VERSION_MAJOR != version[0]) ||
2782 + (DM_VERSION_MINOR < version[1])) {
2783 + DMWARN("ioctl interface mismatch: "
2784 + "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
2785 + DM_VERSION_MAJOR, DM_VERSION_MINOR,
2786 + DM_VERSION_PATCHLEVEL,
2787 + version[0], version[1], version[2], cmd);
2792 + * Fill in the kernel version.
2794 + version[0] = DM_VERSION_MAJOR;
2795 + version[1] = DM_VERSION_MINOR;
2796 + version[2] = DM_VERSION_PATCHLEVEL;
2797 + if (copy_to_user(user->version, version, sizeof(version)))
2803 +static void free_params(struct dm_ioctl *param)
2808 +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
2810 + struct dm_ioctl tmp, *dmi;
2812 + if (copy_from_user(&tmp, user, sizeof(tmp)))
2815 + if (tmp.data_size < sizeof(tmp))
2818 + dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
2822 + if (copy_from_user(dmi, user, tmp.data_size)) {
2831 +static int validate_params(uint cmd, struct dm_ioctl *param)
2833 + /* Always clear this flag */
2834 + param->flags &= ~DM_BUFFER_FULL_FLAG;
2836 + /* Ignores parameters */
2837 + if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
2840 + /* Unless creating, either name or uuid but not both */
2841 + if (cmd != DM_DEV_CREATE_CMD) {
2842 + if ((!*param->uuid && !*param->name) ||
2843 + (*param->uuid && *param->name)) {
2844 + DMWARN("one of name or uuid must be supplied, cmd(%u)",
2850 + /* Ensure strings are terminated */
2851 + param->name[DM_NAME_LEN - 1] = '\0';
2852 + param->uuid[DM_UUID_LEN - 1] = '\0';
2857 +static int ctl_ioctl(struct inode *inode, struct file *file,
2858 + uint command, ulong u)
2862 + struct dm_ioctl *param;
2863 + struct dm_ioctl *user = (struct dm_ioctl *) u;
2864 + ioctl_fn fn = NULL;
2865 + size_t param_size;
2867 + /* only root can play with this */
2868 + if (!capable(CAP_SYS_ADMIN))
2871 + if (_IOC_TYPE(command) != DM_IOCTL)
2874 + cmd = _IOC_NR(command);
2877 + * Check the interface version passed in. This also
2878 + * writes out the kernel's interface version.
2880 + r = check_version(cmd, user);
2885 + * Nothing more to do for the version command.
2887 + if (cmd == DM_VERSION_CMD)
2890 + fn = lookup_ioctl(cmd);
2892 + DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
2897 + * FIXME: I don't like this, we're trying to avoid low
2898 + * memory issues when a device is suspended.
2900 + current->flags |= PF_MEMALLOC;
2903 + * Copy the parameters into kernel space.
2905 + r = copy_params(user, &param);
2907 + current->flags &= ~PF_MEMALLOC;
2911 + r = validate_params(cmd, param);
2915 + param_size = param->data_size;
2916 + param->data_size = sizeof(*param);
2917 + r = fn(param, param_size);
2920 + * Copy the results back to userland.
2922 + if (!r && copy_to_user(user, param, param->data_size))
2926 + free_params(param);
2927 + current->flags &= ~PF_MEMALLOC;
2931 +static struct file_operations _ctl_fops = {
2932 + .ioctl = ctl_ioctl,
2933 + .owner = THIS_MODULE,
2936 +static devfs_handle_t _ctl_handle;
2938 +static struct miscdevice _dm_misc = {
2939 + .minor = MISC_DYNAMIC_MINOR,
2941 + .fops = &_ctl_fops
2945 + * Create misc character device and link to DM_DIR/control.
2947 +int __init dm_interface_init(void)
2952 + r = dm_hash_init();
2956 + r = misc_register(&_dm_misc);
2958 + DMERR("misc_register failed for control device");
2963 + r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
2964 + sizeof rname - 3);
2966 + goto done; /* devfs not present */
2969 + DMERR("devfs_generate_path failed for control device");
2973 + strncpy(rname + r, "../", 3);
2974 + r = devfs_mk_symlink(NULL, DM_DIR "/control",
2975 + DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
2977 + DMERR("devfs_mk_symlink failed for control device");
2980 + devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
2983 + DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
2984 + DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
2989 + misc_deregister(&_dm_misc);
2994 +void dm_interface_exit(void)
2996 + if (misc_deregister(&_dm_misc) < 0)
2997 + DMERR("misc_deregister failed for control device");
3001 --- diff/drivers/md/dm-linear.c 1970-01-01 01:00:00.000000000 +0100
3002 +++ source/drivers/md/dm-linear.c 2003-10-16 10:44:23.000000000 +0100
3005 + * Copyright (C) 2001 Sistina Software (UK) Limited.
3007 + * This file is released under the GPL.
3012 +#include <linux/module.h>
3013 +#include <linux/init.h>
3014 +#include <linux/blkdev.h>
3015 +#include <linux/slab.h>
3018 + * Linear: maps a linear range of a device.
3021 + struct dm_dev *dev;
3026 + * Construct a linear mapping: <dev_path> <offset>
3028 +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
3030 + struct linear_c *lc;
3033 + ti->error = "dm-linear: Not enough arguments";
3037 + lc = kmalloc(sizeof(*lc), GFP_KERNEL);
3039 + ti->error = "dm-linear: Cannot allocate linear context";
3043 + if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
3044 + ti->error = "dm-linear: Invalid device sector";
3048 + if (dm_get_device(ti, argv[0], lc->start, ti->len,
3049 + dm_table_get_mode(ti->table), &lc->dev)) {
3050 + ti->error = "dm-linear: Device lookup failed";
3062 +static void linear_dtr(struct dm_target *ti)
3064 + struct linear_c *lc = (struct linear_c *) ti->private;
3066 + dm_put_device(ti, lc->dev);
3070 +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
3071 + union map_info *map_context)
3073 + struct linear_c *lc = (struct linear_c *) ti->private;
3075 + bh->b_rdev = lc->dev->dev;
3076 + bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
3081 +static int linear_status(struct dm_target *ti, status_type_t type,
3082 + char *result, unsigned int maxlen)
3084 + struct linear_c *lc = (struct linear_c *) ti->private;
3088 + case STATUSTYPE_INFO:
3092 + case STATUSTYPE_TABLE:
3093 + kdev = to_kdev_t(lc->dev->bdev->bd_dev);
3094 + snprintf(result, maxlen, "%s " SECTOR_FORMAT,
3095 + dm_kdevname(kdev), lc->start);
3101 +static struct target_type linear_target = {
3103 + .module = THIS_MODULE,
3104 + .ctr = linear_ctr,
3105 + .dtr = linear_dtr,
3106 + .map = linear_map,
3107 + .status = linear_status,
3110 +int __init dm_linear_init(void)
3112 + int r = dm_register_target(&linear_target);
3115 + DMERR("linear: register failed %d", r);
3120 +void dm_linear_exit(void)
3122 + int r = dm_unregister_target(&linear_target);
3125 + DMERR("linear: unregister failed %d", r);
3127 --- diff/drivers/md/dm-log.c 1970-01-01 01:00:00.000000000 +0100
3128 +++ source/drivers/md/dm-log.c 2003-10-16 10:44:23.000000000 +0100
3131 + * Copyright (C) 2003 Sistina Software
3133 + * This file is released under the LGPL.
3136 +#include <linux/init.h>
3137 +#include <linux/slab.h>
3138 +#include <linux/module.h>
3139 +#include <linux/vmalloc.h>
3141 +#include "dm-log.h"
3144 +static LIST_HEAD(_log_types);
3145 +static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
3147 +int dm_register_dirty_log_type(struct dirty_log_type *type)
3149 + spin_lock(&_lock);
3150 + type->use_count = 0;
3152 + __MOD_INC_USE_COUNT(type->module);
3154 + list_add(&type->list, &_log_types);
3155 + spin_unlock(&_lock);
3160 +int dm_unregister_dirty_log_type(struct dirty_log_type *type)
3162 + spin_lock(&_lock);
3164 + if (type->use_count)
3165 + DMWARN("Attempt to unregister a log type that is still in use");
3167 + list_del(&type->list);
3169 + __MOD_DEC_USE_COUNT(type->module);
3172 + spin_unlock(&_lock);
3177 +static struct dirty_log_type *get_type(const char *type_name)
3179 + struct dirty_log_type *type;
3180 + struct list_head *tmp;
3182 + spin_lock(&_lock);
3183 + list_for_each (tmp, &_log_types) {
3184 + type = list_entry(tmp, struct dirty_log_type, list);
3185 + if (!strcmp(type_name, type->name)) {
3186 + type->use_count++;
3187 + spin_unlock(&_lock);
3192 + spin_unlock(&_lock);
3196 +static void put_type(struct dirty_log_type *type)
3198 + spin_lock(&_lock);
3199 + type->use_count--;
3200 + spin_unlock(&_lock);
3203 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3204 + unsigned int argc, char **argv)
3206 + struct dirty_log_type *type;
3207 + struct dirty_log *log;
3209 + log = kmalloc(sizeof(*log), GFP_KERNEL);
3213 + type = get_type(type_name);
3220 + if (type->ctr(log, dev_size, argc, argv)) {
3229 +void dm_destroy_dirty_log(struct dirty_log *log)
3231 + log->type->dtr(log);
3232 + put_type(log->type);
3237 +/*-----------------------------------------------------------------
3238 + * In core log, ie. trivial, non-persistent
3240 + * For now we'll keep this simple and just have 2 bitsets, one
3241 + * for clean/dirty, the other for sync/nosync. The sync bitset
3242 + * will be freed when everything is in sync.
3244 + * FIXME: problems with a 64bit sector_t
3245 + *---------------------------------------------------------------*/
3247 + sector_t region_size;
3248 + unsigned int region_count;
3249 + unsigned long *clean_bits;
3250 + unsigned long *sync_bits;
3251 + unsigned long *recovering_bits; /* FIXME: this seems excessive */
3256 +static int core_ctr(struct dirty_log *log, sector_t dev_size,
3257 + unsigned int argc, char **argv)
3259 + struct core_log *clog;
3260 + sector_t region_size;
3261 + unsigned int region_count;
3262 + size_t bitset_size;
3265 + DMWARN("wrong number of arguments to core_log");
3269 + if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
3270 + DMWARN("invalid region size string");
3274 + region_count = dm_div_up(dev_size, region_size);
3276 + clog = kmalloc(sizeof(*clog), GFP_KERNEL);
3278 + DMWARN("couldn't allocate core log");
3282 + clog->region_size = region_size;
3283 + clog->region_count = region_count;
3285 + bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
3286 + clog->clean_bits = vmalloc(bitset_size);
3287 + if (!clog->clean_bits) {
3288 + DMWARN("couldn't allocate clean bitset");
3292 + memset(clog->clean_bits, -1, bitset_size);
3294 + clog->sync_bits = vmalloc(bitset_size);
3295 + if (!clog->sync_bits) {
3296 + DMWARN("couldn't allocate sync bitset");
3297 + vfree(clog->clean_bits);
3301 + memset(clog->sync_bits, 0, bitset_size);
3303 + clog->recovering_bits = vmalloc(bitset_size);
3304 + if (!clog->recovering_bits) {
3305 + DMWARN("couldn't allocate sync bitset");
3306 + vfree(clog->sync_bits);
3307 + vfree(clog->clean_bits);
3311 + memset(clog->recovering_bits, 0, bitset_size);
3312 + clog->sync_search = 0;
3313 + log->context = clog;
3317 +static void core_dtr(struct dirty_log *log)
3319 + struct core_log *clog = (struct core_log *) log->context;
3320 + vfree(clog->clean_bits);
3321 + vfree(clog->sync_bits);
3322 + vfree(clog->recovering_bits);
3326 +static sector_t core_get_region_size(struct dirty_log *log)
3328 + struct core_log *clog = (struct core_log *) log->context;
3329 + return clog->region_size;
3332 +static int core_is_clean(struct dirty_log *log, region_t region)
3334 + struct core_log *clog = (struct core_log *) log->context;
3335 + return test_bit(region, clog->clean_bits);
3338 +static int core_in_sync(struct dirty_log *log, region_t region, int block)
3340 + struct core_log *clog = (struct core_log *) log->context;
3342 + return test_bit(region, clog->sync_bits) ? 1 : 0;
3345 +static int core_flush(struct dirty_log *log)
3351 +static void core_mark_region(struct dirty_log *log, region_t region)
3353 + struct core_log *clog = (struct core_log *) log->context;
3354 + clear_bit(region, clog->clean_bits);
3357 +static void core_clear_region(struct dirty_log *log, region_t region)
3359 + struct core_log *clog = (struct core_log *) log->context;
3360 + set_bit(region, clog->clean_bits);
3363 +static int core_get_resync_work(struct dirty_log *log, region_t *region)
3365 + struct core_log *clog = (struct core_log *) log->context;
3367 + if (clog->sync_search >= clog->region_count)
3371 + *region = find_next_zero_bit(clog->sync_bits,
3372 + clog->region_count,
3373 + clog->sync_search);
3374 + clog->sync_search = *region + 1;
3376 + if (*region == clog->region_count)
3379 + } while (test_bit(*region, clog->recovering_bits));
3381 + set_bit(*region, clog->recovering_bits);
3385 +static void core_complete_resync_work(struct dirty_log *log, region_t region,
3388 + struct core_log *clog = (struct core_log *) log->context;
3390 + clear_bit(region, clog->recovering_bits);
3392 + set_bit(region, clog->sync_bits);
3395 +static struct dirty_log_type _core_type = {
3400 + .get_region_size = core_get_region_size,
3401 + .is_clean = core_is_clean,
3402 + .in_sync = core_in_sync,
3403 + .flush = core_flush,
3404 + .mark_region = core_mark_region,
3405 + .clear_region = core_clear_region,
3406 + .get_resync_work = core_get_resync_work,
3407 + .complete_resync_work = core_complete_resync_work
3410 +__init int dm_dirty_log_init(void)
3414 + r = dm_register_dirty_log_type(&_core_type);
3416 + DMWARN("couldn't register core log");
3421 +void dm_dirty_log_exit(void)
3423 + dm_unregister_dirty_log_type(&_core_type);
3426 +EXPORT_SYMBOL(dm_register_dirty_log_type);
3427 +EXPORT_SYMBOL(dm_unregister_dirty_log_type);
3428 +EXPORT_SYMBOL(dm_dirty_log_init);
3429 +EXPORT_SYMBOL(dm_dirty_log_exit);
3430 +EXPORT_SYMBOL(dm_create_dirty_log);
3431 +EXPORT_SYMBOL(dm_destroy_dirty_log);
3432 --- diff/drivers/md/dm-log.h 1970-01-01 01:00:00.000000000 +0100
3433 +++ source/drivers/md/dm-log.h 2003-10-16 10:44:23.000000000 +0100
3436 + * Copyright (C) 2003 Sistina Software
3438 + * This file is released under the LGPL.
3441 +#ifndef DM_DIRTY_LOG
3442 +#define DM_DIRTY_LOG
3446 +typedef sector_t region_t;
3448 +struct dirty_log_type;
3451 + struct dirty_log_type *type;
3455 +struct dirty_log_type {
3456 + struct list_head list;
3458 + struct module *module;
3459 + unsigned int use_count;
3461 + int (*ctr)(struct dirty_log *log, sector_t dev_size,
3462 + unsigned int argc, char **argv);
3463 + void (*dtr)(struct dirty_log *log);
3466 + * Retrieves the smallest size of region that the log can
3469 + sector_t (*get_region_size)(struct dirty_log *log);
3472 + * A predicate to say whether a region is clean or not.
3475 + int (*is_clean)(struct dirty_log *log, region_t region);
3478 + * Returns: 0, 1, -EWOULDBLOCK, < 0
3480 + * A predicate function to check the area given by
3481 + * [sector, sector + len) is in sync.
3483 + * If -EWOULDBLOCK is returned the state of the region is
3484 + * unknown, typically this will result in a read being
3485 + * passed to a daemon to deal with, since a daemon is
3486 + * allowed to block.
3488 + int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
3491 + * Flush the current log state (eg, to disk). This
3492 + * function may block.
3494 + int (*flush)(struct dirty_log *log);
3497 + * Mark an area as clean or dirty. These functions may
3498 + * block, though for performance reasons blocking should
3499 + * be extremely rare (eg, allocating another chunk of
3500 + * memory for some reason).
3502 + void (*mark_region)(struct dirty_log *log, region_t region);
3503 + void (*clear_region)(struct dirty_log *log, region_t region);
3506 + * Returns: <0 (error), 0 (no region), 1 (region)
3508 + * The mirrord will need perform recovery on regions of
3509 + * the mirror that are in the NOSYNC state. This
3510 + * function asks the log to tell the caller about the
3511 + * next region that this machine should recover.
3513 + * Do not confuse this function with 'in_sync()', one
3514 + * tells you if an area is synchronised, the other
3515 + * assigns recovery work.
3517 + int (*get_resync_work)(struct dirty_log *log, region_t *region);
3520 + * This notifies the log that the resync of an area has
3521 + * been completed. The log should then mark this region
3524 + void (*complete_resync_work)(struct dirty_log *log,
3525 + region_t region, int success);
3528 +int dm_register_dirty_log_type(struct dirty_log_type *type);
3529 +int dm_unregister_dirty_log_type(struct dirty_log_type *type);
3533 + * Make sure you use these two functions, rather than calling
3534 + * type->constructor/destructor() directly.
3536 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3537 + unsigned int argc, char **argv);
3538 +void dm_destroy_dirty_log(struct dirty_log *log);
3541 + * init/exit functions.
3543 +int dm_dirty_log_init(void);
3544 +void dm_dirty_log_exit(void);
3547 --- diff/drivers/md/dm-raid1.c 1970-01-01 01:00:00.000000000 +0100
3548 +++ source/drivers/md/dm-raid1.c 2003-10-16 10:44:23.000000000 +0100
3551 + * Copyright (C) 2003 Sistina Software Limited.
3553 + * This file is released under the GPL.
3557 +#include "dm-daemon.h"
3559 +#include "dm-log.h"
3560 +#include "kcopyd.h"
3562 +#include <linux/ctype.h>
3563 +#include <linux/init.h>
3564 +#include <linux/mempool.h>
3565 +#include <linux/module.h>
3566 +#include <linux/pagemap.h>
3567 +#include <linux/slab.h>
3568 +#include <linux/time.h>
3569 +#include <linux/vmalloc.h>
3571 +static struct dm_daemon _kmirrord;
3573 +/*-----------------------------------------------------------------
3576 + * We play with singly linked lists of buffers, but we want to be
3577 + * careful to add new buffers to the back of the list, to avoid
3578 + * buffers being starved of attention.
3579 + *---------------------------------------------------------------*/
3580 +struct buffer_list {
3581 + struct buffer_head *head;
3582 + struct buffer_head *tail;
3585 +static inline void buffer_list_init(struct buffer_list *bl)
3587 + bl->head = bl->tail = NULL;
3590 +static inline void buffer_list_add(struct buffer_list *bl,
3591 + struct buffer_head *bh)
3593 + bh->b_reqnext = NULL;
3596 + bl->tail->b_reqnext = bh;
3599 + bl->head = bl->tail = bh;
3602 +static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
3604 + struct buffer_head *bh = bl->head;
3607 + bl->head = bl->head->b_reqnext;
3611 + bh->b_reqnext = NULL;
3617 +/*-----------------------------------------------------------------
3620 + * The mirror splits itself up into discrete regions. Each
3621 + * region can be in one of three states: clean, dirty,
3622 + * nosync. There is no need to put clean regions in the hash.
3624 + * In addition to being present in the hash table a region _may_
3625 + * be present on one of three lists.
3627 + * clean_regions: Regions on this list have no io pending to
3628 + * them, they are in sync, we are no longer interested in them,
3629 + * they are dull. rh_update_states() will remove them from the
3632 + * quiesced_regions: These regions have been spun down, ready
3633 + * for recovery. rh_recovery_start() will remove regions from
3634 + * this list and hand them to kmirrord, which will schedule the
3635 + * recovery io with kcopyd.
3637 + * recovered_regions: Regions that kcopyd has successfully
3638 + * recovered. rh_update_states() will now schedule any delayed
3639 + * io, up the recovery_count, and remove the region from the
3642 + * There are 2 locks:
3643 + * A rw spin lock 'hash_lock' protects just the hash table,
3644 + * this is never held in write mode from interrupt context,
3645 + * which I believe means that we only have to disable irqs when
3646 + * doing a write lock.
3648 + * An ordinary spin lock 'region_lock' that protects the three
3649 + * lists in the region_hash, with the 'state', 'list' and
3650 + * 'bhs_delayed' fields of the regions. This is used from irq
3651 + * context, so all other uses will have to suspend local irqs.
3652 + *---------------------------------------------------------------*/
3654 +struct region_hash {
3655 + struct mirror_set *ms;
3656 + sector_t region_size;
3658 + /* holds persistent region state */
3659 + struct dirty_log *log;
3662 + rwlock_t hash_lock;
3663 + mempool_t *region_pool;
3664 + unsigned int mask;
3665 + unsigned int nr_buckets;
3666 + struct list_head *buckets;
3668 + spinlock_t region_lock;
3669 + struct semaphore recovery_count;
3670 + struct list_head clean_regions;
3671 + struct list_head quiesced_regions;
3672 + struct list_head recovered_regions;
3683 + struct region_hash *rh; /* FIXME: can we get rid of this ? */
3687 + struct list_head hash_list;
3688 + struct list_head list;
3691 + struct buffer_head *delayed_bhs;
3697 +static inline region_t bh_to_region(struct region_hash *rh,
3698 + struct buffer_head *bh)
3700 + return bh->b_rsector / rh->region_size;
3703 +static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
3705 + return region * rh->region_size;
3708 +/* FIXME move this */
3709 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
3711 +static void *region_alloc(int gfp_mask, void *pool_data)
3713 + return kmalloc(sizeof(struct region), gfp_mask);
3716 +static void region_free(void *element, void *pool_data)
3721 +#define MIN_REGIONS 64
3722 +#define MAX_RECOVERY 1
3723 +static int rh_init(struct region_hash *rh, struct mirror_set *ms,
3724 + struct dirty_log *log, sector_t region_size,
3725 + region_t nr_regions)
3727 + unsigned int nr_buckets, max_buckets;
3731 + * Calculate a suitable number of buckets for our hash
3734 + max_buckets = nr_regions >> 6;
3735 + for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
3741 + rh->region_size = region_size;
3742 + rwlock_init(&rh->hash_lock);
3743 + rh->mask = nr_buckets - 1;
3744 + rh->nr_buckets = nr_buckets;
3746 + rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
3747 + if (!rh->buckets) {
3748 + DMERR("unable to allocate region hash memory");
3752 + for (i = 0; i < nr_buckets; i++)
3753 + INIT_LIST_HEAD(rh->buckets + i);
3755 + spin_lock_init(&rh->region_lock);
3756 + sema_init(&rh->recovery_count, 0);
3757 + INIT_LIST_HEAD(&rh->clean_regions);
3758 + INIT_LIST_HEAD(&rh->quiesced_regions);
3759 + INIT_LIST_HEAD(&rh->recovered_regions);
3761 + rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
3762 + region_free, NULL);
3763 + if (!rh->region_pool) {
3764 + vfree(rh->buckets);
3765 + rh->buckets = NULL;
3772 +static void rh_exit(struct region_hash *rh)
3775 + struct region *reg;
3776 + struct list_head *tmp, *tmp2;
3778 + BUG_ON(!list_empty(&rh->quiesced_regions));
3779 + for (h = 0; h < rh->nr_buckets; h++) {
3780 + list_for_each_safe (tmp, tmp2, rh->buckets + h) {
3781 + reg = list_entry(tmp, struct region, hash_list);
3782 + BUG_ON(atomic_read(&reg->pending));
3783 + mempool_free(reg, rh->region_pool);
3788 + dm_destroy_dirty_log(rh->log);
3789 + if (rh->region_pool)
3790 + mempool_destroy(rh->region_pool);
3791 + vfree(rh->buckets);
3794 +#define RH_HASH_MULT 2654435387U
3796 +static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
3798 + return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
3801 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
3803 + struct region *reg;
3805 + list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
3806 + if (reg->key == region)
3812 +static void __rh_insert(struct region_hash *rh, struct region *reg)
3814 + unsigned int h = rh_hash(rh, reg->key);
3815 + list_add(&reg->hash_list, rh->buckets + h);
3818 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
3820 + struct region *reg, *nreg;
3822 + read_unlock(&rh->hash_lock);
3823 + nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
3824 + nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
3825 + RH_CLEAN : RH_NOSYNC;
3827 + nreg->key = region;
3829 + INIT_LIST_HEAD(&nreg->list);
3831 + atomic_set(&nreg->pending, 0);
3832 + nreg->delayed_bhs = NULL;
3833 + write_lock_irq(&rh->hash_lock);
3835 + reg = __rh_lookup(rh, region);
3837 + /* we lost the race */
3838 + mempool_free(nreg, rh->region_pool);
3841 + __rh_insert(rh, nreg);
3842 + if (nreg->state == RH_CLEAN) {
3843 + spin_lock_irq(&rh->region_lock);
3844 + list_add(&nreg->list, &rh->clean_regions);
3845 + spin_unlock_irq(&rh->region_lock);
3849 + write_unlock_irq(&rh->hash_lock);
3850 + read_lock(&rh->hash_lock);
3855 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
3857 + struct region *reg;
3859 + reg = __rh_lookup(rh, region);
3861 + reg = __rh_alloc(rh, region);
3866 +static int rh_state(struct region_hash *rh, region_t region, int may_block)
3869 + struct region *reg;
3871 + read_lock(&rh->hash_lock);
3872 + reg = __rh_lookup(rh, region);
3873 + read_unlock(&rh->hash_lock);
3876 + return reg->state;
3879 + * The region wasn't in the hash, so we fall back to the
3882 + r = rh->log->type->in_sync(rh->log, region, may_block);
3885 + * Any error from the dirty log (eg. -EWOULDBLOCK) gets
3886 + * taken as a RH_NOSYNC
3888 + return r == 1 ? RH_CLEAN : RH_NOSYNC;
3891 +static inline int rh_in_sync(struct region_hash *rh,
3892 + region_t region, int may_block)
3894 + int state = rh_state(rh, region, may_block);
3895 + return state == RH_CLEAN || state == RH_DIRTY;
3898 +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
3900 + struct buffer_head *nbh;
3903 + nbh = bh->b_reqnext;
3904 + queue_bh(ms, bh, WRITE);
3909 +static void rh_update_states(struct region_hash *rh)
3911 + struct list_head *tmp, *tmp2;
3912 + struct region *reg;
3915 + LIST_HEAD(recovered);
3918 + * Quickly grab the lists.
3920 + write_lock_irq(&rh->hash_lock);
3921 + spin_lock(&rh->region_lock);
3922 + if (!list_empty(&rh->clean_regions)) {
3923 + list_splice(&rh->clean_regions, &clean);
3924 + INIT_LIST_HEAD(&rh->clean_regions);
3926 + list_for_each_entry (reg, &clean, list) {
3927 + rh->log->type->clear_region(rh->log, reg->key);
3928 + list_del(&reg->hash_list);
3932 + if (!list_empty(&rh->recovered_regions)) {
3933 + list_splice(&rh->recovered_regions, &recovered);
3934 + INIT_LIST_HEAD(&rh->recovered_regions);
3936 + list_for_each_entry (reg, &recovered, list)
3937 + list_del(&reg->hash_list);
3939 + spin_unlock(&rh->region_lock);
3940 + write_unlock_irq(&rh->hash_lock);
3943 + * All the regions on the recovered and clean lists have
3944 + * now been pulled out of the system, so no need to do
3945 + * any more locking.
3947 + list_for_each_safe (tmp, tmp2, &recovered) {
3948 + reg = list_entry(tmp, struct region, list);
3950 + rh->log->type->complete_resync_work(rh->log, reg->key, 1);
3951 + dispatch_buffers(rh->ms, reg->delayed_bhs);
3952 + up(&rh->recovery_count);
3953 + mempool_free(reg, rh->region_pool);
3956 + list_for_each_safe (tmp, tmp2, &clean) {
3957 + reg = list_entry(tmp, struct region, list);
3958 + mempool_free(reg, rh->region_pool);
3962 +static void rh_inc(struct region_hash *rh, region_t region)
3964 + struct region *reg;
3966 + read_lock(&rh->hash_lock);
3967 + reg = __rh_find(rh, region);
3968 + if (reg->state == RH_CLEAN) {
3969 + rh->log->type->mark_region(rh->log, reg->key);
3971 + spin_lock_irq(&rh->region_lock);
3972 + reg->state = RH_DIRTY;
3973 + list_del_init(&reg->list); /* take off the clean list */
3974 + spin_unlock_irq(&rh->region_lock);
3977 + atomic_inc(&reg->pending);
3978 + read_unlock(&rh->hash_lock);
3981 +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
3983 + struct buffer_head *bh;
3985 + for (bh = buffers->head; bh; bh = bh->b_reqnext)
3986 + rh_inc(rh, bh_to_region(rh, bh));
3989 +static void rh_dec(struct region_hash *rh, region_t region)
3991 + unsigned long flags;
3992 + struct region *reg;
3995 + read_lock(&rh->hash_lock);
3996 + reg = __rh_lookup(rh, region);
3997 + read_unlock(&rh->hash_lock);
3999 + if (atomic_dec_and_test(&reg->pending)) {
4000 + spin_lock_irqsave(&rh->region_lock, flags);
4001 + if (reg->state == RH_RECOVERING) {
4002 + list_add_tail(&reg->list, &rh->quiesced_regions);
4004 + reg->state = RH_CLEAN;
4005 + list_add(&reg->list, &rh->clean_regions);
4007 + spin_unlock_irqrestore(&rh->region_lock, flags);
4012 + dm_daemon_wake(&_kmirrord);
4016 + * Starts quiescing a region in preparation for recovery.
4018 +static int __rh_recovery_prepare(struct region_hash *rh)
4021 + struct region *reg;
4025 + * Ask the dirty log what's next.
4027 + r = rh->log->type->get_resync_work(rh->log, &region);
4032 + * Get this region, and start it quiescing by setting the
4033 + * recovering flag.
4035 + read_lock(&rh->hash_lock);
4036 + reg = __rh_find(rh, region);
4037 + read_unlock(&rh->hash_lock);
4039 + spin_lock_irq(&rh->region_lock);
4040 + reg->state = RH_RECOVERING;
4042 + /* Already quiesced ? */
4043 + if (atomic_read(&reg->pending))
4044 + list_del_init(&reg->list);
4047 + list_del_init(&reg->list);
4048 + list_add(&reg->list, &rh->quiesced_regions);
4050 + spin_unlock_irq(&rh->region_lock);
4055 +static void rh_recovery_prepare(struct region_hash *rh)
4057 + while (!down_trylock(&rh->recovery_count))
4058 + if (__rh_recovery_prepare(rh) <= 0) {
4059 + up(&rh->recovery_count);
4065 + * Returns any quiesced regions.
4067 +static struct region *rh_recovery_start(struct region_hash *rh)
4069 + struct region *reg = NULL;
4071 + spin_lock_irq(&rh->region_lock);
4072 + if (!list_empty(&rh->quiesced_regions)) {
4073 + reg = list_entry(rh->quiesced_regions.next,
4074 + struct region, list);
4075 + list_del_init(&reg->list); /* remove from the quiesced list */
4077 + spin_unlock_irq(&rh->region_lock);
4082 +/* FIXME: success ignored for now */
4083 +static void rh_recovery_end(struct region *reg, int success)
4085 + struct region_hash *rh = reg->rh;
4087 + spin_lock_irq(&rh->region_lock);
4088 + list_add(&reg->list, &reg->rh->recovered_regions);
4089 + spin_unlock_irq(&rh->region_lock);
4091 + dm_daemon_wake(&_kmirrord);
4094 +static void rh_flush(struct region_hash *rh)
4096 + rh->log->type->flush(rh->log);
4099 +static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
4101 + struct region *reg;
4103 + read_lock(&rh->hash_lock);
4104 + reg = __rh_find(rh, bh_to_region(rh, bh));
4105 + bh->b_reqnext = reg->delayed_bhs;
4106 + reg->delayed_bhs = bh;
4107 + read_unlock(&rh->hash_lock);
4110 +static void rh_stop_recovery(struct region_hash *rh)
4114 + /* wait for any recovering regions */
4115 + for (i = 0; i < MAX_RECOVERY; i++)
4116 + down(&rh->recovery_count);
4119 +static void rh_start_recovery(struct region_hash *rh)
4123 + for (i = 0; i < MAX_RECOVERY; i++)
4124 + up(&rh->recovery_count);
4126 + dm_daemon_wake(&_kmirrord);
4129 +/*-----------------------------------------------------------------
4130 + * Mirror set structures.
4131 + *---------------------------------------------------------------*/
4133 + atomic_t error_count;
4134 + struct dm_dev *dev;
4138 +struct mirror_set {
4139 + struct dm_target *ti;
4140 + struct list_head list;
4141 + struct region_hash rh;
4142 + struct kcopyd_client *kcopyd_client;
4144 + spinlock_t lock; /* protects the next two lists */
4145 + struct buffer_list reads;
4146 + struct buffer_list writes;
4149 + region_t nr_regions;
4150 + region_t sync_count;
4152 + unsigned int nr_mirrors;
4153 + struct mirror mirror[0];
4157 + * Every mirror should look like this one.
4159 +#define DEFAULT_MIRROR 0
4162 + * This is yucky. We squirrel the mirror_set struct away inside
4163 + * b_reqnext for write buffers. This is safe since the bh
4164 + * doesn't get submitted to the lower levels of block layer.
4166 +static struct mirror_set *bh_get_ms(struct buffer_head *bh)
4168 + return (struct mirror_set *) bh->b_reqnext;
4171 +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
4173 + bh->b_reqnext = (struct buffer_head *) ms;
4176 +/*-----------------------------------------------------------------
4179 + * When a mirror is first activated we may find that some regions
4180 + * are in the no-sync state. We have to recover these by
4181 + * recopying from the default mirror to all the others.
4182 + *---------------------------------------------------------------*/
4183 +static void recovery_complete(int read_err, unsigned int write_err,
4186 + struct region *reg = (struct region *) context;
4187 + struct mirror_set *ms = reg->rh->ms;
4189 + /* FIXME: better error handling */
4190 + rh_recovery_end(reg, read_err || write_err);
4191 + if (++ms->sync_count == ms->nr_regions)
4192 + /* the sync is complete */
4193 + dm_table_event(ms->ti->table);
4196 +static int recover(struct mirror_set *ms, struct region *reg)
4200 + struct io_region from, to[ms->nr_mirrors - 1], *dest;
4202 + unsigned int flags = 0;
4204 + /* fill in the source */
4205 + m = ms->mirror + DEFAULT_MIRROR;
4206 + from.dev = m->dev->dev;
4207 + from.sector = m->offset + region_to_sector(reg->rh, reg->key);
4208 + if (reg->key == (ms->nr_regions - 1)) {
4210 + * The final region may be smaller than
4213 + from.count = ms->ti->len & (reg->rh->region_size - 1);
4215 + from.count = reg->rh->region_size;
4217 + from.count = reg->rh->region_size;
4219 + /* fill in the destinations */
4220 + for (i = 1; i < ms->nr_mirrors; i++) {
4221 + m = ms->mirror + i;
4222 + dest = to + (i - 1);
4224 + dest->dev = m->dev->dev;
4225 + dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
4226 + dest->count = from.count;
4229 + /* hand to kcopyd */
4230 + set_bit(KCOPYD_IGNORE_ERROR, &flags);
4231 + r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
4232 + recovery_complete, reg);
4237 +static void do_recovery(struct mirror_set *ms)
4240 + struct region *reg;
4243 + * Start quiescing some regions.
4245 + rh_recovery_prepare(&ms->rh);
4248 + * Copy any already quiesced regions.
4250 + while ((reg = rh_recovery_start(&ms->rh))) {
4251 + r = recover(ms, reg);
4253 + rh_recovery_end(reg, 0);
4257 +/*-----------------------------------------------------------------
4259 + *---------------------------------------------------------------*/
4260 +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
4262 + /* FIXME: add read balancing */
4263 + return ms->mirror + DEFAULT_MIRROR;
4267 + * remap a buffer to a particular mirror.
4269 +static void map_buffer(struct mirror_set *ms,
4270 + struct mirror *m, struct buffer_head *bh)
4272 + sector_t bsize = bh->b_size >> 9;
4273 + sector_t rsector = bh->b_blocknr * bsize;
4275 + bh->b_rdev = m->dev->dev;
4276 + bh->b_rsector = m->offset + (rsector - ms->ti->begin);
4279 +static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
4282 + struct buffer_head *bh;
4285 + while ((bh = buffer_list_pop(reads))) {
4286 + region = bh_to_region(&ms->rh, bh);
4289 + * We can only read balance if the region is in sync.
4291 + if (rh_in_sync(&ms->rh, region, 0))
4292 + m = choose_mirror(ms, bh->b_rsector);
4294 + m = ms->mirror + DEFAULT_MIRROR;
4296 + map_buffer(ms, m, bh);
4297 + generic_make_request(READ, bh);
4301 +/*-----------------------------------------------------------------
4304 + * We do different things with the write io depending on the
4305 + * state of the region that it's in:
4307 + * SYNC: increment pending, use kcopyd to write to *all* mirrors
4308 + * RECOVERING: delay the io until recovery completes
4309 + * NOSYNC: increment pending, just write to the default mirror
4310 + *---------------------------------------------------------------*/
4311 +static void write_callback(unsigned int error, void *context)
4315 + struct buffer_head *bh = (struct buffer_head *) context;
4316 + struct mirror_set *ms;
4318 + ms = bh_get_ms(bh);
4319 + bh_set_ms(bh, NULL);
4322 + * NOTE: We don't decrement the pending count here,
4323 + * instead it is done by the targets endio function.
4324 + * This way we handle both writes to SYNC and NOSYNC
4325 + * regions with the same code.
4330 + * only error the io if all mirrors failed.
4334 + for (i = 0; i < ms->nr_mirrors; i++)
4335 + if (!test_bit(i, &error)) {
4340 + bh->b_end_io(bh, uptodate);
4343 +static void do_write(struct mirror_set *ms, struct buffer_head *bh)
4346 + struct io_region io[ms->nr_mirrors];
4349 + for (i = 0; i < ms->nr_mirrors; i++) {
4350 + m = ms->mirror + i;
4352 + io[i].dev = m->dev->dev;
4353 + io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin);
4354 + io[i].count = bh->b_size >> 9;
4357 + bh_set_ms(bh, ms);
4358 + dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page,
4359 + (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh);
4362 +static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
4365 + struct buffer_head *bh;
4366 + struct buffer_list sync, nosync, recover, *this_list = NULL;
4368 + if (!writes->head)
4372 + * Classify each write.
4374 + buffer_list_init(&sync);
4375 + buffer_list_init(&nosync);
4376 + buffer_list_init(&recover);
4378 + while ((bh = buffer_list_pop(writes))) {
4379 + state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1);
4383 + this_list = &sync;
4387 + this_list = &nosync;
4390 + case RH_RECOVERING:
4391 + this_list = &recover;
4395 + buffer_list_add(this_list, bh);
4399 + * Increment the pending counts for any regions that will
4400 + * be written to (writes to recover regions are going to
4403 + rh_inc_pending(&ms->rh, &sync);
4404 + rh_inc_pending(&ms->rh, &nosync);
4405 + rh_flush(&ms->rh);
4410 + while ((bh = buffer_list_pop(&sync)))
4413 + while ((bh = buffer_list_pop(&recover)))
4414 + rh_delay(&ms->rh, bh);
4416 + while ((bh = buffer_list_pop(&nosync))) {
4417 + map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh);
4418 + generic_make_request(WRITE, bh);
4422 +/*-----------------------------------------------------------------
4424 + *---------------------------------------------------------------*/
4425 +static LIST_HEAD(_mirror_sets);
4426 +static DECLARE_RWSEM(_mirror_sets_lock);
4428 +static void do_mirror(struct mirror_set *ms)
4430 + struct buffer_list reads, writes;
4432 + spin_lock(&ms->lock);
4433 + memcpy(&reads, &ms->reads, sizeof(reads));
4434 + buffer_list_init(&ms->reads);
4435 + memcpy(&writes, &ms->writes, sizeof(writes));
4436 + buffer_list_init(&ms->writes);
4437 + spin_unlock(&ms->lock);
4439 + rh_update_states(&ms->rh);
4441 + do_reads(ms, &reads);
4442 + do_writes(ms, &writes);
4443 + run_task_queue(&tq_disk);
4446 +static void do_work(void)
4448 + struct mirror_set *ms;
4450 + down_read(&_mirror_sets_lock);
4451 + list_for_each_entry (ms, &_mirror_sets, list)
4453 + up_read(&_mirror_sets_lock);
4456 +/*-----------------------------------------------------------------
4457 + * Target functions
4458 + *---------------------------------------------------------------*/
4459 +static struct mirror_set *alloc_context(unsigned int nr_mirrors,
4460 + sector_t region_size,
4461 + struct dm_target *ti,
4462 + struct dirty_log *dl)
4465 + struct mirror_set *ms = NULL;
4467 + if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
4470 + len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
4472 + ms = kmalloc(len, GFP_KERNEL);
4474 + ti->error = "dm-mirror: Cannot allocate mirror context";
4478 + memset(ms, 0, len);
4479 + spin_lock_init(&ms->lock);
4482 + ms->nr_mirrors = nr_mirrors;
4483 + ms->nr_regions = dm_div_up(ti->len, region_size);
4485 + if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
4486 + ti->error = "dm-mirror: Error creating dirty region hash";
4494 +static void free_context(struct mirror_set *ms, struct dm_target *ti,
4498 + dm_put_device(ti, ms->mirror[m].dev);
4504 +static inline int _check_region_size(struct dm_target *ti, sector_t size)
4506 + return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
4510 +static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
4511 + unsigned int mirror, char **argv)
4515 + if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
4516 + ti->error = "dm-mirror: Invalid offset";
4520 + if (dm_get_device(ti, argv[0], offset, ti->len,
4521 + dm_table_get_mode(ti->table),
4522 + &ms->mirror[mirror].dev)) {
4523 + ti->error = "dm-mirror: Device lookup failure";
4527 + ms->mirror[mirror].offset = offset;
4532 +static int add_mirror_set(struct mirror_set *ms)
4534 + down_write(&_mirror_sets_lock);
4535 + list_add_tail(&ms->list, &_mirror_sets);
4536 + up_write(&_mirror_sets_lock);
4537 + dm_daemon_wake(&_kmirrord);
4542 +static void del_mirror_set(struct mirror_set *ms)
4544 + down_write(&_mirror_sets_lock);
4545 + list_del(&ms->list);
4546 + up_write(&_mirror_sets_lock);
4550 + * Create dirty log: log_type #log_params <log_params>
4552 +static struct dirty_log *create_dirty_log(struct dm_target *ti,
4553 + unsigned int argc, char **argv,
4554 + unsigned int *args_used)
4556 + unsigned int param_count;
4557 + struct dirty_log *dl;
4560 + ti->error = "dm-mirror: Insufficient mirror log arguments";
4564 + if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
4565 + ti->error = "dm-mirror: Invalid mirror log argument count";
4569 + *args_used = 2 + param_count;
4571 + if (argc < *args_used) {
4572 + ti->error = "dm-mirror: Insufficient mirror log arguments";
4576 + dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
4578 + ti->error = "dm-mirror: Error creating mirror dirty log";
4582 + if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
4583 + ti->error = "dm-mirror: Invalid region size";
4584 + dm_destroy_dirty_log(dl);
4592 + * Construct a mirror mapping:
4594 + * log_type #log_params <log_params>
4595 + * #mirrors [mirror_path offset]{2,}
4597 + * For now, #log_params = 1, log_type = "core"
4600 +#define DM_IO_PAGES 64
4601 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4604 + unsigned int nr_mirrors, m, args_used;
4605 + struct mirror_set *ms;
4606 + struct dirty_log *dl;
4608 + dl = create_dirty_log(ti, argc, argv, &args_used);
4612 + argv += args_used;
4613 + argc -= args_used;
4615 + if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
4617 + ti->error = "dm-mirror: Invalid number of mirrors";
4618 + dm_destroy_dirty_log(dl);
4624 + if (argc != nr_mirrors * 2) {
4625 + ti->error = "dm-mirror: Wrong number of mirror arguments";
4626 + dm_destroy_dirty_log(dl);
4630 + ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
4632 + dm_destroy_dirty_log(dl);
4636 + /* Get the mirror parameter sets */
4637 + for (m = 0; m < nr_mirrors; m++) {
4638 + r = get_mirror(ms, ti, m, argv);
4640 + free_context(ms, ti, m);
4649 + r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
4651 + free_context(ms, ti, ms->nr_mirrors);
4655 + add_mirror_set(ms);
4659 +static void mirror_dtr(struct dm_target *ti)
4661 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4663 + del_mirror_set(ms);
4664 + kcopyd_client_destroy(ms->kcopyd_client);
4665 + free_context(ms, ti, ms->nr_mirrors);
4668 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
4671 + struct buffer_list *bl;
4673 + bl = (rw == WRITE) ? &ms->writes : &ms->reads;
4674 + spin_lock(&ms->lock);
4675 + wake = !(bl->head);
4676 + buffer_list_add(bl, bh);
4677 + spin_unlock(&ms->lock);
4680 + dm_daemon_wake(&_kmirrord);
4684 + * Mirror mapping function
4686 +static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
4687 + int rw, union map_info *map_context)
4691 + struct mirror_set *ms = ti->private;
4693 + /* FIXME: nasty hack, 32 bit sector_t only */
4694 + map_context->ll = bh->b_rsector / ms->rh.region_size;
4696 + if (rw == WRITE) {
4697 + queue_bh(ms, bh, rw);
4701 + r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0);
4702 + if (r < 0 && r != -EWOULDBLOCK)
4705 + if (r == -EWOULDBLOCK) /* FIXME: ugly */
4709 + * We don't want to fast track a recovery just for a read
4710 + * ahead. So we just let it silently fail.
4711 + * FIXME: get rid of this.
4713 + if (!r && rw == READA)
4717 + /* Pass this io over to the daemon */
4718 + queue_bh(ms, bh, rw);
4722 + m = choose_mirror(ms, bh->b_rsector);
4726 + map_buffer(ms, m, bh);
4730 +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
4731 + int rw, int error, union map_info *map_context)
4733 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4734 + region_t region = map_context->ll;
4737 + * We need to dec pending if this was a write.
4740 + rh_dec(&ms->rh, region);
4745 +static void mirror_suspend(struct dm_target *ti)
4747 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4748 + rh_stop_recovery(&ms->rh);
4751 +static void mirror_resume(struct dm_target *ti)
4753 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4754 + rh_start_recovery(&ms->rh);
4757 +static int mirror_status(struct dm_target *ti, status_type_t type,
4758 + char *result, unsigned int maxlen)
4760 + unsigned int m, sz = 0;
4761 + struct mirror_set *ms = (struct mirror_set *) ti->private;
4764 + case STATUSTYPE_INFO:
4765 + sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
4767 + for (m = 0; m < ms->nr_mirrors; m++)
4768 + sz += snprintf(result + sz, maxlen - sz, "%s ",
4769 + dm_kdevname(ms->mirror[m].dev->dev));
4771 + sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
4772 + ms->sync_count, ms->nr_regions);
4775 + case STATUSTYPE_TABLE:
4776 + sz += snprintf(result + sz, maxlen - sz,
4777 + "%s 1 " SECTOR_FORMAT " %d ",
4778 + ms->rh.log->type->name, ms->rh.region_size,
4781 + for (m = 0; m < ms->nr_mirrors; m++)
4782 + sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
4783 + dm_kdevname(ms->mirror[m].dev->dev),
4784 + ms->mirror[m].offset);
4790 +static struct target_type mirror_target = {
4792 + .module = THIS_MODULE,
4793 + .ctr = mirror_ctr,
4794 + .dtr = mirror_dtr,
4795 + .map = mirror_map,
4796 + .end_io = mirror_end_io,
4797 + .suspend = mirror_suspend,
4798 + .resume = mirror_resume,
4799 + .status = mirror_status,
4802 +static int __init dm_mirror_init(void)
4806 + r = dm_dirty_log_init();
4810 + r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
4812 + DMERR("couldn't start kmirrord");
4813 + dm_dirty_log_exit();
4817 + r = dm_register_target(&mirror_target);
4819 + DMERR("%s: Failed to register mirror target",
4820 + mirror_target.name);
4821 + dm_dirty_log_exit();
4822 + dm_daemon_stop(&_kmirrord);
4828 +static void __exit dm_mirror_exit(void)
4832 + r = dm_unregister_target(&mirror_target);
4834 + DMERR("%s: unregister failed %d", mirror_target.name, r);
4836 + dm_daemon_stop(&_kmirrord);
4837 + dm_dirty_log_exit();
4841 +module_init(dm_mirror_init);
4842 +module_exit(dm_mirror_exit);
4844 +MODULE_DESCRIPTION(DM_NAME " mirror target");
4845 +MODULE_AUTHOR("Heinz Mauelshagen <mge@sistina.com>");
4846 +MODULE_LICENSE("GPL");
4847 --- diff/drivers/md/dm-snapshot.c 1970-01-01 01:00:00.000000000 +0100
4848 +++ source/drivers/md/dm-snapshot.c 2003-10-16 10:44:23.000000000 +0100
4853 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4855 + * This file is released under the GPL.
4858 +#include <linux/config.h>
4859 +#include <linux/ctype.h>
4860 +#include <linux/module.h>
4861 +#include <linux/init.h>
4862 +#include <linux/slab.h>
4863 +#include <linux/list.h>
4864 +#include <linux/fs.h>
4865 +#include <linux/blkdev.h>
4866 +#include <linux/mempool.h>
4867 +#include <linux/device-mapper.h>
4868 +#include <linux/vmalloc.h>
4870 +#include "dm-snapshot.h"
4871 +#include "kcopyd.h"
4874 + * FIXME: Remove this before release.
4877 +#define DMDEBUG(x...) DMWARN( ## x)
4879 +#define DMDEBUG(x...)
4883 + * The percentage increment we will wake up users at
4885 +#define WAKE_UP_PERCENT 5
4888 + * kcopyd priority of snapshot operations
4890 +#define SNAPSHOT_COPY_PRIORITY 2
4893 + * Each snapshot reserves this many pages for io
4894 + * FIXME: calculate this
4896 +#define SNAPSHOT_PAGES 256
4898 +struct pending_exception {
4899 + struct exception e;
4902 + * Origin buffers waiting for this to complete are held
4903 + * in a list (using b_reqnext).
4905 + struct buffer_head *origin_bhs;
4906 + struct buffer_head *snapshot_bhs;
4909 + * Other pending_exceptions that are processing this
4910 + * chunk. When this list is empty, we know we can
4911 + * complete the origins.
4913 + struct list_head siblings;
4915 + /* Pointer back to snapshot context */
4916 + struct dm_snapshot *snap;
4919 + * 1 indicates the exception has already been sent to
4926 + * Hash table mapping origin volumes to lists of snapshots and
4927 + * a lock to protect it
4929 +static kmem_cache_t *exception_cache;
4930 +static kmem_cache_t *pending_cache;
4931 +static mempool_t *pending_pool;
4934 + * One of these per registered origin, held in the snapshot_origins hash
4937 + /* The origin device */
4940 + struct list_head hash_list;
4942 + /* List of snapshots for this origin */
4943 + struct list_head snapshots;
4947 + * Size of the hash table for origin volumes. If we make this
4948 + * the size of the minors list then it should be nearly perfect
4950 +#define ORIGIN_HASH_SIZE 256
4951 +#define ORIGIN_MASK 0xFF
4952 +static struct list_head *_origins;
4953 +static struct rw_semaphore _origins_lock;
4955 +static int init_origin_hash(void)
4959 + _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4962 + DMERR("Device mapper: Snapshot: unable to allocate memory");
4966 + for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4967 + INIT_LIST_HEAD(_origins + i);
4968 + init_rwsem(&_origins_lock);
4973 +static void exit_origin_hash(void)
4978 +static inline unsigned int origin_hash(kdev_t dev)
4980 + return MINOR(dev) & ORIGIN_MASK;
4983 +static struct origin *__lookup_origin(kdev_t origin)
4985 + struct list_head *slist;
4986 + struct list_head *ol;
4989 + ol = &_origins[origin_hash(origin)];
4990 + list_for_each(slist, ol) {
4991 + o = list_entry(slist, struct origin, hash_list);
4993 + if (o->dev == origin)
5000 +static void __insert_origin(struct origin *o)
5002 + struct list_head *sl = &_origins[origin_hash(o->dev)];
5003 + list_add_tail(&o->hash_list, sl);
5007 + * Make a note of the snapshot and its origin so we can look it
5008 + * up when the origin has a write on it.
5010 +static int register_snapshot(struct dm_snapshot *snap)
5013 + kdev_t dev = snap->origin->dev;
5015 + down_write(&_origins_lock);
5016 + o = __lookup_origin(dev);
5020 + o = kmalloc(sizeof(*o), GFP_KERNEL);
5022 + up_write(&_origins_lock);
5026 + /* Initialise the struct */
5027 + INIT_LIST_HEAD(&o->snapshots);
5030 + __insert_origin(o);
5033 + list_add_tail(&snap->list, &o->snapshots);
5035 + up_write(&_origins_lock);
5039 +static void unregister_snapshot(struct dm_snapshot *s)
5043 + down_write(&_origins_lock);
5044 + o = __lookup_origin(s->origin->dev);
5046 + list_del(&s->list);
5047 + if (list_empty(&o->snapshots)) {
5048 + list_del(&o->hash_list);
5052 + up_write(&_origins_lock);
5056 + * Implementation of the exception hash tables.
5058 +static int init_exception_table(struct exception_table *et, uint32_t size)
5062 + et->hash_mask = size - 1;
5063 + et->table = vcalloc(size, sizeof(struct list_head));
5067 + for (i = 0; i < size; i++)
5068 + INIT_LIST_HEAD(et->table + i);
5073 +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
5075 + struct list_head *slot, *entry, *temp;
5076 + struct exception *ex;
5079 + size = et->hash_mask + 1;
5080 + for (i = 0; i < size; i++) {
5081 + slot = et->table + i;
5083 + list_for_each_safe(entry, temp, slot) {
5084 + ex = list_entry(entry, struct exception, hash_list);
5085 + kmem_cache_free(mem, ex);
5093 + * FIXME: check how this hash fn is performing.
5095 +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
5097 + return chunk & et->hash_mask;
5100 +static void insert_exception(struct exception_table *eh, struct exception *e)
5102 + struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
5103 + list_add(&e->hash_list, l);
5106 +static inline void remove_exception(struct exception *e)
5108 + list_del(&e->hash_list);
5112 + * Return the exception data for a sector, or NULL if not
5115 +static struct exception *lookup_exception(struct exception_table *et,
5118 + struct list_head *slot, *el;
5119 + struct exception *e;
5121 + slot = &et->table[exception_hash(et, chunk)];
5122 + list_for_each(el, slot) {
5123 + e = list_entry(el, struct exception, hash_list);
5124 + if (e->old_chunk == chunk)
5131 +static inline struct exception *alloc_exception(void)
5133 + struct exception *e;
5135 + e = kmem_cache_alloc(exception_cache, GFP_NOIO);
5137 + e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
5142 +static inline void free_exception(struct exception *e)
5144 + kmem_cache_free(exception_cache, e);
5147 +static inline struct pending_exception *alloc_pending_exception(void)
5149 + return mempool_alloc(pending_pool, GFP_NOIO);
5152 +static inline void free_pending_exception(struct pending_exception *pe)
5154 + mempool_free(pe, pending_pool);
5157 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
5159 + struct exception *e;
5161 + e = alloc_exception();
5165 + e->old_chunk = old;
5166 + e->new_chunk = new;
5167 + insert_exception(&s->complete, e);
5172 + * Hard coded magic.
5174 +static int calc_max_buckets(void)
5176 + unsigned long mem;
5178 + mem = num_physpages << PAGE_SHIFT;
5180 + mem /= sizeof(struct list_head);
5186 + * Rounds a number down to a power of 2.
5188 +static inline uint32_t round_down(uint32_t n)
5190 + while (n & (n - 1))
5196 + * Allocate room for a suitable hash table.
5198 +static int init_hash_tables(struct dm_snapshot *s)
5200 + sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
5203 + * Calculate based on the size of the original volume or
5204 + * the COW volume...
5206 + cow_dev_size = get_dev_size(s->cow->dev);
5207 + origin_dev_size = get_dev_size(s->origin->dev);
5208 + max_buckets = calc_max_buckets();
5210 + hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
5211 + hash_size = min(hash_size, max_buckets);
5213 + /* Round it down to a power of 2 */
5214 + hash_size = round_down(hash_size);
5215 + if (init_exception_table(&s->complete, hash_size))
5219 + * Allocate hash table for in-flight exceptions
5220 + * Make this smaller than the real hash table
5226 + if (init_exception_table(&s->pending, hash_size)) {
5227 + exit_exception_table(&s->complete, exception_cache);
5235 + * Round a number up to the nearest 'size' boundary. size must
5236 + * be a power of 2.
5238 +static inline ulong round_up(ulong n, ulong size)
5241 + return (n + size) & ~size;
5245 + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
5247 +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5249 + struct dm_snapshot *s;
5250 + unsigned long chunk_size;
5253 + char *origin_path;
5259 + ti->error = "dm-snapshot: requires exactly 4 arguments";
5264 + origin_path = argv[0];
5265 + cow_path = argv[1];
5266 + persistent = toupper(*argv[2]);
5268 + if (persistent != 'P' && persistent != 'N') {
5269 + ti->error = "Persistent flag is not P or N";
5274 + chunk_size = simple_strtoul(argv[3], &value, 10);
5275 + if (chunk_size == 0 || value == NULL) {
5276 + ti->error = "Invalid chunk size";
5281 + s = kmalloc(sizeof(*s), GFP_KERNEL);
5283 + ti->error = "Cannot allocate snapshot context private "
5289 + r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
5291 + ti->error = "Cannot get origin device";
5295 + /* FIXME: get cow length */
5296 + r = dm_get_device(ti, cow_path, 0, 0,
5297 + FMODE_READ | FMODE_WRITE, &s->cow);
5299 + dm_put_device(ti, s->origin);
5300 + ti->error = "Cannot get COW device";
5305 + * Chunk size must be multiple of page size. Silently
5306 + * round up if it's not.
5308 + chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
5310 + /* Validate the chunk size against the device block size */
5311 + blocksize = get_hardsect_size(s->cow->dev);
5312 + if (chunk_size % (blocksize / SECTOR_SIZE)) {
5313 + ti->error = "Chunk size is not a multiple of device blocksize";
5318 + /* Check the sizes are small enough to fit in one kiovec */
5319 + if (chunk_size > KIO_MAX_SECTORS) {
5320 + ti->error = "Chunk size is too big";
5325 + /* Check chunk_size is a power of 2 */
5326 + if (chunk_size & (chunk_size - 1)) {
5327 + ti->error = "Chunk size is not a power of 2";
5332 + s->chunk_size = chunk_size;
5333 + s->chunk_mask = chunk_size - 1;
5334 + s->type = persistent;
5335 + for (s->chunk_shift = 0; chunk_size;
5336 + s->chunk_shift++, chunk_size >>= 1)
5341 + s->have_metadata = 0;
5342 + s->last_percent = 0;
5343 + init_rwsem(&s->lock);
5344 + s->table = ti->table;
5346 + /* Allocate hash table for COW data */
5347 + if (init_hash_tables(s)) {
5348 + ti->error = "Unable to allocate hash table space";
5354 + * Check the persistent flag - done here because we need the iobuf
5355 + * to check the LV header
5357 + s->store.snap = s;
5359 + if (persistent == 'P')
5360 + r = dm_create_persistent(&s->store, s->chunk_size);
5362 + r = dm_create_transient(&s->store, s, blocksize);
5365 + ti->error = "Couldn't create exception store";
5370 + r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
5372 + ti->error = "Could not create kcopyd client";
5376 + /* Flush IO to the origin device */
5377 + fsync_dev(s->origin->dev);
5379 + /* Add snapshot to the list of snapshots for this origin */
5380 + if (register_snapshot(s)) {
5382 + ti->error = "Cannot register snapshot origin";
5390 + kcopyd_client_destroy(s->kcopyd_client);
5393 + s->store.destroy(&s->store);
5396 + exit_exception_table(&s->pending, pending_cache);
5397 + exit_exception_table(&s->complete, exception_cache);
5400 + dm_put_device(ti, s->cow);
5401 + dm_put_device(ti, s->origin);
5410 +static void snapshot_dtr(struct dm_target *ti)
5412 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5414 + dm_table_event(ti->table);
5416 + unregister_snapshot(s);
5418 + exit_exception_table(&s->pending, pending_cache);
5419 + exit_exception_table(&s->complete, exception_cache);
5421 + /* Deallocate memory used */
5422 + s->store.destroy(&s->store);
5424 + dm_put_device(ti, s->origin);
5425 + dm_put_device(ti, s->cow);
5426 + kcopyd_client_destroy(s->kcopyd_client);
5431 + * We hold lists of buffer_heads, using the b_reqnext field.
5433 +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
5435 + bh->b_reqnext = *queue;
5440 + * FIXME: inefficient.
5442 +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
5445 + queue = &((*queue)->b_reqnext);
5451 + * Flush a list of buffers.
5453 +static void flush_buffers(struct buffer_head *bh)
5455 + struct buffer_head *n;
5457 + DMDEBUG("begin flush");
5459 + n = bh->b_reqnext;
5460 + bh->b_reqnext = NULL;
5461 + DMDEBUG("flushing %p", bh);
5462 + generic_make_request(WRITE, bh);
5466 + run_task_queue(&tq_disk);
5470 + * Error a list of buffers.
5472 +static void error_buffers(struct buffer_head *bh)
5474 + struct buffer_head *n;
5477 + n = bh->b_reqnext;
5478 + bh->b_reqnext = NULL;
5479 + buffer_IO_error(bh);
5484 +static struct buffer_head *__flush_bhs(struct pending_exception *pe)
5486 + struct pending_exception *sibling;
5488 + if (list_empty(&pe->siblings))
5489 + return pe->origin_bhs;
5491 + sibling = list_entry(pe->siblings.next,
5492 + struct pending_exception, siblings);
5494 + list_del(&pe->siblings);
5496 + /* FIXME: I think there's a race on SMP machines here, add spin lock */
5497 + queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
5502 +static void pending_complete(struct pending_exception *pe, int success)
5504 + struct exception *e;
5505 + struct dm_snapshot *s = pe->snap;
5506 + struct buffer_head *flush = NULL;
5509 + e = alloc_exception();
5511 + DMWARN("Unable to allocate exception.");
5512 + down_write(&s->lock);
5513 + s->store.drop_snapshot(&s->store);
5515 + flush = __flush_bhs(pe);
5516 + up_write(&s->lock);
5518 + error_buffers(pe->snapshot_bhs);
5523 + * Add a proper exception, and remove the
5524 + * in-flight exception from the list.
5526 + down_write(&s->lock);
5528 + memcpy(e, &pe->e, sizeof(*e));
5529 + insert_exception(&s->complete, e);
5530 + remove_exception(&pe->e);
5531 + flush = __flush_bhs(pe);
5533 + /* Submit any pending write BHs */
5534 + up_write(&s->lock);
5536 + flush_buffers(pe->snapshot_bhs);
5537 + DMDEBUG("Exception completed successfully.");
5539 + /* Notify any interested parties */
5540 + if (s->store.fraction_full) {
5541 + sector_t numerator, denominator;
5544 + s->store.fraction_full(&s->store, &numerator,
5546 + pc = numerator * 100 / denominator;
5548 + if (pc >= s->last_percent + WAKE_UP_PERCENT) {
5549 + dm_table_event(s->table);
5550 + s->last_percent = pc - pc % WAKE_UP_PERCENT;
5555 + /* Read/write error - snapshot is unusable */
5556 + down_write(&s->lock);
5558 + DMERR("Error reading/writing snapshot");
5559 + s->store.drop_snapshot(&s->store);
5561 + remove_exception(&pe->e);
5562 + flush = __flush_bhs(pe);
5563 + up_write(&s->lock);
5565 + error_buffers(pe->snapshot_bhs);
5567 + dm_table_event(s->table);
5568 + DMDEBUG("Exception failed.");
5573 + flush_buffers(flush);
5575 + free_pending_exception(pe);
5578 +static void commit_callback(void *context, int success)
5580 + struct pending_exception *pe = (struct pending_exception *) context;
5581 + pending_complete(pe, success);
5585 + * Called when the copy I/O has finished. kcopyd actually runs
5586 + * this code so don't block.
5588 +static void copy_callback(int read_err, unsigned int write_err, void *context)
5590 + struct pending_exception *pe = (struct pending_exception *) context;
5591 + struct dm_snapshot *s = pe->snap;
5593 + if (read_err || write_err)
5594 + pending_complete(pe, 0);
5597 + /* Update the metadata if we are persistent */
5598 + s->store.commit_exception(&s->store, &pe->e, commit_callback,
5603 + * Dispatches the copy operation to kcopyd.
5605 +static inline void start_copy(struct pending_exception *pe)
5607 + struct dm_snapshot *s = pe->snap;
5608 + struct io_region src, dest;
5609 + kdev_t dev = s->origin->dev;
5610 + int *sizes = blk_size[major(dev)];
5611 + sector_t dev_size = (sector_t) -1;
5616 + /* this is protected by snap->lock */
5619 + if (sizes && sizes[minor(dev)])
5620 + dev_size = sizes[minor(dev)] << 1;
5623 + src.sector = chunk_to_sector(s, pe->e.old_chunk);
5624 + src.count = min(s->chunk_size, dev_size - src.sector);
5626 + dest.dev = s->cow->dev;
5627 + dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5628 + dest.count = src.count;
5630 + /* Hand over to kcopyd */
5631 + kcopyd_copy(s->kcopyd_client,
5632 + &src, 1, &dest, 0, copy_callback, pe);
5636 + * Looks to see if this snapshot already has a pending exception
5637 + * for this chunk, otherwise it allocates a new one and inserts
5638 + * it into the pending table.
5640 +static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5641 + struct buffer_head *bh)
5643 + struct exception *e;
5644 + struct pending_exception *pe;
5645 + chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5648 + * Is there a pending exception for this already ?
5650 + e = lookup_exception(&s->pending, chunk);
5652 + /* cast the exception to a pending exception */
5653 + pe = list_entry(e, struct pending_exception, e);
5656 + /* Create a new pending exception */
5657 + pe = alloc_pending_exception();
5658 + pe->e.old_chunk = chunk;
5659 + pe->origin_bhs = pe->snapshot_bhs = NULL;
5660 + INIT_LIST_HEAD(&pe->siblings);
5664 + if (s->store.prepare_exception(&s->store, &pe->e)) {
5665 + free_pending_exception(pe);
5670 + insert_exception(&s->pending, &pe->e);
5676 +static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5677 + struct buffer_head *bh)
5679 + bh->b_rdev = s->cow->dev;
5680 + bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5681 + (bh->b_rsector & s->chunk_mask);
5684 +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5685 + union map_info *map_context)
5687 + struct exception *e;
5688 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5691 + struct pending_exception *pe;
5693 + chunk = sector_to_chunk(s, bh->b_rsector);
5695 + /* Full snapshots are not usable */
5700 + * Write to snapshot - higher level takes care of RW/RO
5701 + * flags so we should only get this if we are
5704 + if (rw == WRITE) {
5706 + down_write(&s->lock);
5708 + /* If the block is already remapped - use that, else remap it */
5709 + e = lookup_exception(&s->complete, chunk);
5711 + remap_exception(s, e, bh);
5714 + pe = find_pending_exception(s, bh);
5717 + s->store.drop_snapshot(&s->store);
5721 + remap_exception(s, &pe->e, bh);
5722 + queue_buffer(&pe->snapshot_bhs, bh);
5728 + up_write(&s->lock);
5732 + * FIXME: this read path scares me because we
5733 + * always use the origin when we have a pending
5734 + * exception. However I can't think of a
5735 + * situation where this is wrong - ejt.
5739 + down_read(&s->lock);
5741 + /* See if it it has been remapped */
5742 + e = lookup_exception(&s->complete, chunk);
5744 + remap_exception(s, e, bh);
5746 + bh->b_rdev = s->origin->dev;
5748 + up_read(&s->lock);
5754 +void snapshot_resume(struct dm_target *ti)
5756 + struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5758 + if (s->have_metadata)
5761 + if (s->store.read_metadata(&s->store)) {
5762 + down_write(&s->lock);
5764 + up_write(&s->lock);
5767 + s->have_metadata = 1;
5770 +static int snapshot_status(struct dm_target *ti, status_type_t type,
5771 + char *result, unsigned int maxlen)
5773 + struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5778 + case STATUSTYPE_INFO:
5780 + snprintf(result, maxlen, "Invalid");
5782 + if (snap->store.fraction_full) {
5783 + sector_t numerator, denominator;
5784 + snap->store.fraction_full(&snap->store,
5787 + snprintf(result, maxlen,
5788 + SECTOR_FORMAT "/" SECTOR_FORMAT,
5789 + numerator, denominator);
5792 + snprintf(result, maxlen, "Unknown");
5796 + case STATUSTYPE_TABLE:
5798 + * kdevname returns a static pointer so we need
5799 + * to make private copies if the output is to
5802 + strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5803 + strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5804 + snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5805 + snap->type, snap->chunk_size);
5812 +/*-----------------------------------------------------------------
5814 + *---------------------------------------------------------------*/
5815 +static void list_merge(struct list_head *l1, struct list_head *l2)
5817 + struct list_head *l1_n, *l2_p;
5825 + l2_p->next = l1_n;
5826 + l1_n->prev = l2_p;
5829 +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5831 + int r = 1, first = 1;
5832 + struct list_head *sl;
5833 + struct dm_snapshot *snap;
5834 + struct exception *e;
5835 + struct pending_exception *pe, *last = NULL;
5838 + /* Do all the snapshots on this origin */
5839 + list_for_each(sl, snapshots) {
5840 + snap = list_entry(sl, struct dm_snapshot, list);
5842 + /* Only deal with valid snapshots */
5846 + down_write(&snap->lock);
5849 + * Remember, different snapshots can have
5850 + * different chunk sizes.
5852 + chunk = sector_to_chunk(snap, bh->b_rsector);
5855 + * Check exception table to see if block
5856 + * is already remapped in this snapshot
5857 + * and trigger an exception if not.
5859 + e = lookup_exception(&snap->complete, chunk);
5861 + pe = find_pending_exception(snap, bh);
5863 + snap->store.drop_snapshot(&snap->store);
5868 + list_merge(&pe->siblings,
5876 + up_write(&snap->lock);
5880 + * Now that we have a complete pe list we can start the copying.
5885 + down_write(&pe->snap->lock);
5887 + queue_buffer(&pe->origin_bhs, bh);
5889 + up_write(&pe->snap->lock);
5891 + pe = list_entry(pe->siblings.next,
5892 + struct pending_exception, siblings);
5894 + } while (pe != last);
5901 + * Called on a write from the origin driver.
5903 +int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5908 + down_read(&_origins_lock);
5909 + o = __lookup_origin(origin->dev);
5913 + r = __origin_write(&o->snapshots, bh);
5914 + up_read(&_origins_lock);
5920 + * Origin: maps a linear range of a device, with hooks for snapshotting.
5924 + * Construct an origin mapping: <dev_path>
5925 + * The context for an origin is merely a 'struct dm_dev *'
5926 + * pointing to the real device.
5928 +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5931 + struct dm_dev *dev;
5934 + ti->error = "dm-origin: incorrect number of arguments";
5938 + r = dm_get_device(ti, argv[0], 0, ti->len,
5939 + dm_table_get_mode(ti->table), &dev);
5941 + ti->error = "Cannot get target device";
5945 + ti->private = dev;
5949 +static void origin_dtr(struct dm_target *ti)
5951 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5952 + dm_put_device(ti, dev);
5955 +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5956 + union map_info *map_context)
5958 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5959 + bh->b_rdev = dev->dev;
5961 + /* Only tell snapshots if this is a write */
5962 + return (rw == WRITE) ? do_origin(dev, bh) : 1;
5965 +static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5966 + unsigned int maxlen)
5968 + struct dm_dev *dev = (struct dm_dev *) ti->private;
5971 + case STATUSTYPE_INFO:
5975 + case STATUSTYPE_TABLE:
5976 + snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5983 +static struct target_type origin_target = {
5984 + name: "snapshot-origin",
5985 + module: THIS_MODULE,
5989 + status: origin_status,
5992 +static struct target_type snapshot_target = {
5994 + module: THIS_MODULE,
5995 + ctr: snapshot_ctr,
5996 + dtr: snapshot_dtr,
5997 + map: snapshot_map,
5998 + resume: snapshot_resume,
5999 + status: snapshot_status,
6002 +int __init dm_snapshot_init(void)
6006 + r = dm_register_target(&snapshot_target);
6008 + DMERR("snapshot target register failed %d", r);
6012 + r = dm_register_target(&origin_target);
6014 + DMERR("Device mapper: Origin: register failed %d\n", r);
6018 + r = init_origin_hash();
6020 + DMERR("init_origin_hash failed.");
6024 + exception_cache = kmem_cache_create("dm-snapshot-ex",
6025 + sizeof(struct exception),
6026 + __alignof__(struct exception),
6028 + if (!exception_cache) {
6029 + DMERR("Couldn't create exception cache.");
6035 + kmem_cache_create("dm-snapshot-in",
6036 + sizeof(struct pending_exception),
6037 + __alignof__(struct pending_exception),
6039 + if (!pending_cache) {
6040 + DMERR("Couldn't create pending cache.");
6045 + pending_pool = mempool_create(128, mempool_alloc_slab,
6046 + mempool_free_slab, pending_cache);
6047 + if (!pending_pool) {
6048 + DMERR("Couldn't create pending pool.");
6056 + kmem_cache_destroy(pending_cache);
6058 + kmem_cache_destroy(exception_cache);
6060 + exit_origin_hash();
6062 + dm_unregister_target(&origin_target);
6064 + dm_unregister_target(&snapshot_target);
6068 +void dm_snapshot_exit(void)
6072 + r = dm_unregister_target(&snapshot_target);
6074 + DMERR("snapshot unregister failed %d", r);
6076 + r = dm_unregister_target(&origin_target);
6078 + DMERR("origin unregister failed %d", r);
6080 + exit_origin_hash();
6081 + mempool_destroy(pending_pool);
6082 + kmem_cache_destroy(pending_cache);
6083 + kmem_cache_destroy(exception_cache);
6085 --- diff/drivers/md/dm-snapshot.h 1970-01-01 01:00:00.000000000 +0100
6086 +++ source/drivers/md/dm-snapshot.h 2003-10-16 10:44:23.000000000 +0100
6091 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
6093 + * This file is released under the GPL.
6096 +#ifndef DM_SNAPSHOT_H
6097 +#define DM_SNAPSHOT_H
6100 +#include <linux/blkdev.h>
6102 +struct exception_table {
6103 + uint32_t hash_mask;
6104 + struct list_head *table;
6108 + * The snapshot code deals with largish chunks of the disk at a
6109 + * time. Typically 64k - 256k.
6111 +/* FIXME: can we get away with limiting these to a uint32_t ? */
6112 +typedef sector_t chunk_t;
6115 + * An exception is used where an old chunk of data has been
6116 + * replaced by a new one.
6119 + struct list_head hash_list;
6121 + chunk_t old_chunk;
6122 + chunk_t new_chunk;
6126 + * Abstraction to handle the meta/layout of exception stores (the
6129 +struct exception_store {
6132 + * Destroys this object when you've finished with it.
6134 + void (*destroy) (struct exception_store *store);
6137 + * The target shouldn't read the COW device until this is
6140 + int (*read_metadata) (struct exception_store *store);
6143 + * Find somewhere to store the next exception.
6145 + int (*prepare_exception) (struct exception_store *store,
6146 + struct exception *e);
6149 + * Update the metadata with this exception.
6151 + void (*commit_exception) (struct exception_store *store,
6152 + struct exception *e,
6153 + void (*callback) (void *, int success),
6154 + void *callback_context);
6157 + * The snapshot is invalid, note this in the metadata.
6159 + void (*drop_snapshot) (struct exception_store *store);
6162 + * Return how full the snapshot is.
6164 + void (*fraction_full) (struct exception_store *store,
6165 + sector_t *numerator,
6166 + sector_t *denominator);
6168 + struct dm_snapshot *snap;
6172 +struct dm_snapshot {
6173 + struct rw_semaphore lock;
6174 + struct dm_table *table;
6176 + struct dm_dev *origin;
6177 + struct dm_dev *cow;
6179 + /* List of snapshots per Origin */
6180 + struct list_head list;
6182 + /* Size of data blocks saved - must be a power of 2 */
6183 + chunk_t chunk_size;
6184 + chunk_t chunk_mask;
6185 + chunk_t chunk_shift;
6187 + /* You can't use a snapshot if this is 0 (e.g. if full) */
6189 + int have_metadata;
6191 + /* Used for display of table */
6194 + /* The last percentage we notified */
6197 + struct exception_table pending;
6198 + struct exception_table complete;
6200 + /* The on disk metadata handler */
6201 + struct exception_store store;
6203 + struct kcopyd_client *kcopyd_client;
6207 + * Used by the exception stores to load exceptions when
6210 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
6213 + * Constructor and destructor for the default persistent
6216 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
6218 +int dm_create_transient(struct exception_store *store,
6219 + struct dm_snapshot *s, int blocksize);
6222 + * Return the number of sectors in the device.
6224 +static inline sector_t get_dev_size(kdev_t dev)
6228 + sizes = blk_size[MAJOR(dev)];
6230 + return sizes[MINOR(dev)] << 1;
6235 +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
6237 + return (sector & ~s->chunk_mask) >> s->chunk_shift;
6240 +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
6242 + return chunk << s->chunk_shift;
6246 --- diff/drivers/md/dm-stripe.c 1970-01-01 01:00:00.000000000 +0100
6247 +++ source/drivers/md/dm-stripe.c 2003-10-16 10:44:23.000000000 +0100
6250 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6252 + * This file is released under the GPL.
6257 +#include <linux/module.h>
6258 +#include <linux/init.h>
6259 +#include <linux/blkdev.h>
6260 +#include <linux/slab.h>
6263 + struct dm_dev *dev;
6264 + sector_t physical_start;
6270 + /* The size of this target / num. stripes */
6271 + uint32_t stripe_width;
6273 + /* stripe chunk size */
6274 + uint32_t chunk_shift;
6275 + sector_t chunk_mask;
6277 + struct stripe stripe[0];
6280 +static inline struct stripe_c *alloc_context(unsigned int stripes)
6284 + if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
6288 + len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
6290 + return kmalloc(len, GFP_KERNEL);
6294 + * Parse a single <dev> <sector> pair
6296 +static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
6297 + unsigned int stripe, char **argv)
6301 + if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
6304 + if (dm_get_device(ti, argv[0], start, sc->stripe_width,
6305 + dm_table_get_mode(ti->table),
6306 + &sc->stripe[stripe].dev))
6309 + sc->stripe[stripe].physical_start = start;
6314 + * FIXME: Nasty function, only present because we can't link
6315 + * against __moddi3 and __divdi3.
6317 + * returns a == b * n
6319 +static int multiple(sector_t a, sector_t b, sector_t *n)
6321 + sector_t acc, prev, i;
6325 + for (acc = b, prev = 0, i = 1;
6327 + prev = acc, acc <<= 1, i <<= 1)
6338 + * Construct a striped mapping.
6339 + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
6341 +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
6343 + struct stripe_c *sc;
6346 + uint32_t chunk_size;
6352 + ti->error = "dm-stripe: Not enough arguments";
6356 + stripes = simple_strtoul(argv[0], &end, 10);
6358 + ti->error = "dm-stripe: Invalid stripe count";
6362 + chunk_size = simple_strtoul(argv[1], &end, 10);
6364 + ti->error = "dm-stripe: Invalid chunk_size";
6369 + * chunk_size is a power of two
6371 + if (!chunk_size || (chunk_size & (chunk_size - 1))) {
6372 + ti->error = "dm-stripe: Invalid chunk size";
6376 + if (!multiple(ti->len, stripes, &width)) {
6377 + ti->error = "dm-stripe: Target length not divisable by "
6378 + "number of stripes";
6383 + * Do we have enough arguments for that many stripes ?
6385 + if (argc != (2 + 2 * stripes)) {
6386 + ti->error = "dm-stripe: Not enough destinations specified";
6390 + sc = alloc_context(stripes);
6392 + ti->error = "dm-stripe: Memory allocation for striped context "
6397 + sc->stripes = stripes;
6398 + sc->stripe_width = width;
6400 + sc->chunk_mask = ((sector_t) chunk_size) - 1;
6401 + for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
6403 + sc->chunk_shift--;
6406 + * Get the stripe destinations.
6408 + for (i = 0; i < stripes; i++) {
6411 + r = get_stripe(ti, sc, i, argv);
6413 + ti->error = "dm-stripe: Couldn't parse stripe "
6416 + dm_put_device(ti, sc->stripe[i].dev);
6426 +static void stripe_dtr(struct dm_target *ti)
6429 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6431 + for (i = 0; i < sc->stripes; i++)
6432 + dm_put_device(ti, sc->stripe[i].dev);
6437 +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6438 + union map_info *context)
6440 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6442 + sector_t offset = bh->b_rsector - ti->begin;
6443 + uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
6444 + uint32_t stripe = chunk % sc->stripes; /* 32bit modulus */
6445 + chunk = chunk / sc->stripes;
6447 + bh->b_rdev = sc->stripe[stripe].dev->dev;
6448 + bh->b_rsector = sc->stripe[stripe].physical_start +
6449 + (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
6453 +static int stripe_status(struct dm_target *ti, status_type_t type,
6454 + char *result, unsigned int maxlen)
6456 + struct stripe_c *sc = (struct stripe_c *) ti->private;
6461 + case STATUSTYPE_INFO:
6465 + case STATUSTYPE_TABLE:
6466 + offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
6467 + sc->stripes, sc->chunk_mask + 1);
6468 + for (i = 0; i < sc->stripes; i++) {
6470 + snprintf(result + offset, maxlen - offset,
6471 + " %s " SECTOR_FORMAT,
6472 + dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
6473 + sc->stripe[i].physical_start);
6480 +static struct target_type stripe_target = {
6481 + .name = "striped",
6482 + .module = THIS_MODULE,
6483 + .ctr = stripe_ctr,
6484 + .dtr = stripe_dtr,
6485 + .map = stripe_map,
6486 + .status = stripe_status,
6489 +int __init dm_stripe_init(void)
6493 + r = dm_register_target(&stripe_target);
6495 + DMWARN("striped target registration failed");
6500 +void dm_stripe_exit(void)
6502 + if (dm_unregister_target(&stripe_target))
6503 + DMWARN("striped target unregistration failed");
6507 --- diff/drivers/md/dm-table.c 1970-01-01 01:00:00.000000000 +0100
6508 +++ source/drivers/md/dm-table.c 2003-10-16 10:44:23.000000000 +0100
6511 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6513 + * This file is released under the GPL.
6518 +#include <linux/module.h>
6519 +#include <linux/vmalloc.h>
6520 +#include <linux/blkdev.h>
6521 +#include <linux/ctype.h>
6522 +#include <linux/slab.h>
6523 +#include <asm/atomic.h>
6525 +#define MAX_DEPTH 16
6526 +#define NODE_SIZE L1_CACHE_BYTES
6527 +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
6528 +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
6534 + unsigned int depth;
6535 + unsigned int counts[MAX_DEPTH]; /* in nodes */
6536 + sector_t *index[MAX_DEPTH];
6538 + unsigned int num_targets;
6539 + unsigned int num_allocated;
6541 + struct dm_target *targets;
6544 + * Indicates the rw permissions for the new logical
6545 + * device. This should be a combination of FMODE_READ
6546 + * and FMODE_WRITE.
6550 + /* a list of devices used by this table */
6551 + struct list_head devices;
6553 + /* events get handed up using this callback */
6554 + void (*event_fn)(void *);
6555 + void *event_context;
6559 + * Similar to ceiling(log_size(n))
6561 +static unsigned int int_log(unsigned long n, unsigned long base)
6566 + n = dm_div_up(n, base);
6574 + * Calculate the index of the child node of the n'th node k'th key.
6576 +static inline unsigned int get_child(unsigned int n, unsigned int k)
6578 + return (n * CHILDREN_PER_NODE) + k;
6582 + * Return the n'th node of level l from table t.
6584 +static inline sector_t *get_node(struct dm_table *t, unsigned int l,
6587 + return t->index[l] + (n * KEYS_PER_NODE);
6591 + * Return the highest key that you could lookup from the n'th
6592 + * node on level l of the btree.
6594 +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
6596 + for (; l < t->depth - 1; l++)
6597 + n = get_child(n, CHILDREN_PER_NODE - 1);
6599 + if (n >= t->counts[l])
6600 + return (sector_t) - 1;
6602 + return get_node(t, l, n)[KEYS_PER_NODE - 1];
6606 + * Fills in a level of the btree based on the highs of the level
6609 +static int setup_btree_index(unsigned int l, struct dm_table *t)
6611 + unsigned int n, k;
6614 + for (n = 0U; n < t->counts[l]; n++) {
6615 + node = get_node(t, l, n);
6617 + for (k = 0U; k < KEYS_PER_NODE; k++)
6618 + node[k] = high(t, l + 1, get_child(n, k));
6625 + * highs, and targets are managed as dynamic arrays during a
6628 +static int alloc_targets(struct dm_table *t, unsigned int num)
6630 + sector_t *n_highs;
6631 + struct dm_target *n_targets;
6632 + int n = t->num_targets;
6635 + * Allocate both the target array and offset array at once.
6637 + n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
6638 + sizeof(sector_t), num);
6642 + n_targets = (struct dm_target *) (n_highs + num);
6645 + memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
6646 + memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
6649 + memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
6652 + t->num_allocated = num;
6653 + t->highs = n_highs;
6654 + t->targets = n_targets;
6659 +int dm_table_create(struct dm_table **result, int mode)
6661 + struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
6666 + memset(t, 0, sizeof(*t));
6667 + INIT_LIST_HEAD(&t->devices);
6668 + atomic_set(&t->holders, 1);
6670 + /* allocate a single nodes worth of targets to begin with */
6671 + if (alloc_targets(t, KEYS_PER_NODE)) {
6682 +static void free_devices(struct list_head *devices)
6684 + struct list_head *tmp, *next;
6686 + for (tmp = devices->next; tmp != devices; tmp = next) {
6687 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6693 +void table_destroy(struct dm_table *t)
6697 + /* free the indexes (see dm_table_complete) */
6698 + if (t->depth >= 2)
6699 + vfree(t->index[t->depth - 2]);
6701 + /* free the targets */
6702 + for (i = 0; i < t->num_targets; i++) {
6703 + struct dm_target *tgt = t->targets + i;
6705 + if (tgt->type->dtr)
6706 + tgt->type->dtr(tgt);
6708 + dm_put_target_type(tgt->type);
6713 + /* free the device list */
6714 + if (t->devices.next != &t->devices) {
6715 + DMWARN("devices still present during destroy: "
6716 + "dm_table_remove_device calls missing");
6718 + free_devices(&t->devices);
6724 +void dm_table_get(struct dm_table *t)
6726 + atomic_inc(&t->holders);
6729 +void dm_table_put(struct dm_table *t)
6731 + if (atomic_dec_and_test(&t->holders))
6736 + * Checks to see if we need to extend highs or targets.
6738 +static inline int check_space(struct dm_table *t)
6740 + if (t->num_targets >= t->num_allocated)
6741 + return alloc_targets(t, t->num_allocated * 2);
6747 + * Convert a device path to a dev_t.
6749 +static int lookup_device(const char *path, kdev_t *dev)
6752 + struct nameidata nd;
6753 + struct inode *inode;
6755 + if (!path_init(path, LOOKUP_FOLLOW, &nd))
6758 + if ((r = path_walk(path, &nd)))
6761 + inode = nd.dentry->d_inode;
6767 + if (!S_ISBLK(inode->i_mode)) {
6772 + *dev = inode->i_rdev;
6775 + path_release(&nd);
6780 + * See if we've already got a device in the list.
6782 +static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6784 + struct list_head *tmp;
6786 + list_for_each(tmp, l) {
6787 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6788 + if (kdev_same(dd->dev, dev))
6796 + * Open a device so we can use it as a map destination.
6798 +static int open_dev(struct dm_dev *dd)
6803 + dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6807 + return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6811 + * Close a device that we've been using.
6813 +static void close_dev(struct dm_dev *dd)
6818 + blkdev_put(dd->bdev, BDEV_RAW);
6823 + * If possible (ie. blk_size[major] is set), this checks an area
6824 + * of a destination device is valid.
6826 +static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6829 + sector_t dev_size;
6831 + if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6832 + /* we don't know the device details,
6833 + * so give the benefit of the doubt */
6836 + /* convert to 512-byte sectors */
6839 + return ((start < dev_size) && (len <= (dev_size - start)));
6843 + * This upgrades the mode on an already open dm_dev. Being
6844 + * careful to leave things as they were if we fail to reopen the
6847 +static int upgrade_mode(struct dm_dev *dd, int new_mode)
6850 + struct dm_dev dd_copy;
6852 + memcpy(&dd_copy, dd, sizeof(dd_copy));
6854 + dd->mode |= new_mode;
6858 + close_dev(&dd_copy);
6860 + memcpy(dd, &dd_copy, sizeof(dd_copy));
6866 + * Add a device to the list, or just increment the usage count if
6867 + * it's already present.
6869 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6870 + sector_t len, int mode, struct dm_dev **result)
6874 + struct dm_dev *dd;
6875 + unsigned major, minor;
6876 + struct dm_table *t = ti->table;
6881 + if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6882 + /* Extract the major/minor numbers */
6883 + dev = mk_kdev(major, minor);
6885 + /* convert the path to a device */
6886 + if ((r = lookup_device(path, &dev)))
6890 + dd = find_device(&t->devices, dev);
6892 + dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6900 + if ((r = open_dev(dd))) {
6905 + atomic_set(&dd->count, 0);
6906 + list_add(&dd->list, &t->devices);
6908 + } else if (dd->mode != (mode | dd->mode)) {
6909 + r = upgrade_mode(dd, mode);
6913 + atomic_inc(&dd->count);
6915 + if (!check_device_area(dd->dev, start, len)) {
6916 + DMWARN("device %s too small for target", path);
6917 + dm_put_device(ti, dd);
6927 + * Decrement a device's use count and remove it if necessary.
6929 +void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6931 + if (atomic_dec_and_test(&dd->count)) {
6933 + list_del(&dd->list);
6939 + * Checks to see if the target joins onto the end of the table.
6941 +static int adjoin(struct dm_table *table, struct dm_target *ti)
6943 + struct dm_target *prev;
6945 + if (!table->num_targets)
6946 + return !ti->begin;
6948 + prev = &table->targets[table->num_targets - 1];
6949 + return (ti->begin == (prev->begin + prev->len));
6953 + * Destructively splits up the argument list to pass to ctr.
6955 +static int split_args(int max, int *argc, char **argv, char *input)
6957 + char *start, *end = input, *out;
6963 + /* Skip whitespace */
6964 + while (*start && isspace(*start))
6968 + break; /* success, we hit the end */
6970 + /* 'out' is used to remove any back-quotes */
6971 + end = out = start;
6973 + /* Everything apart from '\0' can be quoted */
6974 + if (*end == '\\' && *(end + 1)) {
6975 + *out++ = *(end + 1);
6980 + if (isspace(*end))
6981 + break; /* end of token */
6986 + /* have we already filled the array ? */
6987 + if ((*argc + 1) > max)
6990 + /* we know this is whitespace */
6994 + /* terminate the string and put it in the array */
6996 + argv[*argc] = start;
7003 +int dm_table_add_target(struct dm_table *t, const char *type,
7004 + sector_t start, sector_t len, char *params)
7006 + int r = -EINVAL, argc;
7008 + struct dm_target *tgt;
7010 + if ((r = check_space(t)))
7013 + tgt = t->targets + t->num_targets;
7014 + memset(tgt, 0, sizeof(*tgt));
7016 + tgt->type = dm_get_target_type(type);
7018 + tgt->error = "unknown target type";
7023 + tgt->begin = start;
7025 + tgt->error = "Unknown error";
7028 + * Does this target adjoin the previous one ?
7030 + if (!adjoin(t, tgt)) {
7031 + tgt->error = "Gap in table";
7036 + r = split_args(ARRAY_SIZE(argv), &argc, argv, params);
7038 + tgt->error = "couldn't split parameters";
7042 + r = tgt->type->ctr(tgt, argc, argv);
7046 + t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
7050 + printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
7051 + dm_put_target_type(tgt->type);
7055 +static int setup_indexes(struct dm_table *t)
7058 + unsigned int total = 0;
7059 + sector_t *indexes;
7061 + /* allocate the space for *all* the indexes */
7062 + for (i = t->depth - 2; i >= 0; i--) {
7063 + t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
7064 + total += t->counts[i];
7067 + indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
7071 + /* set up internal nodes, bottom-up */
7072 + for (i = t->depth - 2, total = 0; i >= 0; i--) {
7073 + t->index[i] = indexes;
7074 + indexes += (KEYS_PER_NODE * t->counts[i]);
7075 + setup_btree_index(i, t);
7082 + * Builds the btree to index the map.
7084 +int dm_table_complete(struct dm_table *t)
7087 + unsigned int leaf_nodes;
7089 + /* how many indexes will the btree have ? */
7090 + leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
7091 + t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
7093 + /* leaf layer has already been set up */
7094 + t->counts[t->depth - 1] = leaf_nodes;
7095 + t->index[t->depth - 1] = t->highs;
7097 + if (t->depth >= 2)
7098 + r = setup_indexes(t);
7103 +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
7104 +void dm_table_event_callback(struct dm_table *t,
7105 + void (*fn)(void *), void *context)
7107 + spin_lock_irq(&_event_lock);
7109 + t->event_context = context;
7110 + spin_unlock_irq(&_event_lock);
7113 +void dm_table_event(struct dm_table *t)
7115 + spin_lock(&_event_lock);
7117 + t->event_fn(t->event_context);
7118 + spin_unlock(&_event_lock);
7121 +sector_t dm_table_get_size(struct dm_table *t)
7123 + return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
7126 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
7128 + if (index > t->num_targets)
7131 + return t->targets + index;
7135 + * Search the btree for the correct target.
7137 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
7139 + unsigned int l, n = 0, k = 0;
7142 + for (l = 0; l < t->depth; l++) {
7143 + n = get_child(n, k);
7144 + node = get_node(t, l, n);
7146 + for (k = 0; k < KEYS_PER_NODE; k++)
7147 + if (node[k] >= sector)
7151 + return &t->targets[(KEYS_PER_NODE * n) + k];
7154 +unsigned int dm_table_get_num_targets(struct dm_table *t)
7156 + return t->num_targets;
7159 +struct list_head *dm_table_get_devices(struct dm_table *t)
7161 + return &t->devices;
7164 +int dm_table_get_mode(struct dm_table *t)
7169 +void dm_table_suspend_targets(struct dm_table *t)
7173 + for (i = 0; i < t->num_targets; i++) {
7174 + struct dm_target *ti = t->targets + i;
7176 + if (ti->type->suspend)
7177 + ti->type->suspend(ti);
7181 +void dm_table_resume_targets(struct dm_table *t)
7185 + for (i = 0; i < t->num_targets; i++) {
7186 + struct dm_target *ti = t->targets + i;
7188 + if (ti->type->resume)
7189 + ti->type->resume(ti);
7193 +EXPORT_SYMBOL(dm_get_device);
7194 +EXPORT_SYMBOL(dm_put_device);
7195 +EXPORT_SYMBOL(dm_table_event);
7196 +EXPORT_SYMBOL(dm_table_get_mode);
7197 --- diff/drivers/md/dm-target.c 1970-01-01 01:00:00.000000000 +0100
7198 +++ source/drivers/md/dm-target.c 2003-10-16 10:44:23.000000000 +0100
7201 + * Copyright (C) 2001 Sistina Software (UK) Limited
7203 + * This file is released under the GPL.
7208 +#include <linux/module.h>
7209 +#include <linux/kmod.h>
7210 +#include <linux/slab.h>
7212 +struct tt_internal {
7213 + struct target_type tt;
7215 + struct list_head list;
7219 +static LIST_HEAD(_targets);
7220 +static DECLARE_RWSEM(_lock);
7222 +#define DM_MOD_NAME_SIZE 32
7224 +static inline struct tt_internal *__find_target_type(const char *name)
7226 + struct list_head *tih;
7227 + struct tt_internal *ti;
7229 + list_for_each(tih, &_targets) {
7230 + ti = list_entry(tih, struct tt_internal, list);
7232 + if (!strcmp(name, ti->tt.name))
7239 +static struct tt_internal *get_target_type(const char *name)
7241 + struct tt_internal *ti;
7243 + down_read(&_lock);
7244 + ti = __find_target_type(name);
7247 + if (ti->use == 0 && ti->tt.module)
7248 + __MOD_INC_USE_COUNT(ti->tt.module);
7256 +static void load_module(const char *name)
7258 + char module_name[DM_MOD_NAME_SIZE] = "dm-";
7260 + /* Length check for strcat() below */
7261 + if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
7264 + strcat(module_name, name);
7265 + request_module(module_name);
7268 +struct target_type *dm_get_target_type(const char *name)
7270 + struct tt_internal *ti = get_target_type(name);
7273 + load_module(name);
7274 + ti = get_target_type(name);
7277 + return ti ? &ti->tt : NULL;
7280 +void dm_put_target_type(struct target_type *t)
7282 + struct tt_internal *ti = (struct tt_internal *) t;
7284 + down_read(&_lock);
7285 + if (--ti->use == 0 && ti->tt.module)
7286 + __MOD_DEC_USE_COUNT(ti->tt.module);
7295 +static struct tt_internal *alloc_target(struct target_type *t)
7297 + struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
7300 + memset(ti, 0, sizeof(*ti));
7307 +int dm_register_target(struct target_type *t)
7310 + struct tt_internal *ti = alloc_target(t);
7315 + down_write(&_lock);
7316 + if (__find_target_type(t->name)) {
7320 + list_add(&ti->list, &_targets);
7326 +int dm_unregister_target(struct target_type *t)
7328 + struct tt_internal *ti;
7330 + down_write(&_lock);
7331 + if (!(ti = __find_target_type(t->name))) {
7341 + list_del(&ti->list);
7349 + * io-err: always fails an io, useful for bringing
7350 + * up LVs that have holes in them.
7352 +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
7357 +static void io_err_dtr(struct dm_target *ti)
7362 +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
7363 + union map_info *map_context)
7368 +static struct target_type error_target = {
7370 + .ctr = io_err_ctr,
7371 + .dtr = io_err_dtr,
7372 + .map = io_err_map,
7375 +int dm_target_init(void)
7377 + return dm_register_target(&error_target);
7380 +void dm_target_exit(void)
7382 + if (dm_unregister_target(&error_target))
7383 + DMWARN("error target unregistration failed");
7386 +EXPORT_SYMBOL(dm_register_target);
7387 +EXPORT_SYMBOL(dm_unregister_target);
7388 --- diff/drivers/md/dm.c 1970-01-01 01:00:00.000000000 +0100
7389 +++ source/drivers/md/dm.c 2003-10-16 10:44:23.000000000 +0100
7392 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
7394 + * This file is released under the GPL.
7398 +#include "kcopyd.h"
7400 +#include <linux/init.h>
7401 +#include <linux/module.h>
7402 +#include <linux/blk.h>
7403 +#include <linux/blkpg.h>
7404 +#include <linux/mempool.h>
7405 +#include <linux/slab.h>
7406 +#include <linux/major.h>
7407 +#include <linux/kdev_t.h>
7408 +#include <linux/lvm.h>
7410 +#include <asm/uaccess.h>
7412 +static const char *_name = DM_NAME;
7413 +#define DEFAULT_READ_AHEAD 64
7416 + struct mapped_device *md;
7418 + struct dm_target *ti;
7420 + union map_info map_context;
7421 + void (*end_io) (struct buffer_head * bh, int uptodate);
7425 +struct deferred_io {
7427 + struct buffer_head *bh;
7428 + struct deferred_io *next;
7432 + * Bits for the md->flags field.
7434 +#define DMF_BLOCK_IO 0
7435 +#define DMF_SUSPENDED 1
7437 +struct mapped_device {
7438 + struct rw_semaphore lock;
7442 + unsigned long flags;
7445 + * A list of ios that arrived while we were suspended.
7448 + wait_queue_head_t wait;
7449 + struct deferred_io *deferred;
7452 + * The current mapping.
7454 + struct dm_table *map;
7457 + * io objects are allocated from here.
7459 + mempool_t *io_pool;
7464 + uint32_t event_nr;
7465 + wait_queue_head_t eventq;
7468 +#define MIN_IOS 256
7469 +static kmem_cache_t *_io_cache;
7471 +static struct mapped_device *get_kdev(kdev_t dev);
7472 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
7473 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
7475 +/*-----------------------------------------------------------------
7476 + * In order to avoid the 256 minor number limit we are going to
7477 + * register more major numbers as necessary.
7478 + *---------------------------------------------------------------*/
7479 +#define MAX_MINORS (1 << MINORBITS)
7481 +struct major_details {
7482 + unsigned int major;
7485 + struct list_head transient_list;
7487 + unsigned int first_free_minor;
7488 + int nr_free_minors;
7490 + struct mapped_device *mds[MAX_MINORS];
7491 + int blk_size[MAX_MINORS];
7492 + int blksize_size[MAX_MINORS];
7493 + int hardsect_size[MAX_MINORS];
7496 +static struct rw_semaphore _dev_lock;
7497 +static struct major_details *_majors[MAX_BLKDEV];
7500 + * This holds a list of majors that non-specified device numbers
7501 + * may be allocated from. Only majors with free minors appear on
7504 +static LIST_HEAD(_transients_free);
7506 +static int __alloc_major(unsigned int major, struct major_details **result)
7509 + unsigned int transient = !major;
7510 + struct major_details *maj;
7512 + /* Major already allocated? */
7513 + if (major && _majors[major])
7516 + maj = kmalloc(sizeof(*maj), GFP_KERNEL);
7520 + memset(maj, 0, sizeof(*maj));
7521 + INIT_LIST_HEAD(&maj->transient_list);
7523 + maj->nr_free_minors = MAX_MINORS;
7525 + r = register_blkdev(major, _name, &dm_blk_dops);
7527 + DMERR("register_blkdev failed for %d", major);
7534 + maj->major = major;
7537 + maj->transient = transient;
7538 + list_add_tail(&maj->transient_list, &_transients_free);
7541 + _majors[major] = maj;
7543 + blk_size[major] = maj->blk_size;
7544 + blksize_size[major] = maj->blksize_size;
7545 + hardsect_size[major] = maj->hardsect_size;
7546 + read_ahead[major] = DEFAULT_READ_AHEAD;
7548 + blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
7554 +static void __free_major(struct major_details *maj)
7556 + unsigned int major = maj->major;
7558 + list_del(&maj->transient_list);
7560 + read_ahead[major] = 0;
7561 + blk_size[major] = NULL;
7562 + blksize_size[major] = NULL;
7563 + hardsect_size[major] = NULL;
7565 + _majors[major] = NULL;
7568 + if (unregister_blkdev(major, _name) < 0)
7569 + DMERR("devfs_unregister_blkdev failed");
7572 +static void free_all_majors(void)
7574 + unsigned int major = ARRAY_SIZE(_majors);
7576 + down_write(&_dev_lock);
7579 + if (_majors[major])
7580 + __free_major(_majors[major]);
7582 + up_write(&_dev_lock);
7585 +static void free_dev(kdev_t dev)
7587 + unsigned int major = major(dev);
7588 + unsigned int minor = minor(dev);
7589 + struct major_details *maj;
7591 + down_write(&_dev_lock);
7593 + maj = _majors[major];
7597 + maj->mds[minor] = NULL;
7598 + maj->nr_free_minors++;
7600 + if (maj->nr_free_minors == MAX_MINORS) {
7601 + __free_major(maj);
7605 + if (!maj->transient)
7608 + if (maj->nr_free_minors == 1)
7609 + list_add_tail(&maj->transient_list, &_transients_free);
7611 + if (minor < maj->first_free_minor)
7612 + maj->first_free_minor = minor;
7615 + up_write(&_dev_lock);
7618 +static void __alloc_minor(struct major_details *maj, unsigned int minor,
7619 + struct mapped_device *md)
7621 + maj->mds[minor] = md;
7622 + md->dev = mk_kdev(maj->major, minor);
7623 + maj->nr_free_minors--;
7625 + if (maj->transient && !maj->nr_free_minors)
7626 + list_del_init(&maj->transient_list);
7630 + * See if requested kdev_t is available.
7632 +static int specific_dev(kdev_t dev, struct mapped_device *md)
7635 + unsigned int major = major(dev);
7636 + unsigned int minor = minor(dev);
7637 + struct major_details *maj;
7639 + if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
7640 + DMWARN("device number requested out of range (%d, %d)",
7645 + down_write(&_dev_lock);
7646 + maj = _majors[major];
7648 + /* Register requested major? */
7650 + r = __alloc_major(major, &maj);
7654 + major = maj->major;
7657 + if (maj->mds[minor]) {
7662 + __alloc_minor(maj, minor, md);
7665 + up_write(&_dev_lock);
7671 + * Find first unused device number, requesting a new major number if required.
7673 +static int first_free_dev(struct mapped_device *md)
7676 + struct major_details *maj;
7678 + down_write(&_dev_lock);
7680 + if (list_empty(&_transients_free)) {
7681 + r = __alloc_major(0, &maj);
7685 + maj = list_entry(_transients_free.next, struct major_details,
7688 + while (maj->mds[maj->first_free_minor++])
7691 + __alloc_minor(maj, maj->first_free_minor - 1, md);
7694 + up_write(&_dev_lock);
7699 +static struct mapped_device *get_kdev(kdev_t dev)
7701 + struct mapped_device *md;
7702 + struct major_details *maj;
7704 + down_read(&_dev_lock);
7705 + maj = _majors[major(dev)];
7710 + md = maj->mds[minor(dev)];
7714 + up_read(&_dev_lock);
7719 +/*-----------------------------------------------------------------
7721 + *---------------------------------------------------------------*/
7723 +static __init int local_init(void)
7725 + init_rwsem(&_dev_lock);
7727 + /* allocate a slab for the dm_ios */
7728 + _io_cache = kmem_cache_create("dm io",
7729 + sizeof(struct dm_io), 0, 0, NULL, NULL);
7737 +static void local_exit(void)
7739 + kmem_cache_destroy(_io_cache);
7740 + free_all_majors();
7742 + DMINFO("cleaned up");
7746 + * We have a lot of init/exit functions, so it seems easier to
7747 + * store them in an array. The disposable macro 'xx'
7748 + * expands a prefix into a pair of function names.
7751 + int (*init) (void);
7752 + void (*exit) (void);
7755 +#define xx(n) {n ## _init, n ## _exit},
7766 +static int __init dm_init(void)
7768 + const int count = ARRAY_SIZE(_inits);
7772 + for (i = 0; i < count; i++) {
7773 + r = _inits[i].init();
7787 +static void __exit dm_exit(void)
7789 + int i = ARRAY_SIZE(_inits);
7796 + * Block device functions
7798 +static int dm_blk_open(struct inode *inode, struct file *file)
7800 + struct mapped_device *md;
7802 + md = get_kdev(inode->i_rdev);
7809 +static int dm_blk_close(struct inode *inode, struct file *file)
7811 + struct mapped_device *md;
7813 + md = get_kdev(inode->i_rdev);
7814 + dm_put(md); /* put the reference gained by dm_blk_open */
7819 +static inline struct dm_io *alloc_io(struct mapped_device *md)
7821 + return mempool_alloc(md->io_pool, GFP_NOIO);
7824 +static inline void free_io(struct mapped_device *md, struct dm_io *io)
7826 + mempool_free(io, md->io_pool);
7829 +static inline struct deferred_io *alloc_deferred(void)
7831 + return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
7834 +static inline void free_deferred(struct deferred_io *di)
7839 +static inline sector_t volume_size(kdev_t dev)
7841 + return blk_size[major(dev)][minor(dev)] << 1;
7844 +/* FIXME: check this */
7845 +static int dm_blk_ioctl(struct inode *inode, struct file *file,
7846 + unsigned int command, unsigned long a)
7848 + kdev_t dev = inode->i_rdev;
7851 + switch (command) {
7858 + //case BLKRRPART: /* Re-read partition tables */
7864 + return blk_ioctl(dev, command, a);
7868 + size = volume_size(dev);
7869 + if (copy_to_user((void *) a, &size, sizeof(long)))
7873 + case BLKGETSIZE64:
7874 + size = volume_size(dev);
7875 + if (put_user((u64) ((u64) size) << 9, (u64 *) a))
7883 + return dm_user_bmap(inode, (struct lv_bmap *) a);
7886 + DMWARN("unknown block ioctl 0x%x", command);
7894 + * Add the buffer to the list of deferred io.
7896 +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
7898 + struct deferred_io *di;
7900 + di = alloc_deferred();
7904 + down_write(&md->lock);
7906 + if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
7907 + up_write(&md->lock);
7908 + free_deferred(di);
7914 + di->next = md->deferred;
7915 + md->deferred = di;
7917 + up_write(&md->lock);
7918 + return 0; /* deferred successfully */
7922 + * bh->b_end_io routine that decrements the pending count
7923 + * and then calls the original bh->b_end_io fn.
7925 +static void dec_pending(struct buffer_head *bh, int uptodate)
7928 + struct dm_io *io = bh->b_private;
7929 + dm_endio_fn endio = io->ti->type->end_io;
7932 + r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
7933 + &io->map_context);
7938 + /* the target wants another shot at the io */
7942 + if (atomic_dec_and_test(&io->md->pending))
7943 + /* nudge anyone waiting on suspend queue */
7944 + wake_up(&io->md->wait);
7946 + bh->b_end_io = io->end_io;
7947 + bh->b_private = io->context;
7948 + free_io(io->md, io);
7950 + bh->b_end_io(bh, uptodate);
7954 + * Do the bh mapping for a given leaf
7956 +static inline int __map_buffer(struct mapped_device *md, int rw,
7957 + struct buffer_head *bh, struct dm_io *io)
7959 + struct dm_target *ti;
7964 + ti = dm_table_find_target(md->map, bh->b_rsector);
7968 + /* hook the end io request fn */
7969 + atomic_inc(&md->pending);
7973 + io->end_io = bh->b_end_io;
7974 + io->context = bh->b_private;
7975 + bh->b_end_io = dec_pending;
7976 + bh->b_private = io;
7978 + return ti->type->map(ti, bh, rw, &io->map_context);
7982 + * Checks to see if we should be deferring io, if so it queues it
7985 +static inline int __deferring(struct mapped_device *md, int rw,
7986 + struct buffer_head *bh)
7991 + * If we're suspended we have to queue this io for later.
7993 + while (test_bit(DMF_BLOCK_IO, &md->flags)) {
7994 + up_read(&md->lock);
7997 + * There's no point deferring a read ahead
7998 + * request, just drop it.
8000 + if (rw == READA) {
8001 + down_read(&md->lock);
8005 + r = queue_io(md, bh, rw);
8006 + down_read(&md->lock);
8012 + return 1; /* deferred successfully */
8019 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
8023 + struct mapped_device *md;
8025 + md = get_kdev(bh->b_rdev);
8027 + buffer_IO_error(bh);
8031 + io = alloc_io(md);
8032 + down_read(&md->lock);
8034 + r = __deferring(md, rw, bh);
8039 + /* not deferring */
8040 + r = __map_buffer(md, rw, bh, io);
8046 + up_read(&md->lock);
8051 + buffer_IO_error(bh);
8052 + up_read(&md->lock);
8057 +static int check_dev_size(kdev_t dev, unsigned long block)
8059 + unsigned int major = major(dev);
8060 + unsigned int minor = minor(dev);
8062 + /* FIXME: check this */
8063 + unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
8064 + unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
8066 + return (sector > max_sector) ? 0 : 1;
8070 + * Creates a dummy buffer head and maps it (for lilo).
8072 +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
8073 + kdev_t *r_dev, unsigned long *r_block)
8075 + struct buffer_head bh;
8076 + struct dm_target *ti;
8077 + union map_info map_context;
8080 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8084 + if (!check_dev_size(dev, block)) {
8091 + /* setup dummy bh */
8092 + memset(&bh, 0, sizeof(bh));
8093 + bh.b_blocknr = block;
8094 + bh.b_dev = bh.b_rdev = dev;
8095 + bh.b_size = blksize_size[major(dev)][minor(dev)];
8096 + bh.b_rsector = block * (bh.b_size >> 9);
8099 + ti = dm_table_find_target(md->map, bh.b_rsector);
8101 + /* do the mapping */
8102 + r = ti->type->map(ti, &bh, READ, &map_context);
8103 + ti->type->end_io(ti, &bh, READ, 0, &map_context);
8106 + *r_dev = bh.b_rdev;
8107 + *r_block = bh.b_rsector / (bh.b_size >> 9);
8114 + * Marshals arguments and results between user and kernel space.
8116 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
8118 + struct mapped_device *md;
8119 + unsigned long block, r_block;
8123 + if (get_user(block, &lvb->lv_block))
8126 + md = get_kdev(inode->i_rdev);
8130 + down_read(&md->lock);
8131 + r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
8132 + up_read(&md->lock);
8135 + if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
8136 + put_user(r_block, &lvb->lv_block)))
8142 +static void free_md(struct mapped_device *md)
8144 + free_dev(md->dev);
8145 + mempool_destroy(md->io_pool);
8150 + * Allocate and initialise a blank device with a given minor.
8152 +static struct mapped_device *alloc_md(kdev_t dev)
8155 + struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
8158 + DMWARN("unable to allocate device, out of memory.");
8162 + memset(md, 0, sizeof(*md));
8164 + /* Allocate suitable device number */
8166 + r = first_free_dev(md);
8168 + r = specific_dev(dev, md);
8175 + md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
8176 + mempool_free_slab, _io_cache);
8177 + if (!md->io_pool) {
8183 + init_rwsem(&md->lock);
8184 + atomic_set(&md->holders, 1);
8185 + atomic_set(&md->pending, 0);
8186 + init_waitqueue_head(&md->wait);
8187 + init_waitqueue_head(&md->eventq);
8193 + * The hardsect size for a mapped device is the largest hardsect size
8194 + * from the devices it maps onto.
8196 +static int __find_hardsect_size(struct list_head *devices)
8198 + int result = 512, size;
8199 + struct list_head *tmp;
8201 + list_for_each (tmp, devices) {
8202 + struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
8203 + size = get_hardsect_size(dd->dev);
8204 + if (size > result)
8212 + * Bind a table to the device.
8214 +static void event_callback(void *context)
8216 + struct mapped_device *md = (struct mapped_device *) context;
8218 + down_write(&md->lock);
8220 + wake_up_interruptible(&md->eventq);
8221 + up_write(&md->lock);
8224 +static int __bind(struct mapped_device *md, struct dm_table *t)
8226 + unsigned int minor = minor(md->dev);
8227 + unsigned int major = major(md->dev);
8231 + blk_size[major][minor] = dm_table_get_size(t) >> 1;
8232 + blksize_size[major][minor] = BLOCK_SIZE;
8233 + hardsect_size[major][minor] =
8234 + __find_hardsect_size(dm_table_get_devices(t));
8235 + register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
8237 + dm_table_event_callback(md->map, event_callback, md);
8242 +static void __unbind(struct mapped_device *md)
8244 + unsigned int minor = minor(md->dev);
8245 + unsigned int major = major(md->dev);
8248 + dm_table_event_callback(md->map, NULL, NULL);
8249 + dm_table_put(md->map);
8254 + blk_size[major][minor] = 0;
8255 + blksize_size[major][minor] = 0;
8256 + hardsect_size[major][minor] = 0;
8260 + * Constructor for a new device.
8262 +int dm_create(kdev_t dev, struct mapped_device **result)
8264 + struct mapped_device *md;
8266 + md = alloc_md(dev);
8270 + __unbind(md); /* Ensure zero device size */
8276 +void dm_get(struct mapped_device *md)
8278 + atomic_inc(&md->holders);
8281 +void dm_put(struct mapped_device *md)
8283 + if (atomic_dec_and_test(&md->holders)) {
8285 + dm_table_suspend_targets(md->map);
8292 + * Requeue the deferred io by calling generic_make_request.
8294 +static void flush_deferred_io(struct deferred_io *c)
8296 + struct deferred_io *n;
8300 + generic_make_request(c->rw, c->bh);
8307 + * Swap in a new table (destroying old one).
8309 +int dm_swap_table(struct mapped_device *md, struct dm_table *table)
8313 + down_write(&md->lock);
8316 + * The device must be suspended, or have no table bound yet.
8318 + if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
8319 + up_write(&md->lock);
8324 + r = __bind(md, table);
8328 + up_write(&md->lock);
8333 + * We need to be able to change a mapping table under a mounted
8334 + * filesystem. For example we might want to move some data in
8335 + * the background. Before the table can be swapped with
8336 + * dm_bind_table, dm_suspend must be called to flush any in
8337 + * flight io and ensure that any further io gets deferred.
8339 +int dm_suspend(struct mapped_device *md)
8342 + DECLARE_WAITQUEUE(wait, current);
8344 + down_write(&md->lock);
8347 + * First we set the BLOCK_IO flag so no more ios will be
8350 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8351 + up_write(&md->lock);
8355 + set_bit(DMF_BLOCK_IO, &md->flags);
8356 + add_wait_queue(&md->wait, &wait);
8357 + up_write(&md->lock);
8360 + * Then we wait for the already mapped ios to
8363 + run_task_queue(&tq_disk);
8365 + set_current_state(TASK_INTERRUPTIBLE);
8367 + if (!atomic_read(&md->pending) || signal_pending(current))
8372 + set_current_state(TASK_RUNNING);
8374 + down_write(&md->lock);
8375 + remove_wait_queue(&md->wait, &wait);
8377 + /* did we flush everything ? */
8378 + if (atomic_read(&md->pending)) {
8379 + clear_bit(DMF_BLOCK_IO, &md->flags);
8382 + set_bit(DMF_SUSPENDED, &md->flags);
8384 + dm_table_suspend_targets(md->map);
8386 + up_write(&md->lock);
8391 +int dm_resume(struct mapped_device *md)
8393 + struct deferred_io *def;
8395 + down_write(&md->lock);
8396 + if (!test_bit(DMF_SUSPENDED, &md->flags)) {
8397 + up_write(&md->lock);
8402 + dm_table_resume_targets(md->map);
8404 + clear_bit(DMF_SUSPENDED, &md->flags);
8405 + clear_bit(DMF_BLOCK_IO, &md->flags);
8406 + def = md->deferred;
8407 + md->deferred = NULL;
8408 + up_write(&md->lock);
8410 + flush_deferred_io(def);
8411 + run_task_queue(&tq_disk);
8416 +struct dm_table *dm_get_table(struct mapped_device *md)
8418 + struct dm_table *t;
8420 + down_read(&md->lock);
8424 + up_read(&md->lock);
8429 +/*-----------------------------------------------------------------
8430 + * Event notification.
8431 + *---------------------------------------------------------------*/
8432 +uint32_t dm_get_event_nr(struct mapped_device *md)
8436 + down_read(&md->lock);
8438 + up_read(&md->lock);
8443 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8444 + uint32_t event_nr)
8446 + down_write(&md->lock);
8447 + if (event_nr != md->event_nr) {
8448 + up_write(&md->lock);
8452 + add_wait_queue(&md->eventq, wq);
8453 + up_write(&md->lock);
8458 +const char *dm_kdevname(kdev_t dev)
8460 + static char buffer[32];
8461 + sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
8465 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
8467 + down_write(&md->lock);
8468 + remove_wait_queue(&md->eventq, wq);
8469 + up_write(&md->lock);
8472 +kdev_t dm_kdev(struct mapped_device *md)
8476 + down_read(&md->lock);
8478 + up_read(&md->lock);
8483 +int dm_suspended(struct mapped_device *md)
8485 + return test_bit(DMF_SUSPENDED, &md->flags);
8488 +struct block_device_operations dm_blk_dops = {
8489 + .open = dm_blk_open,
8490 + .release = dm_blk_close,
8491 + .ioctl = dm_blk_ioctl,
8492 + .owner = THIS_MODULE
8498 +module_init(dm_init);
8499 +module_exit(dm_exit);
8501 +MODULE_DESCRIPTION(DM_NAME " driver");
8502 +MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
8503 +MODULE_LICENSE("GPL");
8505 +EXPORT_SYMBOL(dm_kdevname);
8506 --- diff/drivers/md/dm.h 1970-01-01 01:00:00.000000000 +0100
8507 +++ source/drivers/md/dm.h 2003-10-16 10:44:23.000000000 +0100
8510 + * Internal header file for device mapper
8512 + * Copyright (C) 2001, 2002 Sistina Software
8514 + * This file is released under the LGPL.
8517 +#ifndef DM_INTERNAL_H
8518 +#define DM_INTERNAL_H
8520 +#include <linux/fs.h>
8521 +#include <linux/device-mapper.h>
8522 +#include <linux/list.h>
8523 +#include <linux/blkdev.h>
8525 +#define DM_NAME "device-mapper"
8526 +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
8527 +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
8528 +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
8531 + * FIXME: I think this should be with the definition of sector_t
8535 +#define SECTOR_FORMAT "%Lu"
8537 +#define SECTOR_FORMAT "%lu"
8540 +#define SECTOR_SHIFT 9
8541 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
8543 +extern struct block_device_operations dm_blk_dops;
8546 + * List of devices that a metadevice uses and should open/close.
8549 + struct list_head list;
8554 + struct block_device *bdev;
8558 +struct mapped_device;
8560 +/*-----------------------------------------------------------------
8561 + * Functions for manipulating a struct mapped_device.
8562 + * Drop the reference with dm_put when you finish with the object.
8563 + *---------------------------------------------------------------*/
8564 +int dm_create(kdev_t dev, struct mapped_device **md);
8567 + * Reference counting for md.
8569 +void dm_get(struct mapped_device *md);
8570 +void dm_put(struct mapped_device *md);
8573 + * A device can still be used while suspended, but I/O is deferred.
8575 +int dm_suspend(struct mapped_device *md);
8576 +int dm_resume(struct mapped_device *md);
8579 + * The device must be suspended before calling this method.
8581 +int dm_swap_table(struct mapped_device *md, struct dm_table *t);
8584 + * Drop a reference on the table when you've finished with the
8587 +struct dm_table *dm_get_table(struct mapped_device *md);
8590 + * Event functions.
8592 +uint32_t dm_get_event_nr(struct mapped_device *md);
8593 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8594 + uint32_t event_nr);
8595 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
8600 +kdev_t dm_kdev(struct mapped_device *md);
8601 +int dm_suspended(struct mapped_device *md);
8603 +/*-----------------------------------------------------------------
8604 + * Functions for manipulating a table. Tables are also reference
8606 + *---------------------------------------------------------------*/
8607 +int dm_table_create(struct dm_table **result, int mode);
8609 +void dm_table_get(struct dm_table *t);
8610 +void dm_table_put(struct dm_table *t);
8612 +int dm_table_add_target(struct dm_table *t, const char *type,
8613 + sector_t start, sector_t len, char *params);
8614 +int dm_table_complete(struct dm_table *t);
8615 +void dm_table_event_callback(struct dm_table *t,
8616 + void (*fn)(void *), void *context);
8617 +void dm_table_event(struct dm_table *t);
8618 +sector_t dm_table_get_size(struct dm_table *t);
8619 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
8620 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
8621 +unsigned int dm_table_get_num_targets(struct dm_table *t);
8622 +struct list_head *dm_table_get_devices(struct dm_table *t);
8623 +int dm_table_get_mode(struct dm_table *t);
8624 +void dm_table_suspend_targets(struct dm_table *t);
8625 +void dm_table_resume_targets(struct dm_table *t);
8627 +/*-----------------------------------------------------------------
8628 + * A registry of target types.
8629 + *---------------------------------------------------------------*/
8630 +int dm_target_init(void);
8631 +void dm_target_exit(void);
8632 +struct target_type *dm_get_target_type(const char *name);
8633 +void dm_put_target_type(struct target_type *t);
8636 +/*-----------------------------------------------------------------
8638 + *---------------------------------------------------------------*/
8639 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
8640 + unsigned long num)
8642 + return (num > (ULONG_MAX - fixed) / obj);
8646 + * ceiling(n / size) * size
8648 +static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
8650 + unsigned long r = n % size;
8651 + return n + (r ? (size - r) : 0);
8655 + * ceiling(n / size)
8657 +static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
8659 + return dm_round_up(n, size) / size;
8662 +const char *dm_kdevname(kdev_t dev);
8665 + * The device-mapper can be driven through one of two interfaces;
8666 + * ioctl or filesystem, depending which patch you have applied.
8668 +int dm_interface_init(void);
8669 +void dm_interface_exit(void);
8672 + * Targets for linear and striped mappings
8674 +int dm_linear_init(void);
8675 +void dm_linear_exit(void);
8677 +int dm_stripe_init(void);
8678 +void dm_stripe_exit(void);
8680 +int dm_snapshot_init(void);
8681 +void dm_snapshot_exit(void);
8684 --- diff/drivers/md/kcopyd.c 1970-01-01 01:00:00.000000000 +0100
8685 +++ source/drivers/md/kcopyd.c 2003-10-16 10:44:23.000000000 +0100
8688 + * Copyright (C) 2002 Sistina Software (UK) Limited.
8690 + * This file is released under the GPL.
8693 +#include <asm/atomic.h>
8695 +#include <linux/blkdev.h>
8696 +#include <linux/config.h>
8697 +#include <linux/device-mapper.h>
8698 +#include <linux/fs.h>
8699 +#include <linux/init.h>
8700 +#include <linux/list.h>
8701 +#include <linux/locks.h>
8702 +#include <linux/mempool.h>
8703 +#include <linux/module.h>
8704 +#include <linux/pagemap.h>
8705 +#include <linux/slab.h>
8706 +#include <linux/vmalloc.h>
8708 +#include "kcopyd.h"
8709 +#include "dm-daemon.h"
8711 +/* FIXME: this is only needed for the DMERR macros */
8714 +static struct dm_daemon _kcopyd;
8716 +/*-----------------------------------------------------------------
8717 + * Each kcopyd client has its own little pool of preallocated
8718 + * pages for kcopyd io.
8719 + *---------------------------------------------------------------*/
8720 +struct kcopyd_client {
8721 + struct list_head list;
8724 + struct list_head pages;
8725 + unsigned int nr_pages;
8726 + unsigned int nr_free_pages;
8729 +static inline void __push_page(struct kcopyd_client *kc, struct page *p)
8731 + list_add(&p->list, &kc->pages);
8732 + kc->nr_free_pages++;
8735 +static inline struct page *__pop_page(struct kcopyd_client *kc)
8739 + p = list_entry(kc->pages.next, struct page, list);
8740 + list_del(&p->list);
8741 + kc->nr_free_pages--;
8746 +static int kcopyd_get_pages(struct kcopyd_client *kc,
8747 + unsigned int nr, struct list_head *pages)
8750 + INIT_LIST_HEAD(pages);
8752 + spin_lock(&kc->lock);
8753 + if (kc->nr_free_pages < nr) {
8754 + spin_unlock(&kc->lock);
8759 + p = __pop_page(kc);
8760 + list_add(&p->list, pages);
8762 + spin_unlock(&kc->lock);
8767 +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
8769 + struct list_head *tmp, *tmp2;
8771 + spin_lock(&kc->lock);
8772 + list_for_each_safe (tmp, tmp2, pages)
8773 + __push_page(kc, list_entry(tmp, struct page, list));
8774 + spin_unlock(&kc->lock);
8778 + * These three functions resize the page pool.
8780 +static void release_pages(struct list_head *pages)
8783 + struct list_head *tmp, *tmp2;
8785 + list_for_each_safe (tmp, tmp2, pages) {
8786 + p = list_entry(tmp, struct page, list);
8792 +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
8798 + for (i = 0; i < nr; i++) {
8799 + p = alloc_page(GFP_KERNEL);
8801 + release_pages(&new);
8806 + list_add(&p->list, &new);
8809 + kcopyd_put_pages(kc, &new);
8810 + kc->nr_pages += nr;
8814 +static void client_free_pages(struct kcopyd_client *kc)
8816 + BUG_ON(kc->nr_free_pages != kc->nr_pages);
8817 + release_pages(&kc->pages);
8818 + kc->nr_free_pages = kc->nr_pages = 0;
8821 +/*-----------------------------------------------------------------
8822 + * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
8823 + * for this reason we use a mempool to prevent the client from
8824 + * ever having to do io (which could cause a deadlock).
8825 + *---------------------------------------------------------------*/
8826 +struct kcopyd_job {
8827 + struct kcopyd_client *kc;
8828 + struct list_head list;
8829 + unsigned int flags;
8832 + * Error state of the job.
8835 + unsigned int write_err;
8838 + * Either READ or WRITE
8841 + struct io_region source;
8844 + * The destinations for the transfer.
8846 + unsigned int num_dests;
8847 + struct io_region dests[KCOPYD_MAX_REGIONS];
8850 + unsigned int nr_pages;
8851 + struct list_head pages;
8854 + * Set this to ensure you are notified when the job has
8855 + * completed. 'context' is for callback to use.
8857 + kcopyd_notify_fn fn;
8861 + * These fields are only used if the job has been split
8862 + * into more manageable parts.
8864 + struct semaphore lock;
8865 + atomic_t sub_jobs;
8866 + sector_t progress;
8869 +/* FIXME: this should scale with the number of pages */
8870 +#define MIN_JOBS 512
8872 +static kmem_cache_t *_job_cache = NULL;
8873 +static mempool_t *_job_pool = NULL;
8876 + * We maintain three lists of jobs:
8878 + * i) jobs waiting for pages
8879 + * ii) jobs that have pages, and are waiting for the io to be issued.
8880 + * iii) jobs that have completed.
8882 + * All three of these are protected by job_lock.
8884 +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
8886 +static LIST_HEAD(_complete_jobs);
8887 +static LIST_HEAD(_io_jobs);
8888 +static LIST_HEAD(_pages_jobs);
8890 +static int jobs_init(void)
8892 + INIT_LIST_HEAD(&_complete_jobs);
8893 + INIT_LIST_HEAD(&_io_jobs);
8894 + INIT_LIST_HEAD(&_pages_jobs);
8896 + _job_cache = kmem_cache_create("kcopyd-jobs",
8897 + sizeof(struct kcopyd_job),
8898 + __alignof__(struct kcopyd_job),
8903 + _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
8904 + mempool_free_slab, _job_cache);
8906 + kmem_cache_destroy(_job_cache);
8913 +static void jobs_exit(void)
8915 + BUG_ON(!list_empty(&_complete_jobs));
8916 + BUG_ON(!list_empty(&_io_jobs));
8917 + BUG_ON(!list_empty(&_pages_jobs));
8919 + mempool_destroy(_job_pool);
8920 + kmem_cache_destroy(_job_cache);
8924 + * Functions to push and pop a job onto the head of a given job
8927 +static inline struct kcopyd_job *pop(struct list_head *jobs)
8929 + struct kcopyd_job *job = NULL;
8930 + unsigned long flags;
8932 + spin_lock_irqsave(&_job_lock, flags);
8934 + if (!list_empty(jobs)) {
8935 + job = list_entry(jobs->next, struct kcopyd_job, list);
8936 + list_del(&job->list);
8938 + spin_unlock_irqrestore(&_job_lock, flags);
8943 +static inline void push(struct list_head *jobs, struct kcopyd_job *job)
8945 + unsigned long flags;
8947 + spin_lock_irqsave(&_job_lock, flags);
8948 + list_add_tail(&job->list, jobs);
8949 + spin_unlock_irqrestore(&_job_lock, flags);
8953 + * These three functions process 1 item from the corresponding
8959 + * > 0: can't process yet.
8961 +static int run_complete_job(struct kcopyd_job *job)
8963 + void *context = job->context;
8964 + int read_err = job->read_err;
8965 + unsigned int write_err = job->write_err;
8966 + kcopyd_notify_fn fn = job->fn;
8968 + kcopyd_put_pages(job->kc, &job->pages);
8969 + mempool_free(job, _job_pool);
8970 + fn(read_err, write_err, context);
8974 +static void complete_io(unsigned int error, void *context)
8976 + struct kcopyd_job *job = (struct kcopyd_job *) context;
8979 + if (job->rw == WRITE)
8980 + job->write_err &= error;
8982 + job->read_err = 1;
8984 + if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
8985 + push(&_complete_jobs, job);
8986 + dm_daemon_wake(&_kcopyd);
8991 + if (job->rw == WRITE)
8992 + push(&_complete_jobs, job);
8996 + push(&_io_jobs, job);
8999 + dm_daemon_wake(&_kcopyd);
9003 + * Request io on as many buffer heads as we can currently get for
9004 + * a particular job.
9006 +static int run_io_job(struct kcopyd_job *job)
9010 + if (job->rw == READ)
9011 + r = dm_io_async(1, &job->source, job->rw,
9012 + list_entry(job->pages.next, struct page, list),
9013 + job->offset, complete_io, job);
9016 + r = dm_io_async(job->num_dests, job->dests, job->rw,
9017 + list_entry(job->pages.next, struct page, list),
9018 + job->offset, complete_io, job);
9023 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
9024 +static int run_pages_job(struct kcopyd_job *job)
9028 + job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
9029 + SECTORS_PER_PAGE);
9030 + r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
9032 + /* this job is ready for io */
9033 + push(&_io_jobs, job);
9038 + /* can't complete now */
9045 + * Run through a list for as long as possible. Returns the count
9046 + * of successful jobs.
9048 +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
9050 + struct kcopyd_job *job;
9053 + while ((job = pop(jobs))) {
9058 + /* error this rogue job */
9059 + if (job->rw == WRITE)
9060 + job->write_err = (unsigned int) -1;
9062 + job->read_err = 1;
9063 + push(&_complete_jobs, job);
9069 + * We couldn't service this job ATM, so
9070 + * push this job back onto the list.
9083 + * kcopyd does this every time it's woken up.
9085 +static void do_work(void)
9088 + * The order that these are called is *very* important.
9089 + * complete jobs can free some pages for pages jobs.
9090 + * Pages jobs when successful will jump onto the io jobs
9091 + * list. io jobs call wake when they complete and it all
9094 + process_jobs(&_complete_jobs, run_complete_job);
9095 + process_jobs(&_pages_jobs, run_pages_job);
9096 + process_jobs(&_io_jobs, run_io_job);
9097 + run_task_queue(&tq_disk);
9101 + * If we are copying a small region we just dispatch a single job
9102 + * to do the copy, otherwise the io has to be split up into many
9105 +static void dispatch_job(struct kcopyd_job *job)
9107 + push(&_pages_jobs, job);
9108 + dm_daemon_wake(&_kcopyd);
9111 +#define SUB_JOB_SIZE 128
9112 +static void segment_complete(int read_err,
9113 + unsigned int write_err, void *context)
9115 + /* FIXME: tidy this function */
9116 + sector_t progress = 0;
9117 + sector_t count = 0;
9118 + struct kcopyd_job *job = (struct kcopyd_job *) context;
9122 + /* update the error */
9124 + job->read_err = 1;
9127 + job->write_err &= write_err;
9130 + * Only dispatch more work if there hasn't been an error.
9132 + if ((!job->read_err && !job->write_err) ||
9133 + test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
9134 + /* get the next chunk of work */
9135 + progress = job->progress;
9136 + count = job->source.count - progress;
9138 + if (count > SUB_JOB_SIZE)
9139 + count = SUB_JOB_SIZE;
9141 + job->progress += count;
9148 + struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
9150 + memcpy(sub_job, job, sizeof(*job));
9151 + sub_job->source.sector += progress;
9152 + sub_job->source.count = count;
9154 + for (i = 0; i < job->num_dests; i++) {
9155 + sub_job->dests[i].sector += progress;
9156 + sub_job->dests[i].count = count;
9159 + sub_job->fn = segment_complete;
9160 + sub_job->context = job;
9161 + dispatch_job(sub_job);
9163 + } else if (atomic_dec_and_test(&job->sub_jobs)) {
9166 + * To avoid a race we must keep the job around
9167 + * until after the notify function has completed.
9168 + * Otherwise the client may try and stop the job
9169 + * after we've completed.
9171 + job->fn(read_err, write_err, job->context);
9172 + mempool_free(job, _job_pool);
9177 + * Create some little jobs that will do the move between
9180 +#define SPLIT_COUNT 8
9181 +static void split_job(struct kcopyd_job *job)
9185 + atomic_set(&job->sub_jobs, SPLIT_COUNT);
9186 + for (i = 0; i < SPLIT_COUNT; i++)
9187 + segment_complete(0, 0u, job);
9190 +#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
9191 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9192 + unsigned int num_dests, struct io_region *dests,
9193 + unsigned int flags, kcopyd_notify_fn fn, void *context)
9195 + struct kcopyd_job *job;
9198 + * Allocate a new job.
9200 + job = mempool_alloc(_job_pool, GFP_NOIO);
9203 + * set up for the read.
9206 + job->flags = flags;
9207 + job->read_err = 0;
9208 + job->write_err = 0;
9211 + memcpy(&job->source, from, sizeof(*from));
9213 + job->num_dests = num_dests;
9214 + memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
9217 + job->nr_pages = 0;
9218 + INIT_LIST_HEAD(&job->pages);
9221 + job->context = context;
9223 + if (job->source.count < SUB_JOB_THRESHOLD)
9224 + dispatch_job(job);
9227 + init_MUTEX(&job->lock);
9228 + job->progress = 0;
9236 + * Cancels a kcopyd job, eg. someone might be deactivating a
9239 +int kcopyd_cancel(struct kcopyd_job *job, int block)
9241 + /* FIXME: finish */
9245 +/*-----------------------------------------------------------------
9247 + *---------------------------------------------------------------*/
9248 +static DECLARE_MUTEX(_client_lock);
9249 +static LIST_HEAD(_clients);
9251 +static int client_add(struct kcopyd_client *kc)
9253 + down(&_client_lock);
9254 + list_add(&kc->list, &_clients);
9255 + up(&_client_lock);
9259 +static void client_del(struct kcopyd_client *kc)
9261 + down(&_client_lock);
9262 + list_del(&kc->list);
9263 + up(&_client_lock);
9266 +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
9269 + struct kcopyd_client *kc;
9271 + kc = kmalloc(sizeof(*kc), GFP_KERNEL);
9275 + kc->lock = SPIN_LOCK_UNLOCKED;
9276 + INIT_LIST_HEAD(&kc->pages);
9277 + kc->nr_pages = kc->nr_free_pages = 0;
9278 + r = client_alloc_pages(kc, nr_pages);
9284 + r = dm_io_get(nr_pages);
9286 + client_free_pages(kc);
9291 + r = client_add(kc);
9293 + dm_io_put(nr_pages);
9294 + client_free_pages(kc);
9303 +void kcopyd_client_destroy(struct kcopyd_client *kc)
9305 + dm_io_put(kc->nr_pages);
9306 + client_free_pages(kc);
9312 +int __init kcopyd_init(void)
9320 + r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
9327 +void kcopyd_exit(void)
9330 + dm_daemon_stop(&_kcopyd);
9333 +EXPORT_SYMBOL(kcopyd_client_create);
9334 +EXPORT_SYMBOL(kcopyd_client_destroy);
9335 +EXPORT_SYMBOL(kcopyd_copy);
9336 +EXPORT_SYMBOL(kcopyd_cancel);
9337 --- diff/drivers/md/kcopyd.h 1970-01-01 01:00:00.000000000 +0100
9338 +++ source/drivers/md/kcopyd.h 2003-10-16 10:44:23.000000000 +0100
9341 + * Copyright (C) 2001 Sistina Software
9343 + * This file is released under the GPL.
9346 +#ifndef DM_KCOPYD_H
9347 +#define DM_KCOPYD_H
9350 + * Needed for the definition of sector_t.
9352 +#include <linux/device-mapper.h>
9353 +#include <linux/iobuf.h>
9357 +int kcopyd_init(void);
9358 +void kcopyd_exit(void);
9360 +/* FIXME: make this configurable */
9361 +#define KCOPYD_MAX_REGIONS 8
9363 +#define KCOPYD_IGNORE_ERROR 1
9366 + * To use kcopyd you must first create a kcopyd client object.
9368 +struct kcopyd_client;
9369 +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
9370 +void kcopyd_client_destroy(struct kcopyd_client *kc);
9373 + * Submit a copy job to kcopyd. This is built on top of the
9374 + * previous three fns.
9376 + * read_err is a boolean,
9377 + * write_err is a bitset, with 1 bit for each destination region
9379 +typedef void (*kcopyd_notify_fn)(int read_err,
9380 + unsigned int write_err, void *context);
9382 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9383 + unsigned int num_dests, struct io_region *dests,
9384 + unsigned int flags, kcopyd_notify_fn fn, void *context);
9387 --- diff/include/linux/device-mapper.h 1970-01-01 01:00:00.000000000 +0100
9388 +++ source/include/linux/device-mapper.h 2003-10-16 10:44:23.000000000 +0100
9391 + * Copyright (C) 2001 Sistina Software (UK) Limited.
9393 + * This file is released under the LGPL.
9396 +#ifndef _LINUX_DEVICE_MAPPER_H
9397 +#define _LINUX_DEVICE_MAPPER_H
9399 +typedef unsigned long sector_t;
9405 +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
9409 + unsigned long long ll;
9413 + * In the constructor the target parameter will already have the
9414 + * table, type, begin and len fields filled in.
9416 +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
9420 + * The destructor doesn't need to free the dm_target, just
9421 + * anything hidden ti->private.
9423 +typedef void (*dm_dtr_fn) (struct dm_target * ti);
9426 + * The map function must return:
9428 + * = 0: The target will handle the io by resubmitting it later
9429 + * > 0: simple remap complete
9431 +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
9432 + int rw, union map_info *map_context);
9436 + * < 0 : error (currently ignored)
9437 + * 0 : ended successfully
9438 + * 1 : for some reason the io has still not completed (eg,
9439 + * multipath target might want to requeue a failed io).
9441 +typedef int (*dm_endio_fn) (struct dm_target * ti,
9442 + struct buffer_head * bh, int rw, int error,
9443 + union map_info *map_context);
9444 +typedef void (*dm_suspend_fn) (struct dm_target *ti);
9445 +typedef void (*dm_resume_fn) (struct dm_target *ti);
9446 +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
9447 + char *result, unsigned int maxlen);
9449 +void dm_error(const char *message);
9452 + * Constructors should call these functions to ensure destination devices
9453 + * are opened/closed correctly.
9454 + * FIXME: too many arguments.
9456 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
9457 + sector_t len, int mode, struct dm_dev **result);
9458 +void dm_put_device(struct dm_target *ti, struct dm_dev *d);
9461 + * Information about a target type
9463 +struct target_type {
9465 + struct module *module;
9469 + dm_endio_fn end_io;
9470 + dm_suspend_fn suspend;
9471 + dm_resume_fn resume;
9472 + dm_status_fn status;
9476 + struct dm_table *table;
9477 + struct target_type *type;
9479 + /* target limits */
9483 + /* target specific data */
9486 + /* Used to provide an error string from the ctr */
9490 +int dm_register_target(struct target_type *t);
9491 +int dm_unregister_target(struct target_type *t);
9493 +#endif /* _LINUX_DEVICE_MAPPER_H */
9494 --- diff/include/linux/dm-ioctl.h 1970-01-01 01:00:00.000000000 +0100
9495 +++ source/include/linux/dm-ioctl.h 2003-10-16 10:44:23.000000000 +0100
9498 + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
9500 + * This file is released under the LGPL.
9503 +#ifndef _LINUX_DM_IOCTL_H
9504 +#define _LINUX_DM_IOCTL_H
9506 +#include <linux/types.h>
9508 +#define DM_DIR "mapper" /* Slashes not supported */
9509 +#define DM_MAX_TYPE_NAME 16
9510 +#define DM_NAME_LEN 128
9511 +#define DM_UUID_LEN 129
9514 + * A traditional ioctl interface for the device mapper.
9516 + * Each device can have two tables associated with it, an
9517 + * 'active' table which is the one currently used by io passing
9518 + * through the device, and an 'inactive' one which is a table
9519 + * that is being prepared as a replacement for the 'active' one.
9522 + * Just get the version information for the ioctl interface.
9525 + * Remove all dm devices, destroy all tables. Only really used
9528 + * DM_LIST_DEVICES:
9529 + * Get a list of all the dm device names.
9532 + * Create a new device, neither the 'active' nor 'inactive' table
9533 + * slots will be filled. The device will be in suspended state
9534 + * after creation, however any io to the device will get errored
9535 + * since it will be out-of-bounds.
9538 + * Remove a device, destroy any tables.
9541 + * Rename a device.
9544 + * This performs both suspend and resume, depending which flag is
9546 + * Suspend: This command will not return until all pending io to
9547 + * the device has completed. Further io will be deferred until
9548 + * the device is resumed.
9549 + * Resume: It is no longer an error to issue this command on an
9550 + * unsuspended device. If a table is present in the 'inactive'
9551 + * slot, it will be moved to the active slot, then the old table
9552 + * from the active slot will be _destroyed_. Finally the device
9556 + * Retrieves the status for the table in the 'active' slot.
9559 + * Wait for a significant event to occur to the device. This
9560 + * could either be caused by an event triggered by one of the
9561 + * targets of the table in the 'active' slot, or a table change.
9564 + * Load a table into the 'inactive' slot for the device. The
9565 + * device does _not_ need to be suspended prior to this command.
9568 + * Destroy any table in the 'inactive' slot (ie. abort).
9571 + * Return a set of device dependencies for the 'active' table.
9573 + * DM_TABLE_STATUS:
9574 + * Return the targets status for the 'active' table.
9578 + * All ioctl arguments consist of a single chunk of memory, with
9579 + * this structure at the start. If a uuid is specified any
9580 + * lookup (eg. for a DM_INFO) will be done on that, *not* the
9585 + * The version number is made up of three parts:
9586 + * major - no backward or forward compatibility,
9587 + * minor - only backwards compatible,
9588 + * patch - both backwards and forwards compatible.
9590 + * All clients of the ioctl interface should fill in the
9591 + * version number of the interface that they were
9594 + * All recognised ioctl commands (ie. those that don't
9595 + * return -ENOTTY) fill out this field, even if the
9598 + uint32_t version[3]; /* in/out */
9599 + uint32_t data_size; /* total size of data passed in
9600 + * including this struct */
9602 + uint32_t data_start; /* offset to start of data
9603 + * relative to start of this struct */
9605 + uint32_t target_count; /* in/out */
9606 + int32_t open_count; /* out */
9607 + uint32_t flags; /* in/out */
9608 + uint32_t event_nr; /* in/out */
9611 + uint64_t dev; /* in/out */
9613 + char name[DM_NAME_LEN]; /* device name */
9614 + char uuid[DM_UUID_LEN]; /* unique identifier for
9615 + * the block device */
9619 + * Used to specify tables. These structures appear after the
9622 +struct dm_target_spec {
9623 + uint64_t sector_start;
9625 + int32_t status; /* used when reading from kernel only */
9628 + * Offset in bytes (from the start of this struct) to
9629 + * next target_spec.
9633 + char target_type[DM_MAX_TYPE_NAME];
9636 + * Parameter string starts immediately after this object.
9637 + * Be careful to add padding after string to ensure correct
9638 + * alignment of subsequent dm_target_spec.
9643 + * Used to retrieve the target dependencies.
9645 +struct dm_target_deps {
9646 + uint32_t count; /* Array size */
9647 + uint32_t padding; /* unused */
9648 + uint64_t dev[0]; /* out */
9652 + * Used to get a list of all dm devices.
9654 +struct dm_name_list {
9656 + uint32_t next; /* offset to the next record from
9657 + the _start_ of this */
9662 + * If you change this make sure you make the corresponding change
9663 + * to dm-ioctl.c:lookup_ioctl()
9666 + /* Top level cmds */
9667 + DM_VERSION_CMD = 0,
9668 + DM_REMOVE_ALL_CMD,
9669 + DM_LIST_DEVICES_CMD,
9671 + /* device level cmds */
9672 + DM_DEV_CREATE_CMD,
9673 + DM_DEV_REMOVE_CMD,
9674 + DM_DEV_RENAME_CMD,
9675 + DM_DEV_SUSPEND_CMD,
9676 + DM_DEV_STATUS_CMD,
9679 + /* Table level cmds */
9680 + DM_TABLE_LOAD_CMD,
9681 + DM_TABLE_CLEAR_CMD,
9682 + DM_TABLE_DEPS_CMD,
9683 + DM_TABLE_STATUS_CMD,
9686 +#define DM_IOCTL 0xfd
9688 +#define DM_VERSION _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
9689 +#define DM_REMOVE_ALL _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
9690 +#define DM_LIST_DEVICES _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
9692 +#define DM_DEV_CREATE _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
9693 +#define DM_DEV_REMOVE _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
9694 +#define DM_DEV_RENAME _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
9695 +#define DM_DEV_SUSPEND _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
9696 +#define DM_DEV_STATUS _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
9697 +#define DM_DEV_WAIT _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
9699 +#define DM_TABLE_LOAD _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
9700 +#define DM_TABLE_CLEAR _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
9701 +#define DM_TABLE_DEPS _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
9702 +#define DM_TABLE_STATUS _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
9704 +#define DM_VERSION_MAJOR 4
9705 +#define DM_VERSION_MINOR 0
9706 +#define DM_VERSION_PATCHLEVEL 1
9707 +#define DM_VERSION_EXTRA "-ioctl (2003-07-12)"
9710 +#define DM_READONLY_FLAG (1 << 0) /* In/Out */
9711 +#define DM_SUSPEND_FLAG (1 << 1) /* In/Out */
9712 +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
9715 + * Flag passed into ioctl STATUS command to get table information
9716 + * rather than current status.
9718 +#define DM_STATUS_TABLE_FLAG (1 << 4) /* In */
9721 + * Flags that indicate whether a table is present in either of
9722 + * the two table slots that a device has.
9724 +#define DM_ACTIVE_PRESENT_FLAG (1 << 5) /* Out */
9725 +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
9728 + * Indicates that the buffer passed in wasn't big enough for the
9731 +#define DM_BUFFER_FULL_FLAG (1 << 8) /* Out */
9733 +#endif /* _LINUX_DM_IOCTL_H */
9734 --- diff/include/linux/mempool.h 1970-01-01 01:00:00.000000000 +0100
9735 +++ source/include/linux/mempool.h 2003-10-16 10:44:23.000000000 +0100
9738 + * memory buffer pool support
9740 +#ifndef _LINUX_MEMPOOL_H
9741 +#define _LINUX_MEMPOOL_H
9743 +#include <linux/list.h>
9744 +#include <linux/wait.h>
9747 +typedef struct mempool_s mempool_t;
9749 +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
9750 +typedef void (mempool_free_t)(void *element, void *pool_data);
9752 +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9753 + mempool_free_t *free_fn, void *pool_data);
9754 +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
9755 +extern void mempool_destroy(mempool_t *pool);
9756 +extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
9757 +extern void mempool_free(void *element, mempool_t *pool);
9760 + * A mempool_alloc_t and mempool_free_t that get the memory from
9761 + * a slab that is passed in through pool_data.
9763 +void *mempool_alloc_slab(int gfp_mask, void *pool_data);
9764 +void mempool_free_slab(void *element, void *pool_data);
9767 +#endif /* _LINUX_MEMPOOL_H */
9768 --- diff/mm/mempool.c 1970-01-01 01:00:00.000000000 +0100
9769 +++ source/mm/mempool.c 2003-10-16 10:44:23.000000000 +0100
9772 + * linux/mm/mempool.c
9774 + * memory buffer pool support. Such pools are mostly used
9775 + * for guaranteed, deadlock-free memory allocations during
9776 + * extreme VM load.
9778 + * started by Ingo Molnar, Copyright (C) 2001
9781 +#include <linux/mm.h>
9782 +#include <linux/slab.h>
9783 +#include <linux/module.h>
9784 +#include <linux/mempool.h>
9788 + int min_nr; /* nr of elements at *elements */
9789 + int curr_nr; /* Current nr of elements at *elements */
9793 + mempool_alloc_t *alloc;
9794 + mempool_free_t *free;
9795 + wait_queue_head_t wait;
9798 +static void add_element(mempool_t *pool, void *element)
9800 + BUG_ON(pool->curr_nr >= pool->min_nr);
9801 + pool->elements[pool->curr_nr++] = element;
9804 +static void *remove_element(mempool_t *pool)
9806 + BUG_ON(pool->curr_nr <= 0);
9807 + return pool->elements[--pool->curr_nr];
9810 +static void free_pool(mempool_t *pool)
9812 + while (pool->curr_nr) {
9813 + void *element = remove_element(pool);
9814 + pool->free(element, pool->pool_data);
9816 + kfree(pool->elements);
9821 + * mempool_create - create a memory pool
9822 + * @min_nr: the minimum number of elements guaranteed to be
9823 + * allocated for this pool.
9824 + * @alloc_fn: user-defined element-allocation function.
9825 + * @free_fn: user-defined element-freeing function.
9826 + * @pool_data: optional private data available to the user-defined functions.
9828 + * this function creates and allocates a guaranteed size, preallocated
9829 + * memory pool. The pool can be used from the mempool_alloc and mempool_free
9830 + * functions. This function might sleep. Both the alloc_fn() and the free_fn()
9831 + * functions might sleep - as long as the mempool_alloc function is not called
9832 + * from IRQ contexts.
9834 +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9835 + mempool_free_t *free_fn, void *pool_data)
9839 + pool = kmalloc(sizeof(*pool), GFP_KERNEL);
9842 + memset(pool, 0, sizeof(*pool));
9843 + pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
9844 + if (!pool->elements) {
9848 + spin_lock_init(&pool->lock);
9849 + pool->min_nr = min_nr;
9850 + pool->pool_data = pool_data;
9851 + init_waitqueue_head(&pool->wait);
9852 + pool->alloc = alloc_fn;
9853 + pool->free = free_fn;
9856 + * First pre-allocate the guaranteed number of buffers.
9858 + while (pool->curr_nr < pool->min_nr) {
9861 + element = pool->alloc(GFP_KERNEL, pool->pool_data);
9862 + if (unlikely(!element)) {
9866 + add_element(pool, element);
9872 + * mempool_resize - resize an existing memory pool
9873 + * @pool: pointer to the memory pool which was allocated via
9874 + * mempool_create().
9875 + * @new_min_nr: the new minimum number of elements guaranteed to be
9876 + * allocated for this pool.
9877 + * @gfp_mask: the usual allocation bitmask.
9879 + * This function shrinks/grows the pool. In the case of growing,
9880 + * it cannot be guaranteed that the pool will be grown to the new
9881 + * size immediately, but new mempool_free() calls will refill it.
9883 + * Note, the caller must guarantee that no mempool_destroy is called
9884 + * while this function is running. mempool_alloc() & mempool_free()
9885 + * might be called (eg. from IRQ contexts) while this function executes.
9887 +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
9890 + void **new_elements;
9891 + unsigned long flags;
9893 + BUG_ON(new_min_nr <= 0);
9895 + spin_lock_irqsave(&pool->lock, flags);
9896 + if (new_min_nr < pool->min_nr) {
9897 + while (pool->curr_nr > new_min_nr) {
9898 + element = remove_element(pool);
9899 + spin_unlock_irqrestore(&pool->lock, flags);
9900 + pool->free(element, pool->pool_data);
9901 + spin_lock_irqsave(&pool->lock, flags);
9903 + pool->min_nr = new_min_nr;
9906 + spin_unlock_irqrestore(&pool->lock, flags);
9908 + /* Grow the pool */
9909 + new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
9910 + if (!new_elements)
9913 + spin_lock_irqsave(&pool->lock, flags);
9914 + memcpy(new_elements, pool->elements,
9915 + pool->curr_nr * sizeof(*new_elements));
9916 + kfree(pool->elements);
9917 + pool->elements = new_elements;
9918 + pool->min_nr = new_min_nr;
9920 + while (pool->curr_nr < pool->min_nr) {
9921 + spin_unlock_irqrestore(&pool->lock, flags);
9922 + element = pool->alloc(gfp_mask, pool->pool_data);
9925 + spin_lock_irqsave(&pool->lock, flags);
9926 + if (pool->curr_nr < pool->min_nr)
9927 + add_element(pool, element);
9929 + kfree(element); /* Raced */
9932 + spin_unlock_irqrestore(&pool->lock, flags);
9938 + * mempool_destroy - deallocate a memory pool
9939 + * @pool: pointer to the memory pool which was allocated via
9940 + * mempool_create().
9942 + * this function only sleeps if the free_fn() function sleeps. The caller
9943 + * has to guarantee that all elements have been returned to the pool (ie:
9944 + * freed) prior to calling mempool_destroy().
9946 +void mempool_destroy(mempool_t *pool)
9948 + if (pool->curr_nr != pool->min_nr)
9949 + BUG(); /* There were outstanding elements */
9954 + * mempool_alloc - allocate an element from a specific memory pool
9955 + * @pool: pointer to the memory pool which was allocated via
9956 + * mempool_create().
9957 + * @gfp_mask: the usual allocation bitmask.
9959 + * this function only sleeps if the alloc_fn function sleeps or
9960 + * returns NULL. Note that due to preallocation, this function
9961 + * *never* fails when called from process contexts. (it might
9962 + * fail if called from an IRQ context.)
9964 +void * mempool_alloc(mempool_t *pool, int gfp_mask)
9967 + unsigned long flags;
9969 + DECLARE_WAITQUEUE(wait, current);
9970 + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
9973 + element = pool->alloc(gfp_nowait, pool->pool_data);
9974 + if (likely(element != NULL))
9978 + * If the pool is less than 50% full then try harder
9979 + * to allocate an element:
9981 + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
9982 + element = pool->alloc(gfp_mask, pool->pool_data);
9983 + if (likely(element != NULL))
9988 + * Kick the VM at this point.
9992 + spin_lock_irqsave(&pool->lock, flags);
9993 + if (likely(pool->curr_nr)) {
9994 + element = remove_element(pool);
9995 + spin_unlock_irqrestore(&pool->lock, flags);
9998 + spin_unlock_irqrestore(&pool->lock, flags);
10000 + /* We must not sleep in the GFP_ATOMIC case */
10001 + if (gfp_mask == gfp_nowait)
10004 + run_task_queue(&tq_disk);
10006 + add_wait_queue_exclusive(&pool->wait, &wait);
10007 + set_task_state(current, TASK_UNINTERRUPTIBLE);
10009 + spin_lock_irqsave(&pool->lock, flags);
10010 + curr_nr = pool->curr_nr;
10011 + spin_unlock_irqrestore(&pool->lock, flags);
10016 + current->state = TASK_RUNNING;
10017 + remove_wait_queue(&pool->wait, &wait);
10019 + goto repeat_alloc;
10023 + * mempool_free - return an element to the pool.
10024 + * @element: pool element pointer.
10025 + * @pool: pointer to the memory pool which was allocated via
10026 + * mempool_create().
10028 + * this function only sleeps if the free_fn() function sleeps.
10030 +void mempool_free(void *element, mempool_t *pool)
10032 + unsigned long flags;
10034 + if (pool->curr_nr < pool->min_nr) {
10035 + spin_lock_irqsave(&pool->lock, flags);
10036 + if (pool->curr_nr < pool->min_nr) {
10037 + add_element(pool, element);
10038 + spin_unlock_irqrestore(&pool->lock, flags);
10039 + wake_up(&pool->wait);
10042 + spin_unlock_irqrestore(&pool->lock, flags);
10044 + pool->free(element, pool->pool_data);
10048 + * A commonly used alloc and free fn.
10050 +void *mempool_alloc_slab(int gfp_mask, void *pool_data)
10052 + kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10053 + return kmem_cache_alloc(mem, gfp_mask);
10056 +void mempool_free_slab(void *element, void *pool_data)
10058 + kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10059 + kmem_cache_free(mem, element);
10063 +EXPORT_SYMBOL(mempool_create);
10064 +EXPORT_SYMBOL(mempool_resize);
10065 +EXPORT_SYMBOL(mempool_destroy);
10066 +EXPORT_SYMBOL(mempool_alloc);
10067 +EXPORT_SYMBOL(mempool_free);
10068 +EXPORT_SYMBOL(mempool_alloc_slab);
10069 +EXPORT_SYMBOL(mempool_free_slab);
10070 Only every other metadata area was being read when loading a snapshot!
10072 --- diff/drivers/md/dm-exception-store.c 2003-10-16 10:44:23.000000000 +0100
10073 +++ source/drivers/md/dm-exception-store.c 2003-10-16 10:44:27.000000000 +0100
10074 @@ -369,8 +369,6 @@
10075 r = insert_exceptions(ps, &full);
10083 Don't initialise static variables to zero/NULL.
10084 --- diff/drivers/md/kcopyd.c 2003-10-16 10:44:23.000000000 +0100
10085 +++ source/drivers/md/kcopyd.c 2003-10-16 10:44:31.000000000 +0100
10086 @@ -183,8 +183,8 @@
10087 /* FIXME: this should scale with the number of pages */
10088 #define MIN_JOBS 512
10090 -static kmem_cache_t *_job_cache = NULL;
10091 -static mempool_t *_job_pool = NULL;
10092 +static kmem_cache_t *_job_cache;
10093 +static mempool_t *_job_pool;
10096 * We maintain three lists of jobs:
10097 Change resume/suspend to do_resume/do_suspend to avoid name clash.
10098 --- diff/drivers/md/dm-ioctl.c 2003-10-16 10:44:23.000000000 +0100
10099 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:44:34.000000000 +0100
10100 @@ -593,7 +593,7 @@
10101 return dm_hash_rename(param->name, new_name);
10104 -static int suspend(struct dm_ioctl *param)
10105 +static int do_suspend(struct dm_ioctl *param)
10108 struct mapped_device *md;
10109 @@ -612,7 +612,7 @@
10113 -static int resume(struct dm_ioctl *param)
10114 +static int do_resume(struct dm_ioctl *param)
10117 struct hash_cell *hc;
10118 @@ -675,9 +675,9 @@
10119 static int dev_suspend(struct dm_ioctl *param, size_t param_size)
10121 if (param->flags & DM_SUSPEND_FLAG)
10122 - return suspend(param);
10123 + return do_suspend(param);
10125 - return resume(param);
10126 + return do_resume(param);
10132 The current version of the VFS locking patch adds a new semaphore to
10133 fs/super.c. This is used to make sure a filesystem does not get mounted
10134 on a logical volume while a snapshot is being taken. It also results in
10135 all mounts on the system being serialized, and isn't in line with the
10136 VFS locking scheme in general.
10138 +I've been meaning to fix it forever; here's an updated version that adds
10139 a super with s->s_dev set to the source volume if nothing is currently
10140 mounted on the source volume. This allows me to use the s_umount
10141 semaphore in the super block to keep things safe, which is cleaner
10144 The other benefit over the existing patch is this one has zero footprint
10145 outside the lockfs calls. You're only running new code if you take a
10148 I've done some testing here, but wanted to let LVM people review it
10149 before going further. Patch is below against 2.4.21-rc6.
10151 This provides zero new functionality over the existing VFS locking
10152 patch, and is experimental. Do not apply this on production servers,
10153 and do not apply unless you want to help test.
10157 ===== drivers/md/lvm.c 1.19 vs edited =====
10158 --- diff/drivers/md/dm-snapshot.c 2003-10-16 10:44:23.000000000 +0100
10159 +++ source/drivers/md/dm-snapshot.c 2003-10-16 10:44:38.000000000 +0100
10160 @@ -525,7 +525,7 @@
10163 /* Flush IO to the origin device */
10164 - fsync_dev(s->origin->dev);
10165 + fsync_dev_lockfs(s->origin->dev);
10167 /* Add snapshot to the list of snapshots for this origin */
10168 if (register_snapshot(s)) {
10169 @@ -539,6 +539,7 @@
10172 kcopyd_client_destroy(s->kcopyd_client);
10173 + unlockfs(s->origin->dev);
10176 s->store.destroy(&s->store);
10177 --- diff/drivers/md/lvm.c 2003-10-10 23:39:06.000000000 +0100
10178 +++ source/drivers/md/lvm.c 2003-10-16 10:44:38.000000000 +0100
10179 @@ -236,9 +236,6 @@
10180 #define DEVICE_OFF(device)
10181 #define LOCAL_END_REQUEST
10183 -/* lvm_do_lv_create calls fsync_dev_lockfs()/unlockfs() */
10184 -/* #define LVM_VFS_ENHANCEMENT */
10186 #include <linux/config.h>
10187 #include <linux/module.h>
10188 #include <linux/kernel.h>
10189 @@ -2250,12 +2247,8 @@
10190 if (lv_ptr->lv_access & LV_SNAPSHOT) {
10191 lv_t *org = lv_ptr->lv_snapshot_org, *last;
10193 - /* sync the original logical volume */
10194 - fsync_dev(org->lv_dev);
10195 -#ifdef LVM_VFS_ENHANCEMENT
10196 /* VFS function call to sync and lock the filesystem */
10197 fsync_dev_lockfs(org->lv_dev);
10200 down_write(&org->lv_lock);
10201 org->lv_access |= LV_SNAPSHOT_ORG;
10202 @@ -2281,11 +2274,9 @@
10204 set_device_ro(lv_ptr->lv_dev, 1);
10206 -#ifdef LVM_VFS_ENHANCEMENT
10207 /* VFS function call to unlock the filesystem */
10208 if (lv_ptr->lv_access & LV_SNAPSHOT)
10209 unlockfs(lv_ptr->lv_snapshot_org->lv_dev);
10212 lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de =
10213 lvm_fs_create_lv(vg_ptr, lv_ptr);
10214 --- diff/fs/buffer.c 2003-10-16 10:44:23.000000000 +0100
10215 +++ source/fs/buffer.c 2003-10-16 10:44:38.000000000 +0100
10216 @@ -383,6 +383,34 @@
10220 +int fsync_dev_lockfs(kdev_t dev)
10222 + /* you are not allowed to try locking all the filesystems
10223 + ** on the system, your chances of getting through without
10224 + ** total deadlock are slim to none.
10227 + return fsync_dev(dev) ;
10229 + sync_buffers(dev, 0);
10232 + /* note, the FS might need to start transactions to
10233 + ** sync the inodes, or the quota, no locking until
10234 + ** after these are done
10236 + sync_inodes(dev);
10238 + /* if inodes or quotas could be dirtied during the
10239 + ** sync_supers_lockfs call, the FS is responsible for getting
10240 + ** them on disk, without deadlocking against the lock
10242 + sync_supers_lockfs(dev) ;
10245 + return sync_buffers(dev, 1) ;
10248 asmlinkage long sys_sync(void)
10251 --- diff/fs/reiserfs/super.c 2003-08-26 13:50:12.000000000 +0100
10252 +++ source/fs/reiserfs/super.c 2003-10-16 10:44:38.000000000 +0100
10254 reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
10255 journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
10256 reiserfs_block_writes(&th) ;
10257 - journal_end(&th, s, 1) ;
10258 + journal_end_sync(&th, s, 1) ;
10262 --- diff/fs/super.c 2003-08-26 13:50:12.000000000 +0100
10263 +++ source/fs/super.c 2003-10-16 10:44:38.000000000 +0100
10265 spinlock_t sb_lock = SPIN_LOCK_UNLOCKED;
10268 + * stub of a filesystem used to make sure an FS isn't mounted
10269 + * in the middle of a lockfs call
10271 +static DECLARE_FSTYPE_DEV(lockfs_fs_type, "lockfs", NULL);
10274 * Handling of filesystem drivers list.
10276 * Inclusion to/removals from/scanning of list are protected by spinlock.
10277 @@ -436,6 +442,25 @@
10281 +static void write_super_lockfs(struct super_block *sb)
10284 + if (sb->s_root && sb->s_op) {
10285 + if (sb->s_dirt && sb->s_op->write_super)
10286 + sb->s_op->write_super(sb);
10287 + if (sb->s_op->write_super_lockfs)
10288 + sb->s_op->write_super_lockfs(sb);
10290 + unlock_super(sb);
10293 + * if no lockfs call is provided, use the sync_fs call instead.
10294 + * this must be done without the super lock held
10296 + if (!sb->s_op->write_super_lockfs && sb->s_op->sync_fs)
10297 + sb->s_op->sync_fs(sb);
10300 static inline void write_super(struct super_block *sb)
10303 @@ -483,6 +508,119 @@
10304 spin_unlock(&sb_lock);
10307 +static struct super_block *find_super_for_lockfs(kdev_t dev)
10309 + struct super_block *lockfs_sb = alloc_super();
10310 + struct super_block * s;
10315 + spin_lock(&sb_lock);
10316 + s = find_super(dev);
10318 + spin_unlock(&sb_lock);
10319 + down_read(&s->s_umount);
10321 + destroy_super(lockfs_sb);
10327 + /* if (s) we either return or goto, so we know s == NULL here.
10328 + * At this point, there are no mounted filesystems on this device,
10329 + * so we pretend to mount one.
10331 + if (!lockfs_sb) {
10332 + spin_unlock(&sb_lock);
10337 + if (lockfs_fs_type.fs_supers.prev == NULL)
10338 + INIT_LIST_HEAD(&lockfs_fs_type.fs_supers);
10339 + insert_super(s, &lockfs_fs_type);
10340 + s->s_root = (struct dentry *)1;
10341 + /* alloc_super gives us a write lock on s_umount, this
10342 + * way we know there are no concurrent lockfs holders for this dev.
10343 + * It allows us to remove the temp super from the list of supers
10344 + * immediately when unlockfs is called
10349 + * Note: don't check the dirty flag before waiting, we want the lock
10350 + * to happen every time this is called. dev must be non-zero
10352 +void sync_supers_lockfs(kdev_t dev)
10354 + struct super_block *sb;
10355 + sb = find_super_for_lockfs(dev);
10357 + write_super_lockfs(sb);
10358 + /* the drop_super is done by unlockfs */
10362 +static void drop_super_lockfs(struct super_block *s)
10364 + if (s->s_type == &lockfs_fs_type) {
10365 + struct file_system_type *fs = s->s_type;
10368 + * nobody else is allowed to grab_super() on our temp
10370 + if (!deactivate_super(s))
10373 + spin_lock(&sb_lock);
10374 + s->s_root = NULL;
10375 + list_del(&s->s_list);
10376 + list_del(&s->s_instances);
10377 + spin_unlock(&sb_lock);
10379 + up_write(&s->s_umount);
10381 + put_filesystem(fs);
10386 +void unlockfs(kdev_t dev)
10388 + struct super_block *s;
10392 + spin_lock(&sb_lock);
10393 + s = find_super(dev);
10396 + * find_super and the original lockfs call both incremented
10397 + * the reference count. drop one of them
10400 + spin_unlock(&sb_lock);
10402 + if (s->s_op->unlockfs)
10403 + s->s_op->unlockfs(s);
10404 + drop_super_lockfs(s);
10407 + printk("unlockfs: no s_root, dev %s\n", kdevname(dev));
10411 + printk("unlockfs: no super found, dev %s\n", kdevname(dev));
10415 + spin_unlock(&sb_lock);
10421 * get_super - get the superblock of a device
10422 * @dev: device to get the superblock for
10423 --- diff/include/linux/fs.h 2003-10-16 10:44:23.000000000 +0100
10424 +++ source/include/linux/fs.h 2003-10-16 10:44:38.000000000 +0100
10425 @@ -1273,6 +1273,7 @@
10426 extern int sync_buffers(kdev_t, int);
10427 extern void sync_dev(kdev_t);
10428 extern int fsync_dev(kdev_t);
10429 +extern int fsync_dev_lockfs(kdev_t);
10430 extern int fsync_super(struct super_block *);
10431 extern int fsync_no_super(kdev_t);
10432 extern void sync_inodes_sb(struct super_block *);
10433 @@ -1290,6 +1291,8 @@
10434 extern int filemap_fdatasync(struct address_space *);
10435 extern int filemap_fdatawait(struct address_space *);
10436 extern void sync_supers(kdev_t dev, int wait);
10437 +extern void sync_supers_lockfs(kdev_t);
10438 +extern void unlockfs(kdev_t);
10439 extern int bmap(struct inode *, int);
10440 extern int notify_change(struct dentry *, struct iattr *);
10441 extern int permission(struct inode *, int);
10442 --- diff/kernel/ksyms.c 2003-10-16 10:44:23.000000000 +0100
10443 +++ source/kernel/ksyms.c 2003-10-16 10:44:38.000000000 +0100
10444 @@ -189,6 +189,8 @@
10445 EXPORT_SYMBOL(invalidate_inode_pages);
10446 EXPORT_SYMBOL(truncate_inode_pages);
10447 EXPORT_SYMBOL(fsync_dev);
10448 +EXPORT_SYMBOL(fsync_dev_lockfs);
10449 +EXPORT_SYMBOL(unlockfs);
10450 EXPORT_SYMBOL(fsync_no_super);
10451 EXPORT_SYMBOL(permission);
10452 EXPORT_SYMBOL(vfs_permission);
10453 Missing parts of the previous vfs patch (merge).
10454 --- diff/drivers/md/dm-snapshot.c 2003-10-16 10:44:38.000000000 +0100
10455 +++ source/drivers/md/dm-snapshot.c 2003-10-16 10:44:41.000000000 +0100
10456 @@ -533,13 +533,14 @@
10457 ti->error = "Cannot register snapshot origin";
10460 + unlockfs(s->origin->dev);
10466 - kcopyd_client_destroy(s->kcopyd_client);
10467 unlockfs(s->origin->dev);
10468 + kcopyd_client_destroy(s->kcopyd_client);
10471 s->store.destroy(&s->store);
10472 Lift vfs locking to dm_suspend/resume.
10473 --- diff/drivers/md/dm-snapshot.c 2003-10-16 10:44:41.000000000 +0100
10474 +++ source/drivers/md/dm-snapshot.c 2003-10-16 10:44:44.000000000 +0100
10475 @@ -524,22 +524,17 @@
10479 - /* Flush IO to the origin device */
10480 - fsync_dev_lockfs(s->origin->dev);
10482 /* Add snapshot to the list of snapshots for this origin */
10483 if (register_snapshot(s)) {
10485 ti->error = "Cannot register snapshot origin";
10488 - unlockfs(s->origin->dev);
10494 - unlockfs(s->origin->dev);
10495 kcopyd_client_destroy(s->kcopyd_client);
10498 --- diff/drivers/md/dm.c 2003-10-16 10:44:23.000000000 +0100
10499 +++ source/drivers/md/dm.c 2003-10-16 10:44:44.000000000 +0100
10500 @@ -951,13 +951,23 @@
10502 DECLARE_WAITQUEUE(wait, current);
10504 - down_write(&md->lock);
10505 + /* Flush IO to the origin device */
10506 + down_read(&md->lock);
10507 + if (test_bit(DMF_BLOCK_IO, &md->flags)) {
10508 + up_read(&md->lock);
10512 + fsync_dev_lockfs(md->dev);
10513 + up_read(&md->lock);
10517 - * First we set the BLOCK_IO flag so no more ios will be
10519 + * Set the BLOCK_IO flag so no more ios will be mapped.
10521 + down_write(&md->lock);
10522 if (test_bit(DMF_BLOCK_IO, &md->flags)) {
10523 + unlockfs(md->dev);
10524 up_write(&md->lock);
10527 @@ -986,6 +996,7 @@
10529 /* did we flush everything ? */
10530 if (atomic_read(&md->pending)) {
10531 + unlockfs(md->dev);
10532 clear_bit(DMF_BLOCK_IO, &md->flags);
10535 @@ -1017,6 +1028,7 @@
10536 md->deferred = NULL;
10537 up_write(&md->lock);
10539 + unlockfs(md->dev);
10540 flush_deferred_io(def);
10541 run_task_queue(&tq_disk);
10543 Correct the error message when starting a dm-daemon.
10544 --- diff/drivers/md/dm-daemon.c 2003-10-16 10:44:23.000000000 +0100
10545 +++ source/drivers/md/dm-daemon.c 2003-10-16 10:44:48.000000000 +0100
10547 down(&dd->start_lock);
10548 pid = kernel_thread(daemon, dd, 0);
10550 - DMERR("Failed to start kcopyd thread");
10551 + DMERR("Failed to start %s thread", name);
10555 When multiple load ioctls are issued the reference count on older
10556 'new_tables' wasn't being dropped. [Christophe Saout]
10557 --- diff/drivers/md/dm-ioctl.c 2003-10-16 10:44:34.000000000 +0100
10558 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:44:51.000000000 +0100
10559 @@ -816,6 +816,8 @@
10564 + dm_table_put(hc->new_map);
10566 param->flags |= DM_INACTIVE_PRESENT_FLAG;
10568 Stop labelling dm as 'experimental'.
10569 --- diff/drivers/md/Config.in 2003-10-16 10:44:23.000000000 +0100
10570 +++ source/drivers/md/Config.in 2003-10-16 10:44:54.000000000 +0100
10572 dep_tristate ' Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
10574 dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
10575 -if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then
10576 - dep_tristate ' Device-mapper support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM $CONFIG_MD
10577 - dep_tristate ' Mirror (RAID-1) support (EXPERIMENTAL)' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
10579 +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
10580 +dep_tristate ' Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
10583 Move retrieve_status up so dev_wait() can use it.
10584 --- diff/drivers/md/dm-ioctl.c 2003-10-16 10:44:51.000000000 +0100
10585 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:44:58.000000000 +0100
10586 @@ -699,6 +699,69 @@
10590 + * Build up the status struct for each target
10592 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
10593 + size_t param_size)
10595 + unsigned int i, num_targets;
10596 + struct dm_target_spec *spec;
10597 + char *outbuf, *outptr;
10598 + status_type_t type;
10599 + size_t remaining, len, used = 0;
10601 + outptr = outbuf = get_result_buffer(param, param_size, &len);
10603 + if (param->flags & DM_STATUS_TABLE_FLAG)
10604 + type = STATUSTYPE_TABLE;
10606 + type = STATUSTYPE_INFO;
10608 + /* Get all the target info */
10609 + num_targets = dm_table_get_num_targets(table);
10610 + for (i = 0; i < num_targets; i++) {
10611 + struct dm_target *ti = dm_table_get_target(table, i);
10613 + remaining = len - (outptr - outbuf);
10614 + if (remaining < sizeof(struct dm_target_spec)) {
10615 + param->flags |= DM_BUFFER_FULL_FLAG;
10619 + spec = (struct dm_target_spec *) outptr;
10621 + spec->status = 0;
10622 + spec->sector_start = ti->begin;
10623 + spec->length = ti->len;
10624 + strncpy(spec->target_type, ti->type->name,
10625 + sizeof(spec->target_type));
10627 + outptr += sizeof(struct dm_target_spec);
10628 + remaining = len - (outptr - outbuf);
10630 + /* Get the status/table string from the target driver */
10631 + if (ti->type->status) {
10632 + if (ti->type->status(ti, type, outptr, remaining)) {
10633 + param->flags |= DM_BUFFER_FULL_FLAG;
10637 + outptr[0] = '\0';
10639 + outptr += strlen(outptr) + 1;
10640 + used = param->data_start + (outptr - outbuf);
10642 + align_ptr(outptr);
10643 + spec->next = outptr - outbuf;
10647 + param->data_size = used;
10649 + param->target_count = num_targets;
10653 * Wait for a device to report an event
10655 static int dev_wait(struct dm_ioctl *param, size_t param_size)
10656 @@ -919,69 +982,6 @@
10660 - * Build up the status struct for each target
10662 -static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
10663 - size_t param_size)
10665 - unsigned int i, num_targets;
10666 - struct dm_target_spec *spec;
10667 - char *outbuf, *outptr;
10668 - status_type_t type;
10669 - size_t remaining, len, used = 0;
10671 - outptr = outbuf = get_result_buffer(param, param_size, &len);
10673 - if (param->flags & DM_STATUS_TABLE_FLAG)
10674 - type = STATUSTYPE_TABLE;
10676 - type = STATUSTYPE_INFO;
10678 - /* Get all the target info */
10679 - num_targets = dm_table_get_num_targets(table);
10680 - for (i = 0; i < num_targets; i++) {
10681 - struct dm_target *ti = dm_table_get_target(table, i);
10683 - remaining = len - (outptr - outbuf);
10684 - if (remaining < sizeof(struct dm_target_spec)) {
10685 - param->flags |= DM_BUFFER_FULL_FLAG;
10689 - spec = (struct dm_target_spec *) outptr;
10691 - spec->status = 0;
10692 - spec->sector_start = ti->begin;
10693 - spec->length = ti->len;
10694 - strncpy(spec->target_type, ti->type->name,
10695 - sizeof(spec->target_type));
10697 - outptr += sizeof(struct dm_target_spec);
10698 - remaining = len - (outptr - outbuf);
10700 - /* Get the status/table string from the target driver */
10701 - if (ti->type->status) {
10702 - if (ti->type->status(ti, type, outptr, remaining)) {
10703 - param->flags |= DM_BUFFER_FULL_FLAG;
10707 - outptr[0] = '\0';
10709 - outptr += strlen(outptr) + 1;
10710 - used = param->data_start + (outptr - outbuf);
10712 - align_ptr(outptr);
10713 - spec->next = outptr - outbuf;
10717 - param->data_size = used;
10719 - param->target_count = num_targets;
10723 * Return the status of a device as a text string for each
10726 dev_wait was meant to return table status, not dev status. [Alasdair Kergon]
10727 --- diff/drivers/md/dm-ioctl.c 2003-10-16 10:44:58.000000000 +0100
10728 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:45:01.000000000 +0100
10729 @@ -768,6 +768,7 @@
10732 struct mapped_device *md;
10733 + struct dm_table *table;
10734 DECLARE_WAITQUEUE(wq, current);
10736 md = find_device(param);
10737 @@ -790,7 +791,16 @@
10738 * him and save an ioctl.
10740 r = __dev_status(md, param);
10744 + table = dm_get_table(md);
10746 + retrieve_status(table, param, param_size);
10747 + dm_table_put(table);
10754 Fix the error message when the linear target gets handed more than 2 arguments.
10756 --- diff/drivers/md/dm-linear.c 2003-10-16 10:44:23.000000000 +0100
10757 +++ source/drivers/md/dm-linear.c 2003-10-16 10:45:04.000000000 +0100
10759 struct linear_c *lc;
10762 - ti->error = "dm-linear: Not enough arguments";
10763 + ti->error = "dm-linear: Invalid argument count";
10767 Support an arbitrary number of target parameters. [Alasdair Kergon]
10768 --- diff/drivers/md/dm-table.c 2003-10-16 10:44:23.000000000 +0100
10769 +++ source/drivers/md/dm-table.c 2003-10-16 10:45:07.000000000 +0100
10770 @@ -441,12 +441,36 @@
10774 + * Used to dynamically allocate the arg array.
10776 +static char **realloc_argv(unsigned *array_size, char **old_argv)
10779 + unsigned new_size;
10781 + new_size = *array_size ? *array_size * 2 : 64;
10782 + argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
10784 + memcpy(argv, old_argv, *array_size * sizeof(*argv));
10785 + *array_size = new_size;
10793 * Destructively splits up the argument list to pass to ctr.
10795 -static int split_args(int max, int *argc, char **argv, char *input)
10796 +static int split_args(int *argc, char ***argvp, char *input)
10798 - char *start, *end = input, *out;
10799 + char *start, *end = input, *out, **argv = NULL;
10800 + unsigned array_size = 0;
10803 + argv = realloc_argv(&array_size, argv);
10809 @@ -475,8 +499,11 @@
10812 /* have we already filled the array ? */
10813 - if ((*argc + 1) > max)
10815 + if ((*argc + 1) > array_size) {
10816 + argv = realloc_argv(&array_size, argv);
10821 /* we know this is whitespace */
10823 @@ -488,6 +515,7 @@
10831 @@ -495,7 +523,7 @@
10832 sector_t start, sector_t len, char *params)
10834 int r = -EINVAL, argc;
10837 struct dm_target *tgt;
10839 if ((r = check_space(t)))
10840 @@ -524,13 +552,14 @@
10844 - r = split_args(ARRAY_SIZE(argv), &argc, argv, params);
10845 + r = split_args(&argc, &argv, params);
10847 - tgt->error = "couldn't split parameters";
10848 + tgt->error = "couldn't split parameters (insufficient memory)";
10852 r = tgt->type->ctr(tgt, argc, argv);
10858 --- diff/fs/buffer.c 2003-10-16 10:44:38.000000000 +0100
10859 +++ source/fs/buffer.c 2003-10-16 10:45:11.000000000 +0100
10860 @@ -400,7 +400,7 @@
10861 ** after these are done
10865 + DQUOT_SYNC_DEV(dev);
10866 /* if inodes or quotas could be dirtied during the
10867 ** sync_supers_lockfs call, the FS is responsible for getting
10868 ** them on disk, without deadlocking against the lock
10869 The ioctl interface always knows how many targets are going to be in
10870 the table, so remove the dynamic array sizing code in dm-table.c.
10871 This fixes a problem with large tables where the dm_target pointer
10872 passed to the target ctr was becoming invalid.
10873 --- diff/drivers/md/dm-ioctl.c 2003-10-16 10:45:01.000000000 +0100
10874 +++ source/drivers/md/dm-ioctl.c 2003-10-16 10:45:14.000000000 +0100
10875 @@ -871,7 +871,7 @@
10876 struct hash_cell *hc;
10877 struct dm_table *t;
10879 - r = dm_table_create(&t, get_mode(param));
10880 + r = dm_table_create(&t, get_mode(param), param->target_count);
10884 --- diff/drivers/md/dm-table.c 2003-10-16 10:45:07.000000000 +0100
10885 +++ source/drivers/md/dm-table.c 2003-10-16 10:45:14.000000000 +0100
10886 @@ -112,42 +112,7 @@
10891 - * highs, and targets are managed as dynamic arrays during a
10894 -static int alloc_targets(struct dm_table *t, unsigned int num)
10896 - sector_t *n_highs;
10897 - struct dm_target *n_targets;
10898 - int n = t->num_targets;
10901 - * Allocate both the target array and offset array at once.
10903 - n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
10904 - sizeof(sector_t), num);
10908 - n_targets = (struct dm_target *) (n_highs + num);
10911 - memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
10912 - memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
10915 - memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
10918 - t->num_allocated = num;
10919 - t->highs = n_highs;
10920 - t->targets = n_targets;
10925 -int dm_table_create(struct dm_table **result, int mode)
10926 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
10928 struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
10930 @@ -158,13 +123,17 @@
10931 INIT_LIST_HEAD(&t->devices);
10932 atomic_set(&t->holders, 1);
10934 - /* allocate a single nodes worth of targets to begin with */
10935 - if (alloc_targets(t, KEYS_PER_NODE)) {
10937 + /* allocate both the target array and offset array at once */
10938 + t->highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
10939 + sizeof(sector_t), num_targets);
10946 + t->targets = (struct dm_target *) (t->highs + num_targets);
10947 + t->num_allocated = num_targets;
10951 @@ -224,17 +193,6 @@
10955 - * Checks to see if we need to extend highs or targets.
10957 -static inline int check_space(struct dm_table *t)
10959 - if (t->num_targets >= t->num_allocated)
10960 - return alloc_targets(t, t->num_allocated * 2);
10966 * Convert a device path to a dev_t.
10968 static int lookup_device(const char *path, kdev_t *dev)
10969 @@ -526,8 +484,8 @@
10971 struct dm_target *tgt;
10973 - if ((r = check_space(t)))
10975 + if (t->num_targets >= t->num_allocated)
10978 tgt = t->targets + t->num_targets;
10979 memset(tgt, 0, sizeof(*tgt));
10980 --- diff/drivers/md/dm.h 2003-10-16 10:44:23.000000000 +0100
10981 +++ source/drivers/md/dm.h 2003-10-16 10:45:14.000000000 +0100
10983 * Functions for manipulating a table. Tables are also reference
10985 *---------------------------------------------------------------*/
10986 -int dm_table_create(struct dm_table **result, int mode);
10987 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
10989 void dm_table_get(struct dm_table *t);
10990 void dm_table_put(struct dm_table *t);
10991 Correct the calculation of the dirty log's bitset size.
10992 --- diff/drivers/md/dm-log.c 2003-10-16 10:44:23.000000000 +0100
10993 +++ source/drivers/md/dm-log.c 2003-10-16 10:45:18.000000000 +0100
10994 @@ -124,6 +124,7 @@
10998 +#define BYTE_SHIFT 3
10999 static int core_ctr(struct dirty_log *log, sector_t dev_size,
11000 unsigned int argc, char **argv)
11002 @@ -153,7 +154,13 @@
11003 clog->region_size = region_size;
11004 clog->region_count = region_count;
11006 - bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
11008 + * Work out how many words we need to hold the bitset.
11010 + bitset_size = dm_round_up(region_count,
11011 + sizeof(*clog->clean_bits) << BYTE_SHIFT);
11012 + bitset_size >>= BYTE_SHIFT;
11014 clog->clean_bits = vmalloc(bitset_size);
11015 if (!clog->clean_bits) {
11016 DMWARN("couldn't allocate clean bitset");
11017 Correct the sector calculation in map_buffer().
11018 --- diff/drivers/md/dm-raid1.c 2003-10-16 10:44:23.000000000 +0100
11019 +++ source/drivers/md/dm-raid1.c 2003-10-16 10:45:21.000000000 +0100
11020 @@ -720,11 +720,7 @@
11021 static void map_buffer(struct mirror_set *ms,
11022 struct mirror *m, struct buffer_head *bh)
11024 - sector_t bsize = bh->b_size >> 9;
11025 - sector_t rsector = bh->b_blocknr * bsize;
11027 - bh->b_rdev = m->dev->dev;
11028 - bh->b_rsector = m->offset + (rsector - ms->ti->begin);
11029 + bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin);
11032 static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
11033 If a kcopyd client hadn't allocated enough pages and then submitted a
11034 large io that was being split into sub jobs, we could stall waiting for
11035 pages. There is now a kcopyd_client->max_split field that is an
11036 appropriate number of sub_jobs to split the io into based on the
11037 number of allocated pages.
11038 --- diff/drivers/md/kcopyd.c 2003-10-16 10:44:31.000000000 +0100
11039 +++ source/drivers/md/kcopyd.c 2003-10-16 10:45:24.000000000 +0100
11042 static struct dm_daemon _kcopyd;
11044 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
11045 +#define SUB_JOB_SIZE 128
11046 +#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE)
11047 +#define SUB_JOB_COUNT 8
11049 /*-----------------------------------------------------------------
11050 * Each kcopyd client has its own little pool of preallocated
11051 * pages for kcopyd io.
11053 struct list_head pages;
11054 unsigned int nr_pages;
11055 unsigned int nr_free_pages;
11056 + unsigned int max_split;
11059 static inline void __push_page(struct kcopyd_client *kc, struct page *p)
11060 @@ -122,6 +128,10 @@
11062 kcopyd_put_pages(kc, &new);
11063 kc->nr_pages += nr;
11064 + kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB;
11065 + if (kc->max_split > SUB_JOB_COUNT)
11066 + kc->max_split = SUB_JOB_COUNT;
11071 @@ -334,7 +344,6 @@
11075 -#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
11076 static int run_pages_job(struct kcopyd_job *job)
11079 @@ -422,7 +431,6 @@
11080 dm_daemon_wake(&_kcopyd);
11083 -#define SUB_JOB_SIZE 128
11084 static void segment_complete(int read_err,
11085 unsigned int write_err, void *context)
11087 @@ -491,17 +499,19 @@
11088 * Create some little jobs that will do the move between
11091 -#define SPLIT_COUNT 8
11092 static void split_job(struct kcopyd_job *job)
11097 + nr = dm_div_up(job->source.count, SUB_JOB_SIZE);
11098 + if (nr > job->kc->max_split)
11099 + nr = job->kc->max_split;
11101 - atomic_set(&job->sub_jobs, SPLIT_COUNT);
11102 - for (i = 0; i < SPLIT_COUNT; i++)
11103 + atomic_set(&job->sub_jobs, nr);
11105 segment_complete(0, 0u, job);
11108 -#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
11109 int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
11110 unsigned int num_dests, struct io_region *dests,
11111 unsigned int flags, kcopyd_notify_fn fn, void *context)
11112 @@ -534,7 +544,7 @@
11114 job->context = context;
11116 - if (job->source.count < SUB_JOB_THRESHOLD)
11117 + if (job->source.count < SUB_JOB_SIZE)
11121 Fix bug in dm-io.c block size calculation. [Alasdair Kergon]
11123 --- diff/drivers/md/dm-io.c 2003-10-16 10:44:23.000000000 +0100
11124 +++ source/drivers/md/dm-io.c 2003-10-16 10:45:28.000000000 +0100
11125 @@ -204,7 +204,13 @@
11126 sector_t b = *block;
11127 sector_t blocks_per_page = PAGE_SIZE / block_size;
11128 unsigned int this_size; /* holds the size of the current io */
11129 - unsigned int len;
11132 + if (!blocks_per_page) {
11133 + DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported",
11134 + PAGE_SIZE, block_size);
11138 while ((offset < PAGE_SIZE) && (b != end_block)) {
11139 bh = mempool_alloc(_buffer_pool, GFP_NOIO);
11140 @@ -215,10 +221,20 @@
11141 * Block size must be a power of 2 and aligned
11144 - len = end_block - b;
11145 - this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
11146 - if (this_size > len)
11147 - this_size = 1 << log2_align(len);
11149 + len = min(end_block - b, blocks_per_page);
11150 + len = min(len, blocks_per_page - offset / block_size);
11153 + DMERR("dm-io: Invalid offset/block_size (%u/%u).",
11154 + offset, block_size);
11158 + this_size = 1 << log2_align(len);
11160 + this_size = min(this_size,
11161 + (unsigned) 1 << log2_floor(b));
11164 * Add in the job offset.
11165 bh->b_rdev wasn't being set properly. Bug from earlier patch.
11166 --- diff/drivers/md/dm-raid1.c 2003-10-16 10:45:21.000000000 +0100
11167 +++ source/drivers/md/dm-raid1.c 2003-10-16 10:45:31.000000000 +0100
11168 @@ -720,6 +720,7 @@
11169 static void map_buffer(struct mirror_set *ms,
11170 struct mirror *m, struct buffer_head *bh)
11172 + bh->b_rdev = m->dev->dev;
11173 bh->b_rsector = m->offset + (bh->b_rsector - ms->ti->begin);