1 --- linux-2.4.21/Documentation/Configure.help   Fri Jun 13 16:32:30 2003
2 +++ linux/Documentation/Configure.help  Wed Aug 20 14:41:36 2003
3 @@ -1839,6 +1839,20 @@
4    want), say M here and read <file:Documentation/modules.txt>.  The
5    module will be called lvm-mod.o.
6  
7 +Device-mapper support
8 +CONFIG_BLK_DEV_DM
9 +  Device-mapper is a low-level volume manager.  It works by allowing
10 +  people to specify mappings for ranges of logical sectors.  Various
11 +  mapping types are available; in addition, people may write their own
12 +  modules containing custom mappings if they wish.
13 +
14 +  Higher level volume managers such as LVM2 use this driver.
15 +
16 +  If you want to compile this as a module, say M here and read 
17 +  <file:Documentation/modules.txt>.  The module will be called dm-mod.o.
18 +
19 +  If unsure, say N.
20 +
21  Multiple devices driver support (RAID and LVM)
22  CONFIG_MD
23    Support multiple physical spindles through a single logical device.
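For context: the help text above refers to the ioctl interface that the rest of this patch adds.  From userspace the driver is driven entirely through ioctls on the control node (conventionally /dev/mapper/control); a minimal sketch, assuming the patched <linux/dm-ioctl.h> is visible to userspace and that the control node has been created, looks like this:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl dmi;
        int fd = open("/dev/mapper/control", O_RDWR);

        if (fd < 0) {
                perror("open /dev/mapper/control");
                return 1;
        }

        /* every dm ioctl takes a struct dm_ioctl; the caller fills in the
         * interface version it was built against and the buffer size */
        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);

        if (ioctl(fd, DM_VERSION, &dmi) < 0) {
                perror("DM_VERSION");
                close(fd);
                return 1;
        }

        printf("device-mapper ioctl interface %u.%u.%u\n",
               dmi.version[0], dmi.version[1], dmi.version[2]);
        close(fd);
        return 0;
}

Higher-level tools such as LVM2's libdevmapper issue this kind of call for the DM_* commands declared in dm-ioctl.h.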
24 --- linux-2.4.21/MAINTAINERS    Fri Jun 13 16:32:30 2003
25 +++ linux/MAINTAINERS   Wed Aug 20 14:41:36 2003
26 @@ -476,6 +476,13 @@
27  W:     http://www.debian.org/~dz/i8k/
28  S:     Maintained
29  
30 +DEVICE MAPPER
31 +P:     Joe Thornber
32 +M:     dm@uk.sistina.com
33 +L:     linux-LVM@sistina.com
34 +W:     http://www.sistina.com/lvm
35 +S:     Maintained
36 +
37  DEVICE NUMBER REGISTRY
38  P:     H. Peter Anvin
39  M:     hpa@zytor.com
40 --- linux-2.4.21/arch/mips64/kernel/ioctl32.c   Fri Jan 10 16:34:18 2003
41 +++ linux/arch/mips64/kernel/ioctl32.c  Wed Aug 20 14:41:28 2003
42 @@ -33,6 +33,7 @@
43  #include <linux/auto_fs.h>
44  #include <linux/ext2_fs.h>
45  #include <linux/raid/md_u.h>
46 +#include <linux/dm-ioctl.h>
47  
48  #include <scsi/scsi.h>
49  #undef __KERNEL__              /* This file was born to be ugly ...  */
50 @@ -914,6 +915,22 @@
51         IOCTL32_DEFAULT(STOP_ARRAY_RO),
52         IOCTL32_DEFAULT(RESTART_ARRAY_RW),
53  #endif /* CONFIG_MD */
54 +
55 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
56 +       IOCTL32_DEFAULT(DM_VERSION),
57 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
58 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
59 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
60 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
61 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
62 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
63 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
64 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
65 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
66 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
67 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
68 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
69 +#endif /* CONFIG_BLK_DEV_DM */
70  
71         IOCTL32_DEFAULT(MTIOCTOP),                      /* mtio.h ioctls  */
72         IOCTL32_HANDLER(MTIOCGET32, mt_ioctl_trans),
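The entries above (and the matching COMPATIBLE_IOCTL lists added for the other 64-bit architectures further down) are pass-through registrations: struct dm_ioctl uses fixed-width fields, so its layout is the same for 32-bit and 64-bit callers and no argument translation is needed.  Conceptually the registration amounts to the sketch below (illustrative only, not the kernel's actual per-architecture table code):

#include <linux/dm-ioctl.h>

/* illustrative shape of a compat registration table; a NULL handler means
 * "no translation needed, forward to the native 64-bit ioctl path" */
struct dm_compat_entry {
        unsigned int cmd;
        int (*handler)(unsigned int fd, unsigned int cmd, unsigned long arg);
};

static const struct dm_compat_entry dm_compat_entries[] = {
        { DM_VERSION,     NULL },
        { DM_REMOVE_ALL,  NULL },
        { DM_DEV_CREATE,  NULL },
        /* ... one entry per DM_* command registered above ... */
        { DM_TABLE_CLEAR, NULL },
};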
73 --- linux-2.4.21/arch/parisc/kernel/ioctl32.c   Fri Jun 13 16:32:32 2003
74 +++ linux/arch/parisc/kernel/ioctl32.c  Wed Aug 20 14:41:28 2003
75 @@ -55,6 +55,7 @@
76  #define max max */
77  #include <linux/lvm.h>
78  #endif /* LVM */
79 +#include <linux/dm-ioctl.h>
80  
81  #include <scsi/scsi.h>
82  /* Ugly hack. */
83 @@ -3418,6 +3419,22 @@
84  COMPATIBLE_IOCTL(LV_BMAP)
85  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
86  #endif /* LVM */
87 +/* Device-Mapper */
88 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
89 +COMPATIBLE_IOCTL(DM_VERSION)
90 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
91 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
92 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
93 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
94 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
95 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
96 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
97 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
98 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
99 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
100 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
101 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
102 +#endif /* CONFIG_BLK_DEV_DM */
103  #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
104  COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
105  COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
106 --- linux-2.4.21/arch/ppc64/kernel/ioctl32.c    Fri Jun 13 16:32:33 2003
107 +++ linux/arch/ppc64/kernel/ioctl32.c   Wed Aug 20 14:41:29 2003
108 @@ -66,6 +66,7 @@
109  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
110  #include <linux/lvm.h>
111  #endif /* LVM */
112 +#include <linux/dm-ioctl.h>
113  
114  #include <scsi/scsi.h>
115  /* Ugly hack. */
116 @@ -4423,6 +4424,22 @@
117  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
118  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
119  COMPATIBLE_IOCTL(NBD_DISCONNECT),
120 +/* device-mapper */
121 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
122 +COMPATIBLE_IOCTL(DM_VERSION),
123 +COMPATIBLE_IOCTL(DM_REMOVE_ALL),
124 +COMPATIBLE_IOCTL(DM_DEV_CREATE),
125 +COMPATIBLE_IOCTL(DM_DEV_REMOVE),
126 +COMPATIBLE_IOCTL(DM_TABLE_LOAD),
127 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
128 +COMPATIBLE_IOCTL(DM_DEV_RENAME),
129 +COMPATIBLE_IOCTL(DM_TABLE_DEPS),
130 +COMPATIBLE_IOCTL(DM_DEV_STATUS),
131 +COMPATIBLE_IOCTL(DM_TABLE_STATUS),
132 +COMPATIBLE_IOCTL(DM_DEV_WAIT),
133 +COMPATIBLE_IOCTL(DM_LIST_DEVICES),
134 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
135 +#endif /* CONFIG_BLK_DEV_DM */
136  /* Remove *PRIVATE in 2.5 */
137  COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
138  COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
139 --- linux-2.4.21/arch/s390x/kernel/ioctl32.c    Fri Jan 10 16:34:26 2003
140 +++ linux/arch/s390x/kernel/ioctl32.c   Wed Aug 20 14:41:29 2003
141 @@ -25,6 +25,7 @@
142  #include <linux/ext2_fs.h>
143  #include <linux/hdreg.h>
144  #include <linux/if_bonding.h>
145 +#include <linux/dm-ioctl.h>
146  #include <asm/types.h>
147  #include <asm/uaccess.h>
148  #include <asm/dasd.h>
149 @@ -507,6 +508,20 @@
150         IOCTL32_DEFAULT(VT_UNLOCKSWITCH),
151  
152         IOCTL32_DEFAULT(SIOCGSTAMP),
153 +
154 +       IOCTL32_DEFAULT(DM_VERSION),
155 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
156 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
157 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
158 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
159 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
160 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
161 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
162 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
163 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
164 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
165 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
166 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
167  
168         IOCTL32_HANDLER(SIOCGIFNAME, dev_ifname32),
169         IOCTL32_HANDLER(SIOCGIFCONF, dev_ifconf),
170 --- linux-2.4.21/arch/sparc64/kernel/ioctl32.c  Fri Jun 13 16:32:34 2003
171 +++ linux/arch/sparc64/kernel/ioctl32.c Wed Aug 20 14:41:29 2003
172 @@ -56,6 +56,7 @@
173  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
174  #include <linux/lvm.h>
175  #endif /* LVM */
176 +#include <linux/dm-ioctl.h>
177  
178  #include <scsi/scsi.h>
179  /* Ugly hack. */
180 @@ -5076,6 +5077,22 @@
181  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
182  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
183  COMPATIBLE_IOCTL(NBD_DISCONNECT)
184 +/* device-mapper */
185 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
186 +COMPATIBLE_IOCTL(DM_VERSION)
187 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
188 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
189 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
190 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
191 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
192 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
193 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
194 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
195 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
196 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
197 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
198 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
199 +#endif /* CONFIG_BLK_DEV_DM */
200  /* Linux-1394 */
201  #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
202  COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
203 --- linux-2.4.21/arch/x86_64/ia32/ia32_ioctl.c  Fri Jun 13 16:32:35 2003
204 +++ linux/arch/x86_64/ia32/ia32_ioctl.c Wed Aug 20 14:41:29 2003
205 @@ -67,6 +67,7 @@
206  #define max max
207  #include <linux/lvm.h>
208  #endif /* LVM */
209 +#include <linux/dm-ioctl.h>
210  
211  #include <scsi/scsi.h>
212  /* Ugly hack. */
213 @@ -4047,6 +4048,22 @@
214  COMPATIBLE_IOCTL(LV_BMAP)
215  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
216  #endif /* LVM */
217 +/* Device-Mapper */
218 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
219 +COMPATIBLE_IOCTL(DM_VERSION)
220 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
221 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
222 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
223 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
224 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
225 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
226 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
227 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
228 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
229 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
230 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
231 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
232 +#endif /* CONFIG_BLK_DEV_DM */
233  #ifdef CONFIG_AUTOFS_FS
234  COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
235  COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
236 --- linux-2.4.21/drivers/md/Config.in   Fri Jan 10 16:34:50 2003
237 +++ linux/drivers/md/Config.in  Wed Aug 20 14:41:36 2003
238 @@ -14,5 +14,7 @@
239  dep_tristate '  Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
240  
241  dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
242 +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
243 +dep_tristate '  Mirror (RAID-1) support' CONFIG_BLK_DEV_DM_MIRROR $CONFIG_BLK_DEV_DM
244  
245  endmenu
246 --- linux-2.4.21/drivers/md/Makefile    Fri Jan 10 16:34:50 2003
247 +++ linux/drivers/md/Makefile   Wed Aug 20 14:41:44 2003
248 @@ -4,24 +4,41 @@
249  
250  O_TARGET       := mddev.o
251  
252 -export-objs    := md.o xor.o
253 -list-multi     := lvm-mod.o
254 +export-objs    := md.o xor.o dm-table.o dm-target.o kcopyd.o dm-daemon.o \
255 +                  dm-log.o dm-io.o dm.o
256 +
257 +list-multi     := lvm-mod.o dm-mod.o dm-mirror-mod.o
258  lvm-mod-objs   := lvm.o lvm-snap.o lvm-fs.o
259 +dm-mod-objs    := dm.o dm-table.o dm-target.o dm-ioctl.o \
260 +                  dm-linear.o dm-stripe.o dm-snapshot.o dm-exception-store.o \
261 +                  kcopyd.o dm-daemon.o dm-io.o
262 +dm-mirror-mod-objs := dm-raid1.o dm-log.o
263  
264  # Note: link order is important.  All raid personalities
265  # and xor.o must come before md.o, as they each initialise 
266  # themselves, and md.o may use the personalities when it 
267  # auto-initialised.
268  
269 -obj-$(CONFIG_MD_LINEAR)                += linear.o
270 -obj-$(CONFIG_MD_RAID0)         += raid0.o
271 -obj-$(CONFIG_MD_RAID1)         += raid1.o
272 -obj-$(CONFIG_MD_RAID5)         += raid5.o xor.o
273 -obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
274 -obj-$(CONFIG_BLK_DEV_MD)       += md.o
275 -obj-$(CONFIG_BLK_DEV_LVM)      += lvm-mod.o
276 +obj-$(CONFIG_MD_LINEAR)                        += linear.o
277 +obj-$(CONFIG_MD_RAID0)                 += raid0.o
278 +obj-$(CONFIG_MD_RAID1)                 += raid1.o
279 +obj-$(CONFIG_MD_RAID5)                 += raid5.o xor.o
280 +obj-$(CONFIG_MD_MULTIPATH)             += multipath.o
281 +obj-$(CONFIG_BLK_DEV_MD)               += md.o
282 +
283 +obj-$(CONFIG_BLK_DEV_LVM)              += lvm-mod.o
284 +
285 +obj-$(CONFIG_BLK_DEV_DM)               += dm-mod.o
286 +obj-$(CONFIG_BLK_DEV_DM_MIRROR)                += dm-mirror.o
287  
288  include $(TOPDIR)/Rules.make
289  
290  lvm-mod.o: $(lvm-mod-objs)
291         $(LD) -r -o $@ $(lvm-mod-objs)
292 +
293 +dm-mod.o: $(dm-mod-objs)
294 +       $(LD) -r -o $@ $(dm-mod-objs)
295 +
296 +dm-mirror.o: $(dm-mirror-mod-objs)
297 +       $(LD) -r -o $@ $(dm-mirror-mod-objs)
298 +
299 --- linux-2.4.21/drivers/md/dm-daemon.c Thu Jan  1 01:00:00 1970
300 +++ linux/drivers/md/dm-daemon.c        Wed Aug 20 14:41:38 2003
301 @@ -0,0 +1,113 @@
302 +/*
303 + * Copyright (C) 2003 Sistina Software
304 + *
305 + * This file is released under the LGPL.
306 + */
307 +
308 +#include "dm.h"
309 +#include "dm-daemon.h"
310 +
311 +#include <linux/module.h>
312 +#include <linux/sched.h>
313 +
314 +static int daemon(void *arg)
315 +{
316 +       struct dm_daemon *dd = (struct dm_daemon *) arg;
317 +       DECLARE_WAITQUEUE(wq, current);
318 +
319 +       daemonize();
320 +       reparent_to_init();
321 +
322 +       /* block all signals */
323 +       spin_lock_irq(&current->sigmask_lock);
324 +       sigfillset(&current->blocked);
325 +       flush_signals(current);
326 +       spin_unlock_irq(&current->sigmask_lock);
327 +
328 +       strcpy(current->comm, dd->name);
329 +       atomic_set(&dd->please_die, 0);
330 +
331 +       add_wait_queue(&dd->job_queue, &wq);
332 +
333 +       down(&dd->run_lock);
334 +       up(&dd->start_lock);
335 +
336 +       /*
337 +        * dd->fn() could do anything, very likely it will
338 +        * suspend.  So we can't set the state to
339 +        * TASK_INTERRUPTIBLE before calling it.  In order to
340 +        * prevent a race with a waking thread we do this little
341 +        * dance with the dd->woken variable.
342 +        */
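       /*
        * Illustrative interleaving (not part of the original code) of the
        * race that the dd->woken check below avoids.  Suppose the waker
        * runs while dd->fn() is still executing, i.e. while this task is
        * still TASK_RUNNING:
        *
        *   worker                        waker
        *   ------                        -----
        *   dd->fn() running
        *                                 atomic_set(&dd->woken, 1);
        *                                 wake_up_interruptible(&dd->job_queue);
        *                                   (no effect: worker isn't sleeping)
        *   set_current_state(TASK_INTERRUPTIBLE);
        *   sees dd->woken == 1, so it loops and runs dd->fn() again
        *   instead of calling schedule(); the wake-up is not lost.
        */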
343 +       while (1) {
344 +               do {
345 +                       set_current_state(TASK_RUNNING);
346 +
347 +                       if (atomic_read(&dd->please_die))
348 +                               goto out;
349 +
350 +                       atomic_set(&dd->woken, 0);
351 +                       dd->fn();
352 +                       yield();
353 +
354 +                       set_current_state(TASK_INTERRUPTIBLE);
355 +               } while (atomic_read(&dd->woken));
356 +
357 +               schedule();
358 +       }
359 +
360 + out:
361 +       remove_wait_queue(&dd->job_queue, &wq);
362 +       up(&dd->run_lock);
363 +       return 0;
364 +}
365 +
366 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
367 +{
368 +       pid_t pid = 0;
369 +
370 +       /*
371 +        * Initialise the dm_daemon.
372 +        */
373 +       dd->fn = fn;
374 +       strncpy(dd->name, name, sizeof(dd->name) - 1);
375 +       sema_init(&dd->start_lock, 1);
376 +       sema_init(&dd->run_lock, 1);
377 +       init_waitqueue_head(&dd->job_queue);
378 +
379 +       /*
380 +        * Start the new thread.
381 +        */
382 +       down(&dd->start_lock);
383 +       pid = kernel_thread(daemon, dd, 0);
384 +       if (pid <= 0) {
385 +               DMERR("Failed to start %s thread", name);
386 +               return -EAGAIN;
387 +       }
388 +
389 +       /*
390 +        * wait for the daemon to up this mutex.
391 +        */
392 +       down(&dd->start_lock);
393 +       up(&dd->start_lock);
394 +
395 +       return 0;
396 +}
397 +
398 +void dm_daemon_stop(struct dm_daemon *dd)
399 +{
400 +       atomic_set(&dd->please_die, 1);
401 +       dm_daemon_wake(dd);
402 +       down(&dd->run_lock);
403 +       up(&dd->run_lock);
404 +}
405 +
406 +void dm_daemon_wake(struct dm_daemon *dd)
407 +{
408 +       atomic_set(&dd->woken, 1);
409 +       wake_up_interruptible(&dd->job_queue);
410 +}
411 +
412 +EXPORT_SYMBOL(dm_daemon_start);
413 +EXPORT_SYMBOL(dm_daemon_stop);
414 +EXPORT_SYMBOL(dm_daemon_wake);
415 --- linux-2.4.21/drivers/md/dm-daemon.h Thu Jan  1 01:00:00 1970
416 +++ linux/drivers/md/dm-daemon.h        Wed Aug 20 14:41:38 2003
417 @@ -0,0 +1,29 @@
418 +/*
419 + * Copyright (C) 2003 Sistina Software
420 + *
421 + * This file is released under the LGPL.
422 + */
423 +
424 +#ifndef DM_DAEMON_H
425 +#define DM_DAEMON_H
426 +
427 +#include <asm/atomic.h>
428 +#include <asm/semaphore.h>
429 +
430 +struct dm_daemon {
431 +       void (*fn)(void);
432 +       char name[16];
433 +       atomic_t please_die;
434 +       struct semaphore start_lock;
435 +       struct semaphore run_lock;
436 +
437 +       atomic_t woken;
438 +       wait_queue_head_t job_queue;
439 +};
440 +
441 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
442 +void dm_daemon_stop(struct dm_daemon *dd);
443 +void dm_daemon_wake(struct dm_daemon *dd);
444 +int dm_daemon_running(struct dm_daemon *dd);
445 +
446 +#endif
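That header is the whole client-facing API for the daemon code.  As a usage illustration (hypothetical module and names, not code from this patch), a driver that wants a worker thread would wire it up roughly like this:

#include "dm.h"
#include "dm-daemon.h"

#include <linux/module.h>
#include <linux/init.h>

static struct dm_daemon _example_daemon;

/* runs in the daemon thread every time someone calls dm_daemon_wake() */
static void example_do_work(void)
{
        /* drain whatever job list the module maintains */
}

static int __init example_init(void)
{
        /* forks a kernel thread named "kexampled" and returns once it runs */
        return dm_daemon_start(&_example_daemon, "kexampled", example_do_work);
}

static void __exit example_exit(void)
{
        /* asks the thread to exit and waits until it has done so */
        dm_daemon_stop(&_example_daemon);
}

module_init(example_init);
module_exit(example_exit);

Producers simply call dm_daemon_wake(&_example_daemon) after queueing work for example_do_work() to pick up.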
447 --- linux-2.4.21/drivers/md/dm-exception-store.c        Thu Jan  1 01:00:00 1970
448 +++ linux/drivers/md/dm-exception-store.c       Wed Aug 20 14:41:38 2003
449 @@ -0,0 +1,673 @@
450 +/*
451 + * dm-snapshot.c
452 + *
453 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
454 + *
455 + * This file is released under the GPL.
456 + */
457 +
458 +#include "dm-snapshot.h"
459 +#include "dm-io.h"
460 +#include "kcopyd.h"
461 +
462 +#include <linux/mm.h>
463 +#include <linux/pagemap.h>
464 +#include <linux/vmalloc.h>
465 +#include <linux/slab.h>
466 +
467 +/*-----------------------------------------------------------------
468 + * Persistent snapshots, by persistent we mean that the snapshot
469 + * will survive a reboot.
470 + *---------------------------------------------------------------*/
471 +
472 +/*
473 + * We need to store a record of which parts of the origin have
474 + * been copied to the snapshot device.  The snapshot code
475 + * requires that we copy exception chunks to chunk aligned areas
476 + * of the COW store.  It makes sense therefore, to store the
477 + * metadata in chunk size blocks.
478 + *
479 + * There is no backward or forward compatibility implemented,
480 + * snapshots with different disk versions than the kernel will
481 + * not be usable.  It is expected that "lvcreate" will blank out
482 + * the start of a fresh COW device before calling the snapshot
483 + * constructor.
484 + *
485 + * The first chunk of the COW device just contains the header.
486 + * After this there is a chunk filled with exception metadata,
487 + * followed by as many exception chunks as can fit in the
488 + * metadata areas.
489 + *
490 + * All on disk structures are in little-endian format.  The end
491 + * of the exceptions info is indicated by an exception with a
492 + * new_chunk of 0, which is invalid since it would point to the
493 + * header chunk.
494 + */
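/*
 * Worked example of the layout described above (illustrative numbers, not
 * taken from the patch): with a 16-sector (8 KiB) chunk and the 16-byte
 * struct disk_exception defined below,
 *
 *   exceptions_per_area = 8192 / 16 = 512
 *
 *   chunk 0           header (struct disk_header)
 *   chunk 1           metadata area 0
 *   chunks 2..513     the 512 chunk copies that area 0 describes
 *   chunk 514         metadata area 1   (= 1 + (512 + 1) * 1, see area_io())
 *   ...
 *
 * This is why next_free starts at 2 and why persistent_prepare() skips any
 * chunk whose index is 1 modulo (exceptions_per_area + 1).
 */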
495 +
496 +/*
497 + * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
498 + */
499 +#define SNAP_MAGIC 0x70416e53
500 +
501 +/*
502 + * The on-disk version of the metadata.
503 + */
504 +#define SNAPSHOT_DISK_VERSION 1
505 +
506 +struct disk_header {
507 +       uint32_t magic;
508 +
509 +       /*
510 +        * Is this snapshot valid.  There is no way of recovering
511 +        * an invalid snapshot.
512 +        */
513 +       uint32_t valid;
514 +
515 +       /*
516 +        * Simple, incrementing version. no backward
517 +        * compatibility.
518 +        */
519 +       uint32_t version;
520 +
521 +       /* In sectors */
522 +       uint32_t chunk_size;
523 +};
524 +
525 +struct disk_exception {
526 +       uint64_t old_chunk;
527 +       uint64_t new_chunk;
528 +};
529 +
530 +struct commit_callback {
531 +       void (*callback)(void *, int success);
532 +       void *context;
533 +};
534 +
535 +/*
536 + * The top level structure for a persistent exception store.
537 + */
538 +struct pstore {
539 +       struct dm_snapshot *snap;       /* up pointer to my snapshot */
540 +       int version;
541 +       int valid;
542 +       uint32_t chunk_size;
543 +       uint32_t exceptions_per_area;
544 +
545 +       /*
546 +        * Now that we have an asynchronous kcopyd there is no
547 +        * need for large chunk sizes, so it won't hurt to have a
548 +        * whole chunk's worth of metadata in memory at once.
549 +        */
550 +       void *area;
551 +
552 +       /*
553 +        * Used to keep track of which metadata area the data in
554 +        * 'chunk' refers to.
555 +        */
556 +       uint32_t current_area;
557 +
558 +       /*
559 +        * The next free chunk for an exception.
560 +        */
561 +       uint32_t next_free;
562 +
563 +       /*
564 +        * The index of next free exception in the current
565 +        * metadata area.
566 +        */
567 +       uint32_t current_committed;
568 +
569 +       atomic_t pending_count;
570 +       uint32_t callback_count;
571 +       struct commit_callback *callbacks;
572 +};
573 +
574 +static inline unsigned int sectors_to_pages(unsigned int sectors)
575 +{
576 +       return sectors / (PAGE_SIZE / SECTOR_SIZE);
577 +}
578 +
579 +static int alloc_area(struct pstore *ps)
580 +{
581 +       int r = -ENOMEM;
582 +       size_t i, len, nr_pages;
583 +       struct page *page, *last = NULL;
584 +
585 +       len = ps->chunk_size << SECTOR_SHIFT;
586 +
587 +       /*
588 +        * Allocate the chunk_size block of memory that will hold
589 +        * a single metadata area.
590 +        */
591 +       ps->area = vmalloc(len);
592 +       if (!ps->area)
593 +               return r;
594 +
595 +       nr_pages = sectors_to_pages(ps->chunk_size);
596 +
597 +       /*
598 +        * We lock the pages for ps->area into memory since
599 +        * they'll be doing a lot of io.  We also chain them
600 +        * together ready for dm-io.
601 +        */
602 +       for (i = 0; i < nr_pages; i++) {
603 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
604 +               LockPage(page);
605 +               if (last)
606 +                       last->list.next = &page->list;
607 +               last = page;
608 +       }
609 +
610 +       return 0;
611 +}
612 +
613 +static void free_area(struct pstore *ps)
614 +{
615 +       size_t i, nr_pages;
616 +       struct page *page;
617 +
618 +       nr_pages = sectors_to_pages(ps->chunk_size);
619 +       for (i = 0; i < nr_pages; i++) {
620 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
621 +               page->list.next = NULL;
622 +               UnlockPage(page);
623 +       }
624 +
625 +       vfree(ps->area);
626 +}
627 +
628 +/*
629 + * Read or write a chunk aligned and sized block of data from a device.
630 + */
631 +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
632 +{
633 +       struct io_region where;
634 +       unsigned int bits;
635 +
636 +       where.dev = ps->snap->cow->dev;
637 +       where.sector = ps->chunk_size * chunk;
638 +       where.count = ps->chunk_size;
639 +
640 +       return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
641 +}
642 +
643 +/*
644 + * Read or write a metadata area.  Remembering to skip the first
645 + * chunk which holds the header.
646 + */
647 +static int area_io(struct pstore *ps, uint32_t area, int rw)
648 +{
649 +       int r;
650 +       uint32_t chunk;
651 +
652 +       /* convert a metadata area index to a chunk index */
653 +       chunk = 1 + ((ps->exceptions_per_area + 1) * area);
654 +
655 +       r = chunk_io(ps, chunk, rw);
656 +       if (r)
657 +               return r;
658 +
659 +       ps->current_area = area;
660 +       return 0;
661 +}
662 +
663 +static int zero_area(struct pstore *ps, uint32_t area)
664 +{
665 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
666 +       return area_io(ps, area, WRITE);
667 +}
668 +
669 +static int read_header(struct pstore *ps, int *new_snapshot)
670 +{
671 +       int r;
672 +       struct disk_header *dh;
673 +
674 +       r = chunk_io(ps, 0, READ);
675 +       if (r)
676 +               return r;
677 +
678 +       dh = (struct disk_header *) ps->area;
679 +
680 +       if (le32_to_cpu(dh->magic) == 0) {
681 +               *new_snapshot = 1;
682 +
683 +       } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
684 +               *new_snapshot = 0;
685 +               ps->valid = le32_to_cpu(dh->valid);
686 +               ps->version = le32_to_cpu(dh->version);
687 +               ps->chunk_size = le32_to_cpu(dh->chunk_size);
688 +
689 +       } else {
690 +               DMWARN("Invalid/corrupt snapshot");
691 +               r = -ENXIO;
692 +       }
693 +
694 +       return r;
695 +}
696 +
697 +static int write_header(struct pstore *ps)
698 +{
699 +       struct disk_header *dh;
700 +
701 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
702 +
703 +       dh = (struct disk_header *) ps->area;
704 +       dh->magic = cpu_to_le32(SNAP_MAGIC);
705 +       dh->valid = cpu_to_le32(ps->valid);
706 +       dh->version = cpu_to_le32(ps->version);
707 +       dh->chunk_size = cpu_to_le32(ps->chunk_size);
708 +
709 +       return chunk_io(ps, 0, WRITE);
710 +}
711 +
712 +/*
713 + * Access functions for the disk exceptions, these do the endian conversions.
714 + */
715 +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
716 +{
717 +       if (index >= ps->exceptions_per_area)
718 +               return NULL;
719 +
720 +       return ((struct disk_exception *) ps->area) + index;
721 +}
722 +
723 +static int read_exception(struct pstore *ps,
724 +                         uint32_t index, struct disk_exception *result)
725 +{
726 +       struct disk_exception *e;
727 +
728 +       e = get_exception(ps, index);
729 +       if (!e)
730 +               return -EINVAL;
731 +
732 +       /* copy it */
733 +       result->old_chunk = le64_to_cpu(e->old_chunk);
734 +       result->new_chunk = le64_to_cpu(e->new_chunk);
735 +
736 +       return 0;
737 +}
738 +
739 +static int write_exception(struct pstore *ps,
740 +                          uint32_t index, struct disk_exception *de)
741 +{
742 +       struct disk_exception *e;
743 +
744 +       e = get_exception(ps, index);
745 +       if (!e)
746 +               return -EINVAL;
747 +
748 +       /* copy it */
749 +       e->old_chunk = cpu_to_le64(de->old_chunk);
750 +       e->new_chunk = cpu_to_le64(de->new_chunk);
751 +
752 +       return 0;
753 +}
754 +
755 +/*
756 + * Registers the exceptions that are present in the current area.
757 + * 'full' is filled in to indicate if the area has been
758 + * filled.
759 + */
760 +static int insert_exceptions(struct pstore *ps, int *full)
761 +{
762 +       int r;
763 +       unsigned int i;
764 +       struct disk_exception de;
765 +
766 +       /* presume the area is full */
767 +       *full = 1;
768 +
769 +       for (i = 0; i < ps->exceptions_per_area; i++) {
770 +               r = read_exception(ps, i, &de);
771 +
772 +               if (r)
773 +                       return r;
774 +
775 +               /*
776 +                * If the new_chunk is pointing at the start of
777 +                * the COW device, where the first metadata area
778 +                * is, we know that we've hit the end of the
779 +                * exceptions.  Therefore the area is not full.
780 +                */
781 +               if (de.new_chunk == 0LL) {
782 +                       ps->current_committed = i;
783 +                       *full = 0;
784 +                       break;
785 +               }
786 +
787 +               /*
788 +                * Keep track of the start of the free chunks.
789 +                */
790 +               if (ps->next_free <= de.new_chunk)
791 +                       ps->next_free = de.new_chunk + 1;
792 +
793 +               /*
794 +                * Otherwise we add the exception to the snapshot.
795 +                */
796 +               r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
797 +               if (r)
798 +                       return r;
799 +       }
800 +
801 +       return 0;
802 +}
803 +
804 +static int read_exceptions(struct pstore *ps)
805 +{
806 +       uint32_t area;
807 +       int r, full = 1;
808 +
809 +       /*
810 +        * Keep reading chunks and inserting exceptions until
811 +        * we find a partially full area.
812 +        */
813 +       for (area = 0; full; area++) {
814 +               r = area_io(ps, area, READ);
815 +               if (r)
816 +                       return r;
817 +
818 +               r = insert_exceptions(ps, &full);
819 +               if (r)
820 +                       return r;
821 +       }
822 +
823 +       return 0;
824 +}
825 +
826 +static inline struct pstore *get_info(struct exception_store *store)
827 +{
828 +       return (struct pstore *) store->context;
829 +}
830 +
831 +static void persistent_fraction_full(struct exception_store *store,
832 +                                    sector_t *numerator, sector_t *denominator)
833 +{
834 +       *numerator = get_info(store)->next_free * store->snap->chunk_size;
835 +       *denominator = get_dev_size(store->snap->cow->dev);
836 +}
837 +
838 +static void persistent_destroy(struct exception_store *store)
839 +{
840 +       struct pstore *ps = get_info(store);
841 +
842 +       dm_io_put(sectors_to_pages(ps->chunk_size));
843 +       vfree(ps->callbacks);
844 +       free_area(ps);
845 +       kfree(ps);
846 +}
847 +
848 +static int persistent_read_metadata(struct exception_store *store)
849 +{
850 +       int r, new_snapshot;
851 +       struct pstore *ps = get_info(store);
852 +
853 +       /*
854 +        * Read the snapshot header.
855 +        */
856 +       r = read_header(ps, &new_snapshot);
857 +       if (r)
858 +               return r;
859 +
860 +       /*
861 +        * Do we need to setup a new snapshot ?
862 +        */
863 +       if (new_snapshot) {
864 +               r = write_header(ps);
865 +               if (r) {
866 +                       DMWARN("write_header failed");
867 +                       return r;
868 +               }
869 +
870 +               r = zero_area(ps, 0);
871 +               if (r) {
872 +                       DMWARN("zero_area(0) failed");
873 +                       return r;
874 +               }
875 +
876 +       } else {
877 +               /*
878 +                * Sanity checks.
879 +                */
880 +               if (!ps->valid) {
881 +                       DMWARN("snapshot is marked invalid");
882 +                       return -EINVAL;
883 +               }
884 +
885 +               if (ps->version != SNAPSHOT_DISK_VERSION) {
886 +                       DMWARN("unable to handle snapshot disk version %d",
887 +                              ps->version);
888 +                       return -EINVAL;
889 +               }
890 +
891 +               /*
892 +                * Read the metadata.
893 +                */
894 +               r = read_exceptions(ps);
895 +               if (r)
896 +                       return r;
897 +       }
898 +
899 +       return 0;
900 +}
901 +
902 +static int persistent_prepare(struct exception_store *store,
903 +                             struct exception *e)
904 +{
905 +       struct pstore *ps = get_info(store);
906 +       uint32_t stride;
907 +       sector_t size = get_dev_size(store->snap->cow->dev);
908 +
909 +       /* Is there enough room ? */
910 +       if (size < ((ps->next_free + 1) * store->snap->chunk_size))
911 +               return -ENOSPC;
912 +
913 +       e->new_chunk = ps->next_free;
914 +
915 +       /*
916 +        * Move onto the next free pending, making sure to take
917 +        * into account the location of the metadata chunks.
918 +        */
919 +       stride = (ps->exceptions_per_area + 1);
920 +       if ((++ps->next_free % stride) == 1)
921 +               ps->next_free++;
922 +
923 +       atomic_inc(&ps->pending_count);
924 +       return 0;
925 +}
926 +
927 +static void persistent_commit(struct exception_store *store,
928 +                             struct exception *e,
929 +                             void (*callback) (void *, int success),
930 +                             void *callback_context)
931 +{
932 +       int r;
933 +       unsigned int i;
934 +       struct pstore *ps = get_info(store);
935 +       struct disk_exception de;
936 +       struct commit_callback *cb;
937 +
938 +       de.old_chunk = e->old_chunk;
939 +       de.new_chunk = e->new_chunk;
940 +       write_exception(ps, ps->current_committed++, &de);
941 +
942 +       /*
943 +        * Add the callback to the back of the array.  This code
944 +        * is the only place where the callback array is
945 +        * manipulated, and we know that it will never be called
946 +        * multiple times concurrently.
947 +        */
948 +       cb = ps->callbacks + ps->callback_count++;
949 +       cb->callback = callback;
950 +       cb->context = callback_context;
951 +
952 +       /*
953 +        * If there are no more exceptions in flight, or we have
954 +        * filled this metadata area we commit the exceptions to
955 +        * disk.
956 +        */
957 +       if (atomic_dec_and_test(&ps->pending_count) ||
958 +           (ps->current_committed == ps->exceptions_per_area)) {
959 +               r = area_io(ps, ps->current_area, WRITE);
960 +               if (r)
961 +                       ps->valid = 0;
962 +
963 +               for (i = 0; i < ps->callback_count; i++) {
964 +                       cb = ps->callbacks + i;
965 +                       cb->callback(cb->context, r == 0 ? 1 : 0);
966 +               }
967 +
968 +               ps->callback_count = 0;
969 +       }
970 +
971 +       /*
972 +        * Have we completely filled the current area ?
973 +        */
974 +       if (ps->current_committed == ps->exceptions_per_area) {
975 +               ps->current_committed = 0;
976 +               r = zero_area(ps, ps->current_area + 1);
977 +               if (r)
978 +                       ps->valid = 0;
979 +       }
980 +}
981 +
982 +static void persistent_drop(struct exception_store *store)
983 +{
984 +       struct pstore *ps = get_info(store);
985 +
986 +       ps->valid = 0;
987 +       if (write_header(ps))
988 +               DMWARN("write header failed");
989 +}
990 +
991 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
992 +{
993 +       int r;
994 +       struct pstore *ps;
995 +
996 +       r = dm_io_get(sectors_to_pages(chunk_size));
997 +       if (r)
998 +               return r;
999 +
1000 +       /* allocate the pstore */
1001 +       ps = kmalloc(sizeof(*ps), GFP_KERNEL);
1002 +       if (!ps) {
1003 +               r = -ENOMEM;
1004 +               goto bad;
1005 +       }
1006 +
1007 +       ps->snap = store->snap;
1008 +       ps->valid = 1;
1009 +       ps->version = SNAPSHOT_DISK_VERSION;
1010 +       ps->chunk_size = chunk_size;
1011 +       ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
1012 +           sizeof(struct disk_exception);
1013 +       ps->next_free = 2;      /* skipping the header and first area */
1014 +       ps->current_committed = 0;
1015 +
1016 +       r = alloc_area(ps);
1017 +       if (r)
1018 +               goto bad;
1019 +
1020 +       /*
1021 +        * Allocate space for all the callbacks.
1022 +        */
1023 +       ps->callback_count = 0;
1024 +       atomic_set(&ps->pending_count, 0);
1025 +       ps->callbacks = vcalloc(ps->exceptions_per_area,
1026 +                               sizeof(*ps->callbacks));
1027 +
1028 +       if (!ps->callbacks) {
1029 +               r = -ENOMEM;
1030 +               goto bad;
1031 +       }
1032 +
1033 +       store->destroy = persistent_destroy;
1034 +       store->read_metadata = persistent_read_metadata;
1035 +       store->prepare_exception = persistent_prepare;
1036 +       store->commit_exception = persistent_commit;
1037 +       store->drop_snapshot = persistent_drop;
1038 +       store->fraction_full = persistent_fraction_full;
1039 +       store->context = ps;
1040 +
1041 +       return 0;
1042 +
1043 +      bad:
1044 +       dm_io_put(sectors_to_pages(chunk_size));
1045 +       if (ps) {
1046 +               if (ps->callbacks)
1047 +                       vfree(ps->callbacks);
1048 +
1049 +               kfree(ps);
1050 +       }
1051 +       return r;
1052 +}
1053 +
1054 +/*-----------------------------------------------------------------
1055 + * Implementation of the store for non-persistent snapshots.
1056 + *---------------------------------------------------------------*/
1057 +struct transient_c {
1058 +       sector_t next_free;
1059 +};
1060 +
1061 +void transient_destroy(struct exception_store *store)
1062 +{
1063 +       kfree(store->context);
1064 +}
1065 +
1066 +int transient_read_metadata(struct exception_store *store)
1067 +{
1068 +       return 0;
1069 +}
1070 +
1071 +int transient_prepare(struct exception_store *store, struct exception *e)
1072 +{
1073 +       struct transient_c *tc = (struct transient_c *) store->context;
1074 +       sector_t size = get_dev_size(store->snap->cow->dev);
1075 +
1076 +       if (size < (tc->next_free + store->snap->chunk_size))
1077 +               return -1;
1078 +
1079 +       e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
1080 +       tc->next_free += store->snap->chunk_size;
1081 +
1082 +       return 0;
1083 +}
1084 +
1085 +void transient_commit(struct exception_store *store,
1086 +                     struct exception *e,
1087 +                     void (*callback) (void *, int success),
1088 +                     void *callback_context)
1089 +{
1090 +       /* Just succeed */
1091 +       callback(callback_context, 1);
1092 +}
1093 +
1094 +static void transient_fraction_full(struct exception_store *store,
1095 +                                   sector_t *numerator, sector_t *denominator)
1096 +{
1097 +       *numerator = ((struct transient_c *) store->context)->next_free;
1098 +       *denominator = get_dev_size(store->snap->cow->dev);
1099 +}
1100 +
1101 +int dm_create_transient(struct exception_store *store,
1102 +                       struct dm_snapshot *s, int blocksize)
1103 +{
1104 +       struct transient_c *tc;
1105 +
1106 +       memset(store, 0, sizeof(*store));
1107 +       store->destroy = transient_destroy;
1108 +       store->read_metadata = transient_read_metadata;
1109 +       store->prepare_exception = transient_prepare;
1110 +       store->commit_exception = transient_commit;
1111 +       store->fraction_full = transient_fraction_full;
1112 +       store->snap = s;
1113 +
1114 +       tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
1115 +       if (!tc)
1116 +               return -ENOMEM;
1117 +
1118 +       tc->next_free = 0;
1119 +       store->context = tc;
1120 +
1121 +       return 0;
1122 +}
1123 --- linux-2.4.21/drivers/md/dm-io.c     Thu Jan  1 01:00:00 1970
1124 +++ linux/drivers/md/dm-io.c    Wed Aug 20 14:41:38 2003
1125 @@ -0,0 +1,344 @@
1126 +/*
1127 + * Copyright (C) 2003 Sistina Software
1128 + *
1129 + * This file is released under the GPL.
1130 + */
1131 +
1132 +#include "dm-io.h"
1133 +
1134 +#include <linux/mempool.h>
1135 +#include <linux/module.h>
1136 +#include <linux/slab.h>
1137 +#include <linux/sched.h>
1138 +
1139 +/* FIXME: can we shrink this ? */
1140 +struct io_context {
1141 +       int rw;
1142 +       unsigned int error;
1143 +       atomic_t count;
1144 +       struct task_struct *sleeper;
1145 +       io_notify_fn callback;
1146 +       void *context;
1147 +};
1148 +
1149 +/*
1150 + * We maintain a pool of buffer heads for dispatching the io.
1151 + */
1152 +static unsigned int _num_bhs;
1153 +static mempool_t *_buffer_pool;
1154 +
1155 +/*
1156 + * io contexts are only dynamically allocated for asynchronous
1157 + * io.  Since async io is likely to be the majority of io we'll
1158 + * have the same number of io contexts as buffer heads ! (FIXME:
1159 + * must reduce this).
1160 + */
1161 +mempool_t *_io_pool;
1162 +
1163 +static void *alloc_bh(int gfp_mask, void *pool_data)
1164 +{
1165 +       struct buffer_head *bh;
1166 +
1167 +       bh = kmem_cache_alloc(bh_cachep, gfp_mask);
1168 +       if (bh) {
1169 +               bh->b_reqnext = NULL;
1170 +               init_waitqueue_head(&bh->b_wait);
1171 +               INIT_LIST_HEAD(&bh->b_inode_buffers);
1172 +       }
1173 +
1174 +       return bh;
1175 +}
1176 +
1177 +static void *alloc_io(int gfp_mask, void *pool_data)
1178 +{
1179 +       return kmalloc(sizeof(struct io_context), gfp_mask);
1180 +}
1181 +
1182 +static void free_io(void *element, void *pool_data)
1183 +{
1184 +       kfree(element);
1185 +}
1186 +
1187 +static unsigned int pages_to_buffers(unsigned int pages)
1188 +{
1189 +       return 4 * pages;       /* too many ? */
1190 +}
1191 +
1192 +static int resize_pool(unsigned int new_bhs)
1193 +{
1194 +       int r = 0;
1195 +
1196 +       if (_buffer_pool) {
1197 +               if (new_bhs == 0) {
1198 +                       /* free off the pools */
1199 +                       mempool_destroy(_buffer_pool);
1200 +                       mempool_destroy(_io_pool);
1201 +                       _buffer_pool = _io_pool = NULL;
1202 +               } else {
1203 +                       /* resize the pools */
1204 +                       r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
1205 +                       if (!r)
1206 +                               r = mempool_resize(_io_pool,
1207 +                                                  new_bhs, GFP_KERNEL);
1208 +               }
1209 +       } else {
1210 +               /* create new pools */
1211 +               _buffer_pool = mempool_create(new_bhs, alloc_bh,
1212 +                                             mempool_free_slab, bh_cachep);
1213 +               if (!_buffer_pool)
1214 +                       r = -ENOMEM;
1215 +
1216 +               _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
1217 +               if (!_io_pool) {
1218 +                       mempool_destroy(_buffer_pool);
1219 +                       _buffer_pool = NULL;
1220 +                       r = -ENOMEM;
1221 +               }
1222 +       }
1223 +
1224 +       if (!r)
1225 +               _num_bhs = new_bhs;
1226 +
1227 +       return r;
1228 +}
1229 +
1230 +int dm_io_get(unsigned int num_pages)
1231 +{
1232 +       return resize_pool(_num_bhs + pages_to_buffers(num_pages));
1233 +}
1234 +
1235 +void dm_io_put(unsigned int num_pages)
1236 +{
1237 +       resize_pool(_num_bhs - pages_to_buffers(num_pages));
1238 +}
1239 +
1240 +/*-----------------------------------------------------------------
1241 + * We need to keep track of which region a buffer is doing io
1242 + * for.  In order to save a memory allocation we store this in an
1243 + * unused field of the buffer head, and provide these access
1244 + * functions.
1245 + *
1246 + * FIXME: add compile time check that an unsigned int can fit
1247 + * into a pointer.
1248 + *
1249 + *---------------------------------------------------------------*/
1250 +static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
1251 +{
1252 +       bh->b_journal_head = (void *) region;
1253 +}
1254 +
1255 +static inline int bh_get_region(struct buffer_head *bh)
1256 +{
1257 +       return (unsigned int) bh->b_journal_head;
1258 +}
1259 +
1260 +/*-----------------------------------------------------------------
1261 + * We need an io object to keep track of the number of bhs that
1262 + * have been dispatched for a particular io.
1263 + *---------------------------------------------------------------*/
1264 +static void dec_count(struct io_context *io, unsigned int region, int error)
1265 +{
1266 +       if (error)
1267 +               set_bit(region, &io->error);
1268 +
1269 +       if (atomic_dec_and_test(&io->count)) {
1270 +               if (io->sleeper)
1271 +                       wake_up_process(io->sleeper);
1272 +
1273 +               else {
1274 +                       int r = io->error;
1275 +                       io_notify_fn fn = io->callback;
1276 +                       void *context = io->context;
1277 +
1278 +                       mempool_free(io, _io_pool);
1279 +                       fn(r, context);
1280 +               }
1281 +       }
1282 +}
1283 +
1284 +static void endio(struct buffer_head *bh, int uptodate)
1285 +{
1286 +       struct io_context *io = (struct io_context *) bh->b_private;
1287 +
1288 +       if (!uptodate && io->rw != WRITE) {
1289 +               /*
1290 +                * We need to zero this region, otherwise people
1291 +                * like kcopyd may write the arbitrary contents
1292 +                * of the page.
1293 +                */
1294 +               memset(bh->b_data, 0, bh->b_size);
1295 +       }
1296 +
1297 +       dec_count((struct io_context *) bh->b_private,
1298 +                 bh_get_region(bh), !uptodate);
1299 +       mempool_free(bh, _buffer_pool);
1300 +}
1301 +
1302 +/*
1303 + * Primitives for alignment calculations.
1304 + */
1305 +int fls(unsigned n)
1306 +{
1307 +       return generic_fls32(n);
1308 +}
1309 +
1310 +static inline int log2_floor(unsigned n)
1311 +{
1312 +       return ffs(n) - 1;
1313 +}
1314 +
1315 +static inline int log2_align(unsigned n)
1316 +{
1317 +       return fls(n) - 1;
1318 +}
1319 +
1320 +/*
1321 + * Returns the next block for io.
1322 + */
1323 +static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
1324 +                  unsigned int block_size,
1325 +                  struct page *p, unsigned int offset,
1326 +                  unsigned int region, struct io_context *io)
1327 +{
1328 +       struct buffer_head *bh;
1329 +       sector_t b = *block;
1330 +       sector_t blocks_per_page = PAGE_SIZE / block_size;
1331 +       unsigned int this_size; /* holds the size of the current io */
1332 +       unsigned int len;
1333 +
1334 +       while ((offset < PAGE_SIZE) && (b != end_block)) {
1335 +               bh = mempool_alloc(_buffer_pool, GFP_NOIO);
1336 +               init_buffer(bh, endio, io);
1337 +               bh_set_region(bh, region);
1338 +
1339 +               /*
1340 +                * Block size must be a power of 2 and aligned
1341 +                * correctly.
1342 +                */
1343 +               len = end_block - b;
1344 +               this_size = min((sector_t) 1 << log2_floor(b), blocks_per_page);
1345 +               if (this_size > len)
1346 +                       this_size = 1 << log2_align(len);
1347 +
1348 +               /*
1349 +                * Add in the job offset.
1350 +                */
1351 +               bh->b_blocknr = (b / this_size);
1352 +               bh->b_size = block_size * this_size;
1353 +               set_bh_page(bh, p, offset);
1354 +               bh->b_this_page = bh;
1355 +
1356 +               bh->b_dev = dev;
1357 +               atomic_set(&bh->b_count, 1);
1358 +
1359 +               bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
1360 +                              (1 << BH_Lock));
1361 +
1362 +               if (io->rw == WRITE)
1363 +                       clear_bit(BH_Dirty, &bh->b_state);
1364 +
1365 +               atomic_inc(&io->count);
1366 +               submit_bh(io->rw, bh);
1367 +
1368 +               b += this_size;
1369 +               offset += block_size * this_size;
1370 +       }
1371 +
1372 +       *block = b;
1373 +       return (b == end_block);
1374 +}
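/*
 * Worked example of the sizing logic in do_page() above (illustrative,
 * assuming 512-byte hard sectors and 4 KiB pages, so blocks_per_page = 8):
 * for b = 6 (binary 110) with plenty of blocks left, log2_floor(6) = 1 and
 * this_size = min(2, 8) = 2, so the buffer head covers blocks 6-7 with
 * b_blocknr = 6 / 2 = 3 and b_size = 1024; i.e. each submission is the
 * largest power-of-two extent that keeps b aligned to its own size.
 */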
1375 +
1376 +static void do_region(unsigned int region, struct io_region *where,
1377 +                     struct page *page, unsigned int offset,
1378 +                     struct io_context *io)
1379 +{
1380 +       unsigned int block_size = get_hardsect_size(where->dev);
1381 +       unsigned int sblock_size = block_size >> 9;
1382 +       sector_t block = where->sector / sblock_size;
1383 +       sector_t end_block = (where->sector + where->count) / sblock_size;
1384 +
1385 +       while (1) {
1386 +               if (do_page(where->dev, &block, end_block, block_size,
1387 +                           page, offset, region, io))
1388 +                       break;
1389 +
1390 +               offset = 0;     /* only offset the first page */
1391 +
1392 +               page = list_entry(page->list.next, struct page, list);
1393 +       }
1394 +}
1395 +
1396 +static void dispatch_io(unsigned int num_regions, struct io_region *where,
1397 +                       struct page *pages, unsigned int offset,
1398 +                       struct io_context *io)
1399 +{
1400 +       int i;
1401 +
1402 +       for (i = 0; i < num_regions; i++)
1403 +               if (where[i].count)
1404 +                       do_region(i, where + i, pages, offset, io);
1405 +
1406 +       /*
1407 +        * Drop the extra reference that we were holding to avoid
1408 +        * the io being completed too early.
1409 +        */
1410 +       dec_count(io, 0, 0);
1411 +}
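/*
 * Illustration of the completion-count pattern used above (descriptive
 * comment only): io->count starts at 1 so the io cannot appear complete
 * while buffer heads are still being submitted.
 *
 *   count = 1                      initial reference held by dispatch_io()
 *   submit bh A   ->  count = 2
 *   submit bh B   ->  count = 3
 *   dec_count()   ->  count = 2    dispatch_io() drops its reference
 *   endio(A)      ->  count = 1
 *   endio(B)      ->  count = 0    wake the sleeper or run the callback
 */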
1412 +
1413 +/*
1414 + * Synchronous io
1415 + */
1416 +int dm_io_sync(unsigned int num_regions, struct io_region *where,
1417 +              int rw, struct page *pages, unsigned int offset,
1418 +              unsigned int *error_bits)
1419 +{
1420 +       struct io_context io;
1421 +
1422 +       BUG_ON(num_regions > 1 && rw != WRITE);
1423 +
1424 +       io.rw = rw;
1425 +       io.error = 0;
1426 +       atomic_set(&io.count, 1); /* see dispatch_io() */
1427 +       io.sleeper = current;
1428 +
1429 +       dispatch_io(num_regions, where, pages, offset, &io);
1430 +       run_task_queue(&tq_disk);
1431 +
1432 +       while (1) {
1433 +               set_current_state(TASK_UNINTERRUPTIBLE);
1434 +
1435 +               if (!atomic_read(&io.count))
1436 +                       break;
1437 +
1438 +               schedule();
1439 +       }
1440 +       set_current_state(TASK_RUNNING);
1441 +
1442 +       *error_bits = io.error;
1443 +       return io.error ? -EIO : 0;
1444 +}
1445 +
1446 +/*
1447 + * Asynchronous io
1448 + */
1449 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1450 +               struct page *pages, unsigned int offset,
1451 +               io_notify_fn fn, void *context)
1452 +{
1453 +       struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
1454 +
1455 +       io->rw = rw;
1456 +       io->error = 0;
1457 +       atomic_set(&io->count, 1); /* see dispatch_io() */
1458 +       io->sleeper = NULL;
1459 +       io->callback = fn;
1460 +       io->context = context;
1461 +
1462 +       dispatch_io(num_regions, where, pages, offset, io);
1463 +       return 0;
1464 +}
1465 +
1466 +EXPORT_SYMBOL(dm_io_get);
1467 +EXPORT_SYMBOL(dm_io_put);
1468 +EXPORT_SYMBOL(dm_io_sync);
1469 +EXPORT_SYMBOL(dm_io_async);
1470 --- linux-2.4.21/drivers/md/dm-io.h     Thu Jan  1 01:00:00 1970
1471 +++ linux/drivers/md/dm-io.h    Wed Aug 20 14:41:38 2003
1472 @@ -0,0 +1,86 @@
1473 +/*
1474 + * Copyright (C) 2003 Sistina Software
1475 + *
1476 + * This file is released under the GPL.
1477 + */
1478 +
1479 +#ifndef _DM_IO_H
1480 +#define _DM_IO_H
1481 +
1482 +#include "dm.h"
1483 +
1484 +#include <linux/list.h>
1485 +
1486 +/* Move these to bitops.h eventually */
1487 +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
1488 +/* (c) 2002, D.Phillips and Sistina Software */
1489 +/* Licensed under Version 2 of the GPL */
1490 +
1491 +static unsigned generic_fls8(unsigned n)
1492 +{
1493 +       return n & 0xf0 ?
1494 +           n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5:
1495 +           n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
1496 +}
1497 +
1498 +static inline unsigned generic_fls16(unsigned n)
1499 +{
1500 +       return  n & 0xff00? generic_fls8(n >> 8) + 8 : generic_fls8(n);
1501 +}
1502 +
1503 +static inline unsigned generic_fls32(unsigned n)
1504 +{
1505 +       return  n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
1506 +}
1507 +
1508 +/* FIXME make this configurable */
1509 +#define DM_MAX_IO_REGIONS 8
1510 +
1511 +struct io_region {
1512 +       kdev_t dev;
1513 +       sector_t sector;
1514 +       sector_t count;
1515 +};
1516 +
1517 +
1518 +/*
1519 + * 'error' is a bitset, with each bit indicating whether an error
1520 + * occurred doing io to the corresponding region.
1521 + */
1522 +typedef void (*io_notify_fn)(unsigned int error, void *context);
1523 +
1524 +
1525 +/*
1526 + * Before anyone uses the IO interface they should call
1527 + * dm_io_get(), specifying roughly how many pages they are
1528 + * expecting to perform io on concurrently.
1529 + *
1530 + * This function may block.
1531 + */
1532 +int dm_io_get(unsigned int num_pages);
1533 +void dm_io_put(unsigned int num_pages);
1534 +
1535 +
1536 +/*
1537 + * Synchronous IO.
1538 + *
1539 + * Please ensure that the rw flag in the next two functions is
1540 + * either READ or WRITE, ie. we don't take READA.  Any
1541 + * regions with a zero count field will be ignored.
1542 + */
1543 +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
1544 +              struct page *pages, unsigned int offset,
1545 +              unsigned int *error_bits);
1546 +
1547 +
1548 +/*
1549 + * Asynchronous IO.
1550 + *
1551 + * The 'where' array may be safely allocated on the stack since
1552 + * the function takes a copy.
1553 + */
1554 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
1555 +               struct page *pages, unsigned int offset,
1556 +               io_notify_fn fn, void *context);
1557 +
1558 +#endif
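As a usage illustration of the interface declared above (hypothetical helper; it assumes the caller already owns a page that has been locked and chained for dm-io, as dm-exception-store.c arranges), a synchronous read of a single region looks like this:

#include "dm-io.h"

static int read_region_sync(kdev_t dev, sector_t start, sector_t len,
                            struct page *page)
{
        struct io_region where;
        unsigned int error_bits;
        int r;

        /* reserve buffer heads for roughly one page of concurrent io */
        r = dm_io_get(1);
        if (r)
                return r;

        where.dev = dev;
        where.sector = start;
        where.count = len;      /* in sectors; must fit within the page */

        /* blocks until completion; error_bits gets one bit per region */
        r = dm_io_sync(1, &where, READ, page, 0, &error_bits);

        dm_io_put(1);
        return r;
}

chunk_io() in dm-exception-store.c is essentially this pattern with the page obtained from vmalloc_to_page(ps->area).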
1559 --- linux-2.4.21/drivers/md/dm-ioctl.c  Thu Jan  1 01:00:00 1970
1560 +++ linux/drivers/md/dm-ioctl.c Wed Aug 20 14:41:38 2003
1561 @@ -0,0 +1,1284 @@
1562 +/*
1563 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
1564 + *
1565 + * This file is released under the GPL.
1566 + */
1567 +
1568 +#include "dm.h"
1569 +
1570 +#include <linux/module.h>
1571 +#include <linux/vmalloc.h>
1572 +#include <linux/miscdevice.h>
1573 +#include <linux/dm-ioctl.h>
1574 +#include <linux/init.h>
1575 +#include <linux/wait.h>
1576 +#include <linux/blk.h>
1577 +#include <linux/slab.h>
1578 +
1579 +#include <asm/uaccess.h>
1580 +
1581 +#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
1582 +
1583 +/*-----------------------------------------------------------------
1584 + * The ioctl interface needs to be able to look up devices by
1585 + * name or uuid.
1586 + *---------------------------------------------------------------*/
1587 +struct hash_cell {
1588 +       struct list_head name_list;
1589 +       struct list_head uuid_list;
1590 +
1591 +       char *name;
1592 +       char *uuid;
1593 +       struct mapped_device *md;
1594 +       struct dm_table *new_map;
1595 +
1596 +       /* I hate devfs */
1597 +       devfs_handle_t devfs_entry;
1598 +};
1599 +
1600 +#define NUM_BUCKETS 64
1601 +#define MASK_BUCKETS (NUM_BUCKETS - 1)
1602 +static struct list_head _name_buckets[NUM_BUCKETS];
1603 +static struct list_head _uuid_buckets[NUM_BUCKETS];
1604 +
1605 +static devfs_handle_t _dev_dir;
1606 +void dm_hash_remove_all(void);
1607 +
1608 +/*
1609 + * Guards access to both hash tables.
1610 + */
1611 +static DECLARE_RWSEM(_hash_lock);
1612 +
1613 +static void init_buckets(struct list_head *buckets)
1614 +{
1615 +       unsigned int i;
1616 +
1617 +       for (i = 0; i < NUM_BUCKETS; i++)
1618 +               INIT_LIST_HEAD(buckets + i);
1619 +}
1620 +
1621 +int dm_hash_init(void)
1622 +{
1623 +       init_buckets(_name_buckets);
1624 +       init_buckets(_uuid_buckets);
1625 +       _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
1626 +       return 0;
1627 +}
1628 +
1629 +void dm_hash_exit(void)
1630 +{
1631 +       dm_hash_remove_all();
1632 +       devfs_unregister(_dev_dir);
1633 +}
1634 +
1635 +/*-----------------------------------------------------------------
1636 + * Hash function:
1637 + * We're not really concerned with the str hash function being
1638 + * fast since it's only used by the ioctl interface.
1639 + *---------------------------------------------------------------*/
1640 +static unsigned int hash_str(const char *str)
1641 +{
1642 +       const unsigned int hash_mult = 2654435387U;
1643 +       unsigned int h = 0;
1644 +
1645 +       while (*str)
1646 +               h = (h + (unsigned int) *str++) * hash_mult;
1647 +
1648 +       return h & MASK_BUCKETS;
1649 +}
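/*
 * Example of the bucket selection above (descriptive only): NUM_BUCKETS is
 * 64, so MASK_BUCKETS is 0x3f and the function keeps just the low six bits
 * of the multiplicative hash; every device name or uuid therefore lands in
 * one of the 64 list heads declared at the top of this file.
 */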
1650 +
1651 +/*-----------------------------------------------------------------
1652 + * Code for looking up a device by name
1653 + *---------------------------------------------------------------*/
1654 +static struct hash_cell *__get_name_cell(const char *str)
1655 +{
1656 +       struct list_head *tmp;
1657 +       struct hash_cell *hc;
1658 +       unsigned int h = hash_str(str);
1659 +
1660 +       list_for_each (tmp, _name_buckets + h) {
1661 +               hc = list_entry(tmp, struct hash_cell, name_list);
1662 +               if (!strcmp(hc->name, str))
1663 +                       return hc;
1664 +       }
1665 +
1666 +       return NULL;
1667 +}
1668 +
1669 +static struct hash_cell *__get_uuid_cell(const char *str)
1670 +{
1671 +       struct list_head *tmp;
1672 +       struct hash_cell *hc;
1673 +       unsigned int h = hash_str(str);
1674 +
1675 +       list_for_each (tmp, _uuid_buckets + h) {
1676 +               hc = list_entry(tmp, struct hash_cell, uuid_list);
1677 +               if (!strcmp(hc->uuid, str))
1678 +                       return hc;
1679 +       }
1680 +
1681 +       return NULL;
1682 +}
1683 +
1684 +/*-----------------------------------------------------------------
1685 + * Inserting, removing and renaming a device.
1686 + *---------------------------------------------------------------*/
1687 +static inline char *kstrdup(const char *str)
1688 +{
1689 +       char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
1690 +       if (r)
1691 +               strcpy(r, str);
1692 +       return r;
1693 +}
1694 +
1695 +static struct hash_cell *alloc_cell(const char *name, const char *uuid,
1696 +                                   struct mapped_device *md)
1697 +{
1698 +       struct hash_cell *hc;
1699 +
1700 +       hc = kmalloc(sizeof(*hc), GFP_KERNEL);
1701 +       if (!hc)
1702 +               return NULL;
1703 +
1704 +       hc->name = kstrdup(name);
1705 +       if (!hc->name) {
1706 +               kfree(hc);
1707 +               return NULL;
1708 +       }
1709 +
1710 +       if (!uuid)
1711 +               hc->uuid = NULL;
1712 +
1713 +       else {
1714 +               hc->uuid = kstrdup(uuid);
1715 +               if (!hc->uuid) {
1716 +                       kfree(hc->name);
1717 +                       kfree(hc);
1718 +                       return NULL;
1719 +               }
1720 +       }
1721 +
1722 +       INIT_LIST_HEAD(&hc->name_list);
1723 +       INIT_LIST_HEAD(&hc->uuid_list);
1724 +       hc->md = md;
1725 +       hc->new_map = NULL;
1726 +       return hc;
1727 +}
1728 +
1729 +static void free_cell(struct hash_cell *hc)
1730 +{
1731 +       if (hc) {
1732 +               kfree(hc->name);
1733 +               kfree(hc->uuid);
1734 +               kfree(hc);
1735 +       }
1736 +}
1737 +
1738 +/*
1739 + * devfs stuff.
1740 + */
1741 +static int register_with_devfs(struct hash_cell *hc)
1742 +{
1743 +       kdev_t dev = dm_kdev(hc->md);
1744 +
1745 +       hc->devfs_entry =
1746 +           devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
1747 +                          major(dev), minor(dev),
1748 +                          S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
1749 +                          &dm_blk_dops, NULL);
1750 +
1751 +       return 0;
1752 +}
1753 +
1754 +static int unregister_with_devfs(struct hash_cell *hc)
1755 +{
1756 +       devfs_unregister(hc->devfs_entry);
1757 +       return 0;
1758 +}
1759 +
1760 +/*
1761 + * The kdev_t and uuid of a device can never change once it is
1762 + * initially inserted.
1763 + */
1764 +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
1765 +{
1766 +       struct hash_cell *cell;
1767 +
1768 +       /*
1769 +        * Allocate the new cells.
1770 +        */
1771 +       cell = alloc_cell(name, uuid, md);
1772 +       if (!cell)
1773 +               return -ENOMEM;
1774 +
1775 +       /*
1776 +        * Insert the cell into both hash tables.
1777 +        */
1778 +       down_write(&_hash_lock);
1779 +       if (__get_name_cell(name))
1780 +               goto bad;
1781 +
1782 +       list_add(&cell->name_list, _name_buckets + hash_str(name));
1783 +
1784 +       if (uuid) {
1785 +               if (__get_uuid_cell(uuid)) {
1786 +                       list_del(&cell->name_list);
1787 +                       goto bad;
1788 +               }
1789 +               list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
1790 +       }
1791 +       register_with_devfs(cell);
1792 +       dm_get(md);
1793 +       up_write(&_hash_lock);
1794 +
1795 +       return 0;
1796 +
1797 +      bad:
1798 +       up_write(&_hash_lock);
1799 +       free_cell(cell);
1800 +       return -EBUSY;
1801 +}
1802 +
1803 +void __hash_remove(struct hash_cell *hc)
1804 +{
1805 +       /* remove from the dev hash */
1806 +       list_del(&hc->uuid_list);
1807 +       list_del(&hc->name_list);
1808 +       unregister_with_devfs(hc);
1809 +       dm_put(hc->md);
1810 +       if (hc->new_map)
1811 +               dm_table_put(hc->new_map);
1812 +       free_cell(hc);
1813 +}
1814 +
1815 +void dm_hash_remove_all(void)
1816 +{
1817 +       int i;
1818 +       struct hash_cell *hc;
1819 +       struct list_head *tmp, *n;
1820 +
1821 +       down_write(&_hash_lock);
1822 +       for (i = 0; i < NUM_BUCKETS; i++) {
1823 +               list_for_each_safe (tmp, n, _name_buckets + i) {
1824 +                       hc = list_entry(tmp, struct hash_cell, name_list);
1825 +                       __hash_remove(hc);
1826 +               }
1827 +       }
1828 +       up_write(&_hash_lock);
1829 +}
1830 +
1831 +int dm_hash_rename(const char *old, const char *new)
1832 +{
1833 +       char *new_name, *old_name;
1834 +       struct hash_cell *hc;
1835 +
1836 +       /*
1837 +        * duplicate new.
1838 +        */
1839 +       new_name = kstrdup(new);
1840 +       if (!new_name)
1841 +               return -ENOMEM;
1842 +
1843 +       down_write(&_hash_lock);
1844 +
1845 +       /*
1846 +        * Is new free ?
1847 +        */
1848 +       hc = __get_name_cell(new);
1849 +       if (hc) {
1850 +               DMWARN("asked to rename to an already existing name %s -> %s",
1851 +                      old, new);
1852 +               up_write(&_hash_lock);
1853 +               kfree(new_name);
1854 +               return -EBUSY;
1855 +       }
1856 +
1857 +       /*
1858 +        * Is there such a device as 'old' ?
1859 +        */
1860 +       hc = __get_name_cell(old);
1861 +       if (!hc) {
1862 +               DMWARN("asked to rename a non-existent device %s -> %s",
1863 +                      old, new);
1864 +               up_write(&_hash_lock);
1865 +               kfree(new_name);
1866 +               return -ENXIO;
1867 +       }
1868 +
1869 +       /*
1870 +        * rename and move the name cell.
1871 +        */
1872 +       list_del(&hc->name_list);
1873 +       old_name = hc->name;
1874 +       hc->name = new_name;
1875 +       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
1876 +
1877 +       /* rename the device node in devfs */
1878 +       unregister_with_devfs(hc);
1879 +       register_with_devfs(hc);
1880 +
1881 +       up_write(&_hash_lock);
1882 +       kfree(old_name);
1883 +       return 0;
1884 +}
1885 +
1886 +/*-----------------------------------------------------------------
1887 + * Implementation of the ioctl commands
1888 + *---------------------------------------------------------------*/
1889 +/*
1890 + * All the ioctl commands get dispatched to functions with this
1891 + * prototype.
1892 + */
1893 +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
1894 +
1895 +static int remove_all(struct dm_ioctl *param, size_t param_size)
1896 +{
1897 +       dm_hash_remove_all();
1898 +       param->data_size = 0;
1899 +       return 0;
1900 +}
1901 +
1902 +/*
1903 + * Round up the ptr to an 8-byte boundary.
1904 + */
1905 +#define ALIGN_MASK 7
1906 +static inline void *align_ptr(void *ptr)
1907 +{
1908 +       return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
1909 +}
1910 +
1911 +/*
1912 + * Retrieves the data payload buffer from an already allocated
1913 + * struct dm_ioctl.
1914 + */
1915 +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
1916 +                              size_t *len)
1917 +{
1918 +       param->data_start = align_ptr(param + 1) - (void *) param;
1919 +
1920 +       if (param->data_start < param_size)
1921 +               *len = param_size - param->data_start;
1922 +       else
1923 +               *len = 0;
1924 +
1925 +       return ((void *) param) + param->data_start;
1926 +}
1927 +
1928 +static int list_devices(struct dm_ioctl *param, size_t param_size)
1929 +{
1930 +       unsigned int i;
1931 +       struct hash_cell *hc;
1932 +       size_t len, needed = 0;
1933 +       struct dm_name_list *nl, *old_nl = NULL;
1934 +
1935 +       down_write(&_hash_lock);
1936 +
1937 +       /*
1938 +        * Loop through all the devices working out how much
1939 +        * space we need.
1940 +        */
1941 +       for (i = 0; i < NUM_BUCKETS; i++) {
1942 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
1943 +                       needed += sizeof(struct dm_name_list);
1944 +                       needed += strlen(hc->name);
1945 +                       needed += ALIGN_MASK;
1946 +               }
1947 +       }
1948 +
1949 +       /*
1950 +        * Grab our output buffer.
1951 +        */
1952 +       nl = get_result_buffer(param, param_size, &len);
1953 +       if (len < needed) {
1954 +               param->flags |= DM_BUFFER_FULL_FLAG;
1955 +               goto out;
1956 +       }
1957 +       param->data_size = param->data_start + needed;
1958 +
1959 +       nl->dev = 0;    /* Flags no data */
1960 +
1961 +       /*
1962 +        * Now loop through filling out the names.
1963 +        */
1964 +       for (i = 0; i < NUM_BUCKETS; i++) {
1965 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
1966 +                       if (old_nl)
1967 +                               old_nl->next = (uint32_t) ((void *) nl -
1968 +                                                          (void *) old_nl);
1969 +
1970 +                       nl->dev = dm_kdev(hc->md);
1971 +                       nl->next = 0;
1972 +                       strcpy(nl->name, hc->name);
1973 +
1974 +                       old_nl = nl;
1975 +                       nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
1976 +               }
1977 +       }
1978 +
1979 + out:
1980 +       up_write(&_hash_lock);
1981 +       return 0;
1982 +}
1983 +
1984 +static int check_name(const char *name)
1985 +{
1986 +       if (strchr(name, '/')) {
1987 +               DMWARN("invalid device name");
1988 +               return -EINVAL;
1989 +       }
1990 +
1991 +       return 0;
1992 +}
1993 +
1994 +/*
1995 + * Fills in a dm_ioctl structure, ready for sending back to
1996 + * userland.
1997 + */
1998 +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
1999 +{
2000 +       kdev_t dev = dm_kdev(md);
2001 +       struct dm_table *table;
2002 +       struct block_device *bdev;
2003 +
2004 +       param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
2005 +                         DM_ACTIVE_PRESENT_FLAG);
2006 +
2007 +       if (dm_suspended(md))
2008 +               param->flags |= DM_SUSPEND_FLAG;
2009 +
2010 +       param->dev = kdev_t_to_nr(dev);
2011 +
2012 +       if (is_read_only(dev))
2013 +               param->flags |= DM_READONLY_FLAG;
2014 +
2015 +       param->event_nr = dm_get_event_nr(md);
2016 +
2017 +       table = dm_get_table(md);
2018 +       if (table) {
2019 +               param->flags |= DM_ACTIVE_PRESENT_FLAG;
2020 +               param->target_count = dm_table_get_num_targets(table);
2021 +               dm_table_put(table);
2022 +       } else
2023 +               param->target_count = 0;
2024 +
2025 +       bdev = bdget(param->dev);
2026 +       if (!bdev)
2027 +               return -ENXIO;
2028 +       param->open_count = bdev->bd_openers;
2029 +       bdput(bdev);
2030 +
2031 +       return 0;
2032 +}
2033 +
2034 +static int dev_create(struct dm_ioctl *param, size_t param_size)
2035 +{
2036 +       int r;
2037 +       kdev_t dev = 0;
2038 +       struct mapped_device *md;
2039 +
2040 +       r = check_name(param->name);
2041 +       if (r)
2042 +               return r;
2043 +
2044 +       if (param->flags & DM_PERSISTENT_DEV_FLAG)
2045 +               dev = to_kdev_t(param->dev);
2046 +
2047 +       r = dm_create(dev, &md);
2048 +       if (r)
2049 +               return r;
2050 +
2051 +       r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
2052 +       if (r) {
2053 +               dm_put(md);
2054 +               return r;
2055 +       }
2056 +
2057 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2058 +
2059 +       r = __dev_status(md, param);
2060 +       dm_put(md);
2061 +
2062 +       return r;
2063 +}
2064 +
2065 +/*
2066 + * Always use UUID for lookups if it's present, otherwise use name.
2067 + */
2068 +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
2069 +{
2070 +       return *param->uuid ?
2071 +           __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
2072 +}
2073 +
2074 +static inline struct mapped_device *find_device(struct dm_ioctl *param)
2075 +{
2076 +       struct hash_cell *hc;
2077 +       struct mapped_device *md = NULL;
2078 +
2079 +       down_read(&_hash_lock);
2080 +       hc = __find_device_hash_cell(param);
2081 +       if (hc) {
2082 +               md = hc->md;
2083 +
2084 +               /*
2085 +                * Sneakily write in both the name and the uuid
2086 +                * while we have the cell.
2087 +                */
2088 +               strncpy(param->name, hc->name, sizeof(param->name));
2089 +               if (hc->uuid)
2090 +                       strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
2091 +               else
2092 +                       param->uuid[0] = '\0';
2093 +
2094 +               if (hc->new_map)
2095 +                       param->flags |= DM_INACTIVE_PRESENT_FLAG;
2096 +               else
2097 +                       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2098 +
2099 +               dm_get(md);
2100 +       }
2101 +       up_read(&_hash_lock);
2102 +
2103 +       return md;
2104 +}
2105 +
2106 +static int dev_remove(struct dm_ioctl *param, size_t param_size)
2107 +{
2108 +       struct hash_cell *hc;
2109 +
2110 +       down_write(&_hash_lock);
2111 +       hc = __find_device_hash_cell(param);
2112 +
2113 +       if (!hc) {
2114 +               DMWARN("device doesn't appear to be in the dev hash table.");
2115 +               up_write(&_hash_lock);
2116 +               return -ENXIO;
2117 +       }
2118 +
2119 +       __hash_remove(hc);
2120 +       up_write(&_hash_lock);
2121 +       param->data_size = 0;
2122 +       return 0;
2123 +}
2124 +
2125 +/*
2126 + * Check a string doesn't overrun the chunk of
2127 + * memory we copied from userland.
2128 + */
2129 +static int invalid_str(char *str, void *end)
2130 +{
2131 +       while ((void *) str < end)
2132 +               if (!*str++)
2133 +                       return 0;
2134 +
2135 +       return -EINVAL;
2136 +}
2137 +
2138 +static int dev_rename(struct dm_ioctl *param, size_t param_size)
2139 +{
2140 +       int r;
2141 +       char *new_name = (char *) param + param->data_start;
2142 +
2143 +       if (new_name < (char *) (param + 1) ||
2144 +           invalid_str(new_name, (void *) param + param_size)) {
2145 +               DMWARN("Invalid new logical volume name supplied.");
2146 +               return -EINVAL;
2147 +       }
2148 +
2149 +       r = check_name(new_name);
2150 +       if (r)
2151 +               return r;
2152 +
2153 +       param->data_size = 0;
2154 +       return dm_hash_rename(param->name, new_name);
2155 +}
2156 +
2157 +static int do_suspend(struct dm_ioctl *param)
2158 +{
2159 +       int r = 0;
2160 +       struct mapped_device *md;
2161 +
2162 +       md = find_device(param);
2163 +       if (!md)
2164 +               return -ENXIO;
2165 +
2166 +       if (!dm_suspended(md))
2167 +               r = dm_suspend(md);
2168 +
2169 +       if (!r)
2170 +               r = __dev_status(md, param);
2171 +
2172 +       dm_put(md);
2173 +       return r;
2174 +}
2175 +
2176 +static int do_resume(struct dm_ioctl *param)
2177 +{
2178 +       int r = 0;
2179 +       struct hash_cell *hc;
2180 +       struct mapped_device *md;
2181 +       struct dm_table *new_map;
2182 +
2183 +       down_write(&_hash_lock);
2184 +
2185 +       hc = __find_device_hash_cell(param);
2186 +       if (!hc) {
2187 +               DMWARN("device doesn't appear to be in the dev hash table.");
2188 +               up_write(&_hash_lock);
2189 +               return -ENXIO;
2190 +       }
2191 +
2192 +       md = hc->md;
2193 +       dm_get(md);
2194 +
2195 +       new_map = hc->new_map;
2196 +       hc->new_map = NULL;
2197 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2198 +
2199 +       up_write(&_hash_lock);
2200 +
2201 +       /* Do we need to load a new map ? */
2202 +       if (new_map) {
2203 +               /* Suspend if it isn't already suspended */
2204 +               if (!dm_suspended(md))
2205 +                       dm_suspend(md);
2206 +
2207 +               r = dm_swap_table(md, new_map);
2208 +               if (r) {
2209 +                       dm_put(md);
2210 +                       dm_table_put(new_map);
2211 +                       return r;
2212 +               }
2213 +
2214 +               if (dm_table_get_mode(new_map) & FMODE_WRITE)
2215 +                       set_device_ro(dm_kdev(md), 0);
2216 +               else
2217 +                       set_device_ro(dm_kdev(md), 1);
2218 +
2219 +               dm_table_put(new_map);
2220 +       }
2221 +
2222 +       if (dm_suspended(md))
2223 +               r = dm_resume(md);
2224 +
2225 +       if (!r)
2226 +               r = __dev_status(md, param);
2227 +
2228 +       dm_put(md);
2229 +       return r;
2230 +}
2231 +
2232 +/*
2233 + * Set or unset the suspension state of a device.
2234 + * If the device already is in the requested state we just return its status.
2235 + */
2236 +static int dev_suspend(struct dm_ioctl *param, size_t param_size)
2237 +{
2238 +       if (param->flags & DM_SUSPEND_FLAG)
2239 +               return do_suspend(param);
2240 +
2241 +       return do_resume(param);
2242 +}
2243 +
2244 +/*
2245 + * Copies device info back to user space, used by
2246 + * the create and info ioctls.
2247 + */
2248 +static int dev_status(struct dm_ioctl *param, size_t param_size)
2249 +{
2250 +       int r;
2251 +       struct mapped_device *md;
2252 +
2253 +       md = find_device(param);
2254 +       if (!md)
2255 +               return -ENXIO;
2256 +
2257 +       r = __dev_status(md, param);
2258 +       dm_put(md);
2259 +       return r;
2260 +}
2261 +
2262 +static inline int get_mode(struct dm_ioctl *param)
2263 +{
2264 +       int mode = FMODE_READ | FMODE_WRITE;
2265 +
2266 +       if (param->flags & DM_READONLY_FLAG)
2267 +               mode = FMODE_READ;
2268 +
2269 +       return mode;
2270 +}
2271 +
2272 +static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
2273 +                      struct dm_target_spec **spec, char **target_params)
2274 +{
2275 +       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
2276 +       *target_params = (char *) (*spec + 1);
2277 +
2278 +       if (*spec < (last + 1))
2279 +               return -EINVAL;
2280 +
2281 +       return invalid_str(*target_params, end);
2282 +}
2283 +
2284 +static int populate_table(struct dm_table *table, struct dm_ioctl *param,
2285 +                         size_t param_size)
2286 +{
2287 +       int r;
2288 +       unsigned int i = 0;
2289 +       struct dm_target_spec *spec = (struct dm_target_spec *) param;
2290 +       uint32_t next = param->data_start;
2291 +       void *end = (void *) param + param_size;
2292 +       char *target_params;
2293 +
2294 +       if (!param->target_count) {
2295 +               DMWARN("populate_table: no targets specified");
2296 +               return -EINVAL;
2297 +       }
2298 +
2299 +       for (i = 0; i < param->target_count; i++) {
2300 +
2301 +               r = next_target(spec, next, end, &spec, &target_params);
2302 +               if (r) {
2303 +                       DMWARN("unable to find target");
2304 +                       return r;
2305 +               }
2306 +
2307 +               r = dm_table_add_target(table, spec->target_type,
2308 +                                       (sector_t) spec->sector_start,
2309 +                                       (sector_t) spec->length,
2310 +                                       target_params);
2311 +               if (r) {
2312 +                       DMWARN("error adding target to table");
2313 +                       return r;
2314 +               }
2315 +
2316 +               next = spec->next;
2317 +       }
2318 +
2319 +       return dm_table_complete(table);
2320 +}
2321 +
2322 +static int table_load(struct dm_ioctl *param, size_t param_size)
2323 +{
2324 +       int r;
2325 +       struct hash_cell *hc;
2326 +       struct dm_table *t;
2327 +
2328 +       r = dm_table_create(&t, get_mode(param));
2329 +       if (r)
2330 +               return r;
2331 +
2332 +       r = populate_table(t, param, param_size);
2333 +       if (r) {
2334 +               dm_table_put(t);
2335 +               return r;
2336 +       }
2337 +
2338 +       down_write(&_hash_lock);
2339 +       hc = __find_device_hash_cell(param);
2340 +       if (!hc) {
2341 +               DMWARN("device doesn't appear to be in the dev hash table.");
2342 +               up_write(&_hash_lock);
2343 +               return -ENXIO;
2344 +       }
2345 +
2346 +       if (hc->new_map)
2347 +               dm_table_put(hc->new_map);
2348 +       hc->new_map = t;
2349 +       param->flags |= DM_INACTIVE_PRESENT_FLAG;
2350 +
2351 +       r = __dev_status(hc->md, param);
2352 +       up_write(&_hash_lock);
2353 +       return r;
2354 +}
2355 +
2356 +static int table_clear(struct dm_ioctl *param, size_t param_size)
2357 +{
2358 +       int r;
2359 +       struct hash_cell *hc;
2360 +
2361 +       down_write(&_hash_lock);
2362 +
2363 +       hc = __find_device_hash_cell(param);
2364 +       if (!hc) {
2365 +               DMWARN("device doesn't appear to be in the dev hash table.");
2366 +               up_write(&_hash_lock);
2367 +               return -ENXIO;
2368 +       }
2369 +
2370 +       if (hc->new_map) {
2371 +               dm_table_put(hc->new_map);
2372 +               hc->new_map = NULL;
2373 +       }
2374 +
2375 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
2376 +
2377 +       r = __dev_status(hc->md, param);
2378 +       up_write(&_hash_lock);
2379 +       return r;
2380 +}
2381 +
2382 +/*
2383 + * Retrieves a list of devices used by a particular dm device.
2384 + */
2385 +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
2386 +                         size_t param_size)
2387 +{
2388 +       unsigned int count = 0;
2389 +       struct list_head *tmp;
2390 +       size_t len, needed;
2391 +       struct dm_target_deps *deps;
2392 +
2393 +       deps = get_result_buffer(param, param_size, &len);
2394 +
2395 +       /*
2396 +        * Count the devices.
2397 +        */
2398 +       list_for_each(tmp, dm_table_get_devices(table))
2399 +               count++;
2400 +
2401 +       /*
2402 +        * Check we have enough space.
2403 +        */
2404 +       needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
2405 +       if (len < needed) {
2406 +               param->flags |= DM_BUFFER_FULL_FLAG;
2407 +               return;
2408 +       }
2409 +
2410 +       /*
2411 +        * Fill in the devices.
2412 +        */
2413 +       deps->count = count;
2414 +       count = 0;
2415 +       list_for_each(tmp, dm_table_get_devices(table)) {
2416 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
2417 +               deps->dev[count++] = dd->bdev->bd_dev;
2418 +       }
2419 +
2420 +       param->data_size = param->data_start + needed;
2421 +}
2422 +
2423 +static int table_deps(struct dm_ioctl *param, size_t param_size)
2424 +{
2425 +       int r;
2426 +       struct mapped_device *md;
2427 +       struct dm_table *table;
2428 +
2429 +       md = find_device(param);
2430 +       if (!md)
2431 +               return -ENXIO;
2432 +
2433 +       r = __dev_status(md, param);
2434 +       if (r)
2435 +               goto out;
2436 +
2437 +       table = dm_get_table(md);
2438 +       if (table) {
2439 +               retrieve_deps(table, param, param_size);
2440 +               dm_table_put(table);
2441 +       }
2442 +
2443 + out:
2444 +       dm_put(md);
2445 +       return r;
2446 +}
2447 +
2448 +/*
2449 + * Build up the status struct for each target
2450 + */
2451 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
2452 +                           size_t param_size)
2453 +{
2454 +       unsigned int i, num_targets;
2455 +       struct dm_target_spec *spec;
2456 +       char *outbuf, *outptr;
2457 +       status_type_t type;
2458 +       size_t remaining, len, used = 0;
2459 +
2460 +       outptr = outbuf = get_result_buffer(param, param_size, &len);
2461 +
2462 +       if (param->flags & DM_STATUS_TABLE_FLAG)
2463 +               type = STATUSTYPE_TABLE;
2464 +       else
2465 +               type = STATUSTYPE_INFO;
2466 +
2467 +       /* Get all the target info */
2468 +       num_targets = dm_table_get_num_targets(table);
2469 +       for (i = 0; i < num_targets; i++) {
2470 +               struct dm_target *ti = dm_table_get_target(table, i);
2471 +
2472 +               remaining = len - (outptr - outbuf);
2473 +               if (remaining < sizeof(struct dm_target_spec)) {
2474 +                       param->flags |= DM_BUFFER_FULL_FLAG;
2475 +                       break;
2476 +               }
2477 +
2478 +               spec = (struct dm_target_spec *) outptr;
2479 +
2480 +               spec->status = 0;
2481 +               spec->sector_start = ti->begin;
2482 +               spec->length = ti->len;
2483 +               strncpy(spec->target_type, ti->type->name,
2484 +                       sizeof(spec->target_type));
2485 +
2486 +               outptr += sizeof(struct dm_target_spec);
2487 +               remaining = len - (outptr - outbuf);
2488 +
2489 +               /* Get the status/table string from the target driver */
2490 +               if (ti->type->status) {
2491 +                       if (ti->type->status(ti, type, outptr, remaining)) {
2492 +                               param->flags |= DM_BUFFER_FULL_FLAG;
2493 +                               break;
2494 +                       }
2495 +               } else
2496 +                       outptr[0] = '\0';
2497 +
2498 +               outptr += strlen(outptr) + 1;
2499 +               used = param->data_start + (outptr - outbuf);
2500 +
2501 +               outptr = align_ptr(outptr);
2502 +               spec->next = outptr - outbuf;
2503 +       }
2504 +
2505 +       if (used)
2506 +               param->data_size = used;
2507 +
2508 +       param->target_count = num_targets;
2509 +}
2510 +
2511 +/*
2512 + * Return the status of a device as a text string for each
2513 + * target.
2514 + */
2515 +static int table_status(struct dm_ioctl *param, size_t param_size)
2516 +{
2517 +       int r;
2518 +       struct mapped_device *md;
2519 +       struct dm_table *table;
2520 +
2521 +       md = find_device(param);
2522 +       if (!md)
2523 +               return -ENXIO;
2524 +
2525 +       r = __dev_status(md, param);
2526 +       if (r)
2527 +               goto out;
2528 +
2529 +       table = dm_get_table(md);
2530 +       if (table) {
2531 +               retrieve_status(table, param, param_size);
2532 +               dm_table_put(table);
2533 +       }
2534 +
2535 + out:
2536 +       dm_put(md);
2537 +       return r;
2538 +}
2539 +
2540 +/*
2541 + * Wait for a device to report an event
2542 + */
2543 +static int dev_wait(struct dm_ioctl *param, size_t param_size)
2544 +{
2545 +       int r;
2546 +       struct mapped_device *md;
2547 +       struct dm_table *table;
2548 +       DECLARE_WAITQUEUE(wq, current);
2549 +
2550 +       md = find_device(param);
2551 +       if (!md)
2552 +               return -ENXIO;
2553 +
2554 +       /*
2555 +        * Wait for a notification event
2556 +        */
2557 +       set_current_state(TASK_INTERRUPTIBLE);
2558 +       if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
2559 +               schedule();
2560 +               dm_remove_wait_queue(md, &wq);
2561 +       }
2562 +       set_current_state(TASK_RUNNING);
2563 +
2564 +       /*
2565 +        * The userland program is going to want to know what
2566 +        * changed to trigger the event, so we may as well tell
2567 +        * him and save an ioctl.
2568 +        */
2569 +       r = __dev_status(md, param);
2570 +       if (r)
2571 +               goto out;
2572 +
2573 +       table = dm_get_table(md);
2574 +       if (table) {
2575 +               retrieve_status(table, param, param_size);
2576 +               dm_table_put(table);
2577 +       }
2578 +
2579 + out:
2580 +       dm_put(md);
2581 +       return r;
2582 +}
2583 +
2584 +/*-----------------------------------------------------------------
2585 + * Implementation of open/close/ioctl on the special char
2586 + * device.
2587 + *---------------------------------------------------------------*/
2588 +static ioctl_fn lookup_ioctl(unsigned int cmd)
2589 +{
2590 +       static struct {
2591 +               int cmd;
2592 +               ioctl_fn fn;
2593 +       } _ioctls[] = {
2594 +               {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
2595 +               {DM_REMOVE_ALL_CMD, remove_all},
2596 +               {DM_LIST_DEVICES_CMD, list_devices},
2597 +
2598 +               {DM_DEV_CREATE_CMD, dev_create},
2599 +               {DM_DEV_REMOVE_CMD, dev_remove},
2600 +               {DM_DEV_RENAME_CMD, dev_rename},
2601 +               {DM_DEV_SUSPEND_CMD, dev_suspend},
2602 +               {DM_DEV_STATUS_CMD, dev_status},
2603 +               {DM_DEV_WAIT_CMD, dev_wait},
2604 +
2605 +               {DM_TABLE_LOAD_CMD, table_load},
2606 +               {DM_TABLE_CLEAR_CMD, table_clear},
2607 +               {DM_TABLE_DEPS_CMD, table_deps},
2608 +               {DM_TABLE_STATUS_CMD, table_status}
2609 +       };
2610 +
2611 +       return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
2612 +}
2613 +
2614 +/*
2615 + * As well as checking the version compatibility this always
2616 + * copies the kernel interface version out.
2617 + */
2618 +static int check_version(unsigned int cmd, struct dm_ioctl *user)
2619 +{
2620 +       uint32_t version[3];
2621 +       int r = 0;
2622 +
2623 +       if (copy_from_user(version, user->version, sizeof(version)))
2624 +               return -EFAULT;
2625 +
2626 +       if ((DM_VERSION_MAJOR != version[0]) ||
2627 +           (DM_VERSION_MINOR < version[1])) {
2628 +               DMWARN("ioctl interface mismatch: "
2629 +                      "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
2630 +                      DM_VERSION_MAJOR, DM_VERSION_MINOR,
2631 +                      DM_VERSION_PATCHLEVEL,
2632 +                      version[0], version[1], version[2], cmd);
2633 +               r = -EINVAL;
2634 +       }
2635 +
2636 +       /*
2637 +        * Fill in the kernel version.
2638 +        */
2639 +       version[0] = DM_VERSION_MAJOR;
2640 +       version[1] = DM_VERSION_MINOR;
2641 +       version[2] = DM_VERSION_PATCHLEVEL;
2642 +       if (copy_to_user(user->version, version, sizeof(version)))
2643 +               return -EFAULT;
2644 +
2645 +       return r;
2646 +}
2647 +
2648 +static void free_params(struct dm_ioctl *param)
2649 +{
2650 +       vfree(param);
2651 +}
2652 +
2653 +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
2654 +{
2655 +       struct dm_ioctl tmp, *dmi;
2656 +
2657 +       if (copy_from_user(&tmp, user, sizeof(tmp)))
2658 +               return -EFAULT;
2659 +
2660 +       if (tmp.data_size < sizeof(tmp))
2661 +               return -EINVAL;
2662 +
2663 +       dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
2664 +       if (!dmi)
2665 +               return -ENOMEM;
2666 +
2667 +       if (copy_from_user(dmi, user, tmp.data_size)) {
2668 +               vfree(dmi);
2669 +               return -EFAULT;
2670 +       }
2671 +
2672 +       *param = dmi;
2673 +       return 0;
2674 +}
2675 +
2676 +static int validate_params(uint cmd, struct dm_ioctl *param)
2677 +{
2678 +       /* Always clear this flag */
2679 +       param->flags &= ~DM_BUFFER_FULL_FLAG;
2680 +
2681 +       /* Ignores parameters */
2682 +       if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
2683 +               return 0;
2684 +
2685 +       /* Unless creating, either name or uuid but not both */
2686 +       if (cmd != DM_DEV_CREATE_CMD) {
2687 +               if ((!*param->uuid && !*param->name) ||
2688 +                   (*param->uuid && *param->name)) {
2689 +                       DMWARN("one of name or uuid must be supplied, cmd(%u)",
2690 +                              cmd);
2691 +                       return -EINVAL;
2692 +               }
2693 +       }
2694 +
2695 +       /* Ensure strings are terminated */
2696 +       param->name[DM_NAME_LEN - 1] = '\0';
2697 +       param->uuid[DM_UUID_LEN - 1] = '\0';
2698 +
2699 +       return 0;
2700 +}
2701 +
2702 +static int ctl_ioctl(struct inode *inode, struct file *file,
2703 +                    uint command, ulong u)
2704 +{
2705 +       int r = 0;
2706 +       unsigned int cmd;
2707 +       struct dm_ioctl *param;
2708 +       struct dm_ioctl *user = (struct dm_ioctl *) u;
2709 +       ioctl_fn fn = NULL;
2710 +       size_t param_size;
2711 +
2712 +       /* only root can play with this */
2713 +       if (!capable(CAP_SYS_ADMIN))
2714 +               return -EACCES;
2715 +
2716 +       if (_IOC_TYPE(command) != DM_IOCTL)
2717 +               return -ENOTTY;
2718 +
2719 +       cmd = _IOC_NR(command);
2720 +
2721 +       /*
2722 +        * Check the interface version passed in.  This also
2723 +        * writes out the kernel's interface version.
2724 +        */
2725 +       r = check_version(cmd, user);
2726 +       if (r)
2727 +               return r;
2728 +
2729 +       /*
2730 +        * Nothing more to do for the version command.
2731 +        */
2732 +       if (cmd == DM_VERSION_CMD)
2733 +               return 0;
2734 +
2735 +       fn = lookup_ioctl(cmd);
2736 +       if (!fn) {
2737 +               DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
2738 +               return -ENOTTY;
2739 +       }
2740 +
2741 +       /*
2742 +        * FIXME: I don't like this, we're trying to avoid low
2743 +        * memory issues when a device is suspended.
2744 +        */
2745 +       current->flags |= PF_MEMALLOC;
2746 +
2747 +       /*
2748 +        * Copy the parameters into kernel space.
2749 +        */
2750 +       r = copy_params(user, &param);
2751 +       if (r) {
2752 +               current->flags &= ~PF_MEMALLOC;
2753 +               return r;
2754 +       }
2755 +
2756 +       r = validate_params(cmd, param);
2757 +       if (r)
2758 +               goto out;
2759 +
2760 +       param_size = param->data_size;
2761 +       param->data_size = sizeof(*param);
2762 +       r = fn(param, param_size);
2763 +
2764 +       /*
2765 +        * Copy the results back to userland.
2766 +        */
2767 +       if (!r && copy_to_user(user, param, param->data_size))
2768 +               r = -EFAULT;
2769 +
2770 + out:
2771 +       free_params(param);
2772 +       current->flags &= ~PF_MEMALLOC;
2773 +       return r;
2774 +}
2775 +
2776 +static struct file_operations _ctl_fops = {
2777 +       .ioctl   = ctl_ioctl,
2778 +       .owner   = THIS_MODULE,
2779 +};
2780 +
2781 +static devfs_handle_t _ctl_handle;
2782 +
2783 +static struct miscdevice _dm_misc = {
2784 +       .minor = MISC_DYNAMIC_MINOR,
2785 +       .name  = DM_NAME,
2786 +       .fops  = &_ctl_fops
2787 +};
2788 +
2789 +/*
2790 + * Create misc character device and link to DM_DIR/control.
2791 + */
2792 +int __init dm_interface_init(void)
2793 +{
2794 +       int r;
2795 +       char rname[64];
2796 +
2797 +       r = dm_hash_init();
2798 +       if (r)
2799 +               return r;
2800 +
2801 +       r = misc_register(&_dm_misc);
2802 +       if (r) {
2803 +               DMERR("misc_register failed for control device");
2804 +               dm_hash_exit();
2805 +               return r;
2806 +       }
2807 +
2808 +       r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
2809 +                               sizeof rname - 3);
2810 +       if (r == -ENOSYS)
2811 +               goto done;      /* devfs not present */
2812 +
2813 +       if (r < 0) {
2814 +               DMERR("devfs_generate_path failed for control device");
2815 +               goto failed;
2816 +       }
2817 +
2818 +       strncpy(rname + r, "../", 3);
2819 +       r = devfs_mk_symlink(NULL, DM_DIR "/control",
2820 +                            DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
2821 +       if (r) {
2822 +               DMERR("devfs_mk_symlink failed for control device");
2823 +               goto failed;
2824 +       }
2825 +       devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
2826 +
2827 +      done:
2828 +       DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
2829 +              DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
2830 +              DM_DRIVER_EMAIL);
2831 +       return 0;
2832 +
2833 +      failed:
2834 +       misc_deregister(&_dm_misc);
2835 +       dm_hash_exit();
2836 +       return r;
2837 +}
2838 +
2839 +void dm_interface_exit(void)
2840 +{
2841 +       if (misc_deregister(&_dm_misc) < 0)
2842 +               DMERR("misc_deregister failed for control device");
2843 +
2844 +       dm_hash_exit();
2845 +}
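
All of the commands above funnel through ctl_ioctl(): check_version(), copy_params() and validate_params() run first, then the per-command handler fills in the same struct dm_ioctl, which is copied back to the caller. For reference, a minimal user-space sketch of issuing DM_DEV_CREATE against this interface; it assumes the control node is reachable as /dev/mapper/control (the DM_DIR "/control" symlink made in dm_interface_init()) and that the <linux/dm-ioctl.h> header from this patch is installed. Real applications would normally go through libdevmapper rather than raw ioctls.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

/* Create an empty mapped device called "name"; returns 0 on success. */
static int dm_create_device(const char *name)
{
        struct dm_ioctl dmi;
        int fd, r = -1;

        fd = open("/dev/mapper/control", O_RDWR);
        if (fd < 0)
                return -1;

        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;      /* checked by check_version() */
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);            /* no extra payload */
        strncpy(dmi.name, name, sizeof(dmi.name) - 1);

        if (ioctl(fd, DM_DEV_CREATE, &dmi) == 0)
                r = 0;                          /* dmi.dev now holds the new device number */
        else
                perror("DM_DEV_CREATE");

        close(fd);
        return r;
}
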
2846 --- linux-2.4.21/drivers/md/dm-linear.c Thu Jan  1 01:00:00 1970
2847 +++ linux/drivers/md/dm-linear.c        Wed Aug 20 14:41:38 2003
2848 @@ -0,0 +1,123 @@
2849 +/*
2850 + * Copyright (C) 2001 Sistina Software (UK) Limited.
2851 + *
2852 + * This file is released under the GPL.
2853 + */
2854 +
2855 +#include "dm.h"
2856 +
2857 +#include <linux/module.h>
2858 +#include <linux/init.h>
2859 +#include <linux/blkdev.h>
2860 +#include <linux/slab.h>
2861 +
2862 +/*
2863 + * Linear: maps a linear range of a device.
2864 + */
2865 +struct linear_c {
2866 +       struct dm_dev *dev;
2867 +       sector_t start;
2868 +};
2869 +
2870 +/*
2871 + * Construct a linear mapping: <dev_path> <offset>
2872 + */
2873 +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2874 +{
2875 +       struct linear_c *lc;
2876 +
2877 +       if (argc != 2) {
2878 +               ti->error = "dm-linear: Invalid argument count";
2879 +               return -EINVAL;
2880 +       }
2881 +
2882 +       lc = kmalloc(sizeof(*lc), GFP_KERNEL);
2883 +       if (lc == NULL) {
2884 +               ti->error = "dm-linear: Cannot allocate linear context";
2885 +               return -ENOMEM;
2886 +       }
2887 +
2888 +       if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
2889 +               ti->error = "dm-linear: Invalid device sector";
2890 +               goto bad;
2891 +       }
2892 +
2893 +       if (dm_get_device(ti, argv[0], lc->start, ti->len,
2894 +                         dm_table_get_mode(ti->table), &lc->dev)) {
2895 +               ti->error = "dm-linear: Device lookup failed";
2896 +               goto bad;
2897 +       }
2898 +
2899 +       ti->private = lc;
2900 +       return 0;
2901 +
2902 +      bad:
2903 +       kfree(lc);
2904 +       return -EINVAL;
2905 +}
2906 +
2907 +static void linear_dtr(struct dm_target *ti)
2908 +{
2909 +       struct linear_c *lc = (struct linear_c *) ti->private;
2910 +
2911 +       dm_put_device(ti, lc->dev);
2912 +       kfree(lc);
2913 +}
2914 +
2915 +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
2916 +                     union map_info *map_context)
2917 +{
2918 +       struct linear_c *lc = (struct linear_c *) ti->private;
2919 +
2920 +       bh->b_rdev = lc->dev->dev;
2921 +       bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
2922 +
2923 +       return 1;
2924 +}
2925 +
2926 +static int linear_status(struct dm_target *ti, status_type_t type,
2927 +                        char *result, unsigned int maxlen)
2928 +{
2929 +       struct linear_c *lc = (struct linear_c *) ti->private;
2930 +       kdev_t kdev;
2931 +
2932 +       switch (type) {
2933 +       case STATUSTYPE_INFO:
2934 +               result[0] = '\0';
2935 +               break;
2936 +
2937 +       case STATUSTYPE_TABLE:
2938 +               kdev = to_kdev_t(lc->dev->bdev->bd_dev);
2939 +               snprintf(result, maxlen, "%s " SECTOR_FORMAT,
2940 +                        dm_kdevname(kdev), lc->start);
2941 +               break;
2942 +       }
2943 +       return 0;
2944 +}
2945 +
2946 +static struct target_type linear_target = {
2947 +       .name   = "linear",
2948 +       .module = THIS_MODULE,
2949 +       .ctr    = linear_ctr,
2950 +       .dtr    = linear_dtr,
2951 +       .map    = linear_map,
2952 +       .status = linear_status,
2953 +};
2954 +
2955 +int __init dm_linear_init(void)
2956 +{
2957 +       int r = dm_register_target(&linear_target);
2958 +
2959 +       if (r < 0)
2960 +               DMERR("linear: register failed %d", r);
2961 +
2962 +       return r;
2963 +}
2964 +
2965 +void dm_linear_exit(void)
2966 +{
2967 +       int r = dm_unregister_target(&linear_target);
2968 +
2969 +       if (r < 0)
2970 +               DMERR("linear: unregister failed %d", r);
2971 +}
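
As loaded through DM_TABLE_LOAD, a linear table line has the form <start> <length> linear <dev_path> <offset>, and linear_ctr() above receives the last two fields as argv. Purely to illustrate how such a target meets the table helpers used by populate_table() in dm-ioctl.c, here is a hedged in-kernel sketch; the device path and sector counts are invented, and real tables are built from the ioctl payload rather than by hand.

#include <linux/fs.h>
#include "dm.h"

/* Build a single-target table mapping 204800 sectors onto /dev/hda1. */
static int build_linear_table(struct dm_table **result)
{
        struct dm_table *t;
        char params[] = "/dev/hda1 0";  /* <dev_path> <offset>, as parsed by linear_ctr() */
        int r;

        r = dm_table_create(&t, FMODE_READ | FMODE_WRITE);
        if (r)
                return r;

        /* target type "linear", start sector 0, length 204800 sectors */
        r = dm_table_add_target(t, "linear", 0, 204800, params);
        if (!r)
                r = dm_table_complete(t);

        if (r) {
                dm_table_put(t);
                return r;
        }

        *result = t;
        return 0;
}
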
2972 --- linux-2.4.21/drivers/md/dm-log.c    Thu Jan  1 01:00:00 1970
2973 +++ linux/drivers/md/dm-log.c   Wed Aug 20 14:41:38 2003
2974 @@ -0,0 +1,302 @@
2975 +/*
2976 + * Copyright (C) 2003 Sistina Software
2977 + *
2978 + * This file is released under the LGPL.
2979 + */
2980 +
2981 +#include <linux/init.h>
2982 +#include <linux/slab.h>
2983 +#include <linux/module.h>
2984 +#include <linux/vmalloc.h>
2985 +
2986 +#include "dm-log.h"
2987 +#include "dm-io.h"
2988 +
2989 +static LIST_HEAD(_log_types);
2990 +static spinlock_t _lock = SPIN_LOCK_UNLOCKED;
2991 +
2992 +int dm_register_dirty_log_type(struct dirty_log_type *type)
2993 +{
2994 +       spin_lock(&_lock);
2995 +       type->use_count = 0;
2996 +       if (type->module)
2997 +               __MOD_INC_USE_COUNT(type->module);
2998 +
2999 +       list_add(&type->list, &_log_types);
3000 +       spin_unlock(&_lock);
3001 +
3002 +       return 0;
3003 +}
3004 +
3005 +int dm_unregister_dirty_log_type(struct dirty_log_type *type)
3006 +{
3007 +       spin_lock(&_lock);
3008 +
3009 +       if (type->use_count)
3010 +               DMWARN("Attempt to unregister a log type that is still in use");
3011 +       else {
3012 +               list_del(&type->list);
3013 +               if (type->module)
3014 +                       __MOD_DEC_USE_COUNT(type->module);
3015 +       }
3016 +
3017 +       spin_unlock(&_lock);
3018 +
3019 +       return 0;
3020 +}
3021 +
3022 +static struct dirty_log_type *get_type(const char *type_name)
3023 +{
3024 +       struct dirty_log_type *type;
3025 +       struct list_head *tmp;
3026 +
3027 +       spin_lock(&_lock);
3028 +       list_for_each (tmp, &_log_types) {
3029 +               type = list_entry(tmp, struct dirty_log_type, list);
3030 +               if (!strcmp(type_name, type->name)) {
3031 +                       type->use_count++;
3032 +                       spin_unlock(&_lock);
3033 +                       return type;
3034 +               }
3035 +       }
3036 +
3037 +       spin_unlock(&_lock);
3038 +       return NULL;
3039 +}
3040 +
3041 +static void put_type(struct dirty_log_type *type)
3042 +{
3043 +       spin_lock(&_lock);
3044 +       type->use_count--;
3045 +       spin_unlock(&_lock);
3046 +}
3047 +
3048 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3049 +                                     unsigned int argc, char **argv)
3050 +{
3051 +       struct dirty_log_type *type;
3052 +       struct dirty_log *log;
3053 +
3054 +       log = kmalloc(sizeof(*log), GFP_KERNEL);
3055 +       if (!log)
3056 +               return NULL;
3057 +
3058 +       type = get_type(type_name);
3059 +       if (!type) {
3060 +               kfree(log);
3061 +               return NULL;
3062 +       }
3063 +
3064 +       log->type = type;
3065 +       if (type->ctr(log, dev_size, argc, argv)) {
3066 +               kfree(log);
3067 +               put_type(type);
3068 +               return NULL;
3069 +       }
3070 +
3071 +       return log;
3072 +}
3073 +
3074 +void dm_destroy_dirty_log(struct dirty_log *log)
3075 +{
3076 +       log->type->dtr(log);
3077 +       put_type(log->type);
3078 +       kfree(log);
3079 +}
3080 +
3081 +
3082 +/*-----------------------------------------------------------------
3083 + * In-core log, i.e. trivial, non-persistent
3084 + *
3085 + * For now we'll keep this simple and just have 2 bitsets, one
3086 + * for clean/dirty, the other for sync/nosync.  The sync bitset
3087 + * will be freed when everything is in sync.
3088 + *
3089 + * FIXME: problems with a 64bit sector_t
3090 + *---------------------------------------------------------------*/
3091 +struct core_log {
3092 +       sector_t region_size;
3093 +       unsigned int region_count;
3094 +       unsigned long *clean_bits;
3095 +       unsigned long *sync_bits;
3096 +       unsigned long *recovering_bits; /* FIXME: this seems excessive */
3097 +
3098 +       int sync_search;
3099 +};
3100 +
3101 +static int core_ctr(struct dirty_log *log, sector_t dev_size,
3102 +                   unsigned int argc, char **argv)
3103 +{
3104 +       struct core_log *clog;
3105 +       sector_t region_size;
3106 +       unsigned int region_count;
3107 +       size_t bitset_size;
3108 +
3109 +       if (argc != 1) {
3110 +               DMWARN("wrong number of arguments to core_log");
3111 +               return -EINVAL;
3112 +       }
3113 +
3114 +       if (sscanf(argv[0], SECTOR_FORMAT, &region_size) != 1) {
3115 +               DMWARN("invalid region size string");
3116 +               return -EINVAL;
3117 +       }
3118 +
3119 +       region_count = dm_div_up(dev_size, region_size);
3120 +
3121 +       clog = kmalloc(sizeof(*clog), GFP_KERNEL);
3122 +       if (!clog) {
3123 +               DMWARN("couldn't allocate core log");
3124 +               return -ENOMEM;
3125 +       }
3126 +
3127 +       clog->region_size = region_size;
3128 +       clog->region_count = region_count;
3129 +
3130 +       bitset_size = dm_round_up(region_count >> 3, sizeof(*clog->clean_bits));
3131 +       clog->clean_bits = vmalloc(bitset_size);
3132 +       if (!clog->clean_bits) {
3133 +               DMWARN("couldn't allocate clean bitset");
3134 +               kfree(clog);
3135 +               return -ENOMEM;
3136 +       }
3137 +       memset(clog->clean_bits, -1, bitset_size);
3138 +
3139 +       clog->sync_bits = vmalloc(bitset_size);
3140 +       if (!clog->sync_bits) {
3141 +               DMWARN("couldn't allocate sync bitset");
3142 +               vfree(clog->clean_bits);
3143 +               kfree(clog);
3144 +               return -ENOMEM;
3145 +       }
3146 +       memset(clog->sync_bits, 0, bitset_size);
3147 +
3148 +       clog->recovering_bits = vmalloc(bitset_size);
3149 +       if (!clog->recovering_bits) {
3150 +               DMWARN("couldn't allocate recovering bitset");
3151 +               vfree(clog->sync_bits);
3152 +               vfree(clog->clean_bits);
3153 +               kfree(clog);
3154 +               return -ENOMEM;
3155 +       }
3156 +       memset(clog->recovering_bits, 0, bitset_size);
3157 +       clog->sync_search = 0;
3158 +       log->context = clog;
3159 +       return 0;
3160 +}
3161 +
3162 +static void core_dtr(struct dirty_log *log)
3163 +{
3164 +       struct core_log *clog = (struct core_log *) log->context;
3165 +       vfree(clog->clean_bits);
3166 +       vfree(clog->sync_bits);
3167 +       vfree(clog->recovering_bits);
3168 +       kfree(clog);
3169 +}
3170 +
3171 +static sector_t core_get_region_size(struct dirty_log *log)
3172 +{
3173 +       struct core_log *clog = (struct core_log *) log->context;
3174 +       return clog->region_size;
3175 +}
3176 +
3177 +static int core_is_clean(struct dirty_log *log, region_t region)
3178 +{
3179 +       struct core_log *clog = (struct core_log *) log->context;
3180 +       return test_bit(region, clog->clean_bits);
3181 +}
3182 +
3183 +static int core_in_sync(struct dirty_log *log, region_t region, int block)
3184 +{
3185 +       struct core_log *clog = (struct core_log *) log->context;
3186 +
3187 +       return test_bit(region, clog->sync_bits) ? 1 : 0;
3188 +}
3189 +
3190 +static int core_flush(struct dirty_log *log)
3191 +{
3192 +       /* no op */
3193 +       return 0;
3194 +}
3195 +
3196 +static void core_mark_region(struct dirty_log *log, region_t region)
3197 +{
3198 +       struct core_log *clog = (struct core_log *) log->context;
3199 +       clear_bit(region, clog->clean_bits);
3200 +}
3201 +
3202 +static void core_clear_region(struct dirty_log *log, region_t region)
3203 +{
3204 +       struct core_log *clog = (struct core_log *) log->context;
3205 +       set_bit(region, clog->clean_bits);
3206 +}
3207 +
3208 +static int core_get_resync_work(struct dirty_log *log, region_t *region)
3209 +{
3210 +       struct core_log *clog = (struct core_log *) log->context;
3211 +
3212 +       if (clog->sync_search >= clog->region_count)
3213 +               return 0;
3214 +
3215 +       do {
3216 +               *region = find_next_zero_bit(clog->sync_bits,
3217 +                                            clog->region_count,
3218 +                                            clog->sync_search);
3219 +               clog->sync_search = *region + 1;
3220 +
3221 +               if (*region == clog->region_count)
3222 +                       return 0;
3223 +
3224 +       } while (test_bit(*region, clog->recovering_bits));
3225 +
3226 +       set_bit(*region, clog->recovering_bits);
3227 +       return 1;
3228 +}
3229 +
3230 +static void core_complete_resync_work(struct dirty_log *log, region_t region,
3231 +                                     int success)
3232 +{
3233 +       struct core_log *clog = (struct core_log *) log->context;
3234 +
3235 +       clear_bit(region, clog->recovering_bits);
3236 +       if (success)
3237 +               set_bit(region, clog->sync_bits);
3238 +}
3239 +
3240 +static struct dirty_log_type _core_type = {
3241 +       .name = "core",
3242 +
3243 +       .ctr = core_ctr,
3244 +       .dtr = core_dtr,
3245 +       .get_region_size = core_get_region_size,
3246 +       .is_clean = core_is_clean,
3247 +       .in_sync = core_in_sync,
3248 +       .flush = core_flush,
3249 +       .mark_region = core_mark_region,
3250 +       .clear_region = core_clear_region,
3251 +       .get_resync_work = core_get_resync_work,
3252 +       .complete_resync_work = core_complete_resync_work
3253 +};
3254 +
3255 +__init int dm_dirty_log_init(void)
3256 +{
3257 +       int r;
3258 +
3259 +       r = dm_register_dirty_log_type(&_core_type);
3260 +       if (r)
3261 +               DMWARN("couldn't register core log");
3262 +
3263 +       return r;
3264 +}
3265 +
3266 +void dm_dirty_log_exit(void)
3267 +{
3268 +       dm_unregister_dirty_log_type(&_core_type);
3269 +}
3270 +
3271 +EXPORT_SYMBOL(dm_register_dirty_log_type);
3272 +EXPORT_SYMBOL(dm_unregister_dirty_log_type);
3273 +EXPORT_SYMBOL(dm_dirty_log_init);
3274 +EXPORT_SYMBOL(dm_dirty_log_exit);
3275 +EXPORT_SYMBOL(dm_create_dirty_log);
3276 +EXPORT_SYMBOL(dm_destroy_dirty_log);
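
The "core" log registered above takes exactly one constructor argument, the region size in sectors. A hedged sketch of how a client (such as the mirror target that follows) might create and later destroy such a log through the exported wrappers; the 1024-sector region size is only an example.

#include "dm-log.h"

/* Attach an in-core dirty log covering dev_size sectors. */
static struct dirty_log *attach_core_log(sector_t dev_size)
{
        char region_size[] = "1024";            /* sectors per region */
        char *argv[] = { region_size };

        return dm_create_dirty_log("core", dev_size, 1, argv);
}

static void detach_core_log(struct dirty_log *log)
{
        if (log)
                dm_destroy_dirty_log(log);
}
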
3277 --- linux-2.4.21/drivers/md/dm-log.h    Thu Jan  1 01:00:00 1970
3278 +++ linux/drivers/md/dm-log.h   Wed Aug 20 14:41:38 2003
3279 @@ -0,0 +1,112 @@
3280 +/*
3281 + * Copyright (C) 2003 Sistina Software
3282 + *
3283 + * This file is released under the LGPL.
3284 + */
3285 +
3286 +#ifndef DM_DIRTY_LOG
3287 +#define DM_DIRTY_LOG
3288 +
3289 +#include "dm.h"
3290 +
3291 +typedef sector_t region_t;
3292 +
3293 +struct dirty_log_type;
3294 +
3295 +struct dirty_log {
3296 +       struct dirty_log_type *type;
3297 +       void *context;
3298 +};
3299 +
3300 +struct dirty_log_type {
3301 +       struct list_head list;
3302 +       const char *name;
3303 +       struct module *module;
3304 +       unsigned int use_count;
3305 +
3306 +       int (*ctr)(struct dirty_log *log, sector_t dev_size,
3307 +                  unsigned int argc, char **argv);
3308 +       void (*dtr)(struct dirty_log *log);
3309 +
3310 +       /*
3311 +        * Retrieves the smallest size of region that the log can
3312 +        * deal with.
3313 +        */
3314 +       sector_t (*get_region_size)(struct dirty_log *log);
3315 +
3316 +       /*
3317 +        * A predicate to say whether a region is clean or not.
3318 +        * May block.
3319 +        */
3320 +       int (*is_clean)(struct dirty_log *log, region_t region);
3321 +
3322 +       /*
3323 +        *  Returns: 0, 1, -EWOULDBLOCK, < 0
3324 +        *
3325 +        * A predicate function to check whether the given
3326 +        * region is in sync.
3327 +        *
3328 +        * If -EWOULDBLOCK is returned the state of the region is
3329 +        * unknown, typically this will result in a read being
3330 +        * passed to a daemon to deal with, since a daemon is
3331 +        * allowed to block.
3332 +        */
3333 +       int (*in_sync)(struct dirty_log *log, region_t region, int can_block);
3334 +
3335 +       /*
3336 +        * Flush the current log state (eg, to disk).  This
3337 +        * function may block.
3338 +        */
3339 +       int (*flush)(struct dirty_log *log);
3340 +
3341 +       /*
3342 +        * Mark an area as clean or dirty.  These functions may
3343 +        * block, though for performance reasons blocking should
3344 +        * be extremely rare (eg, allocating another chunk of
3345 +        * memory for some reason).
3346 +        */
3347 +       void (*mark_region)(struct dirty_log *log, region_t region);
3348 +       void (*clear_region)(struct dirty_log *log, region_t region);
3349 +
3350 +       /*
3351 +        * Returns: <0 (error), 0 (no region), 1 (region)
3352 +        *
3353 +        * The mirrord will need to perform recovery on regions
3354 +        * of the mirror that are in the NOSYNC state.  This
3355 +        * function asks the log to tell the caller about the
3356 +        * next region that this machine should recover.
3357 +        *
3358 +        * Do not confuse this function with 'in_sync()': one
3359 +        * tells you if an area is synchronised, the other
3360 +        * assigns recovery work.
3361 +        */
3362 +       int (*get_resync_work)(struct dirty_log *log, region_t *region);
3363 +
3364 +       /*
3365 +        * This notifies the log that the resync of an area has
3366 +        * been completed.  The log should then mark this region
3367 +        * as CLEAN.
3368 +        */
3369 +       void (*complete_resync_work)(struct dirty_log *log,
3370 +                                    region_t region, int success);
3371 +};
3372 +
3373 +int dm_register_dirty_log_type(struct dirty_log_type *type);
3374 +int dm_unregister_dirty_log_type(struct dirty_log_type *type);
3375 +
3376 +
3377 +/*
3378 + * Make sure you use these two functions, rather than calling
3379 + * type->constructor/destructor() directly.
3380 + */
3381 +struct dirty_log *dm_create_dirty_log(const char *type_name, sector_t dev_size,
3382 +                                     unsigned int argc, char **argv);
3383 +void dm_destroy_dirty_log(struct dirty_log *log);
3384 +
3385 +/*
3386 + * init/exit functions.
3387 + */
3388 +int dm_dirty_log_init(void);
3389 +void dm_dirty_log_exit(void);
3390 +
3391 +#endif
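
To make the get_resync_work()/complete_resync_work() contract described above concrete, here is a hedged sketch of the caller's half of that handshake; the real recovery path lives in dm-raid1.c below and hands the copying to kcopyd instead of the placeholder comment shown here.

#include "dm-log.h"

/* Pull regions that need resync from the log and report the outcome. */
static void recover_pending_regions(struct dirty_log *log)
{
        region_t region;

        while (log->type->get_resync_work(log, &region) == 1) {
                int success = 1;

                /* ... copy the region from the primary mirror here ... */

                log->type->complete_resync_work(log, region, success);
        }
}
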
3392 --- linux-2.4.21/drivers/md/dm-raid1.c  Thu Jan  1 01:00:00 1970
3393 +++ linux/drivers/md/dm-raid1.c Wed Aug 20 14:41:38 2003
3394 @@ -0,0 +1,1297 @@
3395 +/*
3396 + * Copyright (C) 2003 Sistina Software Limited.
3397 + *
3398 + * This file is released under the GPL.
3399 + */
3400 +
3401 +#include "dm.h"
3402 +#include "dm-daemon.h"
3403 +#include "dm-io.h"
3404 +#include "dm-log.h"
3405 +#include "kcopyd.h"
3406 +
3407 +#include <linux/ctype.h>
3408 +#include <linux/init.h>
3409 +#include <linux/mempool.h>
3410 +#include <linux/module.h>
3411 +#include <linux/pagemap.h>
3412 +#include <linux/slab.h>
3413 +#include <linux/time.h>
3414 +#include <linux/vmalloc.h>
3415 +
3416 +static struct dm_daemon _kmirrord;
3417 +
3418 +/*-----------------------------------------------------------------
3419 + * buffer lists:
3420 + *
3421 + * We play with singly linked lists of buffers, but we want to be
3422 + * careful to add new buffers to the back of the list, to avoid
3423 + * buffers being starved of attention.
3424 + *---------------------------------------------------------------*/
3425 +struct buffer_list {
3426 +       struct buffer_head *head;
3427 +       struct buffer_head *tail;
3428 +};
3429 +
3430 +static inline void buffer_list_init(struct buffer_list *bl)
3431 +{
3432 +       bl->head = bl->tail = NULL;
3433 +}
3434 +
3435 +static inline void buffer_list_add(struct buffer_list *bl,
3436 +                                  struct buffer_head *bh)
3437 +{
3438 +       bh->b_reqnext = NULL;
3439 +
3440 +       if (bl->tail) {
3441 +               bl->tail->b_reqnext = bh;
3442 +               bl->tail = bh;
3443 +       } else
3444 +               bl->head = bl->tail = bh;
3445 +}
3446 +
3447 +static struct buffer_head *buffer_list_pop(struct buffer_list *bl)
3448 +{
3449 +       struct buffer_head *bh = bl->head;
3450 +
3451 +       if (bh) {
3452 +               bl->head = bl->head->b_reqnext;
3453 +               if (!bl->head)
3454 +                       bl->tail = NULL;
3455 +
3456 +               bh->b_reqnext = NULL;
3457 +       }
3458 +
3459 +       return bh;
3460 +}
3461 +
3462 +/*-----------------------------------------------------------------
3463 + * Region hash
3464 + *
3465 + * The mirror splits itself up into discrete regions.  Each region
3466 + * can be clean, dirty, nosync or recovering.  There is no need
3467 + * to put clean regions in the hash.
3468 + *
3469 + * In addition to being present in the hash table a region _may_
3470 + * be present on one of three lists.
3471 + *
3472 + *   clean_regions: Regions on this list have no io pending to
3473 + *   them, they are in sync, we are no longer interested in them,
3474 + *   they are dull.  rh_update_states() will remove them from the
3475 + *   hash table.
3476 + *
3477 + *   quiesced_regions: These regions have been spun down, ready
3478 + *   for recovery.  rh_recovery_start() will remove regions from
3479 + *   this list and hand them to kmirrord, which will schedule the
3480 + *   recovery io with kcopyd.
3481 + *
3482 + *   recovered_regions: Regions that kcopyd has successfully
3483 + *   recovered.  rh_update_states() will now schedule any delayed
3484 + *   io, up the recovery_count, and remove the region from the
3485 + *   hash.
3486 + *
3487 + * There are 2 locks:
3488 + *   A rw spin lock 'hash_lock' protects just the hash table,
3489 + *   this is never held in write mode from interrupt context,
3490 + *   which I believe means that we only have to disable irqs when
3491 + *   doing a write lock.
3492 + *
3493 + *   An ordinary spin lock 'region_lock' protects the three
3494 + *   lists in the region_hash, together with the 'state', 'list'
3495 + *   and 'delayed_bhs' fields of the regions.  This is used from
3496 + *   irq context, so all other uses will have to suspend local irqs.
3497 + *---------------------------------------------------------------*/
3498 +struct mirror_set;
3499 +struct region_hash {
3500 +       struct mirror_set *ms;
3501 +       sector_t region_size;
3502 +
3503 +       /* holds persistent region state */
3504 +       struct dirty_log *log;
3505 +
3506 +       /* hash table */
3507 +       rwlock_t hash_lock;
3508 +       mempool_t *region_pool;
3509 +       unsigned int mask;
3510 +       unsigned int nr_buckets;
3511 +       struct list_head *buckets;
3512 +
3513 +       spinlock_t region_lock;
3514 +       struct semaphore recovery_count;
3515 +       struct list_head clean_regions;
3516 +       struct list_head quiesced_regions;
3517 +       struct list_head recovered_regions;
3518 +};
3519 +
3520 +enum {
3521 +       RH_CLEAN,
3522 +       RH_DIRTY,
3523 +       RH_NOSYNC,
3524 +       RH_RECOVERING
3525 +};
3526 +
3527 +struct region {
3528 +       struct region_hash *rh; /* FIXME: can we get rid of this ? */
3529 +       region_t key;
3530 +       int state;
3531 +
3532 +       struct list_head hash_list;
3533 +       struct list_head list;
3534 +
3535 +       atomic_t pending;
3536 +       struct buffer_head *delayed_bhs;
3537 +};
3538 +
3539 +/*
3540 + * Conversion fns
3541 + */
3542 +static inline region_t bh_to_region(struct region_hash *rh,
3543 +                                   struct buffer_head *bh)
3544 +{
3545 +       return bh->b_rsector / rh->region_size;
3546 +}
3547 +
3548 +static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
3549 +{
3550 +       return region * rh->region_size;
3551 +}
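
To make the conversion concrete, here is a minimal user-space sketch of the same arithmetic (the 1024-sector region size and the sample sector are illustrative values, not taken from the patch):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint64_t sector_t;          /* assumed 64-bit for this demo */
    typedef uint64_t region_t;

    /* mirrors bh_to_region(): which region does a sector fall in? */
    static region_t sector_to_region(sector_t sector, sector_t region_size)
    {
            return sector / region_size;
    }

    /* mirrors region_to_sector(): at which sector does a region start? */
    static sector_t region_start(region_t region, sector_t region_size)
    {
            return region * region_size;
    }

    int main(void)
    {
            sector_t region_size = 1024;        /* 512 KiB regions */
            sector_t rsector = 5000;            /* e.g. a bh->b_rsector */
            region_t region = sector_to_region(rsector, region_size);

            /* prints: sector 5000 -> region 4 (starts at sector 4096) */
            printf("sector %llu -> region %llu (starts at sector %llu)\n",
                   (unsigned long long) rsector,
                   (unsigned long long) region,
                   (unsigned long long) region_start(region, region_size));
            return 0;
    }
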
3552 +
3553 +/* FIXME move this */
3554 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw);
3555 +
3556 +static void *region_alloc(int gfp_mask, void *pool_data)
3557 +{
3558 +       return kmalloc(sizeof(struct region), gfp_mask);
3559 +}
3560 +
3561 +static void region_free(void *element, void *pool_data)
3562 +{
3563 +       kfree(element);
3564 +}
3565 +
3566 +#define MIN_REGIONS 64
3567 +#define MAX_RECOVERY 1
3568 +static int rh_init(struct region_hash *rh, struct mirror_set *ms,
3569 +                  struct dirty_log *log, sector_t region_size,
3570 +                  region_t nr_regions)
3571 +{
3572 +       unsigned int nr_buckets, max_buckets;
3573 +       size_t i;
3574 +
3575 +       /*
3576 +        * Calculate a suitable number of buckets for our hash
3577 +        * table.
3578 +        */
3579 +       max_buckets = nr_regions >> 6;
3580 +       for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
3581 +               ;
3582 +       nr_buckets >>= 1;
3583 +
3584 +       rh->ms = ms;
3585 +       rh->log = log;
3586 +       rh->region_size = region_size;
3587 +       rwlock_init(&rh->hash_lock);
3588 +       rh->mask = nr_buckets - 1;
3589 +       rh->nr_buckets = nr_buckets;
3590 +
3591 +       rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
3592 +       if (!rh->buckets) {
3593 +               DMERR("unable to allocate region hash memory");
3594 +               return -ENOMEM;
3595 +       }
3596 +
3597 +       for (i = 0; i < nr_buckets; i++)
3598 +               INIT_LIST_HEAD(rh->buckets + i);
3599 +
3600 +       spin_lock_init(&rh->region_lock);
3601 +       sema_init(&rh->recovery_count, 0);
3602 +       INIT_LIST_HEAD(&rh->clean_regions);
3603 +       INIT_LIST_HEAD(&rh->quiesced_regions);
3604 +       INIT_LIST_HEAD(&rh->recovered_regions);
3605 +
3606 +       rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
3607 +                                        region_free, NULL);
3608 +       if (!rh->region_pool) {
3609 +               vfree(rh->buckets);
3610 +               rh->buckets = NULL;
3611 +               return -ENOMEM;
3612 +       }
3613 +
3614 +       return 0;
3615 +}
3616 +
3617 +static void rh_exit(struct region_hash *rh)
3618 +{
3619 +       unsigned int h;
3620 +       struct region *reg;
3621 +       struct list_head *tmp, *tmp2;
3622 +
3623 +       BUG_ON(!list_empty(&rh->quiesced_regions));
3624 +       for (h = 0; h < rh->nr_buckets; h++) {
3625 +               list_for_each_safe (tmp, tmp2, rh->buckets + h) {
3626 +                       reg = list_entry(tmp, struct region, hash_list);
3627 +                       BUG_ON(atomic_read(&reg->pending));
3628 +                       mempool_free(reg, rh->region_pool);
3629 +               }
3630 +       }
3631 +
3632 +       if (rh->log)
3633 +               dm_destroy_dirty_log(rh->log);
3634 +       if (rh->region_pool)
3635 +               mempool_destroy(rh->region_pool);
3636 +       vfree(rh->buckets);
3637 +}
3638 +
3639 +#define RH_HASH_MULT 2654435387U
3640 +
3641 +static inline unsigned int rh_hash(struct region_hash *rh, region_t region)
3642 +{
3643 +       return (unsigned int) ((region * RH_HASH_MULT) >> 12) & rh->mask;
3644 +}
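
RH_HASH_MULT is a Knuth-style multiplicative hash constant: the region number is multiplied by it, the low bits are shifted away and the result is masked down to a bucket index.  A stand-alone sketch of the same expression (the 64-bit region type and the 1023 mask are assumptions for the demo, not values from the patch):

    #include <stdio.h>
    #include <stdint.h>

    #define RH_HASH_MULT 2654435387U

    /* same expression as rh_hash(), with the struct stripped away */
    static unsigned int hash_region(uint64_t region, unsigned int mask)
    {
            return (unsigned int) ((region * RH_HASH_MULT) >> 12) & mask;
    }

    int main(void)
    {
            uint64_t r;

            /* consecutive region numbers land in widely separated buckets */
            for (r = 0; r < 8; r++)
                    printf("region %llu -> bucket %u\n",
                           (unsigned long long) r, hash_region(r, 1023));
            return 0;
    }
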
3645 +
3646 +static struct region *__rh_lookup(struct region_hash *rh, region_t region)
3647 +{
3648 +       struct region *reg;
3649 +
3650 +       list_for_each_entry (reg, rh->buckets + rh_hash(rh, region), hash_list)
3651 +               if (reg->key == region)
3652 +                       return reg;
3653 +
3654 +       return NULL;
3655 +}
3656 +
3657 +static void __rh_insert(struct region_hash *rh, struct region *reg)
3658 +{
3659 +       unsigned int h = rh_hash(rh, reg->key);
3660 +       list_add(&reg->hash_list, rh->buckets + h);
3661 +}
3662 +
3663 +static struct region *__rh_alloc(struct region_hash *rh, region_t region)
3664 +{
3665 +       struct region *reg, *nreg;
3666 +
3667 +       read_unlock(&rh->hash_lock);
3668 +       nreg = mempool_alloc(rh->region_pool, GFP_NOIO);
3669 +       nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
3670 +               RH_CLEAN : RH_NOSYNC;
3671 +       nreg->rh = rh;
3672 +       nreg->key = region;
3673 +
3674 +       INIT_LIST_HEAD(&nreg->list);
3675 +
3676 +       atomic_set(&nreg->pending, 0);
3677 +       nreg->delayed_bhs = NULL;
3678 +       write_lock_irq(&rh->hash_lock);
3679 +
3680 +       reg = __rh_lookup(rh, region);
3681 +       if (reg)
3682 +               /* we lost the race */
3683 +               mempool_free(nreg, rh->region_pool);
3684 +
3685 +       else {
3686 +               __rh_insert(rh, nreg);
3687 +               if (nreg->state == RH_CLEAN) {
3688 +                       spin_lock_irq(&rh->region_lock);
3689 +                       list_add(&nreg->list, &rh->clean_regions);
3690 +                       spin_unlock_irq(&rh->region_lock);
3691 +               }
3692 +               reg = nreg;
3693 +       }
3694 +       write_unlock_irq(&rh->hash_lock);
3695 +       read_lock(&rh->hash_lock);
3696 +
3697 +       return reg;
3698 +}
3699 +
3700 +static inline struct region *__rh_find(struct region_hash *rh, region_t region)
3701 +{
3702 +       struct region *reg;
3703 +
3704 +       reg = __rh_lookup(rh, region);
3705 +       if (!reg)
3706 +               reg = __rh_alloc(rh, region);
3707 +
3708 +       return reg;
3709 +}
3710 +
3711 +static int rh_state(struct region_hash *rh, region_t region, int may_block)
3712 +{
3713 +       int r;
3714 +       struct region *reg;
3715 +
3716 +       read_lock(&rh->hash_lock);
3717 +       reg = __rh_lookup(rh, region);
3718 +       read_unlock(&rh->hash_lock);
3719 +
3720 +       if (reg)
3721 +               return reg->state;
3722 +
3723 +       /*
3724 +        * The region wasn't in the hash, so we fall back to the
3725 +        * dirty log.
3726 +        */
3727 +       r = rh->log->type->in_sync(rh->log, region, may_block);
3728 +
3729 +       /*
3730 +        * Any error from the dirty log (e.g. -EWOULDBLOCK) is
3731 +        * treated as RH_NOSYNC.
3732 +        */
3733 +       return r == 1 ? RH_CLEAN : RH_NOSYNC;
3734 +}
3735 +
3736 +static inline int rh_in_sync(struct region_hash *rh,
3737 +                            region_t region, int may_block)
3738 +{
3739 +       int state = rh_state(rh, region, may_block);
3740 +       return state == RH_CLEAN || state == RH_DIRTY;
3741 +}
3742 +
3743 +static void dispatch_buffers(struct mirror_set *ms, struct buffer_head *bh)
3744 +{
3745 +       struct buffer_head *nbh;
3746 +
3747 +       while (bh) {
3748 +               nbh = bh->b_reqnext;
3749 +               queue_bh(ms, bh, WRITE);
3750 +               bh = nbh;
3751 +       }
3752 +}
3753 +
3754 +static void rh_update_states(struct region_hash *rh)
3755 +{
3756 +       struct list_head *tmp, *tmp2;
3757 +       struct region *reg;
3758 +
3759 +       LIST_HEAD(clean);
3760 +       LIST_HEAD(recovered);
3761 +
3762 +       /*
3763 +        * Quickly grab the lists.
3764 +        */
3765 +       write_lock_irq(&rh->hash_lock);
3766 +       spin_lock(&rh->region_lock);
3767 +       if (!list_empty(&rh->clean_regions)) {
3768 +               list_splice(&rh->clean_regions, &clean);
3769 +               INIT_LIST_HEAD(&rh->clean_regions);
3770 +
3771 +               list_for_each_entry (reg, &clean, list) {
3772 +                       rh->log->type->clear_region(rh->log, reg->key);
3773 +                       list_del(&reg->hash_list);
3774 +               }
3775 +       }
3776 +
3777 +       if (!list_empty(&rh->recovered_regions)) {
3778 +               list_splice(&rh->recovered_regions, &recovered);
3779 +               INIT_LIST_HEAD(&rh->recovered_regions);
3780 +
3781 +               list_for_each_entry (reg, &recovered, list)
3782 +                       list_del(&reg->hash_list);
3783 +       }
3784 +       spin_unlock(&rh->region_lock);
3785 +       write_unlock_irq(&rh->hash_lock);
3786 +
3787 +       /*
3788 +        * All the regions on the recovered and clean lists have
3789 +        * now been pulled out of the system, so no need to do
3790 +        * any more locking.
3791 +        */
3792 +       list_for_each_safe (tmp, tmp2, &recovered) {
3793 +               reg = list_entry(tmp, struct region, list);
3794 +
3795 +               rh->log->type->complete_resync_work(rh->log, reg->key, 1);
3796 +               dispatch_buffers(rh->ms, reg->delayed_bhs);
3797 +               up(&rh->recovery_count);
3798 +               mempool_free(reg, rh->region_pool);
3799 +       }
3800 +
3801 +       list_for_each_safe (tmp, tmp2, &clean) {
3802 +               reg = list_entry(tmp, struct region, list);
3803 +               mempool_free(reg, rh->region_pool);
3804 +       }
3805 +}
3806 +
3807 +static void rh_inc(struct region_hash *rh, region_t region)
3808 +{
3809 +       struct region *reg;
3810 +
3811 +       read_lock(&rh->hash_lock);
3812 +       reg = __rh_find(rh, region);
3813 +       if (reg->state == RH_CLEAN) {
3814 +               rh->log->type->mark_region(rh->log, reg->key);
3815 +
3816 +               spin_lock_irq(&rh->region_lock);
3817 +               reg->state = RH_DIRTY;
3818 +               list_del_init(&reg->list);      /* take off the clean list */
3819 +               spin_unlock_irq(&rh->region_lock);
3820 +       }
3821 +
3822 +       atomic_inc(&reg->pending);
3823 +       read_unlock(&rh->hash_lock);
3824 +}
3825 +
3826 +static void rh_inc_pending(struct region_hash *rh, struct buffer_list *buffers)
3827 +{
3828 +       struct buffer_head *bh;
3829 +
3830 +       for (bh = buffers->head; bh; bh = bh->b_reqnext)
3831 +               rh_inc(rh, bh_to_region(rh, bh));
3832 +}
3833 +
3834 +static void rh_dec(struct region_hash *rh, region_t region)
3835 +{
3836 +       unsigned long flags;
3837 +       struct region *reg;
3838 +       int wake = 0;
3839 +
3840 +       read_lock(&rh->hash_lock);
3841 +       reg = __rh_lookup(rh, region);
3842 +       read_unlock(&rh->hash_lock);
3843 +
3844 +       if (atomic_dec_and_test(&reg->pending)) {
3845 +               spin_lock_irqsave(&rh->region_lock, flags);
3846 +               if (reg->state == RH_RECOVERING) {
3847 +                       list_add_tail(&reg->list, &rh->quiesced_regions);
3848 +               } else {
3849 +                       reg->state = RH_CLEAN;
3850 +                       list_add(&reg->list, &rh->clean_regions);
3851 +               }
3852 +               spin_unlock_irqrestore(&rh->region_lock, flags);
3853 +               wake = 1;
3854 +       }
3855 +
3856 +       if (wake)
3857 +               dm_daemon_wake(&_kmirrord);
3858 +}
3859 +
3860 +/*
3861 + * Starts quiescing a region in preparation for recovery.
3862 + */
3863 +static int __rh_recovery_prepare(struct region_hash *rh)
3864 +{
3865 +       int r;
3866 +       struct region *reg;
3867 +       region_t region;
3868 +
3869 +       /*
3870 +        * Ask the dirty log what's next.
3871 +        */
3872 +       r = rh->log->type->get_resync_work(rh->log, &region);
3873 +       if (r <= 0)
3874 +               return r;
3875 +
3876 +       /*
3877 +        * Get this region, and start it quiescing by setting the
3878 +        * recovering flag.
3879 +        */
3880 +       read_lock(&rh->hash_lock);
3881 +       reg = __rh_find(rh, region);
3882 +       read_unlock(&rh->hash_lock);
3883 +
3884 +       spin_lock_irq(&rh->region_lock);
3885 +       reg->state = RH_RECOVERING;
3886 +
3887 +       /* Already quiesced ? */
3888 +       if (atomic_read(&reg->pending))
3889 +               list_del_init(&reg->list);
3890 +
3891 +       else {
3892 +               list_del_init(&reg->list);
3893 +               list_add(&reg->list, &rh->quiesced_regions);
3894 +       }
3895 +       spin_unlock_irq(&rh->region_lock);
3896 +
3897 +       return 1;
3898 +}
3899 +
3900 +static void rh_recovery_prepare(struct region_hash *rh)
3901 +{
3902 +       while (!down_trylock(&rh->recovery_count))
3903 +               if (__rh_recovery_prepare(rh) <= 0) {
3904 +                       up(&rh->recovery_count);
3905 +                       break;
3906 +               }
3907 +}
3908 +
3909 +/*
3910 + * Returns the next quiesced region, or NULL if there are none.
3911 + */
3912 +static struct region *rh_recovery_start(struct region_hash *rh)
3913 +{
3914 +       struct region *reg = NULL;
3915 +
3916 +       spin_lock_irq(&rh->region_lock);
3917 +       if (!list_empty(&rh->quiesced_regions)) {
3918 +               reg = list_entry(rh->quiesced_regions.next,
3919 +                                struct region, list);
3920 +               list_del_init(&reg->list);      /* remove from the quiesced list */
3921 +       }
3922 +       spin_unlock_irq(&rh->region_lock);
3923 +
3924 +       return reg;
3925 +}
3926 +
3927 +/* FIXME: success ignored for now */
3928 +static void rh_recovery_end(struct region *reg, int success)
3929 +{
3930 +       struct region_hash *rh = reg->rh;
3931 +
3932 +       spin_lock_irq(&rh->region_lock);
3933 +       list_add(&reg->list, &reg->rh->recovered_regions);
3934 +       spin_unlock_irq(&rh->region_lock);
3935 +
3936 +       dm_daemon_wake(&_kmirrord);
3937 +}
3938 +
3939 +static void rh_flush(struct region_hash *rh)
3940 +{
3941 +       rh->log->type->flush(rh->log);
3942 +}
3943 +
3944 +static void rh_delay(struct region_hash *rh, struct buffer_head *bh)
3945 +{
3946 +       struct region *reg;
3947 +
3948 +       read_lock(&rh->hash_lock);
3949 +       reg = __rh_find(rh, bh_to_region(rh, bh));
3950 +       bh->b_reqnext = reg->delayed_bhs;
3951 +       reg->delayed_bhs = bh;
3952 +       read_unlock(&rh->hash_lock);
3953 +}
3954 +
3955 +static void rh_stop_recovery(struct region_hash *rh)
3956 +{
3957 +       int i;
3958 +
3959 +       /* wait for any recovering regions */
3960 +       for (i = 0; i < MAX_RECOVERY; i++)
3961 +               down(&rh->recovery_count);
3962 +}
3963 +
3964 +static void rh_start_recovery(struct region_hash *rh)
3965 +{
3966 +       int i;
3967 +
3968 +       for (i = 0; i < MAX_RECOVERY; i++)
3969 +               up(&rh->recovery_count);
3970 +
3971 +       dm_daemon_wake(&_kmirrord);
3972 +}
3973 +
3974 +/*-----------------------------------------------------------------
3975 + * Mirror set structures.
3976 + *---------------------------------------------------------------*/
3977 +struct mirror {
3978 +       atomic_t error_count;
3979 +       struct dm_dev *dev;
3980 +       sector_t offset;
3981 +};
3982 +
3983 +struct mirror_set {
3984 +       struct dm_target *ti;
3985 +       struct list_head list;
3986 +       struct region_hash rh;
3987 +       struct kcopyd_client *kcopyd_client;
3988 +
3989 +       spinlock_t lock;        /* protects the next two lists */
3990 +       struct buffer_list reads;
3991 +       struct buffer_list writes;
3992 +
3993 +       /* recovery */
3994 +       region_t nr_regions;
3995 +       region_t sync_count;
3996 +
3997 +       unsigned int nr_mirrors;
3998 +       struct mirror mirror[0];
3999 +};
4000 +
4001 +/*
4002 + * Every mirror should look like this one.
4003 + */
4004 +#define DEFAULT_MIRROR 0
4005 +
4006 +/*
4007 + * This is yucky.  We squirrel the mirror_set struct away inside
4008 + * b_reqnext for write buffers.  This is safe since the bh
4009 + * doesn't get submitted to the lower levels of the block layer.
4010 + */
4011 +static struct mirror_set *bh_get_ms(struct buffer_head *bh)
4012 +{
4013 +       return (struct mirror_set *) bh->b_reqnext;
4014 +}
4015 +
4016 +static void bh_set_ms(struct buffer_head *bh, struct mirror_set *ms)
4017 +{
4018 +       bh->b_reqnext = (struct buffer_head *) ms;
4019 +}
4020 +
4021 +/*-----------------------------------------------------------------
4022 + * Recovery.
4023 + *
4024 + * When a mirror is first activated we may find that some regions
4025 + * are in the no-sync state.  We have to recover these by
4026 + * recopying from the default mirror to all the others.
4027 + *---------------------------------------------------------------*/
4028 +static void recovery_complete(int read_err, unsigned int write_err,
4029 +                             void *context)
4030 +{
4031 +       struct region *reg = (struct region *) context;
4032 +       struct mirror_set *ms = reg->rh->ms;
4033 +
4034 +       /* FIXME: better error handling */
4035 +       rh_recovery_end(reg, read_err || write_err);
4036 +       if (++ms->sync_count == ms->nr_regions)
4037 +               /* the sync is complete */
4038 +               dm_table_event(ms->ti->table);
4039 +}
4040 +
4041 +static int recover(struct mirror_set *ms, struct region *reg)
4042 +{
4043 +       int r;
4044 +       unsigned int i;
4045 +       struct io_region from, to[ms->nr_mirrors - 1], *dest;
4046 +       struct mirror *m;
4047 +       unsigned int flags = 0;
4048 +
4049 +       /* fill in the source */
4050 +       m = ms->mirror + DEFAULT_MIRROR;
4051 +       from.dev = m->dev->dev;
4052 +       from.sector = m->offset + region_to_sector(reg->rh, reg->key);
4053 +       if (reg->key == (ms->nr_regions - 1)) {
4054 +               /*
4055 +                * The final region may be smaller than
4056 +                * region_size.
4057 +                */
4058 +               from.count = ms->ti->len & (reg->rh->region_size - 1);
4059 +               if (!from.count)
4060 +                       from.count = reg->rh->region_size;
4061 +       } else
4062 +               from.count = reg->rh->region_size;
4063 +
4064 +       /* fill in the destinations */
4065 +       for (i = 1; i < ms->nr_mirrors; i++) {
4066 +               m = ms->mirror + i;
4067 +               dest = to + (i - 1);
4068 +
4069 +               dest->dev = m->dev->dev;
4070 +               dest->sector = m->offset + region_to_sector(reg->rh, reg->key);
4071 +               dest->count = from.count;
4072 +       }
4073 +
4074 +       /* hand to kcopyd */
4075 +       set_bit(KCOPYD_IGNORE_ERROR, &flags);
4076 +       r = kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, flags,
4077 +                       recovery_complete, reg);
4078 +
4079 +       return r;
4080 +}
4081 +
4082 +static void do_recovery(struct mirror_set *ms)
4083 +{
4084 +       int r;
4085 +       struct region *reg;
4086 +
4087 +       /*
4088 +        * Start quiescing some regions.
4089 +        */
4090 +       rh_recovery_prepare(&ms->rh);
4091 +
4092 +       /*
4093 +        * Copy any already quiesced regions.
4094 +        */
4095 +       while ((reg = rh_recovery_start(&ms->rh))) {
4096 +               r = recover(ms, reg);
4097 +               if (r)
4098 +                       rh_recovery_end(reg, 0);
4099 +       }
4100 +}
4101 +
4102 +/*-----------------------------------------------------------------
4103 + * Reads
4104 + *---------------------------------------------------------------*/
4105 +static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
4106 +{
4107 +       /* FIXME: add read balancing */
4108 +       return ms->mirror + DEFAULT_MIRROR;
4109 +}
4110 +
4111 +/*
4112 + * remap a buffer to a particular mirror.
4113 + */
4114 +static void map_buffer(struct mirror_set *ms,
4115 +                      struct mirror *m, struct buffer_head *bh)
4116 +{
4117 +       sector_t bsize = bh->b_size >> 9;
4118 +       sector_t rsector = bh->b_blocknr * bsize;
4119 +
4120 +       bh->b_rdev = m->dev->dev;
4121 +       bh->b_rsector = m->offset + (rsector - ms->ti->begin);
4122 +}
4123 +
4124 +static void do_reads(struct mirror_set *ms, struct buffer_list *reads)
4125 +{
4126 +       region_t region;
4127 +       struct buffer_head *bh;
4128 +       struct mirror *m;
4129 +
4130 +       while ((bh = buffer_list_pop(reads))) {
4131 +               region = bh_to_region(&ms->rh, bh);
4132 +
4133 +               /*
4134 +                * We can only read balance if the region is in sync.
4135 +                */
4136 +               if (rh_in_sync(&ms->rh, region, 0))
4137 +                       m = choose_mirror(ms, bh->b_rsector);
4138 +               else
4139 +                       m = ms->mirror + DEFAULT_MIRROR;
4140 +
4141 +               map_buffer(ms, m, bh);
4142 +               generic_make_request(READ, bh);
4143 +       }
4144 +}
4145 +
4146 +/*-----------------------------------------------------------------
4147 + * Writes.
4148 + *
4149 + * We do different things with the write io depending on the
4150 + * state of the region that it's in:
4151 + *
4152 + * SYNC:       increment pending, use dm-io to write to *all* mirrors
4153 + * RECOVERING: delay the io until recovery completes
4154 + * NOSYNC:     increment pending, just write to the default mirror
4155 + *---------------------------------------------------------------*/
4156 +static void write_callback(unsigned int error, void *context)
4157 +{
4158 +       unsigned int i;
4159 +       int uptodate = 1;
4160 +       struct buffer_head *bh = (struct buffer_head *) context;
4161 +       struct mirror_set *ms;
4162 +
4163 +       ms = bh_get_ms(bh);
4164 +       bh_set_ms(bh, NULL);
4165 +
4166 +       /*
4167 +        * NOTE: We don't decrement the pending count here,
4168 +        * instead it is done by the targets endio function.
4169 +        * This way we handle both writes to SYNC and NOSYNC
4170 +        * regions with the same code.
4171 +        */
4172 +
4173 +       if (error) {
4174 +               /*
4175 +                * only error the io if all mirrors failed.
4176 +                * FIXME: bogus
4177 +                */
4178 +               uptodate = 0;
4179 +               for (i = 0; i < ms->nr_mirrors; i++)
4180 +                       if (!test_bit(i, &error)) {
4181 +                               uptodate = 1;
4182 +                               break;
4183 +                       }
4184 +       }
4185 +       bh->b_end_io(bh, uptodate);
4186 +}
4187 +
4188 +static void do_write(struct mirror_set *ms, struct buffer_head *bh)
4189 +{
4190 +       unsigned int i;
4191 +       struct io_region io[ms->nr_mirrors];
4192 +       struct mirror *m;
4193 +
4194 +       for (i = 0; i < ms->nr_mirrors; i++) {
4195 +               m = ms->mirror + i;
4196 +
4197 +               io[i].dev = m->dev->dev;
4198 +               io[i].sector = m->offset + (bh->b_rsector - ms->ti->begin);
4199 +               io[i].count = bh->b_size >> 9;
4200 +       }
4201 +
4202 +       bh_set_ms(bh, ms);
4203 +       dm_io_async(ms->nr_mirrors, io, WRITE, bh->b_page,
4204 +                   (unsigned int) bh->b_data & ~PAGE_MASK, write_callback, bh);
4205 +}
4206 +
4207 +static void do_writes(struct mirror_set *ms, struct buffer_list *writes)
4208 +{
4209 +       int state;
4210 +       struct buffer_head *bh;
4211 +       struct buffer_list sync, nosync, recover, *this_list = NULL;
4212 +
4213 +       if (!writes->head)
4214 +               return;
4215 +
4216 +       /*
4217 +        * Classify each write.
4218 +        */
4219 +       buffer_list_init(&sync);
4220 +       buffer_list_init(&nosync);
4221 +       buffer_list_init(&recover);
4222 +
4223 +       while ((bh = buffer_list_pop(writes))) {
4224 +               state = rh_state(&ms->rh, bh_to_region(&ms->rh, bh), 1);
4225 +               switch (state) {
4226 +               case RH_CLEAN:
4227 +               case RH_DIRTY:
4228 +                       this_list = &sync;
4229 +                       break;
4230 +
4231 +               case RH_NOSYNC:
4232 +                       this_list = &nosync;
4233 +                       break;
4234 +
4235 +               case RH_RECOVERING:
4236 +                       this_list = &recover;
4237 +                       break;
4238 +               }
4239 +
4240 +               buffer_list_add(this_list, bh);
4241 +       }
4242 +
4243 +       /*
4244 +        * Increment the pending counts for any regions that will
4245 +        * be written to (writes to recover regions are going to
4246 +        * be delayed).
4247 +        */
4248 +       rh_inc_pending(&ms->rh, &sync);
4249 +       rh_inc_pending(&ms->rh, &nosync);
4250 +       rh_flush(&ms->rh);
4251 +
4252 +       /*
4253 +        * Dispatch io.
4254 +        */
4255 +       while ((bh = buffer_list_pop(&sync)))
4256 +               do_write(ms, bh);
4257 +
4258 +       while ((bh = buffer_list_pop(&recover)))
4259 +               rh_delay(&ms->rh, bh);
4260 +
4261 +       while ((bh = buffer_list_pop(&nosync))) {
4262 +               map_buffer(ms, ms->mirror + DEFAULT_MIRROR, bh);
4263 +               generic_make_request(WRITE, bh);
4264 +       }
4265 +}
4266 +
4267 +/*-----------------------------------------------------------------
4268 + * kmirrord
4269 + *---------------------------------------------------------------*/
4270 +static LIST_HEAD(_mirror_sets);
4271 +static DECLARE_RWSEM(_mirror_sets_lock);
4272 +
4273 +static void do_mirror(struct mirror_set *ms)
4274 +{
4275 +       struct buffer_list reads, writes;
4276 +
4277 +       spin_lock(&ms->lock);
4278 +       memcpy(&reads, &ms->reads, sizeof(reads));
4279 +       buffer_list_init(&ms->reads);
4280 +       memcpy(&writes, &ms->writes, sizeof(writes));
4281 +       buffer_list_init(&ms->writes);
4282 +       spin_unlock(&ms->lock);
4283 +
4284 +       rh_update_states(&ms->rh);
4285 +       do_recovery(ms);
4286 +       do_reads(ms, &reads);
4287 +       do_writes(ms, &writes);
4288 +       run_task_queue(&tq_disk);
4289 +}
4290 +
4291 +static void do_work(void)
4292 +{
4293 +       struct mirror_set *ms;
4294 +
4295 +       down_read(&_mirror_sets_lock);
4296 +       list_for_each_entry (ms, &_mirror_sets, list)
4297 +               do_mirror(ms);
4298 +       up_read(&_mirror_sets_lock);
4299 +}
4300 +
4301 +/*-----------------------------------------------------------------
4302 + * Target functions
4303 + *---------------------------------------------------------------*/
4304 +static struct mirror_set *alloc_context(unsigned int nr_mirrors,
4305 +                                       sector_t region_size,
4306 +                                       struct dm_target *ti,
4307 +                                       struct dirty_log *dl)
4308 +{
4309 +       size_t len;
4310 +       struct mirror_set *ms = NULL;
4311 +
4312 +       if (array_too_big(sizeof(*ms), sizeof(ms->mirror[0]), nr_mirrors))
4313 +               return NULL;
4314 +
4315 +       len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
4316 +
4317 +       ms = kmalloc(len, GFP_KERNEL);
4318 +       if (!ms) {
4319 +               ti->error = "dm-mirror: Cannot allocate mirror context";
4320 +               return NULL;
4321 +       }
4322 +
4323 +       memset(ms, 0, len);
4324 +       spin_lock_init(&ms->lock);
4325 +
4326 +       ms->ti = ti;
4327 +       ms->nr_mirrors = nr_mirrors;
4328 +       ms->nr_regions = dm_div_up(ti->len, region_size);
4329 +
4330 +       if (rh_init(&ms->rh, ms, dl, region_size, ms->nr_regions)) {
4331 +               ti->error = "dm-mirror: Error creating dirty region hash";
4332 +               kfree(ms);
4333 +               return NULL;
4334 +       }
4335 +
4336 +       return ms;
4337 +}
4338 +
4339 +static void free_context(struct mirror_set *ms, struct dm_target *ti,
4340 +                        unsigned int m)
4341 +{
4342 +       while (m--)
4343 +               dm_put_device(ti, ms->mirror[m].dev);
4344 +
4345 +       rh_exit(&ms->rh);
4346 +       kfree(ms);
4347 +}
4348 +
4349 +static inline int _check_region_size(struct dm_target *ti, sector_t size)
4350 +{
4351 +       return !(size % (PAGE_SIZE >> 9) || (size & (size - 1)) ||
4352 +                size > ti->len);
4353 +}
4354 +
4355 +static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
4356 +                     unsigned int mirror, char **argv)
4357 +{
4358 +       sector_t offset;
4359 +
4360 +       if (sscanf(argv[1], SECTOR_FORMAT, &offset) != 1) {
4361 +               ti->error = "dm-mirror: Invalid offset";
4362 +               return -EINVAL;
4363 +       }
4364 +
4365 +       if (dm_get_device(ti, argv[0], offset, ti->len,
4366 +                         dm_table_get_mode(ti->table),
4367 +                         &ms->mirror[mirror].dev)) {
4368 +               ti->error = "dm-mirror: Device lookup failure";
4369 +               return -ENXIO;
4370 +       }
4371 +
4372 +       ms->mirror[mirror].offset = offset;
4373 +
4374 +       return 0;
4375 +}
4376 +
4377 +static int add_mirror_set(struct mirror_set *ms)
4378 +{
4379 +       down_write(&_mirror_sets_lock);
4380 +       list_add_tail(&ms->list, &_mirror_sets);
4381 +       up_write(&_mirror_sets_lock);
4382 +       dm_daemon_wake(&_kmirrord);
4383 +
4384 +       return 0;
4385 +}
4386 +
4387 +static void del_mirror_set(struct mirror_set *ms)
4388 +{
4389 +       down_write(&_mirror_sets_lock);
4390 +       list_del(&ms->list);
4391 +       up_write(&_mirror_sets_lock);
4392 +}
4393 +
4394 +/*
4395 + * Create dirty log: log_type #log_params <log_params>
4396 + */
4397 +static struct dirty_log *create_dirty_log(struct dm_target *ti,
4398 +                                         unsigned int argc, char **argv,
4399 +                                         unsigned int *args_used)
4400 +{
4401 +       unsigned int param_count;
4402 +       struct dirty_log *dl;
4403 +
4404 +       if (argc < 2) {
4405 +               ti->error = "dm-mirror: Insufficient mirror log arguments";
4406 +               return NULL;
4407 +       }
4408 +
4409 +       if (sscanf(argv[1], "%u", &param_count) != 1 || param_count != 1) {
4410 +               ti->error = "dm-mirror: Invalid mirror log argument count";
4411 +               return NULL;
4412 +       }
4413 +
4414 +       *args_used = 2 + param_count;
4415 +
4416 +       if (argc < *args_used) {
4417 +               ti->error = "dm-mirror: Insufficient mirror log arguments";
4418 +               return NULL;
4419 +       }
4420 +
4421 +       dl = dm_create_dirty_log(argv[0], ti->len, param_count, argv + 2);
4422 +       if (!dl) {
4423 +               ti->error = "dm-mirror: Error creating mirror dirty log";
4424 +               return NULL;
4425 +       }
4426 +
4427 +       if (!_check_region_size(ti, dl->type->get_region_size(dl))) {
4428 +               ti->error = "dm-mirror: Invalid region size";
4429 +               dm_destroy_dirty_log(dl);
4430 +               return NULL;
4431 +       }
4432 +
4433 +       return dl;
4434 +}
4435 +
4436 +/*
4437 + * Construct a mirror mapping:
4438 + *
4439 + * log_type #log_params <log_params>
4440 + * #mirrors [mirror_path offset]{2,}
4441 + *
4442 + * For now, #log_params = 1, log_type = "core"
4443 + *
4444 + */
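
As a concrete illustration (the device names, sizes and offsets below are hypothetical, not part of the patch), a two-way mirror over the first 2097152 sectors could be loaded with a table line such as:

    0 2097152 mirror core 1 1024 2 /dev/sdb1 0 /dev/sdc1 0

Here "core 1 1024" selects the core dirty log with its single parameter (the region size in sectors, which is also what mirror_status() echoes back further down), "2" is #mirrors, and each mirror contributes a "<path> <offset>" pair, matching the argument checks in mirror_ctr().
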
4445 +#define DM_IO_PAGES 64
4446 +static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4447 +{
4448 +       int r;
4449 +       unsigned int nr_mirrors, m, args_used;
4450 +       struct mirror_set *ms;
4451 +       struct dirty_log *dl;
4452 +
4453 +       dl = create_dirty_log(ti, argc, argv, &args_used);
4454 +       if (!dl)
4455 +               return -EINVAL;
4456 +
4457 +       argv += args_used;
4458 +       argc -= args_used;
4459 +
4460 +       if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
4461 +           nr_mirrors < 2) {
4462 +               ti->error = "dm-mirror: Invalid number of mirrors";
4463 +               dm_destroy_dirty_log(dl);
4464 +               return -EINVAL;
4465 +       }
4466 +
4467 +       argv++, argc--;
4468 +
4469 +       if (argc != nr_mirrors * 2) {
4470 +               ti->error = "dm-mirror: Wrong number of mirror arguments";
4471 +               dm_destroy_dirty_log(dl);
4472 +               return -EINVAL;
4473 +       }
4474 +
4475 +       ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
4476 +       if (!ms) {
4477 +               dm_destroy_dirty_log(dl);
4478 +               return -ENOMEM;
4479 +       }
4480 +
4481 +       /* Get the mirror parameter sets */
4482 +       for (m = 0; m < nr_mirrors; m++) {
4483 +               r = get_mirror(ms, ti, m, argv);
4484 +               if (r) {
4485 +                       free_context(ms, ti, m);
4486 +                       return r;
4487 +               }
4488 +               argv += 2;
4489 +               argc -= 2;
4490 +       }
4491 +
4492 +       ti->private = ms;
4493 +
4494 +       r = kcopyd_client_create(DM_IO_PAGES, &ms->kcopyd_client);
4495 +       if (r) {
4496 +               free_context(ms, ti, ms->nr_mirrors);
4497 +               return r;
4498 +       }
4499 +
4500 +       add_mirror_set(ms);
4501 +       return 0;
4502 +}
4503 +
4504 +static void mirror_dtr(struct dm_target *ti)
4505 +{
4506 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4507 +
4508 +       del_mirror_set(ms);
4509 +       kcopyd_client_destroy(ms->kcopyd_client);
4510 +       free_context(ms, ti, ms->nr_mirrors);
4511 +}
4512 +
4513 +static void queue_bh(struct mirror_set *ms, struct buffer_head *bh, int rw)
4514 +{
4515 +       int wake = 0;
4516 +       struct buffer_list *bl;
4517 +
4518 +       bl = (rw == WRITE) ? &ms->writes : &ms->reads;
4519 +       spin_lock(&ms->lock);
4520 +       wake = !(bl->head);
4521 +       buffer_list_add(bl, bh);
4522 +       spin_unlock(&ms->lock);
4523 +
4524 +       if (wake)
4525 +               dm_daemon_wake(&_kmirrord);
4526 +}
4527 +
4528 +/*
4529 + * Mirror mapping function
4530 + */
4531 +static int mirror_map(struct dm_target *ti, struct buffer_head *bh,
4532 +                     int rw, union map_info *map_context)
4533 +{
4534 +       int r;
4535 +       struct mirror *m;
4536 +       struct mirror_set *ms = ti->private;
4537 +
4538 +       /* FIXME: nasty hack, 32 bit sector_t only */
4539 +       map_context->ll = bh->b_rsector / ms->rh.region_size;
4540 +
4541 +       if (rw == WRITE) {
4542 +               queue_bh(ms, bh, rw);
4543 +               return 0;
4544 +       }
4545 +
4546 +       r = ms->rh.log->type->in_sync(ms->rh.log, bh_to_region(&ms->rh, bh), 0);
4547 +       if (r < 0 && r != -EWOULDBLOCK)
4548 +               return r;
4549 +
4550 +       if (r == -EWOULDBLOCK)  /* FIXME: ugly */
4551 +               r = 0;
4552 +
4553 +       /*
4554 +        * We don't want to fast track a recovery just for a read
4555 +        * ahead.  So we just let it silently fail.
4556 +        * FIXME: get rid of this.
4557 +        */
4558 +       if (!r && rw == READA)
4559 +               return -EIO;
4560 +
4561 +       if (!r) {
4562 +               /* Pass this io over to the daemon */
4563 +               queue_bh(ms, bh, rw);
4564 +               return 0;
4565 +       }
4566 +
4567 +       m = choose_mirror(ms, bh->b_rsector);
4568 +       if (!m)
4569 +               return -EIO;
4570 +
4571 +       map_buffer(ms, m, bh);
4572 +       return 1;
4573 +}
4574 +
4575 +static int mirror_end_io(struct dm_target *ti, struct buffer_head *bh,
4576 +                        int rw, int error, union map_info *map_context)
4577 +{
4578 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4579 +       region_t region = map_context->ll;
4580 +
4581 +       /*
4582 +        * We need to dec pending if this was a write.
4583 +        */
4584 +       if (rw == WRITE)
4585 +               rh_dec(&ms->rh, region);
4586 +
4587 +       return 0;
4588 +}
4589 +
4590 +static void mirror_suspend(struct dm_target *ti)
4591 +{
4592 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4593 +       rh_stop_recovery(&ms->rh);
4594 +}
4595 +
4596 +static void mirror_resume(struct dm_target *ti)
4597 +{
4598 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4599 +       rh_start_recovery(&ms->rh);
4600 +}
4601 +
4602 +static int mirror_status(struct dm_target *ti, status_type_t type,
4603 +                        char *result, unsigned int maxlen)
4604 +{
4605 +       unsigned int m, sz = 0;
4606 +       struct mirror_set *ms = (struct mirror_set *) ti->private;
4607 +
4608 +       switch (type) {
4609 +       case STATUSTYPE_INFO:
4610 +               sz += snprintf(result + sz, maxlen - sz, "%d ", ms->nr_mirrors);
4611 +
4612 +               for (m = 0; m < ms->nr_mirrors; m++)
4613 +                       sz += snprintf(result + sz, maxlen - sz, "%s ",
4614 +                                      dm_kdevname(ms->mirror[m].dev->dev));
4615 +
4616 +               sz += snprintf(result + sz, maxlen - sz, "%lu/%lu",
4617 +                              ms->sync_count, ms->nr_regions);
4618 +               break;
4619 +
4620 +       case STATUSTYPE_TABLE:
4621 +               sz += snprintf(result + sz, maxlen - sz,
4622 +                              "%s 1 " SECTOR_FORMAT " %d ",
4623 +                              ms->rh.log->type->name, ms->rh.region_size,
4624 +                              ms->nr_mirrors);
4625 +
4626 +               for (m = 0; m < ms->nr_mirrors; m++)
4627 +                       sz += snprintf(result + sz, maxlen - sz, "%s %ld ",
4628 +                                      dm_kdevname(ms->mirror[m].dev->dev),
4629 +                                      ms->mirror[m].offset);
4630 +       }
4631 +
4632 +       return 0;
4633 +}
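
From the format strings above, the hypothetical two-way mirror used earlier (region size 1024, hence 2048 regions over 2097152 sectors) would report roughly the following, where <dev0>/<dev1> stand for whatever dm_kdevname() prints for the two mirror devices:

    INFO  (dmsetup status):  2 <dev0> <dev1> 1024/2048
    TABLE (dmsetup table):   core 1 1024 2 <dev0> 0 <dev1> 0

with "1024/2048" being sync_count/nr_regions, i.e. a resync that is half way through.
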
4634 +
4635 +static struct target_type mirror_target = {
4636 +       .name    = "mirror",
4637 +       .module  = THIS_MODULE,
4638 +       .ctr     = mirror_ctr,
4639 +       .dtr     = mirror_dtr,
4640 +       .map     = mirror_map,
4641 +       .end_io  = mirror_end_io,
4642 +       .suspend = mirror_suspend,
4643 +       .resume  = mirror_resume,
4644 +       .status  = mirror_status,
4645 +};
4646 +
4647 +static int __init dm_mirror_init(void)
4648 +{
4649 +       int r;
4650 +
4651 +       r = dm_dirty_log_init();
4652 +       if (r)
4653 +               return r;
4654 +
4655 +       r = dm_daemon_start(&_kmirrord, "kmirrord", do_work);
4656 +       if (r) {
4657 +               DMERR("couldn't start kmirrord");
4658 +               dm_dirty_log_exit();
4659 +               return r;
4660 +       }
4661 +
4662 +       r = dm_register_target(&mirror_target);
4663 +       if (r < 0) {
4664 +               DMERR("%s: Failed to register mirror target",
4665 +                     mirror_target.name);
4666 +               dm_dirty_log_exit();
4667 +               dm_daemon_stop(&_kmirrord);
4668 +       }
4669 +
4670 +       return r;
4671 +}
4672 +
4673 +static void __exit dm_mirror_exit(void)
4674 +{
4675 +       int r;
4676 +
4677 +       r = dm_unregister_target(&mirror_target);
4678 +       if (r < 0)
4679 +               DMERR("%s: unregister failed %d", mirror_target.name, r);
4680 +
4681 +       dm_daemon_stop(&_kmirrord);
4682 +       dm_dirty_log_exit();
4683 +}
4684 +
4685 +/* Module hooks */
4686 +module_init(dm_mirror_init);
4687 +module_exit(dm_mirror_exit);
4688 +
4689 +MODULE_DESCRIPTION(DM_NAME " mirror target");
4690 +MODULE_AUTHOR("Heinz Mauelshagen <mge@sistina.com>");
4691 +MODULE_LICENSE("GPL");
4692 --- linux-2.4.21/drivers/md/dm-snapshot.c       Thu Jan  1 01:00:00 1970
4693 +++ linux/drivers/md/dm-snapshot.c      Wed Aug 20 14:41:38 2003
4694 @@ -0,0 +1,1235 @@
4695 +/*
4696 + * dm-snapshot.c
4697 + *
4698 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4699 + *
4700 + * This file is released under the GPL.
4701 + */
4702 +
4703 +#include <linux/config.h>
4704 +#include <linux/ctype.h>
4705 +#include <linux/module.h>
4706 +#include <linux/init.h>
4707 +#include <linux/slab.h>
4708 +#include <linux/list.h>
4709 +#include <linux/fs.h>
4710 +#include <linux/blkdev.h>
4711 +#include <linux/mempool.h>
4712 +#include <linux/device-mapper.h>
4713 +#include <linux/vmalloc.h>
4714 +
4715 +#include "dm-snapshot.h"
4716 +#include "kcopyd.h"
4717 +
4718 +/*
4719 + * FIXME: Remove this before release.
4720 + */
4721 +#if 0
4722 +#define DMDEBUG(x...) DMWARN( ## x)
4723 +#else
4724 +#define DMDEBUG(x...)
4725 +#endif
4726 +
4727 +/*
4728 + * The percentage increment we will wake up users at
4729 + */
4730 +#define WAKE_UP_PERCENT 5
4731 +
4732 +/*
4733 + * kcopyd priority of snapshot operations
4734 + */
4735 +#define SNAPSHOT_COPY_PRIORITY 2
4736 +
4737 +/*
4738 + * Each snapshot reserves this many pages for io
4739 + * FIXME: calculate this
4740 + */
4741 +#define SNAPSHOT_PAGES 256
4742 +
4743 +struct pending_exception {
4744 +       struct exception e;
4745 +
4746 +       /*
4747 +        * Origin buffers waiting for this to complete are held
4748 +        * in a list (using b_reqnext).
4749 +        */
4750 +       struct buffer_head *origin_bhs;
4751 +       struct buffer_head *snapshot_bhs;
4752 +
4753 +       /*
4754 +        * Other pending_exceptions that are processing this
4755 +        * chunk.  When this list is empty, we know we can
4756 +        * complete the origins.
4757 +        */
4758 +       struct list_head siblings;
4759 +
4760 +       /* Pointer back to snapshot context */
4761 +       struct dm_snapshot *snap;
4762 +
4763 +       /*
4764 +        * 1 indicates the exception has already been sent to
4765 +        * kcopyd.
4766 +        */
4767 +       int started;
4768 +};
4769 +
4770 +/*
4771 + * Hash table mapping origin volumes to lists of snapshots and
4772 + * a lock to protect it
4773 + */
4774 +static kmem_cache_t *exception_cache;
4775 +static kmem_cache_t *pending_cache;
4776 +static mempool_t *pending_pool;
4777 +
4778 +/*
4779 + * One of these per registered origin, held in the snapshot_origins hash
4780 + */
4781 +struct origin {
4782 +       /* The origin device */
4783 +       kdev_t dev;
4784 +
4785 +       struct list_head hash_list;
4786 +
4787 +       /* List of snapshots for this origin */
4788 +       struct list_head snapshots;
4789 +};
4790 +
4791 +/*
4792 + * Size of the hash table for origin volumes. If we make this
4793 + * the size of the minors list then it should be nearly perfect
4794 + */
4795 +#define ORIGIN_HASH_SIZE 256
4796 +#define ORIGIN_MASK      0xFF
4797 +static struct list_head *_origins;
4798 +static struct rw_semaphore _origins_lock;
4799 +
4800 +static int init_origin_hash(void)
4801 +{
4802 +       int i;
4803 +
4804 +       _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4805 +                          GFP_KERNEL);
4806 +       if (!_origins) {
4807 +               DMERR("Device mapper: Snapshot: unable to allocate memory");
4808 +               return -ENOMEM;
4809 +       }
4810 +
4811 +       for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4812 +               INIT_LIST_HEAD(_origins + i);
4813 +       init_rwsem(&_origins_lock);
4814 +
4815 +       return 0;
4816 +}
4817 +
4818 +static void exit_origin_hash(void)
4819 +{
4820 +       kfree(_origins);
4821 +}
4822 +
4823 +static inline unsigned int origin_hash(kdev_t dev)
4824 +{
4825 +       return MINOR(dev) & ORIGIN_MASK;
4826 +}
4827 +
4828 +static struct origin *__lookup_origin(kdev_t origin)
4829 +{
4830 +       struct list_head *slist;
4831 +       struct list_head *ol;
4832 +       struct origin *o;
4833 +
4834 +       ol = &_origins[origin_hash(origin)];
4835 +       list_for_each(slist, ol) {
4836 +               o = list_entry(slist, struct origin, hash_list);
4837 +
4838 +               if (o->dev == origin)
4839 +                       return o;
4840 +       }
4841 +
4842 +       return NULL;
4843 +}
4844 +
4845 +static void __insert_origin(struct origin *o)
4846 +{
4847 +       struct list_head *sl = &_origins[origin_hash(o->dev)];
4848 +       list_add_tail(&o->hash_list, sl);
4849 +}
4850 +
4851 +/*
4852 + * Make a note of the snapshot and its origin so we can look it
4853 + * up when the origin has a write on it.
4854 + */
4855 +static int register_snapshot(struct dm_snapshot *snap)
4856 +{
4857 +       struct origin *o;
4858 +       kdev_t dev = snap->origin->dev;
4859 +
4860 +       down_write(&_origins_lock);
4861 +       o = __lookup_origin(dev);
4862 +
4863 +       if (!o) {
4864 +               /* New origin */
4865 +               o = kmalloc(sizeof(*o), GFP_KERNEL);
4866 +               if (!o) {
4867 +                       up_write(&_origins_lock);
4868 +                       return -ENOMEM;
4869 +               }
4870 +
4871 +               /* Initialise the struct */
4872 +               INIT_LIST_HEAD(&o->snapshots);
4873 +               o->dev = dev;
4874 +
4875 +               __insert_origin(o);
4876 +       }
4877 +
4878 +       list_add_tail(&snap->list, &o->snapshots);
4879 +
4880 +       up_write(&_origins_lock);
4881 +       return 0;
4882 +}
4883 +
4884 +static void unregister_snapshot(struct dm_snapshot *s)
4885 +{
4886 +       struct origin *o;
4887 +
4888 +       down_write(&_origins_lock);
4889 +       o = __lookup_origin(s->origin->dev);
4890 +
4891 +       list_del(&s->list);
4892 +       if (list_empty(&o->snapshots)) {
4893 +               list_del(&o->hash_list);
4894 +               kfree(o);
4895 +       }
4896 +
4897 +       up_write(&_origins_lock);
4898 +}
4899 +
4900 +/*
4901 + * Implementation of the exception hash tables.
4902 + */
4903 +static int init_exception_table(struct exception_table *et, uint32_t size)
4904 +{
4905 +       unsigned int i;
4906 +
4907 +       et->hash_mask = size - 1;
4908 +       et->table = vcalloc(size, sizeof(struct list_head));
4909 +       if (!et->table)
4910 +               return -ENOMEM;
4911 +
4912 +       for (i = 0; i < size; i++)
4913 +               INIT_LIST_HEAD(et->table + i);
4914 +
4915 +       return 0;
4916 +}
4917 +
4918 +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
4919 +{
4920 +       struct list_head *slot, *entry, *temp;
4921 +       struct exception *ex;
4922 +       int i, size;
4923 +
4924 +       size = et->hash_mask + 1;
4925 +       for (i = 0; i < size; i++) {
4926 +               slot = et->table + i;
4927 +
4928 +               list_for_each_safe(entry, temp, slot) {
4929 +                       ex = list_entry(entry, struct exception, hash_list);
4930 +                       kmem_cache_free(mem, ex);
4931 +               }
4932 +       }
4933 +
4934 +       vfree(et->table);
4935 +}
4936 +
4937 +/*
4938 + * FIXME: check how this hash fn is performing.
4939 + */
4940 +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
4941 +{
4942 +       return chunk & et->hash_mask;
4943 +}
4944 +
4945 +static void insert_exception(struct exception_table *eh, struct exception *e)
4946 +{
4947 +       struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
4948 +       list_add(&e->hash_list, l);
4949 +}
4950 +
4951 +static inline void remove_exception(struct exception *e)
4952 +{
4953 +       list_del(&e->hash_list);
4954 +}
4955 +
4956 +/*
4957 + * Return the exception data for a sector, or NULL if not
4958 + * remapped.
4959 + */
4960 +static struct exception *lookup_exception(struct exception_table *et,
4961 +                                         chunk_t chunk)
4962 +{
4963 +       struct list_head *slot, *el;
4964 +       struct exception *e;
4965 +
4966 +       slot = &et->table[exception_hash(et, chunk)];
4967 +       list_for_each(el, slot) {
4968 +               e = list_entry(el, struct exception, hash_list);
4969 +               if (e->old_chunk == chunk)
4970 +                       return e;
4971 +       }
4972 +
4973 +       return NULL;
4974 +}
4975 +
4976 +static inline struct exception *alloc_exception(void)
4977 +{
4978 +       struct exception *e;
4979 +
4980 +       e = kmem_cache_alloc(exception_cache, GFP_NOIO);
4981 +       if (!e)
4982 +               e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
4983 +
4984 +       return e;
4985 +}
4986 +
4987 +static inline void free_exception(struct exception *e)
4988 +{
4989 +       kmem_cache_free(exception_cache, e);
4990 +}
4991 +
4992 +static inline struct pending_exception *alloc_pending_exception(void)
4993 +{
4994 +       return mempool_alloc(pending_pool, GFP_NOIO);
4995 +}
4996 +
4997 +static inline void free_pending_exception(struct pending_exception *pe)
4998 +{
4999 +       mempool_free(pe, pending_pool);
5000 +}
5001 +
5002 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
5003 +{
5004 +       struct exception *e;
5005 +
5006 +       e = alloc_exception();
5007 +       if (!e)
5008 +               return -ENOMEM;
5009 +
5010 +       e->old_chunk = old;
5011 +       e->new_chunk = new;
5012 +       insert_exception(&s->complete, e);
5013 +       return 0;
5014 +}
5015 +
5016 +/*
5017 + * Hard coded magic.
5018 + */
5019 +static int calc_max_buckets(void)
5020 +{
5021 +       unsigned long mem;
5022 +
5023 +       mem = num_physpages << PAGE_SHIFT;
5024 +       mem /= 50;
5025 +       mem /= sizeof(struct list_head);
5026 +
5027 +       return mem;
5028 +}
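
As a rough worked example (assuming a 32-bit box where struct list_head is 8 bytes; it is 16 bytes on 64-bit): with 512 MiB of RAM this allows 536870912 / 50 / 8, i.e. about 1.3 million buckets, so at most around 2% of memory is ever spent on hash list heads.
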
5029 +
5030 +/*
5031 + * Rounds a number down to a power of 2.
5032 + */
5033 +static inline uint32_t round_down(uint32_t n)
5034 +{
5035 +       while (n & (n - 1))
5036 +               n &= (n - 1);
5037 +       return n;
5038 +}
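
Worked example: n = 1000 loses its lowest set bit on each pass, 1000 -> 992 -> 960 -> 896 -> 768 -> 512, and the loop stops at 512 because a power of two has only a single bit set.
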
5039 +
5040 +/*
5041 + * Allocate room for a suitable hash table.
5042 + */
5043 +static int init_hash_tables(struct dm_snapshot *s)
5044 +{
5045 +       sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
5046 +
5047 +       /*
5048 +        * Calculate based on the size of the original volume or
5049 +        * the COW volume...
5050 +        */
5051 +       cow_dev_size = get_dev_size(s->cow->dev);
5052 +       origin_dev_size = get_dev_size(s->origin->dev);
5053 +       max_buckets = calc_max_buckets();
5054 +
5055 +       hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
5056 +       hash_size = min(hash_size, max_buckets);
5057 +
5058 +       /* Round it down to a power of 2 */
5059 +       hash_size = round_down(hash_size);
5060 +       if (init_exception_table(&s->complete, hash_size))
5061 +               return -ENOMEM;
5062 +
5063 +       /*
5064 +        * Allocate hash table for in-flight exceptions
5065 +        * Make this smaller than the real hash table
5066 +        */
5067 +       hash_size >>= 3;
5068 +       if (!hash_size)
5069 +               hash_size = 64;
5070 +
5071 +       if (init_exception_table(&s->pending, hash_size)) {
5072 +               exit_exception_table(&s->complete, exception_cache);
5073 +               return -ENOMEM;
5074 +       }
5075 +
5076 +       return 0;
5077 +}
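
With illustrative numbers (not from the patch): a 1048576-sector COW device, a larger origin and 16-sector chunks give the 'complete' table min(1048576, origin) / 16 = 65536 buckets, already a power of two and well under the calc_max_buckets() cap on any recent machine, while the 'pending' table gets 65536 >> 3 = 8192 buckets.
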
5078 +
5079 +/*
5080 + * Round a number up to the nearest 'size' boundary.  size must
5081 + * be a power of 2.
5082 + */
5083 +static inline ulong round_up(ulong n, ulong size)
5084 +{
5085 +       size--;
5086 +       return (n + size) & ~size;
5087 +}
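
Worked example with size = 4: round_up(5, 4) evaluates (5 + 3) & ~3 = 8, while round_up(8, 4) stays at 8; the trick relies on size being a power of two so that, after the decrement, ~size masks off exactly the low bits.
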
5088 +
5089 +/*
5090 + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
5091 + */
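
As with the mirror target, a hypothetical table line (devices and sizes are illustrative only) for a persistent snapshot using 16-sector (8 KiB) chunks would be:

    0 4194304 snapshot /dev/vg0/lvol0 /dev/vg0/lvol0-cow P 16

snapshot_ctr() below parses this as <origin_dev> <COW-dev> <p/n> <chunk-size>; the chunk size is silently rounded up to a multiple of the page size in sectors and must be a power of two no larger than one kiovec.
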
5092 +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5093 +{
5094 +       struct dm_snapshot *s;
5095 +       unsigned long chunk_size;
5096 +       int r = -EINVAL;
5097 +       char persistent;
5098 +       char *origin_path;
5099 +       char *cow_path;
5100 +       char *value;
5101 +       int blocksize;
5102 +
5103 +       if (argc < 4) {
5104 +               ti->error = "dm-snapshot: requires exactly 4 arguments";
5105 +               r = -EINVAL;
5106 +               goto bad1;
5107 +       }
5108 +
5109 +       origin_path = argv[0];
5110 +       cow_path = argv[1];
5111 +       persistent = toupper(*argv[2]);
5112 +
5113 +       if (persistent != 'P' && persistent != 'N') {
5114 +               ti->error = "Persistent flag is not P or N";
5115 +               r = -EINVAL;
5116 +               goto bad1;
5117 +       }
5118 +
5119 +       chunk_size = simple_strtoul(argv[3], &value, 10);
5120 +       if (chunk_size == 0 || value == NULL) {
5121 +               ti->error = "Invalid chunk size";
5122 +               r = -EINVAL;
5123 +               goto bad1;
5124 +       }
5125 +
5126 +       s = kmalloc(sizeof(*s), GFP_KERNEL);
5127 +       if (s == NULL) {
5128 +               ti->error = "Cannot allocate snapshot context private "
5129 +                   "structure";
5130 +               r = -ENOMEM;
5131 +               goto bad1;
5132 +       }
5133 +
5134 +       r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
5135 +       if (r) {
5136 +               ti->error = "Cannot get origin device";
5137 +               goto bad2;
5138 +       }
5139 +
5140 +       /* FIXME: get cow length */
5141 +       r = dm_get_device(ti, cow_path, 0, 0,
5142 +                         FMODE_READ | FMODE_WRITE, &s->cow);
5143 +       if (r) {
5144 +               dm_put_device(ti, s->origin);
5145 +               ti->error = "Cannot get COW device";
5146 +               goto bad2;
5147 +       }
5148 +
5149 +       /*
5150 +        * Chunk size must be a multiple of the page size.  Silently
5151 +        * round up if it's not.
5152 +        */
5153 +       chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
5154 +
5155 +       /* Validate the chunk size against the device block size */
5156 +       blocksize = get_hardsect_size(s->cow->dev);
5157 +       if (chunk_size % (blocksize / SECTOR_SIZE)) {
5158 +               ti->error = "Chunk size is not a multiple of device blocksize";
5159 +               r = -EINVAL;
5160 +               goto bad3;
5161 +       }
5162 +
5163 +       /* Check the sizes are small enough to fit in one kiovec */
5164 +       if (chunk_size > KIO_MAX_SECTORS) {
5165 +               ti->error = "Chunk size is too big";
5166 +               r = -EINVAL;
5167 +               goto bad3;
5168 +       }
5169 +
5170 +       /* Check chunk_size is a power of 2 */
5171 +       if (chunk_size & (chunk_size - 1)) {
5172 +               ti->error = "Chunk size is not a power of 2";
5173 +               r = -EINVAL;
5174 +               goto bad3;
5175 +       }
5176 +
5177 +       s->chunk_size = chunk_size;
5178 +       s->chunk_mask = chunk_size - 1;
5179 +       s->type = persistent;
5180 +       for (s->chunk_shift = 0; chunk_size;
5181 +            s->chunk_shift++, chunk_size >>= 1)
5182 +               ;
5183 +       s->chunk_shift--;
5184 +
5185 +       s->valid = 1;
5186 +       s->have_metadata = 0;
5187 +       s->last_percent = 0;
5188 +       init_rwsem(&s->lock);
5189 +       s->table = ti->table;
5190 +
5191 +       /* Allocate hash table for COW data */
5192 +       if (init_hash_tables(s)) {
5193 +               ti->error = "Unable to allocate hash table space";
5194 +               r = -ENOMEM;
5195 +               goto bad3;
5196 +       }
5197 +
5198 +       /*
5199 +        * Check the persistent flag - done here because we need the iobuf
5200 +        * to check the LV header
5201 +        */
5202 +       s->store.snap = s;
5203 +
5204 +       if (persistent == 'P')
5205 +               r = dm_create_persistent(&s->store, s->chunk_size);
5206 +       else
5207 +               r = dm_create_transient(&s->store, s, blocksize);
5208 +
5209 +       if (r) {
5210 +               ti->error = "Couldn't create exception store";
5211 +               r = -EINVAL;
5212 +               goto bad4;
5213 +       }
5214 +
5215 +       r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
5216 +       if (r) {
5217 +               ti->error = "Could not create kcopyd client";
5218 +               goto bad5;
5219 +       }
5220 +
5221 +       /* Flush IO to the origin device */
5222 +       fsync_dev(s->origin->dev);
5223 +
5224 +       /* Add snapshot to the list of snapshots for this origin */
5225 +       if (register_snapshot(s)) {
5226 +               r = -EINVAL;
5227 +               ti->error = "Cannot register snapshot origin";
5228 +               goto bad6;
5229 +       }
5230 +
5231 +       ti->private = s;
5232 +       return 0;
5233 +
5234 + bad6:
5235 +       kcopyd_client_destroy(s->kcopyd_client);
5236 +
5237 + bad5:
5238 +       s->store.destroy(&s->store);
5239 +
5240 + bad4:
5241 +       exit_exception_table(&s->pending, pending_cache);
5242 +       exit_exception_table(&s->complete, exception_cache);
5243 +
5244 + bad3:
5245 +       dm_put_device(ti, s->cow);
5246 +       dm_put_device(ti, s->origin);
5247 +
5248 + bad2:
5249 +       kfree(s);
5250 +
5251 + bad1:
5252 +       return r;
5253 +}
5254 +
5255 +static void snapshot_dtr(struct dm_target *ti)
5256 +{
5257 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5258 +
5259 +       dm_table_event(ti->table);
5260 +
5261 +       unregister_snapshot(s);
5262 +
5263 +       exit_exception_table(&s->pending, pending_cache);
5264 +       exit_exception_table(&s->complete, exception_cache);
5265 +
5266 +       /* Deallocate memory used */
5267 +       s->store.destroy(&s->store);
5268 +
5269 +       dm_put_device(ti, s->origin);
5270 +       dm_put_device(ti, s->cow);
5271 +       kcopyd_client_destroy(s->kcopyd_client);
5272 +       kfree(s);
5273 +}
5274 +
5275 +/*
5276 + * We hold lists of buffer_heads, using the b_reqnext field.
5277 + */
5278 +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
5279 +{
5280 +       bh->b_reqnext = *queue;
5281 +       *queue = bh;
5282 +}
5283 +
5284 +/*
5285 + * FIXME: inefficient.
5286 + */
5287 +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
5288 +{
5289 +       while (*queue)
5290 +               queue = &((*queue)->b_reqnext);
5291 +
5292 +       *queue = bhs;
5293 +}
5294 +
5295 +/*
5296 + * Flush a list of buffers.
5297 + */
5298 +static void flush_buffers(struct buffer_head *bh)
5299 +{
5300 +       struct buffer_head *n;
5301 +
5302 +       DMDEBUG("begin flush");
5303 +       while (bh) {
5304 +               n = bh->b_reqnext;
5305 +               bh->b_reqnext = NULL;
5306 +               DMDEBUG("flushing %p", bh);
5307 +               generic_make_request(WRITE, bh);
5308 +               bh = n;
5309 +       }
5310 +
5311 +       run_task_queue(&tq_disk);
5312 +}
5313 +
5314 +/*
5315 + * Error a list of buffers.
5316 + */
5317 +static void error_buffers(struct buffer_head *bh)
5318 +{
5319 +       struct buffer_head *n;
5320 +
5321 +       while (bh) {
5322 +               n = bh->b_reqnext;
5323 +               bh->b_reqnext = NULL;
5324 +               buffer_IO_error(bh);
5325 +               bh = n;
5326 +       }
5327 +}
5328 +
5329 +static struct buffer_head *__flush_bhs(struct pending_exception *pe)
5330 +{
5331 +       struct pending_exception *sibling;
5332 +
5333 +       if (list_empty(&pe->siblings))
5334 +               return pe->origin_bhs;
5335 +
5336 +       sibling = list_entry(pe->siblings.next,
5337 +                            struct pending_exception, siblings);
5338 +
5339 +       list_del(&pe->siblings);
5340 +
5341 +       /* FIXME: I think there's a race on SMP machines here, add spin lock */
5342 +       queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
5343 +
5344 +       return NULL;
5345 +}
5346 +
5347 +static void pending_complete(struct pending_exception *pe, int success)
5348 +{
5349 +       struct exception *e;
5350 +       struct dm_snapshot *s = pe->snap;
5351 +       struct buffer_head *flush = NULL;
5352 +
5353 +       if (success) {
5354 +               e = alloc_exception();
5355 +               if (!e) {
5356 +                       DMWARN("Unable to allocate exception.");
5357 +                       down_write(&s->lock);
5358 +                       s->store.drop_snapshot(&s->store);
5359 +                       s->valid = 0;
5360 +                       flush = __flush_bhs(pe);
5361 +                       up_write(&s->lock);
5362 +
5363 +                       error_buffers(pe->snapshot_bhs);
5364 +                       goto out;
5365 +               }
5366 +
5367 +               /*
5368 +                * Add a proper exception, and remove the
5369 +                * in-flight exception from the list.
5370 +                */
5371 +               down_write(&s->lock);
5372 +
5373 +               memcpy(e, &pe->e, sizeof(*e));
5374 +               insert_exception(&s->complete, e);
5375 +               remove_exception(&pe->e);
5376 +               flush = __flush_bhs(pe);
5377 +
5378 +               /* Submit any pending write BHs */
5379 +               up_write(&s->lock);
5380 +
5381 +               flush_buffers(pe->snapshot_bhs);
5382 +               DMDEBUG("Exception completed successfully.");
5383 +
5384 +               /* Notify any interested parties */
5385 +               if (s->store.fraction_full) {
5386 +                       sector_t numerator, denominator;
5387 +                       int pc;
5388 +
5389 +                       s->store.fraction_full(&s->store, &numerator,
5390 +                                              &denominator);
5391 +                       pc = numerator * 100 / denominator;
5392 +
5393 +                       if (pc >= s->last_percent + WAKE_UP_PERCENT) {
5394 +                               dm_table_event(s->table);
5395 +                               s->last_percent = pc - pc % WAKE_UP_PERCENT;
5396 +                       }
5397 +               }
5398 +
5399 +       } else {
5400 +               /* Read/write error - snapshot is unusable */
5401 +               down_write(&s->lock);
5402 +               if (s->valid)
5403 +                       DMERR("Error reading/writing snapshot");
5404 +               s->store.drop_snapshot(&s->store);
5405 +               s->valid = 0;
5406 +               remove_exception(&pe->e);
5407 +               flush = __flush_bhs(pe);
5408 +               up_write(&s->lock);
5409 +
5410 +               error_buffers(pe->snapshot_bhs);
5411 +
5412 +               dm_table_event(s->table);
5413 +               DMDEBUG("Exception failed.");
5414 +       }
5415 +
5416 + out:
5417 +       if (flush)
5418 +               flush_buffers(flush);
5419 +
5420 +       free_pending_exception(pe);
5421 +}
5422 +
5423 +static void commit_callback(void *context, int success)
5424 +{
5425 +       struct pending_exception *pe = (struct pending_exception *) context;
5426 +       pending_complete(pe, success);
5427 +}
5428 +
5429 +/*
5430 + * Called when the copy I/O has finished.  kcopyd actually runs
5431 + * this code so don't block.
5432 + */
5433 +static void copy_callback(int read_err, unsigned int write_err, void *context)
5434 +{
5435 +       struct pending_exception *pe = (struct pending_exception *) context;
5436 +       struct dm_snapshot *s = pe->snap;
5437 +
5438 +       if (read_err || write_err)
5439 +               pending_complete(pe, 0);
5440 +
5441 +       else
5442 +               /* Update the metadata if we are persistent */
5443 +               s->store.commit_exception(&s->store, &pe->e, commit_callback,
5444 +                                         pe);
5445 +}
5446 +
5447 +/*
5448 + * Dispatches the copy operation to kcopyd.
5449 + */
5450 +static inline void start_copy(struct pending_exception *pe)
5451 +{
5452 +       struct dm_snapshot *s = pe->snap;
5453 +       struct io_region src, dest;
5454 +       kdev_t dev = s->origin->dev;
5455 +       int *sizes = blk_size[major(dev)];
5456 +       sector_t dev_size = (sector_t) -1;
5457 +
5458 +       if (pe->started)
5459 +               return;
5460 +
5461 +       /* this is protected by snap->lock */
5462 +       pe->started = 1;
5463 +
5464 +       if (sizes && sizes[minor(dev)])
5465 +               dev_size = sizes[minor(dev)] << 1;
5466 +
5467 +       src.dev = dev;
5468 +       src.sector = chunk_to_sector(s, pe->e.old_chunk);
5469 +       src.count = min(s->chunk_size, dev_size - src.sector);
5470 +
5471 +       dest.dev = s->cow->dev;
5472 +       dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5473 +       dest.count = src.count;
5474 +
5475 +       /* Hand over to kcopyd */
5476 +       kcopyd_copy(s->kcopyd_client,
5477 +                   &src, 1, &dest, 0, copy_callback, pe);
5478 +}
5479 +
5480 +/*
5481 + * Looks to see if this snapshot already has a pending exception
5482 + * for this chunk, otherwise it allocates a new one and inserts
5483 + * it into the pending table.
5484 + */
5485 +static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5486 +                                                       struct buffer_head *bh)
5487 +{
5488 +       struct exception *e;
5489 +       struct pending_exception *pe;
5490 +       chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5491 +
5492 +       /*
5493 +        * Is there a pending exception for this already ?
5494 +        */
5495 +       e = lookup_exception(&s->pending, chunk);
5496 +       if (e) {
5497 +               /* cast the exception to a pending exception */
5498 +               pe = list_entry(e, struct pending_exception, e);
5499 +
5500 +       } else {
5501 +               /* Create a new pending exception */
5502 +               pe = alloc_pending_exception();
5503 +               pe->e.old_chunk = chunk;
5504 +               pe->origin_bhs = pe->snapshot_bhs = NULL;
5505 +               INIT_LIST_HEAD(&pe->siblings);
5506 +               pe->snap = s;
5507 +               pe->started = 0;
5508 +
5509 +               if (s->store.prepare_exception(&s->store, &pe->e)) {
5510 +                       free_pending_exception(pe);
5511 +                       s->valid = 0;
5512 +                       return NULL;
5513 +               }
5514 +
5515 +               insert_exception(&s->pending, &pe->e);
5516 +       }
5517 +
5518 +       return pe;
5519 +}
5520 +
5521 +static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5522 +                                  struct buffer_head *bh)
5523 +{
5524 +       bh->b_rdev = s->cow->dev;
5525 +       bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5526 +           (bh->b_rsector & s->chunk_mask);
5527 +}
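+
+/*
+ * The low bits (the offset within the chunk) are preserved, so only the
+ * chunk portion of the sector is redirected.  For example, with a 16
+ * sector chunk (chunk_mask == 15, chunk_shift == 4), remapping sector 37
+ * to new_chunk 5 gives (5 << 4) + (37 & 15) == 80 + 5 == 85.
+ */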
5528 +
5529 +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5530 +                       union map_info *map_context)
5531 +{
5532 +       struct exception *e;
5533 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5534 +       int r = 1;
5535 +       chunk_t chunk;
5536 +       struct pending_exception *pe;
5537 +
5538 +       chunk = sector_to_chunk(s, bh->b_rsector);
5539 +
5540 +       /* Full snapshots are not usable */
5541 +       if (!s->valid)
5542 +               return -1;
5543 +
5544 +       /*
5545 +        * Write to snapshot - higher level takes care of RW/RO
5546 +        * flags so we should only get this if we are
5547 +        * writeable.
5548 +        */
5549 +       if (rw == WRITE) {
5550 +
5551 +               down_write(&s->lock);
5552 +
5553 +               /* If the block is already remapped - use that, else remap it */
5554 +               e = lookup_exception(&s->complete, chunk);
5555 +               if (e)
5556 +                       remap_exception(s, e, bh);
5557 +
5558 +               else {
5559 +                       pe = find_pending_exception(s, bh);
5560 +
5561 +                       if (!pe) {
5562 +                               s->store.drop_snapshot(&s->store);
5563 +                               s->valid = 0;
5564 +                               r = -EIO;
5565 +                       } else {
5566 +                               remap_exception(s, &pe->e, bh);
5567 +                               queue_buffer(&pe->snapshot_bhs, bh);
5568 +                               start_copy(pe);
5569 +                               r = 0;
5570 +                       }
5571 +               }
5572 +
5573 +               up_write(&s->lock);
5574 +
5575 +       } else {
5576 +               /*
5577 +                * FIXME: this read path scares me because we
5578 +                * always use the origin when we have a pending
5579 +                * exception.  However I can't think of a
5580 +                * situation where this is wrong - ejt.
5581 +                */
5582 +
5583 +               /* Do reads */
5584 +               down_read(&s->lock);
5585 +
5586 +               /* See if it has been remapped */
5587 +               e = lookup_exception(&s->complete, chunk);
5588 +               if (e)
5589 +                       remap_exception(s, e, bh);
5590 +               else
5591 +                       bh->b_rdev = s->origin->dev;
5592 +
5593 +               up_read(&s->lock);
5594 +       }
5595 +
5596 +       return r;
5597 +}
5598 +
5599 +void snapshot_resume(struct dm_target *ti)
5600 +{
5601 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5602 +
5603 +       if (s->have_metadata)
5604 +               return;
5605 +
5606 +       if (s->store.read_metadata(&s->store)) {
5607 +               down_write(&s->lock);
5608 +               s->valid = 0;
5609 +               up_write(&s->lock);
5610 +       }
5611 +
5612 +       s->have_metadata = 1;
5613 +}
5614 +
5615 +static int snapshot_status(struct dm_target *ti, status_type_t type,
5616 +                          char *result, unsigned int maxlen)
5617 +{
5618 +       struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5619 +       char cow[16];
5620 +       char org[16];
5621 +
5622 +       switch (type) {
5623 +       case STATUSTYPE_INFO:
5624 +               if (!snap->valid)
5625 +                       snprintf(result, maxlen, "Invalid");
5626 +               else {
5627 +                       if (snap->store.fraction_full) {
5628 +                               sector_t numerator, denominator;
5629 +                               snap->store.fraction_full(&snap->store,
5630 +                                                         &numerator,
5631 +                                                         &denominator);
5632 +                               snprintf(result, maxlen,
5633 +                                        SECTOR_FORMAT "/" SECTOR_FORMAT,
5634 +                                        numerator, denominator);
5635 +                       }
5636 +                       else
5637 +                               snprintf(result, maxlen, "Unknown");
5638 +               }
5639 +               break;
5640 +
5641 +       case STATUSTYPE_TABLE:
5642 +               /*
5643 +                * kdevname returns a static pointer so we need
5644 +                * to make private copies if the output is to
5645 +                * make sense.
5646 +                */
5647 +               strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5648 +               strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5649 +               snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5650 +                        snap->type, snap->chunk_size);
5651 +               break;
5652 +       }
5653 +
5654 +       return 0;
5655 +}
5656 +
5657 +/*-----------------------------------------------------------------
5658 + * Origin methods
5659 + *---------------------------------------------------------------*/
5660 +static void list_merge(struct list_head *l1, struct list_head *l2)
5661 +{
5662 +       struct list_head *l1_n, *l2_p;
5663 +
5664 +       l1_n = l1->next;
5665 +       l2_p = l2->prev;
5666 +
5667 +       l1->next = l2;
5668 +       l2->prev = l1;
5669 +
5670 +       l2_p->next = l1_n;
5671 +       l1_n->prev = l2_p;
5672 +}
5673 +
5674 +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5675 +{
5676 +       int r = 1, first = 1;
5677 +       struct list_head *sl;
5678 +       struct dm_snapshot *snap;
5679 +       struct exception *e;
5680 +       struct pending_exception *pe, *last = NULL;
5681 +       chunk_t chunk;
5682 +
5683 +       /* Do all the snapshots on this origin */
5684 +       list_for_each(sl, snapshots) {
5685 +               snap = list_entry(sl, struct dm_snapshot, list);
5686 +
5687 +               /* Only deal with valid snapshots */
5688 +               if (!snap->valid)
5689 +                       continue;
5690 +
5691 +               down_write(&snap->lock);
5692 +
5693 +               /*
5694 +                * Remember, different snapshots can have
5695 +                * different chunk sizes.
5696 +                */
5697 +               chunk = sector_to_chunk(snap, bh->b_rsector);
5698 +
5699 +               /*
5700 +                * Check exception table to see if block
5701 +                * is already remapped in this snapshot
5702 +                * and trigger an exception if not.
5703 +                */
5704 +               e = lookup_exception(&snap->complete, chunk);
5705 +               if (!e) {
5706 +                       pe = find_pending_exception(snap, bh);
5707 +                       if (!pe) {
5708 +                               snap->store.drop_snapshot(&snap->store);
5709 +                               snap->valid = 0;
5710 +
5711 +                       } else {
5712 +                               if (last)
5713 +                                       list_merge(&pe->siblings,
5714 +                                                  &last->siblings);
5715 +
5716 +                               last = pe;
5717 +                               r = 0;
5718 +                       }
5719 +               }
5720 +
5721 +               up_write(&snap->lock);
5722 +       }
5723 +
5724 +       /*
5725 +        * Now that we have a complete pe list we can start the copying.
5726 +        */
5727 +       if (last) {
5728 +               pe = last;
5729 +               do {
5730 +                       down_write(&pe->snap->lock);
5731 +                       if (first)
5732 +                               queue_buffer(&pe->origin_bhs, bh);
5733 +                       start_copy(pe);
5734 +                       up_write(&pe->snap->lock);
5735 +                       first = 0;
5736 +                       pe = list_entry(pe->siblings.next,
5737 +                                       struct pending_exception, siblings);
5738 +
5739 +               } while (pe != last);
5740 +       }
5741 +
5742 +       return r;
5743 +}
5744 +
5745 +/*
5746 + * Called on a write from the origin driver.
5747 + */
5748 +int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5749 +{
5750 +       struct origin *o;
5751 +       int r;
5752 +
5753 +       down_read(&_origins_lock);
5754 +       o = __lookup_origin(origin->dev);
5755 +       if (!o)
5756 +               BUG();
5757 +
5758 +       r = __origin_write(&o->snapshots, bh);
5759 +       up_read(&_origins_lock);
5760 +
5761 +       return r;
5762 +}
5763 +
5764 +/*
5765 + * Origin: maps a linear range of a device, with hooks for snapshotting.
5766 + */
5767 +
5768 +/*
5769 + * Construct an origin mapping: <dev_path>
5770 + * The context for an origin is merely a 'struct dm_dev *'
5771 + * pointing to the real device.
5772 + */
5773 +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5774 +{
5775 +       int r;
5776 +       struct dm_dev *dev;
5777 +
5778 +       if (argc != 1) {
5779 +               ti->error = "dm-origin: incorrect number of arguments";
5780 +               return -EINVAL;
5781 +       }
5782 +
5783 +       r = dm_get_device(ti, argv[0], 0, ti->len,
5784 +                         dm_table_get_mode(ti->table), &dev);
5785 +       if (r) {
5786 +               ti->error = "Cannot get target device";
5787 +               return r;
5788 +       }
5789 +
5790 +       ti->private = dev;
5791 +       return 0;
5792 +}
5793 +
5794 +static void origin_dtr(struct dm_target *ti)
5795 +{
5796 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5797 +       dm_put_device(ti, dev);
5798 +}
5799 +
5800 +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5801 +                     union map_info *map_context)
5802 +{
5803 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5804 +       bh->b_rdev = dev->dev;
5805 +
5806 +       /* Only tell snapshots if this is a write */
5807 +       return (rw == WRITE) ? do_origin(dev, bh) : 1;
5808 +}
5809 +
5810 +static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5811 +                        unsigned int maxlen)
5812 +{
5813 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5814 +
5815 +       switch (type) {
5816 +       case STATUSTYPE_INFO:
5817 +               result[0] = '\0';
5818 +               break;
5819 +
5820 +       case STATUSTYPE_TABLE:
5821 +               snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5822 +               break;
5823 +       }
5824 +
5825 +       return 0;
5826 +}
5827 +
5828 +static struct target_type origin_target = {
5829 +       name:   "snapshot-origin",
5830 +       module: THIS_MODULE,
5831 +       ctr:    origin_ctr,
5832 +       dtr:    origin_dtr,
5833 +       map:    origin_map,
5834 +       status: origin_status,
5835 +};
5836 +
5837 +static struct target_type snapshot_target = {
5838 +       name:   "snapshot",
5839 +       module: THIS_MODULE,
5840 +       ctr:    snapshot_ctr,
5841 +       dtr:    snapshot_dtr,
5842 +       map:    snapshot_map,
5843 +       resume: snapshot_resume,
5844 +       status: snapshot_status,
5845 +};
5846 +
5847 +int __init dm_snapshot_init(void)
5848 +{
5849 +       int r;
5850 +
5851 +       r = dm_register_target(&snapshot_target);
5852 +       if (r) {
5853 +               DMERR("snapshot target register failed %d", r);
5854 +               return r;
5855 +       }
5856 +
5857 +       r = dm_register_target(&origin_target);
5858 +       if (r < 0) {
5859 +               DMERR("Device mapper: Origin: register failed %d\n", r);
5860 +               goto bad1;
5861 +       }
5862 +
5863 +       r = init_origin_hash();
5864 +       if (r) {
5865 +               DMERR("init_origin_hash failed.");
5866 +               goto bad2;
5867 +       }
5868 +
5869 +       exception_cache = kmem_cache_create("dm-snapshot-ex",
5870 +                                           sizeof(struct exception),
5871 +                                           __alignof__(struct exception),
5872 +                                           0, NULL, NULL);
5873 +       if (!exception_cache) {
5874 +               DMERR("Couldn't create exception cache.");
5875 +               r = -ENOMEM;
5876 +               goto bad3;
5877 +       }
5878 +
5879 +       pending_cache =
5880 +           kmem_cache_create("dm-snapshot-in",
5881 +                             sizeof(struct pending_exception),
5882 +                             __alignof__(struct pending_exception),
5883 +                             0, NULL, NULL);
5884 +       if (!pending_cache) {
5885 +               DMERR("Couldn't create pending cache.");
5886 +               r = -ENOMEM;
5887 +               goto bad4;
5888 +       }
5889 +
5890 +       pending_pool = mempool_create(128, mempool_alloc_slab,
5891 +                                     mempool_free_slab, pending_cache);
5892 +       if (!pending_pool) {
5893 +               DMERR("Couldn't create pending pool.");
5894 +               r = -ENOMEM;
5895 +               goto bad5;
5896 +       }
5897 +
5898 +       return 0;
5899 +
5900 +      bad5:
5901 +       kmem_cache_destroy(pending_cache);
5902 +      bad4:
5903 +       kmem_cache_destroy(exception_cache);
5904 +      bad3:
5905 +       exit_origin_hash();
5906 +      bad2:
5907 +       dm_unregister_target(&origin_target);
5908 +      bad1:
5909 +       dm_unregister_target(&snapshot_target);
5910 +       return r;
5911 +}
5912 +
5913 +void dm_snapshot_exit(void)
5914 +{
5915 +       int r;
5916 +
5917 +       r = dm_unregister_target(&snapshot_target);
5918 +       if (r)
5919 +               DMERR("snapshot unregister failed %d", r);
5920 +
5921 +       r = dm_unregister_target(&origin_target);
5922 +       if (r)
5923 +               DMERR("origin unregister failed %d", r);
5924 +
5925 +       exit_origin_hash();
5926 +       mempool_destroy(pending_pool);
5927 +       kmem_cache_destroy(pending_cache);
5928 +       kmem_cache_destroy(exception_cache);
5929 +}
5930 --- linux-2.4.21/drivers/md/dm-snapshot.h       Thu Jan  1 01:00:00 1970
5931 +++ linux/drivers/md/dm-snapshot.h      Wed Aug 20 14:41:38 2003
5932 @@ -0,0 +1,158 @@
5933 +/*
5934 + * dm-snapshot.c
5935 + *
5936 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5937 + *
5938 + * This file is released under the GPL.
5939 + */
5940 +
5941 +#ifndef DM_SNAPSHOT_H
5942 +#define DM_SNAPSHOT_H
5943 +
5944 +#include "dm.h"
5945 +#include <linux/blkdev.h>
5946 +
5947 +struct exception_table {
5948 +       uint32_t hash_mask;
5949 +       struct list_head *table;
5950 +};
5951 +
5952 +/*
5953 + * The snapshot code deals with largish chunks of the disk at a
5954 + * time. Typically 64k - 256k.
5955 + */
5956 +/* FIXME: can we get away with limiting these to a uint32_t ? */
5957 +typedef sector_t chunk_t;
5958 +
5959 +/*
5960 + * An exception is used where an old chunk of data has been
5961 + * replaced by a new one.
5962 + */
5963 +struct exception {
5964 +       struct list_head hash_list;
5965 +
5966 +       chunk_t old_chunk;
5967 +       chunk_t new_chunk;
5968 +};
5969 +
5970 +/*
5971 + * Abstraction to handle the meta/layout of exception stores (the
5972 + * COW device).
5973 + */
5974 +struct exception_store {
5975 +
5976 +       /*
5977 +        * Destroys this object when you've finished with it.
5978 +        */
5979 +       void (*destroy) (struct exception_store *store);
5980 +
5981 +       /*
5982 +        * The target shouldn't read the COW device until this is
5983 +        * called.
5984 +        */
5985 +       int (*read_metadata) (struct exception_store *store);
5986 +
5987 +       /*
5988 +        * Find somewhere to store the next exception.
5989 +        */
5990 +       int (*prepare_exception) (struct exception_store *store,
5991 +                                 struct exception *e);
5992 +
5993 +       /*
5994 +        * Update the metadata with this exception.
5995 +        */
5996 +       void (*commit_exception) (struct exception_store *store,
5997 +                                 struct exception *e,
5998 +                                 void (*callback) (void *, int success),
5999 +                                 void *callback_context);
6000 +
6001 +       /*
6002 +        * The snapshot is invalid, note this in the metadata.
6003 +        */
6004 +       void (*drop_snapshot) (struct exception_store *store);
6005 +
6006 +       /*
6007 +        * Return how full the snapshot is.
6008 +        */
6009 +       void (*fraction_full) (struct exception_store *store,
6010 +                              sector_t *numerator,
6011 +                              sector_t *denominator);
6012 +
6013 +       struct dm_snapshot *snap;
6014 +       void *context;
6015 +};
6016 +
6017 +struct dm_snapshot {
6018 +       struct rw_semaphore lock;
6019 +       struct dm_table *table;
6020 +
6021 +       struct dm_dev *origin;
6022 +       struct dm_dev *cow;
6023 +
6024 +       /* List of snapshots per Origin */
6025 +       struct list_head list;
6026 +
6027 +       /* Size of data blocks saved - must be a power of 2 */
6028 +       chunk_t chunk_size;
6029 +       chunk_t chunk_mask;
6030 +       chunk_t chunk_shift;
6031 +
6032 +       /* You can't use a snapshot if this is 0 (e.g. if full) */
6033 +       int valid;
6034 +       int have_metadata;
6035 +
6036 +       /* Used for display of table */
6037 +       char type;
6038 +
6039 +       /* The last percentage we notified */
6040 +       int last_percent;
6041 +
6042 +       struct exception_table pending;
6043 +       struct exception_table complete;
6044 +
6045 +       /* The on disk metadata handler */
6046 +       struct exception_store store;
6047 +
6048 +       struct kcopyd_client *kcopyd_client;
6049 +};
6050 +
6051 +/*
6052 + * Used by the exception stores to load exceptions when
6053 + * initialising.
6054 + */
6055 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
6056 +
6057 +/*
6058 + * Constructor and destructor for the default persistent
6059 + * store.
6060 + */
6061 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
6062 +
6063 +int dm_create_transient(struct exception_store *store,
6064 +                       struct dm_snapshot *s, int blocksize);
6065 +
6066 +/*
6067 + * Return the number of sectors in the device.
6068 + */
6069 +static inline sector_t get_dev_size(kdev_t dev)
6070 +{
6071 +       int *sizes;
6072 +
6073 +       sizes = blk_size[MAJOR(dev)];
6074 +       if (sizes)
6075 +               return sizes[MINOR(dev)] << 1;
6076 +
6077 +       return 0;
6078 +}
6079 +
6080 +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
6081 +{
6082 +       return (sector & ~s->chunk_mask) >> s->chunk_shift;
6083 +}
6084 +
6085 +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
6086 +{
6087 +       return chunk << s->chunk_shift;
6088 +}
6089 +
6090 +#endif
6091 --- linux-2.4.21/drivers/md/dm-stripe.c Thu Jan  1 01:00:00 1970
6092 +++ linux/drivers/md/dm-stripe.c        Wed Aug 20 14:41:38 2003
6093 @@ -0,0 +1,258 @@
6094 +/*
6095 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6096 + *
6097 + * This file is released under the GPL.
6098 + */
6099 +
6100 +#include "dm.h"
6101 +
6102 +#include <linux/module.h>
6103 +#include <linux/init.h>
6104 +#include <linux/blkdev.h>
6105 +#include <linux/slab.h>
6106 +
6107 +struct stripe {
6108 +       struct dm_dev *dev;
6109 +       sector_t physical_start;
6110 +};
6111 +
6112 +struct stripe_c {
6113 +       uint32_t stripes;
6114 +
6115 +       /* The size of this target / num. stripes */
6116 +       uint32_t stripe_width;
6117 +
6118 +       /* stripe chunk size */
6119 +       uint32_t chunk_shift;
6120 +       sector_t chunk_mask;
6121 +
6122 +       struct stripe stripe[0];
6123 +};
6124 +
6125 +static inline struct stripe_c *alloc_context(unsigned int stripes)
6126 +{
6127 +       size_t len;
6128 +
6129 +       if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
6130 +                         stripes))
6131 +               return NULL;
6132 +
6133 +       len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
6134 +
6135 +       return kmalloc(len, GFP_KERNEL);
6136 +}
6137 +
6138 +/*
6139 + * Parse a single <dev> <sector> pair
6140 + */
6141 +static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
6142 +                     unsigned int stripe, char **argv)
6143 +{
6144 +       sector_t start;
6145 +
6146 +       if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
6147 +               return -EINVAL;
6148 +
6149 +       if (dm_get_device(ti, argv[0], start, sc->stripe_width,
6150 +                         dm_table_get_mode(ti->table),
6151 +                         &sc->stripe[stripe].dev))
6152 +               return -ENXIO;
6153 +
6154 +       sc->stripe[stripe].physical_start = start;
6155 +       return 0;
6156 +}
6157 +
6158 +/*
6159 + * FIXME: Nasty function, only present because we can't link
6160 + * against __moddi3 and __divdi3.
6161 + *
6162 + * returns a == b * n
6163 + */
6164 +static int multiple(sector_t a, sector_t b, sector_t *n)
6165 +{
6166 +       sector_t acc, prev, i;
6167 +
6168 +       *n = 0;
6169 +       while (a >= b) {
6170 +               for (acc = b, prev = 0, i = 1;
6171 +                    acc <= a;
6172 +                    prev = acc, acc <<= 1, i <<= 1)
6173 +                       ;
6174 +
6175 +               a -= prev;
6176 +               *n += i >> 1;
6177 +       }
6178 +
6179 +       return a == 0;
6180 +}
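+
+/*
+ * For example, multiple(24, 8, &n) sets n to 3 and returns 1, while
+ * multiple(25, 8, &n) returns 0 because 8 does not divide 25 exactly.
+ */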
6181 +
6182 +/*
6183 + * Construct a striped mapping.
6184 + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
6185 + */
6186 +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
6187 +{
6188 +       struct stripe_c *sc;
6189 +       sector_t width;
6190 +       uint32_t stripes;
6191 +       uint32_t chunk_size;
6192 +       char *end;
6193 +       int r;
6194 +       unsigned int i;
6195 +
6196 +       if (argc < 2) {
6197 +               ti->error = "dm-stripe: Not enough arguments";
6198 +               return -EINVAL;
6199 +       }
6200 +
6201 +       stripes = simple_strtoul(argv[0], &end, 10);
6202 +       if (*end) {
6203 +               ti->error = "dm-stripe: Invalid stripe count";
6204 +               return -EINVAL;
6205 +       }
6206 +
6207 +       chunk_size = simple_strtoul(argv[1], &end, 10);
6208 +       if (*end) {
6209 +               ti->error = "dm-stripe: Invalid chunk_size";
6210 +               return -EINVAL;
6211 +       }
6212 +
6213 +       /*
6214 +        * chunk_size is a power of two
6215 +        */
6216 +       if (!chunk_size || (chunk_size & (chunk_size - 1))) {
6217 +               ti->error = "dm-stripe: Invalid chunk size";
6218 +               return -EINVAL;
6219 +       }
6220 +
6221 +       if (!multiple(ti->len, stripes, &width)) {
6222 +               ti->error = "dm-stripe: Target length not divisible by "
6223 +                   "number of stripes";
6224 +               return -EINVAL;
6225 +       }
6226 +
6227 +       /*
6228 +        * Do we have enough arguments for that many stripes ?
6229 +        */
6230 +       if (argc != (2 + 2 * stripes)) {
6231 +               ti->error = "dm-stripe: Not enough destinations specified";
6232 +               return -EINVAL;
6233 +       }
6234 +
6235 +       sc = alloc_context(stripes);
6236 +       if (!sc) {
6237 +               ti->error = "dm-stripe: Memory allocation for striped context "
6238 +                   "failed";
6239 +               return -ENOMEM;
6240 +       }
6241 +
6242 +       sc->stripes = stripes;
6243 +       sc->stripe_width = width;
6244 +
6245 +       sc->chunk_mask = ((sector_t) chunk_size) - 1;
6246 +       for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
6247 +               chunk_size >>= 1;
6248 +       sc->chunk_shift--;
6249 +
6250 +       /*
6251 +        * Get the stripe destinations.
6252 +        */
6253 +       for (i = 0; i < stripes; i++) {
6254 +               argv += 2;
6255 +
6256 +               r = get_stripe(ti, sc, i, argv);
6257 +               if (r < 0) {
6258 +                       ti->error = "dm-stripe: Couldn't parse stripe "
6259 +                           "destination";
6260 +                       while (i--)
6261 +                               dm_put_device(ti, sc->stripe[i].dev);
6262 +                       kfree(sc);
6263 +                       return r;
6264 +               }
6265 +       }
6266 +
6267 +       ti->private = sc;
6268 +       return 0;
6269 +}
6270 +
6271 +static void stripe_dtr(struct dm_target *ti)
6272 +{
6273 +       unsigned int i;
6274 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6275 +
6276 +       for (i = 0; i < sc->stripes; i++)
6277 +               dm_put_device(ti, sc->stripe[i].dev);
6278 +
6279 +       kfree(sc);
6280 +}
6281 +
6282 +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6283 +                     union map_info *context)
6284 +{
6285 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6286 +
6287 +       sector_t offset = bh->b_rsector - ti->begin;
6288 +       uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
6289 +       uint32_t stripe = chunk % sc->stripes;  /* 32bit modulus */
6290 +       chunk = chunk / sc->stripes;
6291 +
6292 +       bh->b_rdev = sc->stripe[stripe].dev->dev;
6293 +       bh->b_rsector = sc->stripe[stripe].physical_start +
6294 +           (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
6295 +       return 1;
6296 +}
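+
+/*
+ * For example, with 2 stripes and an 8 sector chunk (chunk_shift == 3,
+ * chunk_mask == 7), offset 21 falls in chunk 2, which maps to stripe 0,
+ * chunk 1 on that stripe: the remapped sector is physical_start + 8 + 5.
+ */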
6297 +
6298 +static int stripe_status(struct dm_target *ti, status_type_t type,
6299 +                        char *result, unsigned int maxlen)
6300 +{
6301 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
6302 +       int offset;
6303 +       unsigned int i;
6304 +
6305 +       switch (type) {
6306 +       case STATUSTYPE_INFO:
6307 +               result[0] = '\0';
6308 +               break;
6309 +
6310 +       case STATUSTYPE_TABLE:
6311 +               offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
6312 +                                 sc->stripes, sc->chunk_mask + 1);
6313 +               for (i = 0; i < sc->stripes; i++) {
6314 +                       offset +=
6315 +                           snprintf(result + offset, maxlen - offset,
6316 +                                    " %s " SECTOR_FORMAT,
6317 +                      dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
6318 +                                    sc->stripe[i].physical_start);
6319 +               }
6320 +               break;
6321 +       }
6322 +       return 0;
6323 +}
6324 +
6325 +static struct target_type stripe_target = {
6326 +       .name   = "striped",
6327 +       .module = THIS_MODULE,
6328 +       .ctr    = stripe_ctr,
6329 +       .dtr    = stripe_dtr,
6330 +       .map    = stripe_map,
6331 +       .status = stripe_status,
6332 +};
6333 +
6334 +int __init dm_stripe_init(void)
6335 +{
6336 +       int r;
6337 +
6338 +       r = dm_register_target(&stripe_target);
6339 +       if (r < 0)
6340 +               DMWARN("striped target registration failed");
6341 +
6342 +       return r;
6343 +}
6344 +
6345 +void dm_stripe_exit(void)
6346 +{
6347 +       if (dm_unregister_target(&stripe_target))
6348 +               DMWARN("striped target unregistration failed");
6349 +
6350 +       return;
6351 +}
6352 --- linux-2.4.21/drivers/md/dm-table.c  Thu Jan  1 01:00:00 1970
6353 +++ linux/drivers/md/dm-table.c Wed Aug 20 14:41:38 2003
6354 @@ -0,0 +1,708 @@
6355 +/*
6356 + * Copyright (C) 2001 Sistina Software (UK) Limited.
6357 + *
6358 + * This file is released under the GPL.
6359 + */
6360 +
6361 +#include "dm.h"
6362 +
6363 +#include <linux/module.h>
6364 +#include <linux/vmalloc.h>
6365 +#include <linux/blkdev.h>
6366 +#include <linux/ctype.h>
6367 +#include <linux/slab.h>
6368 +#include <asm/atomic.h>
6369 +
6370 +#define MAX_DEPTH 16
6371 +#define NODE_SIZE L1_CACHE_BYTES
6372 +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
6373 +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
6374 +#define MAX_TARGET_ARGS 64
6375 +
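+/*
+ * For example, on a configuration with 64 byte L1 cache lines and an
+ * 8 byte sector_t, NODE_SIZE is 64, KEYS_PER_NODE is 8 and
+ * CHILDREN_PER_NODE is 9, so each btree node indexes up to nine
+ * children.
+ */
+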
6376 +struct dm_table {
6377 +       atomic_t holders;
6378 +
6379 +       /* btree table */
6380 +       unsigned int depth;
6381 +       unsigned int counts[MAX_DEPTH]; /* in nodes */
6382 +       sector_t *index[MAX_DEPTH];
6383 +
6384 +       unsigned int num_targets;
6385 +       unsigned int num_allocated;
6386 +       sector_t *highs;
6387 +       struct dm_target *targets;
6388 +
6389 +       /*
6390 +        * Indicates the rw permissions for the new logical
6391 +        * device.  This should be a combination of FMODE_READ
6392 +        * and FMODE_WRITE.
6393 +        */
6394 +       int mode;
6395 +
6396 +       /* a list of devices used by this table */
6397 +       struct list_head devices;
6398 +
6399 +       /* events get handed up using this callback */
6400 +       void (*event_fn)(void *);
6401 +       void *event_context;
6402 +};
6403 +
6404 +/*
6405 + * Similar to ceiling(log_size(n))
6406 + */
6407 +static unsigned int int_log(unsigned long n, unsigned long base)
6408 +{
6409 +       int result = 0;
6410 +
6411 +       while (n > 1) {
6412 +               n = dm_div_up(n, base);
6413 +               result++;
6414 +       }
6415 +
6416 +       return result;
6417 +}
6418 +
6419 +/*
6420 + * Calculate the index of the child node for the n'th node's k'th key.
6421 + */
6422 +static inline unsigned int get_child(unsigned int n, unsigned int k)
6423 +{
6424 +       return (n * CHILDREN_PER_NODE) + k;
6425 +}
6426 +
6427 +/*
6428 + * Return the n'th node of level l from table t.
6429 + */
6430 +static inline sector_t *get_node(struct dm_table *t, unsigned int l,
6431 +                                unsigned int n)
6432 +{
6433 +       return t->index[l] + (n * KEYS_PER_NODE);
6434 +}
6435 +
6436 +/*
6437 + * Return the highest key that you could lookup from the n'th
6438 + * node on level l of the btree.
6439 + */
6440 +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
6441 +{
6442 +       for (; l < t->depth - 1; l++)
6443 +               n = get_child(n, CHILDREN_PER_NODE - 1);
6444 +
6445 +       if (n >= t->counts[l])
6446 +               return (sector_t) - 1;
6447 +
6448 +       return get_node(t, l, n)[KEYS_PER_NODE - 1];
6449 +}
6450 +
6451 +/*
6452 + * Fills in a level of the btree based on the highs of the level
6453 + * below it.
6454 + */
6455 +static int setup_btree_index(unsigned int l, struct dm_table *t)
6456 +{
6457 +       unsigned int n, k;
6458 +       sector_t *node;
6459 +
6460 +       for (n = 0U; n < t->counts[l]; n++) {
6461 +               node = get_node(t, l, n);
6462 +
6463 +               for (k = 0U; k < KEYS_PER_NODE; k++)
6464 +                       node[k] = high(t, l + 1, get_child(n, k));
6465 +       }
6466 +
6467 +       return 0;
6468 +}
6469 +
6470 +/*
6471 + * highs and targets are managed as dynamic arrays during a
6472 + * table load.
6473 + */
6474 +static int alloc_targets(struct dm_table *t, unsigned int num)
6475 +{
6476 +       sector_t *n_highs;
6477 +       struct dm_target *n_targets;
6478 +       int n = t->num_targets;
6479 +
6480 +       /*
6481 +        * Allocate both the target array and offset array at once.
6482 +        */
6483 +       n_highs = (sector_t *) vcalloc(sizeof(struct dm_target) +
6484 +                                      sizeof(sector_t), num);
6485 +       if (!n_highs)
6486 +               return -ENOMEM;
6487 +
6488 +       n_targets = (struct dm_target *) (n_highs + num);
6489 +
6490 +       if (n) {
6491 +               memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
6492 +               memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
6493 +       }
6494 +
6495 +       memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
6496 +       vfree(t->highs);
6497 +
6498 +       t->num_allocated = num;
6499 +       t->highs = n_highs;
6500 +       t->targets = n_targets;
6501 +
6502 +       return 0;
6503 +}
6504 +
6505 +int dm_table_create(struct dm_table **result, int mode)
6506 +{
6507 +       struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
6508 +
6509 +       if (!t)
6510 +               return -ENOMEM;
6511 +
6512 +       memset(t, 0, sizeof(*t));
6513 +       INIT_LIST_HEAD(&t->devices);
6514 +       atomic_set(&t->holders, 1);
6515 +
6516 +       /* allocate a single nodes worth of targets to begin with */
6517 +       if (alloc_targets(t, KEYS_PER_NODE)) {
6518 +               kfree(t);
6519 +               t = NULL;
6520 +               return -ENOMEM;
6521 +       }
6522 +
6523 +       t->mode = mode;
6524 +       *result = t;
6525 +       return 0;
6526 +}
6527 +
6528 +static void free_devices(struct list_head *devices)
6529 +{
6530 +       struct list_head *tmp, *next;
6531 +
6532 +       for (tmp = devices->next; tmp != devices; tmp = next) {
6533 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6534 +               next = tmp->next;
6535 +               kfree(dd);
6536 +       }
6537 +}
6538 +
6539 +void table_destroy(struct dm_table *t)
6540 +{
6541 +       unsigned int i;
6542 +
6543 +       /* free the indexes (see dm_table_complete) */
6544 +       if (t->depth >= 2)
6545 +               vfree(t->index[t->depth - 2]);
6546 +
6547 +       /* free the targets */
6548 +       for (i = 0; i < t->num_targets; i++) {
6549 +               struct dm_target *tgt = t->targets + i;
6550 +
6551 +               if (tgt->type->dtr)
6552 +                       tgt->type->dtr(tgt);
6553 +
6554 +               dm_put_target_type(tgt->type);
6555 +       }
6556 +
6557 +       vfree(t->highs);
6558 +
6559 +       /* free the device list */
6560 +       if (t->devices.next != &t->devices) {
6561 +               DMWARN("devices still present during destroy: "
6562 +                      "dm_table_remove_device calls missing");
6563 +
6564 +               free_devices(&t->devices);
6565 +       }
6566 +
6567 +       kfree(t);
6568 +}
6569 +
6570 +void dm_table_get(struct dm_table *t)
6571 +{
6572 +       atomic_inc(&t->holders);
6573 +}
6574 +
6575 +void dm_table_put(struct dm_table *t)
6576 +{
6577 +       if (atomic_dec_and_test(&t->holders))
6578 +               table_destroy(t);
6579 +}
6580 +
6581 +/*
6582 + * Checks to see if we need to extend highs or targets.
6583 + */
6584 +static inline int check_space(struct dm_table *t)
6585 +{
6586 +       if (t->num_targets >= t->num_allocated)
6587 +               return alloc_targets(t, t->num_allocated * 2);
6588 +
6589 +       return 0;
6590 +}
6591 +
6592 +/*
6593 + * Convert a device path to a dev_t.
6594 + */
6595 +static int lookup_device(const char *path, kdev_t *dev)
6596 +{
6597 +       int r;
6598 +       struct nameidata nd;
6599 +       struct inode *inode;
6600 +
6601 +       if (!path_init(path, LOOKUP_FOLLOW, &nd))
6602 +               return 0;
6603 +
6604 +       if ((r = path_walk(path, &nd)))
6605 +               goto out;
6606 +
6607 +       inode = nd.dentry->d_inode;
6608 +       if (!inode) {
6609 +               r = -ENOENT;
6610 +               goto out;
6611 +       }
6612 +
6613 +       if (!S_ISBLK(inode->i_mode)) {
6614 +               r = -ENOTBLK;
6615 +               goto out;
6616 +       }
6617 +
6618 +       *dev = inode->i_rdev;
6619 +
6620 +      out:
6621 +       path_release(&nd);
6622 +       return r;
6623 +}
6624 +
6625 +/*
6626 + * See if we've already got a device in the list.
6627 + */
6628 +static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6629 +{
6630 +       struct list_head *tmp;
6631 +
6632 +       list_for_each(tmp, l) {
6633 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6634 +               if (kdev_same(dd->dev, dev))
6635 +                       return dd;
6636 +       }
6637 +
6638 +       return NULL;
6639 +}
6640 +
6641 +/*
6642 + * Open a device so we can use it as a map destination.
6643 + */
6644 +static int open_dev(struct dm_dev *dd)
6645 +{
6646 +       if (dd->bdev)
6647 +               BUG();
6648 +
6649 +       dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6650 +       if (!dd->bdev)
6651 +               return -ENOMEM;
6652 +
6653 +       return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6654 +}
6655 +
6656 +/*
6657 + * Close a device that we've been using.
6658 + */
6659 +static void close_dev(struct dm_dev *dd)
6660 +{
6661 +       if (!dd->bdev)
6662 +               return;
6663 +
6664 +       blkdev_put(dd->bdev, BDEV_RAW);
6665 +       dd->bdev = NULL;
6666 +}
6667 +
6668 +/*
6669 + * If possible (ie. blk_size[major] is set), this checks that an area
6670 + * of a destination device is valid.
6671 + */
6672 +static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6673 +{
6674 +       int *sizes;
6675 +       sector_t dev_size;
6676 +
6677 +       if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6678 +               /* we don't know the device details,
6679 +                * so give the benefit of the doubt */
6680 +               return 1;
6681 +
6682 +       /* convert to 512-byte sectors */
6683 +       dev_size <<= 1;
6684 +
6685 +       return ((start < dev_size) && (len <= (dev_size - start)));
6686 +}
6687 +
6688 +/*
6689 + * This upgrades the mode on an already open dm_dev.  Being
6690 + * careful to leave things as they were if we fail to reopen the
6691 + * device.
6692 + */
6693 +static int upgrade_mode(struct dm_dev *dd, int new_mode)
6694 +{
6695 +       int r;
6696 +       struct dm_dev dd_copy;
6697 +
6698 +       memcpy(&dd_copy, dd, sizeof(dd_copy));
6699 +
6700 +       dd->mode |= new_mode;
6701 +       dd->bdev = NULL;
6702 +       r = open_dev(dd);
6703 +       if (!r)
6704 +               close_dev(&dd_copy);
6705 +       else
6706 +               memcpy(dd, &dd_copy, sizeof(dd_copy));
6707 +
6708 +       return r;
6709 +}
6710 +
6711 +/*
6712 + * Add a device to the list, or just increment the usage count if
6713 + * it's already present.
6714 + */
6715 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6716 +                 sector_t len, int mode, struct dm_dev **result)
6717 +{
6718 +       int r;
6719 +       kdev_t dev;
6720 +       struct dm_dev *dd;
6721 +       unsigned major, minor;
6722 +       struct dm_table *t = ti->table;
6723 +
6724 +       if (!t)
6725 +               BUG();
6726 +
6727 +       if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6728 +               /* Extract the major/minor numbers */
6729 +               dev = mk_kdev(major, minor);
6730 +       } else {
6731 +               /* convert the path to a device */
6732 +               if ((r = lookup_device(path, &dev)))
6733 +                       return r;
6734 +       }
6735 +
6736 +       dd = find_device(&t->devices, dev);
6737 +       if (!dd) {
6738 +               dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6739 +               if (!dd)
6740 +                       return -ENOMEM;
6741 +
6742 +               dd->dev = dev;
6743 +               dd->mode = mode;
6744 +               dd->bdev = NULL;
6745 +
6746 +               if ((r = open_dev(dd))) {
6747 +                       kfree(dd);
6748 +                       return r;
6749 +               }
6750 +
6751 +               atomic_set(&dd->count, 0);
6752 +               list_add(&dd->list, &t->devices);
6753 +
6754 +       } else if (dd->mode != (mode | dd->mode)) {
6755 +               r = upgrade_mode(dd, mode);
6756 +               if (r)
6757 +                       return r;
6758 +       }
6759 +       atomic_inc(&dd->count);
6760 +
6761 +       if (!check_device_area(dd->dev, start, len)) {
6762 +               DMWARN("device %s too small for target", path);
6763 +               dm_put_device(ti, dd);
6764 +               return -EINVAL;
6765 +       }
6766 +
6767 +       *result = dd;
6768 +
6769 +       return 0;
6770 +}
6771 +
6772 +/*
6773 + * Decrement a device's use count and remove it if necessary.
6774 + */
6775 +void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6776 +{
6777 +       if (atomic_dec_and_test(&dd->count)) {
6778 +               close_dev(dd);
6779 +               list_del(&dd->list);
6780 +               kfree(dd);
6781 +       }
6782 +}
6783 +
6784 +/*
6785 + * Checks to see if the target joins onto the end of the table.
6786 + */
6787 +static int adjoin(struct dm_table *table, struct dm_target *ti)
6788 +{
6789 +       struct dm_target *prev;
6790 +
6791 +       if (!table->num_targets)
6792 +               return !ti->begin;
6793 +
6794 +       prev = &table->targets[table->num_targets - 1];
6795 +       return (ti->begin == (prev->begin + prev->len));
6796 +}
6797 +
6798 +/*
6799 + * Destructively splits up the argument list to pass to ctr.
6800 + */
6801 +static int split_args(int *argc, char ***argvp, char *input)
6802 +{
6803 +       char *start, *end = input, *out;
6804 +       char **argv;
6805 +       int max_args = MAX_TARGET_ARGS;
6806 +
6807 +       *argc = 0;
6808 +       argv = kmalloc(sizeof(*argv) * max_args, GFP_NOIO);
6809 +       if (!argv)
6810 +               return -ENOMEM;
6811 +
6812 +       while (1) {
6813 +               start = end;
6814 +
6815 +               /* Skip whitespace */
6816 +               while (*start && isspace(*start))
6817 +                       start++;
6818 +
6819 +               if (!*start)
6820 +                       break;  /* success, we hit the end */
6821 +
6822 +               /* 'out' is used to strip the backslashes used for quoting */
6823 +               end = out = start;
6824 +               while (*end) {
6825 +                       /* Everything apart from '\0' can be quoted */
6826 +                       if (*end == '\\' && *(end + 1)) {
6827 +                               *out++ = *(end + 1);
6828 +                               end += 2;
6829 +                               continue;
6830 +                       }
6831 +
6832 +                       if (isspace(*end))
6833 +                               break;  /* end of token */
6834 +
6835 +                       *out++ = *end++;
6836 +               }
6837 +
6838 +               /* have we already filled the array ? */
6839 +               if ((*argc + 1) > max_args) {
6840 +                       char **argv2;
6841 +                       
6842 +                       max_args *= 2;
6843 +                       argv2 = kmalloc(sizeof(*argv2) * max_args, GFP_NOIO);
6844 +                       if (!argv2) {
6845 +                               kfree(argv);
6846 +                               return -ENOMEM;
6847 +                       }
6848 +
6849 +                       memcpy(argv2, argv, sizeof(*argv) * *argc);
6850 +                       kfree(argv);
6851 +                       argv = argv2;
6852 +               }
6853 +
6854 +               /* we know this is whitespace */
6855 +               if (*end)
6856 +                       end++;
6857 +
6858 +               /* terminate the string and put it in the array */
6859 +               *out = '\0';
6860 +               argv[*argc] = start;
6861 +               (*argc)++;
6862 +       }
6863 +
6864 +       *argvp = argv;
6865 +       return 0;
6866 +}
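+
+/*
+ * For example, the input "0 linear a\ b" is split into three arguments:
+ * "0", "linear" and "a b", because a backslash escapes the character
+ * that follows it (here a space) instead of ending the token.
+ */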
6867 +
6868 +int dm_table_add_target(struct dm_table *t, const char *type,
6869 +                       sector_t start, sector_t len, char *params)
6870 +{
6871 +       int r = -EINVAL, argc;
6872 +       char **argv;
6873 +       struct dm_target *tgt;
6874 +
6875 +       if ((r = check_space(t)))
6876 +               return r;
6877 +
6878 +       tgt = t->targets + t->num_targets;
6879 +       memset(tgt, 0, sizeof(*tgt));
6880 +
6881 +       tgt->type = dm_get_target_type(type);
6882 +       if (!tgt->type) {
6883 +               tgt->error = "unknown target type";
6884 +               return -EINVAL;
6885 +       }
6886 +
6887 +       tgt->table = t;
6888 +       tgt->begin = start;
6889 +       tgt->len = len;
6890 +       tgt->error = "Unknown error";
6891 +
6892 +       /*
6893 +        * Does this target adjoin the previous one ?
6894 +        */
6895 +       if (!adjoin(t, tgt)) {
6896 +               tgt->error = "Gap in table";
6897 +               r = -EINVAL;
6898 +               goto bad;
6899 +       }
6900 +
6901 +       r = split_args(&argc, &argv, params);
6902 +       if (r) {
6903 +               tgt->error = "couldn't split parameters (insufficient memory)";
6904 +               goto bad;
6905 +       }
6906 +
6907 +       r = tgt->type->ctr(tgt, argc, argv);
6908 +       kfree(argv);
6909 +       if (r)
6910 +               goto bad;
6911 +
6912 +       t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
6913 +       return 0;
6914 +
6915 +      bad:
6916 +       printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
6917 +       dm_put_target_type(tgt->type);
6918 +       return r;
6919 +}
6920 +
6921 +static int setup_indexes(struct dm_table *t)
6922 +{
6923 +       int i;
6924 +       unsigned int total = 0;
6925 +       sector_t *indexes;
6926 +
6927 +       /* allocate the space for *all* the indexes */
6928 +       for (i = t->depth - 2; i >= 0; i--) {
6929 +               t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
6930 +               total += t->counts[i];
6931 +       }
6932 +
6933 +       indexes = (sector_t *) vcalloc(total, (unsigned long) NODE_SIZE);
6934 +       if (!indexes)
6935 +               return -ENOMEM;
6936 +
6937 +       /* set up internal nodes, bottom-up */
6938 +       for (i = t->depth - 2, total = 0; i >= 0; i--) {
6939 +               t->index[i] = indexes;
6940 +               indexes += (KEYS_PER_NODE * t->counts[i]);
6941 +               setup_btree_index(i, t);
6942 +       }
6943 +
6944 +       return 0;
6945 +}
6946 +
6947 +/*
6948 + * Builds the btree to index the map.
6949 + */
6950 +int dm_table_complete(struct dm_table *t)
6951 +{
6952 +       int r = 0;
6953 +       unsigned int leaf_nodes;
6954 +
6955 +       /* how many indexes will the btree have ? */
6956 +       leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
6957 +       t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
6958 +
6959 +       /* leaf layer has already been set up */
6960 +       t->counts[t->depth - 1] = leaf_nodes;
6961 +       t->index[t->depth - 1] = t->highs;
6962 +
6963 +       if (t->depth >= 2)
6964 +               r = setup_indexes(t);
6965 +
6966 +       return r;
6967 +}
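+/*
+ * Editorial example (hypothetical numbers, not from the original
+ * patch): with KEYS_PER_NODE == 8 and CHILDREN_PER_NODE == 9, and
+ * assuming int_log() is the ceiling logarithm, a table of 40 targets
+ * needs dm_div_up(40, 8) == 5 leaf nodes, so t->depth == 2 and
+ * setup_indexes() allocates a single internal node whose keys index
+ * the 5 leaves built from t->highs.
+ */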
6968 +
6969 +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
6970 +void dm_table_event_callback(struct dm_table *t,
6971 +                            void (*fn)(void *), void *context)
6972 +{
6973 +       spin_lock_irq(&_event_lock);
6974 +       t->event_fn = fn;
6975 +       t->event_context = context;
6976 +       spin_unlock_irq(&_event_lock);
6977 +}
6978 +
6979 +void dm_table_event(struct dm_table *t)
6980 +{
6981 +       spin_lock(&_event_lock);
6982 +       if (t->event_fn)
6983 +               t->event_fn(t->event_context);
6984 +       spin_unlock(&_event_lock);
6985 +}
6986 +
6987 +sector_t dm_table_get_size(struct dm_table *t)
6988 +{
6989 +       return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
6990 +}
6991 +
6992 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
6993 +{
6994 +       if (index >= t->num_targets)
6995 +               return NULL;
6996 +
6997 +       return t->targets + index;
6998 +}
6999 +
7000 +/*
7001 + * Search the btree for the correct target.
7002 + */
7003 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
7004 +{
7005 +       unsigned int l, n = 0, k = 0;
7006 +       sector_t *node;
7007 +
7008 +       for (l = 0; l < t->depth; l++) {
7009 +               n = get_child(n, k);
7010 +               node = get_node(t, l, n);
7011 +
7012 +               for (k = 0; k < KEYS_PER_NODE; k++)
7013 +                       if (node[k] >= sector)
7014 +                               break;
7015 +       }
7016 +
7017 +       return &t->targets[(KEYS_PER_NODE * n) + k];
7018 +}
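+/*
+ * Editorial note (not part of the original patch): t->highs[i] holds
+ * the last sector of target i, so each level of the walk above picks
+ * the first key that is >= the requested sector.  With two targets
+ * covering sectors 0-999 and 1000-1999 the leaf keys are {999, 1999},
+ * and a lookup of sector 1000 lands on the second target.
+ */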
7019 +
7020 +unsigned int dm_table_get_num_targets(struct dm_table *t)
7021 +{
7022 +       return t->num_targets;
7023 +}
7024 +
7025 +struct list_head *dm_table_get_devices(struct dm_table *t)
7026 +{
7027 +       return &t->devices;
7028 +}
7029 +
7030 +int dm_table_get_mode(struct dm_table *t)
7031 +{
7032 +       return t->mode;
7033 +}
7034 +
7035 +void dm_table_suspend_targets(struct dm_table *t)
7036 +{
7037 +       int i;
7038 +
7039 +       for (i = 0; i < t->num_targets; i++) {
7040 +               struct dm_target *ti = t->targets + i;
7041 +
7042 +               if (ti->type->suspend)
7043 +                       ti->type->suspend(ti);
7044 +       }
7045 +}
7046 +
7047 +void dm_table_resume_targets(struct dm_table *t)
7048 +{
7049 +       int i;
7050 +
7051 +       for (i = 0; i < t->num_targets; i++) {
7052 +               struct dm_target *ti = t->targets + i;
7053 +
7054 +               if (ti->type->resume)
7055 +                       ti->type->resume(ti);
7056 +       }
7057 +}
7058 +
7059 +EXPORT_SYMBOL(dm_get_device);
7060 +EXPORT_SYMBOL(dm_put_device);
7061 +EXPORT_SYMBOL(dm_table_event);
7062 +EXPORT_SYMBOL(dm_table_get_mode);
7063 --- linux-2.4.21/drivers/md/dm-target.c Thu Jan  1 01:00:00 1970
7064 +++ linux/drivers/md/dm-target.c        Wed Aug 20 14:41:38 2003
7065 @@ -0,0 +1,188 @@
7066 +/*
7067 + * Copyright (C) 2001 Sistina Software (UK) Limited
7068 + *
7069 + * This file is released under the GPL.
7070 + */
7071 +
7072 +#include "dm.h"
7073 +
7074 +#include <linux/module.h>
7075 +#include <linux/kmod.h>
7076 +#include <linux/slab.h>
7077 +
7078 +struct tt_internal {
7079 +       struct target_type tt;
7080 +
7081 +       struct list_head list;
7082 +       long use;
7083 +};
7084 +
7085 +static LIST_HEAD(_targets);
7086 +static DECLARE_RWSEM(_lock);
7087 +
7088 +#define DM_MOD_NAME_SIZE 32
7089 +
7090 +static inline struct tt_internal *__find_target_type(const char *name)
7091 +{
7092 +       struct list_head *tih;
7093 +       struct tt_internal *ti;
7094 +
7095 +       list_for_each(tih, &_targets) {
7096 +               ti = list_entry(tih, struct tt_internal, list);
7097 +
7098 +               if (!strcmp(name, ti->tt.name))
7099 +                       return ti;
7100 +       }
7101 +
7102 +       return NULL;
7103 +}
7104 +
7105 +static struct tt_internal *get_target_type(const char *name)
7106 +{
7107 +       struct tt_internal *ti;
7108 +
7109 +       down_read(&_lock);
7110 +       ti = __find_target_type(name);
7111 +
7112 +       if (ti) {
7113 +               if (ti->use == 0 && ti->tt.module)
7114 +                       __MOD_INC_USE_COUNT(ti->tt.module);
7115 +               ti->use++;
7116 +       }
7117 +       up_read(&_lock);
7118 +
7119 +       return ti;
7120 +}
7121 +
7122 +static void load_module(const char *name)
7123 +{
7124 +       char module_name[DM_MOD_NAME_SIZE] = "dm-";
7125 +
7126 +       /* Length check for strcat() below */
7127 +       if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
7128 +               return;
7129 +
7130 +       strcat(module_name, name);
7131 +       request_module(module_name);
7132 +}
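+/*
+ * Editorial note (not part of the original patch): a request for an
+ * unknown target type "foo" (hypothetical name) results in
+ * request_module("dm-foo"), so an out-of-tree target only needs to
+ * name its module dm-<target> to be autoloaded here.
+ */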
7133 +
7134 +struct target_type *dm_get_target_type(const char *name)
7135 +{
7136 +       struct tt_internal *ti = get_target_type(name);
7137 +
7138 +       if (!ti) {
7139 +               load_module(name);
7140 +               ti = get_target_type(name);
7141 +       }
7142 +
7143 +       return ti ? &ti->tt : NULL;
7144 +}
7145 +
7146 +void dm_put_target_type(struct target_type *t)
7147 +{
7148 +       struct tt_internal *ti = (struct tt_internal *) t;
7149 +
7150 +       down_read(&_lock);
7151 +       if (--ti->use == 0 && ti->tt.module)
7152 +               __MOD_DEC_USE_COUNT(ti->tt.module);
7153 +
7154 +       if (ti->use < 0)
7155 +               BUG();
7156 +       up_read(&_lock);
7157 +
7158 +       return;
7159 +}
7160 +
7161 +static struct tt_internal *alloc_target(struct target_type *t)
7162 +{
7163 +       struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
7164 +
7165 +       if (ti) {
7166 +               memset(ti, 0, sizeof(*ti));
7167 +               ti->tt = *t;
7168 +       }
7169 +
7170 +       return ti;
7171 +}
7172 +
7173 +int dm_register_target(struct target_type *t)
7174 +{
7175 +       int rv = 0;
7176 +       struct tt_internal *ti = alloc_target(t);
7177 +
7178 +       if (!ti)
7179 +               return -ENOMEM;
7180 +
7181 +       down_write(&_lock);
7182 +       if (__find_target_type(t->name)) {
7183 +               kfree(ti);
7184 +               rv = -EEXIST;
7185 +       } else
7186 +               list_add(&ti->list, &_targets);
7187 +
7188 +       up_write(&_lock);
7189 +       return rv;
7190 +}
7191 +
7192 +int dm_unregister_target(struct target_type *t)
7193 +{
7194 +       struct tt_internal *ti;
7195 +
7196 +       down_write(&_lock);
7197 +       if (!(ti = __find_target_type(t->name))) {
7198 +               up_write(&_lock);
7199 +               return -EINVAL;
7200 +       }
7201 +
7202 +       if (ti->use) {
7203 +               up_write(&_lock);
7204 +               return -ETXTBSY;
7205 +       }
7206 +
7207 +       list_del(&ti->list);
7208 +       kfree(ti);
7209 +
7210 +       up_write(&_lock);
7211 +       return 0;
7212 +}
7213 +
7214 +/*
7215 + * io-err: a target that fails every io, useful for bringing
7216 + * up LVs that have holes in them.  (See the example below error_target.)
7217 + */
7218 +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
7219 +{
7220 +       return 0;
7221 +}
7222 +
7223 +static void io_err_dtr(struct dm_target *ti)
7224 +{
7225 +       /* empty */
7226 +}
7227 +
7228 +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
7229 +                     union map_info *map_context)
7230 +{
7231 +       return -EIO;
7232 +}
7233 +
7234 +static struct target_type error_target = {
7235 +       .name = "error",
7236 +       .ctr  = io_err_ctr,
7237 +       .dtr  = io_err_dtr,
7238 +       .map  = io_err_map,
7239 +};
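+/*
+ * Editorial example (not part of the original patch): the error target
+ * takes no parameters, so a hole covering sectors 0-2047 of a table
+ * could be described with something like
+ *
+ *     char no_params[] = "";
+ *     dm_table_add_target(t, "error", 0, 2048, no_params);
+ *
+ * after which any io falling in that range is failed by io_err_map().
+ */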
7240 +
7241 +int dm_target_init(void)
7242 +{
7243 +       return dm_register_target(&error_target);
7244 +}
7245 +
7246 +void dm_target_exit(void)
7247 +{
7248 +       if (dm_unregister_target(&error_target))
7249 +               DMWARN("error target unregistration failed");
7250 +}
7251 +
7252 +EXPORT_SYMBOL(dm_register_target);
7253 +EXPORT_SYMBOL(dm_unregister_target);
7254 --- linux-2.4.21/drivers/md/dm.c        Thu Jan  1 01:00:00 1970
7255 +++ linux/drivers/md/dm.c       Wed Aug 20 14:41:38 2003
7256 @@ -0,0 +1,1115 @@
7257 +/*
7258 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
7259 + *
7260 + * This file is released under the GPL.
7261 + */
7262 +
7263 +#include "dm.h"
7264 +#include "kcopyd.h"
7265 +
7266 +#include <linux/init.h>
7267 +#include <linux/module.h>
7268 +#include <linux/blk.h>
7269 +#include <linux/blkpg.h>
7270 +#include <linux/mempool.h>
7271 +#include <linux/slab.h>
7272 +#include <linux/major.h>
7273 +#include <linux/kdev_t.h>
7274 +#include <linux/lvm.h>
7275 +
7276 +#include <asm/uaccess.h>
7277 +
7278 +static const char *_name = DM_NAME;
7279 +#define DEFAULT_READ_AHEAD 64
7280 +
7281 +struct dm_io {
7282 +       struct mapped_device *md;
7283 +
7284 +       struct dm_target *ti;
7285 +       int rw;
7286 +       union map_info map_context;
7287 +       void (*end_io) (struct buffer_head * bh, int uptodate);
7288 +       void *context;
7289 +};
7290 +
7291 +struct deferred_io {
7292 +       int rw;
7293 +       struct buffer_head *bh;
7294 +       struct deferred_io *next;
7295 +};
7296 +
7297 +/*
7298 + * Bits for the md->flags field.
7299 + */
7300 +#define DMF_BLOCK_IO 0
7301 +#define DMF_SUSPENDED 1
7302 +
7303 +struct mapped_device {
7304 +       struct rw_semaphore lock;
7305 +       atomic_t holders;
7306 +
7307 +       kdev_t dev;
7308 +       unsigned long flags;
7309 +
7310 +       /*
7311 +        * A list of ios that arrived while we were suspended.
7312 +        */
7313 +       atomic_t pending;
7314 +       wait_queue_head_t wait;
7315 +       struct deferred_io *deferred;
7316 +
7317 +       /*
7318 +        * The current mapping.
7319 +        */
7320 +       struct dm_table *map;
7321 +
7322 +       /*
7323 +        * io objects are allocated from here.
7324 +        */
7325 +       mempool_t *io_pool;
7326 +
7327 +       /*
7328 +        * Event handling.
7329 +        */
7330 +       uint32_t event_nr;
7331 +       wait_queue_head_t eventq;
7332 +};
7333 +
7334 +#define MIN_IOS 256
7335 +static kmem_cache_t *_io_cache;
7336 +
7337 +static struct mapped_device *get_kdev(kdev_t dev);
7338 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
7339 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
7340 +
7341 +/*-----------------------------------------------------------------
7342 + * In order to avoid the 256 minor number limit, we register
7343 + * additional major numbers as necessary.
7344 + *---------------------------------------------------------------*/
7345 +#define MAX_MINORS (1 << MINORBITS)
7346 +
7347 +struct major_details {
7348 +       unsigned int major;
7349 +
7350 +       int transient;
7351 +       struct list_head transient_list;
7352 +
7353 +       unsigned int first_free_minor;
7354 +       int nr_free_minors;
7355 +
7356 +       struct mapped_device *mds[MAX_MINORS];
7357 +       int blk_size[MAX_MINORS];
7358 +       int blksize_size[MAX_MINORS];
7359 +       int hardsect_size[MAX_MINORS];
7360 +};
7361 +
7362 +static struct rw_semaphore _dev_lock;
7363 +static struct major_details *_majors[MAX_BLKDEV];
7364 +
7365 +/*
7366 + * This holds a list of majors that non-specified device numbers
7367 + * may be allocated from.  Only majors with free minors appear on
7368 + * this list.
7369 + */
7370 +static LIST_HEAD(_transients_free);
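+/*
+ * Editorial note (not part of the original patch): with the 2.4 kdev_t
+ * layout MINORBITS is 8, so MAX_MINORS is 256 and every extra major
+ * registered through __alloc_major() below provides another 256
+ * mapped devices.
+ */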
7371 +
7372 +static int __alloc_major(unsigned int major, struct major_details **result)
7373 +{
7374 +       int r;
7375 +       unsigned int transient = !major;
7376 +       struct major_details *maj;
7377 +
7378 +       /* Major already allocated? */
7379 +       if (major && _majors[major])
7380 +               return 0;
7381 +
7382 +       maj = kmalloc(sizeof(*maj), GFP_KERNEL);
7383 +       if (!maj)
7384 +               return -ENOMEM;
7385 +
7386 +       memset(maj, 0, sizeof(*maj));
7387 +       INIT_LIST_HEAD(&maj->transient_list);
7388 +
7389 +       maj->nr_free_minors = MAX_MINORS;
7390 +
7391 +       r = register_blkdev(major, _name, &dm_blk_dops);
7392 +       if (r < 0) {
7393 +               DMERR("register_blkdev failed for %d", major);
7394 +               kfree(maj);
7395 +               return r;
7396 +       }
7397 +       if (r > 0)
7398 +               major = r;
7399 +
7400 +       maj->major = major;
7401 +
7402 +       if (transient) {
7403 +               maj->transient = transient;
7404 +               list_add_tail(&maj->transient_list, &_transients_free);
7405 +       }
7406 +
7407 +       _majors[major] = maj;
7408 +
7409 +       blk_size[major] = maj->blk_size;
7410 +       blksize_size[major] = maj->blksize_size;
7411 +       hardsect_size[major] = maj->hardsect_size;
7412 +       read_ahead[major] = DEFAULT_READ_AHEAD;
7413 +
7414 +       blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
7415 +
7416 +       *result = maj;
7417 +       return 0;
7418 +}
7419 +
7420 +static void __free_major(struct major_details *maj)
7421 +{
7422 +       unsigned int major = maj->major;
7423 +
7424 +       list_del(&maj->transient_list);
7425 +
7426 +       read_ahead[major] = 0;
7427 +       blk_size[major] = NULL;
7428 +       blksize_size[major] = NULL;
7429 +       hardsect_size[major] = NULL;
7430 +
7431 +       _majors[major] = NULL;
7432 +       kfree(maj);
7433 +
7434 +       if (unregister_blkdev(major, _name) < 0)
7435 +               DMERR("unregister_blkdev failed");
7436 +}
7437 +
7438 +static void free_all_majors(void)
7439 +{
7440 +       unsigned int major = ARRAY_SIZE(_majors);
7441 +
7442 +       down_write(&_dev_lock);
7443 +
7444 +       while (major--)
7445 +               if (_majors[major])
7446 +                       __free_major(_majors[major]);
7447 +
7448 +       up_write(&_dev_lock);
7449 +}
7450 +
7451 +static void free_dev(kdev_t dev)
7452 +{
7453 +       unsigned int major = major(dev);
7454 +       unsigned int minor = minor(dev);
7455 +       struct major_details *maj;
7456 +
7457 +       down_write(&_dev_lock);
7458 +
7459 +       maj = _majors[major];
7460 +       if (!maj)
7461 +               goto out;
7462 +
7463 +       maj->mds[minor] = NULL;
7464 +       maj->nr_free_minors++;
7465 +
7466 +       if (maj->nr_free_minors == MAX_MINORS) {
7467 +               __free_major(maj);
7468 +               goto out;
7469 +       }
7470 +
7471 +       if (!maj->transient)
7472 +               goto out;
7473 +
7474 +       if (maj->nr_free_minors == 1)
7475 +               list_add_tail(&maj->transient_list, &_transients_free);
7476 +
7477 +       if (minor < maj->first_free_minor)
7478 +               maj->first_free_minor = minor;
7479 +
7480 +      out:
7481 +       up_write(&_dev_lock);
7482 +}
7483 +
7484 +static void __alloc_minor(struct major_details *maj, unsigned int minor,
7485 +                         struct mapped_device *md)
7486 +{
7487 +       maj->mds[minor] = md;
7488 +       md->dev = mk_kdev(maj->major, minor);
7489 +       maj->nr_free_minors--;
7490 +
7491 +       if (maj->transient && !maj->nr_free_minors)
7492 +               list_del_init(&maj->transient_list);
7493 +}
7494 +
7495 +/*
7496 + * See if requested kdev_t is available.
7497 + */
7498 +static int specific_dev(kdev_t dev, struct mapped_device *md)
7499 +{
7500 +       int r = 0;
7501 +       unsigned int major = major(dev);
7502 +       unsigned int minor = minor(dev);
7503 +       struct major_details *maj;
7504 +
7505 +       if (!major || (major >= MAX_BLKDEV) || (minor >= MAX_MINORS)) {
7506 +               DMWARN("device number requested out of range (%d, %d)",
7507 +                      major, minor);
7508 +               return -EINVAL;
7509 +       }
7510 +
7511 +       down_write(&_dev_lock);
7512 +       maj = _majors[major];
7513 +
7514 +       /* Register requested major? */
7515 +       if (!maj) {
7516 +               r = __alloc_major(major, &maj);
7517 +               if (r)
7518 +                       goto out;
7519 +
7520 +               major = maj->major;
7521 +       }
7522 +
7523 +       if (maj->mds[minor]) {
7524 +               r = -EBUSY;
7525 +               goto out;
7526 +       }
7527 +
7528 +       __alloc_minor(maj, minor, md);
7529 +
7530 +      out:
7531 +       up_write(&_dev_lock);
7532 +
7533 +       return r;
7534 +}
7535 +
7536 +/*
7537 + * Find first unused device number, requesting a new major number if required.
7538 + */
7539 +static int first_free_dev(struct mapped_device *md)
7540 +{
7541 +       int r = 0;
7542 +       struct major_details *maj;
7543 +
7544 +       down_write(&_dev_lock);
7545 +
7546 +       if (list_empty(&_transients_free)) {
7547 +               r = __alloc_major(0, &maj);
7548 +               if (r)
7549 +                       goto out;
7550 +       } else
7551 +               maj = list_entry(_transients_free.next, struct major_details,
7552 +                                transient_list);
7553 +
7554 +       while (maj->mds[maj->first_free_minor++])
7555 +               ;
7556 +
7557 +       __alloc_minor(maj, maj->first_free_minor - 1, md);
7558 +
7559 +      out:
7560 +       up_write(&_dev_lock);
7561 +
7562 +       return r;
7563 +}
7564 +
7565 +static struct mapped_device *get_kdev(kdev_t dev)
7566 +{
7567 +       struct mapped_device *md;
7568 +       struct major_details *maj;
7569 +
7570 +       down_read(&_dev_lock);
7571 +       maj = _majors[major(dev)];
7572 +       if (!maj) {
7573 +               md = NULL;
7574 +               goto out;
7575 +       }
7576 +       md = maj->mds[minor(dev)];
7577 +       if (md)
7578 +               dm_get(md);
7579 +      out:
7580 +       up_read(&_dev_lock);
7581 +
7582 +       return md;
7583 +}
7584 +
7585 +/*-----------------------------------------------------------------
7586 + * init/exit code
7587 + *---------------------------------------------------------------*/
7588 +
7589 +static __init int local_init(void)
7590 +{
7591 +       init_rwsem(&_dev_lock);
7592 +
7593 +       /* allocate a slab for the dm_ios */
7594 +       _io_cache = kmem_cache_create("dm io",
7595 +                                     sizeof(struct dm_io), 0, 0, NULL, NULL);
7596 +
7597 +       if (!_io_cache)
7598 +               return -ENOMEM;
7599 +
7600 +       return 0;
7601 +}
7602 +
7603 +static void local_exit(void)
7604 +{
7605 +       kmem_cache_destroy(_io_cache);
7606 +       free_all_majors();
7607 +
7608 +       DMINFO("cleaned up");
7609 +}
7610 +
7611 +/*
7612 + * We have a lot of init/exit functions, so it seems easier to
7613 + * store them in an array.  The disposable macro 'xx'
7614 + * expands a prefix into a pair of function names.
7615 + */
7616 +static struct {
7617 +       int (*init) (void);
7618 +       void (*exit) (void);
7619 +
7620 +} _inits[] = {
7621 +#define xx(n) {n ## _init, n ## _exit},
7622 +       xx(local)
7623 +       xx(kcopyd)
7624 +       xx(dm_target)
7625 +       xx(dm_linear)
7626 +       xx(dm_stripe)
7627 +       xx(dm_snapshot)
7628 +       xx(dm_interface)
7629 +#undef xx
7630 +};
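+/*
+ * Editorial note (not part of the original patch): each xx(n) entry
+ * above expands to an init/exit pair, e.g. xx(local) becomes
+ * {local_init, local_exit}, so dm_init()/dm_exit() can walk the array
+ * instead of calling every pair by hand.
+ */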
7631 +
7632 +static int __init dm_init(void)
7633 +{
7634 +       const int count = ARRAY_SIZE(_inits);
7635 +
7636 +       int r, i;
7637 +
7638 +       for (i = 0; i < count; i++) {
7639 +               r = _inits[i].init();
7640 +               if (r)
7641 +                       goto bad;
7642 +       }
7643 +
7644 +       return 0;
7645 +
7646 +      bad:
7647 +       while (i--)
7648 +               _inits[i].exit();
7649 +
7650 +       return r;
7651 +}
7652 +
7653 +static void __exit dm_exit(void)
7654 +{
7655 +       int i = ARRAY_SIZE(_inits);
7656 +
7657 +       while (i--)
7658 +               _inits[i].exit();
7659 +}
7660 +
7661 +/*
7662 + * Block device functions
7663 + */
7664 +static int dm_blk_open(struct inode *inode, struct file *file)
7665 +{
7666 +       struct mapped_device *md;
7667 +
7668 +       md = get_kdev(inode->i_rdev);
7669 +       if (!md)
7670 +               return -ENXIO;
7671 +
7672 +       return 0;
7673 +}
7674 +
7675 +static int dm_blk_close(struct inode *inode, struct file *file)
7676 +{
7677 +       struct mapped_device *md;
7678 +
7679 +       md = get_kdev(inode->i_rdev);
7680 +       dm_put(md);             /* put the reference gained by dm_blk_open */
7681 +       dm_put(md);
7682 +       return 0;
7683 +}
7684 +
7685 +static inline struct dm_io *alloc_io(struct mapped_device *md)
7686 +{
7687 +       return mempool_alloc(md->io_pool, GFP_NOIO);
7688 +}
7689 +
7690 +static inline void free_io(struct mapped_device *md, struct dm_io *io)
7691 +{
7692 +       mempool_free(io, md->io_pool);
7693 +}
7694 +
7695 +static inline struct deferred_io *alloc_deferred(void)
7696 +{
7697 +       return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
7698 +}
7699 +
7700 +static inline void free_deferred(struct deferred_io *di)
7701 +{
7702 +       kfree(di);
7703 +}
7704 +
7705 +static inline sector_t volume_size(kdev_t dev)
7706 +{
7707 +       return blk_size[major(dev)][minor(dev)] << 1;
7708 +}
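+/*
+ * Editorial example (not part of the original patch): blk_size[] is
+ * kept in 1K units, so for a 524288 KB (512 MiB) device volume_size()
+ * returns 1048576 sectors; BLKGETSIZE below reports that sector count
+ * and BLKGETSIZE64 reports it shifted left by 9, i.e. 536870912 bytes.
+ */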
7709 +
7710 +/* FIXME: check this */
7711 +static int dm_blk_ioctl(struct inode *inode, struct file *file,
7712 +                       unsigned int command, unsigned long a)
7713 +{
7714 +       kdev_t dev = inode->i_rdev;
7715 +       long size;
7716 +
7717 +       switch (command) {
7718 +       case BLKROSET:
7719 +       case BLKROGET:
7720 +       case BLKRASET:
7721 +       case BLKRAGET:
7722 +       case BLKFLSBUF:
7723 +       case BLKSSZGET:
7724 +               //case BLKRRPART: /* Re-read partition tables */
7725 +               //case BLKPG:
7726 +       case BLKELVGET:
7727 +       case BLKELVSET:
7728 +       case BLKBSZGET:
7729 +       case BLKBSZSET:
7730 +               return blk_ioctl(dev, command, a);
7731 +               break;
7732 +
7733 +       case BLKGETSIZE:
7734 +               size = volume_size(dev);
7735 +               if (copy_to_user((void *) a, &size, sizeof(long)))
7736 +                       return -EFAULT;
7737 +               break;
7738 +
7739 +       case BLKGETSIZE64:
7740 +               size = volume_size(dev);
7741 +               if (put_user((u64) ((u64) size) << 9, (u64 *) a))
7742 +                       return -EFAULT;
7743 +               break;
7744 +
7745 +       case BLKRRPART:
7746 +               return -ENOTTY;
7747 +
7748 +       case LV_BMAP:
7749 +               return dm_user_bmap(inode, (struct lv_bmap *) a);
7750 +
7751 +       default:
7752 +               DMWARN("unknown block ioctl 0x%x", command);
7753 +               return -ENOTTY;
7754 +       }
7755 +
7756 +       return 0;
7757 +}
7758 +
7759 +/*
7760 + * Add the buffer to the list of deferred io.
7761 + */
7762 +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
7763 +{
7764 +       struct deferred_io *di;
7765 +
7766 +       di = alloc_deferred();
7767 +       if (!di)
7768 +               return -ENOMEM;
7769 +
7770 +       down_write(&md->lock);
7771 +
7772 +       if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
7773 +               up_write(&md->lock);
7774 +               free_deferred(di);
7775 +               return 1;
7776 +       }
7777 +
7778 +       di->bh = bh;
7779 +       di->rw = rw;
7780 +       di->next = md->deferred;
7781 +       md->deferred = di;
7782 +
7783 +       up_write(&md->lock);
7784 +       return 0;               /* deferred successfully */
7785 +}
7786 +
7787 +/*
7788 + * bh->b_end_io routine that decrements the pending count
7789 + * and then calls the original bh->b_end_io fn.
7790 + */
7791 +static void dec_pending(struct buffer_head *bh, int uptodate)
7792 +{
7793 +       int r;
7794 +       struct dm_io *io = bh->b_private;
7795 +       dm_endio_fn endio = io->ti->type->end_io;
7796 +
7797 +       if (endio) {
7798 +               r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
7799 +                         &io->map_context);
7800 +               if (r < 0)
7801 +                       uptodate = 0;
7802 +
7803 +               else if (r > 0)
7804 +                       /* the target wants another shot at the io */
7805 +                       return;
7806 +       }
7807 +
7808 +       if (atomic_dec_and_test(&io->md->pending))
7809 +               /* nudge anyone waiting on suspend queue */
7810 +               wake_up(&io->md->wait);
7811 +
7812 +       bh->b_end_io = io->end_io;
7813 +       bh->b_private = io->context;
7814 +       free_io(io->md, io);
7815 +
7816 +       bh->b_end_io(bh, uptodate);
7817 +}
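+/*
+ * Editorial sketch (not part of the original patch): the end_io hook
+ * used above is optional; when present it returns 0 to complete the
+ * io as-is, a negative errno to force failure, or a positive value to
+ * keep the io for another attempt.  A pass-through hook would be:
+ */
+#if 0
+static int example_end_io(struct dm_target *ti, struct buffer_head *bh,
+                         int rw, int error, union map_info *map_context)
+{
+       /* report the outcome unchanged */
+       return 0;
+}
+#endif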
7818 +
7819 +/*
7820 + * Do the bh mapping for a given leaf
7821 + */
7822 +static inline int __map_buffer(struct mapped_device *md, int rw,
7823 +                              struct buffer_head *bh, struct dm_io *io)
7824 +{
7825 +       struct dm_target *ti;
7826 +
7827 +       if (!md->map)
7828 +               return -EINVAL;
7829 +
7830 +       ti = dm_table_find_target(md->map, bh->b_rsector);
7831 +       if (!ti->type)
7832 +               return -EINVAL;
7833 +
7834 +       /* hook the end io request fn */
7835 +       atomic_inc(&md->pending);
7836 +       io->md = md;
7837 +       io->ti = ti;
7838 +       io->rw = rw;
7839 +       io->end_io = bh->b_end_io;
7840 +       io->context = bh->b_private;
7841 +       bh->b_end_io = dec_pending;
7842 +       bh->b_private = io;
7843 +
7844 +       return ti->type->map(ti, bh, rw, &io->map_context);
7845 +}
7846 +
7847 +/*
7848 + * Checks to see if we should be deferring io, if so it queues it
7849 + * and returns 1.
7850 + */
7851 +static inline int __deferring(struct mapped_device *md, int rw,
7852 +                             struct buffer_head *bh)
7853 +{
7854 +       int r;
7855 +
7856 +       /*
7857 +        * If we're suspended we have to queue this io for later.
7858 +        */
7859 +       while (test_bit(DMF_BLOCK_IO, &md->flags)) {
7860 +               up_read(&md->lock);
7861 +
7862 +               /*
7863 +                * There's no point deferring a read ahead
7864 +                * request, just drop it.
7865 +                */
7866 +               if (rw == READA) {
7867 +                       down_read(&md->lock);
7868 +                       return -EIO;
7869 +               }
7870 +
7871 +               r = queue_io(md, bh, rw);
7872 +               down_read(&md->lock);
7873 +
7874 +               if (r < 0)
7875 +                       return r;
7876 +
7877 +               if (r == 0)
7878 +                       return 1;       /* deferred successfully */
7879 +
7880 +       }
7881 +
7882 +       return 0;
7883 +}
7884 +
7885 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
7886 +{
7887 +       int r;
7888 +       struct dm_io *io;
7889 +       struct mapped_device *md;
7890 +
7891 +       md = get_kdev(bh->b_rdev);
7892 +       if (!md) {
7893 +               buffer_IO_error(bh);
7894 +               return 0;
7895 +       }
7896 +
7897 +       io = alloc_io(md);
7898 +       down_read(&md->lock);
7899 +
7900 +       r = __deferring(md, rw, bh);
7901 +       if (r < 0)
7902 +               goto bad;
7903 +
7904 +       else if (!r) {
7905 +               /* not deferring */
7906 +               r = __map_buffer(md, rw, bh, io);
7907 +               if (r < 0)
7908 +                       goto bad;
7909 +       } else {
7910 +               /* deferred: the io wrapper was never hooked up, release it */
7911 +               free_io(md, io);
7912 +               r = 0;
7913 +       }
7911 +
7912 +       up_read(&md->lock);
7913 +       dm_put(md);
7914 +       return r;
7915 +
7916 +      bad:
7917 +       buffer_IO_error(bh);
7918 +       up_read(&md->lock);
7919 +       dm_put(md);
7920 +       return 0;
7921 +}
7922 +
7923 +static int check_dev_size(kdev_t dev, unsigned long block)
7924 +{
7925 +       unsigned int major = major(dev);
7926 +       unsigned int minor = minor(dev);
7927 +
7928 +       /* FIXME: check this */
7929 +       unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
7930 +       unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
7931 +
7932 +       return (sector > max_sector) ? 0 : 1;
7933 +}
7934 +
7935 +/*
7936 + * Creates a dummy buffer head and maps it (for lilo).
7937 + */
7938 +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
7939 +                 kdev_t *r_dev, unsigned long *r_block)
7940 +{
7941 +       struct buffer_head bh;
7942 +       struct dm_target *ti;
7943 +       union map_info map_context;
7944 +       int r;
7945 +
7946 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
7947 +               return -EPERM;
7948 +       }
7949 +
7950 +       if (!check_dev_size(dev, block)) {
7951 +               return -EINVAL;
7952 +       }
7953 +
7954 +       if (!md->map)
7955 +               return -EINVAL;
7956 +
7957 +       /* setup dummy bh */
7958 +       memset(&bh, 0, sizeof(bh));
7959 +       bh.b_blocknr = block;
7960 +       bh.b_dev = bh.b_rdev = dev;
7961 +       bh.b_size = blksize_size[major(dev)][minor(dev)];
7962 +       bh.b_rsector = block * (bh.b_size >> 9);
7963 +
7964 +       /* find target */
7965 +       ti = dm_table_find_target(md->map, bh.b_rsector);
7966 +
7967 +       /* do the mapping */
7968 +       r = ti->type->map(ti, &bh, READ, &map_context);
7969 +       if (ti->type->end_io)
7970 +               ti->type->end_io(ti, &bh, READ, 0, &map_context);
7970 +
7971 +       if (!r) {
7972 +               *r_dev = bh.b_rdev;
7973 +               *r_block = bh.b_rsector / (bh.b_size >> 9);
7974 +       }
7975 +
7976 +       return r;
7977 +}
7978 +
7979 +/*
7980 + * Marshals arguments and results between user and kernel space.
7981 + */
7982 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
7983 +{
7984 +       struct mapped_device *md;
7985 +       unsigned long block, r_block;
7986 +       kdev_t r_dev;
7987 +       int r;
7988 +
7989 +       if (get_user(block, &lvb->lv_block))
7990 +               return -EFAULT;
7991 +
7992 +       md = get_kdev(inode->i_rdev);
7993 +       if (!md)
7994 +               return -ENXIO;
7995 +
7996 +       down_read(&md->lock);
7997 +       r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
7998 +       up_read(&md->lock);
7999 +       dm_put(md);
8000 +
8001 +       if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
8002 +                  put_user(r_block, &lvb->lv_block)))
8003 +               r = -EFAULT;
8004 +
8005 +       return r;
8006 +}
8007 +
8008 +static void free_md(struct mapped_device *md)
8009 +{
8010 +       free_dev(md->dev);
8011 +       mempool_destroy(md->io_pool);
8012 +       kfree(md);
8013 +}
8014 +
8015 +/*
8016 + * Allocate and initialise a blank device with a given minor.
8017 + */
8018 +static struct mapped_device *alloc_md(kdev_t dev)
8019 +{
8020 +       int r;
8021 +       struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
8022 +
8023 +       if (!md) {
8024 +               DMWARN("unable to allocate device, out of memory.");
8025 +               return NULL;
8026 +       }
8027 +
8028 +       memset(md, 0, sizeof(*md));
8029 +
8030 +       /* Allocate suitable device number */
8031 +       if (!dev)
8032 +               r = first_free_dev(md);
8033 +       else
8034 +               r = specific_dev(dev, md);
8035 +
8036 +       if (r) {
8037 +               kfree(md);
8038 +               return NULL;
8039 +       }
8040 +
8041 +       md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
8042 +                                    mempool_free_slab, _io_cache);
8043 +       if (!md->io_pool) {
8044 +               free_dev(md->dev);
8045 +               kfree(md);
8046 +               return NULL;
8047 +       }
8048 +
8049 +       init_rwsem(&md->lock);
8050 +       atomic_set(&md->holders, 1);
8051 +       atomic_set(&md->pending, 0);
8052 +       init_waitqueue_head(&md->wait);
8053 +       init_waitqueue_head(&md->eventq);
8054 +
8055 +       return md;
8056 +}
8057 +
8058 +/*
8059 + * The hardsect size for a mapped device is the largest hardsect size
8060 + * from the devices it maps onto.
8061 + */
8062 +static int __find_hardsect_size(struct list_head *devices)
8063 +{
8064 +       int result = 512, size;
8065 +       struct list_head *tmp;
8066 +
8067 +       list_for_each (tmp, devices) {
8068 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
8069 +               size = get_hardsect_size(dd->dev);
8070 +               if (size > result)
8071 +                       result = size;
8072 +       }
8073 +
8074 +       return result;
8075 +}
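+/*
+ * Editorial example (not part of the original patch): a table that
+ * stacks a 512-byte-sector disk and a 2048-byte-sector device (e.g. an
+ * MO drive) reports 2048, so callers never see a sector size smaller
+ * than that of any underlying device.
+ */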
8076 +
8077 +/*
8078 + * Bind a table to the device.
8079 + */
8080 +static void event_callback(void *context)
8081 +{
8082 +       struct mapped_device *md = (struct mapped_device *) context;
8083 +
8084 +       down_write(&md->lock);
8085 +       md->event_nr++;
8086 +       wake_up_interruptible(&md->eventq);
8087 +       up_write(&md->lock);
8088 +}
8089 +
8090 +static int __bind(struct mapped_device *md, struct dm_table *t)
8091 +{
8092 +       unsigned int minor = minor(md->dev);
8093 +       unsigned int major = major(md->dev);
8094 +       md->map = t;
8095 +
8096 +       /* in k */
8097 +       blk_size[major][minor] = dm_table_get_size(t) >> 1;
8098 +       blksize_size[major][minor] = BLOCK_SIZE;
8099 +       hardsect_size[major][minor] =
8100 +           __find_hardsect_size(dm_table_get_devices(t));
8101 +       register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
8102 +
8103 +       dm_table_event_callback(md->map, event_callback, md);
8104 +       dm_table_get(t);
8105 +       return 0;
8106 +}
8107 +
8108 +static void __unbind(struct mapped_device *md)
8109 +{
8110 +       unsigned int minor = minor(md->dev);
8111 +       unsigned int major = major(md->dev);
8112 +
8113 +       if (md->map) {
8114 +               dm_table_event_callback(md->map, NULL, NULL);
8115 +               dm_table_put(md->map);
8116 +               md->map = NULL;
8117 +
8118 +       }
8119 +
8120 +       blk_size[major][minor] = 0;
8121 +       blksize_size[major][minor] = 0;
8122 +       hardsect_size[major][minor] = 0;
8123 +}
8124 +
8125 +/*
8126 + * Constructor for a new device.
8127 + */
8128 +int dm_create(kdev_t dev, struct mapped_device **result)
8129 +{
8130 +       struct mapped_device *md;
8131 +
8132 +       md = alloc_md(dev);
8133 +       if (!md)
8134 +               return -ENXIO;
8135 +
8136 +       __unbind(md);   /* Ensure zero device size */
8137 +
8138 +       *result = md;
8139 +       return 0;
8140 +}
8141 +
8142 +void dm_get(struct mapped_device *md)
8143 +{
8144 +       atomic_inc(&md->holders);
8145 +}
8146 +
8147 +void dm_put(struct mapped_device *md)
8148 +{
8149 +       if (atomic_dec_and_test(&md->holders)) {
8150 +               if (md->map)
8151 +                       dm_table_suspend_targets(md->map);
8152 +               __unbind(md);
8153 +               free_md(md);
8154 +       }
8155 +}
8156 +
8157 +/*
8158 + * Requeue the deferred io by calling generic_make_request.
8159 + */
8160 +static void flush_deferred_io(struct deferred_io *c)
8161 +{
8162 +       struct deferred_io *n;
8163 +
8164 +       while (c) {
8165 +               n = c->next;
8166 +               generic_make_request(c->rw, c->bh);
8167 +               free_deferred(c);
8168 +               c = n;
8169 +       }
8170 +}
8171 +
8172 +/*
8173 + * Swap in a new table (destroying old one).
8174 + */
8175 +int dm_swap_table(struct mapped_device *md, struct dm_table *table)
8176 +{
8177 +       int r;
8178 +
8179 +       down_write(&md->lock);
8180 +
8181 +       /*
8182 +        * The device must be suspended, or have no table bound yet.
8183 +        */
8184 +       if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
8185 +               up_write(&md->lock);
8186 +               return -EPERM;
8187 +       }
8188 +
8189 +       __unbind(md);
8190 +       r = __bind(md, table);
8191 +
8192 +       /* drop the lock whether or not the bind succeeded */
8193 +       up_write(&md->lock);
8194 +       return r;
8196 +}
8197 +
8198 +/*
8199 + * We need to be able to change a mapping table under a mounted
8200 + * filesystem.  For example we might want to move some data in
8201 + * the background.  Before the table can be swapped with
8202 + * dm_swap_table(), dm_suspend() must be called to flush any in-flight
8203 + * io and ensure that further io gets deferred.  (See the sketch after dm_resume().)
8204 + */
8205 +int dm_suspend(struct mapped_device *md)
8206 +{
8207 +       int r = 0;
8208 +       DECLARE_WAITQUEUE(wait, current);
8209 +
8210 +       down_write(&md->lock);
8211 +
8212 +       /*
8213 +        * First we set the BLOCK_IO flag so no more ios will be
8214 +        * mapped.
8215 +        */
8216 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
8217 +               up_write(&md->lock);
8218 +               return -EINVAL;
8219 +       }
8220 +
8221 +       set_bit(DMF_BLOCK_IO, &md->flags);
8222 +       add_wait_queue(&md->wait, &wait);
8223 +       up_write(&md->lock);
8224 +
8225 +       /*
8226 +        * Then we wait for the already mapped ios to
8227 +        * complete.
8228 +        */
8229 +       run_task_queue(&tq_disk);
8230 +       while (1) {
8231 +               set_current_state(TASK_INTERRUPTIBLE);
8232 +
8233 +               if (!atomic_read(&md->pending) || signal_pending(current))
8234 +                       break;
8235 +
8236 +               schedule();
8237 +       }
8238 +       set_current_state(TASK_RUNNING);
8239 +
8240 +       down_write(&md->lock);
8241 +       remove_wait_queue(&md->wait, &wait);
8242 +
8243 +       /* did we flush everything ? */
8244 +       if (atomic_read(&md->pending)) {
8245 +               clear_bit(DMF_BLOCK_IO, &md->flags);
8246 +               r = -EINTR;
8247 +       } else {
8248 +               set_bit(DMF_SUSPENDED, &md->flags);
8249 +               if (md->map)
8250 +                       dm_table_suspend_targets(md->map);
8251 +       }
8252 +       up_write(&md->lock);
8253 +
8254 +       return r;
8255 +}
8256 +
8257 +int dm_resume(struct mapped_device *md)
8258 +{
8259 +       struct deferred_io *def;
8260 +
8261 +       down_write(&md->lock);
8262 +       if (!test_bit(DMF_SUSPENDED, &md->flags)) {
8263 +               up_write(&md->lock);
8264 +               return -EINVAL;
8265 +       }
8266 +
8267 +       if (md->map)
8268 +               dm_table_resume_targets(md->map);
8269 +
8270 +       clear_bit(DMF_SUSPENDED, &md->flags);
8271 +       clear_bit(DMF_BLOCK_IO, &md->flags);
8272 +       def = md->deferred;
8273 +       md->deferred = NULL;
8274 +       up_write(&md->lock);
8275 +
8276 +       flush_deferred_io(def);
8277 +       run_task_queue(&tq_disk);
8278 +
8279 +       return 0;
8280 +}
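+/*
+ * Editorial sketch (not part of the original patch): the intended
+ * suspend/swap/resume sequence for replacing a live table, using only
+ * the functions declared in dm.h.  Error handling is abbreviated.
+ */
+#if 0
+static int example_replace_table(struct mapped_device *md,
+                                struct dm_table *new_table)
+{
+       int r;
+
+       r = dm_suspend(md);             /* block and flush in-flight io */
+       if (r)
+               return r;
+
+       r = dm_swap_table(md, new_table);       /* bind the new mapping */
+
+       dm_resume(md);                  /* replay any deferred io */
+       return r;
+}
+#endif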
8281 +
8282 +struct dm_table *dm_get_table(struct mapped_device *md)
8283 +{
8284 +       struct dm_table *t;
8285 +
8286 +       down_read(&md->lock);
8287 +       t = md->map;
8288 +       if (t)
8289 +               dm_table_get(t);
8290 +       up_read(&md->lock);
8291 +
8292 +       return t;
8293 +}
8294 +
8295 +/*-----------------------------------------------------------------
8296 + * Event notification.
8297 + *---------------------------------------------------------------*/
8298 +uint32_t dm_get_event_nr(struct mapped_device *md)
8299 +{
8300 +       uint32_t r;
8301 +
8302 +       down_read(&md->lock);
8303 +       r = md->event_nr;
8304 +       up_read(&md->lock);
8305 +
8306 +       return r;
8307 +}
8308 +
8309 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8310 +                     uint32_t event_nr)
8311 +{
8312 +       down_write(&md->lock);
8313 +       if (event_nr != md->event_nr) {
8314 +               up_write(&md->lock);
8315 +               return 1;
8316 +       }
8317 +
8318 +       add_wait_queue(&md->eventq, wq);
8319 +       up_write(&md->lock);
8320 +
8321 +       return 0;
8322 +}
8323 +
8324 +const char *dm_kdevname(kdev_t dev)
8325 +{
8326 +       static char buffer[32];
8327 +       sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
8328 +       return buffer;
8329 +}
8330 +
8331 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
8332 +{
8333 +       down_write(&md->lock);
8334 +       remove_wait_queue(&md->eventq, wq);
8335 +       up_write(&md->lock);
8336 +}
8337 +
8338 +kdev_t dm_kdev(struct mapped_device *md)
8339 +{
8340 +       kdev_t dev;
8341 +
8342 +       down_read(&md->lock);
8343 +       dev = md->dev;
8344 +       up_read(&md->lock);
8345 +
8346 +       return dev;
8347 +}
8348 +
8349 +int dm_suspended(struct mapped_device *md)
8350 +{
8351 +       return test_bit(DMF_SUSPENDED, &md->flags);
8352 +}
8353 +
8354 +struct block_device_operations dm_blk_dops = {
8355 +       .open = dm_blk_open,
8356 +       .release = dm_blk_close,
8357 +       .ioctl = dm_blk_ioctl,
8358 +       .owner = THIS_MODULE
8359 +};
8360 +
8361 +/*
8362 + * module hooks
8363 + */
8364 +module_init(dm_init);
8365 +module_exit(dm_exit);
8366 +
8367 +MODULE_DESCRIPTION(DM_NAME " driver");
8368 +MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
8369 +MODULE_LICENSE("GPL");
8370 +
8371 +EXPORT_SYMBOL(dm_kdevname);
8372 --- linux-2.4.21/drivers/md/dm.h        Thu Jan  1 01:00:00 1970
8373 +++ linux/drivers/md/dm.h       Wed Aug 20 14:41:38 2003
8374 @@ -0,0 +1,175 @@
8375 +/*
8376 + * Internal header file for device mapper
8377 + *
8378 + * Copyright (C) 2001, 2002 Sistina Software
8379 + *
8380 + * This file is released under the LGPL.
8381 + */
8382 +
8383 +#ifndef DM_INTERNAL_H
8384 +#define DM_INTERNAL_H
8385 +
8386 +#include <linux/fs.h>
8387 +#include <linux/device-mapper.h>
8388 +#include <linux/list.h>
8389 +#include <linux/blkdev.h>
8390 +
8391 +#define DM_NAME "device-mapper"
8392 +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
8393 +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
8394 +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
8395 +
8396 +/*
8397 + * FIXME: I think this should be with the definition of sector_t
8398 + * in types.h.
8399 + */
8400 +#ifdef CONFIG_LBD
8401 +#define SECTOR_FORMAT "%Lu"
8402 +#else
8403 +#define SECTOR_FORMAT "%lu"
8404 +#endif
8405 +
8406 +#define SECTOR_SHIFT 9
8407 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
8408 +
8409 +extern struct block_device_operations dm_blk_dops;
8410 +
8411 +/*
8412 + * List of devices that a metadevice uses and should open/close.
8413 + */
8414 +struct dm_dev {
8415 +       struct list_head list;
8416 +
8417 +       atomic_t count;
8418 +       int mode;
8419 +       kdev_t dev;
8420 +       struct block_device *bdev;
8421 +};
8422 +
8423 +struct dm_table;
8424 +struct mapped_device;
8425 +
8426 +/*-----------------------------------------------------------------
8427 + * Functions for manipulating a struct mapped_device.
8428 + * Drop the reference with dm_put when you finish with the object.
8429 + *---------------------------------------------------------------*/
8430 +int dm_create(kdev_t dev, struct mapped_device **md);
8431 +
8432 +/*
8433 + * Reference counting for md.
8434 + */
8435 +void dm_get(struct mapped_device *md);
8436 +void dm_put(struct mapped_device *md);
8437 +
8438 +/*
8439 + * A device can still be used while suspended, but I/O is deferred.
8440 + */
8441 +int dm_suspend(struct mapped_device *md);
8442 +int dm_resume(struct mapped_device *md);
8443 +
8444 +/*
8445 + * The device must be suspended before calling this method.
8446 + */
8447 +int dm_swap_table(struct mapped_device *md, struct dm_table *t);
8448 +
8449 +/*
8450 + * Drop a reference on the table when you've finished with the
8451 + * result.
8452 + */
8453 +struct dm_table *dm_get_table(struct mapped_device *md);
8454 +
8455 +/*
8456 + * Event functions.
8457 + */
8458 +uint32_t dm_get_event_nr(struct mapped_device *md);
8459 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
8460 +                     uint32_t event_nr);
8461 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
8462 +
8463 +/*
8464 + * Info functions.
8465 + */
8466 +kdev_t dm_kdev(struct mapped_device *md);
8467 +int dm_suspended(struct mapped_device *md);
8468 +
8469 +/*-----------------------------------------------------------------
8470 + * Functions for manipulating a table.  Tables are also reference
8471 + * counted.
8472 + *---------------------------------------------------------------*/
8473 +int dm_table_create(struct dm_table **result, int mode);
8474 +
8475 +void dm_table_get(struct dm_table *t);
8476 +void dm_table_put(struct dm_table *t);
8477 +
8478 +int dm_table_add_target(struct dm_table *t, const char *type,
8479 +                       sector_t start, sector_t len, char *params);
8480 +int dm_table_complete(struct dm_table *t);
8481 +void dm_table_event_callback(struct dm_table *t,
8482 +                            void (*fn)(void *), void *context);
8483 +void dm_table_event(struct dm_table *t);
8484 +sector_t dm_table_get_size(struct dm_table *t);
8485 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
8486 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
8487 +unsigned int dm_table_get_num_targets(struct dm_table *t);
8488 +struct list_head *dm_table_get_devices(struct dm_table *t);
8489 +int dm_table_get_mode(struct dm_table *t);
8490 +void dm_table_suspend_targets(struct dm_table *t);
8491 +void dm_table_resume_targets(struct dm_table *t);
8492 +
8493 +/*-----------------------------------------------------------------
8494 + * A registry of target types.
8495 + *---------------------------------------------------------------*/
8496 +int dm_target_init(void);
8497 +void dm_target_exit(void);
8498 +struct target_type *dm_get_target_type(const char *name);
8499 +void dm_put_target_type(struct target_type *t);
8500 +
8501 +
8502 +/*-----------------------------------------------------------------
8503 + * Useful inlines.
8504 + *---------------------------------------------------------------*/
8505 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
8506 +                               unsigned long num)
8507 +{
8508 +       return (num > (ULONG_MAX - fixed) / obj);
8509 +}
8510 +
8511 +/*
8512 + * ceiling(n / size) * size
8513 + */
8514 +static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
8515 +{
8516 +       unsigned long r = n % size;
8517 +       return n + (r ? (size - r) : 0);
8518 +}
8519 +
8520 +/*
8521 + * Ceiling(n / size)
8522 + */
8523 +static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
8524 +{
8525 +       return dm_round_up(n, size) / size;
8526 +}
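+/*
+ * Editorial example (not part of the original patch):
+ * dm_round_up(13, 8) == 16 and dm_div_up(13, 8) == 2, while
+ * array_too_big() above is true exactly when fixed + obj * num would
+ * exceed ULONG_MAX.
+ */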
8527 +
8528 +const char *dm_kdevname(kdev_t dev);
8529 +
8530 +/*
8531 + * The device-mapper can be driven through one of two interfaces:
8532 + * ioctl or filesystem, depending on which patch you have applied.
8533 + */
8534 +int dm_interface_init(void);
8535 +void dm_interface_exit(void);
8536 +
8537 +/*
8538 + * Targets for linear and striped mappings
8539 + */
8540 +int dm_linear_init(void);
8541 +void dm_linear_exit(void);
8542 +
8543 +int dm_stripe_init(void);
8544 +void dm_stripe_exit(void);
8545 +
8546 +int dm_snapshot_init(void);
8547 +void dm_snapshot_exit(void);
8548 +
8549 +#endif
8550 --- linux-2.4.21/drivers/md/kcopyd.c    Thu Jan  1 01:00:00 1970
8551 +++ linux/drivers/md/kcopyd.c   Wed Aug 20 14:41:38 2003
8552 @@ -0,0 +1,650 @@
8553 +/*
8554 + * Copyright (C) 2002 Sistina Software (UK) Limited.
8555 + *
8556 + * This file is released under the GPL.
8557 + */
8558 +
8559 +#include <asm/atomic.h>
8560 +
8561 +#include <linux/blkdev.h>
8562 +#include <linux/config.h>
8563 +#include <linux/device-mapper.h>
8564 +#include <linux/fs.h>
8565 +#include <linux/init.h>
8566 +#include <linux/list.h>
8567 +#include <linux/locks.h>
8568 +#include <linux/mempool.h>
8569 +#include <linux/module.h>
8570 +#include <linux/pagemap.h>
8571 +#include <linux/slab.h>
8572 +#include <linux/vmalloc.h>
8573 +
8574 +#include "kcopyd.h"
8575 +#include "dm-daemon.h"
8576 +
8577 +/* FIXME: this is only needed for the DMERR macros */
8578 +#include "dm.h"
8579 +
8580 +static struct dm_daemon _kcopyd;
8581 +
8582 +/*-----------------------------------------------------------------
8583 + * Each kcopyd client has its own little pool of preallocated
8584 + * pages for kcopyd io.
8585 + *---------------------------------------------------------------*/
8586 +struct kcopyd_client {
8587 +       struct list_head list;
8588 +
8589 +       spinlock_t lock;
8590 +       struct list_head pages;
8591 +       unsigned int nr_pages;
8592 +       unsigned int nr_free_pages;
8593 +};
8594 +
8595 +static inline void __push_page(struct kcopyd_client *kc, struct page *p)
8596 +{
8597 +       list_add(&p->list, &kc->pages);
8598 +       kc->nr_free_pages++;
8599 +}
8600 +
8601 +static inline struct page *__pop_page(struct kcopyd_client *kc)
8602 +{
8603 +       struct page *p;
8604 +
8605 +       p = list_entry(kc->pages.next, struct page, list);
8606 +       list_del(&p->list);
8607 +       kc->nr_free_pages--;
8608 +
8609 +       return p;
8610 +}
8611 +
8612 +static int kcopyd_get_pages(struct kcopyd_client *kc,
8613 +                           unsigned int nr, struct list_head *pages)
8614 +{
8615 +       struct page *p;
8616 +       INIT_LIST_HEAD(pages);
8617 +
8618 +       spin_lock(&kc->lock);
8619 +       if (kc->nr_free_pages < nr) {
8620 +               spin_unlock(&kc->lock);
8621 +               return -ENOMEM;
8622 +       }
8623 +
8624 +       while (nr--) {
8625 +               p = __pop_page(kc);
8626 +               list_add(&p->list, pages);
8627 +       }
8628 +       spin_unlock(&kc->lock);
8629 +
8630 +       return 0;
8631 +}
8632 +
8633 +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
8634 +{
8635 +       struct list_head *tmp, *tmp2;
8636 +
8637 +       spin_lock(&kc->lock);
8638 +       list_for_each_safe (tmp, tmp2, pages)
8639 +               __push_page(kc, list_entry(tmp, struct page, list));
8640 +       spin_unlock(&kc->lock);
8641 +}
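+/*
+ * Editorial sketch (not part of the original patch): how the pool
+ * helpers above hand pages out and take them back.  The function name
+ * is illustrative only.
+ */
+#if 0
+static int example_with_pages(struct kcopyd_client *kc)
+{
+       int r;
+       LIST_HEAD(pages);
+
+       r = kcopyd_get_pages(kc, 4, &pages);
+       if (r)
+               return r;       /* fewer than 4 pages free right now */
+
+       /* ... issue io against the pages ... */
+
+       kcopyd_put_pages(kc, &pages);
+       return 0;
+}
+#endif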
8642 +
8643 +/*
8644 + * These three functions resize the page pool.
8645 + */
8646 +static void release_pages(struct list_head *pages)
8647 +{
8648 +       struct page *p;
8649 +       struct list_head *tmp, *tmp2;
8650 +
8651 +       list_for_each_safe (tmp, tmp2, pages) {
8652 +               p = list_entry(tmp, struct page, list);
8653 +               UnlockPage(p);
8654 +               __free_page(p);
8655 +       }
8656 +}
8657 +
8658 +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
8659 +{
8660 +       unsigned int i;
8661 +       struct page *p;
8662 +       LIST_HEAD(new);
8663 +
8664 +       for (i = 0; i < nr; i++) {
8665 +               p = alloc_page(GFP_KERNEL);
8666 +               if (!p) {
8667 +                       release_pages(&new);
8668 +                       return -ENOMEM;
8669 +               }
8670 +
8671 +               LockPage(p);
8672 +               list_add(&p->list, &new);
8673 +       }
8674 +
8675 +       kcopyd_put_pages(kc, &new);
8676 +       kc->nr_pages += nr;
8677 +       return 0;
8678 +}
8679 +
8680 +static void client_free_pages(struct kcopyd_client *kc)
8681 +{
8682 +       BUG_ON(kc->nr_free_pages != kc->nr_pages);
8683 +       release_pages(&kc->pages);
8684 +       kc->nr_free_pages = kc->nr_pages = 0;
8685 +}
8686 +
8687 +/*-----------------------------------------------------------------
8688 + * kcopyd_jobs need to be allocated by the *clients* of kcopyd;
8689 + * for this reason we use a mempool to prevent the client from
8690 + * ever having to do io (which could cause a deadlock).
8691 + *---------------------------------------------------------------*/
8692 +struct kcopyd_job {
8693 +       struct kcopyd_client *kc;
8694 +       struct list_head list;
8695 +       unsigned int flags;
8696 +
8697 +       /*
8698 +        * Error state of the job.
8699 +        */
8700 +       int read_err;
8701 +       unsigned int write_err;
8702 +
8703 +       /*
8704 +        * Either READ or WRITE
8705 +        */
8706 +       int rw;
8707 +       struct io_region source;
8708 +
8709 +       /*
8710 +        * The destinations for the transfer.
8711 +        */
8712 +       unsigned int num_dests;
8713 +       struct io_region dests[KCOPYD_MAX_REGIONS];
8714 +
8715 +       sector_t offset;
8716 +       unsigned int nr_pages;
8717 +       struct list_head pages;
8718 +
8719 +       /*
8720 +        * Set this to ensure you are notified when the job has
8721 +        * completed.  'context' is for callback to use.
8722 +        */
8723 +       kcopyd_notify_fn fn;
8724 +       void *context;
8725 +
8726 +       /*
8727 +        * These fields are only used if the job has been split
8728 +        * into more manageable parts.
8729 +        */
8730 +       struct semaphore lock;
8731 +       atomic_t sub_jobs;
8732 +       sector_t progress;
8733 +};
8734 +
8735 +/* FIXME: this should scale with the number of pages */
8736 +#define MIN_JOBS 512
8737 +
8738 +static kmem_cache_t *_job_cache;
8739 +static mempool_t *_job_pool;
8740 +
8741 +/*
8742 + * We maintain three lists of jobs:
8743 + *
8744 + * i)   jobs waiting for pages
8745 + * ii)  jobs that have pages, and are waiting for the io to be issued.
8746 + * iii) jobs that have completed.
8747 + *
8748 + * All three of these lists are protected by _job_lock.
8749 + */
8750 +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
8751 +
8752 +static LIST_HEAD(_complete_jobs);
8753 +static LIST_HEAD(_io_jobs);
8754 +static LIST_HEAD(_pages_jobs);
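+/*
+ * Editorial note (not part of the original patch): a job typically
+ * moves from _pages_jobs to _io_jobs for its READ, back onto _io_jobs
+ * as a WRITE once the read completes, and finally onto _complete_jobs
+ * (see run_pages_job() and complete_io() below).
+ */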
8755 +
8756 +static int jobs_init(void)
8757 +{
8758 +       INIT_LIST_HEAD(&_complete_jobs);
8759 +       INIT_LIST_HEAD(&_io_jobs);
8760 +       INIT_LIST_HEAD(&_pages_jobs);
8761 +
8762 +       _job_cache = kmem_cache_create("kcopyd-jobs",
8763 +                                      sizeof(struct kcopyd_job),
8764 +                                      __alignof__(struct kcopyd_job),
8765 +                                      0, NULL, NULL);
8766 +       if (!_job_cache)
8767 +               return -ENOMEM;
8768 +
8769 +       _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
8770 +                                  mempool_free_slab, _job_cache);
8771 +       if (!_job_pool) {
8772 +               kmem_cache_destroy(_job_cache);
8773 +               return -ENOMEM;
8774 +       }
8775 +
8776 +       return 0;
8777 +}
8778 +
8779 +static void jobs_exit(void)
8780 +{
8781 +       BUG_ON(!list_empty(&_complete_jobs));
8782 +       BUG_ON(!list_empty(&_io_jobs));
8783 +       BUG_ON(!list_empty(&_pages_jobs));
8784 +
8785 +       mempool_destroy(_job_pool);
8786 +       kmem_cache_destroy(_job_cache);
8787 +}
8788 +
8789 +/*
8790 + * Functions to push and pop a job onto the head of a given job
8791 + * list.
8792 + */
8793 +static inline struct kcopyd_job *pop(struct list_head *jobs)
8794 +{
8795 +       struct kcopyd_job *job = NULL;
8796 +       unsigned long flags;
8797 +
8798 +       spin_lock_irqsave(&_job_lock, flags);
8799 +
8800 +       if (!list_empty(jobs)) {
8801 +               job = list_entry(jobs->next, struct kcopyd_job, list);
8802 +               list_del(&job->list);
8803 +       }
8804 +       spin_unlock_irqrestore(&_job_lock, flags);
8805 +
8806 +       return job;
8807 +}
8808 +
8809 +static inline void push(struct list_head *jobs, struct kcopyd_job *job)
8810 +{
8811 +       unsigned long flags;
8812 +
8813 +       spin_lock_irqsave(&_job_lock, flags);
8814 +       list_add_tail(&job->list, jobs);
8815 +       spin_unlock_irqrestore(&_job_lock, flags);
8816 +}
8817 +
8818 +/*
8819 + * These three functions process 1 item from the corresponding
8820 + * job list.
8821 + *
8822 + * They return:
8823 + * < 0: error
8824 + *   0: success
8825 + * > 0: can't process yet.
8826 + */
8827 +static int run_complete_job(struct kcopyd_job *job)
8828 +{
8829 +       void *context = job->context;
8830 +       int read_err = job->read_err;
8831 +       unsigned int write_err = job->write_err;
8832 +       kcopyd_notify_fn fn = job->fn;
8833 +
8834 +       kcopyd_put_pages(job->kc, &job->pages);
8835 +       mempool_free(job, _job_pool);
8836 +       fn(read_err, write_err, context);
8837 +       return 0;
8838 +}
8839 +
8840 +static void complete_io(unsigned int error, void *context)
8841 +{
8842 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
8843 +
8844 +       if (error) {
8845 +               if (job->rw == WRITE)
8846 +                       job->write_err |= error;
8847 +               else
8848 +                       job->read_err = 1;
8849 +
8850 +               if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
8851 +                       push(&_complete_jobs, job);
8852 +                       dm_daemon_wake(&_kcopyd);
8853 +                       return;
8854 +               }
8855 +       }
8856 +
8857 +       if (job->rw == WRITE)
8858 +               push(&_complete_jobs, job);
8859 +
8860 +       else {
8861 +               job->rw = WRITE;
8862 +               push(&_io_jobs, job);
8863 +       }
8864 +
8865 +       dm_daemon_wake(&_kcopyd);
8866 +}
8867 +
8868 +/*
8869 + * Request io on as many buffer heads as we can currently get for
8870 + * a particular job.
8871 + */
8872 +static int run_io_job(struct kcopyd_job *job)
8873 +{
8874 +       int r;
8875 +
8876 +       if (job->rw == READ)
8877 +               r = dm_io_async(1, &job->source, job->rw,
8878 +                               list_entry(job->pages.next, struct page, list),
8879 +                               job->offset, complete_io, job);
8880 +
8881 +       else
8882 +               r = dm_io_async(job->num_dests, job->dests, job->rw,
8883 +                               list_entry(job->pages.next, struct page, list),
8884 +                               job->offset, complete_io, job);
8885 +
8886 +       return r;
8887 +}
8888 +
8889 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
8890 +static int run_pages_job(struct kcopyd_job *job)
8891 +{
8892 +       int r;
8893 +
8894 +       job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
8895 +                                 SECTORS_PER_PAGE);
8896 +       r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
8897 +       if (!r) {
8898 +               /* this job is ready for io */
8899 +               push(&_io_jobs, job);
8900 +               return 0;
8901 +       }
8902 +
8903 +       if (r == -ENOMEM)
8904 +               /* can't complete now */
8905 +               return 1;
8906 +
8907 +       return r;
8908 +}
8909 +
8910 +/*
8911 + * Run through a list for as long as possible.  Returns the count
8912 + * of successful jobs.
8913 + */
8914 +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
8915 +{
8916 +       struct kcopyd_job *job;
8917 +       int r, count = 0;
8918 +
8919 +       while ((job = pop(jobs))) {
8920 +
8921 +               r = fn(job);
8922 +
8923 +               if (r < 0) {
8924 +                       /* error this rogue job */
8925 +                       if (job->rw == WRITE)
8926 +                               job->write_err = (unsigned int) -1;
8927 +                       else
8928 +                               job->read_err = 1;
8929 +                       push(&_complete_jobs, job);
8930 +                       break;
8931 +               }
8932 +
8933 +               if (r > 0) {
8934 +                       /*
8935 +                        * We couldn't service this job ATM, so
8936 +                        * push this job back onto the list.
8937 +                        */
8938 +                       push(jobs, job);
8939 +                       break;
8940 +               }
8941 +
8942 +               count++;
8943 +       }
8944 +
8945 +       return count;
8946 +}
8947 +
8948 +/*
8949 + * kcopyd does this every time it's woken up.
8950 + */
8951 +static void do_work(void)
8952 +{
8953 +       /*
8954 +        * The order in which these are called is *very* important.
8955 +        * Completed jobs free pages that jobs waiting for pages can
8956 +        * then claim.  Pages jobs, once they have their pages, move
8957 +        * onto the io jobs list.  Io jobs wake the daemon when they
8958 +        * complete, and the cycle starts again.
8959 +        */
8960 +       process_jobs(&_complete_jobs, run_complete_job);
8961 +       process_jobs(&_pages_jobs, run_pages_job);
8962 +       process_jobs(&_io_jobs, run_io_job);
8963 +       run_task_queue(&tq_disk);
8964 +}
8965 +
8966 +/*
8967 + * If we are copying a small region we just dispatch a single job
8968 + * to do the copy, otherwise the io has to be split up into many
8969 + * jobs.
8970 + */
8971 +static void dispatch_job(struct kcopyd_job *job)
8972 +{
8973 +       push(&_pages_jobs, job);
8974 +       dm_daemon_wake(&_kcopyd);
8975 +}
8976 +
8977 +#define SUB_JOB_SIZE 128
8978 +static void segment_complete(int read_err,
8979 +                            unsigned int write_err, void *context)
8980 +{
8981 +       /* FIXME: tidy this function */
8982 +       sector_t progress = 0;
8983 +       sector_t count = 0;
8984 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
8985 +
8986 +       down(&job->lock);
8987 +
8988 +       /* update the error */
8989 +       if (read_err)
8990 +               job->read_err = 1;
8991 +
8992 +       if (write_err)
8993 +               job->write_err |= write_err;
8994 +
8995 +       /*
8996 +        * Only dispatch more work if there hasn't been an error.
8997 +        */
8998 +       if ((!job->read_err && !job->write_err) ||
8999 +           test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
9000 +               /* get the next chunk of work */
9001 +               progress = job->progress;
9002 +               count = job->source.count - progress;
9003 +               if (count) {
9004 +                       if (count > SUB_JOB_SIZE)
9005 +                               count = SUB_JOB_SIZE;
9006 +
9007 +                       job->progress += count;
9008 +               }
9009 +       }
9010 +       up(&job->lock);
9011 +
9012 +       if (count) {
9013 +               int i;
9014 +               struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
9015 +
9016 +               memcpy(sub_job, job, sizeof(*job));
9017 +               sub_job->source.sector += progress;
9018 +               sub_job->source.count = count;
9019 +
9020 +               for (i = 0; i < job->num_dests; i++) {
9021 +                       sub_job->dests[i].sector += progress;
9022 +                       sub_job->dests[i].count = count;
9023 +               }
9024 +
9025 +               sub_job->fn = segment_complete;
9026 +               sub_job->context = job;
9027 +               dispatch_job(sub_job);
9028 +
9029 +       } else if (atomic_dec_and_test(&job->sub_jobs)) {
9030 +
9031 +               /*
9032 +                * To avoid a race we must keep the job around
9033 +                * until after the notify function has completed.
9034 +                * Otherwise the client may try and stop the job
9035 +                * after we've completed.
9036 +                */
9037 +               job->fn(read_err, write_err, job->context);
9038 +               mempool_free(job, _job_pool);
9039 +       }
9040 +}
9041 +
9042 +/*
9043 + * Split the copy into several little sub-jobs that share
9044 + * the work between them.
9045 + */
9046 +#define SPLIT_COUNT 8
9047 +static void split_job(struct kcopyd_job *job)
9048 +{
9049 +       int i;
9050 +
9051 +       atomic_set(&job->sub_jobs, SPLIT_COUNT);
9052 +       for (i = 0; i < SPLIT_COUNT; i++)
9053 +               segment_complete(0, 0u, job);
9054 +}
9055 +
9056 +#define SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE)
9057 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9058 +               unsigned int num_dests, struct io_region *dests,
9059 +               unsigned int flags, kcopyd_notify_fn fn, void *context)
9060 +{
9061 +       struct kcopyd_job *job;
9062 +
9063 +       /*
9064 +        * Allocate a new job.
9065 +        */
9066 +       job = mempool_alloc(_job_pool, GFP_NOIO);
9067 +
9068 +       /*
9069 +        * set up for the read.
9070 +        */
9071 +       job->kc = kc;
9072 +       job->flags = flags;
9073 +       job->read_err = 0;
9074 +       job->write_err = 0;
9075 +       job->rw = READ;
9076 +
9077 +       memcpy(&job->source, from, sizeof(*from));
9078 +
9079 +       job->num_dests = num_dests;
9080 +       memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
9081 +
9082 +       job->offset = 0;
9083 +       job->nr_pages = 0;
9084 +       INIT_LIST_HEAD(&job->pages);
9085 +
9086 +       job->fn = fn;
9087 +       job->context = context;
9088 +
9089 +       if (job->source.count < SUB_JOB_THRESHOLD)
9090 +               dispatch_job(job);
9091 +
9092 +       else {
9093 +               init_MUTEX(&job->lock);
9094 +               job->progress = 0;
9095 +               split_job(job);
9096 +       }
9097 +
9098 +       return 0;
9099 +}
9100 +
9101 +/*
9102 + * Cancels a kcopyd job, eg. someone might be deactivating a
9103 + * mirror.
9104 + */
9105 +int kcopyd_cancel(struct kcopyd_job *job, int block)
9106 +{
9107 +       /* FIXME: finish */
9108 +       return -1;
9109 +}
9110 +
9111 +/*-----------------------------------------------------------------
9112 + * Unit setup
9113 + *---------------------------------------------------------------*/
9114 +static DECLARE_MUTEX(_client_lock);
9115 +static LIST_HEAD(_clients);
9116 +
9117 +static int client_add(struct kcopyd_client *kc)
9118 +{
9119 +       down(&_client_lock);
9120 +       list_add(&kc->list, &_clients);
9121 +       up(&_client_lock);
9122 +       return 0;
9123 +}
9124 +
9125 +static void client_del(struct kcopyd_client *kc)
9126 +{
9127 +       down(&_client_lock);
9128 +       list_del(&kc->list);
9129 +       up(&_client_lock);
9130 +}
9131 +
9132 +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
9133 +{
9134 +       int r = 0;
9135 +       struct kcopyd_client *kc;
9136 +
9137 +       kc = kmalloc(sizeof(*kc), GFP_KERNEL);
9138 +       if (!kc)
9139 +               return -ENOMEM;
9140 +
9141 +       kc->lock = SPIN_LOCK_UNLOCKED;
9142 +       INIT_LIST_HEAD(&kc->pages);
9143 +       kc->nr_pages = kc->nr_free_pages = 0;
9144 +       r = client_alloc_pages(kc, nr_pages);
9145 +       if (r) {
9146 +               kfree(kc);
9147 +               return r;
9148 +       }
9149 +
9150 +       r = dm_io_get(nr_pages);
9151 +       if (r) {
9152 +               client_free_pages(kc);
9153 +               kfree(kc);
9154 +               return r;
9155 +       }
9156 +
9157 +       r = client_add(kc);
9158 +       if (r) {
9159 +               dm_io_put(nr_pages);
9160 +               client_free_pages(kc);
9161 +               kfree(kc);
9162 +               return r;
9163 +       }
9164 +
9165 +       *result = kc;
9166 +       return 0;
9167 +}
9168 +
9169 +void kcopyd_client_destroy(struct kcopyd_client *kc)
9170 +{
9171 +       dm_io_put(kc->nr_pages);
9172 +       client_free_pages(kc);
9173 +       client_del(kc);
9174 +       kfree(kc);
9175 +}
9176 +
9177 +
9178 +int __init kcopyd_init(void)
9179 +{
9180 +       int r;
9181 +
9182 +       r = jobs_init();
9183 +       if (r)
9184 +               return r;
9185 +
9186 +       r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
9187 +       if (r)
9188 +               jobs_exit();
9189 +
9190 +       return r;
9191 +}
9192 +
9193 +void kcopyd_exit(void)
9194 +{
9195 +       jobs_exit();
9196 +       dm_daemon_stop(&_kcopyd);
9197 +}
9198 +
9199 +EXPORT_SYMBOL(kcopyd_client_create);
9200 +EXPORT_SYMBOL(kcopyd_client_destroy);
9201 +EXPORT_SYMBOL(kcopyd_copy);
9202 +EXPORT_SYMBOL(kcopyd_cancel);
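For scale: kcopyd_copy() above dispatches anything smaller than SUB_JOB_THRESHOLD (SPLIT_COUNT * SUB_JOB_SIZE = 8 * 128 = 1024 sectors, i.e. 512 KiB with 512-byte sectors) as a single job.  Larger copies go through split_job(), which keeps at most eight 128-sector sub-jobs in flight and lets segment_complete() carve off the next chunk as each one finishes.  A rough sketch of the arithmetic, not part of the patch:

/* Hypothetical helper: how many 128-sector chunks a copy of 'count'
 * sectors is carved into by segment_complete().
 */
static inline unsigned int kcopyd_nr_chunks(sector_t count)
{
        return (count + SUB_JOB_SIZE - 1) / SUB_JOB_SIZE;
}

/* e.g. a 1 MiB copy is 2048 sectors -> 16 chunks, with never more than
 * SPLIT_COUNT (8) of them outstanding at any one time.
 */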
9203 --- linux-2.4.21/drivers/md/kcopyd.h    Thu Jan  1 01:00:00 1970
9204 +++ linux/drivers/md/kcopyd.h   Wed Aug 20 14:41:38 2003
9205 @@ -0,0 +1,47 @@
9206 +/*
9207 + * Copyright (C) 2001 Sistina Software
9208 + *
9209 + * This file is released under the GPL.
9210 + */
9211 +
9212 +#ifndef DM_KCOPYD_H
9213 +#define DM_KCOPYD_H
9214 +
9215 +/*
9216 + * Needed for the definition of sector_t.
9217 + */
9218 +#include <linux/device-mapper.h>
9219 +#include <linux/iobuf.h>
9220 +
9221 +#include "dm-io.h"
9222 +
9223 +int kcopyd_init(void);
9224 +void kcopyd_exit(void);
9225 +
9226 +/* FIXME: make this configurable */
9227 +#define KCOPYD_MAX_REGIONS 8
9228 +
9229 +#define KCOPYD_IGNORE_ERROR 1
9230 +
9231 +/*
9232 + * To use kcopyd you must first create a kcopyd client object.
9233 + */
9234 +struct kcopyd_client;
9235 +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
9236 +void kcopyd_client_destroy(struct kcopyd_client *kc);
9237 +
9238 +/*
9239 + * Submit a copy job to kcopyd.  This is built on top of the
9240 + * previous three fns.
9241 + *
9242 + * read_err is a boolean,
9243 + * write_err is a bitset, with 1 bit for each destination region
9244 + */
9245 +typedef void (*kcopyd_notify_fn)(int read_err,
9246 +                                unsigned int write_err, void *context);
9247 +
9248 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
9249 +               unsigned int num_dests, struct io_region *dests,
9250 +               unsigned int flags, kcopyd_notify_fn fn, void *context);
9251 +
9252 +#endif
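A minimal sketch of a kernel-side caller of this interface.  It assumes the io_region layout (dev/sector/count) used by dm-io.h elsewhere in this patch set; the function and structure names below are hypothetical and the error reporting is only illustrative:

#include <linux/kernel.h>
#include <linux/completion.h>
#include <linux/kdev_t.h>
#include "kcopyd.h"

static void my_copy_done(int read_err, unsigned int write_err, void *context)
{
        /* read_err is a boolean, write_err has one bit per destination */
        if (read_err || write_err)
                printk(KERN_ERR "copy failed: read_err=%d write_err=0x%x\n",
                       read_err, write_err);
        complete((struct completion *) context);
}

static int my_copy(kdev_t src, kdev_t dst, sector_t start, sector_t len)
{
        struct kcopyd_client *kc;
        struct io_region from, to;
        struct completion done;
        int r;

        r = kcopyd_client_create(16, &kc);      /* reserve 16 pages */
        if (r)
                return r;

        from.dev = src;
        from.sector = start;
        from.count = len;
        to = from;
        to.dev = dst;

        init_completion(&done);
        r = kcopyd_copy(kc, &from, 1, &to, 0, my_copy_done, &done);
        if (!r)
                wait_for_completion(&done);

        kcopyd_client_destroy(kc);
        return r;
}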
9253 --- linux-2.4.21/fs/buffer.c    Fri Jun 13 16:32:48 2003
9254 +++ linux/fs/buffer.c   Wed Aug 20 14:41:32 2003
9255 @@ -735,6 +735,7 @@
9256         bh->b_list = BUF_CLEAN;
9257         bh->b_end_io = handler;
9258         bh->b_private = private;
9259 +       bh->b_journal_head = NULL;
9260  }
9261  
9262  static void end_buffer_io_async(struct buffer_head * bh, int uptodate)
9263 --- linux-2.4.21/fs/jbd/journal.c       Fri Jun 13 16:32:48 2003
9264 +++ linux/fs/jbd/journal.c      Wed Aug 20 14:41:32 2003
9265 @@ -1802,9 +1802,9 @@
9266  
9267                 if (buffer_jbd(bh)) {
9268                         /* Someone did it for us! */
9269 -                       J_ASSERT_BH(bh, bh->b_private != NULL);
9270 +                       J_ASSERT_BH(bh, bh->b_journal_head != NULL);
9271                         journal_free_journal_head(jh);
9272 -                       jh = bh->b_private;
9273 +                       jh = bh->b_journal_head;
9274                 } else {
9275                         /*
9276                          * We actually don't need jh_splice_lock when
9277 @@ -1812,7 +1812,7 @@
9278                          */
9279                         spin_lock(&jh_splice_lock);
9280                         set_bit(BH_JBD, &bh->b_state);
9281 -                       bh->b_private = jh;
9282 +                       bh->b_journal_head = jh;
9283                         jh->b_bh = bh;
9284                         atomic_inc(&bh->b_count);
9285                         spin_unlock(&jh_splice_lock);
9286 @@ -1821,7 +1821,7 @@
9287         }
9288         jh->b_jcount++;
9289         spin_unlock(&journal_datalist_lock);
9290 -       return bh->b_private;
9291 +       return bh->b_journal_head;
9292  }
9293  
9294  /*
9295 @@ -1854,7 +1854,7 @@
9296                         J_ASSERT_BH(bh, jh2bh(jh) == bh);
9297                         BUFFER_TRACE(bh, "remove journal_head");
9298                         spin_lock(&jh_splice_lock);
9299 -                       bh->b_private = NULL;
9300 +                       bh->b_journal_head = NULL;
9301                         jh->b_bh = NULL;        /* debug, really */
9302                         clear_bit(BH_JBD, &bh->b_state);
9303                         __brelse(bh);
9304 --- linux-2.4.21/include/linux/device-mapper.h  Thu Jan  1 01:00:00 1970
9305 +++ linux/include/linux/device-mapper.h Wed Aug 20 14:41:38 2003
9306 @@ -0,0 +1,104 @@
9307 +/*
9308 + * Copyright (C) 2001 Sistina Software (UK) Limited.
9309 + *
9310 + * This file is released under the LGPL.
9311 + */
9312 +
9313 +#ifndef _LINUX_DEVICE_MAPPER_H
9314 +#define _LINUX_DEVICE_MAPPER_H
9315 +
9316 +typedef unsigned long sector_t;
9317 +
9318 +struct dm_target;
9319 +struct dm_table;
9320 +struct dm_dev;
9321 +
9322 +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
9323 +
9324 +union map_info {
9325 +       void *ptr;
9326 +       unsigned long long ll;
9327 +};
9328 +
9329 +/*
9330 + * In the constructor the target parameter will already have the
9331 + * table, type, begin and len fields filled in.
9332 + */
9333 +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
9334 +                         char **argv);
9335 +
9336 +/*
9337 + * The destructor doesn't need to free the dm_target, just
9338 + * anything hidden ti->private.
9339 + */
9340 +typedef void (*dm_dtr_fn) (struct dm_target * ti);
9341 +
9342 +/*
9343 + * The map function must return:
9344 + * < 0: error
9345 + * = 0: The target will handle the io by resubmitting it later
9346 + * > 0: simple remap complete
9347 + */
9348 +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
9349 +                         int rw, union map_info *map_context);
9350 +
9351 +/*
9352 + * Returns:
9353 + * < 0 : error (currently ignored)
9354 + * 0   : ended successfully
9355 + * 1   : for some reason the io has still not completed (eg,
9356 + *       multipath target might want to requeue a failed io).
9357 + */
9358 +typedef int (*dm_endio_fn) (struct dm_target * ti,
9359 +                           struct buffer_head * bh, int rw, int error,
9360 +                           union map_info *map_context);
9361 +typedef void (*dm_suspend_fn) (struct dm_target *ti);
9362 +typedef void (*dm_resume_fn) (struct dm_target *ti);
9363 +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
9364 +                            char *result, unsigned int maxlen);
9365 +
9366 +void dm_error(const char *message);
9367 +
9368 +/*
9369 + * Constructors should call these functions to ensure destination devices
9370 + * are opened/closed correctly.
9371 + * FIXME: too many arguments.
9372 + */
9373 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
9374 +                 sector_t len, int mode, struct dm_dev **result);
9375 +void dm_put_device(struct dm_target *ti, struct dm_dev *d);
9376 +
9377 +/*
9378 + * Information about a target type
9379 + */
9380 +struct target_type {
9381 +       const char *name;
9382 +       struct module *module;
9383 +       dm_ctr_fn ctr;
9384 +       dm_dtr_fn dtr;
9385 +       dm_map_fn map;
9386 +       dm_endio_fn end_io;
9387 +       dm_suspend_fn suspend;
9388 +       dm_resume_fn resume;
9389 +       dm_status_fn status;
9390 +};
9391 +
9392 +struct dm_target {
9393 +       struct dm_table *table;
9394 +       struct target_type *type;
9395 +
9396 +       /* target limits */
9397 +       sector_t begin;
9398 +       sector_t len;
9399 +
9400 +       /* target specific data */
9401 +       void *private;
9402 +
9403 +       /* Used to provide an error string from the ctr */
9404 +       char *error;
9405 +};
9406 +
9407 +int dm_register_target(struct target_type *t);
9408 +int dm_unregister_target(struct target_type *t);
9409 +
9410 +#endif                         /* _LINUX_DEVICE_MAPPER_H */
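For orientation, a bare-bones sketch of a target module written against this header.  Everything below is hypothetical rather than part of the patch, and the map function only indicates where the remap of bh->b_rdev / bh->b_rsector would go, since a real target needs the underlying device obtained through dm_get_device():

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/device-mapper.h>

static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
        if (argc != 0) {
                ti->error = "example: no arguments expected";
                return -EINVAL;
        }
        ti->private = NULL;     /* per-target state would live here */
        return 0;
}

static void example_dtr(struct dm_target *ti)
{
        /* free anything hung off ti->private */
}

static int example_map(struct dm_target *ti, struct buffer_head *bh,
                       int rw, union map_info *map_context)
{
        /* a real target adjusts bh->b_rdev / bh->b_rsector to point at
         * the underlying device here, then reports the remap complete */
        return 1;
}

static struct target_type example_target = {
        .name   = "example",
        .module = THIS_MODULE,
        .ctr    = example_ctr,
        .dtr    = example_dtr,
        .map    = example_map,
};

static int __init example_init(void)
{
        return dm_register_target(&example_target);
}

static void __exit example_exit(void)
{
        dm_unregister_target(&example_target);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");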
9411 --- linux-2.4.21/include/linux/dm-ioctl.h       Thu Jan  1 01:00:00 1970
9412 +++ linux/include/linux/dm-ioctl.h      Wed Aug 20 14:41:38 2003
9413 @@ -0,0 +1,237 @@
9414 +/*
9415 + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
9416 + *
9417 + * This file is released under the LGPL.
9418 + */
9419 +
9420 +#ifndef _LINUX_DM_IOCTL_H
9421 +#define _LINUX_DM_IOCTL_H
9422 +
9423 +#include <linux/types.h>
9424 +
9425 +#define DM_DIR "mapper"                /* Slashes not supported */
9426 +#define DM_MAX_TYPE_NAME 16
9427 +#define DM_NAME_LEN 128
9428 +#define DM_UUID_LEN 129
9429 +
9430 +/*
9431 + * A traditional ioctl interface for the device mapper.
9432 + *
9433 + * Each device can have two tables associated with it, an
9434 + * 'active' table which is the one currently used by io passing
9435 + * through the device, and an 'inactive' one which is a table
9436 + * that is being prepared as a replacement for the 'active' one.
9437 + *
9438 + * DM_VERSION:
9439 + * Just get the version information for the ioctl interface.
9440 + *
9441 + * DM_REMOVE_ALL:
9442 + * Remove all dm devices, destroy all tables.  Only really used
9443 + * for debug.
9444 + *
9445 + * DM_LIST_DEVICES:
9446 + * Get a list of all the dm device names.
9447 + *
9448 + * DM_DEV_CREATE:
9449 + * Create a new device.  Neither the 'active' nor the 'inactive'
9450 + * table slot will be filled.  The device will be in suspended state
9451 + * after creation, however any io to the device will get errored
9452 + * since it will be out-of-bounds.
9453 + *
9454 + * DM_DEV_REMOVE:
9455 + * Remove a device, destroy any tables.
9456 + *
9457 + * DM_DEV_RENAME:
9458 + * Rename a device.
9459 + *
9460 + * DM_DEV_SUSPEND:
9461 + * This performs both suspend and resume, depending which flag is
9462 + * passed in.
9463 + * Suspend: This command will not return until all pending io to
9464 + * the device has completed.  Further io will be deferred until
9465 + * the device is resumed.
9466 + * Resume: It is no longer an error to issue this command on an
9467 + * unsuspended device.  If a table is present in the 'inactive'
9468 + * slot, it will be moved to the active slot, then the old table
9469 + * from the active slot will be _destroyed_.  Finally the device
9470 + * is resumed.
9471 + *
9472 + * DM_DEV_STATUS:
9473 + * Retrieves the status for the table in the 'active' slot.
9474 + *
9475 + * DM_DEV_WAIT:
9476 + * Wait for a significant event to occur to the device.  This
9477 + * could either be caused by an event triggered by one of the
9478 + * targets of the table in the 'active' slot, or a table change.
9479 + *
9480 + * DM_TABLE_LOAD:
9481 + * Load a table into the 'inactive' slot for the device.  The
9482 + * device does _not_ need to be suspended prior to this command.
9483 + *
9484 + * DM_TABLE_CLEAR:
9485 + * Destroy any table in the 'inactive' slot (ie. abort).
9486 + *
9487 + * DM_TABLE_DEPS:
9488 + * Return a set of device dependencies for the 'active' table.
9489 + *
9490 + * DM_TABLE_STATUS:
9491 + * Return the targets status for the 'active' table.
9492 + */
9493 +
9494 +/*
9495 + * All ioctl arguments consist of a single chunk of memory, with
9496 + * this structure at the start.  If a uuid is specified any
9497 + * lookup (eg. for a DM_INFO) will be done on that, *not* the
9498 + * name.
9499 + */
9500 +struct dm_ioctl {
9501 +       /*
9502 +        * The version number is made up of three parts:
9503 +        * major - no backward or forward compatibility,
9504 +        * minor - only backwards compatible,
9505 +        * patch - both backwards and forwards compatible.
9506 +        *
9507 +        * All clients of the ioctl interface should fill in the
9508 +        * version number of the interface that they were
9509 +        * compiled with.
9510 +        *
9511 +        * All recognised ioctl commands (ie. those that don't
9512 +        * return -ENOTTY) fill out this field, even if the
9513 +        * command failed.
9514 +        */
9515 +       uint32_t version[3];    /* in/out */
9516 +       uint32_t data_size;     /* total size of data passed in
9517 +                                * including this struct */
9518 +
9519 +       uint32_t data_start;    /* offset to start of data
9520 +                                * relative to start of this struct */
9521 +
9522 +       uint32_t target_count;  /* in/out */
9523 +       int32_t open_count;     /* out */
9524 +       uint32_t flags;         /* in/out */
9525 +       uint32_t event_nr;      /* in/out */
9526 +       uint32_t padding;
9527 +
9528 +       uint64_t dev;           /* in/out */
9529 +
9530 +       char name[DM_NAME_LEN]; /* device name */
9531 +       char uuid[DM_UUID_LEN]; /* unique identifier for
9532 +                                * the block device */
9533 +};
9534 +
9535 +/*
9536 + * Used to specify tables.  These structures appear after the
9537 + * dm_ioctl.
9538 + */
9539 +struct dm_target_spec {
9540 +       uint64_t sector_start;
9541 +       uint64_t length;
9542 +       int32_t status;         /* used when reading from kernel only */
9543 +
9544 +       /*
9545 +        * Offset in bytes (from the start of this struct) to
9546 +        * next target_spec.
9547 +        */
9548 +       uint32_t next;
9549 +
9550 +       char target_type[DM_MAX_TYPE_NAME];
9551 +
9552 +       /*
9553 +        * Parameter string starts immediately after this object.
9554 +        * Be careful to add padding after string to ensure correct
9555 +        * alignment of subsequent dm_target_spec.
9556 +        */
9557 +};
9558 +
9559 +/*
9560 + * Used to retrieve the target dependencies.
9561 + */
9562 +struct dm_target_deps {
9563 +       uint32_t count;         /* Array size */
9564 +       uint32_t padding;       /* unused */
9565 +       uint64_t dev[0];        /* out */
9566 +};
9567 +
9568 +/*
9569 + * Used to get a list of all dm devices.
9570 + */
9571 +struct dm_name_list {
9572 +       uint64_t dev;
9573 +       uint32_t next;          /* offset to the next record from
9574 +                                  the _start_ of this */
9575 +       char name[0];
9576 +};
9577 +
9578 +/*
9579 + * If you change this make sure you make the corresponding change
9580 + * to dm-ioctl.c:lookup_ioctl()
9581 + */
9582 +enum {
9583 +       /* Top level cmds */
9584 +       DM_VERSION_CMD = 0,
9585 +       DM_REMOVE_ALL_CMD,
9586 +       DM_LIST_DEVICES_CMD,
9587 +
9588 +       /* device level cmds */
9589 +       DM_DEV_CREATE_CMD,
9590 +       DM_DEV_REMOVE_CMD,
9591 +       DM_DEV_RENAME_CMD,
9592 +       DM_DEV_SUSPEND_CMD,
9593 +       DM_DEV_STATUS_CMD,
9594 +       DM_DEV_WAIT_CMD,
9595 +
9596 +       /* Table level cmds */
9597 +       DM_TABLE_LOAD_CMD,
9598 +       DM_TABLE_CLEAR_CMD,
9599 +       DM_TABLE_DEPS_CMD,
9600 +       DM_TABLE_STATUS_CMD,
9601 +};
9602 +
9603 +#define DM_IOCTL 0xfd
9604 +
9605 +#define DM_VERSION       _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
9606 +#define DM_REMOVE_ALL    _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
9607 +#define DM_LIST_DEVICES  _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
9608 +
9609 +#define DM_DEV_CREATE    _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
9610 +#define DM_DEV_REMOVE    _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
9611 +#define DM_DEV_RENAME    _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
9612 +#define DM_DEV_SUSPEND   _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
9613 +#define DM_DEV_STATUS    _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
9614 +#define DM_DEV_WAIT      _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
9615 +
9616 +#define DM_TABLE_LOAD    _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
9617 +#define DM_TABLE_CLEAR   _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
9618 +#define DM_TABLE_DEPS    _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
9619 +#define DM_TABLE_STATUS  _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
9620 +
9621 +#define DM_VERSION_MAJOR       4
9622 +#define DM_VERSION_MINOR       0
9623 +#define DM_VERSION_PATCHLEVEL  3
9624 +#define DM_VERSION_EXTRA       "-ioctl (2003-08-22)"
9625 +
9626 +/* Status bits */
9627 +#define DM_READONLY_FLAG       (1 << 0) /* In/Out */
9628 +#define DM_SUSPEND_FLAG                (1 << 1) /* In/Out */
9629 +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
9630 +
9631 +/*
9632 + * Flag passed into ioctl STATUS command to get table information
9633 + * rather than current status.
9634 + */
9635 +#define DM_STATUS_TABLE_FLAG   (1 << 4) /* In */
9636 +
9637 +/*
9638 + * Flags that indicate whether a table is present in either of
9639 + * the two table slots that a device has.
9640 + */
9641 +#define DM_ACTIVE_PRESENT_FLAG   (1 << 5) /* Out */
9642 +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
9643 +
9644 +/*
9645 + * Indicates that the buffer passed in wasn't big enough for the
9646 + * results.
9647 + */
9648 +#define DM_BUFFER_FULL_FLAG    (1 << 8) /* Out */
9649 +
9650 +#endif                         /* _LINUX_DM_IOCTL_H */
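Userspace drives this interface by placing a struct dm_ioctl at the start of a single buffer and passing it to ioctl() on the device-mapper control node.  A minimal, hedged example that just queries the interface version (the /dev/mapper/control path is an assumption about how the control node is created; the header itself only fixes DM_DIR):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/dm-ioctl.h>

int main(void)
{
        struct dm_ioctl dmi;
        int fd = open("/dev/mapper/control", O_RDWR);   /* path assumed */

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&dmi, 0, sizeof(dmi));
        dmi.version[0] = DM_VERSION_MAJOR;      /* version we built against */
        dmi.version[1] = DM_VERSION_MINOR;
        dmi.version[2] = DM_VERSION_PATCHLEVEL;
        dmi.data_size = sizeof(dmi);            /* no payload beyond the header */
        dmi.data_start = sizeof(dmi);

        if (ioctl(fd, DM_VERSION, &dmi) < 0) {
                perror("DM_VERSION");
                close(fd);
                return 1;
        }

        printf("dm ioctl interface %u.%u.%u\n",
               dmi.version[0], dmi.version[1], dmi.version[2]);
        close(fd);
        return 0;
}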
9651 --- linux-2.4.21/include/linux/fs.h     Fri Jun 13 16:32:51 2003
9652 +++ linux/include/linux/fs.h    Wed Aug 20 14:41:32 2003
9653 @@ -263,7 +263,7 @@
9654         struct page *b_page;            /* the page this bh is mapped to */
9655         void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
9656         void *b_private;                /* reserved for b_end_io */
9657 -
9658 +       void *b_journal_head;           /* ext3 journal_heads */
9659         unsigned long b_rsector;        /* Real buffer location on disk */
9660         wait_queue_head_t b_wait;
9661  
9662 --- linux-2.4.21/include/linux/jbd.h    Fri Jun 13 16:32:51 2003
9663 +++ linux/include/linux/jbd.h   Wed Aug 20 14:41:32 2003
9664 @@ -311,7 +311,7 @@
9665  
9666  static inline struct journal_head *bh2jh(struct buffer_head *bh)
9667  {
9668 -       return bh->b_private;
9669 +       return bh->b_journal_head;
9670  }
9671  
9672  #define HAVE_JOURNAL_CALLBACK_STATUS
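The b_journal_head additions above (buffer.c, fs.h, jbd.h) exist because b_private is documented as reserved for whoever installs b_end_io.  Drivers in the device-mapper stack install their own completion handlers on the buffer heads they submit and keep per-io context in b_private, so jbd, which attaches to buffer heads it does not own, gets its own field instead.  A hedged sketch of the pattern that relies on that reservation; the context structure and callbacks are hypothetical:

#include <linux/fs.h>
#include <linux/blkdev.h>

struct my_io {                          /* hypothetical per-io context */
        void (*notify)(int err, void *data);
        void *data;
};

static void my_end_io(struct buffer_head *bh, int uptodate)
{
        /* we set b_end_io on this bh, so b_private is ours to use */
        struct my_io *io = (struct my_io *) bh->b_private;

        io->notify(uptodate ? 0 : -EIO, io->data);
}

static void my_submit(struct buffer_head *bh, int rw, struct my_io *io)
{
        bh->b_end_io = my_end_io;
        bh->b_private = io;
        generic_make_request(rw, bh);
}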
9673 --- linux-2.4.21/include/linux/mempool.h        Thu Jan  1 01:00:00 1970
9674 +++ linux/include/linux/mempool.h       Wed Aug 20 14:41:48 2003
9675 @@ -0,0 +1,31 @@
9676 +/*
9677 + * memory buffer pool support
9678 + */
9679 +#ifndef _LINUX_MEMPOOL_H
9680 +#define _LINUX_MEMPOOL_H
9681 +
9682 +#include <linux/list.h>
9683 +#include <linux/wait.h>
9684 +
9685 +struct mempool_s;
9686 +typedef struct mempool_s mempool_t;
9687 +
9688 +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
9689 +typedef void (mempool_free_t)(void *element, void *pool_data);
9690 +
9691 +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9692 +                                mempool_free_t *free_fn, void *pool_data);
9693 +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
9694 +extern void mempool_destroy(mempool_t *pool);
9695 +extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
9696 +extern void mempool_free(void *element, mempool_t *pool);
9697 +
9698 +/*
9699 + * A mempool_alloc_t and mempool_free_t that get the memory from
9700 + * a slab that is passed in through pool_data.
9701 + */
9702 +void *mempool_alloc_slab(int gfp_mask, void *pool_data);
9703 +void mempool_free_slab(void *element, void *pool_data);
9704 +
9705 +
9706 +#endif /* _LINUX_MEMPOOL_H */
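A short usage sketch for the slab helpers declared above, mirroring what kcopyd's jobs_init() does earlier in this patch; the cache name and structure here are hypothetical:

#include <linux/slab.h>
#include <linux/mempool.h>

struct my_io {                          /* hypothetical per-io bookkeeping */
        struct list_head list;
        void *context;
};

static kmem_cache_t *_io_cache;
static mempool_t *_io_pool;

static int io_pool_init(void)
{
        _io_cache = kmem_cache_create("my-io", sizeof(struct my_io),
                                      0, 0, NULL, NULL);
        if (!_io_cache)
                return -ENOMEM;

        /* 16 preallocated elements guarantee forward progress even
         * when kmem_cache_alloc() would fail under memory pressure */
        _io_pool = mempool_create(16, mempool_alloc_slab,
                                  mempool_free_slab, _io_cache);
        if (!_io_pool) {
                kmem_cache_destroy(_io_cache);
                return -ENOMEM;
        }
        return 0;
}

static void io_pool_exit(void)
{
        mempool_destroy(_io_pool);
        kmem_cache_destroy(_io_cache);
}

/* io path: mempool_alloc(_io_pool, GFP_NOIO) never fails from process
 * context; release with mempool_free(io, _io_pool). */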
9707 --- linux-2.4.21/include/linux/vmalloc.h        Fri Jan 10 16:35:58 2003
9708 +++ linux/include/linux/vmalloc.h       Wed Aug 20 14:41:57 2003
9709 @@ -26,6 +26,7 @@
9710  extern void vmfree_area_pages(unsigned long address, unsigned long size);
9711  extern int vmalloc_area_pages(unsigned long address, unsigned long size,
9712                                int gfp_mask, pgprot_t prot);
9713 +extern void *vcalloc(unsigned long nmemb, unsigned long elem_size);
9714  
9715  /*
9716   *     Allocate any pages
9717 --- linux-2.4.21/kernel/ksyms.c Fri Jun 13 16:32:52 2003
9718 +++ linux/kernel/ksyms.c        Wed Aug 20 14:41:57 2003
9719 @@ -112,6 +112,7 @@
9720  EXPORT_SYMBOL(vfree);
9721  EXPORT_SYMBOL(__vmalloc);
9722  EXPORT_SYMBOL(vmalloc_to_page);
9723 +EXPORT_SYMBOL(vcalloc);
9724  EXPORT_SYMBOL(mem_map);
9725  EXPORT_SYMBOL(remap_page_range);
9726  EXPORT_SYMBOL(max_mapnr);
9727 --- linux-2.4.21/mm/Makefile    Fri Jan 10 16:36:02 2003
9728 +++ linux/mm/Makefile   Wed Aug 20 14:41:48 2003
9729 @@ -9,12 +9,12 @@
9730  
9731  O_TARGET := mm.o
9732  
9733 -export-objs := shmem.o filemap.o memory.o page_alloc.o
9734 +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
9735  
9736  obj-y   := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
9737             vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
9738             page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
9739 -           shmem.o
9740 +           shmem.o mempool.o
9741  
9742  obj-$(CONFIG_HIGHMEM) += highmem.o
9743  
9744 --- linux-2.4.21/mm/filemap.c   Fri Jun 13 16:33:25 2003
9745 +++ linux/mm/filemap.c  Wed Aug 20 14:41:53 2003
9746 @@ -1704,8 +1704,10 @@
9747                         retval = generic_file_direct_IO(READ, filp, buf, count, pos);
9748                         if (retval > 0)
9749                                 *ppos = pos + retval;
9750 +
9751                 }
9752 -               UPDATE_ATIME(filp->f_dentry->d_inode);
9753 +               if (!S_ISBLK(inode->i_mode))
9754 +                       UPDATE_ATIME(filp->f_dentry->d_inode);
9755                 goto out;
9756         }
9757  }
9758 --- linux-2.4.21/mm/mempool.c   Thu Jan  1 01:00:00 1970
9759 +++ linux/mm/mempool.c  Wed Aug 20 14:41:48 2003
9760 @@ -0,0 +1,299 @@
9761 +/*
9762 + *  linux/mm/mempool.c
9763 + *
9764 + *  memory buffer pool support. Such pools are mostly used
9765 + *  for guaranteed, deadlock-free memory allocations during
9766 + *  extreme VM load.
9767 + *
9768 + *  started by Ingo Molnar, Copyright (C) 2001
9769 + */
9770 +
9771 +#include <linux/mm.h>
9772 +#include <linux/slab.h>
9773 +#include <linux/module.h>
9774 +#include <linux/mempool.h>
9775 +
9776 +struct mempool_s {
9777 +       spinlock_t lock;
9778 +       int min_nr;             /* nr of elements at *elements */
9779 +       int curr_nr;            /* Current nr of elements at *elements */
9780 +       void **elements;
9781 +
9782 +       void *pool_data;
9783 +       mempool_alloc_t *alloc;
9784 +       mempool_free_t *free;
9785 +       wait_queue_head_t wait;
9786 +};
9787 +
9788 +static void add_element(mempool_t *pool, void *element)
9789 +{
9790 +       BUG_ON(pool->curr_nr >= pool->min_nr);
9791 +       pool->elements[pool->curr_nr++] = element;
9792 +}
9793 +
9794 +static void *remove_element(mempool_t *pool)
9795 +{
9796 +       BUG_ON(pool->curr_nr <= 0);
9797 +       return pool->elements[--pool->curr_nr];
9798 +}
9799 +
9800 +static void free_pool(mempool_t *pool)
9801 +{
9802 +       while (pool->curr_nr) {
9803 +               void *element = remove_element(pool);
9804 +               pool->free(element, pool->pool_data);
9805 +       }
9806 +       kfree(pool->elements);
9807 +       kfree(pool);
9808 +}
9809 +
9810 +/**
9811 + * mempool_create - create a memory pool
9812 + * @min_nr:    the minimum number of elements guaranteed to be
9813 + *             allocated for this pool.
9814 + * @alloc_fn:  user-defined element-allocation function.
9815 + * @free_fn:   user-defined element-freeing function.
9816 + * @pool_data: optional private data available to the user-defined functions.
9817 + *
9818 + * this function creates and allocates a guaranteed size, preallocated
9819 + * memory pool. The pool can be used from the mempool_alloc and mempool_free
9820 + * functions. This function might sleep. Both the alloc_fn() and the free_fn()
9821 + * functions might sleep - as long as the mempool_alloc function is not called
9822 + * from IRQ contexts.
9823 + */
9824 +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
9825 +                               mempool_free_t *free_fn, void *pool_data)
9826 +{
9827 +       mempool_t *pool;
9828 +
9829 +       pool = kmalloc(sizeof(*pool), GFP_KERNEL);
9830 +       if (!pool)
9831 +               return NULL;
9832 +       memset(pool, 0, sizeof(*pool));
9833 +       pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
9834 +       if (!pool->elements) {
9835 +               kfree(pool);
9836 +               return NULL;
9837 +       }
9838 +       spin_lock_init(&pool->lock);
9839 +       pool->min_nr = min_nr;
9840 +       pool->pool_data = pool_data;
9841 +       init_waitqueue_head(&pool->wait);
9842 +       pool->alloc = alloc_fn;
9843 +       pool->free = free_fn;
9844 +
9845 +       /*
9846 +        * First pre-allocate the guaranteed number of buffers.
9847 +        */
9848 +       while (pool->curr_nr < pool->min_nr) {
9849 +               void *element;
9850 +
9851 +               element = pool->alloc(GFP_KERNEL, pool->pool_data);
9852 +               if (unlikely(!element)) {
9853 +                       free_pool(pool);
9854 +                       return NULL;
9855 +               }
9856 +               add_element(pool, element);
9857 +       }
9858 +       return pool;
9859 +}
9860 +
9861 +/**
9862 + * mempool_resize - resize an existing memory pool
9863 + * @pool:       pointer to the memory pool which was allocated via
9864 + *              mempool_create().
9865 + * @new_min_nr: the new minimum number of elements guaranteed to be
9866 + *              allocated for this pool.
9867 + * @gfp_mask:   the usual allocation bitmask.
9868 + *
9869 + * This function shrinks/grows the pool. In the case of growing,
9870 + * it cannot be guaranteed that the pool will be grown to the new
9871 + * size immediately, but new mempool_free() calls will refill it.
9872 + *
9873 + * Note, the caller must guarantee that no mempool_destroy is called
9874 + * while this function is running. mempool_alloc() & mempool_free()
9875 + * might be called (eg. from IRQ contexts) while this function executes.
9876 + */
9877 +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
9878 +{
9879 +       void *element;
9880 +       void **new_elements;
9881 +       unsigned long flags;
9882 +
9883 +       BUG_ON(new_min_nr <= 0);
9884 +
9885 +       spin_lock_irqsave(&pool->lock, flags);
9886 +       if (new_min_nr < pool->min_nr) {
9887 +               while (pool->curr_nr > new_min_nr) {
9888 +                       element = remove_element(pool);
9889 +                       spin_unlock_irqrestore(&pool->lock, flags);
9890 +                       pool->free(element, pool->pool_data);
9891 +                       spin_lock_irqsave(&pool->lock, flags);
9892 +               }
9893 +               pool->min_nr = new_min_nr;
9894 +               goto out_unlock;
9895 +       }
9896 +       spin_unlock_irqrestore(&pool->lock, flags);
9897 +
9898 +       /* Grow the pool */
9899 +       new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
9900 +       if (!new_elements)
9901 +               return -ENOMEM;
9902 +
9903 +       spin_lock_irqsave(&pool->lock, flags);
9904 +       memcpy(new_elements, pool->elements,
9905 +                       pool->curr_nr * sizeof(*new_elements));
9906 +       kfree(pool->elements);
9907 +       pool->elements = new_elements;
9908 +       pool->min_nr = new_min_nr;
9909 +
9910 +       while (pool->curr_nr < pool->min_nr) {
9911 +               spin_unlock_irqrestore(&pool->lock, flags);
9912 +               element = pool->alloc(gfp_mask, pool->pool_data);
9913 +               if (!element)
9914 +                       goto out;
9915 +               spin_lock_irqsave(&pool->lock, flags);
9916 +               if (pool->curr_nr < pool->min_nr)
9917 +                       add_element(pool, element);
9918 +               else {
9919 +                       /* Raced: pool already refilled; return the element
+                        * through the pool's own free fn, not kfree.
+                        */
+                       spin_unlock_irqrestore(&pool->lock, flags);
+                       pool->free(element, pool->pool_data);
+                       goto out;
+               }
9920 +       }
9921 +out_unlock:
9922 +       spin_unlock_irqrestore(&pool->lock, flags);
9923 +out:
9924 +       return 0;
9925 +}
9926 +
9927 +/**
9928 + * mempool_destroy - deallocate a memory pool
9929 + * @pool:      pointer to the memory pool which was allocated via
9930 + *             mempool_create().
9931 + *
9932 + * this function only sleeps if the free_fn() function sleeps. The caller
9933 + * has to guarantee that all elements have been returned to the pool (ie:
9934 + * freed) prior to calling mempool_destroy().
9935 + */
9936 +void mempool_destroy(mempool_t *pool)
9937 +{
9938 +       if (pool->curr_nr != pool->min_nr)
9939 +               BUG();          /* There were outstanding elements */
9940 +       free_pool(pool);
9941 +}
9942 +
9943 +/**
9944 + * mempool_alloc - allocate an element from a specific memory pool
9945 + * @pool:      pointer to the memory pool which was allocated via
9946 + *             mempool_create().
9947 + * @gfp_mask:  the usual allocation bitmask.
9948 + *
9949 + * this function only sleeps if the alloc_fn function sleeps or
9950 + * returns NULL. Note that due to preallocation, this function
9951 + * *never* fails when called from process contexts. (it might
9952 + * fail if called from an IRQ context.)
9953 + */
9954 +void * mempool_alloc(mempool_t *pool, int gfp_mask)
9955 +{
9956 +       void *element;
9957 +       unsigned long flags;
9958 +       int curr_nr;
9959 +       DECLARE_WAITQUEUE(wait, current);
9960 +       int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
9961 +
9962 +repeat_alloc:
9963 +       element = pool->alloc(gfp_nowait, pool->pool_data);
9964 +       if (likely(element != NULL))
9965 +               return element;
9966 +
9967 +       /*
9968 +        * If the pool is less than 50% full then try harder
9969 +        * to allocate an element:
9970 +        */
9971 +       if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
9972 +               element = pool->alloc(gfp_mask, pool->pool_data);
9973 +               if (likely(element != NULL))
9974 +                       return element;
9975 +       }
9976 +
9977 +       /*
9978 +        * Kick the VM at this point.
9979 +        */
9980 +       wakeup_bdflush();
9981 +
9982 +       spin_lock_irqsave(&pool->lock, flags);
9983 +       if (likely(pool->curr_nr)) {
9984 +               element = remove_element(pool);
9985 +               spin_unlock_irqrestore(&pool->lock, flags);
9986 +               return element;
9987 +       }
9988 +       spin_unlock_irqrestore(&pool->lock, flags);
9989 +
9990 +       /* We must not sleep in the GFP_ATOMIC case */
9991 +       if (gfp_mask == gfp_nowait)
9992 +               return NULL;
9993 +
9994 +       run_task_queue(&tq_disk);
9995 +
9996 +       add_wait_queue_exclusive(&pool->wait, &wait);
9997 +       set_task_state(current, TASK_UNINTERRUPTIBLE);
9998 +
9999 +       spin_lock_irqsave(&pool->lock, flags);
10000 +       curr_nr = pool->curr_nr;
10001 +       spin_unlock_irqrestore(&pool->lock, flags);
10002 +
10003 +       if (!curr_nr)
10004 +               schedule();
10005 +
10006 +       current->state = TASK_RUNNING;
10007 +       remove_wait_queue(&pool->wait, &wait);
10008 +
10009 +       goto repeat_alloc;
10010 +}
10011 +
10012 +/**
10013 + * mempool_free - return an element to the pool.
10014 + * @element:   pool element pointer.
10015 + * @pool:      pointer to the memory pool which was allocated via
10016 + *             mempool_create().
10017 + *
10018 + * this function only sleeps if the free_fn() function sleeps.
10019 + */
10020 +void mempool_free(void *element, mempool_t *pool)
10021 +{
10022 +       unsigned long flags;
10023 +
10024 +       if (pool->curr_nr < pool->min_nr) {
10025 +               spin_lock_irqsave(&pool->lock, flags);
10026 +               if (pool->curr_nr < pool->min_nr) {
10027 +                       add_element(pool, element);
10028 +                       spin_unlock_irqrestore(&pool->lock, flags);
10029 +                       wake_up(&pool->wait);
10030 +                       return;
10031 +               }
10032 +               spin_unlock_irqrestore(&pool->lock, flags);
10033 +       }
10034 +       pool->free(element, pool->pool_data);
10035 +}
10036 +
10037 +/*
10038 + * A commonly used alloc and free fn.
10039 + */
10040 +void *mempool_alloc_slab(int gfp_mask, void *pool_data)
10041 +{
10042 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10043 +       return kmem_cache_alloc(mem, gfp_mask);
10044 +}
10045 +
10046 +void mempool_free_slab(void *element, void *pool_data)
10047 +{
10048 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
10049 +       kmem_cache_free(mem, element);
10050 +}
10051 +
10052 +
10053 +EXPORT_SYMBOL(mempool_create);
10054 +EXPORT_SYMBOL(mempool_resize);
10055 +EXPORT_SYMBOL(mempool_destroy);
10056 +EXPORT_SYMBOL(mempool_alloc);
10057 +EXPORT_SYMBOL(mempool_free);
10058 +EXPORT_SYMBOL(mempool_alloc_slab);
10059 +EXPORT_SYMBOL(mempool_free_slab);
10060 --- linux-2.4.21/mm/vmalloc.c   Fri Jun 13 16:33:25 2003
10061 +++ linux/mm/vmalloc.c  Wed Aug 20 14:41:57 2003
10062 @@ -327,3 +327,22 @@
10063         read_unlock(&vmlist_lock);
10064         return buf - buf_start;
10065  }
10066 +
10067 +void *vcalloc(unsigned long nmemb, unsigned long elem_size)
10068 +{
10069 +       unsigned long size;
10070 +       void *addr;
10071 +
10072 +       /*
10073 +        * Check that we're not going to overflow.
10074 +        */
10075 +       if (nmemb > (ULONG_MAX / elem_size))
10076 +               return NULL;
10077 +
10078 +       size = nmemb * elem_size;
10079 +       addr = vmalloc(size);
10080 +       if (addr)
10081 +               memset(addr, 0, size);
10082 +
10083 +       return addr;
10084 +}
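vcalloc() is exported via the ksyms.c hunk above so that modules can get large zeroed allocations without worrying about kmalloc size limits (dm-table's alloc_targets() is presumably the intended consumer).  A tiny hypothetical usage sketch:

#include <linux/vmalloc.h>

struct my_entry;                        /* hypothetical */

static struct my_entry **alloc_entry_table(unsigned long n)
{
        /* zeroed array of n pointers; release with vfree() */
        return vcalloc(n, sizeof(struct my_entry *));
}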
10085 Supply #targets when creating a table to avoid needing to extend it later.
10086 --- linux-2.4.21/drivers/md/dm-ioctl.c  Mon Aug 18 21:24:26 2003
10087 +++ linux/drivers/md/dm-ioctl.c Fri Aug 22 13:49:01 2003
10088 @@ -764,7 +764,7 @@
10089         struct hash_cell *hc;
10090         struct dm_table *t;
10091  
10092 -       r = dm_table_create(&t, get_mode(param));
10093 +       r = dm_table_create(&t, get_mode(param), param->target_count);
10094         if (r)
10095                 return r;
10096  
10097 --- linux-2.4.21/drivers/md/dm-table.c  Tue Aug 19 15:43:50 2003
10098 +++ linux/drivers/md/dm-table.c Fri Aug 22 14:48:50 2003
10099 @@ -148,7 +148,7 @@
10100         return 0;
10101  }
10102  
10103 -int dm_table_create(struct dm_table **result, int mode)
10104 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
10105  {
10106         struct dm_table *t = kmalloc(sizeof(*t), GFP_NOIO);
10107  
10108 @@ -159,8 +159,10 @@
10109         INIT_LIST_HEAD(&t->devices);
10110         atomic_set(&t->holders, 1);
10111  
10112 -       /* allocate a single nodes worth of targets to begin with */
10113 -       if (alloc_targets(t, KEYS_PER_NODE)) {
10114 +       if (!num_targets)
10115 +               num_targets = KEYS_PER_NODE;
10116 +
10117 +       if (alloc_targets(t, num_targets)) {
10118                 kfree(t);
10119                 t = NULL;
10120                 return -ENOMEM;
10121 --- linux-2.4.21/drivers/md/dm.h        Sat Jul 12 17:06:52 2003
10122 +++ linux/drivers/md/dm.h       Fri Aug 22 13:50:19 2003
10123 @@ -96,7 +96,7 @@
10124   * Functions for manipulating a table.  Tables are also reference
10125   * counted.
10126   *---------------------------------------------------------------*/
10127 -int dm_table_create(struct dm_table **result, int mode);
10128 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
10129  
10130  void dm_table_get(struct dm_table *t);
10131  void dm_table_put(struct dm_table *t);