1 diff -urN linux-2.4.24.org/arch/mips64/kernel/ioctl32.c linux-2.4.24/arch/mips64/kernel/ioctl32.c
2 --- linux-2.4.24.org/arch/mips64/kernel/ioctl32.c       2004-01-18 14:59:17.636181134 +0100
3 +++ linux-2.4.24/arch/mips64/kernel/ioctl32.c   2004-01-18 15:01:17.736881093 +0100
4 @@ -62,6 +62,7 @@
5  
6  #include <linux/mtd/mtd.h>
7  #include <linux/serial.h>
8 +#include <linux/dm-ioctl.h>
9  
10  #ifdef CONFIG_SIBYTE_TBPROF
11  #include <asm/sibyte/trace_prof.h>
12 @@ -2324,6 +2325,22 @@
13         IOCTL32_DEFAULT(RESTART_ARRAY_RW),
14  #endif /* CONFIG_MD */
15  
16 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
17 +       IOCTL32_DEFAULT(DM_VERSION),
18 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
19 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
20 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
21 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
22 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
23 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
24 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
25 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
26 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
27 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
28 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
29 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
30 +#endif /* CONFIG_BLK_DEV_DM */
31 +
32  #ifdef CONFIG_SIBYTE_TBPROF
33         IOCTL32_DEFAULT(SBPROF_ZBSTART),
34         IOCTL32_DEFAULT(SBPROF_ZBSTOP),
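Every command in this table exchanges a single struct dm_ioctl whose fields are fixed-width integers and character arrays, with any variable-length payload carried inline after data_start; nothing is passed by pointer, which is why the entries can use IOCTL32_DEFAULT with no argument translation. A minimal userspace sketch of the calling convention (illustrative, not part of the patch; it assumes the patched <linux/dm-ioctl.h> is installed and that the control node /dev/mapper/control already exists):

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/dm-ioctl.h>

int main(void)
{
	struct dm_ioctl io;
	int fd = open("/dev/mapper/control", O_RDWR);

	if (fd < 0) {
		perror("open /dev/mapper/control");
		return 1;
	}

	/* Callers must declare the interface version they speak
	 * and the size of the buffer they pass in. */
	memset(&io, 0, sizeof(io));
	io.version[0] = DM_VERSION_MAJOR;
	io.version[1] = DM_VERSION_MINOR;
	io.version[2] = DM_VERSION_PATCHLEVEL;
	io.data_size = sizeof(io);

	if (ioctl(fd, DM_VERSION, &io) < 0) {
		perror("DM_VERSION");
		close(fd);
		return 1;
	}

	printf("device-mapper driver %u.%u.%u\n",
	       io.version[0], io.version[1], io.version[2]);
	close(fd);
	return 0;
}

The same convention, fill in version[] and data_size and then call ioctl(), applies to all thirteen commands registered above.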
35 diff -urN linux-2.4.24.org/arch/parisc/kernel/ioctl32.c linux-2.4.24/arch/parisc/kernel/ioctl32.c
36 --- linux-2.4.24.org/arch/parisc/kernel/ioctl32.c       2004-01-18 14:59:20.929484849 +0100
37 +++ linux-2.4.24/arch/parisc/kernel/ioctl32.c   2004-01-18 15:01:17.742879834 +0100
38 @@ -55,6 +55,7 @@
39  #define max max
40  #include <linux/lvm.h>
41  #endif /* LVM */
42 +#include <linux/dm-ioctl.h>
43  
44  #include <scsi/scsi.h>
45  /* Ugly hack. */
46 @@ -3423,6 +3424,22 @@
47  COMPATIBLE_IOCTL(LV_BMAP)
48  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
49  #endif /* LVM */
50 +/* Device-Mapper */
51 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
52 +COMPATIBLE_IOCTL(DM_VERSION)
53 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
54 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
55 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
56 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
57 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
58 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
59 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
60 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
61 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
62 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
63 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
64 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
65 +#endif /* CONFIG_BLK_DEV_DM */
66  #if defined(CONFIG_DRM) || defined(CONFIG_DRM_MODULE)
67  COMPATIBLE_IOCTL(DRM_IOCTL_GET_MAGIC)
68  COMPATIBLE_IOCTL(DRM_IOCTL_IRQ_BUSID)
69 diff -urN linux-2.4.24.org/arch/ppc64/kernel/ioctl32.c linux-2.4.24/arch/ppc64/kernel/ioctl32.c
70 --- linux-2.4.24.org/arch/ppc64/kernel/ioctl32.c        2004-01-18 14:58:17.568907286 +0100
71 +++ linux-2.4.24/arch/ppc64/kernel/ioctl32.c    2004-01-18 15:01:17.754877316 +0100
72 @@ -66,6 +66,7 @@
73  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
74  #include <linux/lvm.h>
75  #endif /* LVM */
76 +#include <linux/dm-ioctl.h>
77  
78  #include <scsi/scsi.h>
79  /* Ugly hack. */
80 @@ -4408,6 +4409,22 @@
81  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG),
82  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS),
83  COMPATIBLE_IOCTL(NBD_DISCONNECT),
84 +/* device-mapper */
85 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
86 +COMPATIBLE_IOCTL(DM_VERSION),
87 +COMPATIBLE_IOCTL(DM_REMOVE_ALL),
88 +COMPATIBLE_IOCTL(DM_DEV_CREATE),
89 +COMPATIBLE_IOCTL(DM_DEV_REMOVE),
90 +COMPATIBLE_IOCTL(DM_TABLE_LOAD),
91 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND),
92 +COMPATIBLE_IOCTL(DM_DEV_RENAME),
93 +COMPATIBLE_IOCTL(DM_TABLE_DEPS),
94 +COMPATIBLE_IOCTL(DM_DEV_STATUS),
95 +COMPATIBLE_IOCTL(DM_TABLE_STATUS),
96 +COMPATIBLE_IOCTL(DM_DEV_WAIT),
97 +COMPATIBLE_IOCTL(DM_LIST_DEVICES),
98 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR),
99 +#endif /* CONFIG_BLK_DEV_DM */
100  /* Remove *PRIVATE in 2.5 */
101  COMPATIBLE_IOCTL(SIOCDEVPRIVATE),
102  COMPATIBLE_IOCTL(SIOCDEVPRIVATE+1),
103 diff -urN linux-2.4.24.org/arch/s390x/kernel/ioctl32.c linux-2.4.24/arch/s390x/kernel/ioctl32.c
104 --- linux-2.4.24.org/arch/s390x/kernel/ioctl32.c        2004-01-18 14:59:24.825661296 +0100
105 +++ linux-2.4.24/arch/s390x/kernel/ioctl32.c    2004-01-18 15:01:17.759876266 +0100
106 @@ -30,6 +30,7 @@
107  #include <linux/blk.h>
108  #include <linux/elevator.h>
109  #include <linux/raw.h>
110 +#include <linux/dm-ioctl.h>
111  #include <asm/types.h>
112  #include <asm/uaccess.h>
113  #include <asm/dasd.h>
114 @@ -627,6 +628,20 @@
115  
116         IOCTL32_DEFAULT(SIOCGSTAMP),
117  
118 +       IOCTL32_DEFAULT(DM_VERSION),
119 +       IOCTL32_DEFAULT(DM_REMOVE_ALL),
120 +       IOCTL32_DEFAULT(DM_DEV_CREATE),
121 +       IOCTL32_DEFAULT(DM_DEV_REMOVE),
122 +       IOCTL32_DEFAULT(DM_TABLE_LOAD),
123 +       IOCTL32_DEFAULT(DM_DEV_SUSPEND),
124 +       IOCTL32_DEFAULT(DM_DEV_RENAME),
125 +       IOCTL32_DEFAULT(DM_TABLE_DEPS),
126 +       IOCTL32_DEFAULT(DM_DEV_STATUS),
127 +       IOCTL32_DEFAULT(DM_TABLE_STATUS),
128 +       IOCTL32_DEFAULT(DM_DEV_WAIT),
129 +       IOCTL32_DEFAULT(DM_LIST_DEVICES),
130 +       IOCTL32_DEFAULT(DM_TABLE_CLEAR),
131 +
132         IOCTL32_DEFAULT(LOOP_SET_FD),
133         IOCTL32_DEFAULT(LOOP_CLR_FD),
134  
135 diff -urN linux-2.4.24.org/arch/sparc64/kernel/ioctl32.c linux-2.4.24/arch/sparc64/kernel/ioctl32.c
136 --- linux-2.4.24.org/arch/sparc64/kernel/ioctl32.c      2004-01-18 14:58:59.210079599 +0100
137 +++ linux-2.4.24/arch/sparc64/kernel/ioctl32.c  2004-01-18 15:01:17.768874378 +0100
138 @@ -56,6 +56,7 @@
139  #if defined(CONFIG_BLK_DEV_LVM) || defined(CONFIG_BLK_DEV_LVM_MODULE)
140  #include <linux/lvm.h>
141  #endif /* LVM */
142 +#include <linux/dm-ioctl.h>
143  
144  #include <scsi/scsi.h>
145  /* Ugly hack. */
146 @@ -5086,6 +5087,22 @@
147  COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
148  COMPATIBLE_IOCTL(NBD_SET_SIZE_BLOCKS)
149  COMPATIBLE_IOCTL(NBD_DISCONNECT)
150 +/* device-mapper */
151 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
152 +COMPATIBLE_IOCTL(DM_VERSION)
153 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
154 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
155 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
156 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
157 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
158 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
159 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
160 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
161 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
162 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
163 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
164 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
165 +#endif /* CONFIG_BLK_DEV_DM */
166  /* Linux-1394 */
167  #if defined(CONFIG_IEEE1394) || defined(CONFIG_IEEE1394_MODULE)
168  COMPATIBLE_IOCTL(AMDTP_IOC_CHANNEL)
169 diff -urN linux-2.4.24.org/arch/x86_64/ia32/ia32_ioctl.c linux-2.4.24/arch/x86_64/ia32/ia32_ioctl.c
170 --- linux-2.4.24.org/arch/x86_64/ia32/ia32_ioctl.c      2004-01-18 14:58:15.119427333 +0100
171 +++ linux-2.4.24/arch/x86_64/ia32/ia32_ioctl.c  2004-01-18 15:01:17.778872279 +0100
172 @@ -67,6 +67,7 @@
173  #define max max
174  #include <linux/lvm.h>
175  #endif /* LVM */
176 +#include <linux/dm-ioctl.h>
177  
178  #include <scsi/scsi.h>
179  /* Ugly hack. */
180 @@ -4051,6 +4052,22 @@
181  COMPATIBLE_IOCTL(LV_BMAP)
182  COMPATIBLE_IOCTL(LV_SNAPSHOT_USE_RATE)
183  #endif /* LVM */
184 +/* Device-Mapper */
185 +#if defined(CONFIG_BLK_DEV_DM) || defined(CONFIG_BLK_DEV_DM_MODULE)
186 +COMPATIBLE_IOCTL(DM_VERSION)
187 +COMPATIBLE_IOCTL(DM_REMOVE_ALL)
188 +COMPATIBLE_IOCTL(DM_DEV_CREATE)
189 +COMPATIBLE_IOCTL(DM_DEV_REMOVE)
190 +COMPATIBLE_IOCTL(DM_TABLE_LOAD)
191 +COMPATIBLE_IOCTL(DM_DEV_SUSPEND)
192 +COMPATIBLE_IOCTL(DM_DEV_RENAME)
193 +COMPATIBLE_IOCTL(DM_TABLE_DEPS)
194 +COMPATIBLE_IOCTL(DM_DEV_STATUS)
195 +COMPATIBLE_IOCTL(DM_TABLE_STATUS)
196 +COMPATIBLE_IOCTL(DM_DEV_WAIT)
197 +COMPATIBLE_IOCTL(DM_LIST_DEVICES)
198 +COMPATIBLE_IOCTL(DM_TABLE_CLEAR)
199 +#endif /* CONFIG_BLK_DEV_DM */
200  #ifdef CONFIG_AUTOFS_FS
201  COMPATIBLE_IOCTL(AUTOFS_IOC_READY)
202  COMPATIBLE_IOCTL(AUTOFS_IOC_FAIL)
203 diff -urN linux-2.4.24.org/Documentation/Configure.help linux-2.4.24/Documentation/Configure.help
204 --- linux-2.4.24.org/Documentation/Configure.help       2004-01-18 14:59:47.177940541 +0100
205 +++ linux-2.4.24/Documentation/Configure.help   2004-01-18 15:01:13.758716197 +0100
206 @@ -1952,6 +1952,20 @@
207    want), say M here and read <file:Documentation/modules.txt>.  The
208    module will be called lvm-mod.o.
209  
210 +Device-mapper support
211 +CONFIG_BLK_DEV_DM
212 +  Device-mapper is a low-level volume manager.  It works by allowing
213 +  people to specify mappings for ranges of logical sectors.  Various
214 +  mapping types are available; in addition, people may write their own
215 +  modules containing custom mappings if they wish.
216 +
217 +  Higher level volume managers such as LVM2 use this driver.
218 +
219 +  If you want to compile this as a module, say M here and read 
220 +  <file:Documentation/modules.txt>.  The module will be called dm-mod.o.
221 +
222 +  If unsure, say N.
223 +
224  Multiple devices driver support (RAID and LVM)
225  CONFIG_MD
226    Support multiple physical spindles through a single logical device.
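The help text above says device-mapper works by mapping ranges of logical sectors. Concretely, a mapped device is described by a table whose lines have the form <start_sector> <num_sectors> <target_type> <args...>. A purely illustrative two-line table (the partition names are hypothetical) that concatenates two 1 GiB partitions into one 2 GiB device using the linear target:

0       2097152 linear /dev/hda1 0
2097152 2097152 linear /dev/hdb1 0

Each line says: sectors <start> through <start>+<num>-1 of the mapped device come from the named underlying device, beginning at the given sector offset.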
227 diff -urN linux-2.4.24.org/drivers/md/Config.in linux-2.4.24/drivers/md/Config.in
228 --- linux-2.4.24.org/drivers/md/Config.in       2004-01-18 14:58:09.306661789 +0100
229 +++ linux-2.4.24/drivers/md/Config.in   2004-01-18 15:01:13.770713678 +0100
230 @@ -14,5 +14,6 @@
231  dep_tristate '  Multipath I/O support' CONFIG_MD_MULTIPATH $CONFIG_BLK_DEV_MD
232  
233  dep_tristate ' Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM $CONFIG_MD
234 +dep_tristate ' Device-mapper support' CONFIG_BLK_DEV_DM $CONFIG_MD
235  
236  endmenu
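Since the new entry is a dep_tristate on $CONFIG_MD, CONFIG_BLK_DEV_DM can be y, m or n and is only offered once the multi-device menu is enabled. A hypothetical .config fragment selecting the modular build described in the help text:

CONFIG_MD=y
CONFIG_BLK_DEV_DM=m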
237 diff -urN linux-2.4.24.org/drivers/md/dm.c linux-2.4.24/drivers/md/dm.c
238 --- linux-2.4.24.org/drivers/md/dm.c    1970-01-01 01:00:00.000000000 +0100
239 +++ linux-2.4.24/drivers/md/dm.c        2004-01-18 15:01:29.214472770 +0100
240 @@ -0,0 +1,1115 @@
241 +/*
242 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
243 + *
244 + * This file is released under the GPL.
245 + */
246 +
247 +#include "dm.h"
248 +#include "kcopyd.h"
249 +
250 +#include <linux/init.h>
251 +#include <linux/module.h>
252 +#include <linux/blk.h>
253 +#include <linux/blkpg.h>
254 +#include <linux/mempool.h>
255 +#include <linux/slab.h>
256 +#include <linux/major.h>
257 +#include <linux/kdev_t.h>
258 +#include <linux/lvm.h>
259 +
260 +#include <asm/uaccess.h>
261 +
262 +static const char *_name = DM_NAME;
263 +#define DEFAULT_READ_AHEAD 64
264 +
265 +struct dm_io {
266 +       struct mapped_device *md;
267 +
268 +       struct dm_target *ti;
269 +       int rw;
270 +       union map_info map_context;
271 +       void (*end_io) (struct buffer_head * bh, int uptodate);
272 +       void *context;
273 +};
274 +
275 +struct deferred_io {
276 +       int rw;
277 +       struct buffer_head *bh;
278 +       struct deferred_io *next;
279 +};
280 +
281 +/*
282 + * Bits for the md->flags field.
283 + */
284 +#define DMF_BLOCK_IO 0
285 +#define DMF_SUSPENDED 1
286 +
287 +struct mapped_device {
288 +       struct rw_semaphore lock;
289 +       atomic_t holders;
290 +
291 +       kdev_t dev;
292 +       unsigned long flags;
293 +
294 +       /*
295 +        * A list of ios that arrived while we were suspended.
296 +        */
297 +       atomic_t pending;
298 +       wait_queue_head_t wait;
299 +       struct deferred_io *deferred;
300 +
301 +       /*
302 +        * The current mapping.
303 +        */
304 +       struct dm_table *map;
305 +
306 +       /*
307 +        * io objects are allocated from here.
308 +        */
309 +       mempool_t *io_pool;
310 +
311 +       /*
312 +        * Event handling.
313 +        */
314 +       uint32_t event_nr;
315 +       wait_queue_head_t eventq;
316 +};
317 +
318 +#define MIN_IOS 256
319 +static kmem_cache_t *_io_cache;
320 +
321 +static struct mapped_device *get_kdev(kdev_t dev);
322 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh);
323 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb);
324 +
325 +/*-----------------------------------------------------------------
326 + * In order to avoid the 256 minor number limit we are going to
327 + * register more major numbers as necessary.
328 + *---------------------------------------------------------------*/
329 +#define MAX_MINORS (1 << MINORBITS)
330 +
331 +struct major_details {
332 +       unsigned int major;
333 +
334 +       int transient;
335 +       struct list_head transient_list;
336 +
337 +       unsigned int first_free_minor;
338 +       int nr_free_minors;
339 +
340 +       struct mapped_device *mds[MAX_MINORS];
341 +       int blk_size[MAX_MINORS];
342 +       int blksize_size[MAX_MINORS];
343 +       int hardsect_size[MAX_MINORS];
344 +};
345 +
346 +static struct rw_semaphore _dev_lock;
347 +static struct major_details *_majors[MAX_BLKDEV];
348 +
349 +/*
350 + * This holds a list of majors that non-specified device numbers
351 + * may be allocated from.  Only majors with free minors appear on
352 + * this list.
353 + */
354 +static LIST_HEAD(_transients_free);
355 +
356 +static int __alloc_major(unsigned int major, struct major_details **result)
357 +{
358 +       int r;
359 +       unsigned int transient = !major;
360 +       struct major_details *maj;
361 +
362 +       /* Major already allocated? */
363 +       if (major && _majors[major])
364 +               return 0;
365 +
366 +       maj = kmalloc(sizeof(*maj), GFP_KERNEL);
367 +       if (!maj)
368 +               return -ENOMEM;
369 +
370 +       memset(maj, 0, sizeof(*maj));
371 +       INIT_LIST_HEAD(&maj->transient_list);
372 +
373 +       maj->nr_free_minors = MAX_MINORS;
374 +
375 +       r = register_blkdev(major, _name, &dm_blk_dops);
376 +       if (r < 0) {
377 +               DMERR("register_blkdev failed for %d", major);
378 +               kfree(maj);
379 +               return r;
380 +       }
381 +       if (r > 0)
382 +               major = r;
383 +
384 +       maj->major = major;
385 +
386 +       if (transient) {
387 +               maj->transient = transient;
388 +               list_add_tail(&maj->transient_list, &_transients_free);
389 +       }
390 +
391 +       _majors[major] = maj;
392 +
393 +       blk_size[major] = maj->blk_size;
394 +       blksize_size[major] = maj->blksize_size;
395 +       hardsect_size[major] = maj->hardsect_size;
396 +       read_ahead[major] = DEFAULT_READ_AHEAD;
397 +
398 +       blk_queue_make_request(BLK_DEFAULT_QUEUE(major), dm_request);
399 +
400 +       *result = maj;
401 +       return 0;
402 +}
403 +
404 +static void __free_major(struct major_details *maj)
405 +{
406 +       unsigned int major = maj->major;
407 +
408 +       list_del(&maj->transient_list);
409 +
410 +       read_ahead[major] = 0;
411 +       blk_size[major] = NULL;
412 +       blksize_size[major] = NULL;
413 +       hardsect_size[major] = NULL;
414 +
415 +       _majors[major] = NULL;
416 +       kfree(maj);
417 +
418 +       if (unregister_blkdev(major, _name) < 0)
419 +               DMERR("unregister_blkdev failed");
420 +}
421 +
422 +static void free_all_majors(void)
423 +{
424 +       unsigned int major = ARRAY_SIZE(_majors);
425 +
426 +       down_write(&_dev_lock);
427 +
428 +       while (major--)
429 +               if (_majors[major])
430 +                       __free_major(_majors[major]);
431 +
432 +       up_write(&_dev_lock);
433 +}
434 +
435 +static void free_dev(kdev_t dev)
436 +{
437 +       unsigned int major = major(dev);
438 +       unsigned int minor = minor(dev);
439 +       struct major_details *maj;
440 +
441 +       down_write(&_dev_lock);
442 +
443 +       maj = _majors[major];
444 +       if (!maj)
445 +               goto out;
446 +
447 +       maj->mds[minor] = NULL;
448 +       maj->nr_free_minors++;
449 +
450 +       if (maj->nr_free_minors == MAX_MINORS) {
451 +               __free_major(maj);
452 +               goto out;
453 +       }
454 +
455 +       if (!maj->transient)
456 +               goto out;
457 +
458 +       if (maj->nr_free_minors == 1)
459 +               list_add_tail(&maj->transient_list, &_transients_free);
460 +
461 +       if (minor < maj->first_free_minor)
462 +               maj->first_free_minor = minor;
463 +
464 +      out:
465 +       up_write(&_dev_lock);
466 +}
467 +
468 +static void __alloc_minor(struct major_details *maj, unsigned int minor,
469 +                         struct mapped_device *md)
470 +{
471 +       maj->mds[minor] = md;
472 +       md->dev = mk_kdev(maj->major, minor);
473 +       maj->nr_free_minors--;
474 +
475 +       if (maj->transient && !maj->nr_free_minors)
476 +               list_del_init(&maj->transient_list);
477 +}
478 +
479 +/*
480 + * See if requested kdev_t is available.
481 + */
482 +static int specific_dev(kdev_t dev, struct mapped_device *md)
483 +{
484 +       int r = 0;
485 +       unsigned int major = major(dev);
486 +       unsigned int minor = minor(dev);
487 +       struct major_details *maj;
488 +
489 +       if (!major || (major > MAX_BLKDEV) || (minor >= MAX_MINORS)) {
490 +               DMWARN("device number requested out of range (%d, %d)",
491 +                      major, minor);
492 +               return -EINVAL;
493 +       }
494 +
495 +       down_write(&_dev_lock);
496 +       maj = _majors[major];
497 +
498 +       /* Register requested major? */
499 +       if (!maj) {
500 +               r = __alloc_major(major, &maj);
501 +               if (r)
502 +                       goto out;
503 +
504 +               major = maj->major;
505 +       }
506 +
507 +       if (maj->mds[minor]) {
508 +               r = -EBUSY;
509 +               goto out;
510 +       }
511 +
512 +       __alloc_minor(maj, minor, md);
513 +
514 +      out:
515 +       up_write(&_dev_lock);
516 +
517 +       return r;
518 +}
519 +
520 +/*
521 + * Find first unused device number, requesting a new major number if required.
522 + */
523 +static int first_free_dev(struct mapped_device *md)
524 +{
525 +       int r = 0;
526 +       struct major_details *maj;
527 +
528 +       down_write(&_dev_lock);
529 +
530 +       if (list_empty(&_transients_free)) {
531 +               r = __alloc_major(0, &maj);
532 +               if (r)
533 +                       goto out;
534 +       } else
535 +               maj = list_entry(_transients_free.next, struct major_details,
536 +                                transient_list);
537 +
538 +       while (maj->mds[maj->first_free_minor++])
539 +               ;
540 +
541 +       __alloc_minor(maj, maj->first_free_minor - 1, md);
542 +
543 +      out:
544 +       up_write(&_dev_lock);
545 +
546 +       return r;
547 +}
548 +
549 +static struct mapped_device *get_kdev(kdev_t dev)
550 +{
551 +       struct mapped_device *md;
552 +       struct major_details *maj;
553 +
554 +       down_read(&_dev_lock);
555 +       maj = _majors[major(dev)];
556 +       if (!maj) {
557 +               md = NULL;
558 +               goto out;
559 +       }
560 +       md = maj->mds[minor(dev)];
561 +       if (md)
562 +               dm_get(md);
563 +      out:
564 +       up_read(&_dev_lock);
565 +
566 +       return md;
567 +}
568 +
569 +/*-----------------------------------------------------------------
570 + * init/exit code
571 + *---------------------------------------------------------------*/
572 +
573 +static __init int local_init(void)
574 +{
575 +       init_rwsem(&_dev_lock);
576 +
577 +       /* allocate a slab for the dm_ios */
578 +       _io_cache = kmem_cache_create("dm io",
579 +                                     sizeof(struct dm_io), 0, 0, NULL, NULL);
580 +
581 +       if (!_io_cache)
582 +               return -ENOMEM;
583 +
584 +       return 0;
585 +}
586 +
587 +static void local_exit(void)
588 +{
589 +       kmem_cache_destroy(_io_cache);
590 +       free_all_majors();
591 +
592 +       DMINFO("cleaned up");
593 +}
594 +
595 +/*
596 + * We have a lot of init/exit functions, so it seems easier to
597 + * store them in an array.  The disposable macro 'xx'
598 + * expands a prefix into a pair of function names.
599 + */
600 +static struct {
601 +       int (*init) (void);
602 +       void (*exit) (void);
603 +
604 +} _inits[] = {
605 +#define xx(n) {n ## _init, n ## _exit},
606 +       xx(local)
607 +       xx(dm_target)
608 +       xx(dm_linear)
609 +       xx(dm_stripe)
610 +       xx(dm_interface)
611 +       xx(kcopyd)
612 +       xx(dm_snapshot)
613 +#undef xx
614 +};
615 +
616 +static int __init dm_init(void)
617 +{
618 +       const int count = ARRAY_SIZE(_inits);
619 +
620 +       int r, i;
621 +
622 +       for (i = 0; i < count; i++) {
623 +               r = _inits[i].init();
624 +               if (r)
625 +                       goto bad;
626 +       }
627 +
628 +       return 0;
629 +
630 +      bad:
631 +       while (i--)
632 +               _inits[i].exit();
633 +
634 +       return r;
635 +}
636 +
637 +static void __exit dm_exit(void)
638 +{
639 +       int i = ARRAY_SIZE(_inits);
640 +
641 +       while (i--)
642 +               _inits[i].exit();
643 +}
644 +
645 +/*
646 + * Block device functions
647 + */
648 +static int dm_blk_open(struct inode *inode, struct file *file)
649 +{
650 +       struct mapped_device *md;
651 +
652 +       md = get_kdev(inode->i_rdev);
653 +       if (!md)
654 +               return -ENXIO;
655 +
656 +       return 0;
657 +}
658 +
659 +static int dm_blk_close(struct inode *inode, struct file *file)
660 +{
661 +       struct mapped_device *md;
662 +
663 +       md = get_kdev(inode->i_rdev);
664 +       dm_put(md);             /* put the reference gained by dm_blk_open */
665 +       dm_put(md);
666 +       return 0;
667 +}
668 +
669 +static inline struct dm_io *alloc_io(struct mapped_device *md)
670 +{
671 +       return mempool_alloc(md->io_pool, GFP_NOIO);
672 +}
673 +
674 +static inline void free_io(struct mapped_device *md, struct dm_io *io)
675 +{
676 +       mempool_free(io, md->io_pool);
677 +}
678 +
679 +static inline struct deferred_io *alloc_deferred(void)
680 +{
681 +       return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
682 +}
683 +
684 +static inline void free_deferred(struct deferred_io *di)
685 +{
686 +       kfree(di);
687 +}
688 +
689 +static inline sector_t volume_size(kdev_t dev)
690 +{
691 +       return blk_size[major(dev)][minor(dev)] << 1;
692 +}
693 +
694 +/* FIXME: check this */
695 +static int dm_blk_ioctl(struct inode *inode, struct file *file,
696 +                       unsigned int command, unsigned long a)
697 +{
698 +       kdev_t dev = inode->i_rdev;
699 +       long size;
700 +
701 +       switch (command) {
702 +       case BLKROSET:
703 +       case BLKROGET:
704 +       case BLKRASET:
705 +       case BLKRAGET:
706 +       case BLKFLSBUF:
707 +       case BLKSSZGET:
708 +               //case BLKRRPART: /* Re-read partition tables */
709 +               //case BLKPG:
710 +       case BLKELVGET:
711 +       case BLKELVSET:
712 +       case BLKBSZGET:
713 +       case BLKBSZSET:
714 +               return blk_ioctl(dev, command, a);
715 +               break;
716 +
717 +       case BLKGETSIZE:
718 +               size = volume_size(dev);
719 +               if (copy_to_user((void *) a, &size, sizeof(long)))
720 +                       return -EFAULT;
721 +               break;
722 +
723 +       case BLKGETSIZE64:
724 +               size = volume_size(dev);
725 +               if (put_user((u64) ((u64) size) << 9, (u64 *) a))
726 +                       return -EFAULT;
727 +               break;
728 +
729 +       case BLKRRPART:
730 +               return -ENOTTY;
731 +
732 +       case LV_BMAP:
733 +               return dm_user_bmap(inode, (struct lv_bmap *) a);
734 +
735 +       default:
736 +               DMWARN("unknown block ioctl 0x%x", command);
737 +               return -ENOTTY;
738 +       }
739 +
740 +       return 0;
741 +}
742 +
743 +/*
744 + * Add the buffer to the list of deferred io.
745 + */
746 +static int queue_io(struct mapped_device *md, struct buffer_head *bh, int rw)
747 +{
748 +       struct deferred_io *di;
749 +
750 +       di = alloc_deferred();
751 +       if (!di)
752 +               return -ENOMEM;
753 +
754 +       down_write(&md->lock);
755 +
756 +       if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
757 +               up_write(&md->lock);
758 +               free_deferred(di);
759 +               return 1;
760 +       }
761 +
762 +       di->bh = bh;
763 +       di->rw = rw;
764 +       di->next = md->deferred;
765 +       md->deferred = di;
766 +
767 +       up_write(&md->lock);
768 +       return 0;               /* deferred successfully */
769 +}
770 +
771 +/*
772 + * bh->b_end_io routine that decrements the pending count
773 + * and then calls the original bh->b_end_io fn.
774 + */
775 +static void dec_pending(struct buffer_head *bh, int uptodate)
776 +{
777 +       int r;
778 +       struct dm_io *io = bh->b_private;
779 +       dm_endio_fn endio = io->ti->type->end_io;
780 +
781 +       if (endio) {
782 +               r = endio(io->ti, bh, io->rw, uptodate ? 0 : -EIO,
783 +                         &io->map_context);
784 +               if (r < 0)
785 +                       uptodate = 0;
786 +
787 +               else if (r > 0)
788 +                       /* the target wants another shot at the io */
789 +                       return;
790 +       }
791 +
792 +       if (atomic_dec_and_test(&io->md->pending))
793 +               /* nudge anyone waiting on suspend queue */
794 +               wake_up(&io->md->wait);
795 +
796 +       bh->b_end_io = io->end_io;
797 +       bh->b_private = io->context;
798 +       free_io(io->md, io);
799 +
800 +       bh->b_end_io(bh, uptodate);
801 +}
802 +
803 +/*
804 + * Do the bh mapping for a given leaf
805 + */
806 +static inline int __map_buffer(struct mapped_device *md, int rw,
807 +                              struct buffer_head *bh, struct dm_io *io)
808 +{
809 +       struct dm_target *ti;
810 +
811 +       if (!md->map)
812 +               return -EINVAL;
813 +
814 +       ti = dm_table_find_target(md->map, bh->b_rsector);
815 +       if (!ti->type)
816 +               return -EINVAL;
817 +
818 +       /* hook the end io request fn */
819 +       atomic_inc(&md->pending);
820 +       io->md = md;
821 +       io->ti = ti;
822 +       io->rw = rw;
823 +       io->end_io = bh->b_end_io;
824 +       io->context = bh->b_private;
825 +       bh->b_end_io = dec_pending;
826 +       bh->b_private = io;
827 +
828 +       return ti->type->map(ti, bh, rw, &io->map_context);
829 +}
830 +
831 +/*
832 + * Checks to see if we should be deferring io, if so it queues it
833 + * and returns 1.
834 + */
835 +static inline int __deferring(struct mapped_device *md, int rw,
836 +                             struct buffer_head *bh)
837 +{
838 +       int r;
839 +
840 +       /*
841 +        * If we're suspended we have to queue this io for later.
842 +        */
843 +       while (test_bit(DMF_BLOCK_IO, &md->flags)) {
844 +               up_read(&md->lock);
845 +
846 +               /*
847 +                * There's no point deferring a read ahead
848 +                * request; just drop it.
849 +                */
850 +               if (rw == READA) {
851 +                       down_read(&md->lock);
852 +                       return -EIO;
853 +               }
854 +
855 +               r = queue_io(md, bh, rw);
856 +               down_read(&md->lock);
857 +
858 +               if (r < 0)
859 +                       return r;
860 +
861 +               if (r == 0)
862 +                       return 1;       /* deferred successfully */
863 +
864 +       }
865 +
866 +       return 0;
867 +}
868 +
869 +static int dm_request(request_queue_t *q, int rw, struct buffer_head *bh)
870 +{
871 +       int r;
872 +       struct dm_io *io;
873 +       struct mapped_device *md;
874 +
875 +       md = get_kdev(bh->b_rdev);
876 +       if (!md) {
877 +               buffer_IO_error(bh);
878 +               return 0;
879 +       }
880 +
881 +       io = alloc_io(md);
882 +       down_read(&md->lock);
883 +
884 +       r = __deferring(md, rw, bh);
885 +       if (r < 0)
886 +               goto bad;
887 +
888 +       else if (!r) {
889 +               /* not deferring */
890 +               r = __map_buffer(md, rw, bh, io);
891 +               if (r < 0)
892 +                       goto bad;
893 +       } else
894 +               r = 0;
895 +
896 +       up_read(&md->lock);
897 +       dm_put(md);
898 +       return r;
899 +
900 +      bad:
901 +       buffer_IO_error(bh);
902 +       up_read(&md->lock);
903 +       dm_put(md);
904 +       return 0;
905 +}
906 +
907 +static int check_dev_size(kdev_t dev, unsigned long block)
908 +{
909 +       unsigned int major = major(dev);
910 +       unsigned int minor = minor(dev);
911 +
912 +       /* FIXME: check this */
913 +       unsigned long max_sector = (blk_size[major][minor] << 1) + 1;
914 +       unsigned long sector = (block + 1) * (blksize_size[major][minor] >> 9);
915 +
916 +       return (sector > max_sector) ? 0 : 1;
917 +}
918 +
919 +/*
920 + * Creates a dummy buffer head and maps it (for LILO).
921 + */
922 +static int __bmap(struct mapped_device *md, kdev_t dev, unsigned long block,
923 +                 kdev_t *r_dev, unsigned long *r_block)
924 +{
925 +       struct buffer_head bh;
926 +       struct dm_target *ti;
927 +       union map_info map_context;
928 +       int r;
929 +
930 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
931 +               return -EPERM;
932 +       }
933 +
934 +       if (!check_dev_size(dev, block)) {
935 +               return -EINVAL;
936 +       }
937 +
938 +       if (!md->map)
939 +               return -EINVAL;
940 +
941 +       /* setup dummy bh */
942 +       memset(&bh, 0, sizeof(bh));
943 +       bh.b_blocknr = block;
944 +       bh.b_dev = bh.b_rdev = dev;
945 +       bh.b_size = blksize_size[major(dev)][minor(dev)];
946 +       bh.b_rsector = block * (bh.b_size >> 9);
947 +
948 +       /* find target */
949 +       ti = dm_table_find_target(md->map, bh.b_rsector);
950 +
951 +       /* do the mapping */
952 +       r = ti->type->map(ti, &bh, READ, &map_context);
953 +       ti->type->end_io(ti, &bh, READ, 0, &map_context);
954 +
955 +       if (!r) {
956 +               *r_dev = bh.b_rdev;
957 +               *r_block = bh.b_rsector / (bh.b_size >> 9);
958 +       }
959 +
960 +       return r;
961 +}
962 +
963 +/*
964 + * Marshals arguments and results between user and kernel space.
965 + */
966 +static int dm_user_bmap(struct inode *inode, struct lv_bmap *lvb)
967 +{
968 +       struct mapped_device *md;
969 +       unsigned long block, r_block;
970 +       kdev_t r_dev;
971 +       int r;
972 +
973 +       if (get_user(block, &lvb->lv_block))
974 +               return -EFAULT;
975 +
976 +       md = get_kdev(inode->i_rdev);
977 +       if (!md)
978 +               return -ENXIO;
979 +
980 +       down_read(&md->lock);
981 +       r = __bmap(md, inode->i_rdev, block, &r_dev, &r_block);
982 +       up_read(&md->lock);
983 +       dm_put(md);
984 +
985 +       if (!r && (put_user(kdev_t_to_nr(r_dev), &lvb->lv_dev) ||
986 +                  put_user(r_block, &lvb->lv_block)))
987 +               r = -EFAULT;
988 +
989 +       return r;
990 +}
991 +
992 +static void free_md(struct mapped_device *md)
993 +{
994 +       free_dev(md->dev);
995 +       mempool_destroy(md->io_pool);
996 +       kfree(md);
997 +}
998 +
999 +/*
1000 + * Allocate and initialise a blank device with a given minor.
1001 + */
1002 +static struct mapped_device *alloc_md(kdev_t dev)
1003 +{
1004 +       int r;
1005 +       struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);
1006 +
1007 +       if (!md) {
1008 +               DMWARN("unable to allocate device, out of memory.");
1009 +               return NULL;
1010 +       }
1011 +
1012 +       memset(md, 0, sizeof(*md));
1013 +
1014 +       /* Allocate suitable device number */
1015 +       if (!dev)
1016 +               r = first_free_dev(md);
1017 +       else
1018 +               r = specific_dev(dev, md);
1019 +
1020 +       if (r) {
1021 +               kfree(md);
1022 +               return NULL;
1023 +       }
1024 +
1025 +       md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
1026 +                                    mempool_free_slab, _io_cache);
1027 +       if (!md->io_pool) {
1028 +               free_dev(md->dev);
1029 +               kfree(md);
1030 +               return NULL;
1031 +       }
1032 +
1033 +       init_rwsem(&md->lock);
1034 +       atomic_set(&md->holders, 1);
1035 +       atomic_set(&md->pending, 0);
1036 +       init_waitqueue_head(&md->wait);
1037 +       init_waitqueue_head(&md->eventq);
1038 +
1039 +       return md;
1040 +}
1041 +
1042 +/*
1043 + * The hardsect size for a mapped device is the largest hardsect size
1044 + * from the devices it maps onto.
1045 + */
1046 +static int __find_hardsect_size(struct list_head *devices)
1047 +{
1048 +       int result = 512, size;
1049 +       struct list_head *tmp;
1050 +
1051 +       list_for_each (tmp, devices) {
1052 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
1053 +               size = get_hardsect_size(dd->dev);
1054 +               if (size > result)
1055 +                       result = size;
1056 +       }
1057 +
1058 +       return result;
1059 +}
1060 +
1061 +/*
1062 + * Bind a table to the device.
1063 + */
1064 +static void event_callback(void *context)
1065 +{
1066 +       struct mapped_device *md = (struct mapped_device *) context;
1067 +
1068 +       down_write(&md->lock);
1069 +       md->event_nr++;
1070 +       wake_up_interruptible(&md->eventq);
1071 +       up_write(&md->lock);
1072 +}
1073 +
1074 +static int __bind(struct mapped_device *md, struct dm_table *t)
1075 +{
1076 +       unsigned int minor = minor(md->dev);
1077 +       unsigned int major = major(md->dev);
1078 +       md->map = t;
1079 +
1080 +       /* in k */
1081 +       blk_size[major][minor] = dm_table_get_size(t) >> 1;
1082 +       blksize_size[major][minor] = BLOCK_SIZE;
1083 +       hardsect_size[major][minor] =
1084 +           __find_hardsect_size(dm_table_get_devices(t));
1085 +       register_disk(NULL, md->dev, 1, &dm_blk_dops, blk_size[major][minor]);
1086 +
1087 +       dm_table_event_callback(md->map, event_callback, md);
1088 +       dm_table_get(t);
1089 +       return 0;
1090 +}
1091 +
1092 +static void __unbind(struct mapped_device *md)
1093 +{
1094 +       unsigned int minor = minor(md->dev);
1095 +       unsigned int major = major(md->dev);
1096 +
1097 +       if (md->map) {
1098 +               dm_table_event_callback(md->map, NULL, NULL);
1099 +               dm_table_put(md->map);
1100 +               md->map = NULL;
1101 +
1102 +       }
1103 +
1104 +       blk_size[major][minor] = 0;
1105 +       blksize_size[major][minor] = 0;
1106 +       hardsect_size[major][minor] = 0;
1107 +}
1108 +
1109 +/*
1110 + * Constructor for a new device.
1111 + */
1112 +int dm_create(kdev_t dev, struct mapped_device **result)
1113 +{
1114 +       struct mapped_device *md;
1115 +
1116 +       md = alloc_md(dev);
1117 +       if (!md)
1118 +               return -ENXIO;
1119 +
1120 +       __unbind(md);   /* Ensure zero device size */
1121 +
1122 +       *result = md;
1123 +       return 0;
1124 +}
1125 +
1126 +void dm_get(struct mapped_device *md)
1127 +{
1128 +       atomic_inc(&md->holders);
1129 +}
1130 +
1131 +void dm_put(struct mapped_device *md)
1132 +{
1133 +       if (atomic_dec_and_test(&md->holders)) {
1134 +               if (md->map)
1135 +                       dm_table_suspend_targets(md->map);
1136 +               __unbind(md);
1137 +               free_md(md);
1138 +       }
1139 +}
1140 +
1141 +/*
1142 + * Requeue the deferred io by calling generic_make_request.
1143 + */
1144 +static void flush_deferred_io(struct deferred_io *c)
1145 +{
1146 +       struct deferred_io *n;
1147 +
1148 +       while (c) {
1149 +               n = c->next;
1150 +               generic_make_request(c->rw, c->bh);
1151 +               free_deferred(c);
1152 +               c = n;
1153 +       }
1154 +}
1155 +
1156 +/*
1157 + * Swap in a new table (destroying old one).
1158 + */
1159 +int dm_swap_table(struct mapped_device *md, struct dm_table *table)
1160 +{
1161 +       int r;
1162 +
1163 +       down_write(&md->lock);
1164 +
1165 +       /*
1166 +        * The device must be suspended, or have no table bound yet.
1167 +        */
1168 +       if (md->map && !test_bit(DMF_SUSPENDED, &md->flags)) {
1169 +               up_write(&md->lock);
1170 +               return -EPERM;
1171 +       }
1172 +
1173 +       __unbind(md);
1174 +       r = __bind(md, table);
1175 +       if (r) {
1176 +               up_write(&md->lock);
1177 +               return r;
1178 +       }
1179 +       up_write(&md->lock);
1180 +       return 0;
1181 +}
1182 +/*
1183 + * We need to be able to change a mapping table under a mounted
1184 + * filesystem.  For example we might want to move some data in
1185 + * the background.  Before the table can be swapped with
1186 + * dm_bind_table, dm_suspend must be called to flush any in
1187 + * flight io and ensure that any further io gets deferred.
1188 + */
1189 +int dm_suspend(struct mapped_device *md)
1190 +{
1191 +       int r = 0;
1192 +       DECLARE_WAITQUEUE(wait, current);
1193 +
1194 +       down_write(&md->lock);
1195 +
1196 +       /*
1197 +        * First we set the BLOCK_IO flag so no more ios will be
1198 +        * mapped.
1199 +        */
1200 +       if (test_bit(DMF_BLOCK_IO, &md->flags)) {
1201 +               up_write(&md->lock);
1202 +               return -EINVAL;
1203 +       }
1204 +
1205 +       set_bit(DMF_BLOCK_IO, &md->flags);
1206 +       add_wait_queue(&md->wait, &wait);
1207 +       up_write(&md->lock);
1208 +
1209 +       /*
1210 +        * Then we wait for the already mapped ios to
1211 +        * complete.
1212 +        */
1213 +       run_task_queue(&tq_disk);
1214 +       while (1) {
1215 +               set_current_state(TASK_INTERRUPTIBLE);
1216 +
1217 +               if (!atomic_read(&md->pending) || signal_pending(current))
1218 +                       break;
1219 +
1220 +               schedule();
1221 +       }
1222 +       set_current_state(TASK_RUNNING);
1223 +
1224 +       down_write(&md->lock);
1225 +       remove_wait_queue(&md->wait, &wait);
1226 +
1227 +       /* did we flush everything ? */
1228 +       if (atomic_read(&md->pending)) {
1229 +               clear_bit(DMF_BLOCK_IO, &md->flags);
1230 +               r = -EINTR;
1231 +       } else {
1232 +               set_bit(DMF_SUSPENDED, &md->flags);
1233 +               if (md->map)
1234 +                       dm_table_suspend_targets(md->map);
1235 +       }
1236 +       up_write(&md->lock);
1237 +
1238 +       return r;
1239 +}
1240 +
1241 +int dm_resume(struct mapped_device *md)
1242 +{
1243 +       struct deferred_io *def;
1244 +
1245 +       down_write(&md->lock);
1246 +       if (!test_bit(DMF_SUSPENDED, &md->flags)) {
1247 +               up_write(&md->lock);
1248 +               return -EINVAL;
1249 +       }
1250 +
1251 +       if (md->map)
1252 +               dm_table_resume_targets(md->map);
1253 +
1254 +       clear_bit(DMF_SUSPENDED, &md->flags);
1255 +       clear_bit(DMF_BLOCK_IO, &md->flags);
1256 +       def = md->deferred;
1257 +       md->deferred = NULL;
1258 +       up_write(&md->lock);
1259 +
1260 +       flush_deferred_io(def);
1261 +       run_task_queue(&tq_disk);
1262 +
1263 +       return 0;
1264 +}
1265 +
1266 +struct dm_table *dm_get_table(struct mapped_device *md)
1267 +{
1268 +       struct dm_table *t;
1269 +
1270 +       down_read(&md->lock);
1271 +       t = md->map;
1272 +       if (t)
1273 +               dm_table_get(t);
1274 +       up_read(&md->lock);
1275 +
1276 +       return t;
1277 +}
1278 +
1279 +/*-----------------------------------------------------------------
1280 + * Event notification.
1281 + *---------------------------------------------------------------*/
1282 +uint32_t dm_get_event_nr(struct mapped_device *md)
1283 +{
1284 +       uint32_t r;
1285 +
1286 +       down_read(&md->lock);
1287 +       r = md->event_nr;
1288 +       up_read(&md->lock);
1289 +
1290 +       return r;
1291 +}
1292 +
1293 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
1294 +                     uint32_t event_nr)
1295 +{
1296 +       down_write(&md->lock);
1297 +       if (event_nr != md->event_nr) {
1298 +               up_write(&md->lock);
1299 +               return 1;
1300 +       }
1301 +
1302 +       add_wait_queue(&md->eventq, wq);
1303 +       up_write(&md->lock);
1304 +
1305 +       return 0;
1306 +}
1307 +
1308 +const char *dm_kdevname(kdev_t dev)
1309 +{
1310 +       static char buffer[32];
1311 +       sprintf(buffer, "%03d:%03d", MAJOR(dev), MINOR(dev));
1312 +       return buffer;
1313 +}
1314 +
1315 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq)
1316 +{
1317 +       down_write(&md->lock);
1318 +       remove_wait_queue(&md->eventq, wq);
1319 +       up_write(&md->lock);
1320 +}
1321 +
1322 +kdev_t dm_kdev(struct mapped_device *md)
1323 +{
1324 +       kdev_t dev;
1325 +
1326 +       down_read(&md->lock);
1327 +       dev = md->dev;
1328 +       up_read(&md->lock);
1329 +
1330 +       return dev;
1331 +}
1332 +
1333 +int dm_suspended(struct mapped_device *md)
1334 +{
1335 +       return test_bit(DMF_SUSPENDED, &md->flags);
1336 +}
1337 +
1338 +struct block_device_operations dm_blk_dops = {
1339 +       .open = dm_blk_open,
1340 +       .release = dm_blk_close,
1341 +       .ioctl = dm_blk_ioctl,
1342 +       .owner = THIS_MODULE
1343 +};
1344 +
1345 +/*
1346 + * module hooks
1347 + */
1348 +module_init(dm_init);
1349 +module_exit(dm_exit);
1350 +
1351 +MODULE_DESCRIPTION(DM_NAME " driver");
1352 +MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
1353 +MODULE_LICENSE("GPL");
1354 +
1355 +EXPORT_SYMBOL(dm_kdevname);
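Taken together, the functions above define the lifecycle a caller such as the ioctl interface is expected to follow when replacing a live table. An illustrative sketch (not part of the patch; error handling abbreviated):

static int replace_table(struct mapped_device *md, struct dm_table *t)
{
	int r;

	/* Block new io and wait for in-flight io to drain. */
	r = dm_suspend(md);
	if (r)
		return r;

	/* Bind the new map; only legal while suspended. */
	r = dm_swap_table(md, t);
	if (r)
		return r;

	/* Clear the flags and requeue any deferred io. */
	return dm_resume(md);
}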
1356 diff -urN linux-2.4.24.org/drivers/md/dm-daemon.c linux-2.4.24/drivers/md/dm-daemon.c
1357 --- linux-2.4.24.org/drivers/md/dm-daemon.c     1970-01-01 01:00:00.000000000 +0100
1358 +++ linux-2.4.24/drivers/md/dm-daemon.c 2004-01-18 15:01:21.977991002 +0100
1359 @@ -0,0 +1,113 @@
1360 +/*
1361 + * Copyright (C) 2003 Sistina Software
1362 + *
1363 + * This file is released under the LGPL.
1364 + */
1365 +
1366 +#include "dm.h"
1367 +#include "dm-daemon.h"
1368 +
1369 +#include <linux/module.h>
1370 +#include <linux/sched.h>
1371 +
1372 +static int daemon(void *arg)
1373 +{
1374 +       struct dm_daemon *dd = (struct dm_daemon *) arg;
1375 +       DECLARE_WAITQUEUE(wq, current);
1376 +
1377 +       daemonize();
1378 +       reparent_to_init();
1379 +
1380 +       /* block all signals */
1381 +       spin_lock_irq(&current->sigmask_lock);
1382 +       sigfillset(&current->blocked);
1383 +       flush_signals(current);
1384 +       spin_unlock_irq(&current->sigmask_lock);
1385 +
1386 +       strcpy(current->comm, dd->name);
1387 +       atomic_set(&dd->please_die, 0);
1388 +
1389 +       add_wait_queue(&dd->job_queue, &wq);
1390 +
1391 +       down(&dd->run_lock);
1392 +       up(&dd->start_lock);
1393 +
1394 +       /*
1395 +        * dd->fn() could do anything, very likely it will
1396 +        * suspend.  So we can't set the state to
1397 +        * TASK_INTERRUPTIBLE before calling it.  In order to
1398 +        * prevent a race with a waking thread we do this little
1399 +        * dance with the dd->woken variable.
1400 +        */
1401 +       while (1) {
1402 +               do {
1403 +                       set_current_state(TASK_RUNNING);
1404 +
1405 +                       if (atomic_read(&dd->please_die))
1406 +                               goto out;
1407 +
1408 +                       atomic_set(&dd->woken, 0);
1409 +                       dd->fn();
1410 +                       yield();
1411 +
1412 +                       set_current_state(TASK_INTERRUPTIBLE);
1413 +               } while (atomic_read(&dd->woken));
1414 +
1415 +               schedule();
1416 +       }
1417 +
1418 + out:
1419 +       remove_wait_queue(&dd->job_queue, &wq);
1420 +       up(&dd->run_lock);
1421 +       return 0;
1422 +}
1423 +
1424 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void))
1425 +{
1426 +       pid_t pid = 0;
1427 +
1428 +       /*
1429 +        * Initialise the dm_daemon.
1430 +        */
1431 +       dd->fn = fn;
1432 +       strncpy(dd->name, name, sizeof(dd->name) - 1);
1433 +       sema_init(&dd->start_lock, 1);
1434 +       sema_init(&dd->run_lock, 1);
1435 +       init_waitqueue_head(&dd->job_queue);
1436 +
1437 +       /*
1438 +        * Start the new thread.
1439 +        */
1440 +       down(&dd->start_lock);
1441 +       pid = kernel_thread(daemon, dd, 0);
1442 +       if (pid <= 0) {
1443 +               DMERR("Failed to start %s thread", name);
1444 +               return -EAGAIN;
1445 +       }
1446 +
1447 +       /*
1448 +        * wait for the daemon to up this mutex.
1449 +        */
1450 +       down(&dd->start_lock);
1451 +       up(&dd->start_lock);
1452 +
1453 +       return 0;
1454 +}
1455 +
1456 +void dm_daemon_stop(struct dm_daemon *dd)
1457 +{
1458 +       atomic_set(&dd->please_die, 1);
1459 +       dm_daemon_wake(dd);
1460 +       down(&dd->run_lock);
1461 +       up(&dd->run_lock);
1462 +}
1463 +
1464 +void dm_daemon_wake(struct dm_daemon *dd)
1465 +{
1466 +       atomic_set(&dd->woken, 1);
1467 +       wake_up_interruptible(&dd->job_queue);
1468 +}
1469 +
1470 +EXPORT_SYMBOL(dm_daemon_start);
1471 +EXPORT_SYMBOL(dm_daemon_stop);
1472 +EXPORT_SYMBOL(dm_daemon_wake);
1473 diff -urN linux-2.4.24.org/drivers/md/dm-daemon.h linux-2.4.24/drivers/md/dm-daemon.h
1474 --- linux-2.4.24.org/drivers/md/dm-daemon.h     1970-01-01 01:00:00.000000000 +0100
1475 +++ linux-2.4.24/drivers/md/dm-daemon.h 2004-01-18 15:01:21.980990372 +0100
1476 @@ -0,0 +1,29 @@
1477 +/*
1478 + * Copyright (C) 2003 Sistina Software
1479 + *
1480 + * This file is released under the LGPL.
1481 + */
1482 +
1483 +#ifndef DM_DAEMON_H
1484 +#define DM_DAEMON_H
1485 +
1486 +#include <asm/atomic.h>
1487 +#include <asm/semaphore.h>
1488 +
1489 +struct dm_daemon {
1490 +       void (*fn)(void);
1491 +       char name[16];
1492 +       atomic_t please_die;
1493 +       struct semaphore start_lock;
1494 +       struct semaphore run_lock;
1495 +
1496 +       atomic_t woken;
1497 +       wait_queue_head_t job_queue;
1498 +};
1499 +
1500 +int dm_daemon_start(struct dm_daemon *dd, const char *name, void (*fn)(void));
1501 +void dm_daemon_stop(struct dm_daemon *dd);
1502 +void dm_daemon_wake(struct dm_daemon *dd);
1503 +int dm_daemon_running(struct dm_daemon *dd);
1504 +
1505 +#endif
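A typical client of this API embeds a struct dm_daemon, starts it once at init time and wakes it whenever work is queued. An illustrative sketch (not part of the patch; the names here are hypothetical):

#include <linux/init.h>
#include "dm-daemon.h"

static struct dm_daemon _workerd;

/* Called each time the daemon is woken; it must find its
 * own work to do, since no job is passed in. */
static void do_work(void)
{
	/* drain a job queue here */
}

static int __init worker_init(void)
{
	return dm_daemon_start(&_workerd, "dm_workerd", do_work);
}

static void worker_exit(void)
{
	/* Blocks until the kernel thread has exited. */
	dm_daemon_stop(&_workerd);
}

/* Producers call dm_daemon_wake(&_workerd) after queueing a job. */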
1506 diff -urN linux-2.4.24.org/drivers/md/dm-exception-store.c linux-2.4.24/drivers/md/dm-exception-store.c
1507 --- linux-2.4.24.org/drivers/md/dm-exception-store.c    1970-01-01 01:00:00.000000000 +0100
1508 +++ linux-2.4.24/drivers/md/dm-exception-store.c        2004-01-18 15:01:29.225470463 +0100
1509 @@ -0,0 +1,673 @@
1510 +/*
1511 + * dm-snapshot.c
1512 + *
1513 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
1514 + *
1515 + * This file is released under the GPL.
1516 + */
1517 +
1518 +#include "dm-snapshot.h"
1519 +#include "dm-io.h"
1520 +#include "kcopyd.h"
1521 +
1522 +#include <linux/mm.h>
1523 +#include <linux/pagemap.h>
1524 +#include <linux/vmalloc.h>
1525 +#include <linux/slab.h>
1526 +
1527 +/*-----------------------------------------------------------------
1528 + * Persistent snapshots, by persistent we mean that the snapshot
1529 + * will survive a reboot.
1530 + *---------------------------------------------------------------*/
1531 +
1532 +/*
1533 + * We need to store a record of which parts of the origin have
1534 + * been copied to the snapshot device.  The snapshot code
1535 + * requires that we copy exception chunks to chunk aligned areas
1536 + * of the COW store.  It makes sense, therefore, to store the
1537 + * metadata in chunk size blocks.
1538 + *
1539 + * There is no backward or forward compatibility implemented,
1540 + * snapshots with different disk versions than the kernel will
1541 + * not be usable.  It is expected that "lvcreate" will blank out
1542 + * the start of a fresh COW device before calling the snapshot
1543 + * constructor.
1544 + *
1545 + * The first chunk of the COW device just contains the header.
1546 + * After this there is a chunk filled with exception metadata,
1547 + * followed by as many exception chunks as can fit in the
1548 + * metadata areas.
1549 + *
1550 + * All on disk structures are in little-endian format.  The end
1551 + * of the exceptions info is indicated by an exception with a
1552 + * new_chunk of 0, which is invalid since it would point to the
1553 + * header chunk.
1554 + */
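/*
 * Worked example of this layout (illustrative, not part of the patch):
 * struct disk_exception below is 16 bytes, so with an 8 KiB chunk one
 * metadata area holds 8192 / 16 = 512 exceptions.  As area_io() later
 * computes, metadata area 'k' then lives at chunk 1 + (512 + 1) * k,
 * followed by the 512 exception chunks it describes.
 */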
1555 +
1556 +/*
1557 + * Magic for persistent snapshots: "SnAp" - Feeble isn't it.
1558 + */
1559 +#define SNAP_MAGIC 0x70416e53
1560 +
1561 +/*
1562 + * The on-disk version of the metadata.
1563 + */
1564 +#define SNAPSHOT_DISK_VERSION 1
1565 +
1566 +struct disk_header {
1567 +       uint32_t magic;
1568 +
1569 +       /*
1570 +        * Is this snapshot valid?  There is no way of recovering
1571 +        * an invalid snapshot.
1572 +        */
1573 +       uint32_t valid;
1574 +
1575 +       /*
1576 +        * Simple, incrementing version.  No backward
1577 +        * compatibility.
1578 +        */
1579 +       uint32_t version;
1580 +
1581 +       /* In sectors */
1582 +       uint32_t chunk_size;
1583 +};
1584 +
1585 +struct disk_exception {
1586 +       uint64_t old_chunk;
1587 +       uint64_t new_chunk;
1588 +};
1589 +
1590 +struct commit_callback {
1591 +       void (*callback)(void *, int success);
1592 +       void *context;
1593 +};
1594 +
1595 +/*
1596 + * The top level structure for a persistent exception store.
1597 + */
1598 +struct pstore {
1599 +       struct dm_snapshot *snap;       /* up pointer to my snapshot */
1600 +       int version;
1601 +       int valid;
1602 +       uint32_t chunk_size;
1603 +       uint32_t exceptions_per_area;
1604 +
1605 +       /*
1606 +        * Now that we have an asynchronous kcopyd there is no
1607 +        * need for large chunk sizes, so it won't hurt to have a
1608 +        * whole chunk's worth of metadata in memory at once.
1609 +        */
1610 +       void *area;
1611 +
1612 +       /*
1613 +        * Used to keep track of which metadata area the data in
1614 +        * 'chunk' refers to.
1615 +        */
1616 +       uint32_t current_area;
1617 +
1618 +       /*
1619 +        * The next free chunk for an exception.
1620 +        */
1621 +       uint32_t next_free;
1622 +
1623 +       /*
1624 +        * The index of next free exception in the current
1625 +        * metadata area.
1626 +        */
1627 +       uint32_t current_committed;
1628 +
1629 +       atomic_t pending_count;
1630 +       uint32_t callback_count;
1631 +       struct commit_callback *callbacks;
1632 +};
1633 +
1634 +static inline unsigned int sectors_to_pages(unsigned int sectors)
1635 +{
1636 +       return sectors / (PAGE_SIZE / SECTOR_SIZE);
1637 +}
1638 +
1639 +static int alloc_area(struct pstore *ps)
1640 +{
1641 +       int r = -ENOMEM;
1642 +       size_t i, len, nr_pages;
1643 +       struct page *page, *last = NULL;
1644 +
1645 +       len = ps->chunk_size << SECTOR_SHIFT;
1646 +
1647 +       /*
1648 +        * Allocate the chunk_size block of memory that will hold
1649 +        * a single metadata area.
1650 +        */
1651 +       ps->area = vmalloc(len);
1652 +       if (!ps->area)
1653 +               return r;
1654 +
1655 +       nr_pages = sectors_to_pages(ps->chunk_size);
1656 +
1657 +       /*
1658 +        * We lock the pages for ps->area into memory since
1659 +        * they'll be doing a lot of io.  We also chain them
1660 +        * together ready for dm-io.
1661 +        */
1662 +       for (i = 0; i < nr_pages; i++) {
1663 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
1664 +               LockPage(page);
1665 +               if (last)
1666 +                       last->list.next = &page->list;
1667 +               last = page;
1668 +       }
1669 +
1670 +       return 0;
1671 +}
1672 +
1673 +static void free_area(struct pstore *ps)
1674 +{
1675 +       size_t i, nr_pages;
1676 +       struct page *page;
1677 +
1678 +       nr_pages = sectors_to_pages(ps->chunk_size);
1679 +       for (i = 0; i < nr_pages; i++) {
1680 +               page = vmalloc_to_page(ps->area + (i * PAGE_SIZE));
1681 +               page->list.next = NULL;
1682 +               UnlockPage(page);
1683 +       }
1684 +
1685 +       vfree(ps->area);
1686 +}
1687 +
1688 +/*
1689 + * Read or write a chunk aligned and sized block of data from a device.
1690 + */
1691 +static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
1692 +{
1693 +       struct io_region where;
1694 +       unsigned int bits;
1695 +
1696 +       where.dev = ps->snap->cow->dev;
1697 +       where.sector = ps->chunk_size * chunk;
1698 +       where.count = ps->chunk_size;
1699 +
1700 +       return dm_io_sync(1, &where, rw, vmalloc_to_page(ps->area), 0, &bits);
1701 +}
1702 +
1703 +/*
1704 + * Read or write a metadata area.  Remembering to skip the first
1705 + * chunk which holds the header.
1706 + */
1707 +static int area_io(struct pstore *ps, uint32_t area, int rw)
1708 +{
1709 +       int r;
1710 +       uint32_t chunk;
1711 +
1712 +       /* convert a metadata area index to a chunk index */
1713 +       chunk = 1 + ((ps->exceptions_per_area + 1) * area);
1714 +
1715 +       r = chunk_io(ps, chunk, rw);
1716 +       if (r)
1717 +               return r;
1718 +
1719 +       ps->current_area = area;
1720 +       return 0;
1721 +}
1722 +
1723 +static int zero_area(struct pstore *ps, uint32_t area)
1724 +{
1725 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
1726 +       return area_io(ps, area, WRITE);
1727 +}
1728 +
1729 +static int read_header(struct pstore *ps, int *new_snapshot)
1730 +{
1731 +       int r;
1732 +       struct disk_header *dh;
1733 +
1734 +       r = chunk_io(ps, 0, READ);
1735 +       if (r)
1736 +               return r;
1737 +
1738 +       dh = (struct disk_header *) ps->area;
1739 +
1740 +       if (le32_to_cpu(dh->magic) == 0) {
1741 +               *new_snapshot = 1;
1742 +
1743 +       } else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
1744 +               *new_snapshot = 0;
1745 +               ps->valid = le32_to_cpu(dh->valid);
1746 +               ps->version = le32_to_cpu(dh->version);
1747 +               ps->chunk_size = le32_to_cpu(dh->chunk_size);
1748 +
1749 +       } else {
1750 +               DMWARN("Invalid/corrupt snapshot");
1751 +               r = -ENXIO;
1752 +       }
1753 +
1754 +       return r;
1755 +}
1756 +
1757 +static int write_header(struct pstore *ps)
1758 +{
1759 +       struct disk_header *dh;
1760 +
1761 +       memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
1762 +
1763 +       dh = (struct disk_header *) ps->area;
1764 +       dh->magic = cpu_to_le32(SNAP_MAGIC);
1765 +       dh->valid = cpu_to_le32(ps->valid);
1766 +       dh->version = cpu_to_le32(ps->version);
1767 +       dh->chunk_size = cpu_to_le32(ps->chunk_size);
1768 +
1769 +       return chunk_io(ps, 0, WRITE);
1770 +}
1771 +
1772 +/*
1773 + * Access functions for the disk exceptions, these do the endian conversions.
1774 + */
1775 +static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
1776 +{
1777 +       if (index >= ps->exceptions_per_area)
1778 +               return NULL;
1779 +
1780 +       return ((struct disk_exception *) ps->area) + index;
1781 +}
1782 +
1783 +static int read_exception(struct pstore *ps,
1784 +                         uint32_t index, struct disk_exception *result)
1785 +{
1786 +       struct disk_exception *e;
1787 +
1788 +       e = get_exception(ps, index);
1789 +       if (!e)
1790 +               return -EINVAL;
1791 +
1792 +       /* copy it */
1793 +       result->old_chunk = le64_to_cpu(e->old_chunk);
1794 +       result->new_chunk = le64_to_cpu(e->new_chunk);
1795 +
1796 +       return 0;
1797 +}
1798 +
1799 +static int write_exception(struct pstore *ps,
1800 +                          uint32_t index, struct disk_exception *de)
1801 +{
1802 +       struct disk_exception *e;
1803 +
1804 +       e = get_exception(ps, index);
1805 +       if (!e)
1806 +               return -EINVAL;
1807 +
1808 +       /* copy it */
1809 +       e->old_chunk = cpu_to_le64(de->old_chunk);
1810 +       e->new_chunk = cpu_to_le64(de->new_chunk);
1811 +
1812 +       return 0;
1813 +}
1814 +
1815 +/*
1816 + * Registers the exceptions that are present in the current area.
1817 + * 'full' is set to indicate whether the area has been
1818 + * completely filled.
1819 + */
1820 +static int insert_exceptions(struct pstore *ps, int *full)
1821 +{
1822 +       int r;
1823 +       unsigned int i;
1824 +       struct disk_exception de;
1825 +
1826 +       /* presume the area is full */
1827 +       *full = 1;
1828 +
1829 +       for (i = 0; i < ps->exceptions_per_area; i++) {
1830 +               r = read_exception(ps, i, &de);
1831 +
1832 +               if (r)
1833 +                       return r;
1834 +
1835 +               /*
1836 +                * If the new_chunk is pointing at the start of
1837 +                * the COW device, where the first metadata area
1838 +                * lives, we know that we've hit the end of the
1839 +                * exceptions.  Therefore the area is not full.
1840 +                */
1841 +               if (de.new_chunk == 0LL) {
1842 +                       ps->current_committed = i;
1843 +                       *full = 0;
1844 +                       break;
1845 +               }
1846 +
1847 +               /*
1848 +                * Keep track of the start of the free chunks.
1849 +                */
1850 +               if (ps->next_free <= de.new_chunk)
1851 +                       ps->next_free = de.new_chunk + 1;
1852 +
1853 +               /*
1854 +                * Otherwise we add the exception to the snapshot.
1855 +                */
1856 +               r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
1857 +               if (r)
1858 +                       return r;
1859 +       }
1860 +
1861 +       return 0;
1862 +}
1863 +
1864 +static int read_exceptions(struct pstore *ps)
1865 +{
1866 +       uint32_t area;
1867 +       int r, full = 1;
1868 +
1869 +       /*
1870 +        * Keep reading chunks and inserting exceptions until
1871 +        * we find a partially full area.
1872 +        */
1873 +       for (area = 0; full; area++) {
1874 +               r = area_io(ps, area, READ);
1875 +               if (r)
1876 +                       return r;
1877 +
1878 +               r = insert_exceptions(ps, &full);
1879 +               if (r)
1880 +                       return r;
1881 +       }
1882 +
1883 +       return 0;
1884 +}
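
read_exceptions() keeps loading areas until insert_exceptions() reports a partially full one, detected by an entry whose new_chunk is 0 (no exception can legitimately point at chunk 0, which holds the header). A standalone sketch of that sentinel scan:

#include <stdio.h>
#include <stdint.h>

struct disk_exception { uint64_t old_chunk, new_chunk; };

/* Returns the number of committed entries in one area, sets *full. */
static unsigned scan_area(const struct disk_exception *de, unsigned n,
                          int *full)
{
        unsigned i;

        *full = 1;                      /* presume the area is full */
        for (i = 0; i < n; i++)
                if (de[i].new_chunk == 0) {     /* sentinel: end of entries */
                        *full = 0;
                        break;
                }
        return i;
}

int main(void)
{
        struct disk_exception area[4] = { {10, 2}, {11, 3}, {0, 0}, {0, 0} };
        int full;
        unsigned n = scan_area(area, 4, &full);

        printf("%u committed entries, area is %s\n",
               n, full ? "full" : "partial");
        return 0;
}
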
1885 +
1886 +static inline struct pstore *get_info(struct exception_store *store)
1887 +{
1888 +       return (struct pstore *) store->context;
1889 +}
1890 +
1891 +static void persistent_fraction_full(struct exception_store *store,
1892 +                                    sector_t *numerator, sector_t *denominator)
1893 +{
1894 +       *numerator = get_info(store)->next_free * store->snap->chunk_size;
1895 +       *denominator = get_dev_size(store->snap->cow->dev);
1896 +}
1897 +
1898 +static void persistent_destroy(struct exception_store *store)
1899 +{
1900 +       struct pstore *ps = get_info(store);
1901 +
1902 +       dm_io_put(sectors_to_pages(ps->chunk_size));
1903 +       vfree(ps->callbacks);
1904 +       free_area(ps);
1905 +       kfree(ps);
1906 +}
1907 +
1908 +static int persistent_read_metadata(struct exception_store *store)
1909 +{
1910 +       int r, new_snapshot;
1911 +       struct pstore *ps = get_info(store);
1912 +
1913 +       /*
1914 +        * Read the snapshot header.
1915 +        */
1916 +       r = read_header(ps, &new_snapshot);
1917 +       if (r)
1918 +               return r;
1919 +
1920 +       /*
1921 +        * Do we need to set up a new snapshot?
1922 +        */
1923 +       if (new_snapshot) {
1924 +               r = write_header(ps);
1925 +               if (r) {
1926 +                       DMWARN("write_header failed");
1927 +                       return r;
1928 +               }
1929 +
1930 +               r = zero_area(ps, 0);
1931 +               if (r) {
1932 +                       DMWARN("zero_area(0) failed");
1933 +                       return r;
1934 +               }
1935 +
1936 +       } else {
1937 +               /*
1938 +                * Sanity checks.
1939 +                */
1940 +               if (!ps->valid) {
1941 +                       DMWARN("snapshot is marked invalid");
1942 +                       return -EINVAL;
1943 +               }
1944 +
1945 +               if (ps->version != SNAPSHOT_DISK_VERSION) {
1946 +                       DMWARN("unable to handle snapshot disk version %u",
1947 +                              ps->version);
1948 +                       return -EINVAL;
1949 +               }
1950 +
1951 +               /*
1952 +                * Read the metadata.
1953 +                */
1954 +               r = read_exceptions(ps);
1955 +               if (r)
1956 +                       return r;
1957 +       }
1958 +
1959 +       return 0;
1960 +}
1961 +
1962 +static int persistent_prepare(struct exception_store *store,
1963 +                             struct exception *e)
1964 +{
1965 +       struct pstore *ps = get_info(store);
1966 +       uint32_t stride;
1967 +       sector_t size = get_dev_size(store->snap->cow->dev);
1968 +
1969 +       /* Is there enough room ? */
1970 +       if (size < ((ps->next_free + 1) * store->snap->chunk_size))
1971 +               return -ENOSPC;
1972 +
1973 +       e->new_chunk = ps->next_free;
1974 +
1975 +       /*
1976 +        * Move on to the next free chunk, making sure to take
1977 +        * into account the location of the metadata chunks.
1978 +        */
1979 +       stride = (ps->exceptions_per_area + 1);
1980 +       if ((++ps->next_free % stride) == 1)
1981 +               ps->next_free++;
1982 +
1983 +       atomic_inc(&ps->pending_count);
1984 +       return 0;
1985 +}
1986 +
1987 +static void persistent_commit(struct exception_store *store,
1988 +                             struct exception *e,
1989 +                             void (*callback) (void *, int success),
1990 +                             void *callback_context)
1991 +{
1992 +       int r;
1993 +       unsigned int i;
1994 +       struct pstore *ps = get_info(store);
1995 +       struct disk_exception de;
1996 +       struct commit_callback *cb;
1997 +
1998 +       de.old_chunk = e->old_chunk;
1999 +       de.new_chunk = e->new_chunk;
2000 +       write_exception(ps, ps->current_committed++, &de);
2001 +
2002 +       /*
2003 +        * Add the callback to the back of the array.  This code
2004 +        * is the only place where the callback array is
2005 +        * manipulated, and we know that it will never be called
2006 +        * multiple times concurrently.
2007 +        */
2008 +       cb = ps->callbacks + ps->callback_count++;
2009 +       cb->callback = callback;
2010 +       cb->context = callback_context;
2011 +
2012 +       /*
2013 +        * If there are no more exceptions in flight, or we have
2014 +        * filled this metadata area we commit the exceptions to
2015 +        * disk.
2016 +        */
2017 +       if (atomic_dec_and_test(&ps->pending_count) ||
2018 +           (ps->current_committed == ps->exceptions_per_area)) {
2019 +               r = area_io(ps, ps->current_area, WRITE);
2020 +               if (r)
2021 +                       ps->valid = 0;
2022 +
2023 +               for (i = 0; i < ps->callback_count; i++) {
2024 +                       cb = ps->callbacks + i;
2025 +                       cb->callback(cb->context, r == 0 ? 1 : 0);
2026 +               }
2027 +
2028 +               ps->callback_count = 0;
2029 +       }
2030 +
2031 +       /*
2032 +        * Have we completely filled the current area ?
2033 +        */
2034 +       if (ps->current_committed == ps->exceptions_per_area) {
2035 +               ps->current_committed = 0;
2036 +               r = zero_area(ps, ps->current_area + 1);
2037 +               if (r)
2038 +                       ps->valid = 0;
2039 +       }
2040 +}
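
persistent_commit() batches completions: callbacks queue up until either the last in-flight exception commits (pending_count drops to zero) or the area fills; the area is then written once and every queued callback fires with the same result. A userspace sketch of the pattern, with a plain int standing in for atomic_t and the area write faked:

#include <stdio.h>

typedef void (*commit_fn)(void *context, int success);

struct batcher {
        int pending;                            /* stands in for atomic_t */
        unsigned count;
        struct { commit_fn fn; void *ctx; } cbs[16];
};

static void commit_one(struct batcher *b, commit_fn fn, void *ctx,
                       int area_full)
{
        unsigned i;

        b->cbs[b->count].fn = fn;
        b->cbs[b->count].ctx = ctx;
        b->count++;

        /* flush when nothing is in flight or the area filled up */
        if (--b->pending == 0 || area_full) {
                int ok = 1;                     /* pretend the write worked */

                for (i = 0; i < b->count; i++)
                        b->cbs[i].fn(b->cbs[i].ctx, ok);
                b->count = 0;
        }
}

static void done(void *ctx, int success)
{
        printf("%s: %s\n", (char *) ctx, success ? "ok" : "failed");
}

int main(void)
{
        struct batcher b = { 2 };

        commit_one(&b, done, "first", 0);       /* queued: one still in flight */
        commit_one(&b, done, "second", 0);      /* last one: flushes both */
        return 0;
}
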
2041 +
2042 +static void persistent_drop(struct exception_store *store)
2043 +{
2044 +       struct pstore *ps = get_info(store);
2045 +
2046 +       ps->valid = 0;
2047 +       if (write_header(ps))
2048 +               DMWARN("write header failed");
2049 +}
2050 +
2051 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
2052 +{
2053 +       int r;
2054 +       struct pstore *ps;
2055 +
2056 +       r = dm_io_get(sectors_to_pages(chunk_size));
2057 +       if (r)
2058 +               return r;
2059 +
2060 +       /* allocate the pstore */
2061 +       ps = kmalloc(sizeof(*ps), GFP_KERNEL);
2062 +       if (!ps) {
2063 +               r = -ENOMEM;
2064 +               goto bad;
2065 +       }
2066 +
2067 +       ps->snap = store->snap;
+       ps->callbacks = NULL;   /* keeps the error path's vfree check safe */
2068 +       ps->valid = 1;
2069 +       ps->version = SNAPSHOT_DISK_VERSION;
2070 +       ps->chunk_size = chunk_size;
2071 +       ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
2072 +           sizeof(struct disk_exception);
2073 +       ps->next_free = 2;      /* skipping the header and first area */
2074 +       ps->current_committed = 0;
2075 +
2076 +       r = alloc_area(ps);
2077 +       if (r)
2078 +               goto bad;
2079 +
2080 +       /*
2081 +        * Allocate space for all the callbacks.
2082 +        */
2083 +       ps->callback_count = 0;
2084 +       atomic_set(&ps->pending_count, 0);
2085 +       ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
2086 +                                  sizeof(*ps->callbacks));
2087 +
2088 +       if (!ps->callbacks) {
2089 +               r = -ENOMEM;
2090 +               goto bad;
2091 +       }
2092 +
2093 +       store->destroy = persistent_destroy;
2094 +       store->read_metadata = persistent_read_metadata;
2095 +       store->prepare_exception = persistent_prepare;
2096 +       store->commit_exception = persistent_commit;
2097 +       store->drop_snapshot = persistent_drop;
2098 +       store->fraction_full = persistent_fraction_full;
2099 +       store->context = ps;
2100 +
2101 +       return 0;
2102 +
2103 +      bad:
2104 +       dm_io_put(sectors_to_pages(chunk_size));
2105 +       if (ps) {
2106 +               if (ps->callbacks)
2107 +                       vfree(ps->callbacks);
2108 +
2109 +               kfree(ps);
2110 +       }
2111 +       return r;
2112 +}
2113 +
2114 +/*-----------------------------------------------------------------
2115 + * Implementation of the store for non-persistent snapshots.
2116 + *---------------------------------------------------------------*/
2117 +struct transient_c {
2118 +       sector_t next_free;
2119 +};
2120 +
2121 +static void transient_destroy(struct exception_store *store)
2122 +{
2123 +       kfree(store->context);
2124 +}
2125 +
2126 +static int transient_read_metadata(struct exception_store *store)
2127 +{
2128 +       return 0;
2129 +}
2130 +
2131 +static int transient_prepare(struct exception_store *store, struct exception *e)
2132 +{
2133 +       struct transient_c *tc = (struct transient_c *) store->context;
2134 +       sector_t size = get_dev_size(store->snap->cow->dev);
2135 +
2136 +       if (size < (tc->next_free + store->snap->chunk_size))
2137 +               return -1;
2138 +
2139 +       e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
2140 +       tc->next_free += store->snap->chunk_size;
2141 +
2142 +       return 0;
2143 +}
2144 +
2145 +static void transient_commit(struct exception_store *store,
2146 +                            struct exception *e,
2147 +                            void (*callback) (void *, int success),
2148 +                            void *callback_context)
2149 +{
2150 +       /* Just succeed */
2151 +       callback(callback_context, 1);
2152 +}
2153 +
2154 +static void transient_fraction_full(struct exception_store *store,
2155 +                                   sector_t *numerator, sector_t *denominator)
2156 +{
2157 +       *numerator = ((struct transient_c *) store->context)->next_free;
2158 +       *denominator = get_dev_size(store->snap->cow->dev);
2159 +}
2160 +
2161 +int dm_create_transient(struct exception_store *store,
2162 +                       struct dm_snapshot *s, int blocksize)
2163 +{
2164 +       struct transient_c *tc;
2165 +
2166 +       memset(store, 0, sizeof(*store));
2167 +       store->destroy = transient_destroy;
2168 +       store->read_metadata = transient_read_metadata;
2169 +       store->prepare_exception = transient_prepare;
2170 +       store->commit_exception = transient_commit;
2171 +       store->fraction_full = transient_fraction_full;
2172 +       store->snap = s;
2173 +
2174 +       tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
2175 +       if (!tc)
2176 +               return -ENOMEM;
2177 +
2178 +       tc->next_free = 0;
2179 +       store->context = tc;
2180 +
2181 +       return 0;
2182 +}
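
The transient store is a simple bump allocator over the COW device: next_free advances by chunk_size sectors per exception and is lost on reboot. A minimal sketch, assuming sector_to_chunk() reduces to division by chunk_size (true for the chunk-aligned sizes in play here):

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

struct transient { sector_t next_free; };

/* Mirrors transient_prepare(): fail once the COW device is exhausted. */
static int prepare(struct transient *tc, sector_t dev_size,
                   sector_t chunk_size, sector_t *new_chunk)
{
        if (dev_size < tc->next_free + chunk_size)
                return -1;

        *new_chunk = tc->next_free / chunk_size;        /* sector_to_chunk */
        tc->next_free += chunk_size;
        return 0;
}

int main(void)
{
        struct transient tc = { 0 };
        sector_t chunk;

        /* a 64-sector device with 16-sector chunks yields chunks 0..3 */
        while (!prepare(&tc, 64, 16, &chunk))
                printf("allocated chunk %llu\n", (unsigned long long) chunk);
        return 0;
}
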
2183 diff -urN linux-2.4.24.org/drivers/md/dm.h linux-2.4.24/drivers/md/dm.h
2184 --- linux-2.4.24.org/drivers/md/dm.h    1970-01-01 01:00:00.000000000 +0100
2185 +++ linux-2.4.24/drivers/md/dm.h        2004-01-18 15:01:29.219471722 +0100
2186 @@ -0,0 +1,176 @@
2187 +/*
2188 + * Internal header file for device mapper
2189 + *
2190 + * Copyright (C) 2001, 2002 Sistina Software
2191 + *
2192 + * This file is released under the LGPL.
2193 + */
2194 +
2195 +#ifndef DM_INTERNAL_H
2196 +#define DM_INTERNAL_H
2197 +
2198 +#include <linux/fs.h>
2199 +#include <linux/device-mapper.h>
2200 +#include <linux/list.h>
2201 +#include <linux/blkdev.h>
2202 +
2203 +#define DM_NAME "device-mapper"
2204 +#define DMWARN(f, x...) printk(KERN_WARNING DM_NAME ": " f "\n" , ## x)
2205 +#define DMERR(f, x...) printk(KERN_ERR DM_NAME ": " f "\n" , ## x)
2206 +#define DMINFO(f, x...) printk(KERN_INFO DM_NAME ": " f "\n" , ## x)
2207 +
2208 +/*
2209 + * FIXME: I think this should be with the definition of sector_t
2210 + * in types.h.
2211 + */
2212 +#ifdef CONFIG_LBD
2213 +#define SECTOR_FORMAT "%Lu"
2214 +#else
2215 +#define SECTOR_FORMAT "%lu"
2216 +#endif
2217 +
2218 +#define SECTOR_SHIFT 9
2219 +#define SECTOR_SIZE (1 << SECTOR_SHIFT)
2220 +
2221 +extern struct block_device_operations dm_blk_dops;
2222 +
2223 +/*
2224 + * List of devices that a metadevice uses and should open/close.
2225 + */
2226 +struct dm_dev {
2227 +       struct list_head list;
2228 +
2229 +       atomic_t count;
2230 +       int mode;
2231 +       kdev_t dev;
2232 +       struct block_device *bdev;
2233 +};
2234 +
2235 +struct dm_table;
2236 +struct mapped_device;
2237 +
2238 +/*-----------------------------------------------------------------
2239 + * Functions for manipulating a struct mapped_device.
2240 + * Drop the reference with dm_put when you finish with the object.
2241 + *---------------------------------------------------------------*/
2242 +int dm_create(kdev_t dev, struct mapped_device **md);
2243 +
2244 +/*
2245 + * Reference counting for md.
2246 + */
2247 +void dm_get(struct mapped_device *md);
2248 +void dm_put(struct mapped_device *md);
2249 +
2250 +/*
2251 + * A device can still be used while suspended, but I/O is deferred.
2252 + */
2253 +int dm_suspend(struct mapped_device *md);
2254 +int dm_resume(struct mapped_device *md);
2255 +
2256 +/*
2257 + * The device must be suspended before calling this method.
2258 + */
2259 +int dm_swap_table(struct mapped_device *md, struct dm_table *t);
2260 +
2261 +/*
2262 + * Drop a reference on the table when you've finished with the
2263 + * result.
2264 + */
2265 +struct dm_table *dm_get_table(struct mapped_device *md);
2266 +
2267 +/*
2268 + * Event functions.
2269 + */
2270 +uint32_t dm_get_event_nr(struct mapped_device *md);
2271 +int dm_add_wait_queue(struct mapped_device *md, wait_queue_t *wq,
2272 +                     uint32_t event_nr);
2273 +void dm_remove_wait_queue(struct mapped_device *md, wait_queue_t *wq);
2274 +
2275 +/*
2276 + * Info functions.
2277 + */
2278 +kdev_t dm_kdev(struct mapped_device *md);
2279 +int dm_suspended(struct mapped_device *md);
2280 +
2281 +/*-----------------------------------------------------------------
2282 + * Functions for manipulating a table.  Tables are also reference
2283 + * counted.
2284 + *---------------------------------------------------------------*/
2285 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets);
2286 +
2287 +void dm_table_get(struct dm_table *t);
2288 +void dm_table_put(struct dm_table *t);
2289 +
2290 +int dm_table_add_target(struct dm_table *t, const char *type,
2291 +                       sector_t start, sector_t len, char *params);
2292 +int dm_table_complete(struct dm_table *t);
2293 +void dm_table_event_callback(struct dm_table *t,
2294 +                            void (*fn)(void *), void *context);
2295 +void dm_table_event(struct dm_table *t);
2296 +sector_t dm_table_get_size(struct dm_table *t);
2297 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
2298 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
2299 +unsigned int dm_table_get_num_targets(struct dm_table *t);
2300 +struct list_head *dm_table_get_devices(struct dm_table *t);
2301 +int dm_table_get_mode(struct dm_table *t);
2302 +void dm_table_suspend_targets(struct dm_table *t);
2303 +void dm_table_resume_targets(struct dm_table *t);
2304 +
2305 +/*-----------------------------------------------------------------
2306 + * A registry of target types.
2307 + *---------------------------------------------------------------*/
2308 +int dm_target_init(void);
2309 +void dm_target_exit(void);
2310 +struct target_type *dm_get_target_type(const char *name);
2311 +void dm_put_target_type(struct target_type *t);
2312 +
2313 +
2314 +/*-----------------------------------------------------------------
2315 + * Useful inlines.
2316 + *---------------------------------------------------------------*/
2317 +static inline int array_too_big(unsigned long fixed, unsigned long obj,
2318 +                               unsigned long num)
2319 +{
2320 +       return (num > (ULONG_MAX - fixed) / obj);
2321 +}
2322 +
2323 +/*
2324 + * ceiling(n / size) * size
2325 + */
2326 +static inline unsigned long dm_round_up(unsigned long n, unsigned long size)
2327 +{
2328 +       unsigned long r = n % size;
2329 +       return n + (r ? (size - r) : 0);
2330 +}
2331 +
2332 +/*
2333 + * Ceiling(n / size)
2334 + */
2335 +static inline unsigned long dm_div_up(unsigned long n, unsigned long size)
2336 +{
2337 +       return dm_round_up(n, size) / size;
2338 +}
2339 +
2340 +const char *dm_kdevname(kdev_t dev);
2341 +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
2342 +
2343 +/*
2344 + * The device-mapper can be driven through one of two interfaces;
2345 + * ioctl or filesystem, depending which patch you have applied.
2346 + */
2347 +int dm_interface_init(void);
2348 +void dm_interface_exit(void);
2349 +
2350 +/*
2351 + * Targets for linear, striped and snapshot mappings
2352 + */
2353 +int dm_linear_init(void);
2354 +void dm_linear_exit(void);
2355 +
2356 +int dm_stripe_init(void);
2357 +void dm_stripe_exit(void);
2358 +
2359 +int dm_snapshot_init(void);
2360 +void dm_snapshot_exit(void);
2361 +
2362 +#endif
2363 diff -urN linux-2.4.24.org/drivers/md/dm-io.c linux-2.4.24/drivers/md/dm-io.c
2364 --- linux-2.4.24.org/drivers/md/dm-io.c 1970-01-01 01:00:00.000000000 +0100
2365 +++ linux-2.4.24/drivers/md/dm-io.c     2004-01-18 15:01:25.790191115 +0100
2366 @@ -0,0 +1,361 @@
2367 +/*
2368 + * Copyright (C) 2003 Sistina Software
2369 + *
2370 + * This file is released under the GPL.
2371 + */
2372 +
2373 +#include "dm-io.h"
2374 +
2375 +#include <linux/mempool.h>
2376 +#include <linux/module.h>
2377 +#include <linux/slab.h>
2378 +#include <linux/sched.h>
2379 +#include <linux/bitops.h>
2380 +
2381 +/* FIXME: can we shrink this ? */
2382 +struct io_context {
2383 +       int rw;
2384 +       unsigned int error;
2385 +       atomic_t count;
2386 +       struct task_struct *sleeper;
2387 +       io_notify_fn callback;
2388 +       void *context;
2389 +};
2390 +
2391 +/*
2392 + * We maintain a pool of buffer heads for dispatching the io.
2393 + */
2394 +static unsigned int _num_bhs;
2395 +static mempool_t *_buffer_pool;
2396 +
2397 +/*
2398 + * io contexts are only dynamically allocated for asynchronous
2399 + * io.  Since async io is likely to be the majority of io, we'll
2400 + * have the same number of io contexts as buffer heads!  (FIXME:
2401 + * must reduce this).
2402 + */
2403 +static mempool_t *_io_pool;
2404 +
2405 +static void *alloc_bh(int gfp_mask, void *pool_data)
2406 +{
2407 +       struct buffer_head *bh;
2408 +
2409 +       bh = kmem_cache_alloc(bh_cachep, gfp_mask);
2410 +       if (bh) {
2411 +               bh->b_reqnext = NULL;
2412 +               init_waitqueue_head(&bh->b_wait);
2413 +               INIT_LIST_HEAD(&bh->b_inode_buffers);
2414 +       }
2415 +
2416 +       return bh;
2417 +}
2418 +
2419 +static void *alloc_io(int gfp_mask, void *pool_data)
2420 +{
2421 +       return kmalloc(sizeof(struct io_context), gfp_mask);
2422 +}
2423 +
2424 +static void free_io(void *element, void *pool_data)
2425 +{
2426 +       kfree(element);
2427 +}
2428 +
2429 +static unsigned int pages_to_buffers(unsigned int pages)
2430 +{
2431 +       return 4 * pages;       /* too many ? */
2432 +}
2433 +
2434 +static int resize_pool(unsigned int new_bhs)
2435 +{
2436 +       int r = 0;
2437 +
2438 +       if (_buffer_pool) {
2439 +               if (new_bhs == 0) {
2440 +                       /* free off the pools */
2441 +                       mempool_destroy(_buffer_pool);
2442 +                       mempool_destroy(_io_pool);
2443 +                       _buffer_pool = _io_pool = NULL;
2444 +               } else {
2445 +                       /* resize the pools */
2446 +                       r = mempool_resize(_buffer_pool, new_bhs, GFP_KERNEL);
2447 +                       if (!r)
2448 +                               r = mempool_resize(_io_pool,
2449 +                                                  new_bhs, GFP_KERNEL);
2450 +               }
2451 +       } else {
2452 +               /* create new pools */
2453 +               _buffer_pool = mempool_create(new_bhs, alloc_bh,
2454 +                                             mempool_free_slab, bh_cachep);
2455 +               if (!_buffer_pool)
2456 +                       r = -ENOMEM;
2457 +
2458 +               _io_pool = mempool_create(new_bhs, alloc_io, free_io, NULL);
2459 +               if (!_io_pool) {
2460 +                       mempool_destroy(_buffer_pool);
2461 +                       _buffer_pool = NULL;
2462 +                       r = -ENOMEM;
2463 +               }
2464 +       }
2465 +
2466 +       if (!r)
2467 +               _num_bhs = new_bhs;
2468 +
2469 +       return r;
2470 +}
2471 +
2472 +int dm_io_get(unsigned int num_pages)
2473 +{
2474 +       return resize_pool(_num_bhs + pages_to_buffers(num_pages));
2475 +}
2476 +
2477 +void dm_io_put(unsigned int num_pages)
2478 +{
2479 +       resize_pool(_num_bhs - pages_to_buffers(num_pages));
2480 +}
2481 +
2482 +/*-----------------------------------------------------------------
2483 + * We need to keep track of which region a buffer is doing io
2484 + * for.  In order to save a memory allocation we store this in an
2485 + * unused field of the buffer head, and provide these access
2486 + * functions.
2487 + *
2488 + * FIXME: add compile time check that an unsigned int can fit
2489 + * into a pointer.
2490 + *
2491 + *---------------------------------------------------------------*/
2492 +static inline void bh_set_region(struct buffer_head *bh, unsigned int region)
2493 +{
2494 +       bh->b_journal_head = (void *) region;
2495 +}
2496 +
2497 +static inline int bh_get_region(struct buffer_head *bh)
2498 +{
2499 +       return (unsigned int) bh->b_journal_head;
2500 +}
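
One way to honour the FIXME above is a compile-time assertion; BUILD_BUG_ON does not exist in 2.4, but the classic negative-array-size trick does the same job. A sketch (the macro and check are ours, not the patch's):

/* Fails to compile when cond is true: a char[-1] type is invalid. */
#define DM_BUILD_BUG_ON(cond) ((void) sizeof(char[1 - 2 * !!(cond)]))

int main(void)
{
        DM_BUILD_BUG_ON(sizeof(unsigned int) > sizeof(void *));
        return 0;
}
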
2501 +
2502 +/*-----------------------------------------------------------------
2503 + * We need an io object to keep track of the number of bhs that
2504 + * have been dispatched for a particular io.
2505 + *---------------------------------------------------------------*/
2506 +static void dec_count(struct io_context *io, unsigned int region, int error)
2507 +{
2508 +       if (error)
2509 +               set_bit(region, &io->error);
2510 +
2511 +       if (atomic_dec_and_test(&io->count)) {
2512 +               if (io->sleeper)
2513 +                       wake_up_process(io->sleeper);
2514 +
2515 +               else {
2516 +                       int r = io->error;
2517 +                       io_notify_fn fn = io->callback;
2518 +                       void *context = io->context;
2519 +
2520 +                       mempool_free(io, _io_pool);
2521 +                       fn(r, context);
2522 +               }
2523 +       }
2524 +}
2525 +
2526 +static void endio(struct buffer_head *bh, int uptodate)
2527 +{
2528 +       struct io_context *io = (struct io_context *) bh->b_private;
2529 +
2530 +       if (!uptodate && io->rw != WRITE) {
2531 +               /*
2532 +                * We need to zero this region, otherwise people
2533 +                * like kcopyd may write the arbitrary contents
2534 +                * of the page.
2535 +                */
2536 +               memset(bh->b_data, 0, bh->b_size);
2537 +       }
2538 +
2539 +       dec_count((struct io_context *) bh->b_private,
2540 +                 bh_get_region(bh), !uptodate);
2541 +       mempool_free(bh, _buffer_pool);
2542 +}
2543 +
2544 +/*
2545 + * Primitives for alignment calculations.
2546 + */
2547 +int fls(unsigned n)
2548 +{
2549 +       return generic_fls32(n);
2550 +}
2551 +
2552 +static inline int log2_floor(unsigned n)
2553 +{
2554 +       return ffs(n) - 1;
2555 +}
2556 +
2557 +static inline int log2_align(unsigned n)
2558 +{
2559 +       return fls(n) - 1;
2560 +}
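
log2_floor(n) is the index of the lowest set bit (the largest power of two dividing n); log2_align(n) is the index of the highest set bit (the largest power of two not exceeding n). A portable sketch with explicit bit loops in place of the kernel's ffs()/fls():

#include <stdio.h>

static unsigned log2_floor(unsigned n)  /* ffs(n) - 1; n must be non-zero */
{
        unsigned i = 0;

        while (!(n & 1)) {
                n >>= 1;
                i++;
        }
        return i;
}

static unsigned log2_align(unsigned n)  /* fls(n) - 1; n must be non-zero */
{
        unsigned i = 0;

        while (n >>= 1)
                i++;
        return i;
}

int main(void)
{
        printf("log2_floor(24) = %u\n", log2_floor(24)); /* 3: 24 = 8 * 3 */
        printf("log2_align(24) = %u\n", log2_align(24)); /* 4: 16 <= 24 < 32 */
        return 0;
}
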
2561 +
2562 +/*
2563 + * Dispatches io for one page; advances *block and returns
+ * non-zero once end_block has been reached.
2564 + */
2565 +static int do_page(kdev_t dev, sector_t *block, sector_t end_block,
2566 +                  unsigned int block_size,
2567 +                  struct page *p, unsigned int offset,
2568 +                  unsigned int region, struct io_context *io)
2569 +{
2570 +       struct buffer_head *bh;
2571 +       sector_t b = *block;
2572 +       sector_t blocks_per_page = PAGE_SIZE / block_size;
2573 +       unsigned int this_size; /* holds the size of the current io */
2574 +       sector_t len;
2575 +
2576 +       if (!blocks_per_page) {
2577 +               DMERR("dm-io: PAGE_SIZE (%lu) < block_size (%u) unsupported",
2578 +                     PAGE_SIZE, block_size);
2579 +               return 0;
2580 +       }
2581 +
2582 +       while ((offset < PAGE_SIZE) && (b != end_block)) {
2583 +               bh = mempool_alloc(_buffer_pool, GFP_NOIO);
2584 +               init_buffer(bh, endio, io);
2585 +               bh_set_region(bh, region);
2586 +
2587 +               /*
2588 +                * Block size must be a power of 2 and aligned
2589 +                * correctly.
2590 +                */
2591 +
2592 +               len = min(end_block - b, blocks_per_page);
2593 +               len = min(len, blocks_per_page - offset / block_size);
2594 +
2595 +               if (!len) {
2596 +                       DMERR("dm-io: Invalid offset/block_size (%u/%u).",
2597 +                             offset, block_size);
2598 +                       return 0;
2599 +               }
2600 +
2601 +               this_size = 1 << log2_align(len);
2602 +               if (b)
2603 +                       this_size = min(this_size,
2604 +                                       (unsigned) 1 << log2_floor(b));
2605 +
2606 +               /*
2607 +                * b_blocknr is expressed in units of this io's size.
2608 +                */
2609 +               bh->b_blocknr = (b / this_size);
2610 +               bh->b_size = block_size * this_size;
2611 +               set_bh_page(bh, p, offset);
2612 +               bh->b_this_page = bh;
2613 +
2614 +               bh->b_dev = dev;
2615 +               atomic_set(&bh->b_count, 1);
2616 +
2617 +               bh->b_state = ((1 << BH_Uptodate) | (1 << BH_Mapped) |
2618 +                              (1 << BH_Lock));
2619 +
2620 +               if (io->rw == WRITE)
2621 +                       clear_bit(BH_Dirty, &bh->b_state);
2622 +
2623 +               atomic_inc(&io->count);
2624 +               submit_bh(io->rw, bh);
2625 +
2626 +               b += this_size;
2627 +               offset += block_size * this_size;
2628 +       }
2629 +
2630 +       *block = b;
2631 +       return (b == end_block);
2632 +}
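
do_page() thus splits a run of blocks into power-of-two sized ios, each no larger than the remaining length and no larger than the alignment of its starting block. A standalone sketch of just that splitting, ignoring the page-size cap:

#include <stdio.h>

static unsigned log2_floor(unsigned n)  /* n must be non-zero */
{
        unsigned i = 0;

        while (!(n & 1)) {
                n >>= 1;
                i++;
        }
        return i;
}

static unsigned log2_align(unsigned n)  /* n must be non-zero */
{
        unsigned i = 0;

        while (n >>= 1)
                i++;
        return i;
}

int main(void)
{
        unsigned b = 3, end = 16;       /* hypothetical run: blocks 3..15 */

        while (b != end) {
                unsigned size = 1u << log2_align(end - b);

                if (b) {        /* keep each io aligned to its own size */
                        unsigned a = 1u << log2_floor(b);
                        if (a < size)
                                size = a;
                }

                printf("io at block %u, %u blocks\n", b, size);
                b += size;      /* emits ios of 1, 4 and 8 blocks */
        }
        return 0;
}
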
2633 +
2634 +static void do_region(unsigned int region, struct io_region *where,
2635 +                     struct page *page, unsigned int offset,
2636 +                     struct io_context *io)
2637 +{
2638 +       unsigned int block_size = get_hardsect_size(where->dev);
2639 +       unsigned int sblock_size = block_size >> 9;
2640 +       sector_t block = where->sector / sblock_size;
2641 +       sector_t end_block = (where->sector + where->count) / sblock_size;
2642 +
2643 +       while (1) {
2644 +               if (do_page(where->dev, &block, end_block, block_size,
2645 +                           page, offset, region, io))
2646 +                       break;
2647 +
2648 +               offset = 0;     /* only offset the first page */
2649 +
2650 +               page = list_entry(page->list.next, struct page, list);
2651 +       }
2652 +}
2653 +
2654 +static void dispatch_io(unsigned int num_regions, struct io_region *where,
2655 +                       struct page *pages, unsigned int offset,
2656 +                       struct io_context *io)
2657 +{
2658 +       int i;
2659 +
2660 +       for (i = 0; i < num_regions; i++)
2661 +               if (where[i].count)
2662 +                       do_region(i, where + i, pages, offset, io);
2663 +
2664 +       /*
2665 +        * Drop the extra reference that we were holding to avoid
2666 +        * the io being completed too early.
2667 +        */
2668 +       dec_count(io, 0, 0);
2669 +}
2670 +
2671 +/*
2672 + * Synchronous io
2673 + */
2674 +int dm_io_sync(unsigned int num_regions, struct io_region *where,
2675 +              int rw, struct page *pages, unsigned int offset,
2676 +              unsigned int *error_bits)
2677 +{
2678 +       struct io_context io;
2679 +
2680 +       BUG_ON(num_regions > 1 && rw != WRITE);
2681 +
2682 +       io.rw = rw;
2683 +       io.error = 0;
2684 +       atomic_set(&io.count, 1); /* see dispatch_io() */
2685 +       io.sleeper = current;
2686 +
2687 +       dispatch_io(num_regions, where, pages, offset, &io);
2688 +       run_task_queue(&tq_disk);
2689 +
2690 +       while (1) {
2691 +               set_current_state(TASK_UNINTERRUPTIBLE);
2692 +
2693 +               if (!atomic_read(&io.count))
2694 +                       break;
2695 +
2696 +               schedule();
2697 +       }
2698 +       set_current_state(TASK_RUNNING);
2699 +
2700 +       *error_bits = io.error;
2701 +       return io.error ? -EIO : 0;
2702 +}
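
On failure dec_count() sets bit 'region' in io->error, so the error_bits out-parameter of dm_io_sync() is a bitmask indexed by region number. A trivial decoding sketch:

#include <stdio.h>

int main(void)
{
        unsigned int error_bits = 0x5;  /* pretend regions 0 and 2 failed */
        unsigned int region;

        for (region = 0; region < 3; region++)
                if (error_bits & (1u << region))
                        printf("region %u: io failed\n", region);
        return 0;
}
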
2703 +
2704 +/*
2705 + * Asynchronous io
2706 + */
2707 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
2708 +               struct page *pages, unsigned int offset,
2709 +               io_notify_fn fn, void *context)
2710 +{
2711 +       struct io_context *io = mempool_alloc(_io_pool, GFP_NOIO);
2712 +
2713 +       io->rw = rw;
2714 +       io->error = 0;
2715 +       atomic_set(&io->count, 1); /* see dispatch_io() */
2716 +       io->sleeper = NULL;
2717 +       io->callback = fn;
2718 +       io->context = context;
2719 +
2720 +       dispatch_io(num_regions, where, pages, offset, io);
2721 +       return 0;
2722 +}
2723 +
2724 +EXPORT_SYMBOL(dm_io_get);
2725 +EXPORT_SYMBOL(dm_io_put);
2726 +EXPORT_SYMBOL(dm_io_sync);
2727 +EXPORT_SYMBOL(dm_io_async);
2728 diff -urN linux-2.4.24.org/drivers/md/dm-ioctl.c linux-2.4.24/drivers/md/dm-ioctl.c
2729 --- linux-2.4.24.org/drivers/md/dm-ioctl.c      1970-01-01 01:00:00.000000000 +0100
2730 +++ linux-2.4.24/drivers/md/dm-ioctl.c  2004-01-18 15:01:17.790869761 +0100
2731 @@ -0,0 +1,1284 @@
2732 +/*
2733 + * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
2734 + *
2735 + * This file is released under the GPL.
2736 + */
2737 +
2738 +#include "dm.h"
2739 +
2740 +#include <linux/module.h>
2741 +#include <linux/vmalloc.h>
2742 +#include <linux/miscdevice.h>
2743 +#include <linux/dm-ioctl.h>
2744 +#include <linux/init.h>
2745 +#include <linux/wait.h>
2746 +#include <linux/blk.h>
2747 +#include <linux/slab.h>
2748 +
2749 +#include <asm/uaccess.h>
2750 +
2751 +#define DM_DRIVER_EMAIL "dm@uk.sistina.com"
2752 +
2753 +/*-----------------------------------------------------------------
2754 + * The ioctl interface needs to be able to look up devices by
2755 + * name or uuid.
2756 + *---------------------------------------------------------------*/
2757 +struct hash_cell {
2758 +       struct list_head name_list;
2759 +       struct list_head uuid_list;
2760 +
2761 +       char *name;
2762 +       char *uuid;
2763 +       struct mapped_device *md;
2764 +       struct dm_table *new_map;
2765 +
2766 +       /* I hate devfs */
2767 +       devfs_handle_t devfs_entry;
2768 +};
2769 +
2770 +#define NUM_BUCKETS 64
2771 +#define MASK_BUCKETS (NUM_BUCKETS - 1)
2772 +static struct list_head _name_buckets[NUM_BUCKETS];
2773 +static struct list_head _uuid_buckets[NUM_BUCKETS];
2774 +
2775 +static devfs_handle_t _dev_dir;
2776 +void dm_hash_remove_all(void);
2777 +
2778 +/*
2779 + * Guards access to both hash tables.
2780 + */
2781 +static DECLARE_RWSEM(_hash_lock);
2782 +
2783 +static void init_buckets(struct list_head *buckets)
2784 +{
2785 +       unsigned int i;
2786 +
2787 +       for (i = 0; i < NUM_BUCKETS; i++)
2788 +               INIT_LIST_HEAD(buckets + i);
2789 +}
2790 +
2791 +int dm_hash_init(void)
2792 +{
2793 +       init_buckets(_name_buckets);
2794 +       init_buckets(_uuid_buckets);
2795 +       _dev_dir = devfs_mk_dir(0, DM_DIR, NULL);
2796 +       return 0;
2797 +}
2798 +
2799 +void dm_hash_exit(void)
2800 +{
2801 +       dm_hash_remove_all();
2802 +       devfs_unregister(_dev_dir);
2803 +}
2804 +
2805 +/*-----------------------------------------------------------------
2806 + * Hash function:
2807 + * We're not really concerned with the str hash function being
2808 + * fast since it's only used by the ioctl interface.
2809 + *---------------------------------------------------------------*/
2810 +static unsigned int hash_str(const char *str)
2811 +{
2812 +       const unsigned int hash_mult = 2654435387U;
2813 +       unsigned int h = 0;
2814 +
2815 +       while (*str)
2816 +               h = (h + (unsigned int) *str++) * hash_mult;
2817 +
2818 +       return h & MASK_BUCKETS;
2819 +}
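
hash_str() is a multiplicative string hash masked down to one of the 64 buckets; NUM_BUCKETS is a power of two, so masking with NUM_BUCKETS - 1 is a valid modulo. A userspace copy with made-up device names:

#include <stdio.h>

#define NUM_BUCKETS 64
#define MASK_BUCKETS (NUM_BUCKETS - 1)

static unsigned int hash_str(const char *str)
{
        const unsigned int hash_mult = 2654435387U;
        unsigned int h = 0;

        while (*str)
                h = (h + (unsigned int) *str++) * hash_mult;

        return h & MASK_BUCKETS;
}

int main(void)
{
        printf("bucket(\"vg0-root\") = %u\n", hash_str("vg0-root"));
        printf("bucket(\"vg0-swap\") = %u\n", hash_str("vg0-swap"));
        return 0;
}
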
2820 +
2821 +/*-----------------------------------------------------------------
2822 + * Code for looking up a device by name
2823 + *---------------------------------------------------------------*/
2824 +static struct hash_cell *__get_name_cell(const char *str)
2825 +{
2826 +       struct list_head *tmp;
2827 +       struct hash_cell *hc;
2828 +       unsigned int h = hash_str(str);
2829 +
2830 +       list_for_each (tmp, _name_buckets + h) {
2831 +               hc = list_entry(tmp, struct hash_cell, name_list);
2832 +               if (!strcmp(hc->name, str))
2833 +                       return hc;
2834 +       }
2835 +
2836 +       return NULL;
2837 +}
2838 +
2839 +static struct hash_cell *__get_uuid_cell(const char *str)
2840 +{
2841 +       struct list_head *tmp;
2842 +       struct hash_cell *hc;
2843 +       unsigned int h = hash_str(str);
2844 +
2845 +       list_for_each (tmp, _uuid_buckets + h) {
2846 +               hc = list_entry(tmp, struct hash_cell, uuid_list);
2847 +               if (!strcmp(hc->uuid, str))
2848 +                       return hc;
2849 +       }
2850 +
2851 +       return NULL;
2852 +}
2853 +
2854 +/*-----------------------------------------------------------------
2855 + * Inserting, removing and renaming a device.
2856 + *---------------------------------------------------------------*/
2857 +static inline char *kstrdup(const char *str)
2858 +{
2859 +       char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
2860 +       if (r)
2861 +               strcpy(r, str);
2862 +       return r;
2863 +}
2864 +
2865 +static struct hash_cell *alloc_cell(const char *name, const char *uuid,
2866 +                                   struct mapped_device *md)
2867 +{
2868 +       struct hash_cell *hc;
2869 +
2870 +       hc = kmalloc(sizeof(*hc), GFP_KERNEL);
2871 +       if (!hc)
2872 +               return NULL;
2873 +
2874 +       hc->name = kstrdup(name);
2875 +       if (!hc->name) {
2876 +               kfree(hc);
2877 +               return NULL;
2878 +       }
2879 +
2880 +       if (!uuid)
2881 +               hc->uuid = NULL;
2882 +
2883 +       else {
2884 +               hc->uuid = kstrdup(uuid);
2885 +               if (!hc->uuid) {
2886 +                       kfree(hc->name);
2887 +                       kfree(hc);
2888 +                       return NULL;
2889 +               }
2890 +       }
2891 +
2892 +       INIT_LIST_HEAD(&hc->name_list);
2893 +       INIT_LIST_HEAD(&hc->uuid_list);
2894 +       hc->md = md;
2895 +       hc->new_map = NULL;
2896 +       return hc;
2897 +}
2898 +
2899 +static void free_cell(struct hash_cell *hc)
2900 +{
2901 +       if (hc) {
2902 +               kfree(hc->name);
2903 +               kfree(hc->uuid);
2904 +               kfree(hc);
2905 +       }
2906 +}
2907 +
2908 +/*
2909 + * devfs stuff.
2910 + */
2911 +static int register_with_devfs(struct hash_cell *hc)
2912 +{
2913 +       kdev_t dev = dm_kdev(hc->md);
2914 +
2915 +       hc->devfs_entry =
2916 +           devfs_register(_dev_dir, hc->name, DEVFS_FL_CURRENT_OWNER,
2917 +                          major(dev), minor(dev),
2918 +                          S_IFBLK | S_IRUSR | S_IWUSR | S_IRGRP,
2919 +                          &dm_blk_dops, NULL);
2920 +
2921 +       return 0;
2922 +}
2923 +
2924 +static int unregister_with_devfs(struct hash_cell *hc)
2925 +{
2926 +       devfs_unregister(hc->devfs_entry);
2927 +       return 0;
2928 +}
2929 +
2930 +/*
2931 + * The kdev_t and uuid of a device can never change once it is
2932 + * initially inserted.
2933 + */
2934 +int dm_hash_insert(const char *name, const char *uuid, struct mapped_device *md)
2935 +{
2936 +       struct hash_cell *cell;
2937 +
2938 +       /*
2939 +        * Allocate the new cells.
2940 +        */
2941 +       cell = alloc_cell(name, uuid, md);
2942 +       if (!cell)
2943 +               return -ENOMEM;
2944 +
2945 +       /*
2946 +        * Insert the cell into both hash tables.
2947 +        */
2948 +       down_write(&_hash_lock);
2949 +       if (__get_name_cell(name))
2950 +               goto bad;
2951 +
2952 +       list_add(&cell->name_list, _name_buckets + hash_str(name));
2953 +
2954 +       if (uuid) {
2955 +               if (__get_uuid_cell(uuid)) {
2956 +                       list_del(&cell->name_list);
2957 +                       goto bad;
2958 +               }
2959 +               list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
2960 +       }
2961 +       register_with_devfs(cell);
2962 +       dm_get(md);
2963 +       up_write(&_hash_lock);
2964 +
2965 +       return 0;
2966 +
2967 +      bad:
2968 +       up_write(&_hash_lock);
2969 +       free_cell(cell);
2970 +       return -EBUSY;
2971 +}
2972 +
2973 +void __hash_remove(struct hash_cell *hc)
2974 +{
2975 +       /* remove from the dev hash */
2976 +       list_del(&hc->uuid_list);
2977 +       list_del(&hc->name_list);
2978 +       unregister_with_devfs(hc);
2979 +       dm_put(hc->md);
2980 +       if (hc->new_map)
2981 +               dm_table_put(hc->new_map);
2982 +       free_cell(hc);
2983 +}
2984 +
2985 +void dm_hash_remove_all(void)
2986 +{
2987 +       int i;
2988 +       struct hash_cell *hc;
2989 +       struct list_head *tmp, *n;
2990 +
2991 +       down_write(&_hash_lock);
2992 +       for (i = 0; i < NUM_BUCKETS; i++) {
2993 +               list_for_each_safe (tmp, n, _name_buckets + i) {
2994 +                       hc = list_entry(tmp, struct hash_cell, name_list);
2995 +                       __hash_remove(hc);
2996 +               }
2997 +       }
2998 +       up_write(&_hash_lock);
2999 +}
3000 +
3001 +int dm_hash_rename(const char *old, const char *new)
3002 +{
3003 +       char *new_name, *old_name;
3004 +       struct hash_cell *hc;
3005 +
3006 +       /*
3007 +        * duplicate new.
3008 +        */
3009 +       new_name = kstrdup(new);
3010 +       if (!new_name)
3011 +               return -ENOMEM;
3012 +
3013 +       down_write(&_hash_lock);
3014 +
3015 +       /*
3016 +        * Is new free ?
3017 +        */
3018 +       hc = __get_name_cell(new);
3019 +       if (hc) {
3020 +               DMWARN("asked to rename to an already existing name %s -> %s",
3021 +                      old, new);
3022 +               up_write(&_hash_lock);
3023 +               kfree(new_name);
3024 +               return -EBUSY;
3025 +       }
3026 +
3027 +       /*
3028 +        * Is there such a device as 'old' ?
3029 +        */
3030 +       hc = __get_name_cell(old);
3031 +       if (!hc) {
3032 +               DMWARN("asked to rename a non-existent device %s -> %s",
3033 +                      old, new);
3034 +               up_write(&_hash_lock);
3035 +               kfree(new_name);
3036 +               return -ENXIO;
3037 +       }
3038 +
3039 +       /*
3040 +        * rename and move the name cell.
3041 +        */
3042 +       list_del(&hc->name_list);
3043 +       old_name = hc->name;
3044 +       hc->name = new_name;
3045 +       list_add(&hc->name_list, _name_buckets + hash_str(new_name));
3046 +
3047 +       /* rename the device node in devfs */
3048 +       unregister_with_devfs(hc);
3049 +       register_with_devfs(hc);
3050 +
3051 +       up_write(&_hash_lock);
3052 +       kfree(old_name);
3053 +       return 0;
3054 +}
3055 +
3056 +/*-----------------------------------------------------------------
3057 + * Implementation of the ioctl commands
3058 + *---------------------------------------------------------------*/
3059 +/*
3060 + * All the ioctl commands get dispatched to functions with this
3061 + * prototype.
3062 + */
3063 +typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
3064 +
3065 +static int remove_all(struct dm_ioctl *param, size_t param_size)
3066 +{
3067 +       dm_hash_remove_all();
3068 +       param->data_size = 0;
3069 +       return 0;
3070 +}
3071 +
3072 +/*
3073 + * Round up the ptr to an 8-byte boundary.
3074 + */
3075 +#define ALIGN_MASK 7
3076 +static inline void *align_ptr(void *ptr)
3077 +{
3078 +       return (void *) (((size_t) (ptr + ALIGN_MASK)) & ~ALIGN_MASK);
3079 +}
3080 +
3081 +/*
3082 + * Retrieves the data payload buffer from an already allocated
3083 + * struct dm_ioctl.
3084 + */
3085 +static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
3086 +                              size_t *len)
3087 +{
3088 +       param->data_start = align_ptr(param + 1) - (void *) param;
3089 +
3090 +       if (param->data_start < param_size)
3091 +               *len = param_size - param->data_start;
3092 +       else
3093 +               *len = 0;
3094 +
3095 +       return ((void *) param) + param->data_start;
3096 +}
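
The reply therefore reuses the buffer copied in from userland: the dm_ioctl header comes first, the payload starts at the next 8-byte boundary after it, and data_start records that offset for userland. The kernel does the rounding with pointer arithmetic on the kmalloc'd (hence suitably aligned) buffer; the same arithmetic on a plain offset:

#include <stdio.h>

#define ALIGN_MASK 7

int main(void)
{
        unsigned long header = 30;      /* stand-in size; dm_ioctl is bigger */
        unsigned long data_start =
                (header + ALIGN_MASK) & ~(unsigned long) ALIGN_MASK;

        printf("data_start = %lu\n", data_start);  /* 30 rounds up to 32 */
        return 0;
}
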
3097 +
3098 +static int list_devices(struct dm_ioctl *param, size_t param_size)
3099 +{
3100 +       unsigned int i;
3101 +       struct hash_cell *hc;
3102 +       size_t len, needed = 0;
3103 +       struct dm_name_list *nl, *old_nl = NULL;
3104 +
3105 +       down_write(&_hash_lock);
3106 +
3107 +       /*
3108 +        * Loop through all the devices working out how much
3109 +        * space we need.
3110 +        */
3111 +       for (i = 0; i < NUM_BUCKETS; i++) {
3112 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
3113 +                       needed += sizeof(struct dm_name_list);
3114 +                       needed += strlen(hc->name);
3115 +                       needed += ALIGN_MASK;
3116 +               }
3117 +       }
3118 +
3119 +       /*
3120 +        * Grab our output buffer.
3121 +        */
3122 +       nl = get_result_buffer(param, param_size, &len);
3123 +       if (len < needed) {
3124 +               param->flags |= DM_BUFFER_FULL_FLAG;
3125 +               goto out;
3126 +       }
3127 +       param->data_size = param->data_start + needed;
3128 +
3129 +       nl->dev = 0;    /* Flags no data */
3130 +
3131 +       /*
3132 +        * Now loop through filling out the names.
3133 +        */
3134 +       for (i = 0; i < NUM_BUCKETS; i++) {
3135 +               list_for_each_entry (hc, _name_buckets + i, name_list) {
3136 +                       if (old_nl)
3137 +                               old_nl->next = (uint32_t) ((void *) nl -
3138 +                                                          (void *) old_nl);
3139 +
3140 +                       nl->dev = dm_kdev(hc->md);
3141 +                       nl->next = 0;
3142 +                       strcpy(nl->name, hc->name);
3143 +
3144 +                       old_nl = nl;
3145 +                       nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
3146 +               }
3147 +       }
3148 +
3149 + out:
3150 +       up_write(&_hash_lock);
3151 +       return 0;
3152 +}
3153 +
3154 +static int check_name(const char *name)
3155 +{
3156 +       if (strchr(name, '/')) {
3157 +               DMWARN("invalid device name");
3158 +               return -EINVAL;
3159 +       }
3160 +
3161 +       return 0;
3162 +}
3163 +
3164 +/*
3165 + * Fills in a dm_ioctl structure, ready for sending back to
3166 + * userland.
3167 + */
3168 +static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
3169 +{
3170 +       kdev_t dev = dm_kdev(md);
3171 +       struct dm_table *table;
3172 +       struct block_device *bdev;
3173 +
3174 +       param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
3175 +                         DM_ACTIVE_PRESENT_FLAG);
3176 +
3177 +       if (dm_suspended(md))
3178 +               param->flags |= DM_SUSPEND_FLAG;
3179 +
3180 +       param->dev = kdev_t_to_nr(dev);
3181 +
3182 +       if (is_read_only(dev))
3183 +               param->flags |= DM_READONLY_FLAG;
3184 +
3185 +       param->event_nr = dm_get_event_nr(md);
3186 +
3187 +       table = dm_get_table(md);
3188 +       if (table) {
3189 +               param->flags |= DM_ACTIVE_PRESENT_FLAG;
3190 +               param->target_count = dm_table_get_num_targets(table);
3191 +               dm_table_put(table);
3192 +       } else
3193 +               param->target_count = 0;
3194 +
3195 +       bdev = bdget(param->dev);
3196 +       if (!bdev)
3197 +               return -ENXIO;
3198 +       param->open_count = bdev->bd_openers;
3199 +       bdput(bdev);
3200 +
3201 +       return 0;
3202 +}
3203 +
3204 +static int dev_create(struct dm_ioctl *param, size_t param_size)
3205 +{
3206 +       int r;
3207 +       kdev_t dev = 0;
3208 +       struct mapped_device *md;
3209 +
3210 +       r = check_name(param->name);
3211 +       if (r)
3212 +               return r;
3213 +
3214 +       if (param->flags & DM_PERSISTENT_DEV_FLAG)
3215 +               dev = to_kdev_t(param->dev);
3216 +
3217 +       r = dm_create(dev, &md);
3218 +       if (r)
3219 +               return r;
3220 +
3221 +       r = dm_hash_insert(param->name, *param->uuid ? param->uuid : NULL, md);
3222 +       if (r) {
3223 +               dm_put(md);
3224 +               return r;
3225 +       }
3226 +
3227 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3228 +
3229 +       r = __dev_status(md, param);
3230 +       dm_put(md);
3231 +
3232 +       return r;
3233 +}
3234 +
3235 +/*
3236 + * Always use UUID for lookups if it's present, otherwise use name.
3237 + */
3238 +static inline struct hash_cell *__find_device_hash_cell(struct dm_ioctl *param)
3239 +{
3240 +       return *param->uuid ?
3241 +           __get_uuid_cell(param->uuid) : __get_name_cell(param->name);
3242 +}
3243 +
3244 +static inline struct mapped_device *find_device(struct dm_ioctl *param)
3245 +{
3246 +       struct hash_cell *hc;
3247 +       struct mapped_device *md = NULL;
3248 +
3249 +       down_read(&_hash_lock);
3250 +       hc = __find_device_hash_cell(param);
3251 +       if (hc) {
3252 +               md = hc->md;
3253 +
3254 +               /*
3255 +                * Sneakily write in both the name and the uuid
3256 +                * while we have the cell.
3257 +                */
3258 +               strncpy(param->name, hc->name, sizeof(param->name));
3259 +               if (hc->uuid)
3260 +                       strncpy(param->uuid, hc->uuid, sizeof(param->uuid) - 1);
3261 +               else
3262 +                       param->uuid[0] = '\0';
3263 +
3264 +               if (hc->new_map)
3265 +                       param->flags |= DM_INACTIVE_PRESENT_FLAG;
3266 +               else
3267 +                       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3268 +
3269 +               dm_get(md);
3270 +       }
3271 +       up_read(&_hash_lock);
3272 +
3273 +       return md;
3274 +}
3275 +
3276 +static int dev_remove(struct dm_ioctl *param, size_t param_size)
3277 +{
3278 +       struct hash_cell *hc;
3279 +
3280 +       down_write(&_hash_lock);
3281 +       hc = __find_device_hash_cell(param);
3282 +
3283 +       if (!hc) {
3284 +               DMWARN("device doesn't appear to be in the dev hash table.");
3285 +               up_write(&_hash_lock);
3286 +               return -ENXIO;
3287 +       }
3288 +
3289 +       __hash_remove(hc);
3290 +       up_write(&_hash_lock);
3291 +       param->data_size = 0;
3292 +       return 0;
3293 +}
3294 +
3295 +/*
3296 + * Check that a string doesn't overrun the chunk of
3297 + * memory we copied from userland.
3298 + */
3299 +static int invalid_str(char *str, void *end)
3300 +{
3301 +       while ((void *) str < end)
3302 +               if (!*str++)
3303 +                       return 0;
3304 +
3305 +       return -EINVAL;
3306 +}
3307 +
3308 +static int dev_rename(struct dm_ioctl *param, size_t param_size)
3309 +{
3310 +       int r;
3311 +       char *new_name = (char *) param + param->data_start;
3312 +
3313 +       if (new_name < (char *) (param + 1) ||
3314 +           invalid_str(new_name, (void *) param + param_size)) {
3315 +               DMWARN("Invalid new logical volume name supplied.");
3316 +               return -EINVAL;
3317 +       }
3318 +
3319 +       r = check_name(new_name);
3320 +       if (r)
3321 +               return r;
3322 +
3323 +       param->data_size = 0;
3324 +       return dm_hash_rename(param->name, new_name);
3325 +}
3326 +
3327 +static int do_suspend(struct dm_ioctl *param)
3328 +{
3329 +       int r = 0;
3330 +       struct mapped_device *md;
3331 +
3332 +       md = find_device(param);
3333 +       if (!md)
3334 +               return -ENXIO;
3335 +
3336 +       if (!dm_suspended(md))
3337 +               r = dm_suspend(md);
3338 +
3339 +       if (!r)
3340 +               r = __dev_status(md, param);
3341 +
3342 +       dm_put(md);
3343 +       return r;
3344 +}
3345 +
3346 +static int do_resume(struct dm_ioctl *param)
3347 +{
3348 +       int r = 0;
3349 +       struct hash_cell *hc;
3350 +       struct mapped_device *md;
3351 +       struct dm_table *new_map;
3352 +
3353 +       down_write(&_hash_lock);
3354 +
3355 +       hc = __find_device_hash_cell(param);
3356 +       if (!hc) {
3357 +               DMWARN("device doesn't appear to be in the dev hash table.");
3358 +               up_write(&_hash_lock);
3359 +               return -ENXIO;
3360 +       }
3361 +
3362 +       md = hc->md;
3363 +       dm_get(md);
3364 +
3365 +       new_map = hc->new_map;
3366 +       hc->new_map = NULL;
3367 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3368 +
3369 +       up_write(&_hash_lock);
3370 +
3371 +       /* Do we need to load a new map ? */
3372 +       if (new_map) {
3373 +               /* Suspend if it isn't already suspended */
3374 +               if (!dm_suspended(md))
3375 +                       dm_suspend(md);
3376 +
3377 +               r = dm_swap_table(md, new_map);
3378 +               if (r) {
3379 +                       dm_put(md);
3380 +                       dm_table_put(new_map);
3381 +                       return r;
3382 +               }
3383 +
3384 +               if (dm_table_get_mode(new_map) & FMODE_WRITE)
3385 +                       set_device_ro(dm_kdev(md), 0);
3386 +               else
3387 +                       set_device_ro(dm_kdev(md), 1);
3388 +
3389 +               dm_table_put(new_map);
3390 +       }
3391 +
3392 +       if (dm_suspended(md))
3393 +               r = dm_resume(md);
3394 +
3395 +       if (!r)
3396 +               r = __dev_status(md, param);
3397 +
3398 +       dm_put(md);
3399 +       return r;
3400 +}
3401 +
3402 +/*
3403 + * Set or unset the suspension state of a device.
3404 + * If the device is already in the requested state, we just return its status.
3405 + */
3406 +static int dev_suspend(struct dm_ioctl *param, size_t param_size)
3407 +{
3408 +       if (param->flags & DM_SUSPEND_FLAG)
3409 +               return do_suspend(param);
3410 +
3411 +       return do_resume(param);
3412 +}
3413 +
3414 +/*
3415 + * Copies device info back to user space, used by
3416 + * the create and info ioctls.
3417 + */
3418 +static int dev_status(struct dm_ioctl *param, size_t param_size)
3419 +{
3420 +       int r;
3421 +       struct mapped_device *md;
3422 +
3423 +       md = find_device(param);
3424 +       if (!md)
3425 +               return -ENXIO;
3426 +
3427 +       r = __dev_status(md, param);
3428 +       dm_put(md);
3429 +       return r;
3430 +}
3431 +
3432 +/*
3433 + * Build up the status struct for each target
3434 + */
3435 +static void retrieve_status(struct dm_table *table, struct dm_ioctl *param,
3436 +                           size_t param_size)
3437 +{
3438 +       unsigned int i, num_targets;
3439 +       struct dm_target_spec *spec;
3440 +       char *outbuf, *outptr;
3441 +       status_type_t type;
3442 +       size_t remaining, len, used = 0;
3443 +
3444 +       outptr = outbuf = get_result_buffer(param, param_size, &len);
3445 +
3446 +       if (param->flags & DM_STATUS_TABLE_FLAG)
3447 +               type = STATUSTYPE_TABLE;
3448 +       else
3449 +               type = STATUSTYPE_INFO;
3450 +
3451 +       /* Get all the target info */
3452 +       num_targets = dm_table_get_num_targets(table);
3453 +       for (i = 0; i < num_targets; i++) {
3454 +               struct dm_target *ti = dm_table_get_target(table, i);
3455 +
3456 +               remaining = len - (outptr - outbuf);
3457 +               if (remaining < sizeof(struct dm_target_spec)) {
3458 +                       param->flags |= DM_BUFFER_FULL_FLAG;
3459 +                       break;
3460 +               }
3461 +
3462 +               spec = (struct dm_target_spec *) outptr;
3463 +
3464 +               spec->status = 0;
3465 +               spec->sector_start = ti->begin;
3466 +               spec->length = ti->len;
3467 +               strncpy(spec->target_type, ti->type->name,
3468 +                       sizeof(spec->target_type));
3469 +
3470 +               outptr += sizeof(struct dm_target_spec);
3471 +               remaining = len - (outptr - outbuf);
3472 +
3473 +               /* Get the status/table string from the target driver */
3474 +               if (ti->type->status) {
3475 +                       if (ti->type->status(ti, type, outptr, remaining)) {
3476 +                               param->flags |= DM_BUFFER_FULL_FLAG;
3477 +                               break;
3478 +                       }
3479 +               } else
3480 +                       outptr[0] = '\0';
3481 +
3482 +               outptr += strlen(outptr) + 1;
3483 +               used = param->data_start + (outptr - outbuf);
3484 +
3485 +               outptr = align_ptr(outptr);
3486 +               spec->next = outptr - outbuf;
3487 +       }
3488 +
3489 +       if (used)
3490 +               param->data_size = used;
3491 +
3492 +       param->target_count = num_targets;
3493 +}
3494 +
3495 +/*
3496 + * Wait for a device to report an event
3497 + */
3498 +static int dev_wait(struct dm_ioctl *param, size_t param_size)
3499 +{
3500 +       int r;
3501 +       struct mapped_device *md;
3502 +       struct dm_table *table;
3503 +       DECLARE_WAITQUEUE(wq, current);
3504 +
3505 +       md = find_device(param);
3506 +       if (!md)
3507 +               return -ENXIO;
3508 +
3509 +       /*
3510 +        * Wait for a notification event
3511 +        */
3512 +       set_current_state(TASK_INTERRUPTIBLE);
3513 +       if (!dm_add_wait_queue(md, &wq, param->event_nr)) {
3514 +               schedule();
3515 +               dm_remove_wait_queue(md, &wq);
3516 +       }
3517 +       set_current_state(TASK_RUNNING);
3518 +
3519 +       /*
3520 +        * The userland program is going to want to know what
3521 +        * changed to trigger the event, so we may as well tell
3522 +        * it and save an ioctl.
3523 +        */
3524 +       r = __dev_status(md, param);
3525 +       if (r)
3526 +               goto out;
3527 +
3528 +       table = dm_get_table(md);
3529 +       if (table) {
3530 +               retrieve_status(table, param, param_size);
3531 +               dm_table_put(table);
3532 +       }
3533 +
3534 + out:
3535 +       dm_put(md);
3536 +       return r;
3537 +}
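A hedged userspace sketch of driving this ioctl (wait_for_event is hypothetical; ctl_fd is an open fd on the control node created by dm_interface_init() below, and real callers should retry with a bigger buffer if DM_BUFFER_FULL_FLAG is set on return):

	#include <string.h>
	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/dm-ioctl.h>

	int wait_for_event(int ctl_fd, const char *name, uint32_t event_nr)
	{
		char buf[16 * 1024];
		struct dm_ioctl *dmi = (struct dm_ioctl *) buf;

		memset(buf, 0, sizeof(buf));
		dmi->version[0] = DM_VERSION_MAJOR;	/* see check_version() below */
		dmi->version[1] = DM_VERSION_MINOR;
		dmi->version[2] = DM_VERSION_PATCHLEVEL;
		dmi->data_size = sizeof(buf);		/* whole buffer, header included */
		dmi->data_start = sizeof(*dmi);
		dmi->event_nr = event_nr;
		strncpy(dmi->name, name, sizeof(dmi->name) - 1);

		/* blocks in dev_wait() until the device reports an event;
		 * on return the buffer holds fresh status (retrieve_status) */
		return ioctl(ctl_fd, DM_DEV_WAIT, dmi);
	}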
3538 +
3539 +static inline int get_mode(struct dm_ioctl *param)
3540 +{
3541 +       int mode = FMODE_READ | FMODE_WRITE;
3542 +
3543 +       if (param->flags & DM_READONLY_FLAG)
3544 +               mode = FMODE_READ;
3545 +
3546 +       return mode;
3547 +}
3548 +
3549 +static int next_target(struct dm_target_spec *last, uint32_t next, void *end,
3550 +                      struct dm_target_spec **spec, char **target_params)
3551 +{
3552 +       *spec = (struct dm_target_spec *) ((unsigned char *) last + next);
3553 +       *target_params = (char *) (*spec + 1);
3554 +
3555 +       if (*spec < (last + 1))
3556 +               return -EINVAL;
3557 +
3558 +       return invalid_str(*target_params, end);
3559 +}
3560 +
3561 +static int populate_table(struct dm_table *table, struct dm_ioctl *param,
3562 +                         size_t param_size)
3563 +{
3564 +       int r;
3565 +       unsigned int i = 0;
3566 +       struct dm_target_spec *spec = (struct dm_target_spec *) param;
3567 +       uint32_t next = param->data_start;
3568 +       void *end = (void *) param + param_size;
3569 +       char *target_params;
3570 +
3571 +       if (!param->target_count) {
3572 +               DMWARN("populate_table: no targets specified");
3573 +               return -EINVAL;
3574 +       }
3575 +
3576 +       for (i = 0; i < param->target_count; i++) {
3577 +
3578 +               r = next_target(spec, next, end, &spec, &target_params);
3579 +               if (r) {
3580 +                       DMWARN("unable to find target");
3581 +                       return r;
3582 +               }
3583 +
3584 +               r = dm_table_add_target(table, spec->target_type,
3585 +                                       (sector_t) spec->sector_start,
3586 +                                       (sector_t) spec->length,
3587 +                                       target_params);
3588 +               if (r) {
3589 +                       DMWARN("error adding target to table");
3590 +                       return r;
3591 +               }
3592 +
3593 +               next = spec->next;
3594 +       }
3595 +
3596 +       return dm_table_complete(table);
3597 +}
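Note the chaining convention on the load path: next_target() adds spec->next to the address of the current spec, and the first spec sits at param->data_start. A sketch of how userspace might pack one target for DM_TABLE_LOAD (pack_target is hypothetical; it assumes a zeroed buffer and the 8-byte alignment that align_ptr() uses):

	#include <string.h>
	#include <stdint.h>
	#include <linux/dm-ioctl.h>

	/* Returns the aligned size consumed so the caller can place the
	 * next spec right after this one. */
	size_t pack_target(char *at, uint64_t start, uint64_t len,
			   const char *type, const char *params)
	{
		struct dm_target_spec *spec = (struct dm_target_spec *) at;
		size_t used = sizeof(*spec) + strlen(params) + 1;

		spec->sector_start = start;
		spec->length = len;
		spec->status = 0;
		strncpy(spec->target_type, type, sizeof(spec->target_type) - 1);
		strcpy((char *) (spec + 1), params);	/* parameter string */

		used = (used + 7) & ~(size_t) 7;	/* keep next spec aligned */
		spec->next = used;			/* relative to this spec */
		return used;
	}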
3598 +
3599 +static int table_load(struct dm_ioctl *param, size_t param_size)
3600 +{
3601 +       int r;
3602 +       struct hash_cell *hc;
3603 +       struct dm_table *t;
3604 +
3605 +       r = dm_table_create(&t, get_mode(param), param->target_count);
3606 +       if (r)
3607 +               return r;
3608 +
3609 +       r = populate_table(t, param, param_size);
3610 +       if (r) {
3611 +               dm_table_put(t);
3612 +               return r;
3613 +       }
3614 +
3615 +       down_write(&_hash_lock);
3616 +       hc = __find_device_hash_cell(param);
3617 +       if (!hc) {
3618 +               DMWARN("device doesn't appear to be in the dev hash table.");
3619 +               up_write(&_hash_lock);
3620 +               return -ENXIO;
3621 +       }
3622 +
3623 +       if (hc->new_map)
3624 +               dm_table_put(hc->new_map);
3625 +       hc->new_map = t;
3626 +       param->flags |= DM_INACTIVE_PRESENT_FLAG;
3627 +
3628 +       r = __dev_status(hc->md, param);
3629 +       up_write(&_hash_lock);
3630 +       return r;
3631 +}
3632 +
3633 +static int table_clear(struct dm_ioctl *param, size_t param_size)
3634 +{
3635 +       int r;
3636 +       struct hash_cell *hc;
3637 +
3638 +       down_write(&_hash_lock);
3639 +
3640 +       hc = __find_device_hash_cell(param);
3641 +       if (!hc) {
3642 +               DMWARN("device doesn't appear to be in the dev hash table.");
3643 +               up_write(&_hash_lock);
3644 +               return -ENXIO;
3645 +       }
3646 +
3647 +       if (hc->new_map) {
3648 +               dm_table_put(hc->new_map);
3649 +               hc->new_map = NULL;
3650 +       }
3651 +
3652 +       param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
3653 +
3654 +       r = __dev_status(hc->md, param);
3655 +       up_write(&_hash_lock);
3656 +       return r;
3657 +}
3658 +
3659 +/*
3660 + * Retrieves a list of devices used by a particular dm device.
3661 + */
3662 +static void retrieve_deps(struct dm_table *table, struct dm_ioctl *param,
3663 +                         size_t param_size)
3664 +{
3665 +       unsigned int count = 0;
3666 +       struct list_head *tmp;
3667 +       size_t len, needed;
3668 +       struct dm_target_deps *deps;
3669 +
3670 +       deps = get_result_buffer(param, param_size, &len);
3671 +
3672 +       /*
3673 +        * Count the devices.
3674 +        */
3675 +       list_for_each(tmp, dm_table_get_devices(table))
3676 +               count++;
3677 +
3678 +       /*
3679 +        * Check we have enough space.
3680 +        */
3681 +       needed = sizeof(*deps) + (sizeof(*deps->dev) * count);
3682 +       if (len < needed) {
3683 +               param->flags |= DM_BUFFER_FULL_FLAG;
3684 +               return;
3685 +       }
3686 +
3687 +       /*
3688 +        * Fill in the devices.
3689 +        */
3690 +       deps->count = count;
3691 +       count = 0;
3692 +       list_for_each(tmp, dm_table_get_devices(table)) {
3693 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
3694 +               deps->dev[count++] = dd->bdev->bd_dev;
3695 +       }
3696 +
3697 +       param->data_size = param->data_start + needed;
3698 +}
3699 +
3700 +static int table_deps(struct dm_ioctl *param, size_t param_size)
3701 +{
3702 +       int r;
3703 +       struct mapped_device *md;
3704 +       struct dm_table *table;
3705 +
3706 +       md = find_device(param);
3707 +       if (!md)
3708 +               return -ENXIO;
3709 +
3710 +       r = __dev_status(md, param);
3711 +       if (r)
3712 +               goto out;
3713 +
3714 +       table = dm_get_table(md);
3715 +       if (table) {
3716 +               retrieve_deps(table, param, param_size);
3717 +               dm_table_put(table);
3718 +       }
3719 +
3720 + out:
3721 +       dm_put(md);
3722 +       return r;
3723 +}
3724 +
3725 +/*
3726 + * Return the status of a device as a text string for each
3727 + * target.
3728 + */
3729 +static int table_status(struct dm_ioctl *param, size_t param_size)
3730 +{
3731 +       int r;
3732 +       struct mapped_device *md;
3733 +       struct dm_table *table;
3734 +
3735 +       md = find_device(param);
3736 +       if (!md)
3737 +               return -ENXIO;
3738 +
3739 +       r = __dev_status(md, param);
3740 +       if (r)
3741 +               goto out;
3742 +
3743 +       table = dm_get_table(md);
3744 +       if (table) {
3745 +               retrieve_status(table, param, param_size);
3746 +               dm_table_put(table);
3747 +       }
3748 +
3749 + out:
3750 +       dm_put(md);
3751 +       return r;
3752 +}
3753 +
3754 +/*-----------------------------------------------------------------
3755 + * Implementation of open/close/ioctl on the special char
3756 + * device.
3757 + *---------------------------------------------------------------*/
3758 +static ioctl_fn lookup_ioctl(unsigned int cmd)
3759 +{
3760 +       static struct {
3761 +               int cmd;
3762 +               ioctl_fn fn;
3763 +       } _ioctls[] = {
3764 +               {DM_VERSION_CMD, NULL}, /* version is dealt with elsewhere */
3765 +               {DM_REMOVE_ALL_CMD, remove_all},
3766 +               {DM_LIST_DEVICES_CMD, list_devices},
3767 +
3768 +               {DM_DEV_CREATE_CMD, dev_create},
3769 +               {DM_DEV_REMOVE_CMD, dev_remove},
3770 +               {DM_DEV_RENAME_CMD, dev_rename},
3771 +               {DM_DEV_SUSPEND_CMD, dev_suspend},
3772 +               {DM_DEV_STATUS_CMD, dev_status},
3773 +               {DM_DEV_WAIT_CMD, dev_wait},
3774 +
3775 +               {DM_TABLE_LOAD_CMD, table_load},
3776 +               {DM_TABLE_CLEAR_CMD, table_clear},
3777 +               {DM_TABLE_DEPS_CMD, table_deps},
3778 +               {DM_TABLE_STATUS_CMD, table_status}
3779 +       };
3780 +
3781 +       return (cmd >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[cmd].fn;
3782 +}
3783 +
3784 +/*
3785 + * As well as checking the version compatibility this always
3786 + * copies the kernel interface version out.
3787 + */
3788 +static int check_version(unsigned int cmd, struct dm_ioctl *user)
3789 +{
3790 +       uint32_t version[3];
3791 +       int r = 0;
3792 +
3793 +       if (copy_from_user(version, user->version, sizeof(version)))
3794 +               return -EFAULT;
3795 +
3796 +       if ((DM_VERSION_MAJOR != version[0]) ||
3797 +           (DM_VERSION_MINOR < version[1])) {
3798 +               DMWARN("ioctl interface mismatch: "
3799 +                      "kernel(%u.%u.%u), user(%u.%u.%u), cmd(%d)",
3800 +                      DM_VERSION_MAJOR, DM_VERSION_MINOR,
3801 +                      DM_VERSION_PATCHLEVEL,
3802 +                      version[0], version[1], version[2], cmd);
3803 +               r = -EINVAL;
3804 +       }
3805 +
3806 +       /*
3807 +        * Fill in the kernel version.
3808 +        */
3809 +       version[0] = DM_VERSION_MAJOR;
3810 +       version[1] = DM_VERSION_MINOR;
3811 +       version[2] = DM_VERSION_PATCHLEVEL;
3812 +       if (copy_to_user(user->version, version, sizeof(version)))
3813 +               return -EFAULT;
3814 +
3815 +       return r;
3816 +}
3817 +
3818 +static void free_params(struct dm_ioctl *param)
3819 +{
3820 +       vfree(param);
3821 +}
3822 +
3823 +static int copy_params(struct dm_ioctl *user, struct dm_ioctl **param)
3824 +{
3825 +       struct dm_ioctl tmp, *dmi;
3826 +
3827 +       if (copy_from_user(&tmp, user, sizeof(tmp)))
3828 +               return -EFAULT;
3829 +
3830 +       if (tmp.data_size < sizeof(tmp))
3831 +               return -EINVAL;
3832 +
3833 +       dmi = (struct dm_ioctl *) vmalloc(tmp.data_size);
3834 +       if (!dmi)
3835 +               return -ENOMEM;
3836 +
3837 +       if (copy_from_user(dmi, user, tmp.data_size)) {
3838 +               vfree(dmi);
3839 +               return -EFAULT;
3840 +       }
3841 +
3842 +       *param = dmi;
3843 +       return 0;
3844 +}
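The userspace half of this contract, as a sketch (alloc_dmi is a hypothetical helper): data_size counts the whole buffer, header included, and data_start marks where the payload begins.

	#include <stdlib.h>
	#include <linux/dm-ioctl.h>

	static struct dm_ioctl *alloc_dmi(size_t payload)
	{
		struct dm_ioctl *dmi = calloc(1, sizeof(*dmi) + payload);

		if (dmi) {
			dmi->data_size = sizeof(*dmi) + payload;
			dmi->data_start = sizeof(*dmi);
		}
		return dmi;
	}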
3845 +
3846 +static int validate_params(uint cmd, struct dm_ioctl *param)
3847 +{
3848 +       /* Always clear this flag */
3849 +       param->flags &= ~DM_BUFFER_FULL_FLAG;
3850 +
3851 +       /* Ignores parameters */
3852 +       if (cmd == DM_REMOVE_ALL_CMD || cmd == DM_LIST_DEVICES_CMD)
3853 +               return 0;
3854 +
3855 +       /* Unless creating, either name or uuid but not both */
3856 +       if (cmd != DM_DEV_CREATE_CMD) {
3857 +               if ((!*param->uuid && !*param->name) ||
3858 +                   (*param->uuid && *param->name)) {
3859 +                       DMWARN("exactly one of name or uuid must be "
3860 +                              "supplied, cmd(%u)", cmd);
3861 +                       return -EINVAL;
3862 +               }
3863 +       }
3864 +
3865 +       /* Ensure strings are terminated */
3866 +       param->name[DM_NAME_LEN - 1] = '\0';
3867 +       param->uuid[DM_UUID_LEN - 1] = '\0';
3868 +
3869 +       return 0;
3870 +}
3871 +
3872 +static int ctl_ioctl(struct inode *inode, struct file *file,
3873 +                    uint command, ulong u)
3874 +{
3875 +       int r = 0;
3876 +       unsigned int cmd;
3877 +       struct dm_ioctl *param;
3878 +       struct dm_ioctl *user = (struct dm_ioctl *) u;
3879 +       ioctl_fn fn = NULL;
3880 +       size_t param_size;
3881 +
3882 +       /* only root can play with this */
3883 +       if (!capable(CAP_SYS_ADMIN))
3884 +               return -EACCES;
3885 +
3886 +       if (_IOC_TYPE(command) != DM_IOCTL)
3887 +               return -ENOTTY;
3888 +
3889 +       cmd = _IOC_NR(command);
3890 +
3891 +       /*
3892 +        * Check the interface version passed in.  This also
3893 +        * writes out the kernel's interface version.
3894 +        */
3895 +       r = check_version(cmd, user);
3896 +       if (r)
3897 +               return r;
3898 +
3899 +       /*
3900 +        * Nothing more to do for the version command.
3901 +        */
3902 +       if (cmd == DM_VERSION_CMD)
3903 +               return 0;
3904 +
3905 +       fn = lookup_ioctl(cmd);
3906 +       if (!fn) {
3907 +               DMWARN("dm_ctl_ioctl: unknown command 0x%x", command);
3908 +               return -ENOTTY;
3909 +       }
3910 +
3911 +       /*
3912 +        * FIXME: I don't like this, we're trying to avoid low
3913 +        * memory issues when a device is suspended.
3914 +        */
3915 +       current->flags |= PF_MEMALLOC;
3916 +
3917 +       /*
3918 +        * Copy the parameters into kernel space.
3919 +        */
3920 +       r = copy_params(user, &param);
3921 +       if (r) {
3922 +               current->flags &= ~PF_MEMALLOC;
3923 +               return r;
3924 +       }
3925 +
3926 +       r = validate_params(cmd, param);
3927 +       if (r)
3928 +               goto out;
3929 +
3930 +       param_size = param->data_size;
3931 +       param->data_size = sizeof(*param);
3932 +       r = fn(param, param_size);
3933 +
3934 +       /*
3935 +        * Copy the results back to userland.
3936 +        */
3937 +       if (!r && copy_to_user(user, param, param->data_size))
3938 +               r = -EFAULT;
3939 +
3940 + out:
3941 +       free_params(param);
3942 +       current->flags &= ~PF_MEMALLOC;
3943 +       return r;
3944 +}
3945 +
3946 +static struct file_operations _ctl_fops = {
3947 +       .ioctl   = ctl_ioctl,
3948 +       .owner   = THIS_MODULE,
3949 +};
3950 +
3951 +static devfs_handle_t _ctl_handle;
3952 +
3953 +static struct miscdevice _dm_misc = {
3954 +       .minor = MISC_DYNAMIC_MINOR,
3955 +       .name  = DM_NAME,
3956 +       .fops  = &_ctl_fops
3957 +};
3958 +
3959 +/*
3960 + * Create misc character device and link to DM_DIR/control.
3961 + */
3962 +int __init dm_interface_init(void)
3963 +{
3964 +       int r;
3965 +       char rname[64];
3966 +
3967 +       r = dm_hash_init();
3968 +       if (r)
3969 +               return r;
3970 +
3971 +       r = misc_register(&_dm_misc);
3972 +       if (r) {
3973 +               DMERR("misc_register failed for control device");
3974 +               dm_hash_exit();
3975 +               return r;
3976 +       }
3977 +
3978 +       r = devfs_generate_path(_dm_misc.devfs_handle, rname + 3,
3979 +                               sizeof rname - 3);
3980 +       if (r == -ENOSYS)
3981 +               goto done;      /* devfs not present */
3982 +
3983 +       if (r < 0) {
3984 +               DMERR("devfs_generate_path failed for control device");
3985 +               goto failed;
3986 +       }
3987 +
3988 +       strncpy(rname + r, "../", 3);
3989 +       r = devfs_mk_symlink(NULL, DM_DIR "/control",
3990 +                            DEVFS_FL_DEFAULT, rname + r, &_ctl_handle, NULL);
3991 +       if (r) {
3992 +               DMERR("devfs_mk_symlink failed for control device");
3993 +               goto failed;
3994 +       }
3995 +       devfs_auto_unregister(_dm_misc.devfs_handle, _ctl_handle);
3996 +
3997 +      done:
3998 +       DMINFO("%d.%d.%d%s initialised: %s", DM_VERSION_MAJOR,
3999 +              DM_VERSION_MINOR, DM_VERSION_PATCHLEVEL, DM_VERSION_EXTRA,
4000 +              DM_DRIVER_EMAIL);
4001 +       return 0;
4002 +
4003 +      failed:
4004 +       misc_deregister(&_dm_misc);
4005 +       dm_hash_exit();
4006 +       return r;
4007 +}
4008 +
4009 +void dm_interface_exit(void)
4010 +{
4011 +       if (misc_deregister(&_dm_misc) < 0)
4012 +               DMERR("misc_deregister failed for control device");
4013 +
4014 +       dm_hash_exit();
4015 +}
4016 diff -urN linux-2.4.24.org/drivers/md/dm-io.h linux-2.4.24/drivers/md/dm-io.h
4017 --- linux-2.4.24.org/drivers/md/dm-io.h 1970-01-01 01:00:00.000000000 +0100
4018 +++ linux-2.4.24/drivers/md/dm-io.h     2004-01-18 15:01:25.794190275 +0100
4019 @@ -0,0 +1,86 @@
4020 +/*
4021 + * Copyright (C) 2003 Sistina Software
4022 + *
4023 + * This file is released under the GPL.
4024 + */
4025 +
4026 +#ifndef _DM_IO_H
4027 +#define _DM_IO_H
4028 +
4029 +#include "dm.h"
4030 +
4031 +#include <linux/list.h>
4032 +
4033 +/* Move these to bitops.h eventually */
4034 +/* Improved generic_fls algorithm (in 2.4 there is no generic_fls so far) */
4035 +/* (c) 2002, D.Phillips and Sistina Software */
4036 +/* Licensed under Version 2 of the GPL */
4037 +
4038 +static inline unsigned generic_fls8(unsigned n)
4039 +{
4040 +       return n & 0xf0 ?
4041 +           n & 0xc0 ? (n >> 7) + 7 : (n >> 5) + 5 :
4042 +           n & 0x0c ? (n >> 3) + 3 : n - ((n + 1) >> 2);
4043 +}
4044 +
4045 +static inline unsigned generic_fls16(unsigned n)
4046 +{
4047 +       return n & 0xff00 ? generic_fls8(n >> 8) + 8 : generic_fls8(n);
4048 +}
4049 +
4050 +static inline unsigned generic_fls32(unsigned n)
4051 +{
4052 +       return  n & 0xffff0000 ? generic_fls16(n >> 16) + 16 : generic_fls16(n);
4053 +}
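The ladder computes a 1-based find-last-set, with fls(0) == 0. A few spot checks, as a sketch:

	#include <assert.h>

	static void fls_selftest(void)
	{
		assert(generic_fls32(0) == 0);		/* no bits set */
		assert(generic_fls32(1) == 1);		/* bit 0 -> index 1 */
		assert(generic_fls32(0x0900) == 12);	/* bit 11 is highest */
		assert(generic_fls32(0x80000000u) == 32);
	}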
4054 +
4055 +/* FIXME make this configurable */
4056 +#define DM_MAX_IO_REGIONS 8
4057 +
4058 +struct io_region {
4059 +       kdev_t dev;
4060 +       sector_t sector;
4061 +       sector_t count;
4062 +};
4063 +
4064 +
4065 +/*
4066 + * 'error' is a bitset, with each bit indicating whether an error
4067 + * occurred doing io to the corresponding region.
4068 + */
4069 +typedef void (*io_notify_fn)(unsigned int error, void *context);
4070 +
4071 +
4072 +/*
4073 + * Before anyone uses the IO interface they should call
4074 + * dm_io_get(), specifying roughly how many pages they are
4075 + * expecting to perform io on concurrently.
4076 + *
4077 + * This function may block.
4078 + */
4079 +int dm_io_get(unsigned int num_pages);
4080 +void dm_io_put(unsigned int num_pages);
4081 +
4082 +
4083 +/*
4084 + * Synchronous IO.
4085 + *
4086 + * Please ensure that the rw flag in the next two functions is
4087 + * either READ or WRITE, i.e. we don't take READA.  Any
4088 + * regions with a zero count field will be ignored.
4089 + */
4090 +int dm_io_sync(unsigned int num_regions, struct io_region *where, int rw,
4091 +              struct page *pages, unsigned int offset,
4092 +              unsigned int *error_bits);
4093 +
4094 +
4095 +/*
4096 + * Asynchronous IO.
4097 + *
4098 + * The 'where' array may be safely allocated on the stack since
4099 + * the function takes a copy.
4100 + */
4101 +int dm_io_async(unsigned int num_regions, struct io_region *where, int rw,
4102 +               struct page *pages, unsigned int offset,
4103 +               io_notify_fn fn, void *context);
4104 +
4105 +#endif
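A sketch of the calling convention the declarations above imply (mirror_one_chunk is hypothetical; real clients would reserve pages once at construction time rather than per call):

	/* Copy 'count' sectors between two devices through the supplied
	 * pages using the synchronous interface. */
	static int mirror_one_chunk(kdev_t from, kdev_t to, sector_t sector,
				    sector_t count, struct page *pages,
				    unsigned int offset)
	{
		struct io_region io;
		unsigned int error_bits;
		int r;

		r = dm_io_get(1);	/* reserve before doing io */
		if (r)
			return r;

		io.dev = from;
		io.sector = sector;
		io.count = count;
		r = dm_io_sync(1, &io, READ, pages, offset, &error_bits);

		if (!r) {
			io.dev = to;
			r = dm_io_sync(1, &io, WRITE, pages, offset,
				       &error_bits);
		}

		dm_io_put(1);
		return r;
	}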
4106 diff -urN linux-2.4.24.org/drivers/md/dm-linear.c linux-2.4.24/drivers/md/dm-linear.c
4107 --- linux-2.4.24.org/drivers/md/dm-linear.c     1970-01-01 01:00:00.000000000 +0100
4108 +++ linux-2.4.24/drivers/md/dm-linear.c 2004-01-18 15:01:13.777712209 +0100
4109 @@ -0,0 +1,123 @@
4110 +/*
4111 + * Copyright (C) 2001 Sistina Software (UK) Limited.
4112 + *
4113 + * This file is released under the GPL.
4114 + */
4115 +
4116 +#include "dm.h"
4117 +
4118 +#include <linux/module.h>
4119 +#include <linux/init.h>
4120 +#include <linux/blkdev.h>
4121 +#include <linux/slab.h>
4122 +
4123 +/*
4124 + * Linear: maps a linear range of a device.
4125 + */
4126 +struct linear_c {
4127 +       struct dm_dev *dev;
4128 +       sector_t start;
4129 +};
4130 +
4131 +/*
4132 + * Construct a linear mapping: <dev_path> <offset>
4133 + */
4134 +static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4135 +{
4136 +       struct linear_c *lc;
4137 +
4138 +       if (argc != 2) {
4139 +               ti->error = "dm-linear: Invalid argument count";
4140 +               return -EINVAL;
4141 +       }
4142 +
4143 +       lc = kmalloc(sizeof(*lc), GFP_KERNEL);
4144 +       if (lc == NULL) {
4145 +               ti->error = "dm-linear: Cannot allocate linear context";
4146 +               return -ENOMEM;
4147 +       }
4148 +
4149 +       if (sscanf(argv[1], SECTOR_FORMAT, &lc->start) != 1) {
4150 +               ti->error = "dm-linear: Invalid device sector";
4151 +               goto bad;
4152 +       }
4153 +
4154 +       if (dm_get_device(ti, argv[0], lc->start, ti->len,
4155 +                         dm_table_get_mode(ti->table), &lc->dev)) {
4156 +               ti->error = "dm-linear: Device lookup failed";
4157 +               goto bad;
4158 +       }
4159 +
4160 +       ti->private = lc;
4161 +       return 0;
4162 +
4163 +      bad:
4164 +       kfree(lc);
4165 +       return -EINVAL;
4166 +}
4167 +
4168 +static void linear_dtr(struct dm_target *ti)
4169 +{
4170 +       struct linear_c *lc = (struct linear_c *) ti->private;
4171 +
4172 +       dm_put_device(ti, lc->dev);
4173 +       kfree(lc);
4174 +}
4175 +
4176 +static int linear_map(struct dm_target *ti, struct buffer_head *bh, int rw,
4177 +                     union map_info *map_context)
4178 +{
4179 +       struct linear_c *lc = (struct linear_c *) ti->private;
4180 +
4181 +       bh->b_rdev = lc->dev->dev;
4182 +       bh->b_rsector = lc->start + (bh->b_rsector - ti->begin);
4183 +
4184 +       return 1;
4185 +}
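A worked example of the remap arithmetic (device name illustrative):

	/* Table line "0 2048 linear /dev/hda1 512" gives ti->begin = 0,
	 * ti->len = 2048 and lc->start = 512, so a request for virtual
	 * sector 100 leaves linear_map() pointing bh at physical sector
	 * 512 + (100 - 0) = 612 of /dev/hda1. */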
4186 +
4187 +static int linear_status(struct dm_target *ti, status_type_t type,
4188 +                        char *result, unsigned int maxlen)
4189 +{
4190 +       struct linear_c *lc = (struct linear_c *) ti->private;
4191 +       kdev_t kdev;
4192 +
4193 +       switch (type) {
4194 +       case STATUSTYPE_INFO:
4195 +               result[0] = '\0';
4196 +               break;
4197 +
4198 +       case STATUSTYPE_TABLE:
4199 +               kdev = to_kdev_t(lc->dev->bdev->bd_dev);
4200 +               snprintf(result, maxlen, "%s " SECTOR_FORMAT,
4201 +                        dm_kdevname(kdev), lc->start);
4202 +               break;
4203 +       }
4204 +       return 0;
4205 +}
4206 +
4207 +static struct target_type linear_target = {
4208 +       .name   = "linear",
4209 +       .module = THIS_MODULE,
4210 +       .ctr    = linear_ctr,
4211 +       .dtr    = linear_dtr,
4212 +       .map    = linear_map,
4213 +       .status = linear_status,
4214 +};
4215 +
4216 +int __init dm_linear_init(void)
4217 +{
4218 +       int r = dm_register_target(&linear_target);
4219 +
4220 +       if (r < 0)
4221 +               DMERR("linear: register failed %d", r);
4222 +
4223 +       return r;
4224 +}
4225 +
4226 +void dm_linear_exit(void)
4227 +{
4228 +       int r = dm_unregister_target(&linear_target);
4229 +
4230 +       if (r < 0)
4231 +               DMERR("linear: unregister failed %d", r);
4232 +}
4233 diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.c linux-2.4.24/drivers/md/dm-snapshot.c
4234 --- linux-2.4.24.org/drivers/md/dm-snapshot.c   1970-01-01 01:00:00.000000000 +0100
4235 +++ linux-2.4.24/drivers/md/dm-snapshot.c       2004-01-18 15:01:29.247465850 +0100
4236 @@ -0,0 +1,1235 @@
4237 +/*
4238 + * dm-snapshot.c
4239 + *
4240 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
4241 + *
4242 + * This file is released under the GPL.
4243 + */
4244 +
4245 +#include <linux/config.h>
4246 +#include <linux/ctype.h>
4247 +#include <linux/module.h>
4248 +#include <linux/init.h>
4249 +#include <linux/slab.h>
4250 +#include <linux/list.h>
4251 +#include <linux/fs.h>
4252 +#include <linux/blkdev.h>
4253 +#include <linux/mempool.h>
4254 +#include <linux/device-mapper.h>
4255 +#include <linux/vmalloc.h>
4256 +
4257 +#include "dm-snapshot.h"
4258 +#include "kcopyd.h"
4259 +
4260 +/*
4261 + * FIXME: Remove this before release.
4262 + */
4263 +#if 0
4264 +#define DMDEBUG(x...) DMWARN(x)
4265 +#else
4266 +#define DMDEBUG(x...)
4267 +#endif
4268 +
4269 +/*
4270 + * The percentage increment we will wake up users at
4271 + */
4272 +#define WAKE_UP_PERCENT 5
4273 +
4274 +/*
4275 + * kcopyd priority of snapshot operations
4276 + */
4277 +#define SNAPSHOT_COPY_PRIORITY 2
4278 +
4279 +/*
4280 + * Each snapshot reserves this many pages for io
4281 + * FIXME: calculate this
4282 + */
4283 +#define SNAPSHOT_PAGES 256
4284 +
4285 +struct pending_exception {
4286 +       struct exception e;
4287 +
4288 +       /*
4289 +        * Origin buffers waiting for this to complete are held
4290 +        * in a list (using b_reqnext).
4291 +        */
4292 +       struct buffer_head *origin_bhs;
4293 +       struct buffer_head *snapshot_bhs;
4294 +
4295 +       /*
4296 +        * Other pending_exceptions that are processing this
4297 +        * chunk.  When this list is empty, we know we can
4298 +        * complete the origins.
4299 +        */
4300 +       struct list_head siblings;
4301 +
4302 +       /* Pointer back to snapshot context */
4303 +       struct dm_snapshot *snap;
4304 +
4305 +       /*
4306 +        * 1 indicates the exception has already been sent to
4307 +        * kcopyd.
4308 +        */
4309 +       int started;
4310 +};
4311 +
4312 +/*
4313 + * Hash table mapping origin volumes to lists of snapshots and
4314 + * a lock to protect it
4315 + */
4316 +static kmem_cache_t *exception_cache;
4317 +static kmem_cache_t *pending_cache;
4318 +static mempool_t *pending_pool;
4319 +
4320 +/*
4321 + * One of these per registered origin, held in the snapshot_origins hash
4322 + */
4323 +struct origin {
4324 +       /* The origin device */
4325 +       kdev_t dev;
4326 +
4327 +       struct list_head hash_list;
4328 +
4329 +       /* List of snapshots for this origin */
4330 +       struct list_head snapshots;
4331 +};
4332 +
4333 +/*
4334 + * Size of the hash table for origin volumes. If we make this
4335 + * the size of the minors list then it should be nearly perfect
4336 + */
4337 +#define ORIGIN_HASH_SIZE 256
4338 +#define ORIGIN_MASK      0xFF
4339 +static struct list_head *_origins;
4340 +static struct rw_semaphore _origins_lock;
4341 +
4342 +static int init_origin_hash(void)
4343 +{
4344 +       int i;
4345 +
4346 +       _origins = kmalloc(ORIGIN_HASH_SIZE * sizeof(struct list_head),
4347 +                          GFP_KERNEL);
4348 +       if (!_origins) {
4349 +               DMERR("Device mapper: Snapshot: unable to allocate memory");
4350 +               return -ENOMEM;
4351 +       }
4352 +
4353 +       for (i = 0; i < ORIGIN_HASH_SIZE; i++)
4354 +               INIT_LIST_HEAD(_origins + i);
4355 +       init_rwsem(&_origins_lock);
4356 +
4357 +       return 0;
4358 +}
4359 +
4360 +static void exit_origin_hash(void)
4361 +{
4362 +       kfree(_origins);
4363 +}
4364 +
4365 +static inline unsigned int origin_hash(kdev_t dev)
4366 +{
4367 +       return MINOR(dev) & ORIGIN_MASK;
4368 +}
4369 +
4370 +static struct origin *__lookup_origin(kdev_t origin)
4371 +{
4372 +       struct list_head *slist;
4373 +       struct list_head *ol;
4374 +       struct origin *o;
4375 +
4376 +       ol = &_origins[origin_hash(origin)];
4377 +       list_for_each(slist, ol) {
4378 +               o = list_entry(slist, struct origin, hash_list);
4379 +
4380 +               if (o->dev == origin)
4381 +                       return o;
4382 +       }
4383 +
4384 +       return NULL;
4385 +}
4386 +
4387 +static void __insert_origin(struct origin *o)
4388 +{
4389 +       struct list_head *sl = &_origins[origin_hash(o->dev)];
4390 +       list_add_tail(&o->hash_list, sl);
4391 +}
4392 +
4393 +/*
4394 + * Make a note of the snapshot and its origin so we can look it
4395 + * up when the origin has a write on it.
4396 + */
4397 +static int register_snapshot(struct dm_snapshot *snap)
4398 +{
4399 +       struct origin *o;
4400 +       kdev_t dev = snap->origin->dev;
4401 +
4402 +       down_write(&_origins_lock);
4403 +       o = __lookup_origin(dev);
4404 +
4405 +       if (!o) {
4406 +               /* New origin */
4407 +               o = kmalloc(sizeof(*o), GFP_KERNEL);
4408 +               if (!o) {
4409 +                       up_write(&_origins_lock);
4410 +                       return -ENOMEM;
4411 +               }
4412 +
4413 +               /* Initialise the struct */
4414 +               INIT_LIST_HEAD(&o->snapshots);
4415 +               o->dev = dev;
4416 +
4417 +               __insert_origin(o);
4418 +       }
4419 +
4420 +       list_add_tail(&snap->list, &o->snapshots);
4421 +
4422 +       up_write(&_origins_lock);
4423 +       return 0;
4424 +}
4425 +
4426 +static void unregister_snapshot(struct dm_snapshot *s)
4427 +{
4428 +       struct origin *o;
4429 +
4430 +       down_write(&_origins_lock);
4431 +       o = __lookup_origin(s->origin->dev);
4432 +
4433 +       list_del(&s->list);
4434 +       if (list_empty(&o->snapshots)) {
4435 +               list_del(&o->hash_list);
4436 +               kfree(o);
4437 +       }
4438 +
4439 +       up_write(&_origins_lock);
4440 +}
4441 +
4442 +/*
4443 + * Implementation of the exception hash tables.
4444 + */
4445 +static int init_exception_table(struct exception_table *et, uint32_t size)
4446 +{
4447 +       unsigned int i;
4448 +
4449 +       et->hash_mask = size - 1;
4450 +       et->table = dm_vcalloc(size, sizeof(struct list_head));
4451 +       if (!et->table)
4452 +               return -ENOMEM;
4453 +
4454 +       for (i = 0; i < size; i++)
4455 +               INIT_LIST_HEAD(et->table + i);
4456 +
4457 +       return 0;
4458 +}
4459 +
4460 +static void exit_exception_table(struct exception_table *et, kmem_cache_t *mem)
4461 +{
4462 +       struct list_head *slot, *entry, *temp;
4463 +       struct exception *ex;
4464 +       int i, size;
4465 +
4466 +       size = et->hash_mask + 1;
4467 +       for (i = 0; i < size; i++) {
4468 +               slot = et->table + i;
4469 +
4470 +               list_for_each_safe(entry, temp, slot) {
4471 +                       ex = list_entry(entry, struct exception, hash_list);
4472 +                       kmem_cache_free(mem, ex);
4473 +               }
4474 +       }
4475 +
4476 +       vfree(et->table);
4477 +}
4478 +
4479 +/*
4480 + * FIXME: check how this hash fn is performing.
4481 + */
4482 +static inline uint32_t exception_hash(struct exception_table *et, chunk_t chunk)
4483 +{
4484 +       return chunk & et->hash_mask;
4485 +}
4486 +
4487 +static void insert_exception(struct exception_table *eh, struct exception *e)
4488 +{
4489 +       struct list_head *l = &eh->table[exception_hash(eh, e->old_chunk)];
4490 +       list_add(&e->hash_list, l);
4491 +}
4492 +
4493 +static inline void remove_exception(struct exception *e)
4494 +{
4495 +       list_del(&e->hash_list);
4496 +}
4497 +
4498 +/*
4499 + * Return the exception data for a chunk, or NULL if not
4500 + * remapped.
4501 + */
4502 +static struct exception *lookup_exception(struct exception_table *et,
4503 +                                         chunk_t chunk)
4504 +{
4505 +       struct list_head *slot, *el;
4506 +       struct exception *e;
4507 +
4508 +       slot = &et->table[exception_hash(et, chunk)];
4509 +       list_for_each(el, slot) {
4510 +               e = list_entry(el, struct exception, hash_list);
4511 +               if (e->old_chunk == chunk)
4512 +                       return e;
4513 +       }
4514 +
4515 +       return NULL;
4516 +}
4517 +
4518 +static inline struct exception *alloc_exception(void)
4519 +{
4520 +       struct exception *e;
4521 +
4522 +       e = kmem_cache_alloc(exception_cache, GFP_NOIO);
4523 +       if (!e)
4524 +               e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
4525 +
4526 +       return e;
4527 +}
4528 +
4529 +static inline void free_exception(struct exception *e)
4530 +{
4531 +       kmem_cache_free(exception_cache, e);
4532 +}
4533 +
4534 +static inline struct pending_exception *alloc_pending_exception(void)
4535 +{
4536 +       return mempool_alloc(pending_pool, GFP_NOIO);
4537 +}
4538 +
4539 +static inline void free_pending_exception(struct pending_exception *pe)
4540 +{
4541 +       mempool_free(pe, pending_pool);
4542 +}
4543 +
4544 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new)
4545 +{
4546 +       struct exception *e;
4547 +
4548 +       e = alloc_exception();
4549 +       if (!e)
4550 +               return -ENOMEM;
4551 +
4552 +       e->old_chunk = old;
4553 +       e->new_chunk = new;
4554 +       insert_exception(&s->complete, e);
4555 +       return 0;
4556 +}
4557 +
4558 +/*
4559 + * Hard coded magic.
4560 + */
4561 +static int calc_max_buckets(void)
4562 +{
4563 +       unsigned long mem;
4564 +
4565 +       mem = num_physpages << PAGE_SHIFT;
4566 +       mem /= 50;
4567 +       mem /= sizeof(struct list_head);
4568 +
4569 +       return mem;
4570 +}
4571 +
4572 +/*
4573 + * Rounds a number down to a power of 2.
4574 + */
4575 +static inline uint32_t round_down(uint32_t n)
4576 +{
4577 +       while (n & (n - 1))
4578 +               n &= (n - 1);
4579 +       return n;
4580 +}
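A worked example of the clear-lowest-bit loop:

	/* round_down(0x15): 0x15 & 0x14 = 0x14, then 0x14 & 0x13 = 0x10,
	 * then 0x10 & 0x0f = 0 ends the loop, so the result is 0x10 --
	 * the largest power of 2 not exceeding 0x15. */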
4581 +
4582 +/*
4583 + * Allocate room for a suitable hash table.
4584 + */
4585 +static int init_hash_tables(struct dm_snapshot *s)
4586 +{
4587 +       sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
4588 +
4589 +       /*
4590 +        * Calculate based on the size of the original volume or
4591 +        * the COW volume...
4592 +        */
4593 +       cow_dev_size = get_dev_size(s->cow->dev);
4594 +       origin_dev_size = get_dev_size(s->origin->dev);
4595 +       max_buckets = calc_max_buckets();
4596 +
4597 +       hash_size = min(origin_dev_size, cow_dev_size) / s->chunk_size;
4598 +       hash_size = min(hash_size, max_buckets);
4599 +
4600 +       /* Round it down to a power of 2 */
4601 +       hash_size = round_down(hash_size);
4602 +       if (init_exception_table(&s->complete, hash_size))
4603 +               return -ENOMEM;
4604 +
4605 +       /*
4606 +        * Allocate hash table for in-flight exceptions
4607 +        * Make this smaller than the real hash table
4608 +        */
4609 +       hash_size >>= 3;
4610 +       if (!hash_size)
4611 +               hash_size = 64;
4612 +
4613 +       if (init_exception_table(&s->pending, hash_size)) {
4614 +               exit_exception_table(&s->complete, exception_cache);
4615 +               return -ENOMEM;
4616 +       }
4617 +
4618 +       return 0;
4619 +}
4620 +
4621 +/*
4622 + * Round a number up to the nearest 'size' boundary.  size must
4623 + * be a power of 2.
4624 + */
4625 +static inline ulong round_up(ulong n, ulong size)
4626 +{
4627 +       size--;
4628 +       return (n + size) & ~size;
4629 +}
4630 +
4631 +/*
4632 + * Construct a snapshot mapping: <origin_dev> <COW-dev> <p/n> <chunk-size>
4633 + */
4634 +static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
4635 +{
4636 +       struct dm_snapshot *s;
4637 +       unsigned long chunk_size;
4638 +       int r = -EINVAL;
4639 +       char persistent;
4640 +       char *origin_path;
4641 +       char *cow_path;
4642 +       char *value;
4643 +       int blocksize;
4644 +
4645 +       if (argc != 4) {
4646 +               ti->error = "dm-snapshot: requires exactly 4 arguments";
4647 +               r = -EINVAL;
4648 +               goto bad1;
4649 +       }
4650 +
4651 +       origin_path = argv[0];
4652 +       cow_path = argv[1];
4653 +       persistent = toupper(*argv[2]);
4654 +
4655 +       if (persistent != 'P' && persistent != 'N') {
4656 +               ti->error = "Persistent flag is not P or N";
4657 +               r = -EINVAL;
4658 +               goto bad1;
4659 +       }
4660 +
4661 +       chunk_size = simple_strtoul(argv[3], &value, 10);
4662 +       if (chunk_size == 0 || value == NULL) {
4663 +               ti->error = "Invalid chunk size";
4664 +               r = -EINVAL;
4665 +               goto bad1;
4666 +       }
4667 +
4668 +       s = kmalloc(sizeof(*s), GFP_KERNEL);
4669 +       if (s == NULL) {
4670 +               ti->error = "Cannot allocate snapshot context private "
4671 +                   "structure";
4672 +               r = -ENOMEM;
4673 +               goto bad1;
4674 +       }
4675 +
4676 +       r = dm_get_device(ti, origin_path, 0, ti->len, FMODE_READ, &s->origin);
4677 +       if (r) {
4678 +               ti->error = "Cannot get origin device";
4679 +               goto bad2;
4680 +       }
4681 +
4682 +       /* FIXME: get cow length */
4683 +       r = dm_get_device(ti, cow_path, 0, 0,
4684 +                         FMODE_READ | FMODE_WRITE, &s->cow);
4685 +       if (r) {
4686 +               dm_put_device(ti, s->origin);
4687 +               ti->error = "Cannot get COW device";
4688 +               goto bad2;
4689 +       }
4690 +
4691 +       /*
4692 +        * Chunk size must be a multiple of page size.  Silently
4693 +        * round up if it's not.
4694 +        */
4695 +       chunk_size = round_up(chunk_size, PAGE_SIZE / SECTOR_SIZE);
4696 +
4697 +       /* Validate the chunk size against the device block size */
4698 +       blocksize = get_hardsect_size(s->cow->dev);
4699 +       if (chunk_size % (blocksize / SECTOR_SIZE)) {
4700 +               ti->error = "Chunk size is not a multiple of device blocksize";
4701 +               r = -EINVAL;
4702 +               goto bad3;
4703 +       }
4704 +
4705 +       /* Check the sizes are small enough to fit in one kiovec */
4706 +       if (chunk_size > KIO_MAX_SECTORS) {
4707 +               ti->error = "Chunk size is too big";
4708 +               r = -EINVAL;
4709 +               goto bad3;
4710 +       }
4711 +
4712 +       /* Check chunk_size is a power of 2 */
4713 +       if (chunk_size & (chunk_size - 1)) {
4714 +               ti->error = "Chunk size is not a power of 2";
4715 +               r = -EINVAL;
4716 +               goto bad3;
4717 +       }
4718 +
4719 +       s->chunk_size = chunk_size;
4720 +       s->chunk_mask = chunk_size - 1;
4721 +       s->type = persistent;
4722 +       for (s->chunk_shift = 0; chunk_size;
4723 +            s->chunk_shift++, chunk_size >>= 1)
4724 +               ;
4725 +       s->chunk_shift--;
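	/* Worked example: chunk_size = 16 sectors (8KiB) takes five
	 * shifts to reach zero, leaving chunk_shift = 4 to pair with
	 * chunk_mask = 0xf above, so chunk/sector conversions reduce
	 * to shifts and masks instead of divisions. */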
4726 +
4727 +       s->valid = 1;
4728 +       s->have_metadata = 0;
4729 +       s->last_percent = 0;
4730 +       init_rwsem(&s->lock);
4731 +       s->table = ti->table;
4732 +
4733 +       /* Allocate hash table for COW data */
4734 +       if (init_hash_tables(s)) {
4735 +               ti->error = "Unable to allocate hash table space";
4736 +               r = -ENOMEM;
4737 +               goto bad3;
4738 +       }
4739 +
4740 +       /*
4741 +        * Check the persistent flag - done here because we need the iobuf
4742 +        * to check the LV header
4743 +        */
4744 +       s->store.snap = s;
4745 +
4746 +       if (persistent == 'P')
4747 +               r = dm_create_persistent(&s->store, s->chunk_size);
4748 +       else
4749 +               r = dm_create_transient(&s->store, s, blocksize);
4750 +
4751 +       if (r) {
4752 +               ti->error = "Couldn't create exception store";
4753 +               r = -EINVAL;
4754 +               goto bad4;
4755 +       }
4756 +
4757 +       r = kcopyd_client_create(SNAPSHOT_PAGES, &s->kcopyd_client);
4758 +       if (r) {
4759 +               ti->error = "Could not create kcopyd client";
4760 +               goto bad5;
4761 +       }
4762 +
4763 +       /* Flush IO to the origin device */
4764 +       fsync_dev(s->origin->dev);
4765 +
4766 +       /* Add snapshot to the list of snapshots for this origin */
4767 +       if (register_snapshot(s)) {
4768 +               r = -EINVAL;
4769 +               ti->error = "Cannot register snapshot origin";
4770 +               goto bad6;
4771 +       }
4772 +
4773 +       ti->private = s;
4774 +       return 0;
4775 +
4776 + bad6:
4777 +       kcopyd_client_destroy(s->kcopyd_client);
4778 +
4779 + bad5:
4780 +       s->store.destroy(&s->store);
4781 +
4782 + bad4:
4783 +       exit_exception_table(&s->pending, pending_cache);
4784 +       exit_exception_table(&s->complete, exception_cache);
4785 +
4786 + bad3:
4787 +       dm_put_device(ti, s->cow);
4788 +       dm_put_device(ti, s->origin);
4789 +
4790 + bad2:
4791 +       kfree(s);
4792 +
4793 + bad1:
4794 +       return r;
4795 +}
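An illustrative table line for this constructor (names hypothetical; sizes in 512-byte sectors):

	/*   0 409600 snapshot /dev/vg/lv /dev/vg/lv-cow P 16
	 *
	 * snapshots the first 200MB of /dev/vg/lv into /dev/vg/lv-cow
	 * with a persistent store, copying out 16-sector (8KiB) chunks
	 * on first write. */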
4796 +
4797 +static void snapshot_dtr(struct dm_target *ti)
4798 +{
4799 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
4800 +
4801 +       dm_table_event(ti->table);
4802 +
4803 +       unregister_snapshot(s);
4804 +
4805 +       exit_exception_table(&s->pending, pending_cache);
4806 +       exit_exception_table(&s->complete, exception_cache);
4807 +
4808 +       /* Deallocate memory used */
4809 +       s->store.destroy(&s->store);
4810 +
4811 +       dm_put_device(ti, s->origin);
4812 +       dm_put_device(ti, s->cow);
4813 +       kcopyd_client_destroy(s->kcopyd_client);
4814 +       kfree(s);
4815 +}
4816 +
4817 +/*
4818 + * We hold lists of buffer_heads, using the b_reqnext field.
4819 + */
4820 +static void queue_buffer(struct buffer_head **queue, struct buffer_head *bh)
4821 +{
4822 +       bh->b_reqnext = *queue;
4823 +       *queue = bh;
4824 +}
4825 +
4826 +/*
4827 + * FIXME: inefficient.
4828 + */
4829 +static void queue_buffers(struct buffer_head **queue, struct buffer_head *bhs)
4830 +{
4831 +       while (*queue)
4832 +               queue = &((*queue)->b_reqnext);
4833 +
4834 +       *queue = bhs;
4835 +}
4836 +
4837 +/*
4838 + * Flush a list of buffers.
4839 + */
4840 +static void flush_buffers(struct buffer_head *bh)
4841 +{
4842 +       struct buffer_head *n;
4843 +
4844 +       DMDEBUG("begin flush");
4845 +       while (bh) {
4846 +               n = bh->b_reqnext;
4847 +               bh->b_reqnext = NULL;
4848 +               DMDEBUG("flushing %p", bh);
4849 +               generic_make_request(WRITE, bh);
4850 +               bh = n;
4851 +       }
4852 +
4853 +       run_task_queue(&tq_disk);
4854 +}
4855 +
4856 +/*
4857 + * Error a list of buffers.
4858 + */
4859 +static void error_buffers(struct buffer_head *bh)
4860 +{
4861 +       struct buffer_head *n;
4862 +
4863 +       while (bh) {
4864 +               n = bh->b_reqnext;
4865 +               bh->b_reqnext = NULL;
4866 +               buffer_IO_error(bh);
4867 +               bh = n;
4868 +       }
4869 +}
4870 +
4871 +static struct buffer_head *__flush_bhs(struct pending_exception *pe)
4872 +{
4873 +       struct pending_exception *sibling;
4874 +
4875 +       if (list_empty(&pe->siblings))
4876 +               return pe->origin_bhs;
4877 +
4878 +       sibling = list_entry(pe->siblings.next,
4879 +                            struct pending_exception, siblings);
4880 +
4881 +       list_del(&pe->siblings);
4882 +
4883 +       /* FIXME: I think there's a race on SMP machines here, add spin lock */
4884 +       queue_buffers(&sibling->origin_bhs, pe->origin_bhs);
4885 +
4886 +       return NULL;
4887 +}
4888 +
4889 +static void pending_complete(struct pending_exception *pe, int success)
4890 +{
4891 +       struct exception *e;
4892 +       struct dm_snapshot *s = pe->snap;
4893 +       struct buffer_head *flush = NULL;
4894 +
4895 +       if (success) {
4896 +               e = alloc_exception();
4897 +               if (!e) {
4898 +                       DMWARN("Unable to allocate exception.");
4899 +                       down_write(&s->lock);
4900 +                       s->store.drop_snapshot(&s->store);
4901 +                       s->valid = 0;
4902 +                       flush = __flush_bhs(pe);
4903 +                       up_write(&s->lock);
4904 +
4905 +                       error_buffers(pe->snapshot_bhs);
4906 +                       goto out;
4907 +               }
4908 +
4909 +               /*
4910 +                * Add a proper exception, and remove the
4911 +                * in-flight exception from the list.
4912 +                */
4913 +               down_write(&s->lock);
4914 +
4915 +               memcpy(e, &pe->e, sizeof(*e));
4916 +               insert_exception(&s->complete, e);
4917 +               remove_exception(&pe->e);
4918 +               flush = __flush_bhs(pe);
4919 +
4920 +               /* Submit any pending write BHs */
4921 +               up_write(&s->lock);
4922 +
4923 +               flush_buffers(pe->snapshot_bhs);
4924 +               DMDEBUG("Exception completed successfully.");
4925 +
4926 +               /* Notify any interested parties */
4927 +               if (s->store.fraction_full) {
4928 +                       sector_t numerator, denominator;
4929 +                       int pc;
4930 +
4931 +                       s->store.fraction_full(&s->store, &numerator,
4932 +                                              &denominator);
4933 +                       pc = numerator * 100 / denominator;
4934 +
4935 +                       if (pc >= s->last_percent + WAKE_UP_PERCENT) {
4936 +                               dm_table_event(s->table);
4937 +                               s->last_percent = pc - pc % WAKE_UP_PERCENT;
4938 +                       }
4939 +               }
4940 +
4941 +       } else {
4942 +               /* Read/write error - snapshot is unusable */
4943 +               down_write(&s->lock);
4944 +               if (s->valid)
4945 +                       DMERR("Error reading/writing snapshot");
4946 +               s->store.drop_snapshot(&s->store);
4947 +               s->valid = 0;
4948 +               remove_exception(&pe->e);
4949 +               flush = __flush_bhs(pe);
4950 +               up_write(&s->lock);
4951 +
4952 +               error_buffers(pe->snapshot_bhs);
4953 +
4954 +               dm_table_event(s->table);
4955 +               DMDEBUG("Exception failed.");
4956 +       }
4957 +
4958 + out:
4959 +       if (flush)
4960 +               flush_buffers(flush);
4961 +
4962 +       free_pending_exception(pe);
4963 +}
4964 +
4965 +static void commit_callback(void *context, int success)
4966 +{
4967 +       struct pending_exception *pe = (struct pending_exception *) context;
4968 +       pending_complete(pe, success);
4969 +}
4970 +
4971 +/*
4972 + * Called when the copy I/O has finished.  kcopyd actually runs
4973 + * this code so don't block.
4974 + */
4975 +static void copy_callback(int read_err, unsigned int write_err, void *context)
4976 +{
4977 +       struct pending_exception *pe = (struct pending_exception *) context;
4978 +       struct dm_snapshot *s = pe->snap;
4979 +
4980 +       if (read_err || write_err)
4981 +               pending_complete(pe, 0);
4982 +
4983 +       else
4984 +               /* Update the metadata if we are persistent */
4985 +               s->store.commit_exception(&s->store, &pe->e, commit_callback,
4986 +                                         pe);
4987 +}
4988 +
4989 +/*
4990 + * Dispatches the copy operation to kcopyd.
4991 + */
4992 +static inline void start_copy(struct pending_exception *pe)
4993 +{
4994 +       struct dm_snapshot *s = pe->snap;
4995 +       struct io_region src, dest;
4996 +       kdev_t dev = s->origin->dev;
4997 +       int *sizes = blk_size[major(dev)];
4998 +       sector_t dev_size = (sector_t) -1;
4999 +
5000 +       if (pe->started)
5001 +               return;
5002 +
5003 +       /* this is protected by snap->lock */
5004 +       pe->started = 1;
5005 +
5006 +       if (sizes && sizes[minor(dev)])
5007 +               dev_size = sizes[minor(dev)] << 1;
5008 +
5009 +       src.dev = dev;
5010 +       src.sector = chunk_to_sector(s, pe->e.old_chunk);
5011 +       src.count = min(s->chunk_size, dev_size - src.sector);
5012 +
5013 +       dest.dev = s->cow->dev;
5014 +       dest.sector = chunk_to_sector(s, pe->e.new_chunk);
5015 +       dest.count = src.count;
5016 +
5017 +       /* Hand over to kcopyd */
5018 +       kcopyd_copy(s->kcopyd_client,
5019 +                   &src, 1, &dest, 0, copy_callback, pe);
5020 +}
5021 +
5022 +/*
5023 + * Looks to see if this snapshot already has a pending exception
5024 + * for this chunk, otherwise it allocates a new one and inserts
5025 + * it into the pending table.
5026 + */
5027 +static struct pending_exception *find_pending_exception(struct dm_snapshot *s,
5028 +                                                       struct buffer_head *bh)
5029 +{
5030 +       struct exception *e;
5031 +       struct pending_exception *pe;
5032 +       chunk_t chunk = sector_to_chunk(s, bh->b_rsector);
5033 +
5034 +       /*
5035 +        * Is there a pending exception for this already ?
5036 +        */
5037 +       e = lookup_exception(&s->pending, chunk);
5038 +       if (e) {
5039 +               /* cast the exception to a pending exception */
5040 +               pe = list_entry(e, struct pending_exception, e);
5041 +
5042 +       } else {
5043 +               /* Create a new pending exception */
5044 +               pe = alloc_pending_exception();
5045 +               pe->e.old_chunk = chunk;
5046 +               pe->origin_bhs = pe->snapshot_bhs = NULL;
5047 +               INIT_LIST_HEAD(&pe->siblings);
5048 +               pe->snap = s;
5049 +               pe->started = 0;
5050 +
5051 +               if (s->store.prepare_exception(&s->store, &pe->e)) {
5052 +                       free_pending_exception(pe);
5053 +                       s->valid = 0;
5054 +                       return NULL;
5055 +               }
5056 +
5057 +               insert_exception(&s->pending, &pe->e);
5058 +       }
5059 +
5060 +       return pe;
5061 +}
5062 +
5063 +static inline void remap_exception(struct dm_snapshot *s, struct exception *e,
5064 +                                  struct buffer_head *bh)
5065 +{
5066 +       bh->b_rdev = s->cow->dev;
5067 +       bh->b_rsector = chunk_to_sector(s, e->new_chunk) +
5068 +           (bh->b_rsector & s->chunk_mask);
5069 +}
5070 +
5071 +static int snapshot_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5072 +                       union map_info *map_context)
5073 +{
5074 +       struct exception *e;
5075 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5076 +       int r = 1;
5077 +       chunk_t chunk;
5078 +       struct pending_exception *pe;
5079 +
5080 +       chunk = sector_to_chunk(s, bh->b_rsector);
5081 +
5082 +       /* Full snapshots are not usable */
5083 +       if (!s->valid)
5084 +               return -1;
5085 +
5086 +       /*
5087 +        * Write to snapshot - higher level takes care of RW/RO
5088 +        * flags so we should only get this if we are
5089 +        * writeable.
5090 +        */
5091 +       if (rw == WRITE) {
5092 +
5093 +               down_write(&s->lock);
5094 +
5095 +               /* If the block is already remapped - use that, else remap it */
5096 +               e = lookup_exception(&s->complete, chunk);
5097 +               if (e)
5098 +                       remap_exception(s, e, bh);
5099 +
5100 +               else {
5101 +                       pe = find_pending_exception(s, bh);
5102 +
5103 +                       if (!pe) {
5104 +                               s->store.drop_snapshot(&s->store);
5105 +                               s->valid = 0;
5106 +                               r = -EIO;
5107 +                       } else {
5108 +                               remap_exception(s, &pe->e, bh);
5109 +                               queue_buffer(&pe->snapshot_bhs, bh);
5110 +                               start_copy(pe);
5111 +                               r = 0;
5112 +                       }
5113 +               }
5114 +
5115 +               up_write(&s->lock);
5116 +
5117 +       } else {
5118 +               /*
5119 +                * FIXME: this read path scares me because we
5120 +                * always use the origin when we have a pending
5121 +                * exception.  However I can't think of a
5122 +                * situation where this is wrong - ejt.
5123 +                */
5124 +
5125 +               /* Do reads */
5126 +               down_read(&s->lock);
5127 +
5128 +               /* See if it has been remapped */
5129 +               e = lookup_exception(&s->complete, chunk);
5130 +               if (e)
5131 +                       remap_exception(s, e, bh);
5132 +               else
5133 +                       bh->b_rdev = s->origin->dev;
5134 +
5135 +               up_read(&s->lock);
5136 +       }
5137 +
5138 +       return r;
5139 +}
5140 +
5141 +void snapshot_resume(struct dm_target *ti)
5142 +{
5143 +       struct dm_snapshot *s = (struct dm_snapshot *) ti->private;
5144 +
5145 +       if (s->have_metadata)
5146 +               return;
5147 +
5148 +       if (s->store.read_metadata(&s->store)) {
5149 +               down_write(&s->lock);
5150 +               s->valid = 0;
5151 +               up_write(&s->lock);
5152 +       }
5153 +
5154 +       s->have_metadata = 1;
5155 +}
5156 +
5157 +static int snapshot_status(struct dm_target *ti, status_type_t type,
5158 +                          char *result, unsigned int maxlen)
5159 +{
5160 +       struct dm_snapshot *snap = (struct dm_snapshot *) ti->private;
5161 +       char cow[16];
5162 +       char org[16];
5163 +
5164 +       switch (type) {
5165 +       case STATUSTYPE_INFO:
5166 +               if (!snap->valid)
5167 +                       snprintf(result, maxlen, "Invalid");
5168 +               else {
5169 +                       if (snap->store.fraction_full) {
5170 +                               sector_t numerator, denominator;
5171 +                               snap->store.fraction_full(&snap->store,
5172 +                                                         &numerator,
5173 +                                                         &denominator);
5174 +                               snprintf(result, maxlen,
5175 +                                        SECTOR_FORMAT "/" SECTOR_FORMAT,
5176 +                                        numerator, denominator);
5177 +                       }
5178 +                       else
5179 +                               snprintf(result, maxlen, "Unknown");
5180 +               }
5181 +               break;
5182 +
5183 +       case STATUSTYPE_TABLE:
5184 +               /*
5185 +                * kdevname returns a static pointer so we need
5186 +                * to make private copies if the output is to
5187 +                * make sense.
5188 +                */
5189 +               strncpy(cow, dm_kdevname(snap->cow->dev), sizeof(cow));
5190 +               strncpy(org, dm_kdevname(snap->origin->dev), sizeof(org));
5191 +               snprintf(result, maxlen, "%s %s %c %ld", org, cow,
5192 +                        snap->type, snap->chunk_size);
5193 +               break;
5194 +       }
5195 +
5196 +       return 0;
5197 +}
5198 +
5199 +/*-----------------------------------------------------------------
5200 + * Origin methods
5201 + *---------------------------------------------------------------*/
5202 +static void list_merge(struct list_head *l1, struct list_head *l2)
5203 +{
5204 +       struct list_head *l1_n, *l2_p;
5205 +
5206 +       l1_n = l1->next;
5207 +       l2_p = l2->prev;
5208 +
5209 +       l1->next = l2;
5210 +       l2->prev = l1;
5211 +
5212 +       l2_p->next = l1_n;
5213 +       l1_n->prev = l2_p;
5214 +}
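What list_merge() does, traced on two small rings:

	/* Given circular lists (a1 a2) and (b1 b2), list_merge(a1, b1)
	 * splices the second ring in directly after a1, giving the
	 * single ring (a1 b1 b2 a2).  __origin_write() below uses this
	 * to chain the pending exceptions of every snapshot hit by one
	 * origin write. */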
5215 +
5216 +static int __origin_write(struct list_head *snapshots, struct buffer_head *bh)
5217 +{
5218 +       int r = 1, first = 1;
5219 +       struct list_head *sl;
5220 +       struct dm_snapshot *snap;
5221 +       struct exception *e;
5222 +       struct pending_exception *pe, *last = NULL;
5223 +       chunk_t chunk;
5224 +
5225 +       /* Do all the snapshots on this origin */
5226 +       list_for_each(sl, snapshots) {
5227 +               snap = list_entry(sl, struct dm_snapshot, list);
5228 +
5229 +               /* Only deal with valid snapshots */
5230 +               if (!snap->valid)
5231 +                       continue;
5232 +
5233 +               down_write(&snap->lock);
5234 +
5235 +               /*
5236 +                * Remember, different snapshots can have
5237 +                * different chunk sizes.
5238 +                */
5239 +               chunk = sector_to_chunk(snap, bh->b_rsector);
5240 +
5241 +               /*
5242 +                * Check exception table to see if block
5243 +                * is already remapped in this snapshot
5244 +                * and trigger an exception if not.
5245 +                */
5246 +               e = lookup_exception(&snap->complete, chunk);
5247 +               if (!e) {
5248 +                       pe = find_pending_exception(snap, bh);
5249 +                       if (!pe) {
5250 +                               snap->store.drop_snapshot(&snap->store);
5251 +                               snap->valid = 0;
5252 +
5253 +                       } else {
5254 +                               if (last)
5255 +                                       list_merge(&pe->siblings,
5256 +                                                  &last->siblings);
5257 +
5258 +                               last = pe;
5259 +                               r = 0;
5260 +                       }
5261 +               }
5262 +
5263 +               up_write(&snap->lock);
5264 +       }
5265 +
5266 +       /*
5267 +        * Now that we have a complete pe list we can start the copying.
5268 +        */
5269 +       if (last) {
5270 +               pe = last;
5271 +               do {
5272 +                       down_write(&pe->snap->lock);
5273 +                       if (first)
5274 +                               queue_buffer(&pe->origin_bhs, bh);
5275 +                       start_copy(pe);
5276 +                       up_write(&pe->snap->lock);
5277 +                       first = 0;
5278 +                       pe = list_entry(pe->siblings.next,
5279 +                                       struct pending_exception, siblings);
5280 +
5281 +               } while (pe != last);
5282 +       }
5283 +
5284 +       return r;
5285 +}
5286 +
5287 +/*
5288 + * Called on a write from the origin driver.
5289 + */
5290 +int do_origin(struct dm_dev *origin, struct buffer_head *bh)
5291 +{
5292 +       struct origin *o;
5293 +       int r;
5294 +
5295 +       down_read(&_origins_lock);
5296 +       o = __lookup_origin(origin->dev);
5297 +       if (!o)
5298 +               BUG();
5299 +
5300 +       r = __origin_write(&o->snapshots, bh);
5301 +       up_read(&_origins_lock);
5302 +
5303 +       return r;
5304 +}
5305 +
5306 +/*
5307 + * Origin: maps a linear range of a device, with hooks for snapshotting.
5308 + */
5309 +
5310 +/*
5311 + * Construct an origin mapping: <dev_path>
5312 + * The context for an origin is merely a 'struct dm_dev *'
5313 + * pointing to the real device.
5314 + */
5315 +static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5316 +{
5317 +       int r;
5318 +       struct dm_dev *dev;
5319 +
5320 +       if (argc != 1) {
5321 +               ti->error = "dm-origin: incorrect number of arguments";
5322 +               return -EINVAL;
5323 +       }
5324 +
5325 +       r = dm_get_device(ti, argv[0], 0, ti->len,
5326 +                         dm_table_get_mode(ti->table), &dev);
5327 +       if (r) {
5328 +               ti->error = "Cannot get target device";
5329 +               return r;
5330 +       }
5331 +
5332 +       ti->private = dev;
5333 +       return 0;
5334 +}
5335 +
5336 +static void origin_dtr(struct dm_target *ti)
5337 +{
5338 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5339 +       dm_put_device(ti, dev);
5340 +}
5341 +
5342 +static int origin_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5343 +                     union map_info *map_context)
5344 +{
5345 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5346 +       bh->b_rdev = dev->dev;
5347 +
5348 +       /* Only tell snapshots if this is a write */
5349 +       return (rw == WRITE) ? do_origin(dev, bh) : 1;
5350 +}
5351 +
5352 +static int origin_status(struct dm_target *ti, status_type_t type, char *result,
5353 +                        unsigned int maxlen)
5354 +{
5355 +       struct dm_dev *dev = (struct dm_dev *) ti->private;
5356 +
5357 +       switch (type) {
5358 +       case STATUSTYPE_INFO:
5359 +               result[0] = '\0';
5360 +               break;
5361 +
5362 +       case STATUSTYPE_TABLE:
5363 +               snprintf(result, maxlen, "%s", dm_kdevname(dev->dev));
5364 +               break;
5365 +       }
5366 +
5367 +       return 0;
5368 +}
5369 +
5370 +static struct target_type origin_target = {
5371 +       name:   "snapshot-origin",
5372 +       module: THIS_MODULE,
5373 +       ctr:    origin_ctr,
5374 +       dtr:    origin_dtr,
5375 +       map:    origin_map,
5376 +       status: origin_status,
5377 +};
5378 +
5379 +static struct target_type snapshot_target = {
5380 +       name:   "snapshot",
5381 +       module: THIS_MODULE,
5382 +       ctr:    snapshot_ctr,
5383 +       dtr:    snapshot_dtr,
5384 +       map:    snapshot_map,
5385 +       resume: snapshot_resume,
5386 +       status: snapshot_status,
5387 +};
5388 +
5389 +int __init dm_snapshot_init(void)
5390 +{
5391 +       int r;
5392 +
5393 +       r = dm_register_target(&snapshot_target);
5394 +       if (r) {
5395 +               DMERR("snapshot target register failed %d", r);
5396 +               return r;
5397 +       }
5398 +
5399 +       r = dm_register_target(&origin_target);
5400 +       if (r) {
5401 +               DMERR("origin target register failed %d", r);
5402 +               goto bad1;
5403 +       }
5404 +
5405 +       r = init_origin_hash();
5406 +       if (r) {
5407 +               DMERR("init_origin_hash failed.");
5408 +               goto bad2;
5409 +       }
5410 +
5411 +       exception_cache = kmem_cache_create("dm-snapshot-ex",
5412 +                                           sizeof(struct exception),
5413 +                                           __alignof__(struct exception),
5414 +                                           0, NULL, NULL);
5415 +       if (!exception_cache) {
5416 +               DMERR("Couldn't create exception cache.");
5417 +               r = -ENOMEM;
5418 +               goto bad3;
5419 +       }
5420 +
5421 +       pending_cache =
5422 +           kmem_cache_create("dm-snapshot-in",
5423 +                             sizeof(struct pending_exception),
5424 +                             __alignof__(struct pending_exception),
5425 +                             0, NULL, NULL);
5426 +       if (!pending_cache) {
5427 +               DMERR("Couldn't create pending cache.");
5428 +               r = -ENOMEM;
5429 +               goto bad4;
5430 +       }
5431 +
5432 +       pending_pool = mempool_create(128, mempool_alloc_slab,
5433 +                                     mempool_free_slab, pending_cache);
5434 +       if (!pending_pool) {
5435 +               DMERR("Couldn't create pending pool.");
5436 +               r = -ENOMEM;
5437 +               goto bad5;
5438 +       }
5439 +
5440 +       return 0;
5441 +
5442 +      bad5:
5443 +       kmem_cache_destroy(pending_cache);
5444 +      bad4:
5445 +       kmem_cache_destroy(exception_cache);
5446 +      bad3:
5447 +       exit_origin_hash();
5448 +      bad2:
5449 +       dm_unregister_target(&origin_target);
5450 +      bad1:
5451 +       dm_unregister_target(&snapshot_target);
5452 +       return r;
5453 +}
5454 +
5455 +void dm_snapshot_exit(void)
5456 +{
5457 +       int r;
5458 +
5459 +       r = dm_unregister_target(&snapshot_target);
5460 +       if (r)
5461 +               DMERR("snapshot unregister failed %d", r);
5462 +
5463 +       r = dm_unregister_target(&origin_target);
5464 +       if (r)
5465 +               DMERR("origin unregister failed %d", r);
5466 +
5467 +       exit_origin_hash();
5468 +       mempool_destroy(pending_pool);
5469 +       kmem_cache_destroy(pending_cache);
5470 +       kmem_cache_destroy(exception_cache);
5471 +}
5472 diff -urN linux-2.4.24.org/drivers/md/dm-snapshot.h linux-2.4.24/drivers/md/dm-snapshot.h
5473 --- linux-2.4.24.org/drivers/md/dm-snapshot.h   1970-01-01 01:00:00.000000000 +0100
5474 +++ linux-2.4.24/drivers/md/dm-snapshot.h       2004-01-18 15:01:29.250465221 +0100
5475 @@ -0,0 +1,158 @@
5476 +/*
5477 + * dm-snapshot.c
5478 + *
5479 + * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
5480 + *
5481 + * This file is released under the GPL.
5482 + */
5483 +
5484 +#ifndef DM_SNAPSHOT_H
5485 +#define DM_SNAPSHOT_H
5486 +
5487 +#include "dm.h"
5488 +#include <linux/blkdev.h>
5489 +
5490 +struct exception_table {
5491 +       uint32_t hash_mask;
5492 +       struct list_head *table;
5493 +};
5494 +
5495 +/*
5496 + * The snapshot code deals with largish chunks of the disk at a
5497 + * time. Typically 64k - 256k.
5498 + */
5499 +/* FIXME: can we get away with limiting these to a uint32_t ? */
5500 +typedef sector_t chunk_t;
5501 +
5502 +/*
5503 + * An exception is used where an old chunk of data has been
5504 + * replaced by a new one.
5505 + */
5506 +struct exception {
5507 +       struct list_head hash_list;
5508 +
5509 +       chunk_t old_chunk;
5510 +       chunk_t new_chunk;
5511 +};
5512 +
5513 +/*
5514 + * Abstraction to handle the meta/layout of exception stores (the
5515 + * COW device).
5516 + */
5517 +struct exception_store {
5518 +
5519 +       /*
5520 +        * Destroys this object when you've finished with it.
5521 +        */
5522 +       void (*destroy) (struct exception_store *store);
5523 +
5524 +       /*
5525 +        * The target shouldn't read the COW device until this is
5526 +        * called.
5527 +        */
5528 +       int (*read_metadata) (struct exception_store *store);
5529 +
5530 +       /*
5531 +        * Find somewhere to store the next exception.
5532 +        */
5533 +       int (*prepare_exception) (struct exception_store *store,
5534 +                                 struct exception *e);
5535 +
5536 +       /*
5537 +        * Update the metadata with this exception.
5538 +        */
5539 +       void (*commit_exception) (struct exception_store *store,
5540 +                                 struct exception *e,
5541 +                                 void (*callback) (void *, int success),
5542 +                                 void *callback_context);
5543 +
5544 +       /*
5545 +        * The snapshot is invalid, note this in the metadata.
5546 +        */
5547 +       void (*drop_snapshot) (struct exception_store *store);
5548 +
5549 +       /*
5550 +        * Return how full the snapshot is.
5551 +        */
5552 +       void (*fraction_full) (struct exception_store *store,
5553 +                              sector_t *numerator,
5554 +                              sector_t *denominator);
5555 +
5556 +       struct dm_snapshot *snap;
5557 +       void *context;
5558 +};
5559 +
5560 +struct dm_snapshot {
5561 +       struct rw_semaphore lock;
5562 +       struct dm_table *table;
5563 +
5564 +       struct dm_dev *origin;
5565 +       struct dm_dev *cow;
5566 +
5567 +       /* List of snapshots per Origin */
5568 +       struct list_head list;
5569 +
5570 +       /* Size of data blocks saved - must be a power of 2 */
5571 +       chunk_t chunk_size;
5572 +       chunk_t chunk_mask;
5573 +       chunk_t chunk_shift;
5574 +
5575 +       /* You can't use a snapshot if this is 0 (e.g. if full) */
5576 +       int valid;
5577 +       int have_metadata;
5578 +
5579 +       /* Used for display of table */
5580 +       char type;
5581 +
5582 +       /* The last percentage we notified */
5583 +       int last_percent;
5584 +
5585 +       struct exception_table pending;
5586 +       struct exception_table complete;
5587 +
5588 +       /* The on disk metadata handler */
5589 +       struct exception_store store;
5590 +
5591 +       struct kcopyd_client *kcopyd_client;
5592 +};
5593 +
5594 +/*
5595 + * Used by the exception stores to load exceptions when
5596 + * initialising.
5597 + */
5598 +int dm_add_exception(struct dm_snapshot *s, chunk_t old, chunk_t new);
5599 +
5600 +/*
5601 + * Constructor and destructor for the default persistent
5602 + * store.
5603 + */
5604 +int dm_create_persistent(struct exception_store *store, uint32_t chunk_size);
5605 +
5606 +int dm_create_transient(struct exception_store *store,
5607 +                       struct dm_snapshot *s, int blocksize);
5608 +
5609 +/*
5610 + * Return the number of sectors in the device.
5611 + */
5612 +static inline sector_t get_dev_size(kdev_t dev)
5613 +{
5614 +       int *sizes;
5615 +
5616 +       sizes = blk_size[MAJOR(dev)];
5617 +       if (sizes)
5618 +               return sizes[MINOR(dev)] << 1;
5619 +
5620 +       return 0;
5621 +}
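
In 2.4, blk_size[major][minor] holds the device size in 1 KiB blocks, so the << 1 above converts it to 512-byte sectors. A tiny sketch of the unit conversion (the 1 GiB figure is just an assumed example):

    #include <assert.h>

    int main(void)
    {
            int size_kb = 1048576;              /* assumed: a 1 GiB device */
            long sectors = (long) size_kb << 1; /* KiB -> 512-byte sectors */

            assert(sectors == 2097152);
            return 0;
    }
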
5622 +
5623 +static inline chunk_t sector_to_chunk(struct dm_snapshot *s, sector_t sector)
5624 +{
5625 +       return (sector & ~s->chunk_mask) >> s->chunk_shift;
5626 +}
5627 +
5628 +static inline sector_t chunk_to_sector(struct dm_snapshot *s, chunk_t chunk)
5629 +{
5630 +       return chunk << s->chunk_shift;
5631 +}
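
A worked example of the two conversions above, with assumed numbers: a 16-sector (8 KiB) chunk gives chunk_mask = 0xf and chunk_shift = 4, so sector 37 lands in chunk 2, whose first sector is 32:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t sector_t;
    typedef sector_t chunk_t;

    int main(void)
    {
            sector_t chunk_mask = 16 - 1;       /* 16-sector (8 KiB) chunk */
            unsigned chunk_shift = 4;           /* log2(16) */
            sector_t sector = 37;
            /* masking drops the offset within the chunk, the shift
             * then turns a sector count into a chunk number */
            chunk_t chunk = (sector & ~chunk_mask) >> chunk_shift;

            assert(chunk == 2);                 /* 37 / 16 == 2 */
            assert((chunk << chunk_shift) == 32);  /* first sector of chunk 2 */
            return 0;
    }
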
5632 +
5633 +#endif
5634 diff -urN linux-2.4.24.org/drivers/md/dm-stripe.c linux-2.4.24/drivers/md/dm-stripe.c
5635 --- linux-2.4.24.org/drivers/md/dm-stripe.c     1970-01-01 01:00:00.000000000 +0100
5636 +++ linux-2.4.24/drivers/md/dm-stripe.c 2004-01-18 15:01:13.781711369 +0100
5637 @@ -0,0 +1,258 @@
5638 +/*
5639 + * Copyright (C) 2001 Sistina Software (UK) Limited.
5640 + *
5641 + * This file is released under the GPL.
5642 + */
5643 +
5644 +#include "dm.h"
5645 +
5646 +#include <linux/module.h>
5647 +#include <linux/init.h>
5648 +#include <linux/blkdev.h>
5649 +#include <linux/slab.h>
5650 +
5651 +struct stripe {
5652 +       struct dm_dev *dev;
5653 +       sector_t physical_start;
5654 +};
5655 +
5656 +struct stripe_c {
5657 +       uint32_t stripes;
5658 +
5659 +       /* The size of this target / num. stripes */
5660 +       uint32_t stripe_width;
5661 +
5662 +       /* stripe chunk size */
5663 +       uint32_t chunk_shift;
5664 +       sector_t chunk_mask;
5665 +
5666 +       struct stripe stripe[0];
5667 +};
5668 +
5669 +static inline struct stripe_c *alloc_context(unsigned int stripes)
5670 +{
5671 +       size_t len;
5672 +
5673 +       if (array_too_big(sizeof(struct stripe_c), sizeof(struct stripe),
5674 +                         stripes))
5675 +               return NULL;
5676 +
5677 +       len = sizeof(struct stripe_c) + (sizeof(struct stripe) * stripes);
5678 +
5679 +       return kmalloc(len, GFP_KERNEL);
5680 +}
5681 +
5682 +/*
5683 + * Parse a single <dev> <sector> pair
5684 + */
5685 +static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
5686 +                     unsigned int stripe, char **argv)
5687 +{
5688 +       sector_t start;
5689 +
5690 +       if (sscanf(argv[1], SECTOR_FORMAT, &start) != 1)
5691 +               return -EINVAL;
5692 +
5693 +       if (dm_get_device(ti, argv[0], start, sc->stripe_width,
5694 +                         dm_table_get_mode(ti->table),
5695 +                         &sc->stripe[stripe].dev))
5696 +               return -ENXIO;
5697 +
5698 +       sc->stripe[stripe].physical_start = start;
5699 +       return 0;
5700 +}
5701 +
5702 +/*
5703 + * FIXME: Nasty function, only present because we can't link
5704 + * against __moddi3 and __divdi3.
5705 + *
5706 + * Returns non-zero iff a == b * n, storing the quotient in *n.
5707 + */
5708 +static int multiple(sector_t a, sector_t b, sector_t *n)
5709 +{
5710 +       sector_t acc, prev, i;
5711 +
5712 +       *n = 0;
5713 +       while (a >= b) {
5714 +               for (acc = b, prev = 0, i = 1;
5715 +                    acc <= a;
5716 +                    prev = acc, acc <<= 1, i <<= 1)
5717 +                       ;
5718 +
5719 +               a -= prev;
5720 +               *n += i >> 1;
5721 +       }
5722 +
5723 +       return a == 0;
5724 +}
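
A userspace check of multiple(): it divides by repeated doubling (binary long division) because a 2.4 module cannot link against gcc's 64-bit division helpers, and it returns non-zero only when the division is exact:

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    static int multiple(sector_t a, sector_t b, sector_t *n)
    {
            sector_t acc, prev, i;

            *n = 0;
            while (a >= b) {
                    /* find the largest b * 2^k that still fits in a */
                    for (acc = b, prev = 0, i = 1; acc <= a;
                         prev = acc, acc <<= 1, i <<= 1)
                            ;
                    a -= prev;          /* subtract it ...              */
                    *n += i >> 1;       /* ... and add 2^k to *n        */
            }
            return a == 0;              /* exact multiple?              */
    }

    int main(void)
    {
            sector_t n;

            assert(multiple(1024, 4, &n) && n == 256);
            assert(!multiple(1000, 3, &n));     /* 1000 == 3 * 333 + 1 */
            return 0;
    }
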
5725 +
5726 +/*
5727 + * Construct a striped mapping.
5728 + * <number of stripes> <chunk size (2^^n)> [<dev_path> <offset>]+
5729 + */
5730 +static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
5731 +{
5732 +       struct stripe_c *sc;
5733 +       sector_t width;
5734 +       uint32_t stripes;
5735 +       uint32_t chunk_size;
5736 +       char *end;
5737 +       int r;
5738 +       unsigned int i;
5739 +
5740 +       if (argc < 2) {
5741 +               ti->error = "dm-stripe: Not enough arguments";
5742 +               return -EINVAL;
5743 +       }
5744 +
5745 +       stripes = simple_strtoul(argv[0], &end, 10);
5746 +       if (*end) {
5747 +               ti->error = "dm-stripe: Invalid stripe count";
5748 +               return -EINVAL;
5749 +       }
5750 +
5751 +       chunk_size = simple_strtoul(argv[1], &end, 10);
5752 +       if (*end) {
5753 +               ti->error = "dm-stripe: Invalid chunk_size";
5754 +               return -EINVAL;
5755 +       }
5756 +
5757 +       /*
5758 +        * chunk_size is a power of two
5759 +        */
5760 +       if (!chunk_size || (chunk_size & (chunk_size - 1))) {
5761 +               ti->error = "dm-stripe: Invalid chunk size";
5762 +               return -EINVAL;
5763 +       }
5764 +
5765 +       if (!multiple(ti->len, stripes, &width)) {
5766 +               ti->error = "dm-stripe: Target length not divisible by "
5767 +                   "number of stripes";
5768 +               return -EINVAL;
5769 +       }
5770 +
5771 +       /*
5772 +        * Do we have enough arguments for that many stripes ?
5773 +        */
5774 +       if (argc != (2 + 2 * stripes)) {
5775 +               ti->error = "dm-stripe: Not enough destinations specified";
5776 +               return -EINVAL;
5777 +       }
5778 +
5779 +       sc = alloc_context(stripes);
5780 +       if (!sc) {
5781 +               ti->error = "dm-stripe: Memory allocation for striped context "
5782 +                   "failed";
5783 +               return -ENOMEM;
5784 +       }
5785 +
5786 +       sc->stripes = stripes;
5787 +       sc->stripe_width = width;
5788 +
5789 +       sc->chunk_mask = ((sector_t) chunk_size) - 1;
5790 +       for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
5791 +               chunk_size >>= 1;
5792 +       sc->chunk_shift--;
5793 +
5794 +       /*
5795 +        * Get the stripe destinations.
5796 +        */
5797 +       for (i = 0; i < stripes; i++) {
5798 +               argv += 2;
5799 +
5800 +               r = get_stripe(ti, sc, i, argv);
5801 +               if (r < 0) {
5802 +                       ti->error = "dm-stripe: Couldn't parse stripe "
5803 +                           "destination";
5804 +                       while (i--)
5805 +                               dm_put_device(ti, sc->stripe[i].dev);
5806 +                       kfree(sc);
5807 +                       return r;
5808 +               }
5809 +       }
5810 +
5811 +       ti->private = sc;
5812 +       return 0;
5813 +}
5814 +
5815 +static void stripe_dtr(struct dm_target *ti)
5816 +{
5817 +       unsigned int i;
5818 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
5819 +
5820 +       for (i = 0; i < sc->stripes; i++)
5821 +               dm_put_device(ti, sc->stripe[i].dev);
5822 +
5823 +       kfree(sc);
5824 +}
5825 +
5826 +static int stripe_map(struct dm_target *ti, struct buffer_head *bh, int rw,
5827 +                     union map_info *context)
5828 +{
5829 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
5830 +
5831 +       sector_t offset = bh->b_rsector - ti->begin;
5832 +       uint32_t chunk = (uint32_t) (offset >> sc->chunk_shift);
5833 +       uint32_t stripe = chunk % sc->stripes;  /* 32bit modulus */
5834 +       chunk = chunk / sc->stripes;
5835 +
5836 +       bh->b_rdev = sc->stripe[stripe].dev->dev;
5837 +       bh->b_rsector = sc->stripe[stripe].physical_start +
5838 +           (chunk << sc->chunk_shift) + (offset & sc->chunk_mask);
5839 +       return 1;
5840 +}
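
A worked example of the remap arithmetic in stripe_map, with assumed numbers (two stripes, 8-sector chunks, target and both stripes starting at sector 0):

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t sector_t;

    int main(void)
    {
            uint32_t stripes = 2, chunk_shift = 3;  /* 8-sector chunks */
            sector_t chunk_mask = 7;
            sector_t offset = 37;                   /* sector within target */
            uint32_t chunk = (uint32_t) (offset >> chunk_shift);    /* 4 */
            uint32_t stripe = chunk % stripes;      /* chunk 4 -> stripe 0 */
            sector_t s;

            chunk = chunk / stripes;                /* chunk 2 on that stripe */
            s = ((sector_t) chunk << chunk_shift) + (offset & chunk_mask);
            assert(stripe == 0 && s == 21);         /* 2*8 + (37 & 7) == 21 */
            return 0;
    }
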
5841 +
5842 +static int stripe_status(struct dm_target *ti, status_type_t type,
5843 +                        char *result, unsigned int maxlen)
5844 +{
5845 +       struct stripe_c *sc = (struct stripe_c *) ti->private;
5846 +       int offset;
5847 +       unsigned int i;
5848 +
5849 +       switch (type) {
5850 +       case STATUSTYPE_INFO:
5851 +               result[0] = '\0';
5852 +               break;
5853 +
5854 +       case STATUSTYPE_TABLE:
5855 +               offset = snprintf(result, maxlen, "%d " SECTOR_FORMAT,
5856 +                                 sc->stripes, sc->chunk_mask + 1);
5857 +               for (i = 0; i < sc->stripes; i++) {
5858 +                       offset +=
5859 +                           snprintf(result + offset, maxlen - offset,
5860 +                                    " %s " SECTOR_FORMAT,
5861 +                      dm_kdevname(to_kdev_t(sc->stripe[i].dev->bdev->bd_dev)),
5862 +                                    sc->stripe[i].physical_start);
5863 +               }
5864 +               break;
5865 +       }
5866 +       return 0;
5867 +}
5868 +
5869 +static struct target_type stripe_target = {
5870 +       .name   = "striped",
5871 +       .module = THIS_MODULE,
5872 +       .ctr    = stripe_ctr,
5873 +       .dtr    = stripe_dtr,
5874 +       .map    = stripe_map,
5875 +       .status = stripe_status,
5876 +};
5877 +
5878 +int __init dm_stripe_init(void)
5879 +{
5880 +       int r;
5881 +
5882 +       r = dm_register_target(&stripe_target);
5883 +       if (r < 0)
5884 +               DMWARN("striped target registration failed");
5885 +
5886 +       return r;
5887 +}
5888 +
5889 +void dm_stripe_exit(void)
5890 +{
5891 +       if (dm_unregister_target(&stripe_target))
5892 +               DMWARN("striped target unregistration failed");
5893 +
5894 +       return;
5895 +}
5896 diff -urN linux-2.4.24.org/drivers/md/dm-table.c linux-2.4.24/drivers/md/dm-table.c
5897 --- linux-2.4.24.org/drivers/md/dm-table.c      1970-01-01 01:00:00.000000000 +0100
5898 +++ linux-2.4.24/drivers/md/dm-table.c  2004-01-18 15:01:13.786710320 +0100
5899 @@ -0,0 +1,696 @@
5900 +/*
5901 + * Copyright (C) 2001 Sistina Software (UK) Limited.
5902 + *
5903 + * This file is released under the GPL.
5904 + */
5905 +
5906 +#include "dm.h"
5907 +
5908 +#include <linux/module.h>
5909 +#include <linux/vmalloc.h>
5910 +#include <linux/blkdev.h>
5911 +#include <linux/ctype.h>
5912 +#include <linux/slab.h>
5913 +#include <asm/atomic.h>
5914 +
5915 +#define MAX_DEPTH 16
5916 +#define NODE_SIZE L1_CACHE_BYTES
5917 +#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
5918 +#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
5919 +
5920 +struct dm_table {
5921 +       atomic_t holders;
5922 +
5923 +       /* btree table */
5924 +       unsigned int depth;
5925 +       unsigned int counts[MAX_DEPTH]; /* in nodes */
5926 +       sector_t *index[MAX_DEPTH];
5927 +
5928 +       unsigned int num_targets;
5929 +       unsigned int num_allocated;
5930 +       sector_t *highs;
5931 +       struct dm_target *targets;
5932 +
5933 +       /*
5934 +        * Indicates the rw permissions for the new logical
5935 +        * device.  This should be a combination of FMODE_READ
5936 +        * and FMODE_WRITE.
5937 +        */
5938 +       int mode;
5939 +
5940 +       /* a list of devices used by this table */
5941 +       struct list_head devices;
5942 +
5943 +       /* events get handed up using this callback */
5944 +       void (*event_fn)(void *);
5945 +       void *event_context;
5946 +};
5947 +
5948 +/*
5949 + * Similar to ceiling(log_base(n))
5950 + */
5951 +static unsigned int int_log(unsigned long n, unsigned long base)
5952 +{
5953 +       int result = 0;
5954 +
5955 +       while (n > 1) {
5956 +               n = dm_div_up(n, base);
5957 +               result++;
5958 +       }
5959 +
5960 +       return result;
5961 +}
5962 +
5963 +/*
5964 + * Calculate the index of the child node holding the k'th key of the n'th node.
5965 + */
5966 +static inline unsigned int get_child(unsigned int n, unsigned int k)
5967 +{
5968 +       return (n * CHILDREN_PER_NODE) + k;
5969 +}
5970 +
5971 +/*
5972 + * Return the n'th node of level l from table t.
5973 + */
5974 +static inline sector_t *get_node(struct dm_table *t, unsigned int l,
5975 +                                unsigned int n)
5976 +{
5977 +       return t->index[l] + (n * KEYS_PER_NODE);
5978 +}
5979 +
5980 +/*
5981 + * Return the highest key that you could look up from the n'th
5982 + * node on level l of the btree.
5983 + */
5984 +static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
5985 +{
5986 +       for (; l < t->depth - 1; l++)
5987 +               n = get_child(n, CHILDREN_PER_NODE - 1);
5988 +
5989 +       if (n >= t->counts[l])
5990 +               return (sector_t) - 1;
5991 +
5992 +       return get_node(t, l, n)[KEYS_PER_NODE - 1];
5993 +}
5994 +
5995 +/*
5996 + * Fills in a level of the btree based on the highs of the level
5997 + * below it.
5998 + */
5999 +static int setup_btree_index(unsigned int l, struct dm_table *t)
6000 +{
6001 +       unsigned int n, k;
6002 +       sector_t *node;
6003 +
6004 +       for (n = 0U; n < t->counts[l]; n++) {
6005 +               node = get_node(t, l, n);
6006 +
6007 +               for (k = 0U; k < KEYS_PER_NODE; k++)
6008 +                       node[k] = high(t, l + 1, get_child(n, k));
6009 +       }
6010 +
6011 +       return 0;
6012 +}
6013 +
6014 +void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
6015 +{
6016 +       unsigned long size;
6017 +       void *addr;
6018 +
6019 +       /*
6020 +        * Check that we're not going to overflow.
6021 +        */
6022 +       if (nmemb > (ULONG_MAX / elem_size))
6023 +               return NULL;
6024 +
6025 +       size = nmemb * elem_size;
6026 +       addr = vmalloc(size);
6027 +       if (addr)
6028 +               memset(addr, 0, size);
6029 +
6030 +       return addr;
6031 +}
6032 +
6033 +int dm_table_create(struct dm_table **result, int mode, unsigned num_targets)
6034 +{
6035 +       struct dm_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
6036 +
6037 +       if (!t)
6038 +               return -ENOMEM;
6039 +
6040 +       memset(t, 0, sizeof(*t));
6041 +       INIT_LIST_HEAD(&t->devices);
6042 +       atomic_set(&t->holders, 1);
6043 +
6044 +       num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
6045 +
6046 +       /* Allocate both the target array and offset array at once. */
6047 +       t->highs = (sector_t *) dm_vcalloc(sizeof(struct dm_target) +
6048 +                                          sizeof(sector_t), num_targets);
6049 +       if (!t->highs) {
6050 +               kfree(t);
6051 +               return -ENOMEM;
6052 +       }
6053 +
6054 +       memset(t->highs, -1, sizeof(*t->highs) * num_targets);
6055 +
6056 +       t->targets = (struct dm_target *) (t->highs + num_targets);
6057 +       t->num_allocated = num_targets;
6058 +       t->mode = mode;
6059 +       *result = t;
6060 +       return 0;
6061 +}
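
dm_table_create carves one vmalloc'd block into the two arrays: num_targets keys immediately followed by num_targets targets. A userspace sketch of the same carving (struct tgt is a stand-in for struct dm_target; alignment works because the struct's strictest member is a sector_t):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    typedef uint64_t sector_t;
    struct tgt { sector_t begin, len; void *private_data; };

    int main(void)
    {
            unsigned num = 16;
            sector_t *highs;
            struct tgt *targets;

            /* one block holding both arrays, as dm_vcalloc() does */
            highs = calloc(num, sizeof(sector_t) + sizeof(struct tgt));
            assert(highs != NULL);
            targets = (struct tgt *) (highs + num);

            memset(highs, -1, sizeof(*highs) * num);    /* unused keys = max */
            assert(highs[0] == (sector_t) -1);
            targets[num - 1].len = 8;                   /* both views usable */
            assert(targets[num - 1].len == 8);
            free(highs);
            return 0;
    }
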
6062 +
6063 +static void free_devices(struct list_head *devices)
6064 +{
6065 +       struct list_head *tmp, *next;
6066 +
6067 +       for (tmp = devices->next; tmp != devices; tmp = next) {
6068 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6069 +               next = tmp->next;
6070 +               kfree(dd);
6071 +       }
6072 +}
6073 +
6074 +void table_destroy(struct dm_table *t)
6075 +{
6076 +       unsigned int i;
6077 +
6078 +       /* free the indexes (see dm_table_complete) */
6079 +       if (t->depth >= 2)
6080 +               vfree(t->index[t->depth - 2]);
6081 +
6082 +       /* free the targets */
6083 +       for (i = 0; i < t->num_targets; i++) {
6084 +               struct dm_target *tgt = t->targets + i;
6085 +
6086 +               if (tgt->type->dtr)
6087 +                       tgt->type->dtr(tgt);
6088 +
6089 +               dm_put_target_type(tgt->type);
6090 +       }
6091 +
6092 +       vfree(t->highs);
6093 +
6094 +       /* free the device list */
6095 +       if (t->devices.next != &t->devices) {
6096 +               DMWARN("devices still present during destroy: "
6097 +                      "dm_table_remove_device calls missing");
6098 +
6099 +               free_devices(&t->devices);
6100 +       }
6101 +
6102 +       kfree(t);
6103 +}
6104 +
6105 +void dm_table_get(struct dm_table *t)
6106 +{
6107 +       atomic_inc(&t->holders);
6108 +}
6109 +
6110 +void dm_table_put(struct dm_table *t)
6111 +{
6112 +       if (atomic_dec_and_test(&t->holders))
6113 +               table_destroy(t);
6114 +}
6115 +
6116 +/*
6117 + * Convert a device path to a kdev_t.
6118 + */
6119 +static int lookup_device(const char *path, kdev_t *dev)
6120 +{
6121 +       int r;
6122 +       struct nameidata nd;
6123 +       struct inode *inode;
6124 +
6125 +       if (!path_init(path, LOOKUP_FOLLOW, &nd))
6126 +               return 0;
6127 +
6128 +       if ((r = path_walk(path, &nd)))
6129 +               goto out;
6130 +
6131 +       inode = nd.dentry->d_inode;
6132 +       if (!inode) {
6133 +               r = -ENOENT;
6134 +               goto out;
6135 +       }
6136 +
6137 +       if (!S_ISBLK(inode->i_mode)) {
6138 +               r = -ENOTBLK;
6139 +               goto out;
6140 +       }
6141 +
6142 +       *dev = inode->i_rdev;
6143 +
6144 +      out:
6145 +       path_release(&nd);
6146 +       return r;
6147 +}
6148 +
6149 +/*
6150 + * See if we've already got a device in the list.
6151 + */
6152 +static struct dm_dev *find_device(struct list_head *l, kdev_t dev)
6153 +{
6154 +       struct list_head *tmp;
6155 +
6156 +       list_for_each(tmp, l) {
6157 +               struct dm_dev *dd = list_entry(tmp, struct dm_dev, list);
6158 +               if (kdev_same(dd->dev, dev))
6159 +                       return dd;
6160 +       }
6161 +
6162 +       return NULL;
6163 +}
6164 +
6165 +/*
6166 + * Open a device so we can use it as a map destination.
6167 + */
6168 +static int open_dev(struct dm_dev *dd)
6169 +{
6170 +       if (dd->bdev)
6171 +               BUG();
6172 +
6173 +       dd->bdev = bdget(kdev_t_to_nr(dd->dev));
6174 +       if (!dd->bdev)
6175 +               return -ENOMEM;
6176 +
6177 +       return blkdev_get(dd->bdev, dd->mode, 0, BDEV_RAW);
6178 +}
6179 +
6180 +/*
6181 + * Close a device that we've been using.
6182 + */
6183 +static void close_dev(struct dm_dev *dd)
6184 +{
6185 +       if (!dd->bdev)
6186 +               return;
6187 +
6188 +       blkdev_put(dd->bdev, BDEV_RAW);
6189 +       dd->bdev = NULL;
6190 +}
6191 +
6192 +/*
6193 + * If possible (i.e. blk_size[major] is set), this checks that an
6194 + * area of the destination device is valid.
6195 + */
6196 +static int check_device_area(kdev_t dev, sector_t start, sector_t len)
6197 +{
6198 +       int *sizes;
6199 +       sector_t dev_size;
6200 +
6201 +       if (!(sizes = blk_size[major(dev)]) || !(dev_size = sizes[minor(dev)]))
6202 +               /* we don't know the device details,
6203 +                * so give the benefit of the doubt */
6204 +               return 1;
6205 +
6206 +       /* convert to 512-byte sectors */
6207 +       dev_size <<= 1;
6208 +
6209 +       return ((start < dev_size) && (len <= (dev_size - start)));
6210 +}
6211 +
6212 +/*
6213 + * This upgrades the mode on an already open dm_dev.  Being
6214 + * careful to leave things as they were if we fail to reopen the
6215 + * device.
6216 + */
6217 +static int upgrade_mode(struct dm_dev *dd, int new_mode)
6218 +{
6219 +       int r;
6220 +       struct dm_dev dd_copy;
6221 +
6222 +       memcpy(&dd_copy, dd, sizeof(dd_copy));
6223 +
6224 +       dd->mode |= new_mode;
6225 +       dd->bdev = NULL;
6226 +       r = open_dev(dd);
6227 +       if (!r)
6228 +               close_dev(&dd_copy);
6229 +       else
6230 +               memcpy(dd, &dd_copy, sizeof(dd_copy));
6231 +
6232 +       return r;
6233 +}
6234 +
6235 +/*
6236 + * Add a device to the list, or just increment the usage count if
6237 + * it's already present.
6238 + */
6239 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
6240 +                 sector_t len, int mode, struct dm_dev **result)
6241 +{
6242 +       int r;
6243 +       kdev_t dev;
6244 +       struct dm_dev *dd;
6245 +       unsigned major, minor;
6246 +       struct dm_table *t = ti->table;
6247 +
6248 +       if (!t)
6249 +               BUG();
6250 +
6251 +       if (sscanf(path, "%u:%u", &major, &minor) == 2) {
6252 +               /* Extract the major/minor numbers */
6253 +               dev = mk_kdev(major, minor);
6254 +       } else {
6255 +               /* convert the path to a device */
6256 +               if ((r = lookup_device(path, &dev)))
6257 +                       return r;
6258 +       }
6259 +
6260 +       dd = find_device(&t->devices, dev);
6261 +       if (!dd) {
6262 +               dd = kmalloc(sizeof(*dd), GFP_KERNEL);
6263 +               if (!dd)
6264 +                       return -ENOMEM;
6265 +
6266 +               dd->dev = dev;
6267 +               dd->mode = mode;
6268 +               dd->bdev = NULL;
6269 +
6270 +               if ((r = open_dev(dd))) {
6271 +                       kfree(dd);
6272 +                       return r;
6273 +               }
6274 +
6275 +               atomic_set(&dd->count, 0);
6276 +               list_add(&dd->list, &t->devices);
6277 +
6278 +       } else if (dd->mode != (mode | dd->mode)) {
6279 +               r = upgrade_mode(dd, mode);
6280 +               if (r)
6281 +                       return r;
6282 +       }
6283 +       atomic_inc(&dd->count);
6284 +
6285 +       if (!check_device_area(dd->dev, start, len)) {
6286 +               DMWARN("device %s too small for target", path);
6287 +               dm_put_device(ti, dd);
6288 +               return -EINVAL;
6289 +       }
6290 +
6291 +       *result = dd;
6292 +
6293 +       return 0;
6294 +}
6295 +
6296 +/*
6297 + * Decrement a device's use count and remove it if necessary.
6298 + */
6299 +void dm_put_device(struct dm_target *ti, struct dm_dev *dd)
6300 +{
6301 +       if (atomic_dec_and_test(&dd->count)) {
6302 +               close_dev(dd);
6303 +               list_del(&dd->list);
6304 +               kfree(dd);
6305 +       }
6306 +}
6307 +
6308 +/*
6309 + * Checks to see if the target joins onto the end of the table.
6310 + */
6311 +static int adjoin(struct dm_table *table, struct dm_target *ti)
6312 +{
6313 +       struct dm_target *prev;
6314 +
6315 +       if (!table->num_targets)
6316 +               return !ti->begin;
6317 +
6318 +       prev = &table->targets[table->num_targets - 1];
6319 +       return (ti->begin == (prev->begin + prev->len));
6320 +}
6321 +
6322 +/*
6323 + * Used to dynamically allocate the arg array.
6324 + */
6325 +static char **realloc_argv(unsigned *array_size, char **old_argv)
6326 +{
6327 +       char **argv;
6328 +       unsigned new_size;
6329 +
6330 +       new_size = *array_size ? *array_size * 2 : 64;
6331 +       argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
6332 +       if (argv) {
6333 +               memcpy(argv, old_argv, *array_size * sizeof(*argv));
6334 +               *array_size = new_size;
6335 +       }
6336 +
6337 +       kfree(old_argv);
6338 +       return argv;
6339 +}
6340 +
6341 +/*
6342 + * Destructively splits up the argument list to pass to ctr.
6343 + */
6344 +static int split_args(int *argc, char ***argvp, char *input)
6345 +{
6346 +       char *start, *end = input, *out, **argv = NULL;
6347 +       unsigned array_size = 0;
6348 +
6349 +       *argc = 0;
6350 +       argv = realloc_argv(&array_size, argv);
6351 +       if (!argv)
6352 +               return -ENOMEM;
6353 +
6354 +       while (1) {
6355 +               start = end;
6356 +
6357 +               /* Skip whitespace */
6358 +               while (*start && isspace(*start))
6359 +                       start++;
6360 +
6361 +               if (!*start)
6362 +                       break;  /* success, we hit the end */
6363 +
6364 +               /* 'out' is used to remove any back-quotes */
6365 +               end = out = start;
6366 +               while (*end) {
6367 +                       /* Everything apart from '\0' can be quoted */
6368 +                       if (*end == '\\' && *(end + 1)) {
6369 +                               *out++ = *(end + 1);
6370 +                               end += 2;
6371 +                               continue;
6372 +                       }
6373 +
6374 +                       if (isspace(*end))
6375 +                               break;  /* end of token */
6376 +
6377 +                       *out++ = *end++;
6378 +               }
6379 +
6380 +               /* have we already filled the array ? */
6381 +               if ((*argc + 1) > array_size) {
6382 +                       argv = realloc_argv(&array_size, argv);
6383 +                       if (!argv)
6384 +                               return -ENOMEM;
6385 +               }
6386 +
6387 +               /* we know this is whitespace */
6388 +               if (*end)
6389 +                       end++;
6390 +
6391 +               /* terminate the string and put it in the array */
6392 +               *out = '\0';
6393 +               argv[*argc] = start;
6394 +               (*argc)++;
6395 +       }
6396 +
6397 +       *argvp = argv;
6398 +       return 0;
6399 +}
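
A userspace re-run of the tokenising rules above, with a fixed argv[] standing in for realloc_argv(); note how a backslash keeps a space inside a token:

    #include <assert.h>
    #include <ctype.h>
    #include <string.h>

    /* Same loop as split_args() above, minus the dynamic argv. */
    static int split_args(int *argc, char **argv, int max, char *input)
    {
            char *start, *end = input, *out;

            *argc = 0;
            while (1) {
                    start = end;
                    while (*start && isspace(*start))
                            start++;
                    if (!*start)
                            break;              /* hit the end of input */

                    end = out = start;
                    while (*end) {
                            if (*end == '\\' && *(end + 1)) {
                                    *out++ = *(end + 1);    /* unquote */
                                    end += 2;
                                    continue;
                            }
                            if (isspace(*end))
                                    break;      /* end of token */
                            *out++ = *end++;
                    }

                    if (*argc == max)
                            return -1;
                    if (*end)
                            end++;
                    *out = '\0';
                    argv[(*argc)++] = start;
            }
            return 0;
    }

    int main(void)
    {
            char line[] = "2 64 /dev/sda 0 /dev/hdb\\ 1 0";
            char *argv[8];
            int argc;

            assert(split_args(&argc, argv, 8, line) == 0);
            assert(argc == 6);
            assert(strcmp(argv[4], "/dev/hdb 1") == 0); /* escape removed */
            return 0;
    }
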
6400 +
6401 +int dm_table_add_target(struct dm_table *t, const char *type,
6402 +                       sector_t start, sector_t len, char *params)
6403 +{
6404 +       int r = -EINVAL, argc;
6405 +       char **argv;
6406 +       struct dm_target *tgt;
6407 +
6408 +       if (t->num_targets >= t->num_allocated)
6409 +               return -ENOMEM;
6410 +
6411 +       tgt = t->targets + t->num_targets;
6412 +       memset(tgt, 0, sizeof(*tgt));
6413 +
6414 +       tgt->type = dm_get_target_type(type);
6415 +       if (!tgt->type) {
6416 +               tgt->error = "unknown target type";
6417 +               return -EINVAL;
6418 +       }
6419 +
6420 +       tgt->table = t;
6421 +       tgt->begin = start;
6422 +       tgt->len = len;
6423 +       tgt->error = "Unknown error";
6424 +
6425 +       /*
6426 +        * Does this target adjoin the previous one ?
6427 +        */
6428 +       if (!adjoin(t, tgt)) {
6429 +               tgt->error = "Gap in table";
6430 +               r = -EINVAL;
6431 +               goto bad;
6432 +       }
6433 +
6434 +       r = split_args(&argc, &argv, params);
6435 +       if (r) {
6436 +               tgt->error = "couldn't split parameters (insufficient memory)";
6437 +               goto bad;
6438 +       }
6439 +
6440 +       r = tgt->type->ctr(tgt, argc, argv);
6441 +       kfree(argv);
6442 +       if (r)
6443 +               goto bad;
6444 +
6445 +       t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
6446 +       return 0;
6447 +
6448 +      bad:
6449 +       printk(KERN_ERR DM_NAME ": %s\n", tgt->error);
6450 +       dm_put_target_type(tgt->type);
6451 +       return r;
6452 +}
6453 +
6454 +static int setup_indexes(struct dm_table *t)
6455 +{
6456 +       int i;
6457 +       unsigned int total = 0;
6458 +       sector_t *indexes;
6459 +
6460 +       /* allocate the space for *all* the indexes */
6461 +       for (i = t->depth - 2; i >= 0; i--) {
6462 +               t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
6463 +               total += t->counts[i];
6464 +       }
6465 +
6466 +       indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
6467 +       if (!indexes)
6468 +               return -ENOMEM;
6469 +
6470 +       /* set up internal nodes, bottom-up */
6471 +       for (i = t->depth - 2, total = 0; i >= 0; i--) {
6472 +               t->index[i] = indexes;
6473 +               indexes += (KEYS_PER_NODE * t->counts[i]);
6474 +               setup_btree_index(i, t);
6475 +       }
6476 +
6477 +       return 0;
6478 +}
6479 +
6480 +/*
6481 + * Builds the btree to index the map.
6482 + */
6483 +int dm_table_complete(struct dm_table *t)
6484 +{
6485 +       int r = 0;
6486 +       unsigned int leaf_nodes;
6487 +
6488 +       /* how many indexes will the btree have ? */
6489 +       leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
6490 +       t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
6491 +
6492 +       /* leaf layer has already been set up */
6493 +       t->counts[t->depth - 1] = leaf_nodes;
6494 +       t->index[t->depth - 1] = t->highs;
6495 +
6496 +       if (t->depth >= 2)
6497 +               r = setup_indexes(t);
6498 +
6499 +       return r;
6500 +}
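
A worked example of the depth calculation, assuming a 64-byte NODE_SIZE and an 8-byte sector_t, so KEYS_PER_NODE is 8 and CHILDREN_PER_NODE is 9:

    #include <assert.h>

    #define KEYS_PER_NODE 8             /* assumed: 64-byte node, 8-byte keys */
    #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

    static unsigned long dm_div_up(unsigned long n, unsigned long d)
    {
            return (n + d - 1) / d;
    }

    /* ceiling(log_base(n)), as in the patch */
    static unsigned int int_log(unsigned long n, unsigned long base)
    {
            unsigned int result = 0;

            while (n > 1) {
                    n = dm_div_up(n, base);
                    result++;
            }
            return result;
    }

    int main(void)
    {
            unsigned long leaves = dm_div_up(100, KEYS_PER_NODE);   /* 13 */
            unsigned int depth = 1 + int_log(leaves, CHILDREN_PER_NODE);

            /* 100 targets: one root, one internal level, one leaf level */
            assert(leaves == 13 && depth == 3);
            return 0;
    }
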
6501 +
6502 +static spinlock_t _event_lock = SPIN_LOCK_UNLOCKED;
6503 +void dm_table_event_callback(struct dm_table *t,
6504 +                            void (*fn)(void *), void *context)
6505 +{
6506 +       spin_lock_irq(&_event_lock);
6507 +       t->event_fn = fn;
6508 +       t->event_context = context;
6509 +       spin_unlock_irq(&_event_lock);
6510 +}
6511 +
6512 +void dm_table_event(struct dm_table *t)
6513 +{
6514 +       spin_lock(&_event_lock);
6515 +       if (t->event_fn)
6516 +               t->event_fn(t->event_context);
6517 +       spin_unlock(&_event_lock);
6518 +}
6519 +
6520 +sector_t dm_table_get_size(struct dm_table *t)
6521 +{
6522 +       return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
6523 +}
6524 +
6525 +struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
6526 +{
6527 +       if (index >= t->num_targets)
6528 +               return NULL;
6529 +
6530 +       return t->targets + index;
6531 +}
6532 +
6533 +/*
6534 + * Search the btree for the correct target.
6535 + */
6536 +struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
6537 +{
6538 +       unsigned int l, n = 0, k = 0;
6539 +       sector_t *node;
6540 +
6541 +       for (l = 0; l < t->depth; l++) {
6542 +               n = get_child(n, k);
6543 +               node = get_node(t, l, n);
6544 +
6545 +               for (k = 0; k < KEYS_PER_NODE; k++)
6546 +                       if (node[k] >= sector)
6547 +                               break;
6548 +       }
6549 +
6550 +       return &t->targets[(KEYS_PER_NODE * n) + k];
6551 +}
6552 +
6553 +unsigned int dm_table_get_num_targets(struct dm_table *t)
6554 +{
6555 +       return t->num_targets;
6556 +}
6557 +
6558 +struct list_head *dm_table_get_devices(struct dm_table *t)
6559 +{
6560 +       return &t->devices;
6561 +}
6562 +
6563 +int dm_table_get_mode(struct dm_table *t)
6564 +{
6565 +       return t->mode;
6566 +}
6567 +
6568 +void dm_table_suspend_targets(struct dm_table *t)
6569 +{
6570 +       int i;
6571 +
6572 +       for (i = 0; i < t->num_targets; i++) {
6573 +               struct dm_target *ti = t->targets + i;
6574 +
6575 +               if (ti->type->suspend)
6576 +                       ti->type->suspend(ti);
6577 +       }
6578 +}
6579 +
6580 +void dm_table_resume_targets(struct dm_table *t)
6581 +{
6582 +       int i;
6583 +
6584 +       for (i = 0; i < t->num_targets; i++) {
6585 +               struct dm_target *ti = t->targets + i;
6586 +
6587 +               if (ti->type->resume)
6588 +                       ti->type->resume(ti);
6589 +       }
6590 +}
6591 +
6592 +EXPORT_SYMBOL(dm_get_device);
6593 +EXPORT_SYMBOL(dm_put_device);
6594 +EXPORT_SYMBOL(dm_table_event);
6595 +EXPORT_SYMBOL(dm_table_get_mode);
6596 diff -urN linux-2.4.24.org/drivers/md/dm-target.c linux-2.4.24/drivers/md/dm-target.c
6597 --- linux-2.4.24.org/drivers/md/dm-target.c     1970-01-01 01:00:00.000000000 +0100
6598 +++ linux-2.4.24/drivers/md/dm-target.c 2004-01-18 15:01:13.789709690 +0100
6599 @@ -0,0 +1,188 @@
6600 +/*
6601 + * Copyright (C) 2001 Sistina Software (UK) Limited
6602 + *
6603 + * This file is released under the GPL.
6604 + */
6605 +
6606 +#include "dm.h"
6607 +
6608 +#include <linux/module.h>
6609 +#include <linux/kmod.h>
6610 +#include <linux/slab.h>
6611 +
6612 +struct tt_internal {
6613 +       struct target_type tt;
6614 +
6615 +       struct list_head list;
6616 +       long use;
6617 +};
6618 +
6619 +static LIST_HEAD(_targets);
6620 +static DECLARE_RWSEM(_lock);
6621 +
6622 +#define DM_MOD_NAME_SIZE 32
6623 +
6624 +static inline struct tt_internal *__find_target_type(const char *name)
6625 +{
6626 +       struct list_head *tih;
6627 +       struct tt_internal *ti;
6628 +
6629 +       list_for_each(tih, &_targets) {
6630 +               ti = list_entry(tih, struct tt_internal, list);
6631 +
6632 +               if (!strcmp(name, ti->tt.name))
6633 +                       return ti;
6634 +       }
6635 +
6636 +       return NULL;
6637 +}
6638 +
6639 +static struct tt_internal *get_target_type(const char *name)
6640 +{
6641 +       struct tt_internal *ti;
6642 +
6643 +       down_read(&_lock);
6644 +       ti = __find_target_type(name);
6645 +
6646 +       if (ti) {
6647 +               if (ti->use == 0 && ti->tt.module)
6648 +                       __MOD_INC_USE_COUNT(ti->tt.module);
6649 +               ti->use++;
6650 +       }
6651 +       up_read(&_lock);
6652 +
6653 +       return ti;
6654 +}
6655 +
6656 +static void load_module(const char *name)
6657 +{
6658 +       char module_name[DM_MOD_NAME_SIZE] = "dm-";
6659 +
6660 +       /* Length check for strcat() below */
6661 +       if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
6662 +               return;
6663 +
6664 +       strcat(module_name, name);
6665 +       request_module(module_name);
6666 +}
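
The length guard above reserves three bytes for the "dm-" prefix plus one for the terminating NUL. A userspace sketch of the same name-building rule (build_module_name is a hypothetical helper, not a kernel function):

    #include <assert.h>
    #include <string.h>

    #define DM_MOD_NAME_SIZE 32

    static int build_module_name(char *buf, const char *name)
    {
            /* "dm-" takes 3 bytes, the NUL takes 1: 4 in total */
            if (strlen(name) > (DM_MOD_NAME_SIZE - 4))
                    return -1;
            strcpy(buf, "dm-");
            strcat(buf, name);
            return 0;
    }

    int main(void)
    {
            char buf[DM_MOD_NAME_SIZE];

            assert(build_module_name(buf, "snapshot") == 0);
            assert(strcmp(buf, "dm-snapshot") == 0);
            assert(build_module_name(buf,
                   "a-target-type-name-way-too-long-to-fit") == -1);
            return 0;
    }
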
6667 +
6668 +struct target_type *dm_get_target_type(const char *name)
6669 +{
6670 +       struct tt_internal *ti = get_target_type(name);
6671 +
6672 +       if (!ti) {
6673 +               load_module(name);
6674 +               ti = get_target_type(name);
6675 +       }
6676 +
6677 +       return ti ? &ti->tt : NULL;
6678 +}
6679 +
6680 +void dm_put_target_type(struct target_type *t)
6681 +{
6682 +       struct tt_internal *ti = (struct tt_internal *) t;
6683 +
6684 +       down_read(&_lock);
6685 +       if (--ti->use == 0 && ti->tt.module)
6686 +               __MOD_DEC_USE_COUNT(ti->tt.module);
6687 +
6688 +       if (ti->use < 0)
6689 +               BUG();
6690 +       up_read(&_lock);
6691 +
6692 +       return;
6693 +}
6694 +
6695 +static struct tt_internal *alloc_target(struct target_type *t)
6696 +{
6697 +       struct tt_internal *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
6698 +
6699 +       if (ti) {
6700 +               memset(ti, 0, sizeof(*ti));
6701 +               ti->tt = *t;
6702 +       }
6703 +
6704 +       return ti;
6705 +}
6706 +
6707 +int dm_register_target(struct target_type *t)
6708 +{
6709 +       int rv = 0;
6710 +       struct tt_internal *ti = alloc_target(t);
6711 +
6712 +       if (!ti)
6713 +               return -ENOMEM;
6714 +
6715 +       down_write(&_lock);
6716 +       if (__find_target_type(t->name)) {
6717 +               kfree(ti);
6718 +               rv = -EEXIST;
6719 +       } else
6720 +               list_add(&ti->list, &_targets);
6721 +
6722 +       up_write(&_lock);
6723 +       return rv;
6724 +}
6725 +
6726 +int dm_unregister_target(struct target_type *t)
6727 +{
6728 +       struct tt_internal *ti;
6729 +
6730 +       down_write(&_lock);
6731 +       if (!(ti = __find_target_type(t->name))) {
6732 +               up_write(&_lock);
6733 +               return -EINVAL;
6734 +       }
6735 +
6736 +       if (ti->use) {
6737 +               up_write(&_lock);
6738 +               return -ETXTBSY;
6739 +       }
6740 +
6741 +       list_del(&ti->list);
6742 +       kfree(ti);
6743 +
6744 +       up_write(&_lock);
6745 +       return 0;
6746 +}
6747 +
6748 +/*
6749 + * io-err: always fails an io, useful for bringing
6750 + * up LVs that have holes in them.
6751 + */
6752 +static int io_err_ctr(struct dm_target *ti, unsigned int argc, char **args)
6753 +{
6754 +       return 0;
6755 +}
6756 +
6757 +static void io_err_dtr(struct dm_target *ti)
6758 +{
6759 +       /* empty */
6760 +}
6761 +
6762 +static int io_err_map(struct dm_target *ti, struct buffer_head *bh, int rw,
6763 +                     union map_info *map_context)
6764 +{
6765 +       return -EIO;
6766 +}
6767 +
6768 +static struct target_type error_target = {
6769 +       .name = "error",
6770 +       .ctr  = io_err_ctr,
6771 +       .dtr  = io_err_dtr,
6772 +       .map  = io_err_map,
6773 +};
6774 +
6775 +int dm_target_init(void)
6776 +{
6777 +       return dm_register_target(&error_target);
6778 +}
6779 +
6780 +void dm_target_exit(void)
6781 +{
6782 +       if (dm_unregister_target(&error_target))
6783 +               DMWARN("error target unregistration failed");
6784 +}
6785 +
6786 +EXPORT_SYMBOL(dm_register_target);
6787 +EXPORT_SYMBOL(dm_unregister_target);
6788 diff -urN linux-2.4.24.org/drivers/md/kcopyd.c linux-2.4.24/drivers/md/kcopyd.c
6789 --- linux-2.4.24.org/drivers/md/kcopyd.c        1970-01-01 01:00:00.000000000 +0100
6790 +++ linux-2.4.24/drivers/md/kcopyd.c    2004-01-18 15:01:25.797189646 +0100
6791 @@ -0,0 +1,666 @@
6792 +/*
6793 + * Copyright (C) 2002 Sistina Software (UK) Limited.
6794 + *
6795 + * This file is released under the GPL.
6796 + */
6797 +
6798 +#include <asm/atomic.h>
6799 +
6800 +#include <linux/blkdev.h>
6801 +#include <linux/config.h>
6802 +#include <linux/device-mapper.h>
6803 +#include <linux/fs.h>
6804 +#include <linux/init.h>
6805 +#include <linux/list.h>
6806 +#include <linux/locks.h>
6807 +#include <linux/mempool.h>
6808 +#include <linux/module.h>
6809 +#include <linux/pagemap.h>
6810 +#include <linux/slab.h>
6811 +#include <linux/vmalloc.h>
6812 +
6813 +#include "kcopyd.h"
6814 +#include "dm-daemon.h"
6815 +
6816 +/* FIXME: this is only needed for the DMERR macros */
6817 +#include "dm.h"
6818 +
6819 +static struct dm_daemon _kcopyd;
6820 +
6821 +#define SECTORS_PER_PAGE (PAGE_SIZE / SECTOR_SIZE)
6822 +#define SUB_JOB_SIZE 128
6823 +#define PAGES_PER_SUB_JOB (SUB_JOB_SIZE / SECTORS_PER_PAGE)
6824 +#define SUB_JOB_COUNT 8
6825 +
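
With the usual assumptions of 4 KiB pages and 512-byte sectors, SECTORS_PER_PAGE is 8, so one 128-sector sub job spans 16 pages:

    #include <assert.h>

    int main(void)
    {
            int page_size = 4096, sector_size = 512;        /* assumed */
            int sectors_per_page = page_size / sector_size; /* 8 */
            int sub_job_size = 128;                 /* sectors, i.e. 64 KiB */

            assert(sectors_per_page == 8);
            assert(sub_job_size / sectors_per_page == 16);  /* pages/sub job */
            return 0;
    }
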
6826 +/*-----------------------------------------------------------------
6827 + * Each kcopyd client has its own little pool of preallocated
6828 + * pages for kcopyd io.
6829 + *---------------------------------------------------------------*/
6830 +struct kcopyd_client {
6831 +       struct list_head list;
6832 +
6833 +       spinlock_t lock;
6834 +       struct list_head pages;
6835 +       unsigned int nr_pages;
6836 +       unsigned int nr_free_pages;
6837 +       unsigned int max_split;
6838 +};
6839 +
6840 +static inline void __push_page(struct kcopyd_client *kc, struct page *p)
6841 +{
6842 +       list_add(&p->list, &kc->pages);
6843 +       kc->nr_free_pages++;
6844 +}
6845 +
6846 +static inline struct page *__pop_page(struct kcopyd_client *kc)
6847 +{
6848 +       struct page *p;
6849 +
6850 +       p = list_entry(kc->pages.next, struct page, list);
6851 +       list_del(&p->list);
6852 +       kc->nr_free_pages--;
6853 +
6854 +       return p;
6855 +}
6856 +
6857 +static int kcopyd_get_pages(struct kcopyd_client *kc,
6858 +                           unsigned int nr, struct list_head *pages)
6859 +{
6860 +       struct page *p;
6861 +       INIT_LIST_HEAD(pages);
6862 +
6863 +       spin_lock(&kc->lock);
6864 +       if (kc->nr_free_pages < nr) {
6865 +               spin_unlock(&kc->lock);
6866 +               return -ENOMEM;
6867 +       }
6868 +
6869 +       while (nr--) {
6870 +               p = __pop_page(kc);
6871 +               list_add(&p->list, pages);
6872 +       }
6873 +       spin_unlock(&kc->lock);
6874 +
6875 +       return 0;
6876 +}
6877 +
6878 +static void kcopyd_put_pages(struct kcopyd_client *kc, struct list_head *pages)
6879 +{
6880 +       struct list_head *tmp, *tmp2;
6881 +
6882 +       spin_lock(&kc->lock);
6883 +       list_for_each_safe (tmp, tmp2, pages)
6884 +               __push_page(kc, list_entry(tmp, struct page, list));
6885 +       spin_unlock(&kc->lock);
6886 +}
6887 +
6888 +/*
6889 + * These three functions resize the page pool.
6890 + */
6891 +static void release_pages(struct list_head *pages)
6892 +{
6893 +       struct page *p;
6894 +       struct list_head *tmp, *tmp2;
6895 +
6896 +       list_for_each_safe (tmp, tmp2, pages) {
6897 +               p = list_entry(tmp, struct page, list);
6898 +               UnlockPage(p);
6899 +               __free_page(p);
6900 +       }
6901 +}
6902 +
6903 +static int client_alloc_pages(struct kcopyd_client *kc, unsigned int nr)
6904 +{
6905 +       unsigned int i;
6906 +       struct page *p;
6907 +       LIST_HEAD(new);
6908 +
6909 +       for (i = 0; i < nr; i++) {
6910 +               p = alloc_page(GFP_KERNEL);
6911 +               if (!p) {
6912 +                       release_pages(&new);
6913 +                       return -ENOMEM;
6914 +               }
6915 +
6916 +               LockPage(p);
6917 +               list_add(&p->list, &new);
6918 +       }
6919 +
6920 +       kcopyd_put_pages(kc, &new);
6921 +       kc->nr_pages += nr;
6922 +       kc->max_split = kc->nr_pages / PAGES_PER_SUB_JOB;
6923 +       if (kc->max_split > SUB_JOB_COUNT)
6924 +               kc->max_split = SUB_JOB_COUNT;
6925 +
6926 +       return 0;
6927 +}
6928 +
6929 +static void client_free_pages(struct kcopyd_client *kc)
6930 +{
6931 +       BUG_ON(kc->nr_free_pages != kc->nr_pages);
6932 +       release_pages(&kc->pages);
6933 +       kc->nr_free_pages = kc->nr_pages = 0;
6934 +}
6935 +
6936 +/*-----------------------------------------------------------------
6937 + * kcopyd_jobs need to be allocated by the *clients* of kcopyd,
6938 + * for this reason we use a mempool to prevent the client from
6939 + * ever having to do io (which could cause a deadlock).
6940 + *---------------------------------------------------------------*/
6941 +struct kcopyd_job {
6942 +       struct kcopyd_client *kc;
6943 +       struct list_head list;
6944 +       unsigned int flags;
6945 +
6946 +       /*
6947 +        * Error state of the job.
6948 +        */
6949 +       int read_err;
6950 +       unsigned int write_err;
6951 +
6952 +       /*
6953 +        * Either READ or WRITE
6954 +        */
6955 +       int rw;
6956 +       struct io_region source;
6957 +
6958 +       /*
6959 +        * The destinations for the transfer.
6960 +        */
6961 +       unsigned int num_dests;
6962 +       struct io_region dests[KCOPYD_MAX_REGIONS];
6963 +
6964 +       sector_t offset;
6965 +       unsigned int nr_pages;
6966 +       struct list_head pages;
6967 +
6968 +       /*
6969 +        * Set this to ensure you are notified when the job has
6970 +        * completed.  'context' is for callback to use.
6971 +        */
6972 +       kcopyd_notify_fn fn;
6973 +       void *context;
6974 +
6975 +       /*
6976 +        * These fields are only used if the job has been split
6977 +        * into more manageable parts.
6978 +        */
6979 +       struct semaphore lock;
6980 +       atomic_t sub_jobs;
6981 +       sector_t progress;
6982 +};
6983 +
6984 +/* FIXME: this should scale with the number of pages */
6985 +#define MIN_JOBS 512
6986 +
6987 +static kmem_cache_t *_job_cache;
6988 +static mempool_t *_job_pool;
6989 +
6990 +/*
6991 + * We maintain three lists of jobs:
6992 + *
6993 + * i)   jobs waiting for pages
6994 + * ii)  jobs that have pages, and are waiting for the io to be issued.
6995 + * iii) jobs that have completed.
6996 + *
6997 + * All three of these are protected by job_lock.
6998 + */
6999 +static spinlock_t _job_lock = SPIN_LOCK_UNLOCKED;
7000 +
7001 +static LIST_HEAD(_complete_jobs);
7002 +static LIST_HEAD(_io_jobs);
7003 +static LIST_HEAD(_pages_jobs);
7004 +
7005 +static int jobs_init(void)
7006 +{
7007 +       INIT_LIST_HEAD(&_complete_jobs);
7008 +       INIT_LIST_HEAD(&_io_jobs);
7009 +       INIT_LIST_HEAD(&_pages_jobs);
7010 +
7011 +       _job_cache = kmem_cache_create("kcopyd-jobs",
7012 +                                      sizeof(struct kcopyd_job),
7013 +                                      __alignof__(struct kcopyd_job),
7014 +                                      0, NULL, NULL);
7015 +       if (!_job_cache)
7016 +               return -ENOMEM;
7017 +
7018 +       _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
7019 +                                  mempool_free_slab, _job_cache);
7020 +       if (!_job_pool) {
7021 +               kmem_cache_destroy(_job_cache);
7022 +               return -ENOMEM;
7023 +       }
7024 +
7025 +       return 0;
7026 +}
7027 +
7028 +static void jobs_exit(void)
7029 +{
7030 +       BUG_ON(!list_empty(&_complete_jobs));
7031 +       BUG_ON(!list_empty(&_io_jobs));
7032 +       BUG_ON(!list_empty(&_pages_jobs));
7033 +
7034 +       mempool_destroy(_job_pool);
7035 +       kmem_cache_destroy(_job_cache);
7036 +}
7037 +
7038 +/*
7039 + * Functions to push and pop a job onto the head of a given job
7040 + * list.
7041 + */
7042 +static inline struct kcopyd_job *pop(struct list_head *jobs)
7043 +{
7044 +       struct kcopyd_job *job = NULL;
7045 +       unsigned long flags;
7046 +
7047 +       spin_lock_irqsave(&_job_lock, flags);
7048 +
7049 +       if (!list_empty(jobs)) {
7050 +               job = list_entry(jobs->next, struct kcopyd_job, list);
7051 +               list_del(&job->list);
7052 +       }
7053 +       spin_unlock_irqrestore(&_job_lock, flags);
7054 +
7055 +       return job;
7056 +}
7057 +
7058 +static inline void push(struct list_head *jobs, struct kcopyd_job *job)
7059 +{
7060 +       unsigned long flags;
7061 +
7062 +       spin_lock_irqsave(&_job_lock, flags);
7063 +       list_add_tail(&job->list, jobs);
7064 +       spin_unlock_irqrestore(&_job_lock, flags);
7065 +}
7066 +
7067 +/*
7068 + * These three functions process 1 item from the corresponding
7069 + * job list.
7070 + *
7071 + * They return:
7072 + * < 0: error
7073 + *   0: success
7074 + * > 0: can't process yet.
7075 + */
7076 +static int run_complete_job(struct kcopyd_job *job)
7077 +{
7078 +       void *context = job->context;
7079 +       int read_err = job->read_err;
7080 +       unsigned int write_err = job->write_err;
7081 +       kcopyd_notify_fn fn = job->fn;
7082 +
7083 +       kcopyd_put_pages(job->kc, &job->pages);
7084 +       mempool_free(job, _job_pool);
7085 +       fn(read_err, write_err, context);
7086 +       return 0;
7087 +}
7088 +
7089 +static void complete_io(unsigned int error, void *context)
7090 +{
7091 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
7092 +
7093 +       if (error) {
7094 +               if (job->rw == WRITE)
7095 +                       job->write_err |= error;
7096 +               else
7097 +                       job->read_err = 1;
7098 +
7099 +               if (!test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
7100 +                       push(&_complete_jobs, job);
7101 +                       dm_daemon_wake(&_kcopyd);
7102 +                       return;
7103 +               }
7104 +       }
7105 +
7106 +       if (job->rw == WRITE)
7107 +               push(&_complete_jobs, job);
7108 +
7109 +       else {
7110 +               job->rw = WRITE;
7111 +               push(&_io_jobs, job);
7112 +       }
7113 +
7114 +       dm_daemon_wake(&_kcopyd);
7115 +}
7116 +
7117 +/*
7118 + * Request io on the pages already allocated for a
7119 + * particular job.
7120 + */
7121 +static int run_io_job(struct kcopyd_job *job)
7122 +{
7123 +       int r;
7124 +
7125 +       if (job->rw == READ)
7126 +               r = dm_io_async(1, &job->source, job->rw,
7127 +                               list_entry(job->pages.next, struct page, list),
7128 +                               job->offset, complete_io, job);
7129 +
7130 +       else
7131 +               r = dm_io_async(job->num_dests, job->dests, job->rw,
7132 +                               list_entry(job->pages.next, struct page, list),
7133 +                               job->offset, complete_io, job);
7134 +
7135 +       return r;
7136 +}
7137 +
7138 +static int run_pages_job(struct kcopyd_job *job)
7139 +{
7140 +       int r;
7141 +
7142 +       job->nr_pages = dm_div_up(job->dests[0].count + job->offset,
7143 +                                 SECTORS_PER_PAGE);
7144 +       r = kcopyd_get_pages(job->kc, job->nr_pages, &job->pages);
7145 +       if (!r) {
7146 +               /* this job is ready for io */
7147 +               push(&_io_jobs, job);
7148 +               return 0;
7149 +       }
7150 +
7151 +       if (r == -ENOMEM)
7152 +               /* can't complete now */
7153 +               return 1;
7154 +
7155 +       return r;
7156 +}
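run_pages_job() rounds the io up to whole pages before reserving them.  A small userspace sketch of the arithmetic, assuming dm_div_up() is the usual round-up division helper and 4K pages give SECTORS_PER_PAGE == 8 (both are defined elsewhere in this patch):

        #include <stdio.h>

        /* Assumed helper: round-up integer division, as dm.h defines it. */
        #define dm_div_up(n, sz) (((n) + (sz) - 1) / (sz))
        #define SECTORS_PER_PAGE 8      /* 4K pages, 512-byte sectors; assumed */

        int main(void)
        {
                unsigned long count = 100, offset = 4;  /* sectors */

                /* 104 sectors span 13 pages once rounded up */
                printf("nr_pages = %lu\n",
                       dm_div_up(count + offset, SECTORS_PER_PAGE));
                return 0;
        }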
7157 +
7158 +/*
7159 + * Run through a list for as long as possible.  Returns the count
7160 + * of successful jobs.
7161 + */
7162 +static int process_jobs(struct list_head *jobs, int (*fn) (struct kcopyd_job *))
7163 +{
7164 +       struct kcopyd_job *job;
7165 +       int r, count = 0;
7166 +
7167 +       while ((job = pop(jobs))) {
7168 +
7169 +               r = fn(job);
7170 +
7171 +               if (r < 0) {
7172 +                       /* error this rogue job */
7173 +                       if (job->rw == WRITE)
7174 +                               job->write_err = (unsigned int) -1;
7175 +                       else
7176 +                               job->read_err = 1;
7177 +                       push(&_complete_jobs, job);
7178 +                       break;
7179 +               }
7180 +
7181 +               if (r > 0) {
7182 +                       /*
7183 +                        * We couldn't service this job ATM, so
7184 +                        * push this job back onto the list.
7185 +                        */
7186 +                       push(jobs, job);
7187 +                       break;
7188 +               }
7189 +
7190 +               count++;
7191 +       }
7192 +
7193 +       return count;
7194 +}
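The three-way return convention is what lets process_jobs() tell a dead job from one that is merely short of resources.  A self-contained sketch of the protocol, with a toy stand-in for the run_*_job() functions:

        #include <errno.h>
        #include <stdio.h>

        /* Toy stand-in for run_pages_job() et al.: < 0 error, 0 success,
         * > 0 cannot be processed yet (thresholds are hypothetical). */
        static int run_demo_job(int work)
        {
                if (work < 0)
                        return -EIO;    /* caller errors the job and completes it */
                if (work > 100)
                        return 1;       /* caller pushes the job back and stops */
                return 0;               /* caller carries on down the list */
        }

        int main(void)
        {
                int jobs[] = { 5, 200, -1 };
                int i;

                for (i = 0; i < 3; i++)
                        printf("job %d -> %d\n", i, run_demo_job(jobs[i]));
                return 0;
        }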
7195 +
7196 +/*
7197 + * kcopyd does this every time it's woken up.
7198 + */
7199 +static void do_work(void)
7200 +{
7201 +       /*
7202 +        * The order in which these are called is *very* important:
7203 +        * completed jobs can free pages for the pages jobs, pages
7204 +        * jobs that succeed move onto the io jobs list, and io jobs
7205 +        * wake the daemon when they complete, so the cycle starts
7206 +        * again.
7207 +        */
7208 +       process_jobs(&_complete_jobs, run_complete_job);
7209 +       process_jobs(&_pages_jobs, run_pages_job);
7210 +       process_jobs(&_io_jobs, run_io_job);
7211 +       run_task_queue(&tq_disk);
7212 +}
7213 +
7214 +/*
7215 + * If we are copying a small region we just dispatch a single job
7216 + * to do the copy, otherwise the io has to be split up into many
7217 + * jobs.
7218 + */
7219 +static void dispatch_job(struct kcopyd_job *job)
7220 +{
7221 +       push(&_pages_jobs, job);
7222 +       dm_daemon_wake(&_kcopyd);
7223 +}
7224 +
7225 +static void segment_complete(int read_err,
7226 +                            unsigned int write_err, void *context)
7227 +{
7228 +       /* FIXME: tidy this function */
7229 +       sector_t progress = 0;
7230 +       sector_t count = 0;
7231 +       struct kcopyd_job *job = (struct kcopyd_job *) context;
7232 +
7233 +       down(&job->lock);
7234 +
7235 +       /* update the error */
7236 +       if (read_err)
7237 +               job->read_err = 1;
7238 +
7239 +       if (write_err)
7240 +               job->write_err |= write_err;
7241 +
7242 +       /*
7243 +        * Only dispatch more work if there hasn't been an error.
7244 +        */
7245 +       if ((!job->read_err && !job->write_err) ||
7246 +           test_bit(KCOPYD_IGNORE_ERROR, &job->flags)) {
7247 +               /* get the next chunk of work */
7248 +               progress = job->progress;
7249 +               count = job->source.count - progress;
7250 +               if (count) {
7251 +                       if (count > SUB_JOB_SIZE)
7252 +                               count = SUB_JOB_SIZE;
7253 +
7254 +                       job->progress += count;
7255 +               }
7256 +       }
7257 +       up(&job->lock);
7258 +
7259 +       if (count) {
7260 +               int i;
7261 +               struct kcopyd_job *sub_job = mempool_alloc(_job_pool, GFP_NOIO);
7262 +
7263 +               memcpy(sub_job, job, sizeof(*job));
7264 +               sub_job->source.sector += progress;
7265 +               sub_job->source.count = count;
7266 +
7267 +               for (i = 0; i < job->num_dests; i++) {
7268 +                       sub_job->dests[i].sector += progress;
7269 +                       sub_job->dests[i].count = count;
7270 +               }
7271 +
7272 +               sub_job->fn = segment_complete;
7273 +               sub_job->context = job;
7274 +               dispatch_job(sub_job);
7275 +
7276 +       } else if (atomic_dec_and_test(&job->sub_jobs)) {
7277 +
7278 +               /*
7279 +                * To avoid a race we must keep the job around
7280 +                * until after the notify function has completed.
7281 +                * Otherwise the client may try to stop the job
7282 +                * after we've completed.
7283 +                */
7284 +               job->fn(read_err, write_err, job->context);
7285 +               mempool_free(job, _job_pool);
7286 +       }
7287 +}
7288 +
7289 +/*
7290 + * Create a number of small sub-jobs that between them
7291 + * copy the whole region.
7292 + */
7293 +static void split_job(struct kcopyd_job *job)
7294 +{
7295 +       int nr;
7296 +
7297 +       nr = dm_div_up(job->source.count, SUB_JOB_SIZE);
7298 +       if (nr > job->kc->max_split)
7299 +               nr = job->kc->max_split;
7300 +
7301 +       atomic_set(&job->sub_jobs, nr);
7302 +       while (nr--)
7303 +               segment_complete(0, 0u, job);
7304 +}
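split_job() and segment_complete() between them walk the source in SUB_JOB_SIZE chunks, the last chunk taking whatever remains.  Assuming SUB_JOB_SIZE is 128 sectors (its definition lives earlier in this patch), a 1000-sector copy splits like this:

        #include <stdio.h>

        #define SUB_JOB_SIZE 128        /* assumed value; defined earlier in the patch */

        int main(void)
        {
                unsigned long count = 1000, progress = 0;       /* sectors */

                /* Each segment_complete() call claims the next chunk. */
                while (progress < count) {
                        unsigned long chunk = count - progress;

                        if (chunk > SUB_JOB_SIZE)
                                chunk = SUB_JOB_SIZE;
                        printf("sub-job at sector %lu, %lu sectors\n",
                               progress, chunk);
                        progress += chunk;
                }
                return 0;       /* seven chunks of 128, then a final 104 */
        }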
7305 +
7306 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
7307 +               unsigned int num_dests, struct io_region *dests,
7308 +               unsigned int flags, kcopyd_notify_fn fn, void *context)
7309 +{
7310 +       struct kcopyd_job *job;
7311 +
7312 +       /*
7313 +        * Allocate a new job.
7314 +        */
7315 +       job = mempool_alloc(_job_pool, GFP_NOIO);
7316 +
7317 +       /*
7318 +        * set up for the read.
7319 +        */
7320 +       job->kc = kc;
7321 +       job->flags = flags;
7322 +       job->read_err = 0;
7323 +       job->write_err = 0;
7324 +       job->rw = READ;
7325 +
7326 +       memcpy(&job->source, from, sizeof(*from));
7327 +
7328 +       job->num_dests = num_dests;
7329 +       memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
7330 +
7331 +       job->offset = 0;
7332 +       job->nr_pages = 0;
7333 +       INIT_LIST_HEAD(&job->pages);
7334 +
7335 +       job->fn = fn;
7336 +       job->context = context;
7337 +
7338 +       if (job->source.count < SUB_JOB_SIZE)
7339 +               dispatch_job(job);
7340 +
7341 +       else {
7342 +               init_MUTEX(&job->lock);
7343 +               job->progress = 0;
7344 +               split_job(job);
7345 +       }
7346 +
7347 +       return 0;
7348 +}
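kcopyd_copy() is fire-and-forget; completion is reported through the notify fn.  A hedged caller sketch, assuming io_region (declared in dm-io.h earlier in this patch) carries dev/sector/count fields, and using a completion to wait synchronously; copy_region() and copy_done() are illustrative names, not part of the patch:

        static void copy_done(int read_err, unsigned int write_err, void *context)
        {
                /* runs from kcopyd's context; just record and wake */
                if (read_err || write_err)
                        printk(KERN_ERR "copy failed: read %d, write 0x%x\n",
                               read_err, write_err);
                complete((struct completion *) context);
        }

        static int copy_region(struct kcopyd_client *kc, kdev_t src, kdev_t dst,
                               sector_t sector, sector_t count)
        {
                struct io_region from, to;
                struct completion done;

                init_completion(&done);
                from.dev = src;  from.sector = sector;  from.count = count;
                to.dev = dst;    to.sector = sector;    to.count = count;

                kcopyd_copy(kc, &from, 1, &to, 0, copy_done, &done);
                wait_for_completion(&done);     /* must not be called from irq */
                return 0;
        }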
7349 +
7350 +/*
7351 + * Cancels a kcopyd job, eg. someone might be deactivating a
7352 + * mirror.
7353 + */
7354 +int kcopyd_cancel(struct kcopyd_job *job, int block)
7355 +{
7356 +       /* FIXME: finish */
7357 +       return -1;
7358 +}
7359 +
7360 +/*-----------------------------------------------------------------
7361 + * Unit setup
7362 + *---------------------------------------------------------------*/
7363 +static DECLARE_MUTEX(_client_lock);
7364 +static LIST_HEAD(_clients);
7365 +
7366 +static int client_add(struct kcopyd_client *kc)
7367 +{
7368 +       down(&_client_lock);
7369 +       list_add(&kc->list, &_clients);
7370 +       up(&_client_lock);
7371 +       return 0;
7372 +}
7373 +
7374 +static void client_del(struct kcopyd_client *kc)
7375 +{
7376 +       down(&_client_lock);
7377 +       list_del(&kc->list);
7378 +       up(&_client_lock);
7379 +}
7380 +
7381 +int kcopyd_client_create(unsigned int nr_pages, struct kcopyd_client **result)
7382 +{
7383 +       int r = 0;
7384 +       struct kcopyd_client *kc;
7385 +
7386 +       if (nr_pages * SECTORS_PER_PAGE < SUB_JOB_SIZE) {
7387 +               DMERR("kcopyd client requested %u pages: minimum is %lu",
7388 +                     nr_pages, SUB_JOB_SIZE / SECTORS_PER_PAGE);
7389 +               return -ENOMEM;
7390 +       }
7391 +
7392 +       kc = kmalloc(sizeof(*kc), GFP_KERNEL);
7393 +       if (!kc)
7394 +               return -ENOMEM;
7395 +
7396 +       kc->lock = SPIN_LOCK_UNLOCKED;
7397 +       INIT_LIST_HEAD(&kc->pages);
7398 +       kc->nr_pages = kc->nr_free_pages = 0;
7399 +       r = client_alloc_pages(kc, nr_pages);
7400 +       if (r) {
7401 +               kfree(kc);
7402 +               return r;
7403 +       }
7404 +
7405 +       r = dm_io_get(nr_pages);
7406 +       if (r) {
7407 +               client_free_pages(kc);
7408 +               kfree(kc);
7409 +               return r;
7410 +       }
7411 +
7412 +       r = client_add(kc);
7413 +       if (r) {
7414 +               dm_io_put(nr_pages);
7415 +               client_free_pages(kc);
7416 +               kfree(kc);
7417 +               return r;
7418 +       }
7419 +
7420 +       *result = kc;
7421 +       return 0;
7422 +}
7423 +
7424 +void kcopyd_client_destroy(struct kcopyd_client *kc)
7425 +{
7426 +       dm_io_put(kc->nr_pages);
7427 +       client_free_pages(kc);
7428 +       client_del(kc);
7429 +       kfree(kc);
7430 +}
7431 +
7432 +
7433 +int __init kcopyd_init(void)
7434 +{
7435 +       int r;
7436 +
7437 +       r = jobs_init();
7438 +       if (r)
7439 +               return r;
7440 +
7441 +       r = dm_daemon_start(&_kcopyd, "kcopyd", do_work);
7442 +       if (r)
7443 +               jobs_exit();
7444 +
7445 +       return r;
7446 +}
7447 +
7448 +void kcopyd_exit(void)
7449 +{
7450 +       jobs_exit();
7451 +       dm_daemon_stop(&_kcopyd);
7452 +}
7453 +
7454 +EXPORT_SYMBOL(kcopyd_client_create);
7455 +EXPORT_SYMBOL(kcopyd_client_destroy);
7456 +EXPORT_SYMBOL(kcopyd_copy);
7457 +EXPORT_SYMBOL(kcopyd_cancel);
7458 diff -urN linux-2.4.24.org/drivers/md/kcopyd.h linux-2.4.24/drivers/md/kcopyd.h
7459 --- linux-2.4.24.org/drivers/md/kcopyd.h        1970-01-01 01:00:00.000000000 +0100
7460 +++ linux-2.4.24/drivers/md/kcopyd.h    2004-01-18 15:01:25.800189017 +0100
7461 @@ -0,0 +1,47 @@
7462 +/*
7463 + * Copyright (C) 2001 Sistina Software
7464 + *
7465 + * This file is released under the GPL.
7466 + */
7467 +
7468 +#ifndef DM_KCOPYD_H
7469 +#define DM_KCOPYD_H
7470 +
7471 +/*
7472 + * Needed for the definition of sector_t.
7473 + */
7474 +#include <linux/device-mapper.h>
7475 +#include <linux/iobuf.h>
7476 +
7477 +#include "dm-io.h"
7478 +
7479 +int kcopyd_init(void);
7480 +void kcopyd_exit(void);
7481 +
7482 +/* FIXME: make this configurable */
7483 +#define KCOPYD_MAX_REGIONS 8
7484 +
7485 +#define KCOPYD_IGNORE_ERROR 1
7486 +
7487 +/*
7488 + * To use kcopyd you must first create a kcopyd client object.
7489 + */
7490 +struct kcopyd_client;
7491 +int kcopyd_client_create(unsigned int num_pages, struct kcopyd_client **result);
7492 +void kcopyd_client_destroy(struct kcopyd_client *kc);
7493 +
7494 +/*
7495 + * Submit a copy job to kcopyd.  This is built on top of the
7496 + * client functions above.
7497 + *
7498 + * read_err is a boolean,
7499 + * write_err is a bitset, with 1 bit for each destination region
7500 + */
7501 +typedef void (*kcopyd_notify_fn)(int read_err,
7502 +                                unsigned int write_err, void *context);
7503 +
7504 +int kcopyd_copy(struct kcopyd_client *kc, struct io_region *from,
7505 +               unsigned int num_dests, struct io_region *dests,
7506 +               unsigned int flags, kcopyd_notify_fn fn, void *context);
7507 +
7508 +#endif
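A minimal sketch of the client lifecycle this header implies; the 32-page reservation is illustrative and merely has to cover at least one SUB_JOB_SIZE chunk, which kcopyd_client_create() enforces:

        static int demo_kcopyd_user(void)
        {
                struct kcopyd_client *kc;
                int r;

                r = kcopyd_client_create(32, &kc);      /* 32 pages, illustrative */
                if (r)
                        return r;

                /* ... kcopyd_copy() calls go here ... */

                kcopyd_client_destroy(kc);
                return 0;
        }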
7509 diff -urN linux-2.4.24.org/drivers/md/Makefile linux-2.4.24/drivers/md/Makefile
7510 --- linux-2.4.24.org/drivers/md/Makefile        2004-01-18 14:58:09.300663064 +0100
7511 +++ linux-2.4.24/drivers/md/Makefile    2004-01-18 15:01:29.209473819 +0100
7512 @@ -4,24 +4,35 @@
7513  
7514  O_TARGET       := mddev.o
7515  
7516 -export-objs    := md.o xor.o
7517 -list-multi     := lvm-mod.o
7518 +export-objs    := md.o xor.o dm-table.o dm-target.o dm.o dm-daemon.o \
7519 +                  kcopyd.o dm-io.o
7520 +
7521 +list-multi     := lvm-mod.o dm-mod.o dm-mirror-mod.o
7522  lvm-mod-objs   := lvm.o lvm-snap.o lvm-fs.o
7523 +dm-mod-objs    := dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \
7524 +                  dm-ioctl.o dm-daemon.o kcopyd.o dm-io.o dm-snapshot.o \
7525 +                  dm-exception-store.o
7526  
7527  # Note: link order is important.  All raid personalities
7528  # and xor.o must come before md.o, as they each initialise 
7529  # themselves, and md.o may use the personalities when it 
7530  # auto-initialised.
7531  
7532 -obj-$(CONFIG_MD_LINEAR)                += linear.o
7533 -obj-$(CONFIG_MD_RAID0)         += raid0.o
7534 -obj-$(CONFIG_MD_RAID1)         += raid1.o
7535 -obj-$(CONFIG_MD_RAID5)         += raid5.o xor.o
7536 -obj-$(CONFIG_MD_MULTIPATH)     += multipath.o
7537 -obj-$(CONFIG_BLK_DEV_MD)       += md.o
7538 -obj-$(CONFIG_BLK_DEV_LVM)      += lvm-mod.o
7539 +obj-$(CONFIG_MD_LINEAR)                        += linear.o
7540 +obj-$(CONFIG_MD_RAID0)                 += raid0.o
7541 +obj-$(CONFIG_MD_RAID1)                 += raid1.o
7542 +obj-$(CONFIG_MD_RAID5)                 += raid5.o xor.o
7543 +obj-$(CONFIG_MD_MULTIPATH)             += multipath.o
7544 +obj-$(CONFIG_BLK_DEV_MD)               += md.o
7545 +
7546 +obj-$(CONFIG_BLK_DEV_LVM)              += lvm-mod.o
7547 +
7548 +obj-$(CONFIG_BLK_DEV_DM)               += dm-mod.o
7549  
7550  include $(TOPDIR)/Rules.make
7551  
7552  lvm-mod.o: $(lvm-mod-objs)
7553         $(LD) -r -o $@ $(lvm-mod-objs)
7554 +
7555 +dm-mod.o: $(dm-mod-objs)
7556 +       $(LD) -r -o $@ $(dm-mod-objs)
7557 diff -urN linux-2.4.24.org/include/linux/device-mapper.h linux-2.4.24/include/linux/device-mapper.h
7558 --- linux-2.4.24.org/include/linux/device-mapper.h      1970-01-01 01:00:00.000000000 +0100
7559 +++ linux-2.4.24/include/linux/device-mapper.h  2004-01-18 15:01:13.800707381 +0100
7560 @@ -0,0 +1,104 @@
7561 +/*
7562 + * Copyright (C) 2001 Sistina Software (UK) Limited.
7563 + *
7564 + * This file is released under the LGPL.
7565 + */
7566 +
7567 +#ifndef _LINUX_DEVICE_MAPPER_H
7568 +#define _LINUX_DEVICE_MAPPER_H
7569 +
7570 +typedef unsigned long sector_t;
7571 +
7572 +struct dm_target;
7573 +struct dm_table;
7574 +struct dm_dev;
7575 +
7576 +typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
7577 +
7578 +union map_info {
7579 +       void *ptr;
7580 +       unsigned long long ll;
7581 +};
7582 +
7583 +/*
7584 + * In the constructor the target parameter will already have the
7585 + * table, type, begin and len fields filled in.
7586 + */
7587 +typedef int (*dm_ctr_fn) (struct dm_target * target, unsigned int argc,
7588 +                         char **argv);
7589 +
7590 +/*
7591 + * The destructor doesn't need to free the dm_target, just
7592 + * anything hidden in ti->private.
7593 + */
7594 +typedef void (*dm_dtr_fn) (struct dm_target * ti);
7595 +
7596 +/*
7597 + * The map function must return:
7598 + * < 0: error
7599 + * = 0: The target will handle the io by resubmitting it later
7600 + * > 0: simple remap complete
7601 + */
7602 +typedef int (*dm_map_fn) (struct dm_target * ti, struct buffer_head * bh,
7603 +                         int rw, union map_info *map_context);
7604 +
7605 +/*
7606 + * Returns:
7607 + * < 0 : error (currently ignored)
7608 + * 0   : ended successfully
7609 + * 1   : for some reason the io has still not completed (eg,
7610 + *       multipath target might want to requeue a failed io).
7611 + */
7612 +typedef int (*dm_endio_fn) (struct dm_target * ti,
7613 +                           struct buffer_head * bh, int rw, int error,
7614 +                           union map_info *map_context);
7615 +typedef void (*dm_suspend_fn) (struct dm_target *ti);
7616 +typedef void (*dm_resume_fn) (struct dm_target *ti);
7617 +typedef int (*dm_status_fn) (struct dm_target * ti, status_type_t status_type,
7618 +                            char *result, unsigned int maxlen);
7619 +
7620 +void dm_error(const char *message);
7621 +
7622 +/*
7623 + * Constructors should call these functions to ensure destination devices
7624 + * are opened/closed correctly.
7625 + * FIXME: too many arguments.
7626 + */
7627 +int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
7628 +                 sector_t len, int mode, struct dm_dev **result);
7629 +void dm_put_device(struct dm_target *ti, struct dm_dev *d);
7630 +
7631 +/*
7632 + * Information about a target type
7633 + */
7634 +struct target_type {
7635 +       const char *name;
7636 +       struct module *module;
7637 +       dm_ctr_fn ctr;
7638 +       dm_dtr_fn dtr;
7639 +       dm_map_fn map;
7640 +       dm_endio_fn end_io;
7641 +       dm_suspend_fn suspend;
7642 +       dm_resume_fn resume;
7643 +       dm_status_fn status;
7644 +};
7645 +
7646 +struct dm_target {
7647 +       struct dm_table *table;
7648 +       struct target_type *type;
7649 +
7650 +       /* target limits */
7651 +       sector_t begin;
7652 +       sector_t len;
7653 +
7654 +       /* target specific data */
7655 +       void *private;
7656 +
7657 +       /* Used to provide an error string from the ctr */
7658 +       char *error;
7659 +};
7660 +
7661 +int dm_register_target(struct target_type *t);
7662 +int dm_unregister_target(struct target_type *t);
7663 +
7664 +#endif                         /* _LINUX_DEVICE_MAPPER_H */
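As a sketch of the interface above, a hypothetical read-as-zeroes target (not part of this patch) needs little more than ctr/dtr/map; the GNU-style struct initializers follow 2.4 kernel convention:

        static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        {
                ti->private = NULL;     /* nothing to remember */
                return 0;
        }

        static void zero_dtr(struct dm_target *ti)
        {
        }

        static int zero_map(struct dm_target *ti, struct buffer_head *bh,
                            int rw, union map_info *map_context)
        {
                if (rw == READ)
                        memset(bh->b_data, 0, bh->b_size);
                bh->b_end_io(bh, 1);    /* complete it ourselves, uptodate */
                return 0;               /* 0: the target has handled the io */
        }

        static struct target_type zero_target = {
                name:   "zero",
                module: THIS_MODULE,
                ctr:    zero_ctr,
                dtr:    zero_dtr,
                map:    zero_map,
        };

        /* module init/exit would call dm_register_target()/dm_unregister_target() */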
7665 diff -urN linux-2.4.24.org/include/linux/dm-ioctl.h linux-2.4.24/include/linux/dm-ioctl.h
7666 --- linux-2.4.24.org/include/linux/dm-ioctl.h   1970-01-01 01:00:00.000000000 +0100
7667 +++ linux-2.4.24/include/linux/dm-ioctl.h       2004-01-18 15:01:17.793869131 +0100
7668 @@ -0,0 +1,237 @@
7669 +/*
7670 + * Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
7671 + *
7672 + * This file is released under the LGPL.
7673 + */
7674 +
7675 +#ifndef _LINUX_DM_IOCTL_H
7676 +#define _LINUX_DM_IOCTL_H
7677 +
7678 +#include <linux/types.h>
7679 +
7680 +#define DM_DIR "mapper"                /* Slashes not supported */
7681 +#define DM_MAX_TYPE_NAME 16
7682 +#define DM_NAME_LEN 128
7683 +#define DM_UUID_LEN 129
7684 +
7685 +/*
7686 + * A traditional ioctl interface for the device mapper.
7687 + *
7688 + * Each device can have two tables associated with it, an
7689 + * 'active' table which is the one currently used by io passing
7690 + * through the device, and an 'inactive' one which is a table
7691 + * that is being prepared as a replacement for the 'active' one.
7692 + *
7693 + * DM_VERSION:
7694 + * Just get the version information for the ioctl interface.
7695 + *
7696 + * DM_REMOVE_ALL:
7697 + * Remove all dm devices, destroy all tables.  Only really used
7698 + * for debug.
7699 + *
7700 + * DM_LIST_DEVICES:
7701 + * Get a list of all the dm device names.
7702 + *
7703 + * DM_DEV_CREATE:
7704 + * Create a new device; neither the 'active' nor the 'inactive'
7705 + * table slots will be filled.  The device will be in the
7706 + * suspended state after creation; however, any io to the device
7707 + * will be errored, since it will be out-of-bounds.
7708 + *
7709 + * DM_DEV_REMOVE:
7710 + * Remove a device, destroy any tables.
7711 + *
7712 + * DM_DEV_RENAME:
7713 + * Rename a device.
7714 + *
7715 + * DM_DEV_SUSPEND:
7716 + * This performs both suspend and resume, depending on which flag is
7717 + * passed in.
7718 + * Suspend: This command will not return until all pending io to
7719 + * the device has completed.  Further io will be deferred until
7720 + * the device is resumed.
7721 + * Resume: It is no longer an error to issue this command on an
7722 + * unsuspended device.  If a table is present in the 'inactive'
7723 + * slot, it will be moved to the active slot, then the old table
7724 + * from the active slot will be _destroyed_.  Finally the device
7725 + * is resumed.
7726 + *
7727 + * DM_DEV_STATUS:
7728 + * Retrieves the status for the table in the 'active' slot.
7729 + *
7730 + * DM_DEV_WAIT:
7731 + * Wait for a significant event to occur to the device.  This
7732 + * could either be caused by an event triggered by one of the
7733 + * targets of the table in the 'active' slot, or a table change.
7734 + *
7735 + * DM_TABLE_LOAD:
7736 + * Load a table into the 'inactive' slot for the device.  The
7737 + * device does _not_ need to be suspended prior to this command.
7738 + *
7739 + * DM_TABLE_CLEAR:
7740 + * Destroy any table in the 'inactive' slot (ie. abort).
7741 + *
7742 + * DM_TABLE_DEPS:
7743 + * Return a set of device dependencies for the 'active' table.
7744 + *
7745 + * DM_TABLE_STATUS:
7746 + * Return the targets status for the 'active' table.
7747 + */
7748 +
7749 +/*
7750 + * All ioctl arguments consist of a single chunk of memory, with
7751 + * this structure at the start.  If a uuid is specified any
7752 + * lookup (eg. for a DM_INFO) will be done on that, *not* the
7753 + * name.
7754 + */
7755 +struct dm_ioctl {
7756 +       /*
7757 +        * The version number is made up of three parts:
7758 +        * major - no backward or forward compatibility,
7759 +        * minor - only backwards compatible,
7760 +        * patch - both backwards and forwards compatible.
7761 +        *
7762 +        * All clients of the ioctl interface should fill in the
7763 +        * version number of the interface that they were
7764 +        * compiled with.
7765 +        *
7766 +        * All recognised ioctl commands (ie. those that don't
7767 +        * return -ENOTTY) fill out this field, even if the
7768 +        * command failed.
7769 +        */
7770 +       uint32_t version[3];    /* in/out */
7771 +       uint32_t data_size;     /* total size of data passed in
7772 +                                * including this struct */
7773 +
7774 +       uint32_t data_start;    /* offset to start of data
7775 +                                * relative to start of this struct */
7776 +
7777 +       uint32_t target_count;  /* in/out */
7778 +       int32_t open_count;     /* out */
7779 +       uint32_t flags;         /* in/out */
7780 +       uint32_t event_nr;      /* in/out */
7781 +       uint32_t padding;
7782 +
7783 +       uint64_t dev;           /* in/out */
7784 +
7785 +       char name[DM_NAME_LEN]; /* device name */
7786 +       char uuid[DM_UUID_LEN]; /* unique identifier for
7787 +                                * the block device */
7788 +};
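Seen from userspace, the version handshake described above looks like the sketch below: the caller fills in the version it was compiled against and the kernel overwrites it.  Opening /dev/mapper/control as the control node is an assumption borrowed from the userspace tools:

        #include <stdio.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <linux/dm-ioctl.h>

        int main(void)
        {
                struct dm_ioctl io;
                int fd = open("/dev/mapper/control", O_RDWR);

                if (fd < 0)
                        return 1;
                memset(&io, 0, sizeof(io));
                io.version[0] = DM_VERSION_MAJOR;       /* compiled-against version */
                io.version[1] = DM_VERSION_MINOR;
                io.version[2] = DM_VERSION_PATCHLEVEL;
                io.data_size = sizeof(io);
                if (!ioctl(fd, DM_VERSION, &io))
                        printf("kernel interface %u.%u.%u\n",
                               io.version[0], io.version[1], io.version[2]);
                close(fd);
                return 0;
        }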
7789 +
7790 +/*
7791 + * Used to specify tables.  These structures appear after the
7792 + * dm_ioctl.
7793 + */
7794 +struct dm_target_spec {
7795 +       uint64_t sector_start;
7796 +       uint64_t length;
7797 +       int32_t status;         /* used when reading from kernel only */
7798 +
7799 +       /*
7800 +        * Offset in bytes (from the start of this struct) to
7801 +        * next target_spec.
7802 +        */
7803 +       uint32_t next;
7804 +
7805 +       char target_type[DM_MAX_TYPE_NAME];
7806 +
7807 +       /*
7808 +        * Parameter string starts immediately after this object.
7809 +        * Be careful to add padding after string to ensure correct
7810 +        * alignment of subsequent dm_target_spec.
7811 +        */
7812 +};
7813 +
7814 +/*
7815 + * Used to retrieve the target dependencies.
7816 + */
7817 +struct dm_target_deps {
7818 +       uint32_t count;         /* Array size */
7819 +       uint32_t padding;       /* unused */
7820 +       uint64_t dev[0];        /* out */
7821 +};
7822 +
7823 +/*
7824 + * Used to get a list of all dm devices.
7825 + */
7826 +struct dm_name_list {
7827 +       uint64_t dev;
7828 +       uint32_t next;          /* offset to the next record from
7829 +                                  the _start_ of this */
7830 +       char name[0];
7831 +};
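The next field chains records by byte offset from the start of the current record, 0 marking the last one.  A fragment walking a DM_LIST_DEVICES result, assuming io points at a filled-in dm_ioctl buffer holding at least one record:

        struct dm_name_list *nl;

        nl = (struct dm_name_list *) ((char *) io + io->data_start);
        for (;;) {
                printf("%s (dev %llu)\n", nl->name, (unsigned long long) nl->dev);
                if (!nl->next)
                        break;          /* 0 marks the final record */
                nl = (struct dm_name_list *) ((char *) nl + nl->next);
        }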
7832 +
7833 +/*
7834 + * If you change this make sure you make the corresponding change
7835 + * to dm-ioctl.c:lookup_ioctl()
7836 + */
7837 +enum {
7838 +       /* Top level cmds */
7839 +       DM_VERSION_CMD = 0,
7840 +       DM_REMOVE_ALL_CMD,
7841 +       DM_LIST_DEVICES_CMD,
7842 +
7843 +       /* device level cmds */
7844 +       DM_DEV_CREATE_CMD,
7845 +       DM_DEV_REMOVE_CMD,
7846 +       DM_DEV_RENAME_CMD,
7847 +       DM_DEV_SUSPEND_CMD,
7848 +       DM_DEV_STATUS_CMD,
7849 +       DM_DEV_WAIT_CMD,
7850 +
7851 +       /* Table level cmds */
7852 +       DM_TABLE_LOAD_CMD,
7853 +       DM_TABLE_CLEAR_CMD,
7854 +       DM_TABLE_DEPS_CMD,
7855 +       DM_TABLE_STATUS_CMD,
7856 +};
7857 +
7858 +#define DM_IOCTL 0xfd
7859 +
7860 +#define DM_VERSION       _IOWR(DM_IOCTL, DM_VERSION_CMD, struct dm_ioctl)
7861 +#define DM_REMOVE_ALL    _IOWR(DM_IOCTL, DM_REMOVE_ALL_CMD, struct dm_ioctl)
7862 +#define DM_LIST_DEVICES  _IOWR(DM_IOCTL, DM_LIST_DEVICES_CMD, struct dm_ioctl)
7863 +
7864 +#define DM_DEV_CREATE    _IOWR(DM_IOCTL, DM_DEV_CREATE_CMD, struct dm_ioctl)
7865 +#define DM_DEV_REMOVE    _IOWR(DM_IOCTL, DM_DEV_REMOVE_CMD, struct dm_ioctl)
7866 +#define DM_DEV_RENAME    _IOWR(DM_IOCTL, DM_DEV_RENAME_CMD, struct dm_ioctl)
7867 +#define DM_DEV_SUSPEND   _IOWR(DM_IOCTL, DM_DEV_SUSPEND_CMD, struct dm_ioctl)
7868 +#define DM_DEV_STATUS    _IOWR(DM_IOCTL, DM_DEV_STATUS_CMD, struct dm_ioctl)
7869 +#define DM_DEV_WAIT      _IOWR(DM_IOCTL, DM_DEV_WAIT_CMD, struct dm_ioctl)
7870 +
7871 +#define DM_TABLE_LOAD    _IOWR(DM_IOCTL, DM_TABLE_LOAD_CMD, struct dm_ioctl)
7872 +#define DM_TABLE_CLEAR   _IOWR(DM_IOCTL, DM_TABLE_CLEAR_CMD, struct dm_ioctl)
7873 +#define DM_TABLE_DEPS    _IOWR(DM_IOCTL, DM_TABLE_DEPS_CMD, struct dm_ioctl)
7874 +#define DM_TABLE_STATUS  _IOWR(DM_IOCTL, DM_TABLE_STATUS_CMD, struct dm_ioctl)
7875 +
7876 +#define DM_VERSION_MAJOR       4
7877 +#define DM_VERSION_MINOR       0
7878 +#define DM_VERSION_PATCHLEVEL  5
7879 +#define DM_VERSION_EXTRA       "-ioctl (2003-11-18)"
7880 +
7881 +/* Status bits */
7882 +#define DM_READONLY_FLAG       (1 << 0) /* In/Out */
7883 +#define DM_SUSPEND_FLAG                (1 << 1) /* In/Out */
7884 +#define DM_PERSISTENT_DEV_FLAG (1 << 3) /* In */
7885 +
7886 +/*
7887 + * Flag passed into ioctl STATUS command to get table information
7888 + * rather than current status.
7889 + */
7890 +#define DM_STATUS_TABLE_FLAG   (1 << 4) /* In */
7891 +
7892 +/*
7893 + * Flags that indicate whether a table is present in either of
7894 + * the two table slots that a device has.
7895 + */
7896 +#define DM_ACTIVE_PRESENT_FLAG   (1 << 5) /* Out */
7897 +#define DM_INACTIVE_PRESENT_FLAG (1 << 6) /* Out */
7898 +
7899 +/*
7900 + * Indicates that the buffer passed in wasn't big enough for the
7901 + * results.
7902 + */
7903 +#define DM_BUFFER_FULL_FLAG    (1 << 8) /* Out */
7904 +
7905 +#endif                         /* _LINUX_DM_IOCTL_H */
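DM_BUFFER_FULL_FLAG implies a retry convention: the output was truncated, so the caller reallocates a larger buffer and reissues the command.  A userspace sketch; dm_run() is a hypothetical helper name:

        static struct dm_ioctl *dm_run(int fd, unsigned long request, size_t extra)
        {
                size_t size = sizeof(struct dm_ioctl) + extra;
                struct dm_ioctl *io;

                for (;;) {
                        if (!(io = calloc(1, size)))
                                return NULL;
                        io->version[0] = DM_VERSION_MAJOR;
                        io->version[1] = DM_VERSION_MINOR;
                        io->version[2] = DM_VERSION_PATCHLEVEL;
                        io->data_size = size;
                        io->data_start = sizeof(struct dm_ioctl);
                        if (ioctl(fd, request, io) < 0) {
                                free(io);
                                return NULL;
                        }
                        if (!(io->flags & DM_BUFFER_FULL_FLAG))
                                return io;      /* results fitted */
                        free(io);
                        size *= 2;              /* grow and retry */
                }
        }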
7906 diff -urN linux-2.4.24.org/include/linux/mempool.h linux-2.4.24/include/linux/mempool.h
7907 --- linux-2.4.24.org/include/linux/mempool.h    1970-01-01 01:00:00.000000000 +0100
7908 +++ linux-2.4.24/include/linux/mempool.h        2004-01-18 15:01:09.522605662 +0100
7909 @@ -0,0 +1,31 @@
7910 +/*
7911 + * memory buffer pool support
7912 + */
7913 +#ifndef _LINUX_MEMPOOL_H
7914 +#define _LINUX_MEMPOOL_H
7915 +
7916 +#include <linux/list.h>
7917 +#include <linux/wait.h>
7918 +
7919 +struct mempool_s;
7920 +typedef struct mempool_s mempool_t;
7921 +
7922 +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data);
7923 +typedef void (mempool_free_t)(void *element, void *pool_data);
7924 +
7925 +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
7926 +                                mempool_free_t *free_fn, void *pool_data);
7927 +extern int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask);
7928 +extern void mempool_destroy(mempool_t *pool);
7929 +extern void * mempool_alloc(mempool_t *pool, int gfp_mask);
7930 +extern void mempool_free(void *element, mempool_t *pool);
7931 +
7932 +/*
7933 + * A mempool_alloc_t and mempool_free_t that get the memory from
7934 + * a slab that is passed in through pool_data.
7935 + */
7936 +void *mempool_alloc_slab(int gfp_mask, void *pool_data);
7937 +void mempool_free_slab(void *element, void *pool_data);
7938 +
7939 +
7940 +#endif /* _LINUX_MEMPOOL_H */
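The usual pairing of this API with the slab helpers mirrors what kcopyd's jobs_init() does earlier in this patch; struct demo and the pool size are illustrative:

        struct demo {
                int payload;
        };

        static kmem_cache_t *demo_cache;
        static mempool_t *demo_pool;

        static int demo_init(void)
        {
                demo_cache = kmem_cache_create("demo", sizeof(struct demo),
                                               0, 0, NULL, NULL);
                if (!demo_cache)
                        return -ENOMEM;

                /* 16 preallocated elements guarantee forward progress */
                demo_pool = mempool_create(16, mempool_alloc_slab,
                                           mempool_free_slab, demo_cache);
                if (!demo_pool) {
                        kmem_cache_destroy(demo_cache);
                        return -ENOMEM;
                }
                return 0;
        }

        static void demo_exit(void)
        {
                mempool_destroy(demo_pool);
                kmem_cache_destroy(demo_cache);
        }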
7941 diff -urN linux-2.4.24.org/MAINTAINERS linux-2.4.24/MAINTAINERS
7942 --- linux-2.4.24.org/MAINTAINERS        2004-01-18 14:59:47.570857618 +0100
7943 +++ linux-2.4.24/MAINTAINERS    2004-01-18 15:01:13.766714518 +0100
7944 @@ -581,6 +581,13 @@
7945  W:     http://www.debian.org/~dz/i8k/
7946  S:     Maintained
7947  
7948 +DEVICE MAPPER
7949 +P:     Joe Thornber
7950 +M:     dm@uk.sistina.com
7951 +L:     linux-LVM@sistina.com
7952 +W:     http://www.sistina.com/lvm
7953 +S:     Maintained
7954 +
7955  DEVICE NUMBER REGISTRY
7956  P:     H. Peter Anvin
7957  M:     hpa@zytor.com
7958 diff -urN linux-2.4.24.org/mm/Makefile linux-2.4.24/mm/Makefile
7959 --- linux-2.4.24.org/mm/Makefile        2004-01-18 14:55:23.909936044 +0100
7960 +++ linux-2.4.24/mm/Makefile    2004-01-18 15:01:09.497610911 +0100
7961 @@ -9,12 +9,12 @@
7962  
7963  O_TARGET := mm.o
7964  
7965 -export-objs := shmem.o filemap.o memory.o page_alloc.o
7966 +export-objs := shmem.o filemap.o memory.o page_alloc.o mempool.o
7967  
7968  obj-y   := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
7969             vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
7970             page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
7971 -           shmem.o
7972 +           shmem.o mempool.o
7973  
7974  obj-$(CONFIG_HIGHMEM) += highmem.o
7975  obj-$(CONFIG_PROC_MM) += proc_mm.o
7976 diff -urN linux-2.4.24.org/mm/mempool.c linux-2.4.24/mm/mempool.c
7977 --- linux-2.4.24.org/mm/mempool.c       1970-01-01 01:00:00.000000000 +0100
7978 +++ linux-2.4.24/mm/mempool.c   2004-01-18 15:01:09.525605032 +0100
7979 @@ -0,0 +1,299 @@
7980 +/*
7981 + *  linux/mm/mempool.c
7982 + *
7983 + *  memory buffer pool support. Such pools are mostly used
7984 + *  for guaranteed, deadlock-free memory allocations during
7985 + *  extreme VM load.
7986 + *
7987 + *  started by Ingo Molnar, Copyright (C) 2001
7988 + */
7989 +
7990 +#include <linux/mm.h>
7991 +#include <linux/slab.h>
7992 +#include <linux/module.h>
7993 +#include <linux/mempool.h>
7994 +
7995 +struct mempool_s {
7996 +       spinlock_t lock;
7997 +       int min_nr;             /* nr of elements at *elements */
7998 +       int curr_nr;            /* Current nr of elements at *elements */
7999 +       void **elements;
8000 +
8001 +       void *pool_data;
8002 +       mempool_alloc_t *alloc;
8003 +       mempool_free_t *free;
8004 +       wait_queue_head_t wait;
8005 +};
8006 +
8007 +static void add_element(mempool_t *pool, void *element)
8008 +{
8009 +       BUG_ON(pool->curr_nr >= pool->min_nr);
8010 +       pool->elements[pool->curr_nr++] = element;
8011 +}
8012 +
8013 +static void *remove_element(mempool_t *pool)
8014 +{
8015 +       BUG_ON(pool->curr_nr <= 0);
8016 +       return pool->elements[--pool->curr_nr];
8017 +}
8018 +
8019 +static void free_pool(mempool_t *pool)
8020 +{
8021 +       while (pool->curr_nr) {
8022 +               void *element = remove_element(pool);
8023 +               pool->free(element, pool->pool_data);
8024 +       }
8025 +       kfree(pool->elements);
8026 +       kfree(pool);
8027 +}
8028 +
8029 +/**
8030 + * mempool_create - create a memory pool
8031 + * @min_nr:    the minimum number of elements guaranteed to be
8032 + *             allocated for this pool.
8033 + * @alloc_fn:  user-defined element-allocation function.
8034 + * @free_fn:   user-defined element-freeing function.
8035 + * @pool_data: optional private data available to the user-defined functions.
8036 + *
8037 + * this function creates and allocates a guaranteed size, preallocated
8038 + * memory pool. The pool can be used from the mempool_alloc and mempool_free
8039 + * functions. This function might sleep. Both the alloc_fn() and the free_fn()
8040 + * functions might sleep - as long as the mempool_alloc function is not called
8041 + * from IRQ contexts.
8042 + */
8043 +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
8044 +                               mempool_free_t *free_fn, void *pool_data)
8045 +{
8046 +       mempool_t *pool;
8047 +
8048 +       pool = kmalloc(sizeof(*pool), GFP_KERNEL);
8049 +       if (!pool)
8050 +               return NULL;
8051 +       memset(pool, 0, sizeof(*pool));
8052 +       pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
8053 +       if (!pool->elements) {
8054 +               kfree(pool);
8055 +               return NULL;
8056 +       }
8057 +       spin_lock_init(&pool->lock);
8058 +       pool->min_nr = min_nr;
8059 +       pool->pool_data = pool_data;
8060 +       init_waitqueue_head(&pool->wait);
8061 +       pool->alloc = alloc_fn;
8062 +       pool->free = free_fn;
8063 +
8064 +       /*
8065 +        * First pre-allocate the guaranteed number of buffers.
8066 +        */
8067 +       while (pool->curr_nr < pool->min_nr) {
8068 +               void *element;
8069 +
8070 +               element = pool->alloc(GFP_KERNEL, pool->pool_data);
8071 +               if (unlikely(!element)) {
8072 +                       free_pool(pool);
8073 +                       return NULL;
8074 +               }
8075 +               add_element(pool, element);
8076 +       }
8077 +       return pool;
8078 +}
8079 +
8080 +/**
8081 + * mempool_resize - resize an existing memory pool
8082 + * @pool:       pointer to the memory pool which was allocated via
8083 + *              mempool_create().
8084 + * @new_min_nr: the new minimum number of elements guaranteed to be
8085 + *              allocated for this pool.
8086 + * @gfp_mask:   the usual allocation bitmask.
8087 + *
8088 + * This function shrinks/grows the pool. In the case of growing,
8089 + * it cannot be guaranteed that the pool will be grown to the new
8090 + * size immediately, but new mempool_free() calls will refill it.
8091 + *
8092 + * Note, the caller must guarantee that no mempool_destroy is called
8093 + * while this function is running. mempool_alloc() & mempool_free()
8094 + * might be called (eg. from IRQ contexts) while this function executes.
8095 + */
8096 +int mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask)
8097 +{
8098 +       void *element;
8099 +       void **new_elements;
8100 +       unsigned long flags;
8101 +
8102 +       BUG_ON(new_min_nr <= 0);
8103 +
8104 +       spin_lock_irqsave(&pool->lock, flags);
8105 +       if (new_min_nr < pool->min_nr) {
8106 +               while (pool->curr_nr > new_min_nr) {
8107 +                       element = remove_element(pool);
8108 +                       spin_unlock_irqrestore(&pool->lock, flags);
8109 +                       pool->free(element, pool->pool_data);
8110 +                       spin_lock_irqsave(&pool->lock, flags);
8111 +               }
8112 +               pool->min_nr = new_min_nr;
8113 +               goto out_unlock;
8114 +       }
8115 +       spin_unlock_irqrestore(&pool->lock, flags);
8116 +
8117 +       /* Grow the pool */
8118 +       new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask);
8119 +       if (!new_elements)
8120 +               return -ENOMEM;
8121 +
8122 +       spin_lock_irqsave(&pool->lock, flags);
8123 +       memcpy(new_elements, pool->elements,
8124 +                       pool->curr_nr * sizeof(*new_elements));
8125 +       kfree(pool->elements);
8126 +       pool->elements = new_elements;
8127 +       pool->min_nr = new_min_nr;
8128 +
8129 +       while (pool->curr_nr < pool->min_nr) {
8130 +               spin_unlock_irqrestore(&pool->lock, flags);
8131 +               element = pool->alloc(gfp_mask, pool->pool_data);
8132 +               if (!element)
8133 +                       goto out;
8134 +               spin_lock_irqsave(&pool->lock, flags);
8135 +               if (pool->curr_nr < pool->min_nr)
8136 +                       add_element(pool, element);
8137 +               else
8138 +                       pool->free(element, pool->pool_data);   /* Raced */
8139 +       }
8140 +out_unlock:
8141 +       spin_unlock_irqrestore(&pool->lock, flags);
8142 +out:
8143 +       return 0;
8144 +}
8145 +
8146 +/**
8147 + * mempool_destroy - deallocate a memory pool
8148 + * @pool:      pointer to the memory pool which was allocated via
8149 + *             mempool_create().
8150 + *
8151 + * this function only sleeps if the free_fn() function sleeps. The caller
8152 + * has to guarantee that all elements have been returned to the pool (ie:
8153 + * freed) prior to calling mempool_destroy().
8154 + */
8155 +void mempool_destroy(mempool_t *pool)
8156 +{
8157 +       if (pool->curr_nr != pool->min_nr)
8158 +               BUG();          /* There were outstanding elements */
8159 +       free_pool(pool);
8160 +}
8161 +
8162 +/**
8163 + * mempool_alloc - allocate an element from a specific memory pool
8164 + * @pool:      pointer to the memory pool which was allocated via
8165 + *             mempool_create().
8166 + * @gfp_mask:  the usual allocation bitmask.
8167 + *
8168 + * this function only sleeps if the alloc_fn function sleeps or
8169 + * returns NULL. Note that due to preallocation, this function
8170 + * *never* fails when called from process contexts. (it might
8171 + * fail if called from an IRQ context.)
8172 + */
8173 +void * mempool_alloc(mempool_t *pool, int gfp_mask)
8174 +{
8175 +       void *element;
8176 +       unsigned long flags;
8177 +       int curr_nr;
8178 +       DECLARE_WAITQUEUE(wait, current);
8179 +       int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
8180 +
8181 +repeat_alloc:
8182 +       element = pool->alloc(gfp_nowait, pool->pool_data);
8183 +       if (likely(element != NULL))
8184 +               return element;
8185 +
8186 +       /*
8187 +        * If the pool is less than 50% full then try harder
8188 +        * to allocate an element:
8189 +        */
8190 +       if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) {
8191 +               element = pool->alloc(gfp_mask, pool->pool_data);
8192 +               if (likely(element != NULL))
8193 +                       return element;
8194 +       }
8195 +
8196 +       /*
8197 +        * Kick the VM at this point.
8198 +        */
8199 +       wakeup_bdflush();
8200 +
8201 +       spin_lock_irqsave(&pool->lock, flags);
8202 +       if (likely(pool->curr_nr)) {
8203 +               element = remove_element(pool);
8204 +               spin_unlock_irqrestore(&pool->lock, flags);
8205 +               return element;
8206 +       }
8207 +       spin_unlock_irqrestore(&pool->lock, flags);
8208 +
8209 +       /* We must not sleep in the GFP_ATOMIC case */
8210 +       if (gfp_mask == gfp_nowait)
8211 +               return NULL;
8212 +
8213 +       run_task_queue(&tq_disk);
8214 +
8215 +       add_wait_queue_exclusive(&pool->wait, &wait);
8216 +       set_task_state(current, TASK_UNINTERRUPTIBLE);
8217 +
8218 +       spin_lock_irqsave(&pool->lock, flags);
8219 +       curr_nr = pool->curr_nr;
8220 +       spin_unlock_irqrestore(&pool->lock, flags);
8221 +
8222 +       if (!curr_nr)
8223 +               schedule();
8224 +
8225 +       current->state = TASK_RUNNING;
8226 +       remove_wait_queue(&pool->wait, &wait);
8227 +
8228 +       goto repeat_alloc;
8229 +}
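In practice the guarantee documented above splits callers into two patterns: with a blocking mask the call may sleep but never returns NULL, while GFP_ATOMIC callers must check.  A sketch:

        static void demo_alloc_patterns(mempool_t *pool)
        {
                void *el;

                el = mempool_alloc(pool, GFP_NOIO);     /* may sleep, never NULL */
                /* ... use el in the io path ... */
                mempool_free(el, pool);

                el = mempool_alloc(pool, GFP_ATOMIC);   /* irq-safe, may be NULL */
                if (el)
                        mempool_free(el, pool);
        }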
8230 +
8231 +/**
8232 + * mempool_free - return an element to the pool.
8233 + * @element:   pool element pointer.
8234 + * @pool:      pointer to the memory pool which was allocated via
8235 + *             mempool_create().
8236 + *
8237 + * this function only sleeps if the free_fn() function sleeps.
8238 + */
8239 +void mempool_free(void *element, mempool_t *pool)
8240 +{
8241 +       unsigned long flags;
8242 +
8243 +       if (pool->curr_nr < pool->min_nr) {
8244 +               spin_lock_irqsave(&pool->lock, flags);
8245 +               if (pool->curr_nr < pool->min_nr) {
8246 +                       add_element(pool, element);
8247 +                       spin_unlock_irqrestore(&pool->lock, flags);
8248 +                       wake_up(&pool->wait);
8249 +                       return;
8250 +               }
8251 +               spin_unlock_irqrestore(&pool->lock, flags);
8252 +       }
8253 +       pool->free(element, pool->pool_data);
8254 +}
8255 +
8256 +/*
8257 + * A commonly used alloc and free fn.
8258 + */
8259 +void *mempool_alloc_slab(int gfp_mask, void *pool_data)
8260 +{
8261 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
8262 +       return kmem_cache_alloc(mem, gfp_mask);
8263 +}
8264 +
8265 +void mempool_free_slab(void *element, void *pool_data)
8266 +{
8267 +       kmem_cache_t *mem = (kmem_cache_t *) pool_data;
8268 +       kmem_cache_free(mem, element);
8269 +}
8270 +
8271 +
8272 +EXPORT_SYMBOL(mempool_create);
8273 +EXPORT_SYMBOL(mempool_resize);
8274 +EXPORT_SYMBOL(mempool_destroy);
8275 +EXPORT_SYMBOL(mempool_alloc);
8276 +EXPORT_SYMBOL(mempool_free);
8277 +EXPORT_SYMBOL(mempool_alloc_slab);
8278 +EXPORT_SYMBOL(mempool_free_slab);