1 diff -urNp linux-1251/drivers/md/lvm.c linux-1252/drivers/md/lvm.c
2 --- linux-1251/drivers/md/lvm.c
3 +++ linux-1252/drivers/md/lvm.c
6 #include <linux/slab.h>
7 #include <linux/init.h>
8 +#include <linux/mempool.h>
10 #include <linux/hdreg.h>
11 #include <linux/stat.h>
14 #include <linux/errno.h>
15 #include <linux/lvm.h>
16 +#include <linux/iobuf.h>
18 #include "lvm-internal.h"
20 @@ -296,7 +298,6 @@ static int lvm_chr_open(struct inode *,
21 static int lvm_chr_close(struct inode *, struct file *);
22 static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong);
25 /* End external function prototypes */
28 @@ -324,6 +325,7 @@ static int lvm_do_lv_status_byindex(vg_t
29 static int lvm_do_lv_status_bydev(vg_t *, void *);
31 static int lvm_do_pe_lock_unlock(vg_t *r, void *);
32 +static int lvm_do_pe_locked_copy(vg_t *r, void *);
34 static int lvm_do_pv_change(vg_t*, void*);
35 static int lvm_do_pv_status(vg_t *, void *);
36 @@ -334,8 +336,11 @@ static int lvm_do_vg_extend(vg_t *, void
37 static int lvm_do_vg_reduce(vg_t *, void *);
38 static int lvm_do_vg_rename(vg_t *, void *);
39 static int lvm_do_vg_remove(int);
40 +static int lvm_push_callback(lv_t *, int, struct buffer_head *);
41 +static void lvm_bh_callback(struct buffer_head *, int);
42 static void lvm_geninit(struct gendisk *);
43 static void __update_hardsectsize(lv_t *lv);
44 +static int __do_le_remap(vg_t *, lv_t *, kdev_t, kdev_t, uint, uint);
47 static void _queue_io(struct buffer_head *bh, int rw);
48 @@ -359,7 +364,6 @@ ushort lvm_iop_version = LVM_DRIVER_IOP_
50 const char *const lvm_name = LVM_NAME;
53 /* volume group descriptor area pointers */
54 vg_t *vg[ABS_MAX_VG + 1];
56 @@ -369,6 +373,12 @@ static struct {
58 } vg_lv_map[ABS_MAX_LV];
60 +/* cache a buffer_head end_io callback state */
62 + struct buffer_head bh_io;
64 + struct buffer_head *bh_orig;
67 /* Request structures (lvm_chr_ioctl()) */
68 static pv_change_req_t pv_change_req;
69 @@ -419,6 +429,8 @@ static int lvm_blocksizes[MAX_LV];
70 static int lvm_hardsectsizes[MAX_LV];
71 static int lvm_size[MAX_LV];
73 +static mempool_t *lvm_callback_mempool;
75 static struct gendisk lvm_gendisk =
78 @@ -431,25 +443,45 @@ static struct gendisk lvm_gendisk =
82 +static void * lvm_callback_alloc(int gfp_flags, void *data)
84 + callback_t *callback;
86 + callback = kmalloc(sizeof *callback, gfp_flags);
90 +static void lvm_callback_free(void *callback, void *data)
96 * Driver initialization...
102 if (devfs_register_chrdev(LVM_CHAR_MAJOR,
103 lvm_name, &lvm_chr_fops) < 0) {
104 printk(KERN_ERR "%s -- devfs_register_chrdev failed\n",
109 if (devfs_register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0)
111 printk("%s -- devfs_register_blkdev failed\n", lvm_name);
112 - if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
114 - "%s -- devfs_unregister_chrdev failed\n",
117 + goto out_unreg_char;
121 + lvm_callback_mempool = mempool_create(NR_LVM_CALLBACK,
122 + lvm_callback_alloc,
123 + lvm_callback_free, NULL);
124 + if (!lvm_callback_mempool) {
125 + printk("%s -- out of memory for callback pool\n", lvm_name);
126 + goto out_unreg_block;
130 @@ -482,6 +514,18 @@ int lvm_init(void)
136 + if (devfs_unregister_blkdev(MAJOR_NR, lvm_name) < 0)
137 + printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
140 + if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
142 + "%s -- devfs_unregister_chrdev failed\n",
149 @@ -497,7 +541,7 @@ static void lvm_cleanup(void)
150 printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
154 + mempool_destroy(lvm_callback_mempool);
156 /* delete our gendisk from chain */
157 del_gendisk(&lvm_gendisk);
158 @@ -658,6 +702,11 @@ static int lvm_chr_ioctl(struct inode *i
159 physical volume (move's done in user space's pvmove) */
160 return lvm_do_pe_lock_unlock(vg_ptr,arg);
162 + case PE_LOCKED_COPY:
163 + /* lock i/o to a physical extent and copy it to another
164 + physical volume in kernel space (see lvm_do_pe_locked_copy) */
165 + return lvm_do_pe_locked_copy(vg_ptr,arg);
169 return lvm_do_vg_create(arg, minor);
170 @@ -1213,7 +1262,7 @@ static int lvm_map(struct buffer_head *b
172 vg_t *vg_this = vg[VG_BLK(minor)];
173 lv_t *lv = vg_this->lv[LV_BLK(minor)];
177 down_read(&lv->lv_lock);
178 if (!(lv->lv_status & LV_ACTIVE)) {
179 @@ -1328,8 +1377,9 @@ static int lvm_map(struct buffer_head *b
181 bh->b_rdev = rdev_map;
182 bh->b_rsector = rsector_map;
183 + ret = lvm_push_callback(lv, rw, bh);
184 up_read(&lv->lv_lock);
190 @@ -1343,6 +1393,65 @@ static int lvm_map(struct buffer_head *b
191 * internal support functions
195 + * Handle LVM callbacks on buffer_head IO completion: push an IO
196 + * completion onto an existing buffer_head. preserve b_private by
197 + * creating a new buffer_head for the mapped IO.
199 +static int lvm_push_callback(lv_t *lv, int rw, struct buffer_head *bh)
201 + callback_t *callback;
202 + struct buffer_head *nbh;
204 + callback = mempool_alloc(lvm_callback_mempool, GFP_NOIO);
207 + callback->bh_orig = bh;
209 + nbh = &callback->bh_io;
211 + nbh->b_blocknr = bh->b_blocknr;
212 + nbh->b_dev = bh->b_dev;
213 + nbh->b_rdev = bh->b_rdev;
214 + nbh->b_rsector = bh->b_rsector;
215 + nbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
216 + (1<<BH_Mapped) | (1<<BH_Lock);
217 + atomic_set(&nbh->b_count, 1);
218 + nbh->b_size = bh->b_size;
219 + nbh->b_page = bh->b_page;
220 + nbh->b_data = bh->b_data;
222 + nbh->b_reqnext = NULL;
224 + nbh->b_end_io = lvm_bh_callback;
225 + nbh->b_private = callback;
227 + down_read(&lv->lv_io_sem);
228 + generic_make_request(rw, nbh);
230 + return 0; /* Tell generic_make_request not to pursue the
231 + original buffer_head any further now that we've
232 + submitted a new one. */
235 +static void lvm_bh_callback(struct buffer_head *bh, int uptodate)
237 + callback_t *callback;
238 + struct buffer_head *obh;
241 + callback = bh->b_private;
243 + obh = callback->bh_orig;
245 + up_read(&lv->lv_io_sem);
247 + mempool_free(callback, lvm_callback_mempool);
249 + obh->b_end_io(obh, uptodate);
255 * generate "hard disk" name
256 @@ -1407,13 +1516,49 @@ lock_try_again:
257 } /* lvm_do_lock_lvm */
260 +static int do_pe_lock(kdev_t lv, kdev_t pv, uint32_t offset)
262 + down_write(&_pe_lock);
263 + if (pe_lock_req.lock == LOCK_PE) {
264 + up_write(&_pe_lock);
268 + /* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
269 + pe_lock_req.lock = LOCK_PE;
270 + pe_lock_req.data.lv_dev = lv;
271 + pe_lock_req.data.pv_dev = pv;
272 + pe_lock_req.data.pv_offset = offset;
273 + up_write(&_pe_lock);
277 +static void do_pe_unlock(void)
279 + struct buffer_head *bh;
281 + down_write(&_pe_lock);
282 + pe_lock_req.lock = UNLOCK_PE;
283 + pe_lock_req.data.lv_dev = 0;
284 + pe_lock_req.data.pv_dev = 0;
285 + pe_lock_req.data.pv_offset = 0;
286 + bh = _dequeue_io();
287 + up_write(&_pe_lock);
289 + /* handle all deferred io for this PE */
290 + /* TODO: Eek, what about attaching callbacks to _flush_io()
291 + deferred requests? --sct */
297 * character device support function lock/unlock physical extend
299 static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg)
301 pe_lock_req_t new_lock;
302 - struct buffer_head *bh;
306 if (vg_ptr == NULL) return -ENXIO;
307 @@ -1430,42 +1575,30 @@ static int lvm_do_pe_lock_unlock(vg_t *v
308 if (p == vg_ptr->pv_max) return -ENXIO;
311 - * this sync releaves memory pressure to lessen the
312 - * likelyhood of pvmove being paged out - resulting in
313 + * this sync relieves memory pressure to lessen the
314 + * likelihood of pvmove being paged out - resulting in
317 - * This method of doing a pvmove is broken
318 + * This method of doing a pvmove is *highly* broken for
319 + * several reasons. It deadlocks, it does not
320 + * synchronise correctly with outstanding write IO, and
321 + * it defers the actual copy to a user mode app which
322 + * has no cache coherency with the LV devices.
324 fsync_dev(pe_lock_req.data.lv_dev);
326 - down_write(&_pe_lock);
327 - if (pe_lock_req.lock == LOCK_PE) {
328 - up_write(&_pe_lock);
332 - /* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
333 - pe_lock_req.lock = LOCK_PE;
334 - pe_lock_req.data.lv_dev = new_lock.data.lv_dev;
335 - pe_lock_req.data.pv_dev = new_lock.data.pv_dev;
336 - pe_lock_req.data.pv_offset = new_lock.data.pv_offset;
337 - up_write(&_pe_lock);
338 + err = do_pe_lock(new_lock.data.lv_dev,
339 + new_lock.data.pv_dev,
340 + new_lock.data.pv_offset);
344 /* some requests may have got through since the fsync */
345 fsync_dev(pe_lock_req.data.pv_dev);
349 - down_write(&_pe_lock);
350 - pe_lock_req.lock = UNLOCK_PE;
351 - pe_lock_req.data.lv_dev = 0;
352 - pe_lock_req.data.pv_dev = 0;
353 - pe_lock_req.data.pv_offset = 0;
354 - bh = _dequeue_io();
355 - up_write(&_pe_lock);
357 - /* handle all deferred io for this PE */
363 @@ -1476,6 +1609,103 @@ static int lvm_do_pe_lock_unlock(vg_t *v
367 + * character device support function: safe, locked PE copy
369 +static int lvm_do_pe_locked_copy(vg_t *vg_ptr, void *arg)
371 + pe_copy_req_t pe_copy_req;
373 + lv_t *lv_ptr = NULL;
374 + pv_t *pv_ptr = NULL;
376 + unsigned long old_offset, new_offset;
378 + if (vg_ptr == NULL) return -ENXIO;
379 + if (copy_from_user(&pe_copy_req, arg,
380 + sizeof(pe_copy_req_t)) != 0)
382 + if (pe_copy_req.cookie != PE_COPY_MAGIC_COOKIE)
385 + /* First find the logical volume for the request... */
387 + for (i = 0; i < vg_ptr->lv_max; i++) {
388 + lv_ptr = vg_ptr->lv[i];
389 + if (lv_ptr != NULL &&
390 + strcmp(lv_ptr->lv_name, pe_copy_req.lv_name) == 0)
394 + if (i == vg_ptr->lv_max)
397 + /* ... and the physical volume. */
399 + for (i = 0; i < vg_ptr->pv_max; i++) {
400 + pv_ptr = vg_ptr->pv[i];
401 + if (pv_ptr->pv_dev == pe_copy_req.old_dev)
405 + if (i == vg_ptr->pv_max)
408 + /* We'll take the lock on the source extent in the LV first. We
409 + mutex out ALL IO to the entire logical volume before doing
410 + this, so we can be absolutely certain that there is no
411 + outstanding IO to this PE once the lock is in place. (We
412 + can't mutex just one PE without tracking outstanding IO on a
413 + per-extent basis.) */
415 + down_write(&lv_ptr->lv_io_sem);
416 + err = do_pe_lock(lv_ptr->lv_dev,
417 + pe_copy_req.old_dev,
418 + pe_copy_req.old_pe);
419 + up_write(&lv_ptr->lv_io_sem);
424 + /* All prep done, we can copy the bits now */
426 + err = lvm_do_bulk_copy(pe_copy_req.old_dev, pe_copy_req.new_dev,
427 + pe_copy_req.old_pe, pe_copy_req.new_pe,
431 + err = __do_le_remap(vg_ptr, lv_ptr,
432 + pe_copy_req.old_dev, pe_copy_req.new_dev,
433 + pe_copy_req.old_pe, pe_copy_req.new_pe);
440 +static int __do_le_remap(vg_t *vg_ptr, lv_t *lv_ptr,
441 + kdev_t old_dev, kdev_t new_dev,
442 + uint old_pe, uint new_pe)
446 + down_write(&lv_ptr->lv_lock);
447 + for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
448 + if (lv_ptr->lv_current_pe[le].dev == old_dev &&
449 + lv_ptr->lv_current_pe[le].pe == old_pe) {
450 + lv_ptr->lv_current_pe[le].dev = new_dev;
451 + lv_ptr->lv_current_pe[le].pe = new_pe;
453 + up_write(&lv_ptr->lv_lock);
454 + __update_hardsectsize(lv_ptr);
458 + up_write(&lv_ptr->lv_lock);
464 * character device support function logical extend remap
466 static int lvm_do_le_remap(vg_t *vg_ptr, void *arg)
467 @@ -1495,23 +1725,11 @@ static int lvm_do_le_remap(vg_t *vg_ptr,
470 if (strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) {
471 - down_write(&lv_ptr->lv_lock);
472 - for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
473 - if (lv_ptr->lv_current_pe[le].dev ==
474 - le_remap_req.old_dev &&
475 - lv_ptr->lv_current_pe[le].pe ==
476 - le_remap_req.old_pe) {
477 - lv_ptr->lv_current_pe[le].dev =
478 - le_remap_req.new_dev;
479 - lv_ptr->lv_current_pe[le].pe =
480 - le_remap_req.new_pe;
481 - __update_hardsectsize(lv_ptr);
482 - up_write(&lv_ptr->lv_lock);
486 - up_write(&lv_ptr->lv_lock);
488 + return __do_le_remap(vg_ptr, lv_ptr,
489 + le_remap_req.old_dev,
490 + le_remap_req.new_dev,
491 + le_remap_req.old_pe,
492 + le_remap_req.new_pe);
496 @@ -2023,7 +2241,8 @@ static int lvm_do_lv_create(int minor, c
497 lv_ptr->lv_snapshot_hash_table_size = 0;
498 lv_ptr->lv_snapshot_hash_mask = 0;
499 init_rwsem(&lv_ptr->lv_lock);
501 + init_rwsem(&lv_ptr->lv_io_sem);
503 lv_ptr->lv_snapshot_use_rate = 0;
505 vg_ptr->lv[l] = lv_ptr;
506 diff -urNp linux-1251/include/linux/lvm.h linux-1252/include/linux/lvm.h
507 --- linux-1251/include/linux/lvm.h
508 +++ linux-1252/include/linux/lvm.h
509 @@ -153,6 +153,11 @@ struct list_head {
510 #define SECTOR_SIZE 512
514 + * Number of guaranteed callback structs in case of extreme VM load:
516 +#define NR_LVM_CALLBACK 256
518 /* structure version */
519 #define LVM_STRUCT_VERSION 1
521 @@ -339,6 +344,7 @@ struct list_head {
523 /* physical extent */
524 #define PE_LOCK_UNLOCK _IOW ( 0xfe, 0x50, 1)
525 +#define PE_LOCKED_COPY _IOW ( 0xfe, 0x51, 1)
527 /* i/o protocol version */
528 #define LVM_GET_IOP_VERSION _IOR ( 0xfe, 0x98, 1)
529 @@ -571,6 +577,8 @@ typedef struct lv_v5 {
532 uint lv_allocated_snapshot_le;
534 + struct rw_semaphore lv_io_sem;
538 @@ -689,6 +697,18 @@ typedef struct {
542 +/* Request structure PE_COPY */
543 +#define PE_COPY_MAGIC_COOKIE 0xD0D4FF95 /* 4 bytes out of /dev/random */
545 + uint32_t cookie; /* Cookie to guard against reuse of this ioctl */
546 + char lv_name[NAME_LEN];
554 /* Request structure LV_STATUS_BYNAME */
556 char lv_name[NAME_LEN];