diff -urNp linux-1251/drivers/md/lvm.c linux-1252/drivers/md/lvm.c
--- linux-1251/drivers/md/lvm.c
+++ linux-1252/drivers/md/lvm.c
@@ -239,6 +239,7 @@
 
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/mempool.h>
 
 #include <linux/hdreg.h>
 #include <linux/stat.h>
@@ -264,6 +265,7 @@
 
 #include <linux/errno.h>
 #include <linux/lvm.h>
+#include <linux/iobuf.h>
 
 #include "lvm-internal.h"
 
@@ -296,7 +298,6 @@ static int lvm_chr_open(struct inode *,
 static int lvm_chr_close(struct inode *, struct file *);
 static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong);
 
-
 /* End external function prototypes */
 
 
@@ -324,6 +325,7 @@ static int lvm_do_lv_status_byindex(vg_t
 static int lvm_do_lv_status_bydev(vg_t *, void *);
 
 static int lvm_do_pe_lock_unlock(vg_t *r, void *);
+static int lvm_do_pe_locked_copy(vg_t *r, void *);
 
 static int lvm_do_pv_change(vg_t*, void*);
 static int lvm_do_pv_status(vg_t *, void *);
@@ -334,8 +336,11 @@ static int lvm_do_vg_extend(vg_t *, void
 static int lvm_do_vg_reduce(vg_t *, void *);
 static int lvm_do_vg_rename(vg_t *, void *);
 static int lvm_do_vg_remove(int);
+static int lvm_push_callback(lv_t *, int, struct buffer_head *);
+static void lvm_bh_callback(struct buffer_head *, int);
 static void lvm_geninit(struct gendisk *);
 static void __update_hardsectsize(lv_t *lv);
+static int __do_le_remap(vg_t *, lv_t *, kdev_t, kdev_t, uint, uint);
 
 
 static void _queue_io(struct buffer_head *bh, int rw);
@@ -359,7 +364,6 @@ ushort lvm_iop_version = LVM_DRIVER_IOP_
 int loadtime = 0;
 const char *const lvm_name = LVM_NAME;
 
-
 /* volume group descriptor area pointers */
 vg_t *vg[ABS_MAX_VG + 1];
 
@@ -369,6 +373,12 @@ static struct {
 	int lv_number;
 } vg_lv_map[ABS_MAX_LV];
 
+/* cache a buffer_head end_io callback state */
+typedef struct {
+	struct buffer_head bh_io;
+	lv_t *lv;
+	struct buffer_head *bh_orig;
+} callback_t;
 
 /* Request structures (lvm_chr_ioctl()) */
 static pv_change_req_t pv_change_req;
@@ -419,6 +429,8 @@ static int lvm_blocksizes[MAX_LV];
 static int lvm_hardsectsizes[MAX_LV];
 static int lvm_size[MAX_LV];
 
+static mempool_t *lvm_callback_mempool;
+
 static struct gendisk lvm_gendisk =
 {
 	major: MAJOR_NR,
@@ -431,25 +443,45 @@ static struct gendisk lvm_gendisk =
 };
 
 
+static void * lvm_callback_alloc(int gfp_flags, void *data)
+{
+	callback_t *callback;
+
+	callback = kmalloc(sizeof *callback, gfp_flags);
+	return callback;
+}
+
+static void lvm_callback_free(void *callback, void *data)
+{
+	kfree(callback);
+}
+
 /*
  * Driver initialization...
  */
 int lvm_init(void)
 {
+	int err = -EIO;
+
 	if (devfs_register_chrdev(LVM_CHAR_MAJOR,
 				  lvm_name, &lvm_chr_fops) < 0) {
 		printk(KERN_ERR "%s -- devfs_register_chrdev failed\n",
 		       lvm_name);
-		return -EIO;
+		goto out_err;
 	}
 	if (devfs_register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0)
 	{
 		printk("%s -- devfs_register_blkdev failed\n", lvm_name);
-		if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
-			printk(KERN_ERR
-			       "%s -- devfs_unregister_chrdev failed\n",
-			       lvm_name);
-		return -EIO;
+		goto out_unreg_char;
+	}
+
+	err = -ENOMEM;
+	lvm_callback_mempool = mempool_create(NR_LVM_CALLBACK,
+					      lvm_callback_alloc,
+					      lvm_callback_free, NULL);
+	if (!lvm_callback_mempool) {
+		printk("%s -- out of memory for callback pool\n", lvm_name);
+		goto out_unreg_block;
 	}
 
 	lvm_init_fs();
@@ -482,6 +514,18 @@ int lvm_init(void)
 #endif
 
 	return 0;
+
+out_unreg_block:
+	if (devfs_unregister_blkdev(MAJOR_NR, lvm_name) < 0)
+		printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
+		       lvm_name);
+out_unreg_char:
+	if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
+		printk(KERN_ERR
+		       "%s -- devfs_unregister_chrdev failed\n",
+		       lvm_name);
+out_err:
+	return err;
 } /* lvm_init() */
 
 /*
@@ -497,7 +541,7 @@ static void lvm_cleanup(void)
 		printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
 		       lvm_name);
 
-
+	mempool_destroy(lvm_callback_mempool);
 
 	/* delete our gendisk from chain */
 	del_gendisk(&lvm_gendisk);
@@ -658,6 +702,11 @@ static int lvm_chr_ioctl(struct inode *i
 		   physical volume (move's done in user space's pvmove) */
 		return lvm_do_pe_lock_unlock(vg_ptr,arg);
 
+	case PE_LOCKED_COPY:
+		/* lock/unlock i/o to a physical extent to move it to another
+		   physical volume (move's done in user space's pvmove) */
+		return lvm_do_pe_locked_copy(vg_ptr,arg);
+
 	case VG_CREATE_OLD:
 		/* create a VGDA */
 		return lvm_do_vg_create(arg, minor);
@@ -1213,7 +1262,7 @@ static int lvm_map(struct buffer_head *b
 	kdev_t rdev_map;
 	vg_t *vg_this = vg[VG_BLK(minor)];
 	lv_t *lv = vg_this->lv[LV_BLK(minor)];
-
+	int ret;
 
 	down_read(&lv->lv_lock);
 	if (!(lv->lv_status & LV_ACTIVE)) {
@@ -1328,8 +1377,9 @@ static int lvm_map(struct buffer_head *b
 out:
 	bh->b_rdev = rdev_map;
 	bh->b_rsector = rsector_map;
+	ret = lvm_push_callback(lv, rw, bh);
 	up_read(&lv->lv_lock);
-	return 1;
+	return ret;
 
 bad:
 	if (bh->b_end_io)
@@ -1343,6 +1393,65 @@ static int lvm_map(struct buffer_head *b
  * internal support functions
  */
 
+/*
+ * Handle LVM callbacks on buffer_head IO completion: push an IO
+ * completion onto an existing buffer_head. preserve b_private by
+ * creating a new buffer_head for the mapped IO.
+ */
+static int lvm_push_callback(lv_t *lv, int rw, struct buffer_head *bh)
+{
+	callback_t *callback;
+	struct buffer_head *nbh;
+
+	callback = mempool_alloc(lvm_callback_mempool, GFP_NOIO);
+
+	callback->lv = lv;
+	callback->bh_orig = bh;
+
+	nbh = &callback->bh_io;
+
+	nbh->b_blocknr = bh->b_blocknr;
+	nbh->b_dev = bh->b_dev;
+	nbh->b_rdev = bh->b_rdev;
+	nbh->b_rsector = bh->b_rsector;
+	nbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
+		(1<<BH_Mapped) | (1<<BH_Lock);
+	atomic_set(&nbh->b_count, 1);
+	nbh->b_size = bh->b_size;
+	nbh->b_page = bh->b_page;
+	nbh->b_data = bh->b_data;
+	nbh->b_list = 0;
+	nbh->b_reqnext = NULL;
+
+	nbh->b_end_io = lvm_bh_callback;
+	nbh->b_private = callback;
+
+	down_read(&lv->lv_io_sem);
+	generic_make_request(rw, nbh);
+
+	return 0;	/* Tell generic_make_request not to pursue the
+			   original buffer_head any further now that we've
+			   submitted a new one. */
+}
+
+static void lvm_bh_callback(struct buffer_head *bh, int uptodate)
+{
+	callback_t *callback;
+	struct buffer_head *obh;
+	lv_t *lv;
+
+	callback = bh->b_private;
+	lv = callback->lv;
+	obh = callback->bh_orig;
+
+	up_read(&lv->lv_io_sem);
+
+	mempool_free(callback, lvm_callback_mempool);
+	if (obh->b_end_io)
+		obh->b_end_io(obh, uptodate);
+}
+
+
 #ifdef LVM_HD_NAME
 /*
  * generate "hard disk" name
@@ -1407,13 +1516,49 @@ lock_try_again:
 } /* lvm_do_lock_lvm */
 
 
+static int do_pe_lock(kdev_t lv, kdev_t pv, uint32_t offset)
+{
+	down_write(&_pe_lock);
+	if (pe_lock_req.lock == LOCK_PE) {
+		up_write(&_pe_lock);
+		return -EBUSY;
+	}
+
+	/* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
+	pe_lock_req.lock = LOCK_PE;
+	pe_lock_req.data.lv_dev = lv;
+	pe_lock_req.data.pv_dev = pv;
+	pe_lock_req.data.pv_offset = offset;
+	up_write(&_pe_lock);
+	return 0;
+}
+
+static void do_pe_unlock(void)
+{
+	struct buffer_head *bh;
+
+	down_write(&_pe_lock);
+	pe_lock_req.lock = UNLOCK_PE;
+	pe_lock_req.data.lv_dev = 0;
+	pe_lock_req.data.pv_dev = 0;
+	pe_lock_req.data.pv_offset = 0;
+	bh = _dequeue_io();
+	up_write(&_pe_lock);
+
+	/* handle all deferred io for this PE */
+	/* TODO: Eek, what about attaching callbacks to _flush_io()
+	   deferred requests? --sct */
+	_flush_io(bh);
+}
+
+
 /*
  * character device support function lock/unlock physical extend
  */
 static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg)
 {
 	pe_lock_req_t new_lock;
-	struct buffer_head *bh;
+	int err;
 	uint p;
 
 	if (vg_ptr == NULL) return -ENXIO;
@@ -1430,42 +1575,30 @@ static int lvm_do_pe_lock_unlock(vg_t *v
 		if (p == vg_ptr->pv_max) return -ENXIO;
 
 		/*
-		 * this sync releaves memory pressure to lessen the
-		 * likelyhood of pvmove being paged out - resulting in
+		 * this sync relieves memory pressure to lessen the
+		 * likelihood of pvmove being paged out - resulting in
 		 * deadlock.
 		 *
-		 * This method of doing a pvmove is broken
+		 * This method of doing a pvmove is *highly* broken for
+		 * several reasons.  It deadlocks, it does not
+		 * synchronise correctly with outstanding write IO, and
+		 * it defers the actual copy to a user mode app which
+		 * has no cache coherency with the LV devices.
 		 */
 		fsync_dev(pe_lock_req.data.lv_dev);
 
-		down_write(&_pe_lock);
-		if (pe_lock_req.lock == LOCK_PE) {
-			up_write(&_pe_lock);
-			return -EBUSY;
-		}
-
-		/* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
-		pe_lock_req.lock = LOCK_PE;
-		pe_lock_req.data.lv_dev = new_lock.data.lv_dev;
-		pe_lock_req.data.pv_dev = new_lock.data.pv_dev;
-		pe_lock_req.data.pv_offset = new_lock.data.pv_offset;
-		up_write(&_pe_lock);
+		err = do_pe_lock(new_lock.data.lv_dev,
+				 new_lock.data.pv_dev,
+				 new_lock.data.pv_offset);
+		if (err)
+			return err;
 
 		/* some requests may have got through since the fsync */
 		fsync_dev(pe_lock_req.data.pv_dev);
 		break;
 
 	case UNLOCK_PE:
-		down_write(&_pe_lock);
-		pe_lock_req.lock = UNLOCK_PE;
-		pe_lock_req.data.lv_dev = 0;
-		pe_lock_req.data.pv_dev = 0;
-		pe_lock_req.data.pv_offset = 0;
-		bh = _dequeue_io();
-		up_write(&_pe_lock);
-
-		/* handle all deferred io for this PE */
-		_flush_io(bh);
+		do_pe_unlock();
 		break;
 
 	default:
@@ -1476,6 +1609,103 @@ static int lvm_do_pe_lock_unlock(vg_t *v
 
 
 /*
+ * character device support function: safe, locked PE copy
+ */
+static int lvm_do_pe_locked_copy(vg_t *vg_ptr, void *arg)
+{
+	pe_copy_req_t pe_copy_req;
+	int err;
+	lv_t *lv_ptr = NULL;
+	pv_t *pv_ptr = NULL;
+	int i;
+	unsigned long old_offset, new_offset;
+
+	if (vg_ptr == NULL) return -ENXIO;
+	if (copy_from_user(&pe_copy_req, arg,
+			   sizeof(pe_copy_req_t)) != 0)
+		return -EFAULT;
+	if (pe_copy_req.cookie != PE_COPY_MAGIC_COOKIE)
+		return -EINVAL;
+
+	/* First find the logical volume for the request... */
+
+	for (i = 0; i < vg_ptr->lv_max; i++) {
+		lv_ptr = vg_ptr->lv[i];
+		if (lv_ptr != NULL &&
+		    strcmp(lv_ptr->lv_name, pe_copy_req.lv_name) == 0)
+			break;
+	}
+
+	if (i == vg_ptr->lv_max)
+		return -EINVAL;
+
+	/* ... and the physical volume. */
+
+	for (i = 0; i < vg_ptr->pv_max; i++) {
+		pv_ptr = vg_ptr->pv[i];
+		if (pv_ptr->pv_dev == pe_copy_req.old_dev)
+			break;
+	}
+
+	if (i == vg_ptr->pv_max)
+		return -EINVAL;
+
+	/* We'll take the lock on the source extent in the LV first.  We
+	   mutex out ALL IO to the entire logical volume before doing
+	   this, so we can be absolutely certain that there is no
+	   outstanding IO to this PE once the lock is in place.  (We
+	   can't mutex just one PE without tracking outstanding IO on a
+	   per-extent basis.) */
+
+	down_write(&lv_ptr->lv_io_sem);
+	err = do_pe_lock(lv_ptr->lv_dev,
+			 pe_copy_req.old_dev,
+			 pe_copy_req.old_pe);
+	up_write(&lv_ptr->lv_io_sem);
+
+	if (err)
+		return err;
+
+	/* All prep done, we can copy the bits now */
+
+	err = lvm_do_bulk_copy(pe_copy_req.old_dev, pe_copy_req.new_dev,
+			       pe_copy_req.old_pe, pe_copy_req.new_pe,
+			       vg_ptr->pe_size);
+
+	if (!err)
+		err = __do_le_remap(vg_ptr, lv_ptr,
+				    pe_copy_req.old_dev, pe_copy_req.new_dev,
+				    pe_copy_req.old_pe, pe_copy_req.new_pe);
+
+out:
+	do_pe_unlock();
+	return err;
+}
+
+static int __do_le_remap(vg_t *vg_ptr, lv_t *lv_ptr,
+			 kdev_t old_dev, kdev_t new_dev,
+			 uint old_pe, uint new_pe)
+{
+	uint le;
+
+	down_write(&lv_ptr->lv_lock);
+	for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
+		if (lv_ptr->lv_current_pe[le].dev == old_dev &&
+		    lv_ptr->lv_current_pe[le].pe == old_pe) {
+			lv_ptr->lv_current_pe[le].dev = new_dev;
+			lv_ptr->lv_current_pe[le].pe = new_pe;
+
+			up_write(&lv_ptr->lv_lock);
+			__update_hardsectsize(lv_ptr);
+			return 0;
+		}
+	}
+	up_write(&lv_ptr->lv_lock);
+	return -EINVAL;
+}
+
+
+/*
  * character device support function logical extend remap
  */
 static int lvm_do_le_remap(vg_t *vg_ptr, void *arg)
@@ -1495,23 +1725,11 @@ static int lvm_do_le_remap(vg_t *vg_ptr,
 			continue;
 
 		if (strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) {
-			down_write(&lv_ptr->lv_lock);
-			for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
-				if (lv_ptr->lv_current_pe[le].dev ==
-				    le_remap_req.old_dev &&
-				    lv_ptr->lv_current_pe[le].pe ==
-				    le_remap_req.old_pe) {
-					lv_ptr->lv_current_pe[le].dev =
-					    le_remap_req.new_dev;
-					lv_ptr->lv_current_pe[le].pe =
-					    le_remap_req.new_pe;
-					__update_hardsectsize(lv_ptr);
-					up_write(&lv_ptr->lv_lock);
-					return 0;
-				}
-			}
-			up_write(&lv_ptr->lv_lock);
-			return -EINVAL;
+			return __do_le_remap(vg_ptr, lv_ptr,
+					     le_remap_req.old_dev,
+					     le_remap_req.new_dev,
+					     le_remap_req.old_pe,
+					     le_remap_req.new_pe);
 		}
 	}
 	return -ENXIO;
@@ -2023,7 +2241,8 @@ static int lvm_do_lv_create(int minor, c
 	lv_ptr->lv_snapshot_hash_table_size = 0;
 	lv_ptr->lv_snapshot_hash_mask = 0;
 	init_rwsem(&lv_ptr->lv_lock);
-
+	init_rwsem(&lv_ptr->lv_io_sem);
+
 	lv_ptr->lv_snapshot_use_rate = 0;
 
 	vg_ptr->lv[l] = lv_ptr;
diff -urNp linux-1251/include/linux/lvm.h linux-1252/include/linux/lvm.h
--- linux-1251/include/linux/lvm.h
+++ linux-1252/include/linux/lvm.h
@@ -153,6 +153,11 @@ struct list_head {
 #define SECTOR_SIZE 512
 #endif
 
+/*
+ * Number of guaranteed callback structs in case of extreme VM load:
+ */
+#define NR_LVM_CALLBACK 256
+
 /* structure version */
 #define LVM_STRUCT_VERSION 1
 
@@ -339,6 +344,7 @@ struct list_head {
 
 /* physical extent */
 #define PE_LOCK_UNLOCK	_IOW ( 0xfe, 0x50, 1)
+#define PE_LOCKED_COPY	_IOW ( 0xfe, 0x51, 1)
 
 /* i/o protocol version */
 #define LVM_GET_IOP_VERSION	_IOR ( 0xfe, 0x98, 1)
@@ -571,6 +577,8 @@ typedef struct lv_v5 {
 	struct vg_v3 *vg;
 
 	uint lv_allocated_snapshot_le;
+
+	struct rw_semaphore lv_io_sem;
 #else
 	char dummy[200];
 #endif
@@ -689,6 +697,18 @@ typedef struct {
 } pe_lock_req_t;
 
 
+/* Request structure PE_COPY */
+#define PE_COPY_MAGIC_COOKIE 0xD0D4FF95	/* 4 bytes out of /dev/random */
+typedef struct {
+	uint32_t cookie;	/* Cookie to guard against reuse of this ioctl */
+	char lv_name[NAME_LEN];
+	kdev_t old_dev;
+	kdev_t new_dev;
+	uint32_t old_pe;
+	uint32_t new_pe;
+} pe_copy_req_t;
+
+
 /* Request structure LV_STATUS_BYNAME */
 typedef struct {
 	char lv_name[NAME_LEN];