diff -urNp linux-1251/drivers/md/lvm.c linux-1252/drivers/md/lvm.c
--- linux-1251/drivers/md/lvm.c
+++ linux-1252/drivers/md/lvm.c
@@ -239,6 +239,7 @@
 
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/mempool.h>
 
 #include <linux/hdreg.h>
 #include <linux/stat.h>
@@ -264,6 +265,7 @@
 
 #include <linux/errno.h>
 #include <linux/lvm.h>
+#include <linux/iobuf.h>
 
 #include "lvm-internal.h"
 
@@ -296,7 +298,6 @@ static int lvm_chr_open(struct inode *, 
 static int lvm_chr_close(struct inode *, struct file *);
 static int lvm_chr_ioctl(struct inode *, struct file *, uint, ulong);
 
-
 /* End external function prototypes */
 
 
@@ -324,6 +325,7 @@ static int lvm_do_lv_status_byindex(vg_t
 static int lvm_do_lv_status_bydev(vg_t *, void *);
 
 static int lvm_do_pe_lock_unlock(vg_t *r, void *);
+static int lvm_do_pe_locked_copy(vg_t *r, void *);
 
 static int lvm_do_pv_change(vg_t*, void*);
 static int lvm_do_pv_status(vg_t *, void *);
@@ -334,8 +336,11 @@ static int lvm_do_vg_extend(vg_t *, void
 static int lvm_do_vg_reduce(vg_t *, void *);
 static int lvm_do_vg_rename(vg_t *, void *);
 static int lvm_do_vg_remove(int);
+static int lvm_push_callback(lv_t *, int, struct buffer_head *);
+static void lvm_bh_callback(struct buffer_head *, int);
 static void lvm_geninit(struct gendisk *);
 static void __update_hardsectsize(lv_t *lv);
+static int __do_le_remap(vg_t *, lv_t *, kdev_t, kdev_t, uint, uint);
 
 
 static void _queue_io(struct buffer_head *bh, int rw);
@@ -359,7 +364,6 @@ ushort lvm_iop_version = LVM_DRIVER_IOP_
 int loadtime = 0;
 const char *const lvm_name = LVM_NAME;
 
-
 /* volume group descriptor area pointers */
 vg_t *vg[ABS_MAX_VG + 1];
 
@@ -369,6 +373,12 @@ static struct {
        int lv_number;
 } vg_lv_map[ABS_MAX_LV];
 
+/* cache a buffer_head end_io callback state */
+typedef struct {
+       struct buffer_head bh_io;
+       lv_t *lv;
+       struct buffer_head *bh_orig;
+} callback_t;
 
 /* Request structures (lvm_chr_ioctl()) */
 static pv_change_req_t pv_change_req;
@@ -419,6 +429,8 @@ static int lvm_blocksizes[MAX_LV];
 static int lvm_hardsectsizes[MAX_LV];
 static int lvm_size[MAX_LV];
 
+static mempool_t *lvm_callback_mempool;
+
 static struct gendisk lvm_gendisk =
 {
        major:          MAJOR_NR,
@@ -431,25 +443,45 @@ static struct gendisk lvm_gendisk =
 };
 
 
+static void * lvm_callback_alloc(int gfp_flags, void *data)
+{
+       callback_t *callback;
+
+       callback = kmalloc(sizeof *callback, gfp_flags);
+       return callback;
+}
+
+static void lvm_callback_free(void *callback, void *data)
+{
+       kfree(callback);
+}
+
 /*
  * Driver initialization...
  */
 int lvm_init(void)
 {
+       int err = -EIO;
+
        if (devfs_register_chrdev(LVM_CHAR_MAJOR,
                                  lvm_name, &lvm_chr_fops) < 0) {
                printk(KERN_ERR "%s -- devfs_register_chrdev failed\n",
                       lvm_name);
-               return -EIO;
+               goto out_err;
        }
        if (devfs_register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0)
        {
                printk("%s -- devfs_register_blkdev failed\n", lvm_name);
-               if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
-                       printk(KERN_ERR
-                              "%s -- devfs_unregister_chrdev failed\n",
-                              lvm_name);
-               return -EIO;
+               goto out_unreg_char;
+       }
+
+       err = -ENOMEM;
+       lvm_callback_mempool = mempool_create(NR_LVM_CALLBACK,
+                                             lvm_callback_alloc,
+                                             lvm_callback_free, NULL);
+       if (!lvm_callback_mempool) {
+               printk("%s -- out of memory for callback pool\n", lvm_name);
+               goto out_unreg_block;
        }
 
        lvm_init_fs();
@@ -482,6 +514,18 @@ int lvm_init(void)
 #endif
 
        return 0;
+
+out_unreg_block:
+       if (devfs_unregister_blkdev(MAJOR_NR, lvm_name) < 0)
+               printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
+                      lvm_name);
+out_unreg_char:
+       if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0)
+               printk(KERN_ERR
+                      "%s -- devfs_unregister_chrdev failed\n",
+                      lvm_name);
+out_err:
+       return err;
 } /* lvm_init() */
 
 /*
@@ -497,7 +541,7 @@ static void lvm_cleanup(void)
                printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n",
                       lvm_name);
 
-
+       mempool_destroy(lvm_callback_mempool);
 
        /* delete our gendisk from chain */
        del_gendisk(&lvm_gendisk);
@@ -658,6 +702,11 @@ static int lvm_chr_ioctl(struct inode *i
                   physical volume (move's done in user space's pvmove) */
                return lvm_do_pe_lock_unlock(vg_ptr,arg);
 
+       case PE_LOCKED_COPY:
+               /* lock/unlock i/o to a physical extent to move it to another
+                  physical volume (move's done in user space's pvmove) */
+               return lvm_do_pe_locked_copy(vg_ptr,arg);
+
       case VG_CREATE_OLD:
                /* create a VGDA */
                return lvm_do_vg_create(arg, minor);
@@ -1213,7 +1262,7 @@ static int lvm_map(struct buffer_head *b
        kdev_t rdev_map;
        vg_t *vg_this = vg[VG_BLK(minor)];
        lv_t *lv = vg_this->lv[LV_BLK(minor)];
-
+       int ret;
 
        down_read(&lv->lv_lock);
        if (!(lv->lv_status & LV_ACTIVE)) {
@@ -1328,8 +1377,9 @@ static int lvm_map(struct buffer_head *b
  out:
        bh->b_rdev = rdev_map;
        bh->b_rsector = rsector_map;
+       ret = lvm_push_callback(lv, rw, bh);
        up_read(&lv->lv_lock);
-       return 1;
+       return ret;
 
  bad:
        if (bh->b_end_io)
@@ -1343,6 +1393,65 @@ static int lvm_map(struct buffer_head *b
  * internal support functions
  */
 
+/*
+ * Handle LVM callbacks on buffer_head IO completion: push an IO
+ * completion onto an existing buffer_head.  preserve b_private by
+ * creating a new buffer_head for the mapped IO. 
+ */
+static int lvm_push_callback(lv_t *lv, int rw, struct buffer_head *bh)
+{
+       callback_t *callback;
+       struct buffer_head *nbh;
+
+       callback = mempool_alloc(lvm_callback_mempool, GFP_NOIO);
+
+       callback->lv = lv;
+       callback->bh_orig = bh;
+
+       nbh = &callback->bh_io;
+
+       nbh->b_blocknr    = bh->b_blocknr;
+       nbh->b_dev        = bh->b_dev;
+       nbh->b_rdev       = bh->b_rdev;
+       nbh->b_rsector    = bh->b_rsector;
+       nbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
+               (1<<BH_Mapped) | (1<<BH_Lock);
+       atomic_set(&nbh->b_count, 1);
+       nbh->b_size       = bh->b_size;
+       nbh->b_page       = bh->b_page;
+       nbh->b_data       = bh->b_data;
+       nbh->b_list       = 0;
+       nbh->b_reqnext    = NULL;
+
+       nbh->b_end_io     = lvm_bh_callback;
+       nbh->b_private    = callback;
+
+       down_read(&lv->lv_io_sem);
+       generic_make_request(rw, nbh);
+
+       return 0; /* Tell generic_make_request not to pursue the
+                    original buffer_head any further now that we've
+                    submitted a new one. */
+}
+
+static void lvm_bh_callback(struct buffer_head *bh, int uptodate)
+{
+       callback_t *callback;
+       struct buffer_head *obh;
+       lv_t *lv;
+
+       callback = bh->b_private;
+       lv = callback->lv;
+       obh = callback->bh_orig;
+
+       up_read(&lv->lv_io_sem);
+
+       mempool_free(callback, lvm_callback_mempool);
+       if (obh->b_end_io)
+               obh->b_end_io(obh, uptodate);
+}
+
+
 #ifdef LVM_HD_NAME
 /*
  * generate "hard disk" name
@@ -1407,13 +1516,49 @@ lock_try_again:
 } /* lvm_do_lock_lvm */
 
 
+static int do_pe_lock(kdev_t lv, kdev_t pv, uint32_t offset)
+{
+       down_write(&_pe_lock);
+       if (pe_lock_req.lock == LOCK_PE) {
+               up_write(&_pe_lock);
+               return -EBUSY;
+       }
+
+       /* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
+       pe_lock_req.lock = LOCK_PE;
+       pe_lock_req.data.lv_dev = lv;
+       pe_lock_req.data.pv_dev = pv;
+       pe_lock_req.data.pv_offset = offset;
+       up_write(&_pe_lock);
+       return 0;
+}
+
+static void do_pe_unlock(void)
+{
+       struct buffer_head *bh;
+
+       down_write(&_pe_lock);
+       pe_lock_req.lock = UNLOCK_PE;
+       pe_lock_req.data.lv_dev = 0;
+       pe_lock_req.data.pv_dev = 0;
+       pe_lock_req.data.pv_offset = 0;
+       bh = _dequeue_io();
+       up_write(&_pe_lock);
+
+       /* handle all deferred io for this PE */
+       /* TODO: Eek, what about attaching callbacks to _flush_io()
+          deferred requests?  --sct */
+       _flush_io(bh);
+}
+
+
 /*
  * character device support function lock/unlock physical extend
  */
 static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg)
 {
        pe_lock_req_t new_lock;
-       struct buffer_head *bh;
+       int err;
        uint p;
 
        if (vg_ptr == NULL) return -ENXIO;
@@ -1430,42 +1575,30 @@ static int lvm_do_pe_lock_unlock(vg_t *v
                if (p == vg_ptr->pv_max) return -ENXIO;
 
                /*
-                * this sync releaves memory pressure to lessen the
-                * likelyhood of pvmove being paged out - resulting in
+                * this sync relieves memory pressure to lessen the
+                * likelihood of pvmove being paged out - resulting in
                 * deadlock.
                 *
-                * This method of doing a pvmove is broken
+                * This method of doing a pvmove is *highly* broken for
+                * several reasons.  It deadlocks, it does not
+                * synchronise correctly with outstanding write IO, and
+                * it defers the actual copy to a user mode app which
+                * has no cache coherency with the LV devices.
                 */
                fsync_dev(pe_lock_req.data.lv_dev);
 
-               down_write(&_pe_lock);
-               if (pe_lock_req.lock == LOCK_PE) {
-                       up_write(&_pe_lock);
-                       return -EBUSY;
-               }
-
-               /* Should we do to_kdev_t() on the pv_dev and lv_dev??? */
-               pe_lock_req.lock = LOCK_PE;
-               pe_lock_req.data.lv_dev = new_lock.data.lv_dev;
-               pe_lock_req.data.pv_dev = new_lock.data.pv_dev;
-               pe_lock_req.data.pv_offset = new_lock.data.pv_offset;
-               up_write(&_pe_lock);
+               err = do_pe_lock(new_lock.data.lv_dev,
+                                new_lock.data.pv_dev,
+                                new_lock.data.pv_offset);
+               if (err)
+                       return err;
 
                /* some requests may have got through since the fsync */
                fsync_dev(pe_lock_req.data.pv_dev);
                break;
 
        case UNLOCK_PE:
-               down_write(&_pe_lock);
-               pe_lock_req.lock = UNLOCK_PE;
-               pe_lock_req.data.lv_dev = 0;
-               pe_lock_req.data.pv_dev = 0;
-               pe_lock_req.data.pv_offset = 0;
-               bh = _dequeue_io();
-               up_write(&_pe_lock);
-
-               /* handle all deferred io for this PE */
-               _flush_io(bh);
+               do_pe_unlock();
                break;
 
        default:
@@ -1476,6 +1609,103 @@ static int lvm_do_pe_lock_unlock(vg_t *v
 
 
 /*
+ * character device support function: safe, locked PE copy
+ */
+static int lvm_do_pe_locked_copy(vg_t *vg_ptr, void *arg)
+{
+       pe_copy_req_t pe_copy_req;
+       int err;
+       lv_t *lv_ptr = NULL;
+       pv_t *pv_ptr = NULL;
+       int i;
+       unsigned long old_offset, new_offset;
+
+       if (vg_ptr == NULL) return -ENXIO;
+       if (copy_from_user(&pe_copy_req, arg,
+                          sizeof(pe_copy_req_t)) != 0)
+               return -EFAULT;
+       if (pe_copy_req.cookie != PE_COPY_MAGIC_COOKIE)
+               return -EINVAL;
+
+       /* First find the logical volume for the request... */
+
+       for (i = 0; i < vg_ptr->lv_max; i++) {
+               lv_ptr = vg_ptr->lv[i];
+               if (lv_ptr != NULL &&
+                   strcmp(lv_ptr->lv_name, pe_copy_req.lv_name) == 0)
+                       break;
+       }
+
+       if (i == vg_ptr->lv_max)
+               return -EINVAL;
+
+       /* ... and the physical volume. */
+
+       for (i = 0; i < vg_ptr->pv_max; i++) {
+               pv_ptr = vg_ptr->pv[i];
+               if (pv_ptr->pv_dev == pe_copy_req.old_dev)
+                       break;
+       }
+
+       if (i == vg_ptr->pv_max)
+               return -EINVAL;
+
+       /* We'll take the lock on the source extent in the LV first.  We
+           mutex out ALL IO to the entire logical volume before doing
+           this, so we can be absolutely certain that there is no
+           outstanding IO to this PE once the lock is in place.  (We
+           can't mutex just one PE without tracking outstanding IO on a
+           per-extent basis.) */
+
+       down_write(&lv_ptr->lv_io_sem);
+       err = do_pe_lock(lv_ptr->lv_dev,
+                        pe_copy_req.old_dev, 
+                        pe_copy_req.old_pe);
+       up_write(&lv_ptr->lv_io_sem);
+
+       if (err)
+               return err;
+
+       /* All prep done, we can copy the bits now */
+
+       err = lvm_do_bulk_copy(pe_copy_req.old_dev, pe_copy_req.new_dev, 
+                              pe_copy_req.old_pe, pe_copy_req.new_pe, 
+                              vg_ptr->pe_size);
+
+       if (!err)
+               err = __do_le_remap(vg_ptr, lv_ptr, 
+                                   pe_copy_req.old_dev, pe_copy_req.new_dev, 
+                                   pe_copy_req.old_pe, pe_copy_req.new_pe);
+
+out:
+       do_pe_unlock();
+       return err;
+}
+
+static int __do_le_remap(vg_t *vg_ptr, lv_t *lv_ptr, 
+                        kdev_t old_dev, kdev_t new_dev, 
+                        uint old_pe, uint new_pe)
+{
+       uint le;
+
+       down_write(&lv_ptr->lv_lock);
+       for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
+               if (lv_ptr->lv_current_pe[le].dev == old_dev &&
+                   lv_ptr->lv_current_pe[le].pe == old_pe) {
+                       lv_ptr->lv_current_pe[le].dev = new_dev;
+                       lv_ptr->lv_current_pe[le].pe = new_pe;
+               
+                       up_write(&lv_ptr->lv_lock);
+                       __update_hardsectsize(lv_ptr);
+                       return 0;
+               }
+       }
+       up_write(&lv_ptr->lv_lock);
+       return -EINVAL;
+}
+
+
+/*
  * character device support function logical extend remap
  */
 static int lvm_do_le_remap(vg_t *vg_ptr, void *arg)
@@ -1495,23 +1725,11 @@ static int lvm_do_le_remap(vg_t *vg_ptr,
                        continue;
 
                if (strcmp(lv_ptr->lv_name, le_remap_req.lv_name) == 0) {
-                       down_write(&lv_ptr->lv_lock);
-                       for (le = 0; le < lv_ptr->lv_allocated_le; le++) {
-                               if (lv_ptr->lv_current_pe[le].dev ==
-                                   le_remap_req.old_dev &&
-                                   lv_ptr->lv_current_pe[le].pe ==
-                                   le_remap_req.old_pe) {
-                                       lv_ptr->lv_current_pe[le].dev =
-                                           le_remap_req.new_dev;
-                                       lv_ptr->lv_current_pe[le].pe =
-                                           le_remap_req.new_pe;
-                                       __update_hardsectsize(lv_ptr);
-                                       up_write(&lv_ptr->lv_lock);
-                                       return 0;
-                               }
-                       }
-                       up_write(&lv_ptr->lv_lock);
-                       return -EINVAL;
+                       return __do_le_remap(vg_ptr, lv_ptr,
+                                          le_remap_req.old_dev,
+                                          le_remap_req.new_dev,
+                                          le_remap_req.old_pe,
+                                          le_remap_req.new_pe);
                }
        }
        return -ENXIO;
@@ -2023,7 +2241,8 @@ static int lvm_do_lv_create(int minor, c
        lv_ptr->lv_snapshot_hash_table_size = 0;
        lv_ptr->lv_snapshot_hash_mask = 0;
        init_rwsem(&lv_ptr->lv_lock);
-
+       init_rwsem(&lv_ptr->lv_io_sem);
+
        lv_ptr->lv_snapshot_use_rate = 0;
 
        vg_ptr->lv[l] = lv_ptr;
diff -urNp linux-1251/include/linux/lvm.h linux-1252/include/linux/lvm.h
--- linux-1251/include/linux/lvm.h
+++ linux-1252/include/linux/lvm.h
@@ -153,6 +153,11 @@ struct list_head {
 #define SECTOR_SIZE    512
 #endif
 
+/*
+ * Number of guaranteed callback structs in case of extreme VM load:
+ */
+#define        NR_LVM_CALLBACK 256
+
 /* structure version */
 #define LVM_STRUCT_VERSION 1
 
@@ -339,6 +344,7 @@ struct list_head {
 
 /* physical extent */
 #define        PE_LOCK_UNLOCK          _IOW ( 0xfe, 0x50, 1)
+#define        PE_LOCKED_COPY          _IOW ( 0xfe, 0x51, 1)
 
 /* i/o protocol version */
 #define        LVM_GET_IOP_VERSION     _IOR ( 0xfe, 0x98, 1)
@@ -571,6 +577,8 @@ typedef struct lv_v5 {
        struct vg_v3    *vg;
 
        uint lv_allocated_snapshot_le;
+
+       struct rw_semaphore lv_io_sem;
 #else
        char dummy[200];
 #endif
@@ -689,6 +697,18 @@ typedef struct {
 } pe_lock_req_t;
 
 
+/* Request structure PE_COPY */
+#define PE_COPY_MAGIC_COOKIE 0xD0D4FF95  /* 4 bytes out of /dev/random */
+typedef struct {
+       uint32_t cookie; /* Cookie to guard against reuse of this ioctl */
+       char lv_name[NAME_LEN];
+       kdev_t old_dev;
+       kdev_t new_dev;
+       uint32_t old_pe;
+       uint32_t new_pe;
+} pe_copy_req_t;
+
+
 /* Request structure LV_STATUS_BYNAME */
 typedef struct {
        char lv_name[NAME_LEN];
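
The new ioctl replaces the old lock / copy-in-user-space / remap / unlock protocol with a single locked copy performed entirely in the kernel. The following user-space sketch is not part of the patch; it only illustrates how a pvmove-style helper might invoke PE_LOCKED_COPY, assuming the patched <linux/lvm.h> is visible to user space. The VG character device path, the helper name and the way old_pe/new_pe are derived are assumptions; only the request structure, the magic cookie and the ioctl number come from the patch above.

/*
 * Hypothetical user-space sketch -- NOT part of the patch.
 */
#include <fcntl.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/lvm.h>  /* patched header: pe_copy_req_t, PE_LOCKED_COPY, PE_COPY_MAGIC_COOKIE */

static int move_one_pe(const char *group_dev,  /* VG char device, e.g. "/dev/vg00/group" (assumed) */
                       const char *lv_name,    /* LV whose extent is being moved */
                       kdev_t old_dev, uint32_t old_pe,
                       kdev_t new_dev, uint32_t new_pe)
{
        pe_copy_req_t req;
        int fd, err;

        memset(&req, 0, sizeof(req));
        req.cookie  = PE_COPY_MAGIC_COOKIE;    /* anything else is rejected with -EINVAL */
        strncpy(req.lv_name, lv_name, sizeof(req.lv_name) - 1);
        req.old_dev = old_dev;                 /* kdev_t of the source PV */
        req.new_dev = new_dev;                 /* kdev_t of the destination PV */
        req.old_pe  = old_pe;                  /* extent offsets in the same units the kernel
                                                  keeps in lv_current_pe[].pe */
        req.new_pe  = new_pe;

        fd = open(group_dev, O_RDWR);
        if (fd < 0)
                return -1;

        /* The kernel locks IO to the LV, copies the extent and remaps the
           logical extent onto the new PV before unlocking. */
        err = ioctl(fd, PE_LOCKED_COPY, &req);

        close(fd);
        return err;
}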