--- linux/include/linux/lvm.h.orig Tue Oct 23 13:55:20 2001 +++ linux/include/linux/lvm.h Tue Oct 23 13:55:51 2001 @@ -9,7 +9,7 @@ * May-July 1998 * January-March,July,September,October,Dezember 1999 * January,February,July,November 2000 - * January-March 2001 + * January-March,June,July 2001 * * lvm is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,6 +67,10 @@ * 01/03/2001 - Revert to IOP10 and add VG_CREATE_OLD call for compatibility * 08/03/2001 - new lv_t (in core) version number 5: changed page member * to (struct kiobuf *) to use for COW exception table io + * 26/03/2001 - changed lv_v4 to lv_v5 in structure definition (HM) + * 21/06/2001 - changed BLOCK_SIZE back to 1024 for non S/390 + * 22/06/2001 - added Andreas Dilger's PE on 4k boundary alignment enhancements + * 19/07/2001 - added rwsem compatibility macros for 2.2 kernels * */ @@ -74,8 +78,8 @@ #ifndef _LVM_H_INCLUDE #define _LVM_H_INCLUDE -#define LVM_RELEASE_NAME "0.9.1_beta6" -#define LVM_RELEASE_DATE "12/03/2001" +#define LVM_RELEASE_NAME "1.0.1-rc4" +#define LVM_RELEASE_DATE "03/10/2001" #define _LVM_KERNEL_H_VERSION "LVM "LVM_RELEASE_NAME" ("LVM_RELEASE_DATE")" @@ -102,16 +106,14 @@ #define DEBUG_DEVICE #define DEBUG_KFREE */ -#endif /* #ifdef __KERNEL__ */ -#ifndef __KERNEL__ -#define __KERNEL__ #include #include -#undef __KERNEL__ #else +#define __KERNEL__ #include #include +#undef __KERNEL__ #endif /* #ifndef __KERNEL__ */ #include @@ -123,6 +125,7 @@ #include #endif /* #ifdef __KERNEL__ */ + #include #if !defined ( LVM_BLK_MAJOR) || !defined ( LVM_CHAR_MAJOR) @@ -136,31 +139,18 @@ #ifdef CONFIG_ARCH_S390 #define BLOCK_SIZE 4096 #else -#define BLOCK_SIZE 512 +#define BLOCK_SIZE 1024 #endif #ifndef SECTOR_SIZE #define SECTOR_SIZE 512 #endif -#define LVM_STRUCT_VERSION 1 /* structure version */ +/* structure version */ +#define LVM_STRUCT_VERSION 1 #define LVM_DIR_PREFIX "/dev/" -/* set the default structure 
version */ -#if ( LVM_STRUCT_VERSION == 1) -#define pv_t pv_v2_t -#define lv_t lv_v5_t -#define vg_t vg_v3_t -#define pv_disk_t pv_disk_v2_t -#define lv_disk_t lv_disk_v3_t -#define vg_disk_t vg_disk_v2_t -#define lv_block_exception_t lv_block_exception_v1_t -#define lv_COW_table_disk_t lv_COW_table_disk_v1_t -#endif - - - /* * i/o protocol version * @@ -234,40 +224,11 @@ */ /* DONT TOUCH THESE !!! */ -/* base of PV structure in disk partition */ -#define LVM_PV_DISK_BASE 0L -/* size reserved for PV structure on disk */ -#define LVM_PV_DISK_SIZE 1024L -/* base of VG structure in disk partition */ -#define LVM_VG_DISK_BASE LVM_PV_DISK_SIZE -/* size reserved for VG structure */ -#define LVM_VG_DISK_SIZE ( 9 * 512L) - -/* size reserved for timekeeping */ -#define LVM_TIMESTAMP_DISK_BASE ( LVM_VG_DISK_BASE + LVM_VG_DISK_SIZE) -#define LVM_TIMESTAMP_DISK_SIZE 512L /* reserved for timekeeping */ - -/* name list of physical volumes on disk */ -#define LVM_PV_UUIDLIST_DISK_BASE ( LVM_TIMESTAMP_DISK_BASE + \ - LVM_TIMESTAMP_DISK_SIZE) - -/* now for the dynamically calculated parts of the VGDA */ -#define LVM_LV_DISK_OFFSET(a, b) ( (a)->lv_on_disk.base + \ - sizeof ( lv_disk_t) * b) -#define LVM_DISK_SIZE(pv) ( (pv)->pe_on_disk.base + \ - (pv)->pe_on_disk.size) -#define LVM_PE_DISK_OFFSET(pe, pv) ( pe * pv->pe_size + \ - ( LVM_DISK_SIZE ( pv) / SECTOR_SIZE)) -#define LVM_PE_ON_DISK_BASE(pv) \ - { int rest; \ - pv->pe_on_disk.base = pv->lv_on_disk.base + pv->lv_on_disk.size; \ - if ( ( rest = pv->pe_on_disk.base % SECTOR_SIZE) != 0) \ - pv->pe_on_disk.base += ( SECTOR_SIZE - rest); \ - } -/* END default disk spaces and offsets for PVs */ + + /* @@ -299,9 +260,8 @@ #define LVM_MAX_STRIPES 128 /* max # of stripes */ #define LVM_MAX_SIZE ( 1024LU * 1024 / SECTOR_SIZE * 1024 * 1024) /* 1TB[sectors] */ #define LVM_MAX_MIRRORS 2 /* future use */ -#define LVM_MIN_READ_AHEAD 0 /* minimum read ahead sectors */ -#define LVM_DEFAULT_READ_AHEAD 1024 /* default read ahead sectors for 512k 
scsi segments */ -#define LVM_MAX_READ_AHEAD 10000 /* maximum read ahead sectors */ +#define LVM_MIN_READ_AHEAD 2 /* minimum read ahead sectors */ +#define LVM_MAX_READ_AHEAD 120 /* maximum read ahead sectors */ #define LVM_MAX_LV_IO_TIMEOUT 60 /* seconds I/O timeout (future use) */ #define LVM_PARTITION 0xfe /* LVM partition id */ #define LVM_NEW_PARTITION 0x8e /* new LVM partition id (10/09/1999) */ @@ -312,25 +272,12 @@ #define LVM_SNAPSHOT_MIN_CHUNK (PAGE_SIZE/1024) /* 4 or 8 KB */ #define UNDEF -1 -#define FALSE 0 -#define TRUE 1 - - -#define LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv) ( \ - vg->pe_size / lv->lv_chunk_size) - -#define LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg, lv) ( \ -{ \ - int COW_table_entries_per_PE; \ - int COW_table_chunks_per_PE; \ -\ - COW_table_entries_per_PE = LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg, lv); \ - COW_table_chunks_per_PE = ( COW_table_entries_per_PE * sizeof(lv_COW_table_disk_t) / SECTOR_SIZE + lv->lv_chunk_size - 1) / lv->lv_chunk_size; \ - COW_table_entries_per_PE - COW_table_chunks_per_PE;}) - /* * ioctls + * FIXME: the last parameter to _IO{W,R,WR} is a data type. The macro will + * expand this using sizeof(), so putting "1" there is misleading + * because sizeof(1) = sizeof(int) = sizeof(2) = 4 on a 32-bit machine! */ /* volume group */ #define VG_CREATE_OLD _IOW ( 0xfe, 0x00, 1) @@ -394,7 +341,12 @@ #endif /* lock the logical volume manager */ +#if LVM_DRIVER_IOP_VERSION > 11 +#define LVM_LOCK_LVM _IO ( 0xfe, 0x9A) +#else +/* This is actually the same as _IO ( 0xff, 0x00), oops. Remove for IOP 12+ */ #define LVM_LOCK_LVM _IO ( 0xfe, 0x100) +#endif /* END ioctls */ @@ -445,21 +397,21 @@ #define UUID_LEN 32 /* don't change!!! 
*/ /* copy on write tables in disk format */ -typedef struct { +typedef struct lv_COW_table_disk_v1 { uint64_t pv_org_number; uint64_t pv_org_rsector; uint64_t pv_snap_number; uint64_t pv_snap_rsector; -} lv_COW_table_disk_v1_t; +} lv_COW_table_disk_t; /* remap physical sector/rdev pairs including hash */ -typedef struct { +typedef struct lv_block_exception_v1 { struct list_head hash; uint32_t rsector_org; kdev_t rdev_org; uint32_t rsector_new; kdev_t rdev_new; -} lv_block_exception_v1_t; +} lv_block_exception_t; /* disk stored pe information */ typedef struct { @@ -475,37 +427,11 @@ /* - * Structure Physical Volume (PV) Version 1 + * physical volume structures */ /* core */ -typedef struct { - char id[2]; /* Identifier */ - unsigned short version; /* HM lvm version */ - lvm_disk_data_t pv_on_disk; - lvm_disk_data_t vg_on_disk; - lvm_disk_data_t pv_namelist_on_disk; - lvm_disk_data_t lv_on_disk; - lvm_disk_data_t pe_on_disk; - char pv_name[NAME_LEN]; - char vg_name[NAME_LEN]; - char system_id[NAME_LEN]; /* for vgexport/vgimport */ - kdev_t pv_dev; - uint pv_number; - uint pv_status; - uint pv_allocatable; - uint pv_size; /* HM */ - uint lv_cur; - uint pe_size; - uint pe_total; - uint pe_allocated; - uint pe_stale; /* for future use */ - pe_disk_t *pe; /* HM */ - struct inode *inode; /* HM */ -} pv_v1_t; - -/* core */ -typedef struct { +typedef struct pv_v2 { char id[2]; /* Identifier */ unsigned short version; /* HM lvm version */ lvm_disk_data_t pv_on_disk; @@ -527,36 +453,17 @@ uint pe_allocated; uint pe_stale; /* for future use */ pe_disk_t *pe; /* HM */ - struct inode *inode; /* HM */ + struct block_device *bd; char pv_uuid[UUID_LEN+1]; -} pv_v2_t; +#ifndef __KERNEL__ + uint32_t pe_start; /* in sectors */ +#endif +} pv_t; -/* disk */ -typedef struct { - uint8_t id[2]; /* Identifier */ - uint16_t version; /* HM lvm version */ - lvm_disk_data_t pv_on_disk; - lvm_disk_data_t vg_on_disk; - lvm_disk_data_t pv_namelist_on_disk; - lvm_disk_data_t lv_on_disk; - 
lvm_disk_data_t pe_on_disk; - uint8_t pv_name[NAME_LEN]; - uint8_t vg_name[NAME_LEN]; - uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ - uint32_t pv_major; - uint32_t pv_number; - uint32_t pv_status; - uint32_t pv_allocatable; - uint32_t pv_size; /* HM */ - uint32_t lv_cur; - uint32_t pe_size; - uint32_t pe_total; - uint32_t pe_allocated; -} pv_disk_v1_t; /* disk */ -typedef struct { +typedef struct pv_disk_v2 { uint8_t id[2]; /* Identifier */ uint16_t version; /* HM lvm version */ lvm_disk_data_t pv_on_disk; @@ -576,7 +483,11 @@ uint32_t pe_size; uint32_t pe_total; uint32_t pe_allocated; -} pv_disk_v2_t; + + /* new in struct version 2 */ + uint32_t pe_start; /* in sectors */ + +} pv_disk_t; /* @@ -600,7 +511,7 @@ } le_remap_req_t; typedef struct lv_bmap { - ulong lv_block; + uint32_t lv_block; dev_t lv_dev; } lv_bmap_t; @@ -609,7 +520,7 @@ */ /* core */ -typedef struct lv_v4 { +typedef struct lv_v5 { char lv_name[NAME_LEN]; char vg_name[NAME_LEN]; uint lv_access; @@ -632,9 +543,9 @@ uint lv_read_ahead; /* delta to version 1 starts here */ - struct lv_v4 *lv_snapshot_org; - struct lv_v4 *lv_snapshot_prev; - struct lv_v4 *lv_snapshot_next; + struct lv_v5 *lv_snapshot_org; + struct lv_v5 *lv_snapshot_prev; + struct lv_v5 *lv_snapshot_next; lv_block_exception_t *lv_block_exception; uint lv_remap_ptr; uint lv_remap_end; @@ -643,22 +554,22 @@ #ifdef __KERNEL__ struct kiobuf *lv_iobuf; struct kiobuf *lv_COW_table_iobuf; - struct semaphore lv_snapshot_sem; + struct rw_semaphore lv_lock; struct list_head *lv_snapshot_hash_table; uint32_t lv_snapshot_hash_table_size; uint32_t lv_snapshot_hash_mask; wait_queue_head_t lv_snapshot_wait; int lv_snapshot_use_rate; - void *vg; + struct vg_v3 *vg; uint lv_allocated_snapshot_le; #else char dummy[200]; #endif -} lv_v5_t; +} lv_t; /* disk */ -typedef struct { +typedef struct lv_disk_v3 { uint8_t lv_name[NAME_LEN]; uint8_t vg_name[NAME_LEN]; uint32_t lv_access; @@ -680,36 +591,14 @@ uint32_t lv_allocation; uint32_t 
lv_io_timeout; /* for future use */ uint32_t lv_read_ahead; /* HM */ -} lv_disk_v3_t; +} lv_disk_t; /* * Structure Volume Group (VG) Version 1 */ /* core */ -typedef struct { - char vg_name[NAME_LEN]; /* volume group name */ - uint vg_number; /* volume group number */ - uint vg_access; /* read/write */ - uint vg_status; /* active or not */ - uint lv_max; /* maximum logical volumes */ - uint lv_cur; /* current logical volumes */ - uint lv_open; /* open logical volumes */ - uint pv_max; /* maximum physical volumes */ - uint pv_cur; /* current physical volumes FU */ - uint pv_act; /* active physical volumes */ - uint dummy; /* was obsolete max_pe_per_pv */ - uint vgda; /* volume group descriptor arrays FU */ - uint pe_size; /* physical extent size in sectors */ - uint pe_total; /* total of physical extents */ - uint pe_allocated; /* allocated physical extents */ - uint pvg_total; /* physical volume groups FU */ - struct proc_dir_entry *proc; - pv_t *pv[ABS_MAX_PV + 1]; /* physical volume struct pointers */ - lv_t *lv[ABS_MAX_LV + 1]; /* logical volume struct pointers */ -} vg_v1_t; - -typedef struct { +typedef struct vg_v3 { char vg_name[NAME_LEN]; /* volume group name */ uint vg_number; /* volume group number */ uint vg_access; /* read/write */ @@ -737,30 +626,11 @@ #else char dummy1[200]; #endif -} vg_v3_t; +} vg_t; /* disk */ -typedef struct { - uint8_t vg_name[NAME_LEN]; /* volume group name */ - uint32_t vg_number; /* volume group number */ - uint32_t vg_access; /* read/write */ - uint32_t vg_status; /* active or not */ - uint32_t lv_max; /* maximum logical volumes */ - uint32_t lv_cur; /* current logical volumes */ - uint32_t lv_open; /* open logical volumes */ - uint32_t pv_max; /* maximum physical volumes */ - uint32_t pv_cur; /* current physical volumes FU */ - uint32_t pv_act; /* active physical volumes */ - uint32_t dummy; - uint32_t vgda; /* volume group descriptor arrays FU */ - uint32_t pe_size; /* physical extent size in sectors */ - uint32_t pe_total; 
/* total of physical extents */ - uint32_t pe_allocated; /* allocated physical extents */ - uint32_t pvg_total; /* physical volume groups FU */ -} vg_disk_v1_t; - -typedef struct { +typedef struct vg_disk_v2 { uint8_t vg_uuid[UUID_LEN]; /* volume group UUID */ uint8_t vg_name_dummy[NAME_LEN-UUID_LEN]; /* rest of v1 VG name */ uint32_t vg_number; /* volume group number */ @@ -778,7 +648,7 @@ uint32_t pe_total; /* total of physical extents */ uint32_t pe_allocated; /* allocated physical extents */ uint32_t pvg_total; /* physical volume groups FU */ -} vg_disk_v2_t; +} vg_disk_t; /* @@ -838,4 +708,41 @@ int rate; } lv_snapshot_use_rate_req_t; + + +/* useful inlines */ +static inline ulong round_up(ulong n, ulong size) { + size--; + return (n + size) & ~size; +} + +static inline ulong div_up(ulong n, ulong size) { + return round_up(n, size) / size; +} + +/* FIXME: nasty capital letters */ +static int inline LVM_GET_COW_TABLE_CHUNKS_PER_PE(vg_t *vg, lv_t *lv) { + return vg->pe_size / lv->lv_chunk_size; +} + +static int inline LVM_GET_COW_TABLE_ENTRIES_PER_PE(vg_t *vg, lv_t *lv) { + ulong chunks = vg->pe_size / lv->lv_chunk_size; + ulong entry_size = sizeof(lv_COW_table_disk_t); + ulong chunk_size = lv->lv_chunk_size * SECTOR_SIZE; + ulong entries = (vg->pe_size * SECTOR_SIZE) / + (entry_size + chunk_size); + + if(chunks < 2) + return 0; + + for(; entries; entries--) + if((div_up(entries * entry_size, chunk_size) + entries) <= + chunks) + break; + + return entries; +} + + #endif /* #ifndef _LVM_H_INCLUDE */ + --- linux/drivers/md/lvm.c.orig Tue Oct 23 13:55:17 2001 +++ linux/drivers/md/lvm.c Tue Oct 23 13:55:51 2001 @@ -7,7 +7,7 @@ * April-May,July-August,November 1998 * January-March,May,July,September,October 1999 * January,February,July,September-November 2000 - * January,February,March 2001 + * January-April 2001 * * * LVM driver is free software; you can redistribute it and/or modify @@ -73,7 +73,7 @@ * only other update ioctls are blocked now * - fixed pv->pe to 
NULL for pv_status * - using lv_req structure in lvm_chr_ioctl() now - * - fixed NULL ptr reference bug in lvm_do_lv_extendreduce() + * - fixed NULL ptr reference bug in lvm_do_lv_extend_reduce() * caused by uncontiguous PV array in lvm_chr_ioctl(VG_REDUCE) * 09/02/1999 - changed BLKRASET and BLKRAGET in lvm_chr_ioctl() to * handle lgoical volume private read ahead sector @@ -194,11 +194,25 @@ * - factored lvm_do_pv_flush out of lvm_chr_ioctl (HM) * 09/03/2001 - Added _lock_open_count to ensure we only drop the lock * when the locking process closes. - * 05/04/2001 - lvm_map bugs: don't use b_blocknr/b_dev in lvm_map, it - * destroys stacking devices. call b_end_io on failed maps. - * (Jens Axboe) - * 30/04/2001 - replace get_hardblock_size() with get_hardsect_size() for - * 2.4.4 kernel. + * 05/04/2001 - Defer writes to an extent that is being moved [JT] + * 05/04/2001 - use b_rdev and b_rsector rather than b_dev and b_blocknr in + * lvm_map() in order to make stacking devices more happy (HM) + * 11/04/2001 - cleaned up the pvmove queue code. I no longer retain the + * rw flag, instead WRITEA's are just dropped [JT] + * 30/04/2001 - added KERNEL_VERSION > 2.4.3 get_hardsect_size() rather + * than get_hardblocksize() call + * 03/05/2001 - Use copy_to/from_user to preserve pointers in + * lvm_do_status_by* + * 11/05/2001 - avoid accesses to inactive snapshot data in + * __update_hardsectsize() and lvm_do_lv_extend_reduce() (JW) + * 28/05/2001 - implemented missing BLKSSZGET ioctl + * 05/06/2001 - Move _pe_lock out of fast path for lvm_map when no PEs + * locked. Make buffer queue flush not need locking. + * Fix lvm_user_bmap() to set b_rsector for new lvm_map(). [AED] + * 30/06/2001 - Speed up __update_hardsectsize() by checking if PVs have + * the same hardsectsize (very likely) before scanning all LEs + * in the LV each time. 
[AED] + * 12/10/2001 - Use add/del_gendisk() routines in 2.4.10+ * */ @@ -228,6 +242,7 @@ #include +#include #include #include #include @@ -245,13 +260,9 @@ #include "lvm-internal.h" -#define LVM_CORRECT_READ_AHEAD(a) \ -do { \ - if ((a) < LVM_MIN_READ_AHEAD || \ - (a) > LVM_MAX_READ_AHEAD) \ - (a) = LVM_DEFAULT_READ_AHEAD; \ - read_ahead[MAJOR_NR] = (a); \ -} while(0) +#define LVM_CORRECT_READ_AHEAD( a) \ + if ( a < LVM_MIN_READ_AHEAD || \ + a > LVM_MAX_READ_AHEAD) a = LVM_MAX_READ_AHEAD; #ifndef WRITEA # define WRITEA WRITE @@ -316,6 +327,15 @@ static void __update_hardsectsize(lv_t *lv); +static void _queue_io(struct buffer_head *bh, int rw); +static struct buffer_head *_dequeue_io(void); +static void _flush_io(struct buffer_head *bh); + +static int _open_pv(pv_t *pv); +static void _close_pv(pv_t *pv); + +static unsigned long _sectors_to_k(unsigned long sect); + #ifdef LVM_HD_NAME void lvm_hd_name(char *, int); #endif @@ -323,9 +343,7 @@ /* variables */ -char *lvm_version = "LVM version "LVM_RELEASE_NAME" by Heinz Mauelshagen " - "("LVM_RELEASE_DATE")\n"; -char *lvm_short_version = "version "LVM_RELEASE_NAME" ("LVM_RELEASE_DATE")"; +char *lvm_version = "LVM version "LVM_RELEASE_NAME"("LVM_RELEASE_DATE")"; ushort lvm_iop_version = LVM_DRIVER_IOP_VERSION; int loadtime = 0; const char *const lvm_name = LVM_NAME; @@ -334,11 +352,6 @@ /* volume group descriptor area pointers */ vg_t *vg[ABS_MAX_VG]; -static pv_t *pvp = NULL; -static lv_t *lvp = NULL; -static pe_t *pep = NULL; - - /* map from block minor number to VG and LV numbers */ typedef struct { int vg_number; @@ -350,7 +363,7 @@ /* Request structures (lvm_chr_ioctl()) */ static pv_change_req_t pv_change_req; static pv_status_req_t pv_status_req; -static pe_lock_req_t pe_lock_req; +volatile static pe_lock_req_t pe_lock_req; static le_remap_req_t le_remap_req; static lv_req_t lv_req; @@ -365,11 +378,14 @@ static uint vg_count = 0; static long lvm_chr_open_count = 0; static DECLARE_WAIT_QUEUE_HEAD(lvm_wait); 
-static DECLARE_WAIT_QUEUE_HEAD(lvm_map_wait); static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; static spinlock_t lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; +static struct buffer_head *_pe_requests; +static DECLARE_RWSEM(_pe_lock); + + struct file_operations lvm_chr_fops = { open: lvm_chr_open, release: lvm_chr_close, @@ -379,7 +395,7 @@ /* block device operations structure needed for 2.3.38? and above */ struct block_device_operations lvm_blk_dops = { - open: lvm_blk_open, + open: lvm_blk_open, release: lvm_blk_close, ioctl: lvm_blk_ioctl, }; @@ -393,29 +409,38 @@ static struct gendisk lvm_gendisk = { - major: MAJOR_NR, - major_name: LVM_NAME, - minor_shift: 0, - max_p: 1, - part: lvm_hd_struct, - sizes: lvm_size, - nr_real: MAX_LV, + MAJOR_NR, /* major # */ + LVM_NAME, /* name of major */ + 0, /* number of times minor is shifted + to get real minor */ + 1, /* maximum partitions per device */ + lvm_hd_struct, /* partition table */ + lvm_size, /* device size in blocks, copied + to block_size[] */ + MAX_LV, /* number of real devices */ + NULL, /* internal */ + NULL, /* pointer to next gendisk struct (internal) */ }; + /* * Driver initialization... 
*/ int lvm_init(void) { - if (register_chrdev(LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { - printk(KERN_ERR "%s -- register_chrdev failed\n", lvm_name); + if (devfs_register_chrdev(LVM_CHAR_MAJOR, + lvm_name, &lvm_chr_fops) < 0) { + printk(KERN_ERR "%s -- devfs_register_chrdev failed\n", + lvm_name); return -EIO; } - if (register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) + if (devfs_register_blkdev(MAJOR_NR, lvm_name, &lvm_blk_dops) < 0) { - printk("%s -- register_blkdev failed\n", lvm_name); - if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) - printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + printk("%s -- devfs_register_blkdev failed\n", lvm_name); + if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) + printk(KERN_ERR + "%s -- devfs_unregister_chrdev failed\n", + lvm_name); return -EIO; } @@ -423,6 +448,7 @@ lvm_init_vars(); lvm_geninit(&lvm_gendisk); + /* insert our gendisk at the corresponding major */ add_gendisk(&lvm_gendisk); #ifdef LVM_HD_NAME @@ -433,20 +459,19 @@ blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request_fn); + /* initialise the pe lock */ + pe_lock_req.lock = UNLOCK_PE; + /* optional read root VGDA */ /* if ( *rootvg != 0) vg_read_with_pv_and_lv ( rootvg, &vg); */ - printk(KERN_INFO - "%s%s -- " #ifdef MODULE - "Module" + printk(KERN_INFO "%s module loaded\n", lvm_version); #else - "Driver" + printk(KERN_INFO "%s\n", lvm_version); #endif - " successfully initialized\n", - lvm_version, lvm_name); return 0; } /* lvm_init() */ @@ -457,15 +482,16 @@ static void lvm_cleanup(void) { - if (unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) { - printk(KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); - } - if (unregister_blkdev(MAJOR_NR, lvm_name) < 0) { - printk(KERN_ERR "%s -- unregister_blkdev failed\n", lvm_name); - } + if (devfs_unregister_chrdev(LVM_CHAR_MAJOR, lvm_name) < 0) + printk(KERN_ERR "%s -- devfs_unregister_chrdev failed\n", + lvm_name); + if (devfs_unregister_blkdev(MAJOR_NR, 
lvm_name) < 0) + printk(KERN_ERR "%s -- devfs_unregister_blkdev failed\n", + lvm_name); + /* delete our gendisk from chain */ del_gendisk(&lvm_gendisk); blk_size[MAJOR_NR] = NULL; @@ -480,7 +506,9 @@ /* unregister with procfs and devfs */ lvm_fin_fs(); +#ifdef MODULE printk(KERN_INFO "%s -- Module successfully deactivated\n", lvm_name); +#endif return; } /* lvm_cleanup() */ @@ -497,8 +525,8 @@ lvm_lock = lvm_snapshot_lock = SPIN_LOCK_UNLOCKED; pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = \ - pe_lock_req.data.pv_dev = \ + pe_lock_req.data.lv_dev = 0; + pe_lock_req.data.pv_dev = 0; pe_lock_req.data.pv_offset = 0; /* Initialize VG pointers */ @@ -521,6 +549,9 @@ * ********************************************************************/ +#define MODE_TO_STR(mode) (mode) & FMODE_READ ? "READ" : "", \ + (mode) & FMODE_WRITE ? "WRITE" : "" + /* * character device open routine */ @@ -528,8 +559,8 @@ { int minor = MINOR(inode->i_rdev); - P_DEV("%s -- lvm_chr_open MINOR: %d VG#: %d mode: 0x%X lock: %d\n", - lvm_name, minor, VG_CHR(minor), file->f_mode, lock); + P_DEV("chr_open MINOR: %d VG#: %d mode: %s%s lock: %d\n", + minor, VG_CHR(minor), MODE_TO_STR(file->f_mode), lock); /* super user validation */ if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -569,9 +600,8 @@ /* otherwise cc will complain about unused variables */ (void) lvm_lock; - P_IOCTL("%s -- lvm_chr_ioctl: command: 0x%X MINOR: %d " - "VG#: %d mode: 0x%X\n", - lvm_name, command, minor, VG_CHR(minor), file->f_mode); + P_IOCTL("chr MINOR: %d command: 0x%X arg: %p VG#: %d mode: %s%s\n", + minor, command, arg, VG_CHR(minor), MODE_TO_STR(file->f_mode)); #ifdef LVM_TOTAL_RESET if (lvm_reset_spindown > 0) return -EACCES; @@ -764,13 +794,10 @@ /* * character device close routine */ -int lvm_chr_close(struct inode *inode, struct file *file) +static int lvm_chr_close(struct inode *inode, struct file *file) { -#ifdef DEBUG - int minor = MINOR(inode->i_rdev); - printk(KERN_DEBUG - "%s -- lvm_chr_close VG#: %d\n", 
lvm_name, VG_CHR(minor)); -#endif + P_DEV("chr_close MINOR: %d VG#: %d\n", + MINOR(inode->i_rdev), VG_CHR(MINOR(inode->i_rdev))); #ifdef LVM_TOTAL_RESET if (lvm_reset_spindown > 0) { @@ -784,6 +811,7 @@ spin_lock(&lvm_lock); if(lock == current->pid) { if(!_lock_open_count) { + P_DEV("chr_close: unlocking LVM for pid %d\n", lock); lock = 0; wake_up_interruptible(&lvm_wait); } else @@ -813,8 +841,8 @@ lv_t *lv_ptr; vg_t *vg_ptr = vg[VG_BLK(minor)]; - P_DEV("%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d mode: 0x%X\n", - lvm_name, minor, VG_BLK(minor), LV_BLK(minor), file->f_mode); + P_DEV("blk_open MINOR: %d VG#: %d LV#: %d mode: %s%s\n", + minor, VG_BLK(minor), LV_BLK(minor), MODE_TO_STR(file->f_mode)); #ifdef LVM_TOTAL_RESET if (lvm_reset_spindown > 0) @@ -848,7 +876,7 @@ MOD_INC_USE_COUNT; - P_DEV("%s -- OPEN OK, LV size %d\n", lvm_name, lv_ptr->lv_size); + P_DEV("blk_open OK, LV size %d\n", lv_ptr->lv_size); return 0; } @@ -868,22 +896,19 @@ void *arg = (void *) a; struct hd_geometry *hd = (struct hd_geometry *) a; - P_IOCTL("%s -- lvm_blk_ioctl MINOR: %d command: 0x%X arg: %lX " - "VG#: %dl LV#: %d\n", - lvm_name, minor, command, (ulong) arg, - VG_BLK(minor), LV_BLK(minor)); + P_IOCTL("blk MINOR: %d command: 0x%X arg: %p VG#: %d LV#: %d " + "mode: %s%s\n", minor, command, arg, VG_BLK(minor), + LV_BLK(minor), MODE_TO_STR(file->f_mode)); switch (command) { + case BLKSSZGET: + /* get block device sector size as needed e.g. 
by fdisk */ + return put_user(lvm_sectsize(inode->i_rdev), (int *) arg); + case BLKGETSIZE: /* return device size */ - P_IOCTL("%s -- lvm_blk_ioctl -- BLKGETSIZE: %u\n", - lvm_name, lv_ptr->lv_size); - if (put_user(lv_ptr->lv_size, (unsigned long *)arg)) - return -EFAULT; - break; - - case BLKGETSIZE64: - if (put_user((u64)lv_ptr->lv_size << 9, (u64 *)arg)) + P_IOCTL("BLKGETSIZE: %u\n", lv_ptr->lv_size); + if (put_user(lv_ptr->lv_size, (long *)arg)) return -EFAULT; break; @@ -892,7 +917,7 @@ /* flush buffer cache */ if (!capable(CAP_SYS_ADMIN)) return -EACCES; - P_IOCTL("%s -- lvm_blk_ioctl -- BLKFLSBUF\n", lvm_name); + P_IOCTL("BLKFLSBUF\n"); fsync_dev(inode->i_rdev); invalidate_buffers(inode->i_rdev); @@ -903,20 +928,19 @@ /* set read ahead for block device */ if (!capable(CAP_SYS_ADMIN)) return -EACCES; - P_IOCTL("%s -- lvm_blk_ioctl -- BLKRASET: %ld sectors for %s\n", - lvm_name, (long) arg, kdevname(inode->i_rdev)); + P_IOCTL("BLKRASET: %ld sectors for %s\n", + (long) arg, kdevname(inode->i_rdev)); if ((long) arg < LVM_MIN_READ_AHEAD || (long) arg > LVM_MAX_READ_AHEAD) return -EINVAL; lv_ptr->lv_read_ahead = (long) arg; - read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead; break; case BLKRAGET: /* get current read ahead setting */ - P_IOCTL("%s -- lvm_blk_ioctl -- BLKRAGET\n", lvm_name); + P_IOCTL("BLKRAGET %d\n", lv_ptr->lv_read_ahead); if (put_user(lv_ptr->lv_read_ahead, (long *)arg)) return -EFAULT; break; @@ -969,13 +993,12 @@ break; case LV_BMAP: - /* turn logical block into (dev_t, block). non privileged. */ + /* turn logical block into (dev_t, block). non privileged. 
*/ /* don't bmap a snapshot, since the mapping can change */ - if (lv_ptr->lv_access & LV_SNAPSHOT) + if(lv_ptr->lv_access & LV_SNAPSHOT) return -EPERM; return lvm_user_bmap(inode, (struct lv_bmap *) arg); - break; case LV_SET_ALLOCATION: /* set allocation flags of a logical volume */ @@ -1006,11 +1029,8 @@ vg_t *vg_ptr = vg[VG_BLK(minor)]; lv_t *lv_ptr = vg_ptr->lv[LV_BLK(minor)]; -#ifdef DEBUG - printk(KERN_DEBUG - "%s -- lvm_blk_close MINOR: %d VG#: %d LV#: %d\n", - lvm_name, minor, VG_BLK(minor), LV_BLK(minor)); -#endif + P_DEV("blk_close MINOR: %d VG#: %d LV#: %d\n", + minor, VG_BLK(minor), LV_BLK(minor)); if (lv_ptr->lv_open == 1) vg_ptr->lv_open--; lv_ptr->lv_open--; @@ -1063,17 +1083,17 @@ return -EFAULT; memset(&bh,0,sizeof bh); - bh.b_rsector = block; - bh.b_dev = bh.b_rdev = inode->i_dev; + bh.b_blocknr = block; + bh.b_dev = bh.b_rdev = inode->i_rdev; bh.b_size = lvm_get_blksize(bh.b_dev); + bh.b_rsector = block * (bh.b_size >> 9); if ((err=lvm_map(&bh, READ)) < 0) { printk("lvm map failed: %d\n", err); return -EINVAL; } - return put_user(kdev_t_to_nr(bh.b_rdev), &user_result->lv_dev) || - put_user(bh.b_rsector/(bh.b_size>>9), &user_result->lv_block) ? 
- -EFAULT : 0; + return (put_user(kdev_t_to_nr(bh.b_rdev), &user_result->lv_dev) || + put_user(bh.b_rsector/(bh.b_size>>9), &user_result->lv_block)); } @@ -1081,16 +1101,68 @@ * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c * (see init_module/lvm_init) */ -static inline void __remap_snapshot(kdev_t rdev, ulong rsector, +static void __remap_snapshot(kdev_t rdev, ulong rsector, ulong pe_start, lv_t *lv, vg_t *vg) { + + /* copy a chunk from the origin to a snapshot device */ + down_write(&lv->lv_lock); + + /* we must redo lvm_snapshot_remap_block in order to avoid a + race condition in the gap where no lock was held */ if (!lvm_snapshot_remap_block(&rdev, &rsector, pe_start, lv) && !lvm_snapshot_COW(rdev, rsector, pe_start, rsector, vg, lv)) lvm_write_COW_table_block(vg, lv); + + up_write(&lv->lv_lock); +} + +static inline void _remap_snapshot(kdev_t rdev, ulong rsector, + ulong pe_start, lv_t *lv, vg_t *vg) { + int r; + + /* check to see if this chunk is already in the snapshot */ + down_read(&lv->lv_lock); + r = lvm_snapshot_remap_block(&rdev, &rsector, pe_start, lv); + up_read(&lv->lv_lock); + + if (!r) + /* we haven't yet copied this block to the snapshot */ + __remap_snapshot(rdev, rsector, pe_start, lv, vg); +} + + +/* + * extents destined for a pe that is on the move should be deferred + */ +static inline int _should_defer(kdev_t pv, ulong sector, uint32_t pe_size) { + return ((pe_lock_req.lock == LOCK_PE) && + (pv == pe_lock_req.data.pv_dev) && + (sector >= pe_lock_req.data.pv_offset) && + (sector < (pe_lock_req.data.pv_offset + pe_size))); } +static inline int _defer_extent(struct buffer_head *bh, int rw, + kdev_t pv, ulong sector, uint32_t pe_size) +{ + if (pe_lock_req.lock == LOCK_PE) { + down_read(&_pe_lock); + if (_should_defer(pv, sector, pe_size)) { + up_read(&_pe_lock); + down_write(&_pe_lock); + if (_should_defer(pv, sector, pe_size)) + _queue_io(bh, rw); + up_write(&_pe_lock); + return 1; + } + up_read(&_pe_lock); + } + 
return 0; +} + + static int lvm_map(struct buffer_head *bh, int rw) { - int minor = MINOR(bh->b_dev); + int minor = MINOR(bh->b_rdev); ulong index; ulong pe_start; ulong size = bh->b_size >> 9; @@ -1101,7 +1173,7 @@ lv_t *lv = vg_this->lv[LV_BLK(minor)]; - down(&lv->lv_snapshot_sem); + down_read(&lv->lv_lock); if (!(lv->lv_status & LV_ACTIVE)) { printk(KERN_ALERT "%s - lvm_map: ll_rw_blk for inactive LV %s\n", @@ -1119,7 +1191,7 @@ P_MAP("%s - lvm_map minor: %d *rdev: %s *rsector: %lu size:%lu\n", lvm_name, minor, - kdevname(bh->b_dev), + kdevname(bh->b_rdev), rsector_org, size); if (rsector_org + size > lv->lv_size) { @@ -1130,7 +1202,7 @@ goto bad; } -remap: + if (lv->lv_stripes < 2) { /* linear mapping */ /* get the index */ index = rsector_org / vg_this->pe_size; @@ -1167,36 +1239,33 @@ rsector_map, stripe_length, stripe_index); } - /* handle physical extents on the move */ - if (pe_lock_req.lock == LOCK_PE) { - if (rdev_map == pe_lock_req.data.pv_dev && - rsector_map >= pe_lock_req.data.pv_offset && - rsector_map < (pe_lock_req.data.pv_offset + - vg_this->pe_size)) { - sleep_on(&lvm_map_wait); - goto remap; + /* + * Queue writes to physical extents on the move until move completes. + * Don't get _pe_lock until there is a reasonable expectation that + * we need to queue this request, because this is in the fast path. 
+ */ + if (rw == WRITE || rw == WRITEA) { + if(_defer_extent(bh, rw, rdev_map, + rsector_map, vg_this->pe_size)) { + + up_read(&lv->lv_lock); + return 0; } - } - /* statistic */ - if (rw == WRITE || rw == WRITEA) - lv->lv_current_pe[index].writes++; - else - lv->lv_current_pe[index].reads++; + lv->lv_current_pe[index].writes++; /* statistic */ + } else + lv->lv_current_pe[index].reads++; /* statistic */ - /* snapshot volume exception handling on physical device - address base */ + /* snapshot volume exception handling on physical device address base */ if (!(lv->lv_access & (LV_SNAPSHOT|LV_SNAPSHOT_ORG))) goto out; if (lv->lv_access & LV_SNAPSHOT) { /* remap snapshot */ - if (lv->lv_block_exception) - lvm_snapshot_remap_block(&rdev_map, &rsector_map, - pe_start, lv); - else + if (lvm_snapshot_remap_block(&rdev_map, &rsector_map, + pe_start, lv) < 0) goto bad; - } else if(rw == WRITE || rw == WRITEA) { /* snapshot origin */ + } else if (rw == WRITE || rw == WRITEA) { /* snapshot origin */ lv_t *snap; /* start with first snapshot and loop through all of @@ -1209,22 +1278,20 @@ /* Serializes the COW with the accesses to the snapshot device */ - down(&snap->lv_snapshot_sem); - __remap_snapshot(rdev_map, rsector_map, + _remap_snapshot(rdev_map, rsector_map, pe_start, snap, vg_this); - up(&snap->lv_snapshot_sem); } } out: bh->b_rdev = rdev_map; bh->b_rsector = rsector_map; - up(&lv->lv_snapshot_sem); + up_read(&lv->lv_lock); return 1; bad: buffer_IO_error(bh); - up(&lv->lv_snapshot_sem); + up_read(&lv->lv_lock); return -1; } /* lvm_map() */ @@ -1258,15 +1325,10 @@ /* * make request function */ -static int lvm_make_request_fn(request_queue_t *q, - int rw, - struct buffer_head *bh) -{ - if (lvm_map(bh, rw) >= 0) - return 1; - - buffer_IO_error(bh); - return 0; +static int lvm_make_request_fn(request_queue_t *q, + int rw, + struct buffer_head *bh) { + return (lvm_map(bh, rw) <= 0) ? 
0 : 1; } @@ -1283,8 +1345,7 @@ lock_try_again: spin_lock(&lvm_lock); if (lock != 0 && lock != current->pid) { - P_IOCTL("lvm_do_lock_lvm: %s is locked by pid %d ...\n", - lvm_name, lock); + P_DEV("lvm_do_lock_lvm: locked by pid %d ...\n", lock); spin_unlock(&lvm_lock); interruptible_sleep_on(&lvm_wait); if (current->sigpending != 0) @@ -1296,6 +1357,7 @@ goto lock_try_again; } lock = current->pid; + P_DEV("lvm_do_lock_lvm: locking LVM for pid %d\n", lock); spin_unlock(&lvm_lock); return 0; } /* lvm_do_lock_lvm */ @@ -1306,33 +1368,60 @@ */ static int lvm_do_pe_lock_unlock(vg_t *vg_ptr, void *arg) { + pe_lock_req_t new_lock; + struct buffer_head *bh; uint p; if (vg_ptr == NULL) return -ENXIO; - if (copy_from_user(&pe_lock_req, arg, - sizeof(pe_lock_req_t)) != 0) return -EFAULT; + if (copy_from_user(&new_lock, arg, sizeof(new_lock)) != 0) + return -EFAULT; - switch (pe_lock_req.lock) { + switch (new_lock.lock) { case LOCK_PE: for (p = 0; p < vg_ptr->pv_max; p++) { if (vg_ptr->pv[p] != NULL && - pe_lock_req.data.pv_dev == - vg_ptr->pv[p]->pv_dev) + new_lock.data.pv_dev == vg_ptr->pv[p]->pv_dev) break; } if (p == vg_ptr->pv_max) return -ENXIO; - pe_lock_req.lock = UNLOCK_PE; + /* + * this sync releaves memory pressure to lessen the + * likelyhood of pvmove being paged out - resulting in + * deadlock. + * + * This method of doing a pvmove is broken + */ fsync_dev(pe_lock_req.data.lv_dev); + + down_write(&_pe_lock); + if (pe_lock_req.lock == LOCK_PE) { + up_write(&_pe_lock); + return -EBUSY; + } + + /* Should we do to_kdev_t() on the pv_dev and lv_dev??? 
*/ pe_lock_req.lock = LOCK_PE; + pe_lock_req.data.lv_dev = new_lock.data.lv_dev; + pe_lock_req.data.pv_dev = new_lock.data.pv_dev; + pe_lock_req.data.pv_offset = new_lock.data.pv_offset; + up_write(&_pe_lock); + + /* some requests may have got through since the fsync */ + fsync_dev(pe_lock_req.data.pv_dev); break; case UNLOCK_PE: + down_write(&_pe_lock); pe_lock_req.lock = UNLOCK_PE; - pe_lock_req.data.lv_dev = \ - pe_lock_req.data.pv_dev = \ + pe_lock_req.data.lv_dev = 0; + pe_lock_req.data.pv_dev = 0; pe_lock_req.data.pv_offset = 0; - wake_up(&lvm_map_wait); + bh = _dequeue_io(); + up_write(&_pe_lock); + + /* handle all deferred io for this PE */ + _flush_io(bh); break; default: @@ -1400,6 +1489,8 @@ } /* get the volume group structure */ if (copy_from_user(vg_ptr, arg, sizeof(vg_t)) != 0) { + P_IOCTL("lvm_do_vg_create ERROR: copy VG ptr %p (%d bytes)\n", + arg, sizeof(vg_t)); kfree(vg_ptr); return -EFAULT; } @@ -1409,8 +1500,9 @@ /* Validate it */ if (vg[VG_CHR(minor)] != NULL) { - kfree(vg_ptr); - return -EPERM; + P_IOCTL("lvm_do_vg_create ERROR: VG %d in use\n", minor); + kfree(vg_ptr); + return -EPERM; } /* we are not that active so far... 
*/ @@ -1441,6 +1533,7 @@ /* get the physical volume structures */ vg_ptr->pv_act = vg_ptr->pv_cur = 0; for (p = 0; p < vg_ptr->pv_max; p++) { + pv_t *pvp; /* user space address */ if ((pvp = vg_ptr->pv[p]) != NULL) { ret = lvm_do_pv_create(pvp, vg_ptr, p); @@ -1464,9 +1557,12 @@ /* get the logical volume structures */ vg_ptr->lv_cur = 0; for (l = 0; l < vg_ptr->lv_max; l++) { + lv_t *lvp; /* user space address */ if ((lvp = vg_ptr->lv[l]) != NULL) { if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { + P_IOCTL("ERROR: copying LV ptr %p (%d bytes)\n", + lvp, sizeof(lv_t)); lvm_do_vg_remove(minor); return -EFAULT; } @@ -1488,7 +1584,7 @@ /* Second path to correct snapshot logical volumes which are not in place during first path above */ for (l = 0; l < ls; l++) { - lvp = snap_lv_ptr[l]; + lv_t *lvp = snap_lv_ptr[l]; if (copy_from_user(&lv, lvp, sizeof(lv_t)) != 0) { lvm_do_vg_remove(minor); return -EFAULT; @@ -1680,27 +1776,41 @@ * character device support function physical volume create */ static int lvm_do_pv_create(pv_t *pvp, vg_t *vg_ptr, ulong p) { - pv_t *pv_ptr = NULL; + pv_t *pv; + int err; - pv_ptr = vg_ptr->pv[p] = kmalloc(sizeof(pv_t),GFP_KERNEL); - if (pv_ptr == NULL) { + pv = kmalloc(sizeof(pv_t),GFP_KERNEL); + if (pv == NULL) { printk(KERN_CRIT "%s -- PV_CREATE: kmalloc error PV at line %d\n", lvm_name, __LINE__); return -ENOMEM; } - if (copy_from_user(pv_ptr, pvp, sizeof(pv_t)) != 0) { + + memset(pv, 0, sizeof(*pv)); + + if (copy_from_user(pv, pvp, sizeof(pv_t)) != 0) { + P_IOCTL("lvm_do_pv_create ERROR: copy PV ptr %p (%d bytes)\n", + pvp, sizeof(pv_t)); + kfree(pv); return -EFAULT; } + + if ((err = _open_pv(pv))) { + kfree(pv); + return err; + } + /* We don't need the PE list in kernel space as with LVs pe_t list (see below) */ - pv_ptr->pe = NULL; - pv_ptr->pe_allocated = 0; - pv_ptr->pv_status = PV_ACTIVE; + pv->pe = NULL; + pv->pe_allocated = 0; + pv->pv_status = PV_ACTIVE; vg_ptr->pv_act++; vg_ptr->pv_cur++; - lvm_fs_create_pv(vg_ptr, pv_ptr); + 
lvm_fs_create_pv(vg_ptr, pv); + vg_ptr->pv[p] = pv; return 0; } /* lvm_do_pv_create() */ @@ -1709,47 +1819,73 @@ * character device support function physical volume remove */ static int lvm_do_pv_remove(vg_t *vg_ptr, ulong p) { - pv_t *pv_ptr = vg_ptr->pv[p]; + pv_t *pv = vg_ptr->pv[p]; - lvm_fs_remove_pv(vg_ptr, pv_ptr); + lvm_fs_remove_pv(vg_ptr, pv); - vg_ptr->pe_total -= pv_ptr->pe_total; + vg_ptr->pe_total -= pv->pe_total; vg_ptr->pv_cur--; vg_ptr->pv_act--; -#ifdef LVM_GET_INODE - lvm_clear_inode(pv_ptr->inode); -#endif - kfree(pv_ptr); + + _close_pv(pv); + kfree(pv); + vg_ptr->pv[p] = NULL; return 0; } -static void __update_hardsectsize(lv_t *lv) { - int le, e; - int max_hardsectsize = 0, hardsectsize; - - for (le = 0; le < lv->lv_allocated_le; le++) { - hardsectsize = get_hardsect_size(lv->lv_current_pe[le].dev); - if (hardsectsize == 0) - hardsectsize = 512; - if (hardsectsize > max_hardsectsize) - max_hardsectsize = hardsectsize; - } - - if (lv->lv_access & LV_SNAPSHOT) { - for (e = 0; e < lv->lv_remap_end; e++) { - hardsectsize = - get_hardsect_size( - lv->lv_block_exception[e].rdev_new); - if (hardsectsize == 0) - hardsectsize = 512; - if (hardsectsize > max_hardsectsize) +static void __update_hardsectsize(lv_t *lv) +{ + int max_hardsectsize = 0, hardsectsize = 0; + int p; + + /* Check PVs first to see if they all have same sector size */ + for (p = 0; p < lv->vg->pv_cur; p++) { + pv_t *pv = lv->vg->pv[p]; + if (pv && (hardsectsize = lvm_sectsize(pv->pv_dev))) { + if (max_hardsectsize == 0) max_hardsectsize = hardsectsize; + else if (hardsectsize != max_hardsectsize) { + P_DEV("%s PV[%d] (%s) sector size %d, not %d\n", + lv->lv_name, p, kdevname(pv->pv_dev), + hardsectsize, max_hardsectsize); + break; + } } } + /* PVs have different block size, need to check each LE sector size */ + if (hardsectsize != max_hardsectsize) { + int le; + for (le = 0; le < lv->lv_allocated_le; le++) { + hardsectsize = lvm_sectsize(lv->lv_current_pe[le].dev); + if 
(hardsectsize > max_hardsectsize) { + P_DEV("%s LE[%d] (%s) blocksize %d not %d\n", + lv->lv_name, le, + kdevname(lv->lv_current_pe[le].dev), + hardsectsize, max_hardsectsize); + max_hardsectsize = hardsectsize; + } + } + + /* only perform this operation on active snapshots */ + if ((lv->lv_access & LV_SNAPSHOT) && + (lv->lv_status & LV_ACTIVE)) { + int e; + for (e = 0; e < lv->lv_remap_end; e++) { + hardsectsize = lvm_sectsize(lv->lv_block_exception[e].rdev_new); + if (hardsectsize > max_hardsectsize) + max_hardsectsize = hardsectsize; + } + } + } + + if (max_hardsectsize == 0) + max_hardsectsize = SECTOR_SIZE; + P_DEV("hardblocksize for LV %s is %d\n", + kdevname(lv->lv_dev), max_hardsectsize); lvm_hardsectsizes[MINOR(lv->lv_dev)] = max_hardsectsize; } @@ -1763,9 +1899,12 @@ lv_block_exception_t *lvbe = lv->lv_block_exception; vg_t *vg_ptr = vg[VG_CHR(minor)]; lv_t *lv_ptr = NULL; + pe_t *pep; - if ((pep = lv->lv_current_pe) == NULL) return -EINVAL; - if (lv->lv_chunk_size > LVM_SNAPSHOT_MAX_CHUNK) + if (!(pep = lv->lv_current_pe)) + return -EINVAL; + + if (_sectors_to_k(lv->lv_chunk_size) > LVM_SNAPSHOT_MAX_CHUNK) return -EINVAL; for (l = 0; l < vg_ptr->lv_cur; l++) { @@ -1797,8 +1936,8 @@ lv_status_save = lv_ptr->lv_status; lv_ptr->lv_status &= ~LV_ACTIVE; - lv_ptr->lv_snapshot_org = \ - lv_ptr->lv_snapshot_prev = \ + lv_ptr->lv_snapshot_org = NULL; + lv_ptr->lv_snapshot_prev = NULL; lv_ptr->lv_snapshot_next = NULL; lv_ptr->lv_block_exception = NULL; lv_ptr->lv_iobuf = NULL; @@ -1806,7 +1945,8 @@ lv_ptr->lv_snapshot_hash_table = NULL; lv_ptr->lv_snapshot_hash_table_size = 0; lv_ptr->lv_snapshot_hash_mask = 0; - init_MUTEX(&lv_ptr->lv_snapshot_sem); + init_rwsem(&lv_ptr->lv_lock); + lv_ptr->lv_snapshot_use_rate = 0; vg_ptr->lv[l] = lv_ptr; @@ -1815,6 +1955,7 @@ is not a snapshot logical volume */ if (!(lv_ptr->lv_access & LV_SNAPSHOT)) { size = lv_ptr->lv_allocated_le * sizeof(pe_t); + if ((lv_ptr->lv_current_pe = vmalloc(size)) == NULL) { printk(KERN_CRIT "%s 
-- LV_CREATE: vmalloc error LV_CURRENT_PE of %d Byte " @@ -1826,6 +1967,8 @@ return -ENOMEM; } if (copy_from_user(lv_ptr->lv_current_pe, pep, size)) { + P_IOCTL("ERROR: copying PE ptr %p (%d bytes)\n", + pep, sizeof(size)); vfree(lv_ptr->lv_current_pe); kfree(lv_ptr); vg_ptr->lv[l] = NULL; @@ -1847,6 +1990,15 @@ vg_ptr->lv[LV_BLK(lv_ptr->lv_snapshot_minor)]; if (lv_ptr->lv_snapshot_org != NULL) { size = lv_ptr->lv_remap_end * sizeof(lv_block_exception_t); + + if(!size) { + printk(KERN_WARNING + "%s -- zero length exception table requested\n", + lvm_name); + kfree(lv_ptr); + return -EINVAL; + } + if ((lv_ptr->lv_block_exception = vmalloc(size)) == NULL) { printk(KERN_CRIT "%s -- lvm_do_lv_create: vmalloc error LV_BLOCK_EXCEPTION " @@ -1934,6 +2086,7 @@ LVM_CORRECT_READ_AHEAD(lv_ptr->lv_read_ahead); vg_ptr->lv_cur++; lv_ptr->lv_status = lv_status_save; + lv_ptr->vg = vg_ptr; __update_hardsectsize(lv_ptr); @@ -1948,7 +2101,7 @@ fsync_dev_lockfs(org->lv_dev); #endif - down(&org->lv_snapshot_sem); + down_write(&org->lv_lock); org->lv_access |= LV_SNAPSHOT_ORG; lv_ptr->lv_access &= ~LV_SNAPSHOT_ORG; /* this can only hide an userspace bug */ @@ -1957,7 +2110,7 @@ for (last = org; last->lv_snapshot_next; last = last->lv_snapshot_next); lv_ptr->lv_snapshot_prev = last; last->lv_snapshot_next = lv_ptr; - up(&org->lv_snapshot_sem); + up_write(&org->lv_lock); } /* activate the logical volume */ @@ -1973,14 +2126,12 @@ #ifdef LVM_VFS_ENHANCEMENT /* VFS function call to unlock the filesystem */ - if (lv_ptr->lv_access & LV_SNAPSHOT) { + if (lv_ptr->lv_access & LV_SNAPSHOT) unlockfs(lv_ptr->lv_snapshot_org->lv_dev); - } #endif - lv_ptr->vg = vg_ptr; - - lvm_fs_create_lv(vg_ptr, lv_ptr); + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de = + lvm_fs_create_lv(vg_ptr, lv_ptr); return 0; } /* lvm_do_lv_create() */ @@ -2026,7 +2177,7 @@ * to the original lv before playing with it. 
*/ lv_t * org = lv_ptr->lv_snapshot_org; - down(&org->lv_snapshot_sem); + down_write(&org->lv_lock); /* remove this snapshot logical volume from the chain */ lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next; @@ -2039,7 +2190,7 @@ if (!org->lv_snapshot_next) { org->lv_access &= ~LV_SNAPSHOT_ORG; } - up(&org->lv_snapshot_sem); + up_write(&org->lv_lock); lvm_snapshot_release(lv_ptr); @@ -2060,6 +2211,7 @@ /* reset generic hd */ lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = -1; lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = 0; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].de = 0; lvm_size[MINOR(lv_ptr->lv_dev)] = 0; /* reset VG/LV mapping */ @@ -2191,8 +2343,7 @@ new_stripe_size = new_lv->lv_allocated_le / new_lv->lv_stripes; end = min(old_stripe_size, new_stripe_size); - for (i = source = dest = 0; - i < new_lv->lv_stripes; i++) { + for (i = source = dest = 0; i < new_lv->lv_stripes; i++) { for (j = 0; j < end; j++) { new_lv->lv_current_pe[dest + j].reads += old_lv->lv_current_pe[source + j].reads; @@ -2227,23 +2378,27 @@ old_lv = vg_ptr->lv[l]; - if (old_lv->lv_access & LV_SNAPSHOT) - r = __extend_reduce_snapshot(vg_ptr, old_lv, new_lv); - else + if (old_lv->lv_access & LV_SNAPSHOT) { + /* only perform this operation on active snapshots */ + if (old_lv->lv_status & LV_ACTIVE) + r = __extend_reduce_snapshot(vg_ptr, old_lv, new_lv); + else + r = -EPERM; + + } else r = __extend_reduce(vg_ptr, old_lv, new_lv); if(r) return r; /* copy relevent fields */ - down(&old_lv->lv_snapshot_sem); + down_write(&old_lv->lv_lock); if(new_lv->lv_access & LV_SNAPSHOT) { - size = (new_lv->lv_remap_end > old_lv->lv_remap_end) ? 
old_lv->lv_remap_ptr : new_lv->lv_remap_end; size *= sizeof(lv_block_exception_t); - memcpy(new_lv->lv_block_exception, + memcpy(new_lv->lv_block_exception, old_lv->lv_block_exception, size); old_lv->lv_remap_end = new_lv->lv_remap_end; @@ -2258,7 +2413,7 @@ for (e = 0; e < new_lv->lv_remap_ptr; e++) lvm_hash_link(new_lv->lv_block_exception + e, new_lv->lv_block_exception[e].rdev_org, - new_lv->lv_block_exception[e].rsector_org, + new_lv->lv_block_exception[e].rsector_org, new_lv); } else { @@ -2278,7 +2433,7 @@ lv_t *snap; for(snap = old_lv->lv_snapshot_next; snap; snap = snap->lv_snapshot_next) { - down(&snap->lv_snapshot_sem); + down_write(&snap->lv_lock); snap->lv_current_pe = old_lv->lv_current_pe; snap->lv_allocated_le = old_lv->lv_allocated_le; @@ -2290,13 +2445,13 @@ lvm_size[MINOR(snap->lv_dev)] = old_lv->lv_size >> 1; __update_hardsectsize(snap); - up(&snap->lv_snapshot_sem); + up_write(&snap->lv_lock); } } } __update_hardsectsize(old_lv); - up(&old_lv->lv_snapshot_sem); + up_write(&old_lv->lv_lock); return 0; } /* lvm_do_lv_extend_reduce() */ @@ -2325,8 +2480,10 @@ strcmp(lv_ptr->lv_name, lv_status_byname_req.lv_name) == 0) { /* Save usermode pointers */ - saved_ptr1 = lv_status_byname_req.lv->lv_current_pe; - saved_ptr2 = lv_status_byname_req.lv->lv_block_exception; + if (copy_from_user(&saved_ptr1, &lv_status_byname_req.lv->lv_current_pe, sizeof(void*)) != 0) + return -EFAULT; + if (copy_from_user(&saved_ptr2, &lv_status_byname_req.lv->lv_block_exception, sizeof(void*)) != 0) + return -EFAULT; if (copy_to_user(lv_status_byname_req.lv, lv_ptr, sizeof(lv_t)) != 0) @@ -2339,7 +2496,8 @@ return -EFAULT; } /* Restore usermode pointers */ - lv_status_byname_req.lv->lv_current_pe = saved_ptr1; + if (copy_to_user(&lv_status_byname_req.lv->lv_current_pe, &saved_ptr1, sizeof(void*)) != 0) + return -EFAULT; return 0; } } @@ -2368,8 +2526,11 @@ return -ENXIO; /* Save usermode pointers */ - saved_ptr1 = lv_status_byindex_req.lv->lv_current_pe; - saved_ptr2 = 
lv_status_byindex_req.lv->lv_block_exception; + if (copy_from_user(&saved_ptr1, &lv_status_byindex_req.lv->lv_current_pe, sizeof(void*)) != 0) + return -EFAULT; + if (copy_from_user(&saved_ptr2, &lv_status_byindex_req.lv->lv_block_exception, sizeof(void*)) != 0) + return -EFAULT; + if (copy_to_user(lv_status_byindex_req.lv, lv_ptr, sizeof(lv_t)) != 0) return -EFAULT; if (saved_ptr1 != NULL) { @@ -2381,7 +2542,8 @@ } /* Restore usermode pointers */ - lv_status_byindex_req.lv->lv_current_pe = saved_ptr1; + if (copy_to_user(&lv_status_byindex_req.lv->lv_current_pe, &saved_ptr1, sizeof(void *)) != 0) + return -EFAULT; return 0; } /* lvm_do_lv_status_byindex() */ @@ -2411,8 +2573,10 @@ lv_ptr = vg_ptr->lv[l]; /* Save usermode pointers */ - saved_ptr1 = lv_status_bydev_req.lv->lv_current_pe; - saved_ptr2 = lv_status_bydev_req.lv->lv_block_exception; + if (copy_from_user(&saved_ptr1, &lv_status_bydev_req.lv->lv_current_pe, sizeof(void*)) != 0) + return -EFAULT; + if (copy_from_user(&saved_ptr2, &lv_status_bydev_req.lv->lv_block_exception, sizeof(void*)) != 0) + return -EFAULT; if (copy_to_user(lv_status_bydev_req.lv, lv_ptr, sizeof(lv_t)) != 0) return -EFAULT; @@ -2424,7 +2588,8 @@ return -EFAULT; } /* Restore usermode pointers */ - lv_status_bydev_req.lv->lv_current_pe = saved_ptr1; + if (copy_to_user(&lv_status_bydev_req.lv->lv_current_pe, &saved_ptr1, sizeof(void *)) != 0) + return -EFAULT; return 0; } /* lvm_do_lv_status_bydev() */ @@ -2445,9 +2610,7 @@ if (lv_ptr->lv_dev == lv->lv_dev) { lvm_fs_remove_lv(vg_ptr, lv_ptr); - strncpy(lv_ptr->lv_name, - lv_req->lv_name, - NAME_LEN); + strncpy(lv_ptr->lv_name, lv_req->lv_name, NAME_LEN); lvm_fs_create_lv(vg_ptr, lv_ptr); break; } @@ -2465,9 +2628,7 @@ { uint p; pv_t *pv_ptr; -#ifdef LVM_GET_INODE - struct inode *inode_sav; -#endif + struct block_device *bd; if (vg_ptr == NULL) return -ENXIO; if (copy_from_user(&pv_change_req, arg, @@ -2479,20 +2640,17 @@ if (pv_ptr != NULL && strcmp(pv_ptr->pv_name, pv_change_req.pv_name) 
== 0) { -#ifdef LVM_GET_INODE - inode_sav = pv_ptr->inode; -#endif + + bd = pv_ptr->bd; if (copy_from_user(pv_ptr, pv_change_req.pv, sizeof(pv_t)) != 0) return -EFAULT; + pv_ptr->bd = bd; /* We don't need the PE list in kernel space as with LVs pe_t list */ pv_ptr->pe = NULL; -#ifdef LVM_GET_INODE - pv_ptr->inode = inode_sav; -#endif return 0; } } @@ -2535,8 +2693,7 @@ { pv_flush_req_t pv_flush_req; - if (copy_from_user(&pv_flush_req, arg, - sizeof(pv_flush_req)) != 0) + if (copy_from_user(&pv_flush_req, arg, sizeof(pv_flush_req)) != 0) return -EFAULT; fsync_dev(pv_flush_req.pv_dev); @@ -2571,5 +2728,82 @@ } /* lvm_gen_init() */ + +/* Must have down_write(_pe_lock) when we enqueue buffers */ +static void _queue_io(struct buffer_head *bh, int rw) { + if (bh->b_reqnext) BUG(); + bh->b_reqnext = _pe_requests; + _pe_requests = bh; +} + +/* Must have down_write(_pe_lock) when we dequeue buffers */ +static struct buffer_head *_dequeue_io(void) +{ + struct buffer_head *bh = _pe_requests; + _pe_requests = NULL; + return bh; +} + +/* + * We do not need to hold _pe_lock to flush buffers. bh should be taken from + * _pe_requests under down_write(_pe_lock), and then _pe_requests can be set + * NULL and we drop _pe_lock. Any new buffers defered at this time will be + * added to a new list, and the old buffers can have their I/O restarted + * asynchronously. + * + * If, for some reason, the same PE is locked again before all of these writes + * have finished, then these buffers will just be re-queued (i.e. no danger). 
+ */ +static void _flush_io(struct buffer_head *bh) +{ + while (bh) { + struct buffer_head *next = bh->b_reqnext; + bh->b_reqnext = NULL; + /* resubmit this buffer head */ + generic_make_request(WRITE, bh); + bh = next; + } +} + + +/* + * we must open the pv's before we use them + */ +static int _open_pv(pv_t *pv) { + int err; + struct block_device *bd; + + if (!(bd = bdget(kdev_t_to_nr(pv->pv_dev)))) + return -ENOMEM; + + err = blkdev_get(bd, FMODE_READ|FMODE_WRITE, 0, BDEV_FILE); + if (err) { + bdput(bd); + return err; + } + + pv->bd = bd; + return 0; +} + +static void _close_pv(pv_t *pv) { + if(!pv || !pv->bd) + return; + + blkdev_put(pv->bd, BDEV_FILE); + bdput(pv->bd); + pv->bd = 0; +} + + +static unsigned long _sectors_to_k(unsigned long sect) +{ + if(SECTOR_SIZE > 1024) { + return sect * (SECTOR_SIZE / 1024); + } + + return sect / (1024 / SECTOR_SIZE); +} + module_init(lvm_init); module_exit(lvm_cleanup); --- linux/drivers/md/lvm-internal.h.orig Tue Oct 23 13:55:17 2001 +++ linux/drivers/md/lvm-internal.h Tue Oct 23 13:55:51 2001 @@ -1,3 +1,4 @@ + /* * kernel/lvm_internal.h * @@ -39,17 +40,20 @@ /* global variables, defined in lvm.c */ extern char *lvm_version; -extern char *lvm_short_version; extern ushort lvm_iop_version; extern int loadtime; extern const char *const lvm_name; +extern uint vg_count; extern vg_t *vg[]; extern struct file_operations lvm_chr_fops; extern struct block_device_operations lvm_blk_dops; +#define lvm_sectsize(dev) get_hardsect_size(dev) + +/* 2.4.8 had no global min/max macros, and 2.4.9's were flawed */ /* debug macros */ #ifdef DEBUG_IOCTL @@ -96,7 +100,7 @@ void lvm_fs_create_vg(vg_t *vg_ptr); void lvm_fs_remove_vg(vg_t *vg_ptr); -void lvm_fs_create_lv(vg_t *vg_ptr, lv_t *lv); +devfs_handle_t lvm_fs_create_lv(vg_t *vg_ptr, lv_t *lv); void lvm_fs_remove_lv(vg_t *vg_ptr, lv_t *lv); void lvm_fs_create_pv(vg_t *vg_ptr, pv_t *pv); void lvm_fs_remove_pv(vg_t *vg_ptr, pv_t *pv); --- linux/drivers/md/lvm-snap.c.orig Tue Oct 23 13:55:17 
2001 +++ linux/drivers/md/lvm-snap.c Tue Oct 23 13:55:51 2001 @@ -39,6 +39,8 @@ * o pv number is returned in new uint * arg * o -1 returned on error * lvm_snapshot_fill_COW_table has a return value too. + * 15/10/2001 - fix snapshot alignment problem [CM] + * - fix snapshot full oops (always check lv_block_exception) [CM] * */ @@ -49,6 +51,7 @@ #include #include #include +#include #include "lvm-internal.h" @@ -140,10 +143,20 @@ unsigned long mask = lv->lv_snapshot_hash_mask; int chunk_size = lv->lv_chunk_size; + if (!hash_table) + BUG(); hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; list_add(&exception->hash, hash_table); } +/* + * Determine if we already have a snapshot chunk for this block. + * Return: 1 if it the chunk already exists + * 0 if we need to COW this block and allocate a new chunk + * -1 if the snapshot was disabled because it ran out of space + * + * We need to be holding at least a read lock on lv->lv_lock. + */ int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, unsigned long pe_start, lv_t * lv) { @@ -153,6 +166,9 @@ int chunk_size = lv->lv_chunk_size; lv_block_exception_t * exception; + if (!lv->lv_block_exception) + return -1; + pe_off = pe_start % chunk_size; pe_adjustment = (*org_sector-pe_off) % chunk_size; __org_start = *org_sector - pe_adjustment; @@ -196,19 +212,25 @@ reason); } -static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, +static inline int lvm_snapshot_prepare_blocks(unsigned long *blocks, unsigned long start, int nr_sectors, int blocksize) { int i, sectors_per_block, nr_blocks; - sectors_per_block = blocksize >> 9; + sectors_per_block = blocksize / SECTOR_SIZE; + + if(start & (sectors_per_block - 1)) + return 0; + nr_blocks = nr_sectors / sectors_per_block; start /= sectors_per_block; for (i = 0; i < nr_blocks; i++) blocks[i] = start++; + + return 1; } inline int lvm_get_blksize(kdev_t dev) @@ -291,6 +313,8 @@ /* * writes a COW exception table sector to disk 
(HM) + * + * We need to hold a write lock on lv_snap->lv_lock. */ int lvm_write_COW_table_block(vg_t * vg, lv_t *lv_snap) { @@ -309,6 +333,10 @@ * if there is no exception storage space free any longer --> release snapshot. * * this routine gets called for each _first_ write to a physical chunk. + * + * We need to hold a write lock on lv_snap->lv_lock. It is assumed that + * lv->lv_block_exception is non-NULL (checked by lvm_snapshot_remap_block()) + * when this function is called. */ int lvm_snapshot_COW(kdev_t org_phys_dev, unsigned long org_phys_sector, @@ -318,6 +346,7 @@ { const char * reason; unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off; + unsigned long phys_start; int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; struct kiobuf * iobuf; unsigned long blocks[KIO_MAX_SECTORS]; @@ -352,8 +381,8 @@ iobuf = lv_snap->lv_iobuf; - blksize_org = lvm_get_blksize(org_phys_dev); - blksize_snap = lvm_get_blksize(snap_phys_dev); + blksize_org = lvm_sectsize(org_phys_dev); + blksize_snap = lvm_sectsize(snap_phys_dev); max_blksize = max(blksize_org, blksize_snap); min_blksize = min(blksize_org, blksize_snap); max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); @@ -361,6 +390,9 @@ if (chunk_size % (max_blksize>>9)) goto fail_blksize; + /* Don't change org_start, we need it to fill in the exception table */ + phys_start = org_start; + while (chunk_size) { nr_sectors = min(chunk_size, max_sectors); @@ -368,17 +400,24 @@ iobuf->length = nr_sectors << 9; - lvm_snapshot_prepare_blocks(blocks, org_start, - nr_sectors, blksize_org); + if (!lvm_snapshot_prepare_blocks(blocks, phys_start, + nr_sectors, blksize_org)) + goto fail_prepare; + if (__brw_kiovec(READ, 1, &iobuf, org_phys_dev, blocks, blksize_org, lv_snap) != (nr_sectors<<9)) goto fail_raw_read; - lvm_snapshot_prepare_blocks(blocks, snap_start, - nr_sectors, blksize_snap); + if (!lvm_snapshot_prepare_blocks(blocks, snap_start, + nr_sectors, blksize_snap)) + goto fail_prepare; + if 
(__brw_kiovec(WRITE, 1, &iobuf, snap_phys_dev, blocks, blksize_snap, lv_snap) != (nr_sectors<<9)) goto fail_raw_write; + + phys_start += nr_sectors; + snap_start += nr_sectors; } #ifdef DEBUG_SNAPSHOT @@ -418,6 +457,11 @@ fail_blksize: reason = "blocksize error"; goto out; + +fail_prepare: + reason = "couldn't prepare kiovec blocks " + "(start probably isn't block aligned)"; + goto out; } int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) @@ -588,7 +632,7 @@ snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; - blksize_snap = lvm_get_blksize(snap_phys_dev); + blksize_snap = lvm_sectsize(snap_phys_dev); COW_entries_per_block = blksize_snap / sizeof(lv_COW_table_disk_t); idx_COW_table = idx % COW_entries_per_pe % COW_entries_per_block; @@ -637,7 +681,7 @@ idx++; snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; snap_pe_start = lv_snap->lv_block_exception[idx - (idx % COW_entries_per_pe)].rsector_new - lv_snap->lv_chunk_size; - blksize_snap = lvm_get_blksize(snap_phys_dev); + blksize_snap = lvm_sectsize(snap_phys_dev); blocks[0] = snap_pe_start >> (blksize_snap >> 10); } else blocks[0]++; --- linux/drivers/md/lvm-fs.c.orig Tue Oct 23 13:55:17 2001 +++ linux/drivers/md/lvm-fs.c Tue Oct 23 13:55:51 2001 @@ -3,7 +3,7 @@ * * Copyright (C) 2001 Sistina Software * - * January,February 2001 + * January-April 2001 * * LVM driver is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -26,6 +26,10 @@ * Changelog * * 11/01/2001 - First version (Joe Thornber) + * 21/03/2001 - added display of stripes and stripe size (HM) + * 04/10/2001 - corrected devfs_register() call in lvm_init_fs() + * 11/04/2001 - don't devfs_register("lvm") as user-space always does it + * 10/05/2001 - show more of PV name in /proc/lvm/global * */ @@ -38,6 +42,7 @@ #include #include 
+#include #include #include "lvm-internal.h" @@ -58,7 +63,9 @@ static void _show_uuid(const char *src, char *b, char *e); +#if 0 static devfs_handle_t lvm_devfs_handle; +#endif static devfs_handle_t vg_devfs_handle[MAX_VG]; static devfs_handle_t ch_devfs_handle[MAX_VG]; static devfs_handle_t lv_devfs_handle[MAX_LV]; @@ -69,14 +76,16 @@ /* inline functions */ /* public interface */ -void lvm_init_fs() { +void __init lvm_init_fs() { struct proc_dir_entry *pde; +/* User-space has already registered this */ +#if 0 lvm_devfs_handle = devfs_register( - 0 , "lvm", 0, 0, LVM_CHAR_MAJOR, + 0 , "lvm", 0, LVM_CHAR_MAJOR, 0, S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP, &lvm_chr_fops, NULL); - +#endif lvm_proc_dir = create_proc_entry(LVM_DIR, S_IFDIR, &proc_root); if (lvm_proc_dir) { lvm_proc_vg_subdir = create_proc_entry(LVM_VG_SUBDIR, S_IFDIR, @@ -87,8 +96,9 @@ } void lvm_fin_fs() { +#if 0 devfs_unregister (lvm_devfs_handle); - +#endif remove_proc_entry(LVM_GLOBAL, lvm_proc_dir); remove_proc_entry(LVM_VG_SUBDIR, lvm_proc_dir); remove_proc_entry(LVM_DIR, &proc_root); @@ -137,8 +147,14 @@ if(vg_ptr->vg_dir_pde) { remove_proc_entry(LVM_LV_SUBDIR, vg_ptr->vg_dir_pde); + vg_ptr->lv_subdir_pde = NULL; + remove_proc_entry(LVM_PV_SUBDIR, vg_ptr->vg_dir_pde); + vg_ptr->pv_subdir_pde = NULL; + remove_proc_entry("group", vg_ptr->vg_dir_pde); + vg_ptr->vg_dir_pde = NULL; + remove_proc_entry(vg_ptr->vg_name, lvm_proc_vg_subdir); } } @@ -150,7 +166,7 @@ return name; } -void lvm_fs_create_lv(vg_t *vg_ptr, lv_t *lv) { +devfs_handle_t lvm_fs_create_lv(vg_t *vg_ptr, lv_t *lv) { struct proc_dir_entry *pde; const char *name = _basename(lv->lv_name); @@ -165,6 +181,7 @@ pde->read_proc = _proc_read_lv; pde->data = lv; } + return lv_devfs_handle[MINOR(lv->lv_dev)]; } void lvm_fs_remove_lv(vg_t *vg_ptr, lv_t *lv) { @@ -256,6 +273,12 @@ sz += sprintf(page + sz, "number: %u\n", lv->lv_number); sz += sprintf(page + sz, "open: %u\n", lv->lv_open); sz += sprintf(page + sz, "allocation: %u\n", 
lv->lv_allocation); + if(lv->lv_stripes > 1) { + sz += sprintf(page + sz, "stripes: %u\n", + lv->lv_stripes); + sz += sprintf(page + sz, "stripesize: %u\n", + lv->lv_stripesize); + } sz += sprintf(page + sz, "device: %02u:%02u\n", MAJOR(lv->lv_dev), MINOR(lv->lv_dev)); @@ -304,8 +327,8 @@ #ifdef DEBUG_LVM_PROC_GET_INFO printk(KERN_DEBUG - "%s - lvm_proc_get_global_info CALLED pos: %lu count: %d whence: %d\n", - lvm_name, pos, count, whence); + "%s - lvm_proc_get_global_info CALLED pos: %lu count: %d\n", + lvm_name, pos, count); #endif if(pos != 0 && buf != NULL) @@ -361,7 +384,7 @@ #endif " %s\n\n" "Total: %d VG%s %d PV%s %d LV%s ", - lvm_short_version, + lvm_version, vg_counter, vg_counter == 1 ? "" : "s", pv_counter, pv_counter == 1 ? "" : "s", lv_counter, lv_counter == 1 ? "" : "s"); @@ -564,7 +587,7 @@ allocation_flag = 'A'; if (!(pv->pv_allocatable & PV_ALLOCATABLE)) allocation_flag = 'N'; - pv_name = strrchr(pv->pv_name+1,'/'); + pv_name = strchr(pv->pv_name+1,'/'); if ( pv_name == 0) pv_name = pv->pv_name; else pv_name++; sz = sprintf(buf, diff -ruN -X /home/joe/packages/dontdiff linux_2.4.1/drivers/md/lvm-snap.h linux/drivers/md/lvm-snap.h --- linux_2.4.1/drivers/md/lvm-snap.h Fri Feb 16 14:51:26 2001 +++ linux/drivers/md/lvm-snap.h Thu Jan 1 01:00:00 1970 @@ -1,47 +0,0 @@ -/* - * kernel/lvm-snap.h - * - * Copyright (C) 2001 Sistina Software - * - * - * LVM driver is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * LVM driver is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU CC; see the file COPYING. 
If not, write to - * the Free Software Foundation, 59 Temple Place - Suite 330, - * Boston, MA 02111-1307, USA. - * - */ - -/* - * Changelog - * - * 05/01/2001:Joe Thornber - Factored this file out of lvm.c - * - */ - -#ifndef LVM_SNAP_H -#define LVM_SNAP_H - -/* external snapshot calls */ -extern inline int lvm_get_blksize(kdev_t); -extern int lvm_snapshot_alloc(lv_t *); -extern void lvm_snapshot_fill_COW_page(vg_t *, lv_t *); -extern int lvm_snapshot_COW(kdev_t, ulong, ulong, ulong, lv_t *); -extern int lvm_snapshot_remap_block(kdev_t *, ulong *, ulong, lv_t *); -extern void lvm_snapshot_release(lv_t *); -extern int lvm_write_COW_table_block(vg_t *, lv_t *); -extern inline void lvm_hash_link(lv_block_exception_t *, - kdev_t, ulong, lv_t *); -extern int lvm_snapshot_alloc_hash_table(lv_t *); -extern void lvm_drop_snapshot(lv_t *, const char *); - -#endif