diff -Naur linux-2002-09-30/drivers/evms/AIXlvm_vge.c evms-2002-09-30/drivers/evms/AIXlvm_vge.c --- linux-2002-09-30/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/AIXlvm_vge.c Fri Sep 27 14:55:45 2002 @@ -0,0 +1,3681 @@ +/* -*- linux-c -*- */ + +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * linux/drivers/evms/AIXlvm_vge.c + * + * EVMS AIX LVM Volume Group Emulator + * + * + */ + +#define EVMS_DEBUG 1 +#define EVMS_AIX_DEBUG 1 + +#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin +#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if new function is added to common +#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function. +#define AIX_INCREMENT_REQUEST 1 +#define AIX_DECREMENT_REQUEST -1 +#define AIX_RESYNC_BLOCKSIZE 512 +#define AIX_SYNC_INCOMPLETE 0x01 +#define AIX_SYNC_COMPLETE 0x00 +#define AIX_MASTER 0 +#define AIX_SLAVE_1 1 +#define AIX_SLAVE_2 2 + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef EVMS_AIX_DEBUG +static int AIX_volume_group_dump(void); +#endif + +static struct aix_volume_group *AIXVolumeGroupList = NULL; +static struct evms_thread *AIX_mirror_read_retry_thread; +static struct evms_thread *AIX_mirror_resync_thread; +static struct evms_pool_mgmt *AIX_BH_list_pool = NULL; +static struct aix_mirror_bh *AIX_retry_list = NULL; +static struct aix_mirror_bh **AIX_retry_tail = NULL; +static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t AIX_resync_list_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t AIX_resync_pp_lock = SPIN_LOCK_UNLOCKED; +static int AIXResyncInProgress = FALSE; +static struct aix_resync_struct *AIX_resync_list = NULL; + +// Plugin API prototypes + +static void AIXiod(void *data); +static void AIXresync(void *data); +static int discover_aix(struct evms_logical_node **evms_logical_disk_head); +static int discover_volume_groups(struct evms_logical_node **); +static int discover_logical_volumes(void); +static int end_discover_aix(struct evms_logical_node **evms_logical_disk_head); +static void read_aix(struct evms_logical_node *node, struct buffer_head *bh); +static void write_aix(struct evms_logical_node *node, struct buffer_head *bh); +static int ioctl_aix(struct evms_logical_node *logical_node, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg); + +static int aix_direct_ioctl(struct inode *inode, + struct file *file, + unsigned int cmd, unsigned long args); + +static int AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector 
to remap + u64 size, // size (in sectors) of request to remap + u64 * new_sector, // remapped sector + u64 * new_size, // new size (in sectors) + struct partition_list_entry **partition, // new node for which new_sector is relative + u32 * le, u32 * offset_in_le); + +static int validate_build_volume_group_disk_info(struct evms_logical_node + *logical_node, + struct AIXlvm_rec *AIXlvm); + +static int add_VG_data_to_VG_list(struct evms_logical_node *logical_node, + struct aix_volume_group *new_group, + short int pvNum); +static int add_PV_to_volume_group(struct aix_volume_group *group, + struct evms_logical_node *evms_partition, + int pvNum); +static struct aix_volume_group *AIX_create_volume_group(struct evms_logical_node + *logical_node, + struct AIXlvm_rec + *AIXlvm); + +static int AIX_update_volume_group(struct aix_volume_group *AIXVGLptr, + struct evms_logical_node *logical_node, + struct AIXlvm_rec *AIXlvm); + +static int AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node); + +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs); + +static int AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force); + +static int AIX_copy_on_read(struct aix_logical_volume *volume, + struct partition_list_entry *master_part, + struct partition_list_entry *slave1_part, + struct partition_list_entry *slave2_part, + u64 master_offset, + u64 slave1_offset, + u64 slave2_offset, u32 pe_size, int le); + +static int export_volumes(struct evms_logical_node **evms_logical_disk_head); +static int lvm_cleanup(void); +static int AIX_copy_header_info(struct vg_header *AIXvgh, + struct vg_header *AIXvgh2); +static int build_pe_maps(struct aix_volume_group *volume_group); + +static struct aix_logical_volume *new_logical_volume(struct lv_entries + *AIXlvent, + struct aix_volume_group + *group, char *lv_name, + u32 stripesize); + +static int check_log_volume_and_pe_maps(struct aix_volume_group *group); +static int check_volume_groups(void); +static int init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr); /* buffer address */ + +static int delete_logical_volume(struct aix_logical_volume *volume); +static int delete_aix_node(struct evms_logical_node *logical_node); +static int deallocate_volume_group(struct aix_volume_group *group); + +static void AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate); + +static void AIX_handle_write_mirror_drives(struct buffer_head *bh, + int uptodate); + +static void aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep, + unsigned long flags); + +static void AIX_schedule_resync(struct aix_logical_volume *resync_volume, + int force); +static struct aix_logical_volume *AIX_get_volume_data(char *object_name); + +static void AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate); + +static int AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh, + int index, int offset); + +static struct aix_mirror_bh *AIX_alloc_rbh(struct evms_logical_node *node, + struct buffer_head *bh, + u32 mirror_copies, + u32 le, u64 org_sector, int cmd); + +static struct aix_mirror_bh *AIX_alloc_wbh(struct evms_logical_node *node, + struct evms_logical_node *node2, + struct evms_logical_node *node3, + struct buffer_head *bh, + u32 mirror_copies, + u32 le, + u64 new_sector2, u64 new_sector3); + +static struct aix_mirror_bh *AIX_alloc_sbh(struct aix_logical_volume *volume, + struct 
partition_list_entry + *master_part, + struct partition_list_entry + *slave1_part, + struct partition_list_entry + *slave2_part, u64 master_offset, + u64 slave1_offset, u64 slave2_offset, + u32 pe_size); + +static void AIX_free_headers(struct vg_header *AIXvgh, + struct vg_header *AIXvgh2, + struct vg_trailer *AIXvgt, + struct vg_trailer *AIXvgt2); + +static int remove_group_from_list(struct aix_volume_group *group); + +//**************************************************************************************************** + +/* END of PROTOTYES*/ + +#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1) + +#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \ + (t1).tv_nsec == (t2).tv_nsec ) + +#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \ + (id1).word2 == (id2).word2 && \ + (id1).word3 == (id2).word3 && \ + (id1).word4 == (id2).word4 ) + +#define SECTOR_IN_RANGE(s1, s2) ((s2 > s1) && (s2 < s1 + AIX_RESYNC_BLOCKSIZE)) + +#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match. +#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid. +#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid. +#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other. +#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there's more PVs in this group + +#ifndef EVMS_AIX_DEBUG +#define AIX_VOLUME_GROUP_DUMP() +#else +#define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \ + AIX_volume_group_dump() +#endif + +// Global LVM data structures + +static struct evms_plugin_fops AIXlvm_fops = { + .discover = discover_aix, + .end_discover = end_discover_aix, + .delete = delete_aix_node, + .read = read_aix, + .write = write_aix, + .init_io = init_io_aix, + .ioctl = ioctl_aix, + .direct_ioctl = aix_direct_ioctl +}; + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_REGION_MANAGER, + EVMS_AIX_FEATURE_ID), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1}, + .required_services_version = { + .major = AIX_COMMON_SERVICES_MAJOR, + .minor = AIX_COMMON_SERVICES_MINOR, + .patchlevel = + AIX_COMMON_SERVICES_PATCHLEVEL}, + .fops = &AIXlvm_fops +}; + +/* + * Function: remap sector + * Common function to remap volume lba to partition lba in appropriate PE + */ +static int +AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap + u64 size, // size (in sectors) of request to remap + u64 * new_sector, // remapped sector + u64 * new_size, // new size (in sectors) + struct partition_list_entry **partition, // new node for which new_sector is relative + u32 * le, u32 * offset_in_le) +{ + struct aix_logical_volume *volume; + + u32 sectors_per_stripe; + u32 partition_to_use; + u32 column; + u32 stripe_in_column; + + u32 org_sector32; // Until striping is 64-bit enabled. + + volume = (struct aix_logical_volume *) node->private; + +#ifdef EVMS_DEBUG + LOG_DEBUG("-- %s volume:%p lv:%d size:" PFU64 " Name:%s\n", + __FUNCTION__, volume, volume->lv_number, size, volume->name); + LOG_DEBUG(" node %p node_name [%s] org_sector:" PFU64 "\n", node, + node->name, org_sector); + LOG_DEBUG(" mirror_copies:%d volume->lv_size:" PFU64 "\n", + volume->mirror_copies, volume->lv_size); +#endif + + org_sector32 = org_sector; + + *(new_size) = size; + + // Check if volume is striped. Reset the size if the request + // crosses a stripe boundary. 
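+ // For striped volumes the stripe column and offset are worked out below
+ // (in this code they are used only for the debug logging); the request size
+ // is clipped so a single request never spans a physical extent boundary.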
+ if (volume->stripes > 1) { +#ifdef EVMS_DEBUG + LOG_DEBUG(" *** STRIPED ***\n"); + LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n", + volume->stripe_size, org_sector32, volume->stripes); +#endif + + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe + +#ifdef EVMS_DEBUG + LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n", *(le), + *(offset_in_le)); +#endif + + sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE; + partition_to_use = + (org_sector32 / sectors_per_stripe) % volume->stripes; + stripe_in_column = + ((((org_sector32 / volume->stripe_size) / volume->stripes) * + volume->stripe_size) + + (org_sector32 % sectors_per_stripe)); + column = + ((org_sector32 / sectors_per_stripe) / volume->stripes) * + sectors_per_stripe; + +#ifdef EVMS_DEBUG + LOG_DEBUG("offset_in_le:%d org_sector:" PFU64 + " pe_shift:%d stripe_shift:%d\n", *(offset_in_le), + org_sector, volume->pe_size_shift, + volume->stripe_size_shift); + + LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n", + org_sector32, sectors_per_stripe, partition_to_use, + stripe_in_column, column); + LOG_DEBUG(" offset_in_le + size:" PFU64 + " volume->pe_size:%d volume->lv_size:" PFU64 "\n", + (*(offset_in_le) + size), volume->pe_size, + volume->lv_size); +#endif + + if (*(offset_in_le) + size > volume->pe_size) { + *new_size = volume->pe_size - *(offset_in_le); + LOG_DEBUG(" new_size " PFU64 "\n", *new_size); + } + + } + // Non-striped volume. Just find LE and offset. Reset the size + // if the request crosses an LE boundary. + else { +#ifdef EVMS_DEBUG + LOG_DEBUG(" *** NON-STRIPED ***\n"); +#endif + + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe + + } + +#ifdef EVMS_DEBUG + LOG_DEBUG(" offset_in_le:%d org_sector:" PFU64 " shift:%d\n", + *(offset_in_le), org_sector, volume->pe_size_shift); + + if (*(le) >= volume->num_le) { + LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n", + *(le), volume->num_le); + return -EINVAL; + } +#endif + + *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le); + *(partition) = volume->le_to_pe_map[*(le)].owning_pv; + +#ifdef EVMS_DEBUG + LOG_DEBUG(" new_sector:" PFU64 "\n", *(new_sector)); + LOG_DEBUG(" Owning Part %p\n", *(partition)); + LOG_DEBUG(" End %s\n", __FUNCTION__); +#endif + + return (0); +} + +/* + * Function: read_aix + */ +static void +read_aix(struct evms_logical_node *node, struct buffer_head *bh) +{ + struct partition_list_entry *partition; + u64 org_sector; + u64 new_sector; + u64 new_size; + struct aix_logical_volume *volume; + struct aix_mirror_bh *tmp_bh; + u32 le, offset_in_le, count; + int flags = 0; + + volume = (struct aix_logical_volume *) node->private; +//#ifdef EVMS_DEBUG +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh, +// volume->mirror_iterations); +//#endif + +#ifdef EVMS_DEBUG + LOG_DEBUG(" node->total_vsectors:" PFU64 "\n", node->total_vsectors); + LOG_DEBUG(" rsector:%lu rsize:%u node_flags:%u\n", bh->b_rsector, + bh->b_size, node->flags); +#endif + + // Check if I/O goes past end of logical volume. + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + node->total_vsectors) { + LOG_CRITICAL(" read_aix ERROR %d\n", __LINE__); + buffer_IO_error(bh); + return; + } + + // Logical-to-physical remapping. 
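+ // A failed remap, or a remap that resolves to no owning PV, fails the buffer.
+ // For mirrored volumes the request is wrapped in a private aix_mirror_bh below,
+ // which records the mirror copies (mir_node1/mir_node2) for the read retry path,
+ // and reads that fall inside an active resync window are serialized on
+ // AIX_resync_list_lock.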
+ if (AIX_remap_sector + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT), + &new_sector, &new_size, &partition, &le, &offset_in_le) + || (!partition || !new_sector)) { + LOG_CRITICAL(" read_aix bh: ERROR %d\n", __LINE__); + buffer_IO_error(bh); + return; + } + + org_sector = bh->b_rsector; + bh->b_rsector = new_sector; + //bh->b_size = new_size; + +#ifdef EVMS_DEBUG + LOG_DEBUG(" read_aix Mirror_Copies:%d\n", volume->mirror_copies); +#endif + + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) { + + tmp_bh = + AIX_alloc_rbh(node, bh, 1, le, new_sector, AIX_LV_READ); + + if (!tmp_bh) { + buffer_IO_error(bh); + return; + } + + if (volume->le_to_pe_map_mir1) { + tmp_bh->mir_node1 = + volume->le_to_pe_map_mir1[le].owning_pv-> + logical_node; + tmp_bh->mir_sector1 = + volume->le_to_pe_map_mir1[le].pe_sector_offset + + offset_in_le; + } + + if (volume->mirror_copies == AIX_MAX_MIRRORS) { + tmp_bh->mir_node2 = + volume->le_to_pe_map_mir2[le].owning_pv-> + logical_node; + tmp_bh->mir_sector2 = + volume->le_to_pe_map_mir2[le].pe_sector_offset + + offset_in_le; + } + + if (evms_cs_volume_request_in_progress + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) { + buffer_IO_error(bh); + return; + } + + if (AIXResyncInProgress) { + if (SECTOR_IN_RANGE + (tmp_bh->bh_req.b_rsector, + AIX_resync_list->master_offset)) { + spin_lock_irqsave(&AIX_resync_list_lock, flags); + } + } + + R_IO(partition->logical_node, &tmp_bh->bh_req); + + if (AIXResyncInProgress) { + if (SECTOR_IN_RANGE + (tmp_bh->bh_req.b_rsector, + AIX_resync_list->master_offset)) { + spin_unlock_irqrestore(&AIX_resync_list_lock, + flags); + } + } + + } else { + + R_IO(partition->logical_node, bh); + } + +#ifdef EVMS_DEBUG + LOG_DEBUG(" ***** %s ***** returning\n", __FUNCTION__); +#endif + return; +} + +/* + * Function: write_aix + */ +static void +write_aix(struct evms_logical_node *node, struct buffer_head *bh) +{ + struct partition_list_entry *partition; + u64 new_sector, new_sector2 = 0, new_sector3 = 0; + u64 org_sector; + u64 new_size; + struct aix_logical_volume *volume; + struct aix_mirror_bh *tmp_bh; + struct evms_logical_node *node2 = NULL, *node3 = NULL; + u32 le, offset_in_le, count; + int flags = 0; + + volume = (struct aix_logical_volume *) node->private; + +#ifdef EVMS_DEBUG +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh, +// volume->mirror_iterations); + LOG_DEBUG(" write_aix rsector:%lu rsize:%u\n", bh->b_rsector, + bh->b_size); + LOG_DEBUG(" write_aix total_sectors:" PFU64 "\n", node->total_vsectors); +#endif + + if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes + LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n", + __LINE__); + buffer_IO_error(bh); + return; + } + + // Check if I/O goes past end of logical volume. 
+ if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + node->total_vsectors) { + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__); + buffer_IO_error(bh); + return; + } + // Logical-to-Physical remapping + if (AIX_remap_sector + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT), + &new_sector, &new_size, &partition, &le, &offset_in_le) + || (!new_sector || !partition)) { + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__); + buffer_IO_error(bh); + return; + } + + org_sector = bh->b_rsector; + bh->b_rsector = new_sector; + //bh->b_size = new_size; + +#ifdef EVMS_DEBUG + LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies); +#endif + + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) { + + if (volume->le_to_pe_map_mir1) { + new_sector2 = + volume->le_to_pe_map_mir1[le].pe_sector_offset + + offset_in_le; + node2 = + volume->le_to_pe_map_mir1[le].owning_pv-> + logical_node; + } + + if (volume->mirror_copies == AIX_MAX_MIRRORS) { + + new_sector3 = + volume->le_to_pe_map_mir2[le].pe_sector_offset + + offset_in_le; + node3 = + volume->le_to_pe_map_mir2[le].owning_pv-> + logical_node; + } + + tmp_bh = + AIX_alloc_wbh(partition->logical_node, node2, node3, bh, le, + volume->mirror_copies, new_sector2, + new_sector3); + + if (!tmp_bh) { + buffer_IO_error(bh); + return; + } + tmp_bh->node = node; + + tmp_bh = tmp_bh->mirror_bh_list; + + if (evms_cs_volume_request_in_progress + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) { + buffer_IO_error(bh); + // free memory here + return; + } + + if (AIXResyncInProgress) { + if (SECTOR_IN_RANGE + (tmp_bh->bh_req.b_rsector, + AIX_resync_list->master_offset)) { + spin_lock_irqsave(&AIX_resync_list_lock, flags); + } + } + + W_IO(tmp_bh->node, &tmp_bh->bh_req); + + if (AIXResyncInProgress) { + if (SECTOR_IN_RANGE + (tmp_bh->bh_req.b_rsector, + AIX_resync_list->master_offset)) { + spin_unlock_irqrestore(&AIX_resync_list_lock, + flags); + } + } + + tmp_bh = tmp_bh->next_r1; + + if (tmp_bh) { + W_IO(tmp_bh->node, &tmp_bh->bh_req); + tmp_bh = tmp_bh->next_r1; + } + + if (tmp_bh) { + W_IO(tmp_bh->node, &tmp_bh->bh_req); + } + + } else { + + W_IO(partition->logical_node, bh); + } + +#ifdef EVMS_DEBUG + LOG_DEBUG(" ***** %s returning *****\n", __FUNCTION__); +#endif + return; +} + +/* + * Function: ioctl_aix + * + */ +static int +ioctl_aix(struct evms_logical_node *logical_node, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + struct aix_logical_volume *volume = + (struct aix_logical_volume *) (logical_node->private); + int rc = 0; + + LOG_EXTRA(" Ioctl %u\n", cmd); + + switch (cmd) { + + case HDIO_GETGEO: + { + // Fixed geometry for all LVM volumes + unsigned char heads = 64; + unsigned char sectors = 32; + long start = 0; + struct hd_geometry *hd = (struct hd_geometry *) arg; + short cylinders; + cylinders = logical_node->total_vsectors; + cylinders = (cylinders / heads) / sectors; + + if (hd == NULL) { + return -EINVAL; + } + + if (copy_to_user + ((char *) (&hd->heads), &heads, sizeof (heads)) != 0 + || copy_to_user((char *) (&hd->sectors), &sectors, + sizeof (sectors)) != 0 + || copy_to_user((short *) (&hd->cylinders), + &cylinders, sizeof (cylinders)) != 0 + || copy_to_user((long *) (&hd->start), &start, + sizeof (start)) != 0) { + return -EFAULT; + } + } + break; + + case EVMS_QUIESCE_VOLUME: + break; + + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_OPEN_VOLUME: + case EVMS_CLOSE_VOLUME: + case EVMS_CHECK_DEVICE_STATUS: + { + // These 
six ioctls all need to be broadcast to all PVs. + struct aix_volume_group *group = volume->group; + struct partition_list_entry *partition; + for (partition = group->partition_list; partition; + partition = partition->next) { + rc |= + IOCTL(partition->logical_node, inode, file, + cmd, arg); + } + } + break; + + default: + // Currently the VGE does not send any ioctls down to the + // partitions. Which partition would they go to? + rc = -ENOTTY; + } + + return rc; +} + +/* Function: aix_direct_ioctl + * + * This function provides a method for user-space to communicate directly + * with a plugin in the kernel. + */ +static int +aix_direct_ioctl(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long args) +{ + struct aix_logical_volume *volume = NULL; + struct evms_plugin_ioctl_pkt argument; + int rc = 0; + + MOD_INC_USE_COUNT; + LOG_DEBUG(" Function:%s cmd:%d \n", __FUNCTION__, cmd); + + // Copy user's parameters to kernel space + if (copy_from_user + (&argument, (struct evms_plugin_ioctl *) args, sizeof (argument))) { + MOD_DEC_USE_COUNT; + return -EFAULT; + } + // Make sure this is supposed to be our ioctl. + if (argument.feature_id != plugin_header.id) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + argument.feature_command = 1; + + switch (argument.feature_command) { + + case EVMS_AIX_RESYNC_MIRRORS: + { + struct aix_volume_resync_ioctl aix_lv_resync; + + if (copy_from_user + (&aix_lv_resync, + (struct aix_volume_resync_ioctl *) argument. + feature_ioctl_data, sizeof (aix_lv_resync))) { + rc = -EINVAL; + break; + } + + volume = AIX_get_volume_data(aix_lv_resync.object_name); + + if (volume) { + AIX_schedule_resync(volume, FALSE); + } else { + LOG_DEBUG + (" Function:%s object_name:%s -- no match found\n", + __FUNCTION__, aix_lv_resync.object_name); + rc = -EINVAL; + } + + } + break; + + default: + rc = -EINVAL; + break; + } + + argument.status = rc; + copy_to_user((struct evms_plugin_ioctl *) args, &argument, + sizeof (argument)); + MOD_DEC_USE_COUNT; + return rc; +} + +/* Function: AIX_get_volume_data + * + * Look up an AIX logical volume by object name, walking the volume list of + * every discovered volume group. Returns NULL if no matching volume is found. + */ +static struct aix_logical_volume * +AIX_get_volume_data(char *object_name) +{ + + struct aix_volume_group *VG_ptr; + struct aix_logical_volume *volume = NULL; + int i; + + LOG_DEBUG(" Function:%s object_name:%s \n", __FUNCTION__, object_name); + + if (!object_name || !strlen(object_name)) { + return NULL; + } + + for (VG_ptr = AIXVolumeGroupList; VG_ptr; VG_ptr = VG_ptr->next) { + for (i = 0; VG_ptr->volume_list[i]; i++) { + if (!strcmp(VG_ptr->volume_list[i]->name, object_name)) { + LOG_DEBUG + (" Function:%s FOUND!! volume_name:%s \n", + __FUNCTION__, + VG_ptr->volume_list[i]->name); + volume = VG_ptr->volume_list[i]; + break; + } + } + } + + if (!volume) { + LOG_DEBUG(" Function:%s object_name:%s NOT FOUND !! 
volume:%p \n", + __FUNCTION__, object_name, volume); + } + + return volume; +} + +/* + * Function: init_io_aix + * + */ +static int +init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + struct partition_list_entry *partition; + u64 new_sector = 0; + u64 new_size = 0; + int rc = 0; + u32 le, offset; + + LOG_DEBUG(" ************ init_io_aix() num_sects:" PFU64 + " node:%p sect_nr:" PFU64 "\n", num_sects, node, sect_nr); + + // Init IO needs to deal with the possibility that a request can come + // in that spans PEs or stripes. This is possible because there is no + // limit on num_sects. To fix this, we loop through AIX_remap_sector and + // INIT_IO until num_sects reaches zero. + + while (num_sects > 0) { + + if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size, + &partition, &le, &offset) || (!new_sector || !partition)) { + LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n", + __LINE__); + return -EIO; + } + + LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:" + PFU64 " new_size:" PFU64 "\n", __LINE__, + partition->logical_node, io_flag, new_sector, new_size); + + rc = INIT_IO(partition->logical_node, io_flag, new_sector, + new_size, buf_addr); + num_sects -= new_size; + sect_nr += new_size; + buf_addr = (void *) (((unsigned long) buf_addr) + + (unsigned long) (new_size << EVMS_VSECTOR_SIZE_SHIFT)); + } + + return rc; +} + +/* + * Function: AIXlvm_vge_init + * + */ +int __init +AIXlvm_vge_init(void) +{ + + LOG_DEBUG(" %s --------\n", __FUNCTION__); + + MOD_INC_USE_COUNT; + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +module_init(AIXlvm_vge_init); + +/********** Required Plugin Functions **********/ + +/* + * Function: discover_aix + * + * This is the entry point into the LVM discovery process. + */ +static int +discover_aix(struct evms_logical_node **evms_logical_disk_head) +{ + int rc = 0, count = 0; + + MOD_INC_USE_COUNT; + LOG_DEBUG("[%s] discover_volume_groups\n", __FUNCTION__); + + rc = discover_volume_groups(evms_logical_disk_head); + + if (rc) { + LOG_ERROR("[%s] discover_volume_groups rc=%d\n", __FUNCTION__,rc); + } + + if (AIXVolumeGroupList && !rc) { + + LOG_DEBUG("[%s] discover_logical_volumes\n", __FUNCTION__); + + rc = discover_logical_volumes(); + + if (rc) { + LOG_ERROR("[%s] discover_logical_volumes rc=%d\n", + __FUNCTION__, rc); + } + + LOG_DEBUG("[%s] export_volumes\n", __FUNCTION__); + + count = export_volumes(evms_logical_disk_head); + + LOG_DEBUG("[%s] export_volumes count=%d\n", __FUNCTION__, + count); + } + + MOD_DEC_USE_COUNT; + return (count); +} + +static int +discover_volume_groups(struct evms_logical_node **evms_logical_disk_head) +{ + struct evms_logical_node *logical_node; + struct evms_logical_node *next_node; + struct aix_ipl_rec_area *AIXpv; + struct AIXlvm_rec *AIXlvm; // Temp holder for the LVM on disk rec + + LOG_DEBUG(" Begin %s\n", __FUNCTION__); + + AIXpv = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXpv) { + return -ENOMEM; + } + + // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later + + AIXlvm = kmalloc(sizeof (struct AIXlvm_rec), GFP_KERNEL); + if (!AIXlvm) { + kfree(AIXpv); + return -ENOMEM; + } + + for (logical_node = *evms_logical_disk_head; logical_node; + logical_node = next_node) { + + // Grab the next list item in case we remove this partition from the global list. 
+ next_node = logical_node->next; + + // Read the first sector and see if it has a valid AIX PV signature. + + if (INIT_IO(logical_node, 0, 0, 1, AIXpv)) { + // On an I/O error, continue on to the next + // partition. The group that this partition + // belongs to will be incomplete, but we still + // need to discover any other groups. + + LOG_ERROR(" Error reading PV [%p]\n", logical_node); + continue; + } + + if (AIXpv->IPL_record_id == IPLRECID) { + + // This partition is definitely a PV, + // but is it part of a valid VG? + LOG_DEBUG(" DVG removing node from list logical_node %p\n", + logical_node); + + if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) { + LOG_ERROR(" Error reading PV [%p]\n",logical_node); + continue; + } + + if (AIXlvm->lvm_id == AIX_LVM_LVMID) { + + if (validate_build_volume_group_disk_info( + logical_node, AIXlvm)) { + // Again, continue on and we'll + // clean up later. + continue; + } + + evms_cs_remove_logical_node_from_list( + evms_logical_disk_head, logical_node); + + } else { + LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %d)\n", + AIXlvm->lvm_id); + continue; + } + } else { + LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n", + logical_node); + } + } + + AIX_VOLUME_GROUP_DUMP(); + + if (check_volume_groups()) { + return -EINVAL; + } + + kfree(AIXpv); + kfree(AIXlvm); + + return 0; +} + +/* + * Function: validate_build_volume_group_disk_info + * + * Creates and validates the volume groups found on the disk structures. + * + */ +static int +validate_build_volume_group_disk_info(struct evms_logical_node *logical_node, + struct AIXlvm_rec *AIXlvm) +{ + + struct aix_volume_group *AIXVGLptr = AIXVolumeGroupList; + + LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num); + + while (AIXVGLptr) { + if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) { + break; + } + AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list + } + + if (!AIXVGLptr) { + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__); + AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm); + if (AIXVGLptr) { + AIXVGLptr->next = AIXVolumeGroupList; + AIXVolumeGroupList = AIXVGLptr; + } + } else { + LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n", + AIXVGLptr, __LINE__); + + if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) { + LOG_DEBUG + (" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n", + AIXVGLptr, __LINE__); + } + } + + if (!AIXVGLptr) { + + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr, + __LINE__); + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags); + LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n"); + return -EINVAL; + } else { + + LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n", + AIXVolumeGroupList, __LINE__); + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr, + __LINE__); + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags); + + if (add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num)) { + return -EINVAL; + } + } + + return 0; +} + +/* + * Function: add_VG_data_to_VG_list + * + * Allocate space for a new LVM volume group and all of its sub-fields. + * Initialize the appropriate fields. + */ + +static int +add_VG_data_to_VG_list(struct evms_logical_node *logical_node, + struct aix_volume_group *new_group, short int pvNum) +{ +// int pvh_pos; + +// struct pv_header *AIXpvh; + + // The array of pointer to the logical volumes. 
+ // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps + // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs. + + LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n", pvNum, + new_group->vgda_psn); + +// pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum); + +/* AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXpvh) { + return -ENOMEM; + } + + memset(AIXpvh, 0, AIX_SECTOR_SIZE); + + LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos); + + if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) { + return -EIO; + } + + LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum); +*/ + if (!new_group->volume_list) { + new_group->volume_list = + kmalloc(LVM_MAXLVS * sizeof (struct aix_logical_volume *), + GFP_KERNEL); + if (!new_group->volume_list) { +// kfree(AIXpvh); + return -ENOMEM; + } + memset(new_group->volume_list, 0, + (LVM_MAXLVS * sizeof (struct aix_logical_volume *))); + } + + new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1; + new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2; + new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3; + new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4; +// new_group->numpvs = new_group->AIXvgh->numpvs; +// new_group->numlvs = new_group->AIXvgh->numlvs; +// new_group->lv_max = new_group->AIXvgh->maxlvs; + new_group->pe_size = GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) / + AIX_SECTOR_SIZE; + +// new_group->block_size = 0; +// new_group->hard_sect_size = 0; + new_group->flags |= AIX_VG_DIRTY; + +// kfree(AIXpvh); + + LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2); + + return 0; +} + +/* + * Function: add_PV_to_volume_group + * + * Create a new partition_list_entry for the specified volume group. + * Initialize the new partition with the evms node and lvm pv information, + * and add the new partition to the group's list. + */ + +static int +add_PV_to_volume_group(struct aix_volume_group *group, + struct evms_logical_node *evms_partition, int pvNum) +{ + struct partition_list_entry *new_partition; + + LOG_DEBUG(" APVVG Entering pvNum:%d\n", pvNum); + + group->flags |= AIX_VG_DIRTY; + + for (new_partition = group->partition_list; new_partition != NULL; + new_partition = new_partition->next) { + if (new_partition->logical_node == evms_partition) { + return 0; + } + } + + new_partition = + kmalloc(sizeof (struct partition_list_entry), GFP_KERNEL); + if (!new_partition) { + return -ENOMEM; + } + + memset(new_partition, 0, sizeof (struct partition_list_entry)); + + // Add this partition to this group's list. + new_partition->logical_node = evms_partition; + new_partition->pv_number = pvNum; + + if (evms_partition->hardsector_size > group->hard_sect_size) { + group->hard_sect_size = evms_partition->hardsector_size; + } + if (evms_partition->block_size > group->block_size) { + group->block_size = evms_partition->block_size; + } + + // Add this partition to the beginning of its group's list. 
+ new_partition->next = group->partition_list; + group->partition_list = new_partition; + group->partition_count++; + + LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n", + group->partition_count, pvNum); + + return 0; +} + +/**************************************************** +* +* +* +*****************************************************/ +static struct aix_volume_group * +AIX_create_volume_group(struct evms_logical_node *logical_node, + struct AIXlvm_rec *AIXlvm) +{ + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL; + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL; + struct aix_volume_group *AIXVGLptr; + + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgh) { + return NULL; + } + + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgh2) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgt) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgt2) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + memset(AIXvgh, 0, AIX_SECTOR_SIZE); + memset(AIXvgh2, 0, AIX_SECTOR_SIZE); + memset(AIXvgt, 0, AIX_SECTOR_SIZE); + memset(AIXvgt2, 0, AIX_SECTOR_SIZE); + + // First time thru we want to read this in, we may only have one PV in this group, all others + // may be corrupt, etc. If the info is clean we shouldn't get here. + + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1, + AIXvgt)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1, + AIXvgt2)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]); + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]); + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1)); + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1)); + LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",(int) sizeof (struct aix_volume_group)); + + AIXVGLptr = kmalloc(sizeof (struct aix_volume_group), GFP_KERNEL); + if (!AIXVGLptr) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + memset(AIXVGLptr, 0, sizeof (struct aix_volume_group)); + + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + AIXVGLptr->flags |= AIX_VG_DIRTY; + + LOG_DEBUG("CVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__); + + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL); + if (!AIXVGLptr->AIXvgh) { + kfree(AIXVGLptr); + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header)); + + LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n", + AIXVGLptr->CleanVGInfo); + + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) { + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) { + if (COMPARE_TIMESTAMPS + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) { + if (COMPARE_TIMESTAMPS + (AIXvgh->vg_timestamp, + AIXvgh2->vg_timestamp)) { + // All timestamps match. Yea! 
+ AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_VALID; + } else { + // Both VGDAs are good, but timestamps are + // different. Can't tell yet which one is + // correct. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_EITHER_VGDA; + } + } else { + // First VGDA is good, second is bad. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_FIRST_VGDA; + } + } else { + if (COMPARE_TIMESTAMPS + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) { + // First VGDA is bad, second is good. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_SECOND_VGDA; + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID; + } else { + // This should never happen. + LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n", + AIXVGLptr->vg_id.word2); + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + + } + } + + LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n", + AIXVGLptr->CleanVGInfo); + + switch (AIXVGLptr->CleanVGInfo) { + case AIX_PV_STATE_VALID: + case AIX_PV_STATE_FIRST_VGDA: + + LOG_DEBUG("CVG SWITCH VALID %d size:%d\n", + AIXVGLptr->CleanVGInfo, + (int) sizeof (struct vg_header)); + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + break; + + case AIX_PV_STATE_SECOND_VGDA: + LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n", + AIXVGLptr->CleanVGInfo, + (int) sizeof (struct vg_header)); + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + break; + + case AIX_PV_STATE_EITHER_VGDA: + LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n", + AIXVGLptr->CleanVGInfo,(int) sizeof (struct vg_header)); + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) { + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + } else { + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + // Not sure where this PV belongs. It thinks it is + // supposed to be in two different containers. We will + // probably need to put this on a separate, temporary + // list, and determine later which container is missing + // a PV. 
+ } + break; + + default: + LOG_ERROR("Invalid PV state (%d) for %d\n", + AIXVGLptr->CleanVGInfo, + AIXVGLptr->vg_id.word2); + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + break; + } + + } + + // Currently AIX Big VGDA is not supported - cleanup and return NULL so this VG doesn't get added + + if (AIXVGLptr->AIXvgh->bigvg != 0) { + LOG_SERIOUS("Error creating Volume Group AIX Big VGDA is not currently supported\n"); + if (AIXVGLptr->AIXvgh) { + kfree(AIXVGLptr->AIXvgh); + AIXVGLptr->AIXvgh = NULL; + } + + if (AIXVGLptr) { + kfree(AIXVGLptr); + AIXVGLptr = NULL; + } + + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return NULL; + } + + add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num); + + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + + LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo); + + return AIXVGLptr; +} + +/**************************************************** +* +* +* +*****************************************************/ +static int +AIX_update_volume_group(struct aix_volume_group *AIXVGLptr, + struct evms_logical_node *logical_node, + struct AIXlvm_rec *AIXlvm) +{ + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL; + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL; + + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgh) { + return -ENOMEM; + } + + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgh2) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgt) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXvgt2) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + // First time thru we want to read this in, we may only have one PV in this group, all others + // may be corrupt, etc. If the info is clean we shouldn't get here. 
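+ // Both on-disk VGDA copies are read below (header and trailer for each);
+ // the timestamp comparisons that follow decide which copy, if either,
+ // is authoritative (see the AIX_PV_STATE_* values).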
+ + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1, + AIXvgt)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1, + AIXvgt2)) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]); + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]); + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1)); + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1)); + + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + AIXVGLptr->flags |= AIX_VG_DIRTY; + + LOG_DEBUG("UVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__); + + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL); + if (!AIXVGLptr->AIXvgh) { + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + return -ENOMEM; + } + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header)); + + LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo); + + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) { + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) { + if (COMPARE_TIMESTAMPS + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) { + if (COMPARE_TIMESTAMPS + (AIXvgh->vg_timestamp, + AIXvgh2->vg_timestamp)) { + // All timestamps match. Yea! + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_VALID; + } else { + // Both VGDAs are good, but timestamps are + // different. Can't tell yet which one is + // correct. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_EITHER_VGDA; + } + } else { + // First VGDA is good, second is bad. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_FIRST_VGDA; + } + } else { + if (COMPARE_TIMESTAMPS + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) { + // First VGDA is bad, second is good. + AIXVGLptr->CleanVGInfo = + AIX_PV_STATE_SECOND_VGDA; + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID; + } else { + // This should never happen. + LOG_DEBUG + ("All four VG timestamps for %d are different. What happened?!?\n", + AIXVGLptr->vg_id.word2); + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + + } + } + + LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n", + AIXVGLptr->CleanVGInfo); + + switch (AIXVGLptr->CleanVGInfo) { + case AIX_PV_STATE_VALID: + case AIX_PV_STATE_FIRST_VGDA: + + LOG_DEBUG("UVG SWITCH VALID %d size:%d\n", + AIXVGLptr->CleanVGInfo, + (int) sizeof (struct vg_header)); + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + break; + + case AIX_PV_STATE_SECOND_VGDA: + LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n", + AIXVGLptr->CleanVGInfo, + (int) sizeof (struct vg_header)); + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. 
we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + break; + + case AIX_PV_STATE_EITHER_VGDA: + LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n", + AIXVGLptr->CleanVGInfo, + (int) sizeof (struct vg_header)); + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) { + + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need + + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0]; + AIXVGLptr->vgda_len = AIXlvm->vgda_len; + } else { + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + // Not sure where this PV belongs. It thinks it is + // supposed to be in two different containers. We will + // probably need to put this on a separate, temporary + // list, and determine later which container is missing + // a PV. + } + break; + + default: + LOG_ERROR("UVG Invalid PV state (%d) for %d\n", + AIXVGLptr->CleanVGInfo, + AIXVGLptr->vg_id.word2); + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID; + break; + } + + } + +// add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num); + AIXVGLptr->flags |= AIX_VG_DIRTY; + + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2); + + LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo); + + return 0; +} + +/**************************************************** +* Function: check_volume_groups +* +* We just want to make sure the volume groups have found +* all their drives. +* +* If not, we'll continue and build what we can +*****************************************************/ +static int +check_volume_groups(void) +{ + struct aix_volume_group *group; + struct aix_volume_group *next_group; +// struct partition_list_entry *partitions; +// int NumPVS = 0; + + LOG_DEBUG("CHVG Checking volume groups:\n"); + + + for (group = AIXVolumeGroupList; group; group = next_group) { + next_group = group->next; + + if (group->flags & AIX_VG_DIRTY){ + if (group->AIXvgh->numlvs == 0) { + remove_group_from_list(group); + deallocate_volume_group(group); + } else { + if (group->partition_count != group->AIXvgh->numpvs) { + group->flags |= AIX_VG_INCOMPLETE; + LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n", + group->flags); + LOG_ERROR("CHVG Found %d PVs should have %d PVs\n", + group->partition_count, group->AIXvgh->numpvs); + } + } + } + } + + LOG_DEBUG("CHVG Finished Checking volume groups:\n"); + return 0; + +} + +/************************************************************************ + * Function: discover_logical_volumes + * + * After all PVs have been claimed and added to the appropriate VG list, + * the volumes for each VG must be constructed. 
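+ * Each group's lv_entries records and namelist are read from the VGDA;
+ * entries whose lv_state is zero or whose permissions are out of range are
+ * skipped, and a new aix_logical_volume is built for each valid entry that
+ * has not already been discovered.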
+ * + * + */ +static int +discover_logical_volumes(void) +{ + + struct aix_volume_group *AIXVGLPtr; + struct aix_logical_volume *new_LV; + struct partition_list_entry *partition; + struct evms_logical_node *node; + struct lv_entries *AIXlvent, *AIXlventHead; + int j, lv_found, all_lvs_found, rc; + struct namelist *AIXnamelist; + char *NameBuffer; + + AIXlventHead = + kmalloc(MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXlventHead) { + return -ENOMEM; + } + + memset(AIXlventHead, 0, (MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE)); + + NameBuffer = + kmalloc(MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE, GFP_KERNEL); + if (!NameBuffer) { + kfree(AIXlventHead); + return -ENOMEM; + } + + memset(NameBuffer, 0, (MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE)); + + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr; + AIXVGLPtr = AIXVGLPtr->next ) { + + partition = AIXVGLPtr->partition_list; + + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) { + continue; + } + + if (partition == NULL) { + continue; + } + + node = partition->logical_node; + + if (node == NULL) { + continue; + } + + LOG_DEBUG("DLV INIT_IO AIXNameList position:%d\n", + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - + MAX_SECTORS_NAMELIST)); + LOG_DEBUG("AIXVGLPTR:%p partition:%p node:%p \n", AIXVGLPtr, + partition, node); + + if (INIT_IO(node, 0, + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 - + MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST, + NameBuffer)) { + continue; + } + + LOG_DEBUG("DLV INIT_IO AIXNameList\n"); + + if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC, + MAX_SECTORS_LV_ENTRIES, AIXlventHead)) { + continue; + } + AIXlvent = AIXlventHead; + AIXnamelist = (struct namelist *) NameBuffer; + + LOG_DEBUG("DLV INIT_IO AIXlvent\n"); + // Search through the LV structs for valid LV entries + // We're just going to search until all valid LVs are found + // The max. allowable LVs is 256 and we want don't want to + // search for 255 if only 8 are defined 1-8 however, there + // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc. + + for (j = 0, lv_found = 0, all_lvs_found = FALSE; + !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) { + + LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n", + AIXlvent->num_lps, AIXnamelist->name[j], j, + AIXlvent->lvname); + LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n", + AIXlvent->striping_width, + GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp), + AIXlvent->lv_state); + LOG_DEBUG(" DVIG Group:%x.Access:%x\n", + (unsigned int) AIXVGLPtr->vg_id.word2, + AIXlvent->permissions); + LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n", + AIXlvent->mirror, AIXlvent->mirror_policy, + AIXlvent->mirwrt_consist); + + // This is the same check we used in "diskedit" and "readdisk" + if (AIXlvent->lv_state == 0 || + AIXlvent->permissions > 0x10) { + continue; + } + + lv_found++; + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) { + all_lvs_found = TRUE; + } + + LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n", + lv_found, all_lvs_found); + + // Create a new logical volume and place it in the appropriate + // spot in this VG's volume list. For re-discovery, make sure + // this volume does not already exist. 
+ if (!AIXVGLPtr->volume_list[AIXlvent->lvname]) { + new_LV = + new_logical_volume(AIXlvent, + AIXVGLPtr, + AIXnamelist-> + name[j], + GET_PHYSICAL_PART_SIZE + (AIXlvent-> + stripe_exp)); + if (!new_LV) { + continue; + } + LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n", + new_LV->lv_number,AIXVGLPtr->vg_id.word2); + + AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV; + } else { + LOG_DEBUG("DVIG Updating Vol Exists\n"); + } + } + + // Build the le_to_pe_map for each volume that was discovered above. + // This has to be done after all volumes in the group are discovered + if ((rc = build_pe_maps(AIXVGLPtr))) { + continue; + } + + check_log_volume_and_pe_maps(AIXVGLPtr); + } + + kfree(NameBuffer); + kfree(AIXlventHead); + + return 0; +} + +/* + * Function: new_logical_volume + * + * Allocate space for a new LVM logical volume, including space for the + * PE map + */ +static struct aix_logical_volume * +new_logical_volume(struct lv_entries *AIXlvent, + struct aix_volume_group *volume_group, + char *lv_name, u32 stripesize) +{ + + struct aix_logical_volume *new_volume; + const char *name = "evms_AIXiod"; + const char *resync_name = "evms_AIXresync"; + + LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n", + AIXlvent->lvname, AIXlvent->num_lps, + AIXlvent->num_lps * volume_group->pe_size); + + // Allocate space for the new logical volume. + new_volume = kmalloc(sizeof (struct aix_logical_volume), GFP_KERNEL); + if (!new_volume) { + return NULL; + } + memset(new_volume, 0, sizeof (struct aix_logical_volume)); + + // Allocate space for the LE to PE mapping table + // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based + new_volume->le_to_pe_map = + kmalloc((AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry), + GFP_KERNEL); + if (!new_volume->le_to_pe_map) { + delete_logical_volume(new_volume); + return NULL; + } + + memset(new_volume->le_to_pe_map, 0, + (AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry)); + + if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) { + new_volume->le_to_pe_map_mir1 = + kmalloc((AIXlvent->num_lps + + 1) * sizeof (struct pe_table_entry), GFP_KERNEL); + if (!new_volume->le_to_pe_map_mir1) { + delete_logical_volume(new_volume); + return NULL; + } + memset(new_volume->le_to_pe_map_mir1, 0, + (AIXlvent->num_lps + + 1) * sizeof (struct pe_table_entry)); + } + + if (AIXlvent->mirror == AIX_MAX_MIRRORS) { + new_volume->le_to_pe_map_mir2 = + kmalloc((AIXlvent->num_lps + 1) + * sizeof (struct pe_table_entry), GFP_KERNEL); + if (!new_volume->le_to_pe_map_mir2) { + delete_logical_volume(new_volume); + return NULL; + } + memset(new_volume->le_to_pe_map_mir2, 0, + (AIXlvent->num_lps +1) + * sizeof (struct pe_table_entry)); + } + + // Initialize the rest of the new volume. + new_volume->lv_number = AIXlvent->lvname; + new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size); + new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new. 
+ new_volume->lv_status = AIXlvent->lv_state; + //new_volume->lv_minor = MINOR(1); + new_volume->mirror_copies = AIXlvent->mirror; +// new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING; + new_volume->stripes = AIXlvent->striping_width; + new_volume->stripe_size = stripesize; + new_volume->stripe_size_shift = evms_cs_log2(stripesize); + new_volume->pe_size = volume_group->pe_size; + new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size); + new_volume->num_le = AIXlvent->num_lps; +// new_volume->new_volume = TRUE; + new_volume->group = volume_group; + + volume_group->numlvs++; + + sprintf(new_volume->name, "aix/%s", lv_name); + + if (!AIX_BH_list_pool + && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) { + + // We only need the ReSync thread if we have at least one mirrored LV. + // You can't ReSync a non-mirrored drive + + AIX_BH_list_pool = + evms_cs_create_pool(sizeof (struct aix_mirror_bh), + "EVMS_AIX_BH", aix_notify_cache_ctor, + NULL); + if (!AIX_BH_list_pool) { + return NULL; + } + + AIX_mirror_read_retry_thread = + evms_cs_register_thread(AIXiod, NULL, name); + + AIX_mirror_resync_thread = + evms_cs_register_thread(AIXresync, NULL, + resync_name); + } + + LOG_DEBUG("NLV lv_number:%d name:%s lv_size " PFU64 " \n", + new_volume->lv_number, new_volume->name, new_volume->lv_size); + LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n", + new_volume->stripe_size, new_volume->stripe_size_shift); + + return new_volume; +} + +/* + * Function: aix_notify_cache_ctor + * this function initializes the b_wait field in the buffer heads + * in our private buffer head pool. + */ +static void +aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + struct aix_mirror_bh *rbh = (struct aix_mirror_bh *) foo; + memset(rbh, 0, sizeof (struct aix_mirror_bh)); + rbh->remaining = (atomic_t) ATOMIC_INIT(0); + init_waitqueue_head(&rbh->bh_req.b_wait); + } +} + +/* + * Function: build_pe_maps + * + * After all logical volumes have been discovered, the mappings from + * logical extents to physical extents must be constructed. Each PV + * contains a map on-disk of its PEs. Each PE map entry contains the + * logical volume number and the logical extent number on that volume. + * Our internal map is the reverse of this map for each volume, listing + * the PV node and sector offset for every logical extent on the volume. 
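+ * AIX logical partition numbers are 1-based, so lp_num - 1 is used as the
+ * index into le_to_pe_map; mirrored volumes keep parallel le_to_pe_map_mir1
+ * and le_to_pe_map_mir2 tables for the first and second alternate copies.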
+ */ +static int + build_pe_maps(struct aix_volume_group *volume_group) +{ + struct partition_list_entry *partition; + struct partition_list_entry *mirror_partition; + struct pp_entries *AIXppent, *AIXppent_buff; + struct pv_header *AIXpvh; + u64 offset; + u32 le_number; + u32 j, pp_count, pvh_pos; + u32 MirrorFound; + u32 pvh_posn[LVM_MAXPVS]; + u32 rc; +#ifdef EVMS_DEBUG_MIRRORS + u32 lv_found, all_lvs_found; + u32 mirs = 0; +#endif + + LOG_DEBUG(" *** BPEM ***\n"); + // For every partition in this VG + + AIXppent_buff = kmalloc(AIX_SECTOR_SIZE * PHYS_VOL_OFFSET, GFP_KERNEL); + if (!AIXppent_buff) { + return -ENOMEM; + } + + memset(AIXppent_buff, 0, AIX_SECTOR_SIZE * PHYS_VOL_OFFSET); + memset(pvh_posn, 0, LVM_MAXPVS); + + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXpvh) { + kfree(AIXppent_buff); + return -ENOMEM; + } + + memset(AIXpvh, 0, AIX_SECTOR_SIZE); + + LOG_DEBUG(" BPEM AIXppent_buff:%d \n", + (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET)); + + // This next section is to calculate the sector spacing between PV info for the VG + // AIX doesn't always space the info. the same. It could be 17 or 34 sectors apart + // depending on the PE size selected. + + rc = AIX_pvh_data_posn(volume_group->vgda_psn, pvh_posn, volume_group->partition_list, volume_group->AIXvgh->numpvs); + + if (rc != 0) { + kfree(AIXppent_buff); + kfree(AIXpvh); + return (rc); + } + + for (partition = volume_group->partition_list; partition; + partition = partition->next) { + + LOG_DEBUG(" BPEM partition:%p next:%p\n", partition, + partition->next); + + pvh_pos = pvh_posn[partition->pv_number]; + + LOG_DEBUG(" BPEM pvh_pos:%d pv_number:%d\n", pvh_pos, partition->pv_number); + + if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) { + kfree(AIXppent_buff); + kfree(AIXpvh); + return -EIO; + } + // For every entry in the PE map, calculate the PE's sector offset + // and update the correct LV's PE map. LV number of 0 marks an unused PE. + // For re-discovery, only compute entries for new volumes. 
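+ // Entries whose pp_state is AIX_LVM_LVUNDEF are also skipped, and each
+ // PE's sector offset is computed as (pp index * pe_size) + psn_part1.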
+ + if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH, + AIXppent_buff)) { + kfree(AIXppent_buff); + kfree(AIXpvh); + return -EIO; + } + + AIXppent = AIXppent_buff; + AIXppent++; + + pp_count = AIXpvh->pp_count; + + LOG_DEBUG("BPEM AIXpvh data: pp_count:%d psn_part1:%d pv_id1:%d pv_id2:%d pv_id3:%d pv_id4:%d pv_num:%d pv_state:%d vgdas:%d res1:%d res2:%d\n", AIXpvh->pp_count, + AIXpvh->psn_part1, + AIXpvh->pv_id.word1, + AIXpvh->pv_id.word2, + AIXpvh->pv_id.word3, + AIXpvh->pv_id.word4, + AIXpvh->pv_num, + AIXpvh->pv_state, AIXpvh->pvnum_vgdas, AIXpvh->res1, AIXpvh->res2); + + LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n", + volume_group->vg_id.word2, AIXpvh->pv_num, partition, + partition->next, AIXppent->lv_index, pp_count); + + for (j = 0; j < pp_count; j++,AIXppent++) { + if (!AIXppent->lv_index || AIXppent->pp_state == AIX_LVM_LVUNDEF) { + continue; + } + + LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%d cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n", + volume_group->vg_id.word2, j + 1, + AIXppent->pp_state, + volume_group->volume_list[AIXppent->lv_index -1]->name, + AIXppent->lv_index, AIXppent->lp_num, + AIXppent->copy, AIXppent->fst_alt_vol, + AIXppent->fst_alt_part, + AIXppent->snd_alt_vol, + AIXppent->snd_alt_part); + + le_number = AIXppent->lp_num - 1; // AIX lp's start @ 1, we want a 0 index + offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1); + + LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n", + le_number, partition, AIXppent->lv_index, + volume_group->volume_list[AIXppent->lv_index -1]->name); + + if (!volume_group->volume_list[AIXppent->lv_index - 1]) { + LOG_SERIOUS("Failed attempt to access volume without memory allocation lv:%d\n", + AIXppent->lv_index - 1); + continue; + } + + if (volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map + && le_number <= volume_group->volume_list[AIXppent->lv_index - 1]->num_le) { + + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].owning_pv = partition; + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pe_sector_offset = offset; + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pp_state = AIXppent->pp_state; + } + + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies > + AIX_DEFAULT_MIRRORING) { + + LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n", + AIXppent->lv_index); + + for (mirror_partition = volume_group->partition_list, + MirrorFound = FALSE; + mirror_partition && !MirrorFound; + mirror_partition = mirror_partition->next) { + + if (mirror_partition->pv_number == AIXppent->fst_alt_vol) { + + offset = (((AIXppent->fst_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1); + + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition; + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset; + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pp_state = AIXppent->pp_state; + + LOG_EXTRA(" PE Map: mirror_partition:%p \n", + mirror_partition); + LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n", + AIXppent->fst_alt_part); + + MirrorFound = TRUE; + } + } + + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies == AIX_MAX_MIRRORS) { + + for (mirror_partition = volume_group->partition_list, + MirrorFound = FALSE; + mirror_partition && !MirrorFound; + mirror_partition = mirror_partition->next) { 
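+					// Second pass over the PV list: find the partition that
+					// holds the second mirror copy (snd_alt_vol / snd_alt_part).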
+ + if (mirror_partition->pv_number == AIXppent->snd_alt_vol) { + + offset = (((AIXppent->snd_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1); + + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition; + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset; + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pp_state = AIXppent->pp_state; + + LOG_EXTRA(" PE Map: mirror_partition2:%p \n", + mirror_partition); + LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n", + AIXppent->snd_alt_part); + + MirrorFound = TRUE; + } + } + } + + } // End of if mirroring is enabled + } + } + +// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs); + +#ifdef EVMS_DEBUG_MIRRORS + for (mirs = 0, lv_found = 0, all_lvs_found = FALSE; + !all_lvs_found && mirs < LVM_MAXLVS; mirs++) { + + if (volume_group->volume_list[mirs] != NULL) { + if (volume_group->volume_list[mirs]->lv_status == + LV_ACTIVE) { + + lv_found++; + + LOG_DEBUG(" PE Map: owning part lv %d -- %p\n", + mirs, + volume_group->volume_list[mirs]-> + le_to_pe_map[0].owning_pv); + if (volume_group->volume_list[mirs]-> + mirror_copies > AIX_DEFAULT_MIRRORING) { + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", + mirs, + volume_group->volume_list[mirs]-> + le_to_pe_map_mir1[0].owning_pv); + } + if (volume_group->volume_list[mirs]-> + mirror_copies == AIX_MAX_MIRRORS) { + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n", + mirs, + volume_group->volume_list[mirs]-> + le_to_pe_map_mir2[0].owning_pv); + } + } + if (lv_found == volume_group->AIXvgh->numlvs) { + all_lvs_found = TRUE; + LOG_DEBUG(" PE Map: all_lvs_found\n"); + } + } + } +#endif + + kfree(AIXpvh); + kfree(AIXppent_buff); + + return 0; +} + +/* + * Function: check_log_volume_and_pe_maps + * + * Make sure all volumes in this group have valid LE-to-PE maps. + * Any volume that doesn't is deleted. This is safe for re-discovery + * because only new volumes could have corrupted PE maps. + */ +static int +check_log_volume_and_pe_maps(struct aix_volume_group *group) +{ + struct aix_logical_volume *volume; + int i, j, lv_found, all_lvs_found; + + LOG_DEBUG(" check_pe_map.\n"); + + for (i = 0, all_lvs_found = FALSE, lv_found = 0; + !all_lvs_found && i < LVM_MAXLVS; i++) { + if (!group->volume_list[i]) { + LOG_DEBUG(" CPEM No Volume %d found \n", i); + continue; + } + + volume = group->volume_list[i]; + if (!volume->le_to_pe_map) { + LOG_DEBUG(" CPEM Volume %s has no PE map.\n", + volume->name); + delete_logical_volume(volume); + continue; + } + + LOG_DEBUG(" CPEM volume %s num_le: %d \n", volume->name, + volume->num_le); + + lv_found++; + + if (lv_found == group->AIXvgh->numlvs) { + all_lvs_found = TRUE; + } + + for (j = 0; j < volume->num_le; j++) { + if (!volume->le_to_pe_map[j].owning_pv || + !volume->le_to_pe_map[j].pe_sector_offset) { + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n", + volume->name, j); + volume->lv_access |= EVMS_LV_INCOMPLETE; + } + + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) { + if (!volume->le_to_pe_map_mir1[j].owning_pv || + !volume->le_to_pe_map_mir1[j]. + pe_sector_offset) { + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n", + volume->name, j); + volume->lv_access |= EVMS_LV_INCOMPLETE; + } + + if (volume->mirror_copies == AIX_MAX_MIRRORS) { + if (!volume->le_to_pe_map_mir2[j]. + owning_pv + || !volume->le_to_pe_map_mir2[j]. 
+ pe_sector_offset) { + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n", + volume->name, j); + volume->lv_access |= EVMS_LV_INCOMPLETE; + } + } + } + } + } + + LOG_EXTRA(" Leaving check_pe_map.\n"); + return 0; +} + +/* + * Function: export_volumes + * + * The last thing this VGE must do is take each constructed volume and + * place it back on the evms logical partition list. + */ +static int +export_volumes(struct evms_logical_node **evms_partition_list) +{ + struct aix_volume_group *AIXVGLPtr; + struct evms_logical_node *new_node; + struct aix_logical_volume *volume; + int j, lv_found, all_lvs_found; + int count = 0; + + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr; AIXVGLPtr = AIXVGLPtr->next) { + + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) { + LOG_DEBUG(" EV Existing group(%d), not dirty, skipping\n", + AIXVGLPtr->vg_id.word2); + continue; + } + LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n", + AIXVGLPtr->AIXvgh->numpvs, AIXVGLPtr->numlvs); + + // Export every valid volume in the group. For re-discovery, + // make sure we are only exporting "new" volumes. + + for (j = 0, all_lvs_found = FALSE, lv_found = 0; + !all_lvs_found && j < LVM_MAXLVS; j++) { + if (AIXVGLPtr->volume_list[j] != NULL) { + if (AIXVGLPtr->volume_list[j]->lv_access & EVMS_LV_NEW) { + + LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n", + j,AIXVGLPtr->volume_list[j]); + + volume = AIXVGLPtr->volume_list[j]; + lv_found++; + + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) { + all_lvs_found = TRUE; + } + // For new volumes, create a new EVMS node and + // initialize the appropriate fields. + if (evms_cs_allocate_logical_node(&new_node)) { + LOG_DEBUG(" Export Vol Error allocating node !!\n"); + continue; + } else { + LOG_DEBUG(" EV Node allocated OK\n"); + } + +// volume->new_volume = 0; + volume->volume_node = new_node; + volume->lv_access &= (~EVMS_LV_NEW); + new_node->hardsector_size = AIXVGLPtr->hard_sect_size; + new_node->block_size = AIXVGLPtr->block_size; + new_node->plugin = &plugin_header; + new_node->private = volume; + new_node->total_vsectors = volume->lv_size; + + LOG_DEBUG(" EV volume->name:[%s]\n", + volume->name); + + strncpy(new_node->name,volume->name, + EVMS_VOLUME_NAME_SIZE + 1); + + // Is the volume read-only? + if (!(volume->lv_access & AIX_LV_WRITE) + || volume->lv_access & EVMS_LV_INCOMPLETE) + { + new_node->flags |= EVMS_VOLUME_SET_READ_ONLY; + LOG_DEBUG(" EV Read Only volume->lv_access:%d\n", + volume->lv_access); + } + + evms_cs_add_logical_node_to_list(evms_partition_list, + new_node); + count++; + + LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n", + volume, new_node,new_node->name); + } else { + evms_cs_add_logical_node_to_list(evms_partition_list, + AIXVGLPtr->volume_list[j]->volume_node); + count++; + LOG_DEBUG(" ELV vol_list[%d]%p\n", j, + AIXVGLPtr->volume_list[j]); + } + } else { + LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j); + } + } // end checking all lvs + + AIXVGLPtr->flags &= ~AIX_VG_DIRTY; + } + + return count; +} + +/* + * Function: delete_logical_volume + * + * This function deletes the in-memory representation of a single LVM + * logical volume, including its PE map and any snapshot data. It does + * not alter the parent volume group, except to remove this volume from + * its volume list. + */ +static int +delete_logical_volume(struct aix_logical_volume *volume) +{ + struct aix_volume_group *group = volume->group; + + LOG_DEBUG(" Deleting volume %s\n", volume->name); + + // Now free up all the memory. 
This includes the LE-to-PE map, any + // mirror PEs, etc. + if (volume->le_to_pe_map) { + kfree(volume->le_to_pe_map); + volume->le_to_pe_map = NULL; + } + + if (volume->le_to_pe_map_mir1) { + kfree(volume->le_to_pe_map_mir1); + volume->le_to_pe_map_mir1 = NULL; + } + + if (volume->le_to_pe_map_mir2) { + kfree(volume->le_to_pe_map_mir2); + volume->le_to_pe_map_mir2 = NULL; + } + // Remove this volume from the volume-group's list. + if (group && group->volume_list[volume->lv_number] == volume) { + group->volume_list[volume->lv_number] = NULL; + group->numlvs--; + } + + kfree(volume); + + return 0; +} + +/* Function: remove_group_from_list + * + * Remove an LVM volume group from the global LVM list. + */ +static int +remove_group_from_list(struct aix_volume_group *group) +{ + struct aix_volume_group **p_group; + + for (p_group = &AIXVolumeGroupList; *p_group; + p_group = &(*p_group)->next) { + if (*p_group == group) { + *p_group = (*p_group)->next; + group->next = NULL; + break; + } + } + return 0; +} + +/* + * Function: delete_aix_node + * + * This function deletes the in-memory representation of an LVM + * logical volume. Right now it makes a lot of assumptions about + * the data in the group not being corrupted. It would be possible + * to put in a lot of consistency checks before deleting everything + * to indicate if problems have occurred during the lifetime of the + * volume and its volume group. + */ +static int +delete_aix_node(struct evms_logical_node *logical_node) +{ + struct aix_logical_volume *volume = + (struct aix_logical_volume *) (logical_node->private); + struct aix_volume_group *group = volume->group; + + if (delete_logical_volume(volume)) { + return -EINVAL; + } + // If we just removed the last volume from this group, the entire group + // can also be deleted. + if (group && group->numlvs == 0) { + remove_group_from_list(group); + deallocate_volume_group(group); + } + // Free the logical node. + evms_cs_deallocate_logical_node(logical_node); + + return 0; +} + +/* Function: deallocate_volume_group + * + * This function deletes the entire in-memory representation of an LVM + * volume group, including all partitions and logical volumes. If this + * group is on the VGE's volume group list, it is removed. + */ +static int +deallocate_volume_group(struct aix_volume_group *group) +{ + struct partition_list_entry *partition; + struct partition_list_entry *next_part; + int i; + + LOG_DEBUG(" Deleting volume group %x\n", group->vg_id.word2); + + // Delete all partitions from the group's list. + for (partition = group->partition_list; partition; + partition = next_part) { + + next_part = partition->next; + + if (partition->logical_node) { + // Send a delete command down to the partition manager. + LOG_DEBUG(" Deleting PV %d from group %x\n", + partition->pv_number, group->vg_id.word2); + DELETE(partition->logical_node); + } + kfree(partition); + } + + // Delete all logical volumes, and the array of pointers. + for (i = 0; i < LVM_MAXLVS; i++) { + if (group->volume_list[i]) { + delete_logical_volume(group->volume_list[i]); + } + } + + kfree(group); + + return 0; +} + +/* Function: end_discover_aix + * + * The discovery process at the region-manager level is now iterative, + * much like the EVMS feature level. To accomplish this correctly, and + * also to accomplish partial volume discovery, a second discover + * entry point is needed, so EVMS can tell the region managers that + * discovery is over, and to finish up any discovery that is not yet + * complete. 
When this function is called, it should be assumed that + * the node list has had nothing new added to it since the last call + * of the regular discover function. Therefore, when this function is + * called, we do not need to try to discovery any additional volume + * groups. We will, however, look for logical volumes once more. This + * gives us the ability to export (read-only) volumes that have + * partially corrupted LE maps due to missing PVs in their VG. + */ +static int +end_discover_aix(struct evms_logical_node **evms_logical_disk_head) +{ + + int rc; + + MOD_INC_USE_COUNT; + LOG_DEBUG("Final Discovery:\n"); + + rc = discover_logical_volumes(); + + if (!rc) { + rc = export_volumes(evms_logical_disk_head); + + lvm_cleanup(); + } + + MOD_DEC_USE_COUNT; + return rc; +} + +/**************************************************** +* Function: AIX_alloc_wbh +* +* Alloc any buffer heads from the pool and return a linked list +* +* +*****************************************************/ +static struct aix_mirror_bh * +AIX_alloc_wbh(struct evms_logical_node *node, + struct evms_logical_node *node2, + struct evms_logical_node *node3, + struct buffer_head *bh, + u32 mirror_copies, u32 le, u64 new_sector2, u64 new_sector3) +{ + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL; + int i; + + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE); + + if (!head_bh) { + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + head_bh->master_bh = bh; + head_bh->mirror_bh_list = NULL; + head_bh->remaining = (atomic_t) ATOMIC_INIT(0); + + for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) { + + tmp_bh = + evms_cs_allocate_from_pool(AIX_BH_list_pool, + EVMS_BLOCKABLE); + if (!tmp_bh) { + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + tmp_bh->next_r1 = head_bh->mirror_bh_list; + head_bh->mirror_bh_list = tmp_bh; + atomic_inc(&head_bh->remaining); + + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head)); + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0); + init_waitqueue_head(&tmp_bh->bh_req.b_wait); + //tmp_bh->bh_req.b_size = bh->b_size; + + switch (i) { + + case AIX_DEFAULT_MIRRORING: + tmp_bh->node = node; + tmp_bh->bh_req.b_rsector = bh->b_rsector; + break; + + case AIX_FIRST_MIRROR: + tmp_bh->node = node2; + tmp_bh->bh_req.b_rsector = new_sector2; + break; + + case AIX_MAX_MIRRORS: + tmp_bh->node = node3; + tmp_bh->bh_req.b_rsector = new_sector3; + break; + } + + tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine + tmp_bh->bh_req.b_private = (void *) head_bh; + + } + + return head_bh; + +} + +/**************************************************** +* Function: AIX_handle_write_mirror_drives +* +* Handles a write from a set of mirrored AIX LVs + +* +* +*****************************************************/ +static void +AIX_handle_write_mirror_drives(struct buffer_head *bh, int uptodate) +{ + struct aix_logical_volume *volume; + struct evms_logical_node *node; + struct aix_mirror_bh *tmp_bh = NULL, *tmp_bh2 = NULL; + kdev_t tmp_b_rdev; + u32 count, le = 0; + + tmp_bh = (struct aix_mirror_bh *) bh->b_private; + tmp_b_rdev = tmp_bh->master_bh->b_rdev; + node = tmp_bh->node; + volume = (struct aix_logical_volume *) node->private; + + LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", + node, bh->b_state, uptodate, volume->mirror_copies); + + if (!uptodate) { + le = tmp_bh->le; + + switch (tmp_bh->iteration) { + case 
AIX_DEFAULT_MIRRORING: + volume->le_to_pe_map[le].pp_state += AIX_LVM_LVSTALE; + break; + + case AIX_FIRST_MIRROR: + volume->le_to_pe_map_mir1[le].pp_state += + AIX_LVM_LVSTALE; + break; + + case AIX_MAX_MIRRORS: + volume->le_to_pe_map_mir2[le].pp_state += + AIX_LVM_LVSTALE; + break; + } + + AIX_evms_cs_notify_lv_io_error(node); + } + + if (atomic_dec_and_test(&tmp_bh->remaining)) { + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate); + tmp_bh2 = tmp_bh->mirror_bh_list; + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh); + + while (tmp_bh2) { + tmp_bh = tmp_bh2->next_r1; + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2); + tmp_bh2 = tmp_bh; + } + + evms_cs_volume_request_in_progress(tmp_b_rdev, + AIX_DECREMENT_REQUEST, + &count); + } + + return; +} + +/**************************************************** +* Function: AIX_alloc_rbh +* +* Alloc any buffer heads from the pool and return a linked list +* +* +*****************************************************/ +static struct aix_mirror_bh * +AIX_alloc_rbh(struct evms_logical_node *node, + struct buffer_head *bh, + u32 mirror_copies, u32 le, u64 org_sector, int cmd) +{ + struct aix_mirror_bh *tmp_bh = NULL; + + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE); + + if (!tmp_bh) { + LOG_SERIOUS + ("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head)); + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0); + tmp_bh->node = node; + tmp_bh->master_bh = bh; + tmp_bh->iteration = AIX_FIRST_MIRROR; + //tmp_bh->eio.rsector = eio->rsector; + //tmp_bh->eio.rsize = eio->rsize; + tmp_bh->le = le; + //tmp_bh->eio.bh = &tmp_bh->bh_req; + + if (cmd == AIX_LV_READ) { + tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine + } else { + tmp_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions; //setup callback routine + } + + tmp_bh->bh_req.b_private = (void *) tmp_bh; + + tmp_bh->cmd = cmd; + tmp_bh->next_r1 = NULL; + tmp_bh->node = node; + + return tmp_bh; + +} + +/**************************************************** +* Function: AIX_reschedule_retry +* +* reschedule a read of one of our mirror copies +* +* +*****************************************************/ +static void +AIX_reschedule_retry(struct aix_mirror_bh *aix_bh) +{ + unsigned long flags; + + spin_lock_irqsave(&AIX_retry_list_lock, flags); + if (AIX_retry_list == NULL) + AIX_retry_tail = &AIX_retry_list; + *AIX_retry_tail = aix_bh; + AIX_retry_tail = &aix_bh->next_r1; + aix_bh->next_r1 = NULL; + spin_unlock_irqrestore(&AIX_retry_list_lock, flags); + evms_cs_wakeup_thread(AIX_mirror_read_retry_thread); +} + +/**************************************************** +* Function: AIX_handle_read_mirror_drives +* +* Handles a read from a set of mirrored AIX LVs + +* +* +*****************************************************/ +static void +AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate) +{ + struct aix_logical_volume *volume; + struct evms_logical_node *node; + struct aix_mirror_bh *tmp_bh; + kdev_t tmp_b_rdev; + u32 count, le = 0; + + tmp_bh = (struct aix_mirror_bh *) bh->b_private; + tmp_b_rdev = tmp_bh->master_bh->b_rdev; + volume = (struct aix_logical_volume *) tmp_bh->node->private; + node = tmp_bh->node; + le = tmp_bh->le; + + LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n", + node, bh->b_state, uptodate, volume->mirror_copies); + + switch (tmp_bh->iteration) { + case AIX_DEFAULT_MIRRORING: + count = 
volume->le_to_pe_map[le].pp_state; + break; + + case AIX_FIRST_MIRROR: + count = volume->le_to_pe_map[le].pp_state; + break; + + case AIX_MAX_MIRRORS: + count = volume->le_to_pe_map[le].pp_state; + break; + } + + if (count == (AIX_LVM_LVSTALE + AIX_LVM_LVDEFINED)) { + uptodate = 0; + count = 0; + } + + if (!uptodate && tmp_bh->iteration < volume->mirror_copies) { + AIX_evms_cs_notify_lv_io_error(node); + AIX_reschedule_retry(tmp_bh); + } else { + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate); + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh); + evms_cs_volume_request_in_progress(tmp_b_rdev, + AIX_DECREMENT_REQUEST, + &count); + + } + + return; +} + +/**************************************************** +* This is a temporary function until a common EVMS +* notification function can be created. +* +*****************************************************/ +static int +AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node) +{ + struct aix_logical_volume *volume; + + volume = (struct aix_logical_volume *) node->private; + + LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n", + node, volume->lv_status, volume->name); + + return 0; +} + +/* Function: lvm_cleanup + * + * This function runs through the entire lvm data structure, removing + * all items that are not needed at runtime. Currently, this is just the + * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any + * groups that don't contain any volumes are deleted. All of the other + * volume_group, logical_volume and evms_logical_node structures will be + * kept around at run-time. + */ +static int +lvm_cleanup(void) +{ + struct aix_volume_group *group; + + group = AIXVolumeGroupList; + + while (group) { + + if (group->AIXvgh) { + kfree(group->AIXvgh); + group->AIXvgh = NULL; + } + + group = group->next; + } + + return 0; +} + +/**************************************************** +* Function: AIX_copy_header_info +* +* Copy the disk header info into the volume struct +* so we can use it later. 
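+*
+* A minimal usage sketch (hypothetical caller shown for illustration only;
+* the destination header is allocated first, then filled from the on-disk
+* copy):
+*
+*	struct vg_header *saved = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
+*
+*	if (saved && !AIX_copy_header_info(saved, AIXvgh_from_disk))
+*		group->AIXvgh = saved;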
+* +* +* +*****************************************************/ +static int +AIX_copy_header_info(struct vg_header *AIXvgh, struct vg_header *AIXvgh2) +{ + + LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2); + + if (AIXvgh) { + + AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec; + AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec; + AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1; + AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2; + AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3; + AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4; + AIXvgh->numlvs = AIXvgh2->numlvs; + AIXvgh->maxlvs = AIXvgh2->maxlvs; + AIXvgh->pp_size = AIXvgh2->pp_size; + AIXvgh->numpvs = AIXvgh2->numpvs; + AIXvgh->total_vgdas = AIXvgh2->total_vgdas; + AIXvgh->vgda_size = AIXvgh2->vgda_size; + AIXvgh->bigvg = AIXvgh2->bigvg; + AIXvgh->quorum = AIXvgh2->quorum; + AIXvgh->auto_varyon = AIXvgh2->auto_varyon; + AIXvgh->checksum = AIXvgh2->checksum; + AIXvgh->bigda_size = AIXvgh2->bigda_size; + + } else { + return -ENOMEM; + } + + LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2); + + return 0; +} + +/**************************************************** +* Function: AIX_free_header +* +* +* +* +* +*****************************************************/ +static void +AIX_free_headers(struct vg_header *AIXvgh, struct vg_header *AIXvgh2, + struct vg_trailer *AIXvgt, struct vg_trailer *AIXvgt2) +{ + + if (AIXvgh) { + kfree(AIXvgh); + AIXvgh = NULL; + } + + if (AIXvgh2) { + kfree(AIXvgh2); + AIXvgh2 = NULL; + } + + if (AIXvgt) { + kfree(AIXvgt); + AIXvgt = NULL; + } + + if (AIXvgt2) { + kfree(AIXvgt2); + AIXvgt2 = NULL; + } + +} + +/**************************************************** +* Function: AIXiod +* +* This is a kernel thread that handles read of mirrors +* This shouldn't ever run on a non-mirrored LV read +* +* +*****************************************************/ +static void +AIXiod(void *data) +{ + struct aix_mirror_bh *r1_bh; + struct evms_logical_node *node; + unsigned long flags; + + while (1) { + + spin_lock_irqsave(&AIX_retry_list_lock, flags); + if (AIX_retry_list == NULL) { + spin_unlock_irqrestore(&AIX_retry_list_lock, flags); + break; + } + r1_bh = AIX_retry_list; + AIX_retry_list = r1_bh->next_r1; + spin_unlock_irqrestore(&AIX_retry_list_lock, flags); + r1_bh->next_r1 = NULL; // for mark + + switch (r1_bh->cmd) { + case AIX_LV_READ: + + r1_bh->iteration++; + LOG_DEBUG("Report from thread AIXiod READ\n"); + + if (r1_bh->iteration == AIX_FIRST_MIRROR) { + node = r1_bh->mir_node1; + r1_bh->bh_req.b_rsector = r1_bh->mir_sector1; + } else { + node = r1_bh->mir_node2; + r1_bh->bh_req.b_rsector = r1_bh->mir_sector2; + } + + R_IO(node, &r1_bh->bh_req); + + break; + + default: + LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n", + r1_bh->cmd); + break; + } + + } + return; +} + +/**************************************************** +* Function: AIX_schedule_resync +* +* schedule a resync of one of our lv mirror copies +* +* +*****************************************************/ +static void +AIX_schedule_resync(struct aix_logical_volume *resync_volume, int force) +{ + unsigned long flags; + + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, + resync_volume->name); + + spin_lock_irqsave(&AIX_resync_list_lock, flags); + + if (!AIX_resync_list) { + AIX_resync_list = + kmalloc(sizeof (struct aix_resync_struct), GFP_ATOMIC); + if (!AIX_resync_list) { + return; + } + memset(AIX_resync_list, 0, sizeof (struct aix_resync_struct)); + } + + AIX_resync_list->resync_vol = resync_volume; + 
AIX_resync_list->next_resync_vol = NULL; + + spin_unlock_irqrestore(&AIX_resync_list_lock, flags); + evms_cs_wakeup_thread(AIX_mirror_resync_thread); +} + +/**************************************************** +* Function: AIXresync +* +* This is a kernel thread that handles resync of mirrors +* This shouldn't ever run on a non-mirrored LV +* +* +*****************************************************/ +static void +AIXresync(void *data) +{ + + struct aix_logical_volume *volume = NULL; + int force = FALSE; // Currently we don't force a resync of non-stale pe's + + if (AIX_resync_list == NULL) { + LOG_ERROR("No Volumes on list to resync\n"); + return; + } + + volume = AIX_resync_list->resync_vol; + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name); + + if (!volume) { + LOG_ERROR("Invalid volume passed to sync\n"); + return; + } + + if (AIXResyncInProgress) { + LOG_ERROR("Unable to resync multiple LVs concurrently %s\n", + volume->name); + return; + } + + if (volume->mirror_copies == AIX_DEFAULT_MIRRORING) { + LOG_ERROR("Unable to resync non-mirrored LV %s \n", + volume->name); + return; + } + + AIXResyncInProgress = TRUE; + + AIX_resync_lv_mirrors(volume, force); + + return; +} + +/**************************************************** +* Function: AIX_resync_lv_mirrors +* +* +* +* +* +*****************************************************/ +static int +AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force) +{ + + int i; + char pp_stale = FALSE; + + struct partition_list_entry *master_part = NULL; + struct partition_list_entry *slave1_part = NULL; + struct partition_list_entry *slave2_part = NULL; + + u64 master_offset = 0; + u64 slave1_offset = 0; + u64 slave2_offset = 0; + + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name); + + for (i = 0; i < volume->num_le; i++, pp_stale = FALSE) { + + // We need to see which mirror has a valid non-stale copy. + // The first non-stale copy will be our master and we'll + // copy to the slave(s). + + if ((volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) { + pp_stale = TRUE; + } + + if (volume->le_to_pe_map_mir1 != NULL) { + if ((volume->le_to_pe_map_mir1[i]. + pp_state & AIX_LVM_LVSTALE)) { + pp_stale = TRUE; + } + } + + if (volume->le_to_pe_map_mir2 != NULL) { + if ((volume->le_to_pe_map_mir2[i]. 
+ pp_state & AIX_LVM_LVSTALE)) { + pp_stale = TRUE; + } + } + + LOG_DEBUG("Function %s pp_stale:%d force:%d \n", __FUNCTION__, + pp_stale, force); + + if (pp_stale || force) { + if (!(volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) { + + master_part = volume->le_to_pe_map[i].owning_pv; + master_offset = volume->le_to_pe_map[i].pe_sector_offset; + + if (volume->le_to_pe_map_mir1 != NULL) { + slave1_part = volume->le_to_pe_map_mir1[i].owning_pv; + slave1_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset; + } + + if (volume->le_to_pe_map_mir2 != NULL) { + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv; + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset; + } + } else + if (!(volume->le_to_pe_map_mir1[i].pp_state & AIX_LVM_LVSTALE)) { + master_part = volume->le_to_pe_map_mir1[i].owning_pv; + master_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset; + + if (volume->le_to_pe_map != NULL) { + slave1_part = volume->le_to_pe_map[i].owning_pv; + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset; + } + + if (volume->le_to_pe_map_mir2 != NULL) { + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv; + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset; + } + } else + if (!(volume->le_to_pe_map_mir2[i].pp_state & AIX_LVM_LVSTALE)) { + master_part = volume->le_to_pe_map_mir2[i].owning_pv; + master_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset; + + if (volume->le_to_pe_map != NULL) { + slave1_part = volume->le_to_pe_map[i].owning_pv; + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset; + } + + if (volume->le_to_pe_map_mir1 != NULL) { + slave2_part = volume->le_to_pe_map_mir1[i].owning_pv; + slave2_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset; + } + } + + if (AIX_copy_on_read(volume, master_part, slave1_part, slave2_part, + master_offset, slave1_offset, slave2_offset, + volume->pe_size, i)) { + + LOG_CRITICAL("ReSync of logical Volume %s FAILED !!\n", + volume->name); + AIX_evms_cs_notify_lv_io_error(volume-> + volume_node); + break; + } + + } + + } + + return 0; +} + +/**************************************************** +* Function: AIX_copy_on_read +* +* +* +* +* +*****************************************************/ +static int +AIX_copy_on_read(struct aix_logical_volume *volume, + struct partition_list_entry *master_part, + struct partition_list_entry *slave1_part, + struct partition_list_entry *slave2_part, + u64 master_offset, + u64 slave1_offset, u64 slave2_offset, u32 pe_size, int le) +{ + unsigned long flags; + struct aix_mirror_bh *tmp_bh = NULL; + + // Check for valid partitions we need at least 2 good partitions so slave2 doesn't have to be valid + + if (!master_part || !slave1_part) { + LOG_ERROR("Invalid partitions for resync master part:%p slave1_part:%p slave2_part:%p\n", + master_part, slave1_part, slave2_part); + return -EINVAL; + } + + LOG_DEBUG("Function %s volume:%s master_part:%d, slave1_part:%d, slave2_part:%d master_offset:" + PFU64 ", slave1_offset:" PFU64 " slave2_offset:" PFU64 ", \n", + __FUNCTION__, volume->name, master_part->pv_number, + slave1_part->pv_number, slave2_part->pv_number, master_offset, + slave1_offset, slave2_offset); + + LOG_DEBUG("pe_size:%d le:%d\n", pe_size, le); + + tmp_bh = + AIX_alloc_sbh(volume, master_part, slave1_part, slave2_part, + master_offset, slave1_offset, slave2_offset, pe_size); + + if (!tmp_bh) { + buffer_IO_error(&tmp_bh->bh_req); + return -ENOMEM; + } + +/* if (evms_cs_volume_request_in_progress + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) { + 
buffer_IO_error(&tmp_bh->bh_req); + return -EIO; + } */ + + spin_lock_irqsave(&AIX_resync_pp_lock, flags); + + LOG_DEBUG("Function:%s kicking off read node:%p\n", __FUNCTION__, + master_part->logical_node); + + R_IO(master_part->logical_node, &tmp_bh->bh_req); + + spin_unlock_irqrestore(&AIX_resync_pp_lock, flags); + + return 0; +} + +/**************************************************** +* Function: AIX_alloc_sbh +* +* Alloc any buffer heads from the pool and return a linked list +* +* +*****************************************************/ +static struct aix_mirror_bh * +AIX_alloc_sbh(struct aix_logical_volume *volume, + struct partition_list_entry *master_part, + struct partition_list_entry *slave1_part, + struct partition_list_entry *slave2_part, + u64 master_offset, + u64 slave1_offset, u64 slave2_offset, u32 pe_size) +{ + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL; + unsigned long flags; + + LOG_DEBUG("Function:%s Enter\n", __FUNCTION__); + + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE); + if (!head_bh) { + LOG_SERIOUS + ("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + // Update buffer so we block on a read/write on the normal IO path + // if we're trying to sync the same sector on the disk + // We don't want to block if it's different sectors + + spin_lock_irqsave(&AIX_resync_list_lock, flags); + + AIX_resync_list->master_part = master_part; + AIX_resync_list->slave1_part = slave1_part; + AIX_resync_list->slave2_part = slave2_part; + AIX_resync_list->master_offset = master_offset; + AIX_resync_list->slave1_offset = slave1_offset; + AIX_resync_list->slave2_offset = slave2_offset; + + head_bh->bh_req.b_data = kmalloc(AIX_RESYNC_BLOCKSIZE + 1, GFP_NOIO); + if (!head_bh->bh_req.b_data) { + evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh); + LOG_SERIOUS + ("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + memset(head_bh->bh_req.b_data, 0, AIX_RESYNC_BLOCKSIZE + 1); + + head_bh->remaining = (atomic_t) ATOMIC_INIT(0); + head_bh->bh_req.b_rsector = master_offset; + head_bh->bh_req.b_size = AIX_RESYNC_BLOCKSIZE; + head_bh->sync_flag = AIX_SYNC_INCOMPLETE; + head_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions; + head_bh->bh_req.b_page = virt_to_page(head_bh->bh_req.b_data); + head_bh->bh_req.b_state = 0; + set_bit(BH_Dirty, &head_bh->bh_req.b_state); + set_bit(BH_Lock, &head_bh->bh_req.b_state); + set_bit(BH_Req, &head_bh->bh_req.b_state); + set_bit(BH_Mapped, &head_bh->bh_req.b_state); + head_bh->master_bh = NULL; + head_bh->mirror_bh_list = NULL; + + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE); + if (!tmp_bh) { + LOG_SERIOUS + ("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + head_bh->next_r1 = tmp_bh; + memcpy(&tmp_bh->bh_req, head_bh, sizeof (struct buffer_head)); + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0); + tmp_bh->bh_req.b_end_io = NULL; + + if (volume->mirror_copies == AIX_MAX_MIRRORS) { + tmp_bh->next_r1 = + evms_cs_allocate_from_pool(AIX_BH_list_pool, + EVMS_BLOCKABLE); + if (!tmp_bh->next_r1) { + LOG_SERIOUS + ("Unable to allocate memory for mirror pool line:%d\n", + __LINE__); + return NULL; + } + + memcpy(&tmp_bh->next_r1->bh_req, head_bh, + sizeof (struct buffer_head)); + tmp_bh->next_r1->bh_req.b_end_io = NULL; + tmp_bh->next_r1->remaining = (atomic_t) ATOMIC_INIT(0); + } + + init_waitqueue_head(&head_bh->bh_req.b_wait); + + spin_unlock_irqrestore(&AIX_resync_list_lock, flags); + + 
LOG_DEBUG("Function:%s Exit head_bh:%p\n", __FUNCTION__, head_bh); + + return head_bh; +} + +/**************************************************** +* Function: AIX_sync_mirrored_partitions +* +* +* +* +* +*****************************************************/ +static void +AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate) +{ + struct aix_logical_volume *volume = NULL; + struct aix_mirror_bh *tmp_bh, *head_bh; + + head_bh = tmp_bh = (struct aix_mirror_bh *) bh->b_private; + volume = (struct aix_logical_volume *) tmp_bh->node->private; + + LOG_DEBUG("Function:%s Enter uptodate:%d\n", __FUNCTION__, uptodate); + + if (!uptodate) { + + AIX_evms_cs_notify_lv_io_error(tmp_bh->node); + } + + tmp_bh = head_bh->next_r1; + + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__, + __LINE__, tmp_bh); + + if (tmp_bh) { + W_IO(tmp_bh->node, &tmp_bh->bh_req); + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_1, + AIX_RESYNC_BLOCKSIZE); + } + + tmp_bh = tmp_bh->next_r1; + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__, + __LINE__, tmp_bh); + + if (tmp_bh) { + W_IO(tmp_bh->node, &tmp_bh->bh_req); + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_2, + AIX_RESYNC_BLOCKSIZE); + } + + LOG_DEBUG("Function:%s line:%d read from master:%p\n", __FUNCTION__, + __LINE__, head_bh); + + if (head_bh && head_bh->sync_flag) { + AIX_get_set_mirror_offset(head_bh, AIX_MASTER, + AIX_RESYNC_BLOCKSIZE); + if (head_bh->sync_flag == AIX_SYNC_INCOMPLETE) { + R_IO(head_bh->node, &head_bh->bh_req); + } + } + + LOG_DEBUG("Function:%s line:%d head_bh->sync_flag:%d\n", __FUNCTION__, + __LINE__, head_bh->sync_flag); + + if (!head_bh->sync_flag) { + tmp_bh = head_bh; + head_bh = head_bh->next_r1; + + while (tmp_bh != NULL) { + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh); + tmp_bh = head_bh; + } + + AIXResyncInProgress = FALSE; +/* evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_rdev, + AIX_DECREMENT_REQUEST, + &count); */ + + if (AIX_resync_list) { + kfree(AIX_resync_list); + } + } + + return; +} + +/**************************************************** +* Function: AIX_get_set_mirror_offset +* +* +* +* +* +*****************************************************/ +static int +AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh, int index, int offset) +{ + int flags; + + if (!tmp_bh) { + return -EINVAL; + } + + LOG_DEBUG("Function:%s Enter offset:%d\n", __FUNCTION__, offset); + + tmp_bh->bh_req.b_rsector += tmp_bh->bh_req.b_rsector + offset; + + if (tmp_bh->bh_req.b_rsector > tmp_bh->node->total_vsectors) { + tmp_bh->sync_flag = AIX_SYNC_COMPLETE; + return -EIO; + } + // Update buffer so we block on a read/write on the normal IO path + // if we're trying to sync the same sector on the disk + // We don't want to block if it's different sectors + + spin_lock_irqsave(&AIX_resync_list_lock, flags); + + if (AIX_resync_list->master_part->logical_node == tmp_bh->node) { + AIX_resync_list->master_offset += offset; + } + + if (AIX_resync_list->slave1_part->logical_node == tmp_bh->node) { + AIX_resync_list->slave1_offset += offset; + } + + if (AIX_resync_list->slave2_part->logical_node == tmp_bh->node) { + AIX_resync_list->slave2_offset += offset; + } + + spin_unlock_irqrestore(&AIX_resync_list_lock, flags); + + return 0; + +} + +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs) +{ + struct partition_list_entry * pv; + struct pv_header * AIXpvh; + int posn = 0; + int num_pps; + int tmp,i; + + LOG_DEBUG("APDP - vgda_psn:%d numpvs:%d 
\n", vgda_psn, numpvs); + + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL); + if (!AIXpvh) { + return -ENOMEM; + } + + memset(AIXpvh, 0 , sizeof(struct pv_header)); + + // Adjust this because when AIX VGs/Volumes are created on Intel platforms, the + // pp_count could be anything since we don't give up the entire physical drive. + // This is for calculation purposes only. + + pvh_posn[0] = 0; + pv = partition; + + for (i = 1; i <= numpvs; i++) { + for (pv = partition; pv->pv_number != i; pv = pv->next ); + + LOG_DEBUG("APDP line:%d pp_count:%d \n", __LINE__, AIXpvh->pp_count); + + num_pps = AIXpvh->pp_count; + num_pps++; // Account for the pv_header on the front + + while ((num_pps * sizeof(struct pp_entries)) % AIX_SECTOR_SIZE) { + LOG_EXTRA("num_pps:%d \n", num_pps); + num_pps++; + } + + tmp = (num_pps * sizeof(struct pp_entries)) / AIX_SECTOR_SIZE; + + LOG_DEBUG("APDP tmp:%d num_pps:%d \n", tmp,num_pps); + + posn = ((vgda_psn + PSN_PPH_OFFSET) + ((pv->pv_number -1) * tmp)); + + pvh_posn[pv->pv_number] = posn; + + if (INIT_IO(pv->logical_node, 0, posn, 1, AIXpvh)) { + kfree(AIXpvh); + return -EIO; + } + + pv = partition; + } + + kfree(AIXpvh); + + return 0; +} + +/**************************************************** +* Function: AIX_volume_group_dump +* +* This is for debug purposes and will walk the volume group list +* and LV's within the volume groups +* +* It can be called at anytime however the output to the display is large +* +*****************************************************/ +#ifdef EVMS_AIX_DEBUG +static int +AIX_volume_group_dump(void) +{ + struct aix_volume_group *AIXVGLDebugPtr; + struct partition_list_entry *DebugPartitionList; + struct aix_logical_volume *DebugLVList; + int i; + + AIXVGLDebugPtr = AIXVolumeGroupList; + + if (!AIXVGLDebugPtr) { + LOG_DEBUG("***********************************************\n"); + LOG_DEBUG("ERROR Nothing built in the list to check !!! \n"); + LOG_DEBUG("***********************************************\n"); + return 0; + } + + LOG_DEBUG("*********************************************** \n"); + LOG_DEBUG("Begin Volume Group Dump \n"); + LOG_DEBUG("*********************************************** \n"); + + while (AIXVGLDebugPtr) { + + LOG_DEBUG("vg_number %x\n", AIXVGLDebugPtr->vg_id.word2); + LOG_DEBUG("numpsrtitions %d\n", AIXVGLDebugPtr->partition_count); + LOG_DEBUG("numlvs %d\n", AIXVGLDebugPtr->numlvs); + LOG_DEBUG("hard_sect_size %d\n", AIXVGLDebugPtr->hard_sect_size); + LOG_DEBUG("block_size %d\n", AIXVGLDebugPtr->block_size); + LOG_DEBUG("flags %d\n", AIXVGLDebugPtr->flags); +// LOG_DEBUG("lv_max %d\n", AIXVGLDebugPtr->lv_max); + LOG_DEBUG("pe_size %d\n", AIXVGLDebugPtr->pe_size); + LOG_DEBUG("CleanVGInfo %d\n", AIXVGLDebugPtr->CleanVGInfo); + + DebugPartitionList = AIXVGLDebugPtr->partition_list; + + LOG_DEBUG("********* Begin Volume Partition Dump ********* \n"); + + if (!DebugPartitionList) { + LOG_DEBUG("No partitions to check !! 
\n"); + } + + while (DebugPartitionList) { + LOG_DEBUG("logical_node %p\n", + DebugPartitionList->logical_node); + LOG_DEBUG("pv_number %d\n", + DebugPartitionList->pv_number); + LOG_DEBUG("block_size %d\n", + DebugPartitionList->block_size); + LOG_DEBUG("hard_sect_size %d\n", + DebugPartitionList->hard_sect_size); + LOG_DEBUG("-------------------------------------------------------------\n"); + DebugPartitionList = DebugPartitionList->next; + } + + LOG_DEBUG("********* End Volume Partition Dump **********\n"); + + LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n"); + + DebugLVList = AIXVGLDebugPtr->volume_list[0]; + + if (!DebugLVList) { + LOG_DEBUG("No logical volumes to check !! \n"); + } + + for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) { + + DebugLVList = AIXVGLDebugPtr->volume_list[i]; + + if (DebugLVList) { + LOG_DEBUG("volume_list # %d \n", i); + LOG_DEBUG("lv_number %d \n", + DebugLVList->lv_number); + LOG_DEBUG("LV name %s \n", + DebugLVList->name); + LOG_DEBUG("lv_size " PFU64 " \n", + DebugLVList->lv_size); + LOG_DEBUG("lv_access %d \n", + DebugLVList->lv_access); + LOG_DEBUG("lv_status %d \n", + DebugLVList->lv_status); +// LOG_DEBUG("lv_minor %d \n", +// DebugLVList->lv_minor); + LOG_DEBUG("mirror_copies %d \n", + DebugLVList->mirror_copies); +// LOG_DEBUG("mirror_number %d \n", +// DebugLVList->mirror_number); + LOG_DEBUG("stripes %d \n", + DebugLVList->stripes); + LOG_DEBUG("stripe_size %d \n", + DebugLVList->stripe_size); + LOG_DEBUG("stripe_size_shift%d \n", + DebugLVList->stripe_size_shift); + LOG_DEBUG("pe_size %d \n", + DebugLVList->pe_size); + LOG_DEBUG("pe_size_shift %d \n", + DebugLVList->pe_size_shift); + LOG_DEBUG("num_le %d \n", + DebugLVList->num_le); +// LOG_DEBUG("new_volume %d \n", +// DebugLVList->new_volume); + LOG_DEBUG("group %p \n", + DebugLVList->group); + } + + } + + AIXVGLDebugPtr = AIXVGLDebugPtr->next; + + LOG_DEBUG("********** End Logical Volume Partition Dump **********\n"); + + } + + LOG_DEBUG("***********************************************\n"); + LOG_DEBUG("End Volume Group Dump \n"); + LOG_DEBUG("***********************************************\n"); + + return 0; + +} +#endif diff -Naur linux-2002-09-30/drivers/evms/Config.in evms-2002-09-30/drivers/evms/Config.in --- linux-2002-09-30/drivers/evms/Config.in Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/Config.in Mon Sep 16 15:55:24 2002 @@ -0,0 +1,60 @@ +# +# Copyright (c) International Business Machines Corp., 2000 +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See +# the GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# +# EVMS driver configuration +# + +mainmenu_option next_comment +comment 'Enterprise Volume Management System' + +tristate 'EVMS Kernel Runtime' CONFIG_EVMS +dep_tristate ' EVMS Local Device Manager' CONFIG_EVMS_LOCAL_DEV_MGR $CONFIG_EVMS +dep_tristate ' EVMS DOS Segment Manager' CONFIG_EVMS_DOS_SEGMENT_MGR $CONFIG_EVMS +dep_tristate ' EVMS GPT Segment Manager' CONFIG_EVMS_GPT_SEGMENT_MGR $CONFIG_EVMS +if [ "$CONFIG_ARCH_S390" = "y" ]; then +dep_tristate ' EVMS S/390 Segment Manager' CONFIG_EVMS_S390_SEGMENT_MGR $CONFIG_EVMS +fi +dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT $CONFIG_EVMS +dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK $CONFIG_EVMS +dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR $CONFIG_EVMS +dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM $CONFIG_EVMS +dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD $CONFIG_EVMS +dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR $CONFIG_EVMS_MD +dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0 $CONFIG_EVMS_MD +dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1 $CONFIG_EVMS_MD +dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5 $CONFIG_EVMS_MD +dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX $CONFIG_EVMS +dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2 $CONFIG_EVMS +#dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR $CONFIG_EVMS + +if [ "$CONFIG_EVMS" != "n" ]; then + choice ' EVMS Debug Level' \ + "Critical CONFIG_EVMS_INFO_CRITICAL \ + Serious CONFIG_EVMS_INFO_SERIOUS \ + Error CONFIG_EVMS_INFO_ERROR \ + Warning CONFIG_EVMS_INFO_WARNING \ + Default CONFIG_EVMS_INFO_DEFAULT \ + Details CONFIG_EVMS_INFO_DETAILS \ + Debug CONFIG_EVMS_INFO_DEBUG \ + Extra CONFIG_EVMS_INFO_EXTRA \ + Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \ + Everything CONFIG_EVMS_INFO_EVERYTHING" Default +fi + +endmenu + diff -Naur linux-2002-09-30/drivers/evms/Makefile evms-2002-09-30/drivers/evms/Makefile --- linux-2002-09-30/drivers/evms/Makefile Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/Makefile Mon Sep 16 15:55:24 2002 @@ -0,0 +1,64 @@ +# +# Makefile for the kernel EVMS driver and modules. +# +# 08 March 2001, Mark Peloquin +# + +O_TARGET := evmsdrvr.o + +export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o \ + snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o \ + os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o \ + md_raid1.o md_raid5.o md_xor.o s390_part.o gpt_part.o + +# Link order is important! Plugins must come first, then the EVMS core. 
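+# The obj-$(CONFIG_*) lines below follow that rule: every plugin object is
+# listed before evms_passthru.o and evms.o at the end of the list.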
+ +obj-$(CONFIG_EVMS_LOCAL_DEV_MGR) += ldev_mgr.o +obj-$(CONFIG_EVMS_DOS_SEGMENT_MGR) += dos_part.o +obj-$(CONFIG_EVMS_GPT_SEGMENT_MGR) += gpt_part.o +obj-$(CONFIG_EVMS_S390_SEGMENT_MGR) += s390_part.o +obj-$(CONFIG_EVMS_MD) += md_core.o +obj-$(CONFIG_EVMS_MD_LINEAR) += md_linear.o +obj-$(CONFIG_EVMS_MD_RAID0) += md_raid0.o +obj-$(CONFIG_EVMS_MD_RAID1) += md_raid1.o +obj-$(CONFIG_EVMS_MD_RAID5) += md_raid5.o md_xor.o +obj-$(CONFIG_EVMS_LVM) += lvm_vge.o +obj-$(CONFIG_EVMS_AIX) += AIXlvm_vge.o +obj-$(CONFIG_EVMS_OS2) += os2lvm_vge.o +obj-$(CONFIG_EVMS_DRIVELINK) += evms_drivelink.o +obj-$(CONFIG_EVMS_BBR) += evms_bbr.o +obj-$(CONFIG_EVMS_SNAPSHOT) += snapshot.o +obj-$(CONFIG_EVMS_ECR) += evms_ecr.o +obj-$(CONFIG_EVMS) += evms_passthru.o evms.o + +EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT +ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL +endif +ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS +endif +ifeq ($(CONFIG_EVMS_INFO_ERROR),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR +endif +ifeq ($(CONFIG_EVMS_INFO_WARNING),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING +endif +ifeq ($(CONFIG_EVMS_INFO_DETAILS),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS +endif +ifeq ($(CONFIG_EVMS_INFO_DEBUG),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG +endif +ifeq ($(CONFIG_EVMS_INFO_EXTRA),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA +endif +ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT +endif +ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y) + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING +endif + +include $(TOPDIR)/Rules.make + diff -Naur linux-2002-09-30/drivers/evms/dos_part.c evms-2002-09-30/drivers/evms/dos_part.c --- linux-2002-09-30/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/dos_part.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,1452 @@ +/* -*- linux-c -*- */ +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * linux/drivers/evms/dos_part.c + * + * EVMS DOS partition manager + * + * Partial code extracted from + * + * linux/fs/partitions/msdos.c + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for kiobuf stuffs */ + +#ifdef CONFIG_BLK_DEV_IDE +#include /* IDE xlate */ +#endif /* CONFIG_BLK_DEV_IDE */ + +#include +#include + +#include +#include + +/* prefix used in logging messages */ +#define LOG_PREFIX "dos_part: " + +/* #include "msdos.h" */ +#define MSDOS_LABEL_MAGIC 0xAA55 +#define GPT_ENTIRE_DISK_INDICATOR 0xEE +#define GPT_ESP_INDICATOR 0xEF + +/** + * struct mbr_ebr - Skeletal MBR/EBR structure useful for our purposes + * @unused1: skip IPL record code + * @partitions: partition table + * @signature: DOS magic + * + * skeletal access to parition table in MBR/EBR + **/ +struct mbr_ebr { + u8 unused1[0x1be]; + struct partition partitions[4]; + u16 signature; +}; + +/** + * struct dos_private - Private data structure for this plugin + * @source_object: object this IO will get remapped to + * @start_sect: source object relative starting address in 512 byte units + * @nr_sect: partition size in 512 bytes units + * @type: partition type or filesystem format indicator + * + * private copy of the just the fields we require to remap IO requests + * to the underlying object. + **/ +struct dos_private { + struct evms_logical_node *source_disk; + u64 start_sect; + u64 nr_sects; + unsigned char type; +}; + +/** + * struct extended_part - Structure used to track progress traversing an EBR chain + * @extended: partition table in the extended boot record + * @start_sect: address of the extended boot record in 512 byte units + * @next_ebr_start: address of next ebr in the chain + * @done: progress flag + * + * struct used to track extended boot record chain traversals. + **/ +struct extended_part { + struct partition *extended; + u64 start_sect; + u64 next_ebr_start; + int done; +}; + +/* Global variables */ +static int cur_comp_part_num; /* used to track non-primary + * partition numbers + */ +static int exported_nodes; /* total # of exported segments + * produced during this discovery. 
+ */ + +/* External references */ +#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID +extern void md_autodetect_dev(kdev_t dev); +#endif + +/* Prototypes */ +static int mbr_ebr_partition_discover(struct evms_logical_node **); +static int mbr_ebr_partition_delete(struct evms_logical_node *); +static void mbr_ebr_partition_read(struct evms_logical_node *, + struct buffer_head *); +static void mbr_ebr_partition_write(struct evms_logical_node *, + struct buffer_head *); +static int mbr_ebr_partition_ioctl(struct evms_logical_node *, struct inode *, + struct file *, unsigned int, unsigned long); +static int mbr_ebr_partition_init_io(struct evms_logical_node *, + int, u64, u64, void *); + +static struct evms_plugin_fops fops = { + .discover = mbr_ebr_partition_discover, + .delete = mbr_ebr_partition_delete, + .read = mbr_ebr_partition_read, + .write = mbr_ebr_partition_write, + .init_io = mbr_ebr_partition_init_io, + .ioctl = mbr_ebr_partition_ioctl +}; + +#define EVMS_MSDOS_PARTITION_MANAGER_ID 1 + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_SEGMENT_MANAGER, + EVMS_MSDOS_PARTITION_MANAGER_ID), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1 + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &fops +}; + +/* + * Many architectures don't like unaligned accesses, which is + * frequently the case with the nr_sects and start_sect partition + * table entries. + */ +#include + +#define SYS_IND(p) (get_unaligned(&p->sys_ind)) +#define NR_SECTS(p) (u64)({ __typeof__(p->nr_sects) __a = \ + get_unaligned(&p->nr_sects); \ + le32_to_cpu(__a); \ + }) + +#define START_SECT(p) (u64)({ __typeof__(p->start_sect) __a = \ + get_unaligned(&p->start_sect); \ + le32_to_cpu(__a); \ + }) + +/******************************************/ +/* List Support - Variables, & Functions */ +/******************************************/ + +/* Typedefs */ + +struct segment_list_node { + struct evms_logical_node *segment; + struct segment_list_node *next; +}; + +struct disk_list_node { + struct evms_logical_node *disk; + struct segment_list_node *segment_list; + struct disk_list_node *next; +}; + +/* Variables */ + +static struct disk_list_node *my_disk_list; + +/* Functions */ + +static struct disk_list_node ** +lookup_disk(struct evms_logical_node *disk) +{ + struct disk_list_node **ldln; + + ldln = &my_disk_list; + while (*ldln) { + if ((*ldln)->disk == disk) + break; + ldln = &(*ldln)->next; + } + return (ldln); +} + +static struct segment_list_node ** +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment) +{ + struct segment_list_node **lsln; + + lsln = &disk->segment_list; + while (*lsln) { + if ((*lsln)->segment == segment) + break; + lsln = &(*lsln)->next; + } + return (lsln); +} + +static struct evms_logical_node * +find_segment_on_disk(struct evms_logical_node *disk, + u64 start_sect, u64 nr_sects) +{ + struct evms_logical_node *rc = NULL; + struct disk_list_node **ldln; + struct segment_list_node **lsln; + struct dos_private *dos_prv; + + ldln = lookup_disk(disk); + if (*ldln) { + /* disk found in list */ + /* attempt to find segment */ + + lsln = &(*ldln)->segment_list; + while (*lsln) { + dos_prv = (*lsln)->segment->private; + if (dos_prv->start_sect == start_sect) + if (dos_prv->nr_sects == nr_sects) + break; + lsln = &(*lsln)->next; + } + if (*lsln) + rc = (*lsln)->segment; + } + return (rc); +} + +/* function description: add_segment_to_disk + * + * this function attempts to add a segment 
to the segment + * list of a disk. if the specified disk is not found, it + * will be added to the global disk list. this function will + * return a pointer to the matching segment in the disk's + * segment list. the caller must compare the returned pointer + * to the specified segment to see if the + * specified segment was already present in the disk's segment + * list. if the return pointer matches the specified segment, + * then the specified segment was added to the list. if the + * return segment pointer to does not match the specified + * segment pointer, then the specified segment pointer was + * a duplicate and can be thrown away. + */ +static int +add_segment_to_disk(struct evms_logical_node *disk, + struct evms_logical_node *segment) +{ + int rc = 0; + struct disk_list_node **ldln, *new_disk; + struct segment_list_node **lsln, *new_segment; + + ldln = lookup_disk(disk); + if (*ldln == NULL) { + /* disk not in list, add disk */ + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL); + if (new_disk) { + memset(new_disk, 0, sizeof (*new_disk)); + new_disk->disk = disk; + *ldln = new_disk; + } else { + rc = -ENOMEM; + } + } + if (!rc) { + /* attempt to add segment */ + lsln = lookup_segment(*ldln, segment); + if (*lsln == NULL) { + /* segment not in list, add segment */ + new_segment = + kmalloc(sizeof (*new_segment), GFP_KERNEL); + if (new_segment) { + memset(new_segment, 0, sizeof (*new_segment)); + new_segment->segment = segment; + *lsln = new_segment; + } else { + rc = -ENOMEM; + } + } else + rc = -1; + } + return (rc); +} + +static int +remove_segment_from_disk(struct evms_logical_node *disk, + struct evms_logical_node *segment, + struct evms_logical_node **empty_disk) +{ + int rc = 0; + struct disk_list_node **ldln, *tmp_disk_node; + struct segment_list_node **lsln, *tmp_segment_node; + + *empty_disk = NULL; + ldln = lookup_disk(disk); + if (*ldln == NULL) { + rc = -1; + } else { + /* disk found in list */ + /* attempt to add segment */ + lsln = lookup_segment(*ldln, segment); + if (*lsln == NULL) { + rc = -2; + } else { + tmp_segment_node = *lsln; + /* remove segment from list */ + *lsln = (*lsln)->next; + /* free the segment list node */ + kfree(tmp_segment_node); + + if ((*ldln)->segment_list == NULL) { + tmp_disk_node = *ldln; + *empty_disk = tmp_disk_node->disk; + /* remove disk from list */ + *ldln = (*ldln)->next; + /* free the disk list node */ + kfree(tmp_disk_node); + } + } + } + return (rc); +} + +static inline int +is_extended_partition(struct partition *p) +{ + return (SYS_IND(p) == DOS_EXTENDED_PARTITION || + SYS_IND(p) == WIN98_EXTENDED_PARTITION || + SYS_IND(p) == LINUX_EXTENDED_PARTITION); +} + +static inline u64 +part_start(struct partition *part, u64 ext_start, u64 ebr_start) +{ + u64 pstart = START_SECT(part); + pstart += (is_extended_partition(part)) ? 
ext_start : ebr_start; + return (pstart); +} + +static int +validate_mbr_ebr(struct evms_logical_node *node, + struct mbr_ebr *mbr_ebr, u64 ext_start, + u64 ebr_start) +{ + int valid_mbr_ebr, i, j, mbr_flag; + struct partition *pi, *pj; + u64 pi_start, pi_end, pj_start, pj_end; + + /* assume an MBR */ + mbr_flag = TRUE; + + /* assume its valid */ + valid_mbr_ebr = TRUE; + + /* check for valid signature */ + if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) { + LOG_DEBUG("%s: invalid signature on '%s'!\n", + __FUNCTION__, node->name); + valid_mbr_ebr = FALSE; + } + + /* check for an AIX IPL signature */ +#define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */ + if (*(unsigned int *) mbr_ebr == IPLRECID) { + LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n", + __FUNCTION__, node->name); + valid_mbr_ebr = FALSE; + } + + /* check for boot sector fields */ + +#if 0 //Remove checking of the first byte + + /* attempt to make some initial assumptions about + * what type of data structure this could be. we + * start by checking the 1st byte. we can tell a + * few things based on what is or isn't there. + */ + if (valid_mbr_ebr == TRUE) + switch (*(u_char *) mbr_ebr) { + /* check for JMP as 1st instruction + * if found, assume (for now), that + * this is a boot sector. + */ + /* Removed the JMP opcode check because it's not enough to determine + * that this sector does not have a valid MBR. + * Note: To avoid going thru validation process of partition table, + * it's necessary to have a better boot sector check + * (eg. JMP opcode && other conditions) */ + /* + case 0xEB: + LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__); + valid_mbr_ebr = FALSE; + */ + /* let this fall thru to pick up the + * mbr_flag == FALSE. + */ + + /* the MBR should contain boot strap + * code, so we don't expect the 1st + * byte to be a 0x0. If the 1st byte + * IS 0x0, its assumed (for now) to + * be an EBR. 
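validate_mbr_ebr() starts with two cheap rejections before it ever inspects the table entries: the little-endian 0xAA55 label magic at the end of the sector and, a little further down, the GPT protective-MBR entry type 0xEE. A stand-alone sketch of those tests on a raw sector, using byte offsets instead of the kernel structures (hypothetical helper, not from the patch):

#include <stdint.h>

/* Returns 1 for "worth parsing as MBR/EBR", 0 otherwise. */
static int looks_like_usable_mbr_ebr(const uint8_t *sector)
{
	int i;

	/* the 0xAA55 label magic is stored little-endian at bytes 510/511 */
	if (sector[510] != 0x55 || sector[511] != 0xAA)
		return 0;

	/* byte 4 of each 16-byte entry (table starts at 0x1be) is sys_ind;
	 * 0xEE marks a GPT protective MBR, which this plugin skips.
	 */
	for (i = 0; i < 4; i++)
		if (sector[0x1be + 16 * i + 4] == 0xEE)
			return 0;

	return 1;
}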
+ */ + case 0: + mbr_flag = FALSE; + break; + } +#endif //Remove checking of the first byte + + if (valid_mbr_ebr == TRUE) { + /* dump the partition table entries in debug mode */ + LOG_DEBUG + ("%s: disk relative starts: ext_part("PFU64"), ebr("PFU64").\n", + __FUNCTION__, ext_start, ebr_start); + for (i = 0; i < 4; i++) { + pi = &mbr_ebr->partitions[i]; + LOG_DEBUG + ("%s: Partition: index(%d), start("PFU64"), size("PFU64"), sys(0x%x).\n", + __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi), + SYS_IND(pi)); + } + + /* check for PMBR (Protected Master Boot Record) + * and skip this node if found + */ + for (i = 0; i < 4; i++) { + pi = &mbr_ebr->partitions[i]; + + if (SYS_IND(pi) == 0xEE) { + valid_mbr_ebr = FALSE; + LOG_DETAILS + ("%s: detected PMBR on '%s', skipping.\n", + __FUNCTION__, node->name); + break; + } + } + + /* check of this segment is marked as non-dividable + * and skip if found + */ + if (node->iflags & EVMS_TOP_SEGMENT) { + valid_mbr_ebr = FALSE; + } + } + + if (valid_mbr_ebr == TRUE) { + /* check for mbr/ebr partition table validity */ + for (i = 0; i < 4; i++) { + pi = &mbr_ebr->partitions[i]; + if (NR_SECTS(pi)) { + /* check for partition extending past end of node */ + pi_start = part_start(pi, ext_start, ebr_start); + pi_end = pi_start + NR_SECTS(pi) - 1; + if (pi_end >= node->total_vsectors) { + LOG_DEBUG + ("%s: partition(%d) ends("PFU64") beyond the end of the disk(%s,"PFU64")!\n", + __FUNCTION__, i, pi_end, + node->name, node->total_vsectors); + valid_mbr_ebr = FALSE; + } + if (valid_mbr_ebr == FALSE) + break; + + /* check for partition overlap */ + for (j = i + 1; j < 4; j++) { + pj = &mbr_ebr->partitions[j]; + if (NR_SECTS(pj)) { + pj_start = + part_start(pj, ext_start, + ebr_start); + pj_end = + pj_start + NR_SECTS(pj) - 1; + if (pi_start == pj_start) { + valid_mbr_ebr = FALSE; + } else if (pi_start < pj_start) { + if (pi_end >= pj_start) + valid_mbr_ebr = + FALSE; + } else if (pi_start <= pj_end) + valid_mbr_ebr = FALSE; + + if (valid_mbr_ebr == FALSE) { + LOG_DEBUG + ("%s: overlapping partitions(%d,%d) detected on '%s'!\n", + __FUNCTION__, i, j, + node->name); + break; + } + } + } + if (valid_mbr_ebr == FALSE) + break; + } + } + } + if (valid_mbr_ebr == TRUE) { + LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__, + (mbr_flag == TRUE) ? 
'M' : 'E', node->name); + } else { + LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n", + __FUNCTION__, node->name); + } + return (valid_mbr_ebr); +} + +/* + * Function: add_segment + */ +static int +mbr_ebr_process_segment(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + u64 start_sect, + u64 nr_sects, + unsigned char type, int part_num, char *partition_name) +{ + struct dos_private *dos_prv = NULL; + struct evms_logical_node *segment; + int rc = 0; + + segment = find_segment_on_disk(node, start_sect, nr_sects); + if (segment) { + LOG_DETAILS("exporting segment '%s'.\n", segment->name); + } else { + dos_prv = kmalloc(sizeof (*dos_prv), GFP_KERNEL); + if (dos_prv) { + memset(dos_prv, 0, sizeof (*dos_prv)); + dos_prv->source_disk = node; + dos_prv->start_sect = start_sect; + dos_prv->nr_sects = nr_sects; + dos_prv->type = type; + rc = evms_cs_allocate_logical_node(&segment); + } else { + rc = -ENOMEM; + } + if (!rc) { + segment->plugin = &plugin_header; + segment->system_id = (unsigned int) type; + segment->total_vsectors = nr_sects; + segment->block_size = node->block_size; + segment->hardsector_size = node->hardsector_size; + segment->private = dos_prv; + segment->flags = node->flags; + if (partition_name) + strcpy(segment->name, partition_name); + else { + strcpy(segment->name, node->name); + if (GetPluginType(node->plugin->id) == + EVMS_SEGMENT_MANAGER) { + strcat(segment->name, "."); + } + sprintf(segment->name + strlen(segment->name), + "%d", part_num); + } + /* watch for super floppy format gpt system partition + * and dont let it be sub divided + */ + if (segment->system_id == GPT_ESP_INDICATOR) { + node->iflags |= EVMS_TOP_SEGMENT; + } + LOG_DETAILS("creating segment '%s'.\n", segment->name); + rc = add_segment_to_disk(node, segment); + if (rc) { + LOG_ERROR + ("%s: error(%d) adding segment '%s'!\n", + __FUNCTION__, rc, segment->name); + rc = 0; + } else { + MOD_INC_USE_COUNT; + } + } + if (rc) { + if (dos_prv) + kfree(dos_prv); + if (segment) + evms_cs_deallocate_logical_node(segment); + } + } + if (!rc) { + evms_cs_add_logical_node_to_list(discover_list, segment); + exported_nodes++; + } + return rc; +} + +static void +print_partition_info(char *leading_comment, struct partition *p) +{ + LOG_EXTRA + ("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA("PFU64"), sizeLBA("PFU64")\n", + leading_comment, p->boot_ind, p->sys_ind, p->cyl, p->head, + p->sector, p->end_cyl, p->end_head, p->end_sector, START_SECT(p), + NR_SECTS(p)); +} + +#ifdef CONFIG_BSD_DISKLABEL +#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1 +static void +print_bsd_partition_info(char *leading_comment, struct bsd_partition *p) +{ + LOG_EXTRA + ("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n", + leading_comment, p->p_size, p->p_offset, p->p_fsize, p->p_fstype, + p->p_frag, p->p_cpg); +} + +/* + * bsd_disklabel_partition + * + * Return: + * - 0 for 0 partition + * - (positive) number for number of BSD partitions found + * - (negative) error code + */ +static int +bsd_disklabel_partition(struct evms_logical_node **discover_list, + struct evms_logical_node *node, struct partition *bsd) +{ + struct bsd_disklabel *l; + struct bsd_partition *p; + int max_partitions; + char *data; + int rc = 0; + int count = 0; + + data = kmalloc(node->hardsector_size, GFP_KERNEL); + if (data) + rc = INIT_IO(node, + 0, + START_SECT(bsd) + + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET, 1, data); + else + rc = -ENOMEM; + if (!rc) { + + l 
= (struct bsd_disklabel *) data; + if (l->d_magic == BSD_DISKMAGIC) { + + max_partitions = + ((SYS_IND(bsd) == + OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS : + BSD_MAXPARTITIONS); + if (l->d_npartitions < max_partitions) + max_partitions = l->d_npartitions; + for (p = l->d_partitions; + p - l->d_partitions < max_partitions; p++) { + if (p->p_fstype != BSD_FS_UNUSED) { + evmsLOG2(EVMS_INFO_EXTRA, + (print_bsd_partition_info + (__FUNCTION__, p))); + rc = mbr_ebr_process_segment + (discover_list, node, + (u64) p->p_offset, + (u64) p->p_size, p->p_fstype, + cur_comp_part_num++, NULL); + if (rc) + break; + count++; + } + } + } + } + if (data) + kfree(data); + if (!rc) + rc = count; + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc); + return rc; +} +#endif + +#ifdef CONFIG_UNIXWARE_DISKLABEL +#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29 + +/* + * unixware_partition + * + * Return: + * - 0 for 0 partition + * - (positive) number for number of UNIXWARE partitions found + * - (negative) error code + */ +static int +unixware_partition(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct partition *unixware_part) +{ + struct unixware_disklabel *l; + struct unixware_slice *p; + char *data = NULL; + int rc = 0; + int count = 0; + + data = kmalloc(node->hardsector_size, GFP_KERNEL); + if (data) + rc = INIT_IO(node, + 0, + START_SECT(unixware_part) + + UNIXWARE_PART_TABLE_SECTOR_OFFSET, 1, data); + else + rc = -ENOMEM; + if (!rc) { + l = (struct unixware_disklabel *) data; + if (le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC && + le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) { + p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */ + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { + if (p->s_label != UNIXWARE_FS_UNUSED) { + rc = mbr_ebr_process_segment + (discover_list, node, START_SECT(p), + NR_SECTS(p), UNIXWARE_PARTITION, + cur_comp_part_num++, NULL); + if (rc) + break; + count++; + } + p++; + } + } + } + if (data) + kfree(data); + if (!rc) + rc = count; + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc); + return rc; +} +#endif + +#ifdef CONFIG_SOLARIS_X86_PARTITION +#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1 +/* + * solaris_x86_partition + * + * Return: + * - 0 for 0 partition + * - (positive) number for number of solaris partitions found + * - (negative) error code + */ +static int +solaris_x86_partition(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct partition *solaris_x86, int probe_only) +{ /* if TRUE, do not add segments */ + long offset = START_SECT(solaris_x86); + struct solaris_x86_vtoc *v; + struct solaris_x86_slice *s; + int i; + char *data = NULL; + int rc = 0; + int count = 0; + + data = kmalloc(node->hardsector_size, GFP_KERNEL); + if (data) + rc = INIT_IO(node, + 0, + START_SECT(solaris_x86) + + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET, 1, data); + else + rc = -ENOMEM; + if (!rc) { + + v = (struct solaris_x86_vtoc *) data; + + if (v->v_sanity == SOLARIS_X86_VTOC_SANE) { + if (v->v_version != 1) { + LOG_WARNING + ("%s: cannot handle version %d vtoc>\n", + __FUNCTION__, v->v_version); + } else { + for (i = 0; i < v->v_nparts; i++) { + s = &v->v_slice[i]; + LOG_EXTRA + ("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n", + i, s->s_tag, s->s_flag, s->s_start, + s->s_size, + s->s_start + s->s_size - 1); + + if ((s->s_size == 0) + || (s->s_tag == 0x05)) + continue; + if (!probe_only) { + rc = mbr_ebr_process_segment + (discover_list, node, + 
(u64) (s->s_start + + offset), + (u64) s->s_size, + SOLARIS_X86_PARTITION, + cur_comp_part_num++, NULL); + if (rc) + break; + } + count++; + } + } + } + } + if (data) + kfree(data); + if (!rc) + rc = count; + LOG_DETAILS("%s: %s (%d) partitions\n", + __FUNCTION__, probe_only ? " " : "exported", rc); + return rc; +} +#endif + +/* + * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR + * + * Returns: 1 - os2 DLAT was found + * 0 otherwise + * + */ +static int +os2lvm_partition(u64 MBR_EBR_sect, + struct evms_logical_node *node, struct dla_table_sector *dlat) +{ + struct hd_geometry geometry; + int rc; + u32 crc_hold; + + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long) &geometry); + if (rc) { + LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n", + __FUNCTION__, rc, node->name); + } else + if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat)) + { + if ((dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1)) + && (dlat->DLA_Signature2 == + cpu_to_le32(DLA_TABLE_SIGNATURE2))) { + crc_hold = le32_to_cpu(dlat->DLA_CRC); + dlat->DLA_CRC = 0; + if (evms_cs_calculate_crc + (EVMS_INITIAL_CRC, (void *) dlat, + node->hardsector_size) == crc_hold) + return 1; + } + } + return 0; +} + +static int +mbr_ebr_process_logical_drive(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct extended_part *ext_info, + int i, + struct partition *p, + int os2lvm, struct dla_table_sector *dlat) +{ + int rc = 0; + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name; + + LOG_EXTRA("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n", + __FUNCTION__, i, START_SECT(p), NR_SECTS(p)); + + if (NR_SECTS(p)) { + if (is_extended_partition(p)) { + ext_info->next_ebr_start = + (u64) (START_SECT(p) + + START_SECT(ext_info->extended)); + ext_info->done = FALSE; /* not done yet */ + } else { + partition_name = NULL; + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR && + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) == + (ext_info->start_sect + START_SECT(p)) + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) == + NR_SECTS(p) + && dlat->DLA_Array[i].Drive_Letter != '\0') { + sprintf(tmp_buf, "os2/%c", + dlat->DLA_Array[i].Drive_Letter); + partition_name = tmp_buf; + } + evmsLOG2(EVMS_INFO_EXTRA, + (print_partition_info(__FUNCTION__, p))); + + rc = mbr_ebr_process_segment(discover_list, + node, + ext_info->start_sect + + START_SECT(p), NR_SECTS(p), + p->sys_ind, + cur_comp_part_num++, + partition_name); + } + } + return (rc); +} + +static int +mbr_ebr_process_ebr(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct extended_part *ext_info, struct mbr_ebr *ebr) +{ + int rc = 0, i, os2lvm; + struct partition *p; + struct dla_table_sector *dlat = NULL; + + /* allocate space for the OS2 DLAT info */ + dlat = kmalloc(node->hardsector_size, GFP_KERNEL); + if (dlat) { + /* read the dlat for this mbr */ + os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat); + + /* walk thru the partition table in the mbr + * processing each partition record. 
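The two different bases used above are the part of EBR chains that is easiest to get wrong: a logical drive's start_sect is relative to the EBR that describes it, while a link entry's start_sect is relative to the start of the extended partition itself. A worked example with illustrative numbers:

/* Extended partition entry in the MBR: start_sect 1000, so the first EBR
 * is read from LBA 1000.
 *
 *   EBR at 1000:  data entry start_sect 63   -> logical drive at 1000 + 63 = 1063
 *                 link entry start_sect 5000 -> next EBR at 1000 + 5000 = 6000
 *   EBR at 6000:  data entry start_sect 63   -> logical drive at 6000 + 63 = 6063
 *                 link entry start_sect 9000 -> next EBR at 1000 + 9000 = 10000
 *
 * i.e. data entries are biased by ext_info->start_sect (the current EBR),
 * link entries by START_SECT(ext_info->extended) (the extended partition),
 * exactly as part_start() and mbr_ebr_process_logical_drive() compute them.
 */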
+ */ + for (i = 0; i < 4; i++) { + p = &ebr->partitions[i]; + rc = mbr_ebr_process_logical_drive(discover_list, + node, + ext_info, + i, p, os2lvm, dlat); + } + } else { + rc = -ENOMEM; + } + + /* free the space used for OS2 DLAT info */ + if (dlat) + kfree(dlat); + + return (rc); +} + +static int +mbr_ebr_probe_for_ebr(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct extended_part *ext_info) +{ + int rc = 0; + u_char *sector_buffer = NULL; + struct mbr_ebr *ebr = NULL; + + /* allocate a sector size buffer */ + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL); + if (sector_buffer) + /* read the location of the mbr sector */ + rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer); + else + rc = -ENOMEM; + + if (!rc) { + ebr = (struct mbr_ebr *) sector_buffer; + if (validate_mbr_ebr(node, ebr, + START_SECT(ext_info->extended), + ext_info->start_sect) == TRUE) + rc = mbr_ebr_process_ebr(discover_list, + node, ext_info, ebr); + } + + if (sector_buffer) + kfree(sector_buffer); + + return (rc); +} + +static int +mbr_ebr_process_extended_partition(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + struct partition *p) +{ + int rc = 0; + struct extended_part ext_info; + + memset(&ext_info, 0, sizeof (ext_info)); + ext_info.done = FALSE; + ext_info.extended = p; + ext_info.next_ebr_start = START_SECT(p); + while (ext_info.done == FALSE) { + ext_info.done = TRUE; /* assume done, unless we find another EBR */ + ext_info.start_sect = ext_info.next_ebr_start; + rc = mbr_ebr_probe_for_ebr(discover_list, node, &ext_info); + } + return rc; +} + +/* + * is_non_dos_extended + * + * This function returns TRUE if the partition entry represents a non-DOS + * extended partition such as UnixWare, Solaris x86 and BSD + */ +static int +is_non_dos_extended(struct evms_logical_node **discover_list, + struct evms_logical_node *node, struct partition *p) +{ + if (NR_SECTS(p)) { +#ifdef CONFIG_BSD_DISKLABEL + if (SYS_IND(p) == BSD_PARTITION || + SYS_IND(p) == NETBSD_PARTITION || + SYS_IND(p) == OPENBSD_PARTITION) + return TRUE; +#endif + +#ifdef CONFIG_UNIXWARE_DISKLABEL + if (SYS_IND(p) == UNIXWARE_PARTITION) + return TRUE; +#endif + +#ifdef CONFIG_SOLARIS_X86_PARTITION + if ((SYS_IND(p) == SOLARIS_X86_PARTITION) && + (solaris_x86_partition(discover_list, node, p, TRUE) > 0)) + return TRUE; +#endif + } + return (FALSE); +} + +/* + * mbr_ebr_process_other_primary_partition + * This function processes other (non-DOS) primary partitions such as + * UnixWare, Solaris x86 and BSD + */ +static int +mbr_ebr_process_other_primary_partition(struct evms_logical_node + **discover_list, + struct evms_logical_node *node, + struct partition *p) +{ + if (NR_SECTS(p)) { +#ifdef CONFIG_BSD_DISKLABEL + if (SYS_IND(p) == BSD_PARTITION || + SYS_IND(p) == NETBSD_PARTITION || + SYS_IND(p) == OPENBSD_PARTITION) + return bsd_disklabel_partition(discover_list, node, p); +#endif + +#ifdef CONFIG_UNIXWARE_DISKLABEL + if (SYS_IND(p) == UNIXWARE_PARTITION) + return unixware_partition(discover_list, node, p); +#endif + +#ifdef CONFIG_SOLARIS_X86_PARTITION + if (SYS_IND(p) == SOLARIS_X86_PARTITION) + return solaris_x86_partition(discover_list, node, p, + FALSE); +#endif + } + return (0); +} + +static int +mbr_ebr_process_dos_primary_partition(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + int i, + struct partition *p, + int os2lvm, struct dla_table_sector *dlat) +{ + int rc = 0; + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name; + + 
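The loop in mbr_ebr_process_extended_partition() above amounts to the following traversal, sketched here with hypothetical read_sector()/emit_logical() helpers and no error handling; the real code instead re-enters mbr_ebr_probe_for_ebr() once per EBR and revalidates every sector it reads:

#if 0	/* illustrative sketch only, not part of the original patch */
	u64 ebr = extended_start;	/* first EBR sits at the extended partition start */

	for (;;) {
		struct mbr_ebr *e = read_sector(ebr);
		u64 next = 0;
		int i;

		for (i = 0; i < 4; i++) {
			struct partition *p = &e->partitions[i];

			if (!NR_SECTS(p))
				continue;
			if (is_extended_partition(p))
				next = extended_start + START_SECT(p);	/* link entry */
			else
				emit_logical(ebr + START_SECT(p), NR_SECTS(p));
		}
		if (!next)
			break;		/* no link entry: end of the chain */
		ebr = next;
	}
#endif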
LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n", + __FUNCTION__, i, START_SECT(p), NR_SECTS(p)); + + if (NR_SECTS(p)) { + + if (is_extended_partition(p)) + rc = mbr_ebr_process_extended_partition(discover_list, + node, p); + + else { + partition_name = NULL; + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR && + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) == + START_SECT(p) + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) == + NR_SECTS(p) + && dlat->DLA_Array[i].Drive_Letter != '\0') { + sprintf(tmp_buf, "os2/%c", + dlat->DLA_Array[i].Drive_Letter); + partition_name = tmp_buf; + } + evmsLOG2(EVMS_INFO_EXTRA, + (print_partition_info(__FUNCTION__, p))); + + rc = mbr_ebr_process_segment(discover_list, + node, + START_SECT(p), + NR_SECTS(p), + p->sys_ind, + i + 1, partition_name); + } + } + return (rc); +} + +static int +mbr_ebr_process_mbr(struct evms_logical_node **discover_list, + struct evms_logical_node *node, struct mbr_ebr *mbr) +{ + int rc = 0, i, os2lvm; + struct partition *p; + struct dla_table_sector *dlat = NULL; + + cur_comp_part_num = 5; /* set this value for each disk */ + + /* allocate space for the OS2 DLAT info */ + dlat = kmalloc(node->hardsector_size, GFP_KERNEL); + if (dlat) { + /* read the dlat for this mbr */ + os2lvm = os2lvm_partition(0, node, dlat); + + /* Pass 1: walk thru the partition table in the mbr + * processing each partition record. + */ + for (i = 0; i < 4; i++) { + p = &mbr->partitions[i]; + if (is_non_dos_extended(discover_list, node, p)) { + LOG_DETAILS + (" Found and skip a non-dos extended partition.\n"); + continue; + } + + mbr_ebr_process_dos_primary_partition(discover_list, + node, + i, + p, os2lvm, dlat); + } + + /* Pass 2: walk thru the partition table in the mbr + * processing each partition record for non-DOS extended partitions + */ + for (i = 0; i < 4; i++) { + p = &mbr->partitions[i]; + mbr_ebr_process_other_primary_partition(discover_list, + node, p); + } + + } else { + rc = -ENOMEM; + } + + /* free the space used for OS2 DLAT info */ + if (dlat) + kfree(dlat); + + return (rc); +} + +static int +mbr_ebr_probe_for_mbr(struct evms_logical_node **discover_list, + struct evms_logical_node *node) +{ + int rc = 0; + u_char *sector_buffer = NULL; + struct mbr_ebr *mbr = NULL; + + LOG_DEBUG("%s: probing (%s).\n", __FUNCTION__, node->name); + + /* allocate a sector size buffer */ + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL); + if (sector_buffer) + /* read the location of the mbr sector */ + rc = INIT_IO(node, 0, 0, 1, sector_buffer); + else + rc = -ENOMEM; + if (rc) { + LOG_ERROR("%s: read error(%d) on '%s'.\n", + __FUNCTION__, rc, node->name); + } else { + mbr = (struct mbr_ebr *) sector_buffer; + if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) { + /* since it looks like this disk has a + * valid MBR, remove the disk node from + * the discover list. it may already be + * on the global list, or it will be + * added to it. in the case of an mbr + * with no partitions, it is simply + * removed and forgotten. when one or + * more partitions are created, the + * disk will be examined and handled + * properly during the following + * rediscover operation. 
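For a disk with two primary partitions, the handoff described above changes the discover list roughly as follows (names illustrative); the raw disk node is consumed and only the exported segments remain visible to later plugins:

/*   discover list before:  hda -> sdc
 *   discover list after:   sdc -> hda1 -> hda2
 *
 * The disk node is removed here, while mbr_ebr_process_segment() appends
 * each exported segment with evms_cs_add_logical_node_to_list(), so
 * subsequent plugins only ever see the partition-level objects.
 */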
+ */ + evms_cs_remove_logical_node_from_list(discover_list, + node); + + rc = mbr_ebr_process_mbr(discover_list, node, mbr); + } + } + + if (sector_buffer) + kfree(sector_buffer); + + return (rc); +} + +/* + * Function: mbr_ebr_partition_discover + * + */ +static int +mbr_ebr_partition_discover(struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *node, *next_node; + + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + + /* initialize global variable */ + exported_nodes = 0; + + /* examine each node on the discover list */ + next_node = *discover_list; + while (next_node) { + node = next_node; + next_node = node->next; + if (node->plugin->id == plugin_header.id) + /* don't recurse into our own objects + */ + continue; + mbr_ebr_probe_for_mbr(discover_list, node); + } + + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n", + __FUNCTION__, exported_nodes, rc); + if (exported_nodes) + rc = exported_nodes; + MOD_DEC_USE_COUNT; + return (rc); +} + +/* + * Function: mbr_ebr_partition_delete + * + */ +static int +mbr_ebr_partition_delete(struct evms_logical_node *segment) +{ + int rc = 0; + struct dos_private *dos_prv; + struct evms_logical_node *empty_disk = NULL; + + LOG_DETAILS("deleting segment '%s'.\n", segment->name); + + if (!segment) { + rc = -ENODEV; + } else { + dos_prv = segment->private; + if (dos_prv) { + /* remove the segment from the + * disk's segment list + */ + rc = remove_segment_from_disk(dos_prv->source_disk, + segment, &empty_disk); + /* free the local instance data */ + kfree(dos_prv); + } + /* free the segment node */ + evms_cs_deallocate_logical_node(segment); + MOD_DEC_USE_COUNT; + /* if the last segment on the disk was + * deleted, delete the disk node too + */ + if (empty_disk) + DELETE(empty_disk); + } + return (rc); +} + +/* + * function: mbr_ebr_partition_io_error + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. + * + */ +static void +mbr_ebr_partition_io_error(struct evms_logical_node *node, + int io_flag, struct buffer_head *bh) +{ + LOG_SERIOUS + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector("PFU64").\n", + (io_flag) ? 
"WRITE" : "READ", node->total_vsectors - 1, node->name, + (u64) bh->b_rsector); + + bh->b_end_io(bh, 0); +} + +/* + * Function: mbr_ebr_partition_read + * + */ +static void +mbr_ebr_partition_read(struct evms_logical_node *partition, + struct buffer_head *bh) +{ + struct dos_private *dos_prv = partition->private; + + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + partition->total_vsectors) { + bh->b_rsector += dos_prv->start_sect; + R_IO(dos_prv->source_disk, bh); + } else + mbr_ebr_partition_io_error(partition, READ, bh); +} + +/* + * Function: mbr_ebr_partition_write + * + */ +static void +mbr_ebr_partition_write(struct evms_logical_node *partition, + struct buffer_head *bh) +{ + struct dos_private *dos_prv = partition->private; + + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + partition->total_vsectors) { + bh->b_rsector += dos_prv->start_sect; + W_IO(dos_prv->source_disk, bh); + } else + mbr_ebr_partition_io_error(partition, WRITE, bh); +} + +/* + * Function: mbr_ebr_partition_init_io + * + */ +static int +mbr_ebr_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + int rc; + struct dos_private *dos_prv = partition->private; + + if ((sect_nr + num_sects) <= partition->total_vsectors) { + rc = INIT_IO(dos_prv->source_disk, io_flag, + sect_nr + dos_prv->start_sect, num_sects, + buf_addr); + } else { + LOG_SERIOUS + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n", + (io_flag) ? "WRITE" : "READ", partition->name, + (dos_prv->nr_sects - 1), sect_nr, num_sects); + rc = -EINVAL; + } + + return (rc); +} + +/* + * Function: mbr_ebr_partition_ioctl + * + */ +static int +mbr_ebr_partition_ioctl(struct evms_logical_node *partition, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + struct dos_private *dos_prv; + struct hd_geometry hd_geo; + int rc; + + rc = 0; + dos_prv = partition->private; + if (!inode) + return -EINVAL; + switch (cmd) { + case HDIO_GETGEO: + { + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg); + if (rc) + break; + if (copy_from_user + (&hd_geo, (void *) arg, + sizeof (struct hd_geometry))) + rc = -EFAULT; + if (rc) + break; + hd_geo.start = dos_prv->start_sect; + if (copy_to_user + ((void *) arg, &hd_geo, + sizeof (struct hd_geometry))) + rc = -EFAULT; + } + break; + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = + (struct evms_get_bmap_pkt *) arg; + bmap->rsector += dos_prv->start_sect; + /* intentionally fall thru to + * default ioctl down to device + * manager. 
+ */ + } + default: + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg); + } + return rc; +} + +/* + * Function: dos_part_init + * + */ +static int __init +dos_part_init(void) +{ + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +static void __exit +dos_part_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(dos_part_init); +module_exit(dos_part_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/evms.c evms-2002-09-30/drivers/evms/evms.c --- linux-2002-09-30/drivers/evms/evms.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/evms.c Thu Sep 26 11:55:45 2002 @@ -0,0 +1,5865 @@ +/* -*- linux-c -*- */ +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * + * linux/drivers/evms/evms.c + * + * EVMS Base and Common Services + * + */ + +#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */ +#define DEVICE_NAME "evms" /* name for messaging */ +#define DEVICE_NO_RANDOM /* no entropy to contribute */ +#define DEVICE_OFF(d) /* do nothing */ + +//#define LOCAL_DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* must be included by all block drivers */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +//#define VFS_PATCH_PRESENT + +/* prefix used in logging messages */ +#define LOG_PREFIX + +struct evms_registered_plugin { + struct evms_plugin_header *plugin; + struct evms_registered_plugin *next; +}; +static struct evms_registered_plugin *registered_plugin_head = NULL; + +static struct evms_list_node *evms_global_device_list = NULL; +static struct evms_list_node *evms_global_feature_node_list = NULL; +static struct evms_list_node *evms_global_notify_list = NULL; + +int evms_info_level = EVMS_INFO_LEVEL; +struct proc_dir_entry *evms_proc_dir = NULL; +EXPORT_SYMBOL(evms_info_level); +static struct evms_logical_volume *evms_logical_volumes; +static int evms_volumes = 0; +/* a few variables to aid in detecting memory leaks. + * these variables are always in use, regardless of + * the state of EVMS_MEM_DEBUG. 
+ */ +static atomic_t evms_allocs = (atomic_t) ATOMIC_INIT(0); +static atomic_t evms_logical_nodes = (atomic_t) ATOMIC_INIT(0); + +u8 *evms_primary_string = "primary"; +EXPORT_SYMBOL(evms_primary_string); +u8 *evms_secondary_string = "secondary"; +EXPORT_SYMBOL(evms_secondary_string); + +static struct evms_version evms_svc_version = { + .major = EVMS_COMMON_SERVICES_MAJOR, + .minor = EVMS_COMMON_SERVICES_MINOR, + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL +}; + +/* Handles for "private" EVMS object pools */ +static struct evms_pool_mgmt *evms_io_notify_pool; + +/* Handles for "public" EVMS object pools */ +struct evms_pool_mgmt *evms_bh_pool; +EXPORT_SYMBOL(evms_bh_pool); + +/* Handle for the devfs directory entry */ +devfs_handle_t evms_dir_devfs_handle; +devfs_handle_t evms_blk_devfs_handle; + +/**********************************************************/ +/* SYSCTL - EVMS folder */ +/**********************************************************/ + +#ifdef CONFIG_PROC_FS +static struct ctl_table_header *evms_table_header; +static int evms_info_level_min = EVMS_INFO_CRITICAL; +static int evms_info_level_max = EVMS_INFO_EVERYTHING; + +static ctl_table evms_table[] = { + {DEV_EVMS_INFO_LEVEL, "evms_info_level", + &evms_info_level, sizeof (int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, + NULL, &evms_info_level_min, &evms_info_level_max}, + {0} +}; + +static ctl_table evms_dir_table[] = { + {DEV_EVMS, "evms", NULL, 0, 0555, evms_table}, + {0} +}; + +static ctl_table dev_dir_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table}, + {0} +}; +#endif + +/**********************************************************/ +/* START -- arch ioctl32 support */ +/**********************************************************/ +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) +#include +#include + +extern asmlinkage long +sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); + +extern int +register_ioctl32_conversion(unsigned int cmd, void *handler); + +extern int +unregister_ioctl32_conversion(unsigned int cmd); + +#define uvirt_to_kernel(__x) ((unsigned long)(__x)) +typedef unsigned int __uvirt_addr; + +struct evms_sector_io32 { + u64 disk_handle; + s32 io_flag; + u64 starting_sector; + u64 sector_count; + __uvirt_addr buffer_address; + s32 status; +}; + +struct evms_rediscover32 { + s32 status; + u32 drive_count; + __uvirt_addr drive_array; +}; + +struct evms_compute_csum32 { + __uvirt_addr buffer_address; + s32 buffer_size; + u32 insum; + u32 outsum; + s32 status; +}; + +struct evms_plugin_ioctl32 { + u32 feature_id; + s32 feature_command; + s32 status; + __uvirt_addr feature_ioctl_data; +}; + +struct evms_notify_bbr32 { + char object_name[EVMS_VOLUME_NAME_SIZE+1]; + u64 count; + u64 start_sect; + u64 nr_sect; + __uvirt_addr buffer; + s32 rw; +}; + +#define EVMS_MD_ID 4 +#define EVMS_MD_PERS_IOCTL_CMD 1 +#define EVMS_MD_ADD 2 +#define EVMS_MD_REMOVE 3 +#define EVMS_MD_ACTIVATE 4 +#define EVMS_MD_DEACTIVATE 5 +#define EVMS_MD_GET_ARRAY_INFO 6 +#define EVMS_MD_RAID5_INIT_IO 1 + +struct evms_md_ioctl { + int mddev_idx; + int cmd; + void *arg; +}; + +struct evms_md_ioctl32 { + u32 mddev_idx; + u32 cmd; + __uvirt_addr arg; +}; + +struct evms_md_array_info { + unsigned long state; + mdp_super_t *sb; +}; + +struct evms_md_array_info32 { + u32 state; + __uvirt_addr sb; +}; + +struct raid5_ioctl_init_io { + int rw; + u64 lsn; + u64 nr_sects; + void *data; +}; + +struct raid5_ioctl_init_io32 { + s32 rw; + u64 lsn; + u64 nr_sects; + __uvirt_addr data; +}; + +#define EVMS_MD_PLUGIN_ID 
((IBM_OEM_ID << 16) | \ + (EVMS_REGION_MANAGER << 12) | EVMS_MD_ID) +#define EVMS_BBR_PLUGIN_ID ((IBM_OEM_ID << 16) | \ + (EVMS_FEATURE << 12) | EVMS_BBR_FEATURE_ID) + +#define EVMS_SECTOR_IO_32 _IOWR(EVMS_MAJOR, \ + EVMS_SECTOR_IO_NUMBER, \ + struct evms_sector_io32) +#define EVMS_REDISCOVER_VOLUMES_32 _IOWR(EVMS_MAJOR, \ + EVMS_REDISCOVER_VOLUMES_NUMBER, \ + struct evms_rediscover32) +#define EVMS_COMPUTE_CSUM_32 _IOWR(EVMS_MAJOR, \ + EVMS_COMPUTE_CSUM_NUMBER, \ + struct evms_compute_csum32) +#define EVMS_PLUGIN_IOCTL_32 _IOR(EVMS_MAJOR, \ + EVMS_PLUGIN_IOCTL_NUMBER, \ + struct evms_plugin_ioctl32) + +static int evms_sector_io(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + struct evms_sector_io32 parms32; + struct evms_sector_io_pkt parms; + unsigned int kcmd; + void *karg; + int rc = 0; + + if (copy_from_user(&parms32, (struct evms_sector_io32 *)arg, + sizeof(struct evms_sector_io32))) + return -EFAULT; + + parms.disk_handle = parms32.disk_handle; + parms.io_flag = parms32.io_flag; + parms.starting_sector = parms32.starting_sector; + parms.sector_count = parms32.sector_count; + parms.buffer_address = (u8 *)uvirt_to_kernel(parms32.buffer_address); + parms.status = 0; + + kcmd = EVMS_SECTOR_IO; + karg = &parms; + + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, kcmd, (unsigned long)karg); + set_fs(old_fs); + + parms32.status = parms.status; + + if (copy_to_user((struct evms_sector_io32 *)arg, &parms32, + sizeof(struct evms_sector_io32))) + return -EFAULT; + + return rc; +} + +static int evms_rediscover(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + struct evms_rediscover32 parms32; + struct evms_rediscover_pkt parms; + unsigned int kcmd; + void *karg; + int rc = 0; + + if (copy_from_user(&parms32, (struct evms_rediscover32 *)arg, + sizeof(struct evms_rediscover32))) + return -EFAULT; + + parms.drive_count = parms32.drive_count; + parms.drive_array = (void *)uvirt_to_kernel(parms32.drive_array); + parms.status = 0; + + kcmd = EVMS_REDISCOVER_VOLUMES; + karg = &parms; + + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, kcmd, (unsigned long)karg); + set_fs(old_fs); + + parms32.status = parms.status; + + if (copy_to_user((struct evms_rediscover32 *)arg, &parms32, + sizeof(struct evms_rediscover32))) + return -EFAULT; + + return rc; +} + +static int evms_compute_csum(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + struct evms_compute_csum32 parms32; + struct evms_compute_csum_pkt parms; + unsigned int kcmd; + void *karg; + int rc = 0; + + if (copy_from_user(&parms32, (struct evms_compute_csum32 *)arg, + sizeof(struct evms_compute_csum32))) + return -EFAULT; + + parms.insum = parms32.insum; + parms.outsum = parms32.outsum; + parms.buffer_size = parms32.buffer_size; + parms.buffer_address = (void *)uvirt_to_kernel(parms32.buffer_address); + parms.status = 0; + + kcmd = EVMS_COMPUTE_CSUM; + karg = &parms; + + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, kcmd, (unsigned long)karg); + set_fs(old_fs); + + parms32.status = parms.status; + parms32.outsum = parms.outsum; + + if (copy_to_user((struct evms_compute_csum32 *)arg, &parms32, + sizeof(struct evms_compute_csum32))) + return -EFAULT; + + return rc; +} + +static int evms_bbr_plugin_ioctl(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + struct evms_notify_bbr32 bbr_parms32; + struct evms_notify_bbr bbr_parms; + struct evms_plugin_ioctl_pkt *parms = + (struct 
evms_plugin_ioctl_pkt *)arg; + void *old_ptr = NULL; + int rc; + + if (copy_from_user(&bbr_parms32, + (struct evms_notify_bbr32 *)parms->feature_ioctl_data, + sizeof(struct evms_notify_bbr32))) + return -EFAULT; + + memcpy(&bbr_parms, &bbr_parms32, sizeof(struct evms_notify_bbr32)); + bbr_parms.buffer = (void *)uvirt_to_kernel(bbr_parms32.buffer); + bbr_parms.rw = bbr_parms32.rw; + old_ptr = parms->feature_ioctl_data; + parms->feature_ioctl_data = &bbr_parms; + + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, cmd, arg); + set_fs(old_fs); + + parms->feature_ioctl_data = old_ptr; + + if (!rc) { + bbr_parms32.nr_sect = bbr_parms.nr_sect; + rc = copy_to_user((struct evms_notify_bbr32 *)parms->feature_ioctl_data, + &bbr_parms32, + sizeof(struct evms_notify_bbr32)); + } + + return rc; +} + +static int evms_md_plugin_ioctl(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + void *old_ptr = NULL; + void *old_md_ptr = NULL; + struct evms_md_ioctl32 md_parms32; + struct evms_md_ioctl md_parms; + struct evms_md_array_info32 md_array_parms32; + struct evms_md_array_info md_array_parms; + struct raid5_ioctl_init_io32 r5_init_io_parms32; + struct raid5_ioctl_init_io r5_init_io_parms; + struct evms_plugin_ioctl_pkt *parms = + (struct evms_plugin_ioctl_pkt *)arg; + int rc; + + if (copy_from_user(&md_parms32, + (struct evms_md_ioctl*)parms->feature_ioctl_data, + sizeof(struct evms_md_ioctl32))) + return -EFAULT; + + md_parms.mddev_idx = md_parms32.mddev_idx; + md_parms.cmd = md_parms32.cmd; + md_parms.arg = (void *)uvirt_to_kernel(md_parms32.arg); + old_ptr = parms->feature_ioctl_data; + parms->feature_ioctl_data = &md_parms; + + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) { + if (copy_from_user(&md_array_parms32, + (struct evms_md_array_info32*)md_parms.arg, + sizeof(struct evms_md_array_info32))) + return -EFAULT; + + md_array_parms.state = md_array_parms32.state; + md_array_parms.sb = + (void *)uvirt_to_kernel(md_array_parms32.sb); + old_md_ptr = (void *)md_parms.arg; + md_parms.arg = &md_array_parms; + } else if (parms->feature_command == EVMS_MD_PERS_IOCTL_CMD) { + if (md_parms.cmd == EVMS_MD_RAID5_INIT_IO) { + if (copy_from_user(&r5_init_io_parms32, + (struct raid5_ioctl_init_io32*)md_parms.arg, + sizeof(struct raid5_ioctl_init_io32))) + return -EFAULT; + + r5_init_io_parms.rw = r5_init_io_parms32.rw; + r5_init_io_parms.lsn = r5_init_io_parms32.lsn; + r5_init_io_parms.nr_sects = r5_init_io_parms32.nr_sects; + r5_init_io_parms.data = + (void *)uvirt_to_kernel(r5_init_io_parms32.data); + old_md_ptr = (void *)md_parms.arg; + md_parms.arg = &r5_init_io_parms; + } + } + + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, cmd, arg); + set_fs(old_fs); + + parms->feature_ioctl_data = old_ptr; + md_parms.arg = old_md_ptr; + + if (!rc) { + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) { + md_array_parms32.state = md_array_parms.state; + rc = copy_to_user((struct evms_md_array_info32 *)md_parms.arg, + &md_array_parms32, + sizeof(struct evms_md_array_info32)); + } + if (!rc) { + md_parms32.mddev_idx = md_parms.mddev_idx; + rc = copy_to_user((struct evms_md_ioctl*)parms->feature_ioctl_data, + &md_parms32, + sizeof(struct evms_md_ioctl32)); + } + } + + return rc; +} + +static int evms_plugin_ioctl(unsigned int fd, + unsigned int cmd, + unsigned long arg) +{ + mm_segment_t old_fs = get_fs(); + struct evms_plugin_ioctl32 parms32; + struct evms_plugin_ioctl_pkt parms; + unsigned int kcmd; + void *karg; + int rc; + + if (copy_from_user(&parms32, (struct evms_plugin_ioctl32 
*)arg, + sizeof(struct evms_plugin_ioctl32))) + return -EFAULT; + + parms.feature_id = parms32.feature_id; + parms.feature_command = parms32.feature_command; + parms.status = parms32.status; + parms.feature_ioctl_data = + (void *)uvirt_to_kernel(parms32.feature_ioctl_data); + + kcmd = EVMS_PLUGIN_IOCTL; + karg = &parms; + + switch (parms.feature_id) { + case EVMS_MD_PLUGIN_ID: + rc = evms_md_plugin_ioctl(fd, kcmd, (unsigned long)karg); + break; + case EVMS_BBR_PLUGIN_ID: + rc = evms_bbr_plugin_ioctl(fd, kcmd, (unsigned long)karg); + break; + default: + set_fs(KERNEL_DS); + rc = sys_ioctl(fd, kcmd, (unsigned long)karg); + set_fs(old_fs); + } + + if (!rc) { + parms32.status = parms.status; + rc = copy_to_user((struct evms_plugin_ioctl32 *)arg, &parms32, + sizeof(struct evms_plugin_ioctl32)); + } + + return rc; +} +#endif + +/**********************************************************/ +/* START -- exported functions/Common Services */ +/**********************************************************/ + +/* + * Function: evms_cs_get_version + * Description: This function returns the current EVMS version + */ +void +evms_cs_get_version(int *major, int *minor) +{ + *major = EVMS_MAJOR_VERSION; + *minor = EVMS_MINOR_VERSION; +} + +EXPORT_SYMBOL(evms_cs_get_version); + +int +evms_cs_check_version(struct evms_version *required, + struct evms_version *actual) +{ + if (required->major != actual->major) + return -EINVAL; + else if (required->minor > actual->minor) + return -EINVAL; + else if (required->minor == actual->minor) + if (required->patchlevel > actual->patchlevel) + return -EINVAL; + return 0; +} + +EXPORT_SYMBOL(evms_cs_check_version); + +int +evms_cs_allocate_logical_node(struct evms_logical_node **pp) +{ + *pp = kmalloc(sizeof (struct evms_logical_node), GFP_KERNEL); + if (*pp) { + memset(*pp, 0, sizeof (struct evms_logical_node)); + atomic_inc(&evms_logical_nodes); + return 0; + } + return -ENOMEM; +} + +EXPORT_SYMBOL(evms_cs_allocate_logical_node); + +void +evms_cs_deallocate_volume_info(struct evms_logical_node *p) +{ + if (p->iflags & EVMS_FEATURE_BOTTOM) { + evms_cs_remove_item_from_list(&evms_global_feature_node_list, + p); + kfree(p->volume_info); + p->volume_info = NULL; + p->iflags &= ~EVMS_FEATURE_BOTTOM; + } +} + +EXPORT_SYMBOL(evms_cs_deallocate_volume_info); + +void +evms_cs_deallocate_logical_node(struct evms_logical_node *p) +{ + if (p->next) { + LOG_SERIOUS + ("Deallocating object whose NEXT ptr is not null!!\n"); + } + evms_cs_deallocate_volume_info(p); + if (p->feature_header) { + kfree(p->feature_header); + p->feature_header = NULL; + } + kfree(p); + atomic_dec(&evms_logical_nodes); +} + +EXPORT_SYMBOL(evms_cs_deallocate_logical_node); + +/* + * Function: evms_cs_register_plugin + * Description: This function is exported so that all plugins can register with EVMS + */ +int +evms_cs_register_plugin(struct evms_plugin_header *plugin) +{ + int rc = 0; + struct evms_registered_plugin *reg_record, **pp; + struct evms_version *ver; + + ver = &plugin->required_services_version; + + LOG_EXTRA + ("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n", + GetPluginOEM(plugin->id), GetPluginType(plugin->id), + GetPluginID(plugin->id), plugin->version.major, + plugin->version.minor, plugin->version.patchlevel, ver->major, + ver->minor, ver->patchlevel); + + /* check common services requirements */ + rc = evms_cs_check_version(ver, &evms_svc_version); + if (rc) { + LOG_SERIOUS + ("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n", + 
EVMS_COMMON_SERVICES_MAJOR, EVMS_COMMON_SERVICES_MINOR, + EVMS_COMMON_SERVICES_PATCHLEVEL); + } + if (!rc) { + /* ensure a plugin with this feature id is + * not already loaded. + */ + for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) { + if ((*pp)->plugin->id == plugin->id) { + rc = -EBUSY; + LOG_ERROR + ("error(%d) attempting to load another plugin with id(%x).\n", + rc, plugin->id); + } + } + } + if (!rc) { + /* ensure the plugin has provided functions for + * the mandatory entry points. + */ + if (!plugin->fops->discover) { + rc = -EINVAL; + } else if (!plugin->fops->init_io) { + rc = -EINVAL; + } else if (!plugin->fops->ioctl) { + rc = -EINVAL; + } else if (!plugin->fops->read) { + rc = -EINVAL; + } else if (!plugin->fops->write) { + rc = -EINVAL; + } else if (!plugin->fops->delete) { + rc = -EINVAL; + } + } + if (!rc) { + /* allocate a new plugin registration record */ + reg_record = + kmalloc(sizeof (struct evms_registered_plugin), GFP_KERNEL); + if (!reg_record) { + rc = -ENOMEM; + } + } + if (!rc) { + memset(reg_record, 0, sizeof (struct evms_registered_plugin)); + /* store ptr to plugin header in new registration record */ + reg_record->plugin = plugin; + + /* terminate the record */ + reg_record->next = NULL; + + /* find end of the plugin registration list */ + for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) ; + /* add registration record to list */ + *pp = reg_record; + + /* increment the usage count */ + MOD_INC_USE_COUNT; + } + + return (rc); +} + +EXPORT_SYMBOL(evms_cs_register_plugin); + +/* + * Function: evms_cs_unregister_plugin + * Description: This function is exported so that all plugins can + * unregister with EVMS + */ +int +evms_cs_unregister_plugin(struct evms_plugin_header *plugin) +{ + int rc = 0, found = FALSE; + struct evms_registered_plugin **pp; + struct evms_version *ver; + + ver = &plugin->required_services_version; + + LOG_EXTRA + ("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n", + GetPluginOEM(plugin->id), GetPluginType(plugin->id), + GetPluginID(plugin->id), plugin->version.major, + plugin->version.minor, plugin->version.patchlevel, ver->major, + ver->minor, ver->patchlevel); + /* ensure a plugin with this feature id is + * currently loaded. + */ + for (pp = ®istered_plugin_head; *pp; pp = &(*pp)->next) { + if ((*pp)->plugin->id == plugin->id) { + found = TRUE; + break; + } + } + if (!found) { + rc = -ENOPKG; + LOG_ERROR + ("error(%d) attempt to unload a non-loaded plugin with id(%x).\n", + rc, plugin->id); + } + /* actually unload the plugin now */ + if (!rc) { + struct evms_registered_plugin *tmp = *pp; + + /* remove the plugin record from our + * internal plugin list + */ + *pp = (*pp)->next; + /* deallocate the plugin registration record + */ + kfree(tmp); + + /* decrement the usage count */ + MOD_DEC_USE_COUNT; + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_unregister_plugin); + +/* function: evms_cs_add_logical_node_to_list + * + * This functions adds a new logical node to the end of a + * node list. + * + * NOTE: This function is only expected to be called at + * discovery time, which is singled threaded by nature, + * and therefore doesn't need to be made SMP safe. 
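The list routines that follow, like lookup_disk()/lookup_segment() in dos_part.c above, all use the same pointer-to-pointer walk, which avoids special-casing the list head. A stand-alone illustration of that idiom (not EVMS code):

struct node {
	int value;
	struct node *next;
};

static void append(struct node **head, struct node *n)
{
	struct node **pp;

	for (pp = head; *pp; pp = &(*pp)->next)
		;			/* advance to the terminating NULL link */
	n->next = NULL;
	*pp = n;			/* *pp is either the head or some ->next */
}

static int remove_node(struct node **head, struct node *n)
{
	struct node **pp;

	for (pp = head; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;	/* unlink without tracking "prev" */
			n->next = NULL;
			return 0;
		}
	}
	return -1;			/* not in the list */
}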
+ */ +int +evms_cs_add_logical_node_to_list(struct evms_logical_node **list_head, + struct evms_logical_node *node) +{ + int rc = 0; + struct evms_logical_node **pp = NULL; + + /* check to make sure node is not already on a list */ + if (node->next) + rc = 1; + else + /* check to make sure node being added is not already in the list */ + for (pp = list_head; *pp; pp = &(*pp)->next) + if (*pp == node) { + rc = 2; + break; + } + + /* add node to the end of the list */ + if (!rc) + *pp = node; + + return (rc); +} + +EXPORT_SYMBOL(evms_cs_add_logical_node_to_list); + +/* function: evms_cs_remove_logical_node_from_list + * + * This functions removes a new logical node from a node list. + * + * NOTE: This function is only expected to be called at + * discovery time, which is singled threaded by nature, + * and therefore doesn't need to be made SMP safe. + */ +int +evms_cs_remove_logical_node_from_list(struct evms_logical_node **list_head, + struct evms_logical_node *node) +{ + /* remove this node from the head of the list */ + int rc = 1; /* assume failure until target node is found */ + struct evms_logical_node **pp; + for (pp = list_head; *pp; pp = &(*pp)->next) + if (*pp == node) { + *pp = (*pp)->next; + node->next = NULL; + rc = 0; + break; + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list); + +int +evms_cs_kernel_ioctl(struct evms_logical_node *node, unsigned int cmd, + unsigned long arg) +{ + int rc = 0; + struct inode tmp_inode; + mm_segment_t fs; + + lock_kernel(); + fs = get_fs(); + set_fs(get_ds()); + rc = IOCTL(node, &tmp_inode, NULL, cmd, arg); + set_fs(fs); + unlock_kernel(); + + return (rc); + +} + +EXPORT_SYMBOL(evms_cs_kernel_ioctl); + +/* + * function: evms_cs_size_in_vsectors + * + * In EVMS a V(irtual)Sector is 512 bytes in size. + * This function computes the number of VSECTORs an specified + * item size would require. + * + * NOTE: This function has been coded to work with 64 bit values. + */ +unsigned long +evms_cs_size_in_vsectors(long long item_size) +{ + long long sectors; + + sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT; + if (item_size & (EVMS_VSECTOR_SIZE - 1)) + sectors++; + + return (sectors); +} + +EXPORT_SYMBOL(evms_cs_size_in_vsectors); + +/* + * function: evms_cs_log2 + * + * this function computes the power of the 2 of specified + * value. If the value is 0, a -1 is returned. If the value + * is NOT a power of 2, a -2 is return. Otherwise the power + * of 2 is returned. + */ +int +evms_cs_log2(long long value) +{ + int result = -1; + long long tmp; + + if (value) { + tmp = value; + result++; + while (!(tmp & 1)) { + result++; + tmp >>= 1; + } + if (tmp != 1) { + result = -2; + } + } + return (result); +} + +EXPORT_SYMBOL(evms_cs_log2); + +/* + * Functions: + * + * build_crc_table() + * calculate_crc() + * + * + * Description: The functions in this module provide a means of calculating + * the 32 bit CRC for a block of data. build_crc_table must + * be called to initialize this module. calculate_crc must + * NOT be used until after build_crc_table has been called. + * Once build_crc_table has been called, calculate_crc can + * be used to calculate the crc of the data residing in a + * user specified buffer. 
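A stand-alone demo of the same table-driven method is sketched below (user-space C, not kernel code). Like evms_cs_calculate_crc(), it performs no final inversion; the caller supplies the 0xFFFFFFFF starting value described in the calculate_crc header further down.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc_table[256];

static void build_table(void)
{
	uint32_t i, j, crc;

	for (i = 0; i < 256; i++) {
		crc = i;
		for (j = 0; j < 8; j++)
			crc = (crc & 1) ? (crc >> 1) ^ 0xEDB88320UL : crc >> 1;
		crc_table[i] = crc;
	}
}

static uint32_t calc_crc(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;

	while (len--)
		crc = (crc >> 8) ^ crc_table[(crc ^ *p++) & 0xff];
	return crc;
}

int main(void)
{
	const char *msg = "123456789";

	build_table();
	/* prints 0x340bc6d9: the familiar CRC-32 check value 0xcbf43926 is
	 * this result XORed with 0xffffffff (the usual final inversion,
	 * which this routine leaves to the caller).
	 */
	printf("crc = 0x%08x\n", calc_crc(0xFFFFFFFF, msg, strlen(msg)));
	return 0;
}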
+ * + */ + +#define CRC_POLYNOMIAL 0xEDB88320L + +static u32 crc_table[256]; +static u32 crc_table_built = FALSE; + +/*********************************************************************/ +/* */ +/* Function Name: build_crc_table */ +/* */ +/* Descriptive Name: This module implements the crc function using */ +/* a table driven method. The required table */ +/* must be setup before the calculate_crc */ +/* function can be used. This table only needs */ +/* to be set up once. This function sets up the */ +/* crc table needed by calculate_crc. */ +/* */ +/* Input: None */ +/* */ +/* Output: None */ +/* */ +/* Error Handling: N/A */ +/* */ +/* Side Effects: The internal crc table is initialized. */ +/* */ +/* Notes: None. */ +/* */ +/*********************************************************************/ +static void +build_crc_table(void) +{ + u32 i, j, crc; + + for (i = 0; i <= 255; i++) { + crc = i; + for (j = 8; j > 0; j--) { + if (crc & 1) + crc = (crc >> 1) ^ CRC_POLYNOMIAL; + else + crc >>= 1; + } + crc_table[i] = crc; + } + crc_table_built = TRUE; +} + +/*********************************************************************/ +/* */ +/* Function Name: calculate_crc */ +/* */ +/* Descriptive Name: This function calculates the crc value for */ +/* the data in the buffer specified by Buffer. */ +/* */ +/* Input: u32 crc : This is the starting crc. If you are */ +/* starting a new crc calculation, then */ +/* this should be set to 0xFFFFFFFF. If */ +/* you are continuing a crc calculation */ +/* (i.e. all of the data did not fit in */ +/* the buffer so you could not calculate */ +/* the crc in a single operation), then */ +/* this is the crc output by the last */ +/* calculate_crc call. */ +/* */ +/* Output: The crc for the data in the buffer, based upon the value*/ +/* of the input parameter crc. */ +/* */ +/* Error Handling: None. */ +/* */ +/* Side Effects: None. */ +/* */ +/* Notes: None. */ +/* */ +/*********************************************************************/ +u32 +evms_cs_calculate_crc(u32 crc, void *buffer, u32 buffersize) +{ + unsigned char *current_byte; + u32 temp1, temp2, i; + + current_byte = (unsigned char *) buffer; + /* Make sure the crc table is available */ + if (crc_table_built == FALSE) + build_crc_table(); + /* Process each byte in the buffer. 
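A caller-side sketch of the continuation behaviour described in the header comment above; EVMS_INITIAL_CRC is assumed to be the 0xFFFFFFFF starting value, and the buffers and sizes are placeholders:

#if 0	/* illustrative only */
	u32 crc = EVMS_INITIAL_CRC;

	crc = evms_cs_calculate_crc(crc, buf1, len1);
	crc = evms_cs_calculate_crc(crc, buf2, len2);
	/* 'crc' now equals a single call over buf1 immediately followed by
	 * buf2.  The zero-the-stored-field, recompute, compare pattern used
	 * by os2lvm_partition() in dos_part.c builds on the same interface.
	 */
#endif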
*/ + for (i = 0; i < buffersize; i++) { + temp1 = (crc >> 8) & 0x00FFFFFF; + temp2 = + crc_table[(crc ^ (u32) * + current_byte) & (u32) 0xff]; + current_byte++; + crc = temp1 ^ temp2; + } + return (crc); +} + +EXPORT_SYMBOL(evms_cs_calculate_crc); + +#define EVMS_ORIGINAL_CALLBACK_FLAG 1<<0 +typedef struct io_notify_s { + unsigned int flags; + void *private; + struct buffer_head *bh; + u64 rsector; + kdev_t rdev; + void *b_private; + void (*callback_function) (struct evms_logical_node * node, + struct buffer_head * bh, + int uptodate, int *redrive); + struct io_notify_s *next; +} io_notify_t; + +struct evms_pool_mgmt * +evms_cs_create_pool(int objsize, + u8 * pool_name, + void (*ctor) (void *, kmem_cache_t *, unsigned long), + void (*dtor) (void *, kmem_cache_t *, unsigned long)) +{ + struct evms_pool_mgmt *pool; + + /* create the pool management structure */ + pool = kmalloc(sizeof (struct evms_pool_mgmt), GFP_KERNEL); + if (!pool) { + LOG_CRITICAL("Cannot create %s fpool mgmt structure", + pool_name); + return NULL; + } + /* initialize various field in pool mgmt structure */ + memset(pool, 0, sizeof (struct evms_pool_mgmt)); + pool->member_size = objsize; + pool->name = pool_name; + pool->waiters = (atomic_t) ATOMIC_INIT(0); + init_waitqueue_head(&pool->wait_queue); + /* go create the pool */ + pool->cachep = kmem_cache_create(pool->name, + pool->member_size, + 0, SLAB_HWCACHE_ALIGN, ctor, dtor); + if (!pool->cachep) + panic("Cannot create %s SLAB cache", pool->name); + return (pool); +} + +EXPORT_SYMBOL(evms_cs_create_pool); + +void * +evms_cs_allocate_from_pool(struct evms_pool_mgmt *pool, int blockable) +{ + void *objp; + + while (1) { + objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO); + if (objp || !blockable) { + return (objp); + } else { + /* block and wait for an object to + * be returned to the pool + */ + atomic_inc(&pool->waiters); + wait_event(pool->wait_queue, + (!atomic_read(&pool->waiters))); + } + } + return (objp); +} + +EXPORT_SYMBOL(evms_cs_allocate_from_pool); + +void +evms_cs_deallocate_to_pool(struct evms_pool_mgmt *pool, void *objp) +{ + kmem_cache_free(pool->cachep, objp); + atomic_set(&pool->waiters, 0); + if (waitqueue_active(&pool->wait_queue)) { + wake_up(&pool->wait_queue); + } +} + +EXPORT_SYMBOL(evms_cs_deallocate_to_pool); + +void +evms_cs_destroy_pool(struct evms_pool_mgmt *pool) +{ + kmem_cache_destroy(pool->cachep); + kfree(pool); +} + +EXPORT_SYMBOL(evms_cs_destroy_pool); + +/* + * function: evms_end_io + * + * This is a support function for + * evms_cs_register_for_end_io_notification. + * This function is called during I/O completion on any buffer + * head that was registered by a plugin. Control is passed here + * and this routine will, thru the use of the I/O notify entry + * stored in the b_private field of the buffer head, restore + * the b_rsector value the buffer head had at the time of + * registration and pass control to the registered callback + * address, with pointers to the buffer head and an optional + * plugin private data. Upon completion of the callback, + * control is returned back here. The io notify list entry + * is deleted. This process repeats until this routine + * detects that all registered plugins have been called back + * and the buffer head's original end_io function has been + * called. At this point the DONE flag is set, and we terminate + * callback loop and exit. + * + * Plugins may desire to break or interrupt the callback + * sequence or chain. 
This may be useful to redrive I/O or + * to wait for other buffer heads to complete before + * allowing the original buffer head callback to occur. + * To interrupt the callback "chain", a registered + * plugin's callback must return with the DONE flag set. + * + * NOTE: If a plugin set the DONE flag, and wishes to redrive + * a buffer head, the plugin MUST reregister the buffer head + * to receive another callback on this buffer head. Also, the + * plugin MUST ensure that the original buffer head end_io + * function get called at some point, either by reregistering + * this buffer head and receiving another callback, or by + * means of buffer head aggregation triggered by the callbacks + * of other buffer heads. + * + */ +static void +evms_end_io(struct buffer_head *bh, int uptodate) +{ + io_notify_t *entry; + int done; + + done = FALSE; + while (!done) { + /* retrieve the io_notify_entry ptr from + * the b_private field in the buffer head. + */ + entry = (io_notify_t *) bh->b_private; + + /* restore the b_private value to + * the previous b_private value (which + * should be a previous io_notify_entry + * or the original b_private pointer). + */ + bh->b_private = entry->b_private; + + /* check for original callback for this bh */ + if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) { + /* this is the original for bh */ + + /* turn off flag marking this as the original */ + entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG; + + /* decrement volume's requests_in_progress var */ + atomic_dec(&evms_logical_volumes[MINOR(bh->b_rdev)]. + requests_in_progress); + + /* restore b_end_io to original value */ + bh->b_end_io = (void *) entry->callback_function; + if (bh->b_end_io) { + /* invoke original callback function + * if it exists. + */ + bh->b_end_io(bh, uptodate); + } + done = TRUE; + } else { + /* this is a plugin callback */ + + /* restore the rsector value to the + * value at the time of callback + * registration. + */ + bh->b_rsector = entry->rsector; + bh->b_rdev = entry->rdev; + /* invoke plugin callback function */ + entry->callback_function(entry->private, bh, uptodate, + &done); + } + /* free the io notify entry */ + evms_cs_deallocate_to_pool(evms_io_notify_pool, entry); + } +} + +/* + * function: evms_cs_register_for_end_io_notification + * + * This function is an evms common service. + * This routine allows a (plugin) function to register to + * participate in the io completion notification process. + * This is useful for plugins which alter data after it + * has been read from the disk (i.e. encryption or + * compression). + * + * This routine also records the rsector value at the time + * of registration, so that it can be restored to that value + * prior to the callback to a plugin, thus allowing that + * plugin to work with the value it had seen during the + * initiating I/O request. + * + * This routine also records a private data pointer at the + * time of registration, and is returned to the plugin + * at callback time. This private data pointer was designed + * to contain context/callback/buffer_head specific data, and + * frees the plugin from having to store and find associated + * data at the time of the callback. This field is not used + * by this function and is optional (NULL if unused). It is + * recorded and returned as a convenience for the plugins. + * + * DANGER!!! - WILL ROBINSON - DANGER!!! + * This routine uses the b_private field in the + * buffer_head structure. If any lower level driver uses this + * field and do NOT restore it, the I/O callback will fail!! 
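+ *
+ * A typical use by a data-transforming plugin looks roughly like the
+ * sketch below (my_xform_cb, my_decode and ctx are illustrative
+ * names only, not part of this interface):
+ *
+ *     static void my_xform_cb(void *private, struct buffer_head *bh,
+ *                             int uptodate, int *redrive)
+ *     {
+ *         if (uptodate)
+ *             my_decode(bh->b_data, bh->b_size, private);
+ *     }
+ *
+ *     in the plugin's read path, before passing the buffer head
+ *     down the stack:
+ *
+ *     evms_cs_register_for_end_io_notification(ctx, bh, my_xform_cb);
+ *
+ * Here ctx is handed back to my_xform_cb through the private
+ * parameter at completion time.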
+ * + * Any plugins writers requiring a field for private storage + * should instead use the private field parameter in this + * function to store their private data. + * + */ + +int +evms_cs_register_for_end_io_notification(void *private, + struct buffer_head *bh, + void *callback_function) +{ + int rc = 0, done; + io_notify_t *new_entry; + + done = FALSE; + while (!done) { + /* allocate a notify entry */ + new_entry = + evms_cs_allocate_from_pool(evms_io_notify_pool, + EVMS_BLOCKABLE); + if (!new_entry) { + schedule(); + continue; + } + + /* initialize notify entry */ + new_entry->private = private; + new_entry->bh = bh; + new_entry->rsector = bh->b_rsector; + new_entry->rdev = bh->b_rdev; + new_entry->b_private = bh->b_private; + new_entry->flags = 0; + + /* is this the first callback for this bh? */ + if (bh->b_end_io != evms_end_io) { + /* yes, first callback */ + new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG; + new_entry->callback_function = (void *) bh->b_end_io; + + /* increment volume's requests_in_progress var */ + atomic_inc(&evms_logical_volumes[MINOR(bh->b_rdev)]. + requests_in_progress); + + /* set b_end_io so we get control */ + bh->b_end_io = evms_end_io; + } else { + /* no, not first callback */ + new_entry->callback_function = callback_function; + done = TRUE; + } + /* set b_private to aid in quick lookup */ + bh->b_private = new_entry; + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_register_for_end_io_notification); + +/* function description: evms_cs_lookup_item_in_list + * + * this function searches for the specified item in the + * specified node list. it returns the address of the + * evms_list_node containing the specified item. + */ +struct evms_list_node ** +evms_cs_lookup_item_in_list(struct evms_list_node **node_list, void *item) +{ + struct evms_list_node **list_node; + + list_node = node_list; + while (*list_node) { + if ((*list_node)->item == item) + break; + list_node = &(*list_node)->next; + } + return (list_node); +} + +EXPORT_SYMBOL(evms_cs_lookup_item_in_list); + +/* function description: evms_add_item_to_list + * + * this function adds an item to the list. the + * node for the new item is added to the end + * of the list. the list is traversed to find the end. + * while the traversal occurs, the list is checked + * for the presence of the specified item. if already + * present in the list, and error code is returned. + */ +/* function description: evms_cs_add_item_to_list + * + * this function adds an item to an item list. + * + * RC == 0 is returned for: + * a successful add of a new item + * + * RC == 1 is returned when: + * the item is already on the list + * + * RC < 0 is returned for an error attempting to add the item. + */ +int +evms_cs_add_item_to_list(struct evms_list_node **list, void *item) +{ + int rc = 0; + struct evms_list_node **list_node, *new_node; + + list_node = evms_cs_lookup_item_in_list(list, item); + if (*list_node == NULL) { + new_node = kmalloc(sizeof (struct evms_list_node), GFP_NOIO); + if (new_node) { + memset(new_node, 0, sizeof (struct evms_list_node)); + new_node->item = item; + *list_node = new_node; + } else { + rc = -ENOMEM; + } + } else { + rc = 1; + LOG_DEBUG + ("warning: attempt to add duplicate item(%p) to list(%p).\n", + item, list); + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_add_item_to_list); + +/* function description: evms_remove_item_from_list + * + * this function removes a specified item from the + * specified list. if the specified item is not + * found in the list, and error is returned. 
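+ *
+ * note that evms_cs_lookup_item_in_list returns the address of the
+ * link pointer that refers to the matching node (or of the
+ * terminating NULL link), which is what lets this routine unlink
+ * the node with a single pointer assignment below.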
+ */ +int +evms_cs_remove_item_from_list(struct evms_list_node **list, void *item) +{ + int rc = 0; + struct evms_list_node **list_node; + + /* check to see if item is in the list */ + list_node = evms_cs_lookup_item_in_list(list, item); + + /* was the node found in the list? */ + if (*list_node) { + /* yes, it was found */ + struct evms_list_node *tmp_node; + + /* save ptr to node being removed */ + tmp_node = *list_node; + /* remove it from the global list */ + *list_node = tmp_node->next; + /* delete removed node */ + kfree(tmp_node); + } else { + /* no, it was not found */ + rc = -1; + LOG_ERROR + ("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n", + rc, item, list); + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_remove_item_from_list); + +/* function description: evms_cs_register_device + * + * this function adds a device to the EVMS global device list. + * + * RC == 0 is returned for: + * a successful add of a new device + * + * RC == 1 is returned when: + * the device is already on the list + * + * RC < 0 is returned for an error attempting to add the device. + */ +int +evms_cs_register_device(struct evms_logical_node *device) +{ + return (evms_cs_add_item_to_list(&evms_global_device_list, device)); +} + +EXPORT_SYMBOL(evms_cs_register_device); + +/* function description: evms_cs_unregister_device + * + * this function removes a device from the EVMS global device list. + * + * RC == 0 is returned for: + * a successful removal of the specified device + * + * RC < 0 is returned for an error attempting to add the device. + * -ENODATA is returned if specified device is not found. + */ +int +evms_cs_unregister_device(struct evms_logical_node *device) +{ + return (evms_cs_remove_item_from_list(&evms_global_device_list, + device)); +} + +EXPORT_SYMBOL(evms_cs_unregister_device); + +static struct evms_list_node *find_first_next_list_node = NULL; +int +evms_cs_find_next_device(struct evms_logical_node *in_device, + struct evms_logical_node **out_device) +{ + int rc = 0; + struct evms_list_node **list_node; + + if (in_device == NULL) + find_first_next_list_node = evms_global_device_list; + else { + list_node = + evms_cs_lookup_item_in_list(&evms_global_device_list, + in_device); + find_first_next_list_node = *list_node; + if (find_first_next_list_node == NULL) + rc = -ENODATA; + else + find_first_next_list_node = + find_first_next_list_node->next; + } + + if (find_first_next_list_node == NULL) + *out_device = NULL; + else + *out_device = (struct evms_logical_node *) + find_first_next_list_node->item; + + return (rc); +} + +EXPORT_SYMBOL(evms_cs_find_next_device); + +void +evms_cs_signal_event(int eventid) +{ + int rc; + struct evms_list_node **list_node; + + /* signal PID(s) of specified event */ + list_node = &evms_global_notify_list; + while (*list_node) { + struct evms_event *event; + + event = (*list_node)->item; + if (event->eventid == eventid) { + struct task_struct *tsk; + + tsk = find_task_by_pid(event->pid); + if (tsk) { + struct siginfo siginfo; + + siginfo.si_signo = event->signo; + siginfo.si_errno = 0; + siginfo.si_code = 0; + rc = send_sig_info(event->signo, &siginfo, tsk); + } else { + /* TODO: + * unregister this stale + * notification record + */ + } + } + list_node = &(*list_node)->next; + } +} + +EXPORT_SYMBOL(evms_cs_signal_event); + +static inline void +evms_flush_signals(void) +{ + spin_lock(¤t->sigmask_lock); + flush_signals(current); + spin_unlock(¤t->sigmask_lock); +} + +static inline void +evms_init_signals(void) +{ + current->exit_signal = 
SIGCHLD; + siginitsetinv(¤t->blocked, sigmask(SIGKILL)); +} + +static int +evms_thread(void *arg) +{ + struct evms_thread *thread = arg; + lock_kernel(); + + /* + * Detach thread + */ + + daemonize(); + + sprintf(current->comm, thread->name); + evms_init_signals(); + evms_flush_signals(); + thread->tsk = current; + + current->policy = SCHED_OTHER; +#ifdef O1_SCHEDULER + set_user_nice(current, -20); +#else + current->nice = -20; +#endif + unlock_kernel(); + + complete(thread->event); + while (thread->run) { + void (*run) (void *data); + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&thread->wqueue, &wait); +#ifdef O1_SCHEDULER + set_current_state(TASK_INTERRUPTIBLE); +#else + set_task_state(current, TASK_INTERRUPTIBLE); +#endif + if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) { + schedule(); + } +#ifdef O1_SCHEDULER + set_current_state(TASK_RUNNING); +#else + current->state = TASK_RUNNING; +#endif + remove_wait_queue(&thread->wqueue, &wait); + clear_bit(EVMS_THREAD_WAKEUP, &thread->flags); + + run = thread->run; + if (run) { + run(thread->data); + run_task_queue(&tq_disk); + } + if (signal_pending(current)) { + evms_flush_signals(); + } + } + complete(thread->event); + return 0; +} + +struct evms_thread * +evms_cs_register_thread(void (*run) (void *), void *data, const u8 * name) +{ + struct evms_thread *thread; + int ret; + struct completion event; + + thread = kmalloc(sizeof (struct evms_thread), GFP_KERNEL); + if (!thread) { + return NULL; + } + memset(thread, 0, sizeof (struct evms_thread)); + init_waitqueue_head(&thread->wqueue); + + init_completion(&event); + thread->event = &event; + thread->run = run; + thread->data = data; + thread->name = name; + ret = kernel_thread(evms_thread, thread, 0); + if (ret < 0) { + kfree(thread); + return NULL; + } + wait_for_completion(&event); + return thread; +} + +EXPORT_SYMBOL(evms_cs_register_thread); + +void +evms_cs_unregister_thread(struct evms_thread *thread) +{ + struct completion event; + + init_completion(&event); + + thread->event = &event; + thread->run = NULL; + thread->name = NULL; + evms_cs_interrupt_thread(thread); + wait_for_completion(&event); + kfree(thread); +} + +EXPORT_SYMBOL(evms_cs_unregister_thread); + +void +evms_cs_wakeup_thread(struct evms_thread *thread) +{ + set_bit(EVMS_THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); +} + +EXPORT_SYMBOL(evms_cs_wakeup_thread); + +void +evms_cs_interrupt_thread(struct evms_thread *thread) +{ + if (!thread->tsk) { + LOG_ERROR("error: attempted to interrupt an invalid thread!\n"); + return; + } + send_sig(SIGKILL, thread->tsk, 1); +} + +EXPORT_SYMBOL(evms_cs_interrupt_thread); + +struct proc_dir_entry * +evms_cs_get_evms_proc_dir(void) +{ +#ifdef CONFIG_PROC_FS + if (!evms_proc_dir) { + evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root); + } +#endif + return (evms_proc_dir); +} + +EXPORT_SYMBOL(evms_cs_get_evms_proc_dir); + +int +evms_cs_volume_request_in_progress(kdev_t dev, + int operation, int *current_count) +{ + int rc = 0; + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[MINOR(dev)]; + if (volume->node) { + if (operation > 0) { + atomic_inc(&volume->requests_in_progress); + } else if (operation < 0) { + atomic_dec(&volume->requests_in_progress); + } + if (current_count) { + *current_count = + atomic_read(&volume->requests_in_progress); + } + } else { + rc = -ENODEV; + } + return (rc); +} + +EXPORT_SYMBOL(evms_cs_volume_request_in_progress); + +void +evms_cs_invalidate_volume(struct evms_logical_node *node) +{ + int i; + for 
(i = 1; i < MAX_EVMS_VOLUMES; i++) { + if (evms_logical_volumes[i].node && node->name) { + if (! + (strcmp + (evms_logical_volumes[i].node->name, + node->name))) { + LOG_DETAILS + ("Invalidating EVMS device %s minor %d\n", + node->name, i); + invalidate_device(MKDEV(EVMS_MAJOR, i), 0); + break; + } + } + } +} + +EXPORT_SYMBOL(evms_cs_invalidate_volume); + +static int +is_open(int minor) +{ + return atomic_read(&evms_logical_volumes[minor].opens); +} + +/**********************************************************/ +/* END -- exported functions/Common Services */ +/**********************************************************/ + +/**********************************************************/ +/* START -- Proc FS Support functions */ +/**********************************************************/ + +#ifdef CONFIG_PROC_FS +static int +evms_info_read_proc(char *page, + char **start, off_t off, int count, int *eof, void *data) +{ + int sz = 0; + char *info_level_text = NULL; + + PROCPRINT("Enterprise Volume Management System: Info\n"); + switch (evms_info_level) { + case EVMS_INFO_CRITICAL: + info_level_text = "critical"; + break; + case EVMS_INFO_SERIOUS: + info_level_text = "serious"; + break; + case EVMS_INFO_ERROR: + info_level_text = "error"; + break; + case EVMS_INFO_WARNING: + info_level_text = "warning"; + break; + case EVMS_INFO_DEFAULT: + info_level_text = "default"; + break; + case EVMS_INFO_DETAILS: + info_level_text = "details"; + break; + case EVMS_INFO_DEBUG: + info_level_text = "debug"; + break; + case EVMS_INFO_EXTRA: + info_level_text = "extra"; + break; + case EVMS_INFO_ENTRY_EXIT: + info_level_text = "entry exit"; + break; + case EVMS_INFO_EVERYTHING: + info_level_text = "everything"; + break; + default: + info_level_text = "unknown"; + break; + } + PROCPRINT("EVMS info level: %d (%s).\n", + evms_info_level, info_level_text); + + PROCPRINT("EVMS kernel version: %d.%d.%d\n", + EVMS_MAJOR_VERSION, + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION); + + PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n", + EVMS_IOCTL_INTERFACE_MAJOR, + EVMS_IOCTL_INTERFACE_MINOR, EVMS_IOCTL_INTERFACE_PATCHLEVEL); + + PROCPRINT("EVMS Common Services version: %d.%d.%d\n", + EVMS_COMMON_SERVICES_MAJOR, + EVMS_COMMON_SERVICES_MINOR, EVMS_COMMON_SERVICES_PATCHLEVEL); + + *eof = 1; + +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? count : sz; +} + +static int +evms_plugins_read_proc(char *page, + char **start, off_t off, int count, int *eof, void *data) +{ + int sz = 0; + struct evms_registered_plugin *rp = NULL; + + PROCPRINT("Enterprise Volume Management System: Plugins\n"); + /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7 */ + /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0 */ + PROCPRINT(" ---------Plugin---------- required services\n"); + PROCPRINT(" ----id---- version version\n\n"); + for (rp = registered_plugin_head; rp; rp = rp->next) { + PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n", + GetPluginOEM(rp->plugin->id), + GetPluginType(rp->plugin->id), + GetPluginID(rp->plugin->id), + rp->plugin->version.major, + rp->plugin->version.minor, + rp->plugin->version.patchlevel, + rp->plugin->required_services_version.major, + rp->plugin->required_services_version.minor, + rp->plugin->required_services_version.patchlevel); + } + +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? 
count : sz; +} + +static int +evms_volumes_read_proc(char *page, + char **start, off_t off, int count, int *eof, void *data) +{ + int sz = 0, j; + + PROCPRINT("Enterprise Volume Management System: Volumes\n"); + PROCPRINT("major minor #blocks type flags name\n\n"); + for (j = 1; j < MAX_EVMS_VOLUMES; j++) { + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[j]; + if (volume->node) { + PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n", + EVMS_MAJOR, j, + (long long)volume->node->total_vsectors >> 1, + (volume-> + flags & EVMS_VOLUME_FLAG) ? "evms " : + "compat", + (volume-> + flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw", + (volume-> + flags & EVMS_VOLUME_PARTIAL) ? "p " : " ", + EVMS_DEV_NODE_PATH, volume->name); + } + } +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? count : sz; + +} +#endif + +/**********************************************************/ +/* END -- Proc FS Support functions */ +/**********************************************************/ + +/**********************************************************/ +/* START -- FOPS functions definitions */ +/**********************************************************/ + +/************************************************/ +/* START -- IOCTL commands -- EVMS specific */ +/************************************************/ + +static int +evms_ioctl_cmd_get_ioctl_version(void *arg) +{ + int rc = 0; + struct evms_version ver; + + ver.major = EVMS_IOCTL_INTERFACE_MAJOR; + ver.minor = EVMS_IOCTL_INTERFACE_MINOR; + ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL; + + /* copy info to userspace */ + if (copy_to_user(arg, &ver, sizeof (ver))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_get_version(void *arg) +{ + int rc = 0; + struct evms_version ver; + + ver.major = EVMS_MAJOR_VERSION; + ver.minor = EVMS_MINOR_VERSION; + ver.patchlevel = EVMS_PATCHLEVEL_VERSION; + + /* copy info to userspace */ + if (copy_to_user(arg, &ver, sizeof (ver))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_get_info_level(void *arg) +{ + int rc = 0; + + /* copy info to userspace */ + if (copy_to_user(arg, &evms_info_level, sizeof (evms_info_level))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_set_info_level(void *arg) +{ + int temp, rc = 0; + + /* copy info from userspace */ + if (copy_from_user(&temp, arg, sizeof (temp))) + rc = -EFAULT; + else + evms_info_level = temp; + + return (rc); +} + +/* function: evms_quiesce_volume + * + * this function performs the actual quiesce operation on + * a volume in kernel memory. + * + * when quiescing, all new I/Os to a volume are stopped, + * causing the calling thread to block. this thread then + * waits until all I/Os in progress are completed, before + * return control to the caller. + * + * when unquiescing, all new I/Os are allowed to proceed + * unencumbered, and all threads waiting (blocked) on this + * volume, are woken up and allowed to proceed. + * + */ +static int +evms_quiesce_volume(struct evms_logical_volume *volume, + struct inode *inode, + struct file *file, struct evms_quiesce_vol_pkt *qv) +{ + int rc; + + LOG_DEBUG("%squiescing %s.\n", + ((qv->command) ? "" : "un"), volume->name); + +#ifdef VFS_PATCH_PRESENT + if (qv->do_vfs) { + /* VFS function call to sync and lock the filesystem */ + fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor)); + volume->vfs_quiesced = TRUE; + } +#endif + volume->quiesced = qv->command; + + /* Command specified was "quiesce". 
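+ * Setting volume->quiesced above makes the request path
+ * (evms_handle_request) hold any new I/O for this minor on
+ * volume->wait_queue; the unquiesce path further down wakes those
+ * waiters back up.  With new requests held off, quiescing only has
+ * to wait for requests_in_progress to drain to zero.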
*/ + if (qv->command) { + /* After setting the volume to + * a quiesced state, there could + * be threads (on SMP systems) + * that are executing in the + * function, evms_handle_request, + * between the "wait_event" and the + * "atomic_inc" lines. We need to + * provide a "delay" sufficient + * to allow those threads to + * to reach the atomic_inc's + * before executing the while loop + * below. The "schedule" call should + * provide this. + */ + schedule(); + /* wait for outstanding requests + * to complete + */ + while (atomic_read(&volume->requests_in_progress) > 0) + schedule(); + } + /* send this command down the stack so lower */ + /* layers can know about this */ + rc = IOCTL(volume->node, inode, file, + EVMS_QUIESCE_VOLUME, (unsigned long) qv); + if (!rc) { + /* Command specified was "unquiesce". */ + if (!qv->command) { + /* "wakeup" any I/O requests waiting on + * this volume. + */ + if (waitqueue_active(&volume->wait_queue)) + wake_up(&volume->wait_queue); +#ifdef VFS_PATCH_PRESENT + if (volume->vfs_quiesced) { + /* VFS function call to unlock the filesystem */ + unlockfs(MKDEV(EVMS_MAJOR, qv->minor)); + volume->vfs_quiesced = FALSE; + } +#endif + } + } else { + LOG_ERROR("error(%d) %squiescing %s.\n", + rc, ((qv->command) ? "" : "un"), volume->name); + } + return (rc); +} + +/* function: evms_delete_volume + * + * this function performs the actual delete operation on + * a volume to purge it from kernel memory. all structures + * and memory consumed by this volume will be free as well + * as clearing or unregistering any system services or + * global data arrays. + * + * NOTE: this function will return -EBUSY on attempts to + * delete mounted volumes. + * + */ +static int +evms_delete_volume(struct evms_logical_volume *volume, + struct evms_delete_vol_pkt *dv) +{ + int rc = 0; + + /* if this is a "permament" delete */ + /* check to make sure volume is not mounted */ + if (dv->command) { + if (is_open(dv->minor)) { + rc = -EBUSY; + } else { + // invalidate the device since it is not coming back + // this is required incase we are re-using the minor number + invalidate_device(MKDEV(EVMS_MAJOR, dv->minor), 1); + } + } + + /* invoke the delete ioctl at the top of the feature stack */ + if (!rc) { + LOG_DETAILS("deleting '%s'.\n", volume->name); + rc = DELETE(volume->node); + } + + /* the volume has been deleted, do any clean up work + * required. + */ + if (!rc) { + devfs_unregister(volume->devfs_handle); + if (dv->command) { + /* if "permanent" delete, free the name + * and NULL the name field. + */ + kfree(volume->name); + volume->name = NULL; + volume->flags = 0; + } else { + /* if "soft" delete, leave the name so + * we can use it to reassign the same + * minor to this volume after a + * rediscovery. + */ + volume->flags = EVMS_VOLUME_SOFT_DELETED; + } + volume->node = NULL; + set_device_ro(MKDEV(EVMS_MAJOR, dv->minor), 0); + blk_size[EVMS_MAJOR][dv->minor] = 0; + blksize_size[EVMS_MAJOR][dv->minor] = 0; + hardsect_size[EVMS_MAJOR][dv->minor] = 0; + evms_volumes--; + } else { + LOG_ERROR("error(%d) %s deleting %s.\n", + rc, ((dv->command) ? "hard" : "soft"), volume->name); + } + return (rc); +} + +/* function: evms_user_delete_volume + * + * this function, depending on the parameters, performs + * a "soft" or a "hard" delete. for a "soft" delete, a + * quiesce & delete request is queued up, to be executed + * at the beginning of the next rediscovery. for a + * "hard" delete, the target volume is quiesced and then + * deleted. 
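+ * in outline, the "hard" delete path is:
+ *
+ *     quiesce the associative volume (when one is specified)
+ *     quiesce the target volume
+ *     delete the target volume
+ *     unquiesce the associative volume (when it was quiesced above)
+ *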
if there is any errors attempting to delete + * the target, then the target is unquiesced. if an + * associative volume is specified it is quiesced before + * the target volume is quiesced, and is unquiesced + * after the attempt to delete the target volume. + * + */ +static int +evms_user_delete_volume(struct evms_logical_volume *lvt, + struct inode *inode, + struct file *file, struct evms_delete_vol_pkt *dv) +{ + int rc = 0; + + if (!dv->command) { + /* "soft delete" requested */ + lvt->flags |= (EVMS_REQUESTED_QUIESCE | EVMS_REQUESTED_DELETE); + if (dv->do_vfs) { + lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE; + } + } else { + /* "hard delete" requested */ + int qa = FALSE; + struct evms_quiesce_vol_pkt qv; + struct evms_logical_volume *lva = NULL; + + if (dv->associative_minor) { + /* associative volume specified + * + * quiesce it + */ + lva = &evms_logical_volumes[dv->associative_minor]; + /* quiesce associative volume */ + qv.command = EVMS_QUIESCE; + qv.do_vfs = EVMS_VFS_DO_NOTHING; + qv.minor = dv->associative_minor; + rc = evms_quiesce_volume(lva, inode, file, &qv); + qa = (rc) ? FALSE : TRUE; + } + if (!rc) { + /* quiesce target volume */ + qv.command = EVMS_QUIESCE; + qv.do_vfs = EVMS_VFS_DO_NOTHING; + qv.minor = dv->minor; + rc = evms_quiesce_volume(lvt, inode, file, &qv); + } + if (!rc) { + /* delete the target volume */ + rc = evms_delete_volume(lvt, dv); + if (rc) { + /* got an error undeleting... + * + * unquiesce the target + */ + qv.command = EVMS_UNQUIESCE; + qv.do_vfs = EVMS_VFS_DO_NOTHING; + qv.minor = dv->minor; + evms_quiesce_volume(lvt, inode, file, &qv); + } + } + if (dv->associative_minor) { + /* associative volume specified + * + * unquiesce it + */ + if (qa) { + /* only unquiesce associative + * if we successfully quiesced + * it previously. + */ + qv.command = EVMS_UNQUIESCE; + qv.do_vfs = EVMS_VFS_DO_NOTHING; + qv.minor = dv->associative_minor; + evms_quiesce_volume(lva, inode, file, &qv); + } + } + } + return (rc); +} + +/* function: evms_ioctl_cmd_delete_volume + * + * this function copy user data to/from the kernel, and + * validates user parameters. after validation, control + * is passed to worker routine evms_user_delete_volume. + * + */ +static int +evms_ioctl_cmd_delete_volume(struct inode *inode, + struct file *file, unsigned long arg) +{ + int rc = 0; + struct evms_delete_vol_pkt tmp, *user_parms; + struct evms_logical_volume *volume = NULL; + + user_parms = (struct evms_delete_vol_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + /* check to make sure associative minor is in use */ + if (!rc) { + if (tmp.associative_minor) { + volume = &evms_logical_volumes[tmp.associative_minor]; + if (volume->node == NULL) + rc = -ENXIO; + } + } + /* check to make sure target minor is in use */ + if (!rc) { + volume = &evms_logical_volumes[tmp.minor]; + if (volume->node == NULL) + rc = -ENXIO; + else + rc = evms_user_delete_volume(volume, inode, file, &tmp); + } + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +/* function: evms_full_rediscover_prep + * + * this function helps to prevent problems when evms is + * configured with the base built in statically and some + * plugins built as modules. + * + * in these cases, when the initial discovery is done, + * only the statically built modules are available for + * volume construction. 
as a result, some volumes that + * require the plugins built as modules (which haven't + * been loaded), to be fully reconstructed, may come up + * as compatibility volumes or partial volumes. + * + * when parts of evms are built as modules, the + * evms_rediscover_pkty utility is used, to perform a secondary + * rediscover, after all the plugins built as modules + * have been loaded, to construct all the volumes + * requiring these plugins. + * + * however since some of the volumes, requiring the plugins + * built as modules, may have been already exported as + * compatibility or partial volumes, we need to purge these + * volumes from kernel's memory, so that can be rediscovered + * and claimed by the appropriate plugins, and reconstructed + * into the correct volumes. + * + * this function purges all compatibility volumes that are + * not in use(mounted) and all partial volumes, prior to + * doing the secondary rediscover, thus allowing volumes to + * rediscovered correctly. + * + * NOTE: again, this is only required in cases when a + * combination of plugins are built statically and as + * modules. + * + */ +static void +evms_full_rediscover_prep(struct inode *inode, struct file *file) +{ + int rc = 0, i; + + LOG_DETAILS("%s: started.\n", __FUNCTION__); + /* check for acceptable volumes to be deleted */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_logical_volume *volume = NULL; + struct evms_delete_vol_pkt dv; + int volume_open, doit; + + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + volume_open = is_open(i); + /* only proceed on volumes that are: + * partial volumes + * OR + * unopened compatibility volumes + */ + doit = FALSE; + if (volume->flags & EVMS_VOLUME_PARTIAL) { + /* do all partial volumes + */ + doit = TRUE; + } else if (!(volume->flags & EVMS_VOLUME_FLAG)) { + /* check all compatibility volumes + */ + if (!volume_open && !is_swap_partition(MKDEV(EVMS_MAJOR, i))) { + /* only do unopened volumes + */ + doit = TRUE; + } + } + if (doit == FALSE) { + continue; + } + /* delete the volume from memory. + * do a 'soft' delete if volume + * is mounted, and 'hard' delete + * if it is not. + * + * NOTE: the delete operation will + * clear the bits in the flags field. + */ + dv.command = (volume_open) ? + EVMS_SOFT_DELETE : EVMS_HARD_DELETE; + dv.minor = i; + dv.associative_minor = 0; + dv.status = 0; + rc = evms_user_delete_volume(volume, inode, file, &dv); + } + LOG_DETAILS("%s: completed.\n", __FUNCTION__); +} + +static int +evms_ioctl_cmd_rediscover_volumes(struct inode *inode, + struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc, i; + struct evms_rediscover_pkt tmp, *user_parms; + u64 *array_ptr = NULL; + ulong array_size = 0; + struct evms_logical_volume *volume = NULL; + + rc = tmp.drive_count = 0; + user_parms = (struct evms_rediscover_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (tmp.drive_count == REDISCOVER_ALL_DEVICES) { + evms_full_rediscover_prep(inode, file); + } + /* quiesce all queued volumes */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_quiesce_vol_pkt qv; + + volume = &evms_logical_volumes[i]; + if (!volume->node) { + continue; + } + if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) { + continue; + } + qv.command = EVMS_QUIESCE; + qv.minor = i; + qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ? 
+ EVMS_VFS_DO : EVMS_VFS_DO_NOTHING, qv.status = 0; + rc = evms_quiesce_volume(volume, inode, file, &qv); + } + /* "soft" delete all queued volumes */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_delete_vol_pkt dv; + + volume = &evms_logical_volumes[i]; + if (!volume->node) { + continue; + } + if (!(volume->flags & EVMS_REQUESTED_DELETE)) { + continue; + } + dv.command = EVMS_SOFT_DELETE; + dv.minor = i; + dv.associative_minor = 0; + dv.status = 0; + rc = evms_delete_volume(volume, &dv); + } + + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES)) { + if (!rc) { + /* create space for userspace drive array */ + array_size = + sizeof (*tmp.drive_array) * tmp.drive_count; + array_ptr = tmp.drive_array; + tmp.drive_array = kmalloc(array_size, GFP_KERNEL); + if (!tmp.drive_array) { + rc = -ENOMEM; + } + } + if (!rc) + /* copy rediscover drive array to kernel space */ + if (copy_from_user + (tmp.drive_array, array_ptr, array_size)) + rc = -EFAULT; + } + + if (!rc) { + static int evms_discover_volumes(struct evms_rediscover_pkt *); + /* perform the rediscovery operation */ + rc = evms_discover_volumes(&tmp); + } + + /* clean up after operation */ + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES)) + kfree(tmp.drive_array); + + /* set return code and copy info to userspace */ + tmp.status = rc; + if (copy_to_user(&user_parms->status, &tmp.status, sizeof (tmp.status))) + rc = -EFAULT; + + return (rc); +} + +static struct evms_list_node *user_disk_ptr; +static int +evms_ioctl_cmd_get_logical_disk(void *arg) +{ + int rc = 0; + struct evms_user_disk_pkt tmp, *user_parms; + + user_parms = (struct evms_user_disk_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user + (&tmp.command, &user_parms->command, sizeof (tmp.command))) + rc = -EFAULT; + + if (!rc) { + if (tmp.command == EVMS_FIRST_DISK) + user_disk_ptr = evms_global_device_list; + else /* tmp.command == EVMS_NEXT_DISK */ + user_disk_ptr = user_disk_ptr->next; + + if (user_disk_ptr == NULL) + tmp.status = EVMS_DISK_INVALID; + else { + tmp.status = EVMS_DISK_VALID; + tmp.disk_handle = + NODE_TO_DEV_HANDLE(user_disk_ptr->item); + } + /* copy info to userspace */ + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +static int +evms_ioctl_cmd_get_logical_disk_info(void *arg) +{ + int rc = 0; + struct evms_user_disk_info_pkt tmp, *user_parms; + struct evms_list_node *p; + struct evms_logical_node *disk_node = NULL; + + user_parms = (struct evms_user_disk_info_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user + (&tmp.disk_handle, &user_parms->disk_handle, + sizeof (tmp.disk_handle))) + rc = -EFAULT; + + /* check handle for validity */ + if (!rc) { + rc = -EINVAL; + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle); + for (p = evms_global_device_list; p; p = p->next) + if (p->item == disk_node) { + rc = 0; + user_disk_ptr = p; + break; + } + } + + /* populate kernel copy of user's structure with appropriate info */ + if (!rc) { + struct hd_geometry geo; + struct evms_logical_node *node = + (struct evms_logical_node *) user_disk_ptr->item; + tmp.flags = node->flags; + strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH); + strcat(tmp.disk_name, node->name); + rc = evms_cs_kernel_ioctl(node, EVMS_UPDATE_DEVICE_INFO, + (ulong) NULL); + if (!rc) { + tmp.total_sectors = node->total_vsectors; + tmp.hardsect_size = node->hardsector_size; + tmp.block_size = node->block_size; + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, + (unsigned 
long) &geo); + } + if (!rc) { + tmp.geo_sectors = geo.sectors; + tmp.geo_heads = geo.heads; + tmp.geo_cylinders = geo.cylinders; + } + } + + /* set return code and copy info to userspace */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_sector_io(void *arg) +{ + int rc; +#define MAX_IO_SIZE 128 + u64 io_size, max_io_size = MAX_IO_SIZE; +#undef MAX_IO_SIZE + struct evms_sector_io_pkt tmp, *user_parms; + struct evms_logical_node *disk_node = NULL; + struct evms_list_node *list_node; + unsigned char *io_buffer; + + rc = 0; + list_node = NULL; + io_buffer = NULL; + + user_parms = (struct evms_sector_io_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + /* check handle for validity */ + if (!rc) { + rc = -EINVAL; + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle); + for (list_node = evms_global_device_list; list_node; + list_node = list_node->next) + if (list_node->item == disk_node) { + rc = 0; + break; + } + } + if (!rc) { + int done; + /* allocate a io buffer upto 64Kbytes in size */ + if (tmp.sector_count < max_io_size) + max_io_size = tmp.sector_count; + do { + done = TRUE; + /* allocate buffer large enough to max_io_size sectors */ + io_buffer = + kmalloc(max_io_size << EVMS_VSECTOR_SIZE_SHIFT, + GFP_KERNEL); + if (!io_buffer) { + max_io_size >>= 1; + if (!max_io_size) { + rc = -ENOMEM; + } else { + done = FALSE; + } + } + } while (!done); + } + /* perform io with specified disk */ + if (!rc) { + u64 io_sector_offset, io_remaining; + u64 io_bytes; + u_char *user_buffer_ptr; + + io_remaining = tmp.sector_count; + io_sector_offset = 0; + user_buffer_ptr = tmp.buffer_address; + while (io_remaining) { + /* compute the io_size for this pass */ + io_size = (io_remaining >= max_io_size) ? 
+ max_io_size : io_remaining; + + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT; + /* for writes, copy a sector from user to kernel */ + if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) { + /* copy sector from user data buffer */ + if (copy_from_user(io_buffer, + user_buffer_ptr, io_bytes)) + rc = -EFAULT; + } + if (rc) + break; + + /* perform IO one sector at a time */ + rc = INIT_IO(disk_node, + tmp.io_flag, + io_sector_offset + tmp.starting_sector, + io_size, io_buffer); + + if (rc) + break; + + if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) { + /* copy sector to user data buffer */ + if (copy_to_user(user_buffer_ptr, + io_buffer, io_bytes)) + rc = -EFAULT; + } + if (rc) + break; + + user_buffer_ptr += io_bytes; + tmp.buffer_address += io_bytes; + io_sector_offset += io_size; + io_remaining -= io_size; + } + } + + /* if the sector_buffer was allocated, free it */ + if (io_buffer) + kfree(io_buffer); + + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +static int user_minor; +static int +evms_ioctl_cmd_get_minor(void *arg) +{ + int rc = 0; + struct evms_user_minor_pkt tmp, *user_parms; + + user_parms = (struct evms_user_minor_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user + (&tmp.command, &user_parms->command, sizeof (tmp.command))) + rc = -EFAULT; + + if (!rc) { + if (tmp.command == EVMS_FIRST_VOLUME) + user_minor = 1; + else /* tmp.command == EVMS_NEXT_VOLUME */ + user_minor++; + + tmp.status = EVMS_VOLUME_INVALID; + for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) { + struct evms_logical_volume *lv; + + lv = &evms_logical_volumes[user_minor]; + /* see if any corrupt volumes have been + * unmounted. If so, clean up the + * evms_logical_volumes array entry, and + * don't report the volume to the user. + */ + if (lv->flags & EVMS_VOLUME_CORRUPT) { + if (!is_open(user_minor)) { + /* clear logical volume structure + * for this volume so it may be + * reused. + */ + LOG_WARNING + ("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n", + ((lv-> + flags & EVMS_VOLUME_SOFT_DELETED) + ? 
"'soft deleted'" : ""), + EVMS_MAJOR, user_minor, lv->name); + LOG_WARNING + (" releasing minor(%d) used by volume(%s)!\n", + user_minor, lv->name); + kfree(lv->name); + lv->name = NULL; + lv->flags = 0; + } + } + if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) { + tmp.status = EVMS_VOLUME_VALID; + tmp.minor = user_minor; + break; + } + } + + /* copy info to userspace */ + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +static int +evms_ioctl_cmd_get_volume_data(void *arg) +{ + int rc = 0; + struct evms_volume_data_pkt tmp, *user_parms; + struct evms_logical_volume *volume = NULL; + struct evms_logical_node *node = NULL; + + user_parms = (struct evms_volume_data_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + volume = &evms_logical_volumes[tmp.minor]; + node = volume->node; + if (node == NULL) + rc = -ENODEV; + } + if (!rc) { + tmp.flags = volume->flags; + strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH); + strcat(tmp.volume_name, volume->name); + } + + /* copy return code and info to userspace */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + return (rc); +} + +static struct evms_registered_plugin *ioctl_reg_record; +static int +evms_ioctl_cmd_get_plugin(void *arg) +{ + int rc = 0; + struct evms_kernel_plugin_pkt tmp, *user_parms; + + user_parms = (struct evms_kernel_plugin_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user + (&tmp.command, &user_parms->command, sizeof (tmp.command))) + rc = -EFAULT; + + if (!rc) { + /* if the command is not 0, then verify + * that ioctl_reg_record is pointing to + * current and valid plugin header. + */ + if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */ + struct evms_registered_plugin *tmp_reg_record; + tmp_reg_record = registered_plugin_head; + /* search the current plugin list */ + while (tmp_reg_record) { + if (tmp_reg_record == ioctl_reg_record) + break; + tmp_reg_record = tmp_reg_record->next; + } + /* if the ioctl_reg_record is not in the + * current list, then start at the beginning. 
+ */ + if (!tmp_reg_record) + tmp.command = EVMS_FIRST_PLUGIN; + } + + if (tmp.command == EVMS_FIRST_PLUGIN) + /* start at beginning of plugin list */ + ioctl_reg_record = registered_plugin_head; + else /* tmp.command == EVMS_NEXT_PLUGIN */ + /* continue from current position in list */ + ioctl_reg_record = ioctl_reg_record->next; + + tmp.status = EVMS_PLUGIN_INVALID; + tmp.id = 0; + if (ioctl_reg_record) { + tmp.id = ioctl_reg_record->plugin->id; + tmp.version = ioctl_reg_record->plugin->version; + tmp.status = EVMS_PLUGIN_VALID; + } + + /* copy info to userspace */ + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +static int +evms_ioctl_cmd_plugin_ioctl(struct inode *inode, + struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc = 0, found = FALSE; + struct evms_plugin_ioctl_pkt tmp, *user_parms; + struct evms_registered_plugin *p; + + user_parms = (struct evms_plugin_ioctl_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + /* search for the specified plugin */ + for (p = registered_plugin_head; p; p = p->next) + /* check for the specified feature id */ + if (p->plugin->id == tmp.feature_id) { + found = TRUE; + /* check that entry point is used */ + if (p->plugin->fops->direct_ioctl) + rc = DIRECT_IOCTL(p, inode, file, cmd, + arg); + else + rc = -ENOSYS; + break; + } + /* was the specified plugin found? */ + if (found == FALSE) + rc = -ENOPKG; + + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +#define MAX_BUFFER_SIZE 65536 +static int +evms_ioctl_cmd_kernel_partial_csum(void *arg) +{ + int rc = 0; + u64 compute_size = MAX_BUFFER_SIZE; + struct evms_compute_csum_pkt tmp, *user_parms; + unsigned char *buffer = NULL; + + user_parms = (struct evms_compute_csum_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + /* allocate a io buffer upto 64Kbytes in size */ + if (tmp.buffer_size < MAX_BUFFER_SIZE) + compute_size = tmp.buffer_size; + + /* allocate buffer large enough to hold a single sector */ + buffer = kmalloc(compute_size, GFP_KERNEL); + if (!buffer) { + rc = -ENOMEM; + } + } + /* perform io with specified disk */ + if (!rc) { + u64 remaining_bytes; + u_char *user_buffer_ptr; + unsigned int insum = tmp.insum; + + remaining_bytes = tmp.buffer_size; + user_buffer_ptr = tmp.buffer_address; + while (remaining_bytes) { + /* compute the compute_size for this pass */ + compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ? 
+ MAX_BUFFER_SIZE : remaining_bytes; + + /* copy into kernel from user data buffer */ + if (copy_from_user(buffer, user_buffer_ptr, + compute_size)) + rc = -EFAULT; + if (rc) + break; + /* compute the checksum for this pass */ + tmp.outsum = csum_partial(buffer, tmp.buffer_size, + insum); + /* set up for another possible pass */ + insum = tmp.outsum; + /* update loop progress variables */ + user_buffer_ptr += compute_size; + tmp.buffer_address += compute_size; + remaining_bytes -= compute_size; + } + } + + /* if the sector_buffer was allocated, free it */ + if (buffer) + kfree(buffer); + + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +#undef MAX_BUFFER_SIZE + +static int +evms_ioctl_cmd_get_bmap(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct evms_get_bmap_pkt tmp, *user_parms; + + user_parms = (struct evms_get_bmap_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + /* pass the ioctl down the volume stack */ + if (!rc) { + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[MINOR(inode->i_rdev)]; + rc = IOCTL(volume->node, inode, file, cmd, + (unsigned long) &tmp); + } + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_process_notify_event(unsigned long arg) +{ + int rc = 0, found = FALSE; + struct evms_notify_pkt tmp, *user_parms; + struct evms_list_node **list_node = NULL; + struct evms_event *event = NULL; + + user_parms = (struct evms_notify_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + /* check to see if PID has already been registered + * for this event. 
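+ * registrations are kept on evms_global_notify_list as
+ * (pid, eventid, signo) tuples; evms_cs_signal_event walks that
+ * list and sends the recorded signal to every pid registered for
+ * the event being raised.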
+ */ + if (!rc) { + list_node = &evms_global_notify_list; + while (*list_node) { + event = (*list_node)->item; + if ((event->pid == tmp.eventry.pid) && + (event->eventid == tmp.eventry.eventid)) { + found = TRUE; + break; + } + list_node = &(*list_node)->next; + } + } + if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */ + /* registration code */ + if (found) { + rc = -EBUSY; + LOG_ERROR + ("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n", + rc, tmp.eventry.pid, tmp.eventry.signo, + tmp.eventry.eventid); + } else { + /* register this pid/event type */ + event = kmalloc(sizeof (struct evms_event), GFP_KERNEL); + if (!event) { + rc = -ENOMEM; + LOG_ERROR + ("error(%d) allocating event structure.\n", + rc); + } else { + memset(event, 0, sizeof (struct evms_event)); + event->pid = tmp.eventry.pid; + event->eventid = tmp.eventry.eventid; + event->signo = tmp.eventry.signo; + rc = evms_cs_add_item_to_list + (&evms_global_notify_list, event); + } + } + } else { /* tmp.command == EVMS_UNREGISTER_EVENT */ + /* unregistration code */ + if (!found) { + rc = -ENODATA; + LOG_ERROR + ("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n", + rc, tmp.eventry.pid, tmp.eventry.eventid); + } else { + event = (*list_node)->item; + rc = evms_cs_remove_item_from_list + (&evms_global_notify_list, event); + if (!rc) { + kfree(event); + } + } + } + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_check_mount_status(struct inode *inode, struct file *file, + ulong arg) +{ + int rc = 0; + struct evms_mount_status_pkt tmp, *user_parms; + + user_parms = (struct evms_mount_status_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + tmp.mounted = + (is_mounted(MKDEV(EVMS_MAJOR, tmp.minor))) ? TRUE : FALSE; + } + + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +static int +evms_ioctl_cmd_check_open_status(struct inode *inode, struct file *file, + ulong arg) +{ + int rc = 0; + struct evms_open_status_pkt tmp, *user_parms; + + user_parms = (struct evms_open_status_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + tmp.opens = is_open(tmp.minor); + } + + /* copy the status value back to the user */ + tmp.status = rc; + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + + return (rc); +} + +/************************************************/ +/* END -- IOCTL commands -- EVMS specific */ +/************************************************/ + +/************************************************/ +/* START -- IOCTL commands -- Volume specific */ +/************************************************/ + +/************************************************/ +/* END -- IOCTL commands -- Volume specific */ +/************************************************/ + +/************************************************/ +/* START -- IOCTL main */ +/************************************************/ + +/* + * Function: evms_ioctl + * + * This function is the main ioctl entry point for all of evms. 
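+ *
+ * Commands issued against minor 0 (the EVMS control node) are the
+ * EVMS-specific commands handled directly in the first switch
+ * below.  Commands issued against any other minor are either
+ * standard block ioctls routed through blk_ioctl(), or are passed
+ * down that volume's feature stack via the IOCTL() macro.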
+ */ + +static int +evms_ioctl(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + unsigned long minor = 0; + int rc = 0; + struct evms_logical_node *node = NULL; + + /* check user access */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EACCES; + + if (!inode) + rc = -EINVAL; + + if (!rc) { + /* get the minor */ + minor = MINOR(inode->i_rdev); + LOG_EXTRA + ("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n", + minor, (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK, + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, + (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK, + (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK); + + /* insure this minor points to a valid volume */ + if (minor) { + node = evms_logical_volumes[minor].node; + if (node == NULL) + rc = -ENXIO; + } + } + + /* process the IOCTL commands */ + if (!rc) { + if (!minor) { + /* process all EVMS specific commands */ + switch (cmd) { + case EVMS_GET_IOCTL_VERSION: + rc = evms_ioctl_cmd_get_ioctl_version((void *) + arg); + break; + case EVMS_GET_VERSION: + rc = evms_ioctl_cmd_get_version((void *) arg); + break; + case EVMS_GET_INFO_LEVEL: + rc = evms_ioctl_cmd_get_info_level((void *) + arg); + break; + case EVMS_SET_INFO_LEVEL: + rc = evms_ioctl_cmd_set_info_level((void *) + arg); + break; + case EVMS_REDISCOVER_VOLUMES: + rc = evms_ioctl_cmd_rediscover_volumes(inode, + file, + cmd, + arg); + break; + case EVMS_GET_LOGICAL_DISK: + rc = evms_ioctl_cmd_get_logical_disk((void *) + arg); + break; + case EVMS_GET_LOGICAL_DISK_INFO: + rc = evms_ioctl_cmd_get_logical_disk_info((void + *) + arg); + break; + case EVMS_SECTOR_IO: + rc = evms_ioctl_cmd_sector_io((void *) arg); + break; + case EVMS_GET_MINOR: + rc = evms_ioctl_cmd_get_minor((void *) arg); + break; + case EVMS_GET_VOLUME_DATA: + rc = evms_ioctl_cmd_get_volume_data((void *) + arg); + break; + case EVMS_DELETE_VOLUME: + rc = evms_ioctl_cmd_delete_volume(inode, file, + arg); + break; + case EVMS_GET_PLUGIN: + rc = evms_ioctl_cmd_get_plugin((void *) arg); + break; + case EVMS_PLUGIN_IOCTL: + rc = evms_ioctl_cmd_plugin_ioctl(inode, file, + cmd, arg); + break; + case EVMS_COMPUTE_CSUM: + rc = evms_ioctl_cmd_kernel_partial_csum((void *) + arg); + break; + case EVMS_PROCESS_NOTIFY_EVENT: + rc = evms_ioctl_cmd_process_notify_event(arg); + break; + case EVMS_CHECK_MOUNT_STATUS: + rc = evms_ioctl_cmd_check_mount_status(inode, + file, + arg); + break; + case EVMS_CHECK_OPEN_STATUS: + rc = evms_ioctl_cmd_check_open_status(inode, + file, + arg); + break; + default: + rc = -EINVAL; + break; + } + } else { + /* process Volume specific commands */ + switch (cmd) { + /* pick up standard blk ioctls */ + case BLKFLSBUF: + case BLKROSET: + case BLKROGET: + case BLKRASET: + case BLKRAGET: +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10) + case BLKBSZGET: + case BLKBSZSET: +#endif + case BLKSSZGET: + rc = blk_ioctl(inode->i_rdev, cmd, arg); + break; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10) + case BLKGETSIZE: + { + /* casting size down to 32-bits until + * kernel allows return of 64-bit size + * values. 
+ */ + long size = node->total_vsectors; + if (copy_to_user + ((long *) arg, &size, + sizeof (long))) + rc = -EFAULT; + } + break; + case BLKGETSIZE64: + { + u64 size_in_bytes = + node-> + total_vsectors << + EVMS_VSECTOR_SIZE_SHIFT; + if (copy_to_user + ((u64 *) arg, &size_in_bytes, + sizeof (u64))) + rc = -EFAULT; + } + break; +#endif + case EVMS_GET_IOCTL_VERSION: + rc = evms_ioctl_cmd_get_ioctl_version((void *) + arg); + break; + case EVMS_GET_BMAP: + rc = evms_ioctl_cmd_get_bmap(inode, file, cmd, + arg); + break; + case EVMS_GET_VOL_STRIPE_INFO: + { + struct evms_vol_stripe_info_pkt info; + + info.size = + PAGE_SIZE >> + EVMS_VSECTOR_SIZE_SHIFT; + info.width = 1; + if (copy_to_user + ((struct evms_vol_stripe_info_pkt *) + arg, &info, sizeof (info))) + rc = -EFAULT; + } + break; + + default: + rc = IOCTL(node, inode, file, cmd, arg); + break; + } + } + } + return rc; +} + +/************************************************/ +/* END -- IOCTL main */ +/************************************************/ + +/************************************************/ +/* START -- CHECK MEDIA CHANGE */ +/************************************************/ + +static int +evms_check_media_change(kdev_t dev) +{ + int rc = 0; + struct evms_logical_volume *volume = NULL; + + /* check user access */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EACCES; + if (!rc) { + int minor; + /* get the minor */ + minor = MINOR(dev); + /* insure this minor points to a valid volume */ + volume = &evms_logical_volumes[minor]; + if (volume->node == NULL) { + rc = -ENXIO; + } + } + if (!rc) { + if (volume->flags & EVMS_DEVICE_REMOVABLE) { + /* check for media change */ + rc = evms_cs_kernel_ioctl(volume->node, + EVMS_CHECK_MEDIA_CHANGE, + (unsigned long) NULL); + if (rc < 0) { + LOG_ERROR + ("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n", + rc, volume->name); + } + } + } + return (rc); +} + +/************************************************/ +/* END -- CHECK MEDIA CHANGE */ +/************************************************/ + +static int +evms_check_for_device_changes(struct inode *inode, struct file *file) +{ + int rc = 0, something_changed = 0, i; + struct evms_rediscover_pkt kernel_rd_pckt = { 0, 0, NULL }; + struct evms_list_node *disk_list = NULL, *lnode, *next_lnode; + struct evms_logical_node *disk, *new_device_list = NULL; + struct evms_logical_volume *volume = NULL; + + /* check for new devices + * + * put all new devices on the disk list so they + * will be included in the rediscovery process. + */ + static void evms_discover_logical_disks(struct evms_logical_node **); + evms_discover_logical_disks(&new_device_list); + if (new_device_list) { + LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__); + something_changed++; + /* put these new nodes on the disk list */ + while (new_device_list) { + disk = new_device_list; + rc = evms_cs_remove_logical_node_from_list + (&new_device_list, disk); + if (rc) { + LOG_ERROR + ("%s: error(%d) removing device(%s) from list.\n", + __FUNCTION__, rc, disk->name); + } + rc = evms_cs_add_item_to_list(&disk_list, disk); + if (rc) { + LOG_ERROR + ("%s: error(%d) adding device(%s) from list.\n", + __FUNCTION__, rc, disk->name); + } + } + } + + /* check all devices for changed removable media + * + * scan the global device list and issue check + * media change on each removable media device. + * put all removable devices that indicate a + * media change on the disk list. + * + * also scan for devices that have been unplugged + * or contain corrupt volumes. 
+ */ + for (lnode = evms_global_device_list; lnode; lnode = lnode->next) { + int add_to_list = FALSE; + disk = (struct evms_logical_node *) lnode->item; + /* only really check removable media devices */ + if (disk->flags & EVMS_DEVICE_REMOVABLE) { + /* check for media change */ + rc = evms_cs_kernel_ioctl(disk, + EVMS_CHECK_MEDIA_CHANGE, + (unsigned long) NULL); + if (rc < 0) { + LOG_ERROR + ("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n", + __FUNCTION__, rc, disk->name); + } else if (rc == 1) { + add_to_list = TRUE; + } + } + /* check for device that where present + * before but are gone (unplugged + * device or unloaded driver). + */ + rc = IOCTL(disk, inode, file, + EVMS_CHECK_DEVICE_STATUS, (ulong) NULL); + if (rc) { + LOG_ERROR + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n", + rc, volume->name); + } + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + add_to_list = TRUE; + } + if (add_to_list) { + something_changed++; + rc = evms_cs_add_item_to_list(&disk_list, disk); + } + } + /* log a statement that we detected changed media. + */ + if (disk_list) { + LOG_DETAILS("%s: media change detected.\n", __FUNCTION__); + } + + /* check for volumes with removed removable media. + * mark the volumes that reside on changed media. + */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + if (!(volume->flags & EVMS_DEVICE_REMOVABLE)) + continue; + if (evms_check_media_change(MKDEV(EVMS_MAJOR, i)) <= 0) + continue; + /* remember which volumes have changed media */ + volume->flags |= EVMS_MEDIA_CHANGED; + something_changed++; + } + + /* check for removed devices */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + int status; + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + /* check for device status */ + status = 0; + rc = IOCTL(volume->node, inode, file, + EVMS_CHECK_DEVICE_STATUS, (ulong) & status); + if (rc) { + LOG_ERROR + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n", + rc, volume->name); + continue; + } + if (!(status & EVMS_DEVICE_UNAVAILABLE)) { + continue; + } + /* remember which volumes have changed media */ + volume->flags |= EVMS_DEVICE_UNPLUGGED; + something_changed++; + } + + /* do we have some work to do? */ + if (something_changed) { + /* check for volumes to be deleted */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_quiesce_vol_pkt qv; + + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + /* only proceed on volumes with: + * changed media, + * hot-unplugged devices, + * & partial volumes + */ + if (!(volume->flags & + (EVMS_MEDIA_CHANGED | + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED))) + continue; + /* gather the disk's needing to be + * rediscovered to rebuild this + * volume. + * + * this will locate other disks that + * the volume resides on that don't + * indicate media change. + */ + rc = evms_cs_kernel_ioctl(volume->node, + EVMS_GET_DISK_LIST, + (unsigned long) &disk_list); + if (rc) { + LOG_ERROR + ("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n", + __FUNCTION__, rc, volume->name); + continue; + } + /* quiesce all the changed volumes + * prior to being deleted. 
+ */ + qv.command = 1; // quiesce + qv.minor = i; // + qv.status = 0; // reset status + qv.do_vfs = 0; + rc = evms_quiesce_volume(volume, inode, file, &qv); + if (rc) { + LOG_ERROR + ("%s: error(%d) attempting to quiesce '%s%s'.\n", + __FUNCTION__, rc, EVMS_DEV_NODE_PATH, + volume->name); + } + } + + /* we need to revalidate all the changed + * media. this is accomplished by issuing + * the revalidate disk ioctl to each device + * with changed media. the device manager + * remembers which devices indicated + * media changed (set by check media + * changed ioctl issued earlier), and will + * only issue the revalidate disk ioctl to + * those disks one time. + * + * NOTE: + * this needs to be done BEFORE deleting + * the volumes because deleting the + * last segment on disk will cause the + * associated disk node to freed, and we + * will not be able to issue the + * revalidate disk ioctl after that. + */ + for (lnode = disk_list; lnode; lnode = lnode->next) { + disk = (struct evms_logical_node *) lnode->item; + /* only really do removable media devices */ + if (disk->flags & EVMS_MEDIA_CHANGED) { + /* go revalidate the change media */ + rc = evms_cs_kernel_ioctl(disk, + EVMS_REVALIDATE_DISK, + (unsigned long) NULL); + if (rc) { + LOG_ERROR + ("%s: error(%d) attempting to revalidate '%s%s'.\n", + __FUNCTION__, rc, + EVMS_DEV_NODE_PATH, volume->name); + } + } + } + + /* delete all the affected volumes */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_delete_vol_pkt dv; + + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + /* only proceed on volumes with: + * changed media, + * hot-unplugged devices, + * & partial volumes + */ + if (!(volume->flags & + (EVMS_MEDIA_CHANGED | + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED))) + continue; + /* only delete quiesced volumes */ + if (!volume->quiesced) + continue; + /* delete the volume from memory. + * do a 'soft' delete if volume + * is mounted, and 'hard' delete + * if it is not. + * + * NOTE: the delete operation will + * clear the bits in the flags field. + */ + dv.command = is_open(i); + dv.minor = i; + dv.status = 0; + rc = evms_delete_volume(volume, &dv); + } + + /* at this point all devices indicating + * media change that had volumes on them + * should be gone. however, we could still + * have devices indicating media change + * that had no volumes on them in the disk + * list. we need to delete these devices + * from kernel memory and the global device + * list. + */ + for (lnode = evms_global_device_list; lnode; lnode = next_lnode) { + next_lnode = lnode->next; + + disk = (struct evms_logical_node *) lnode->item; + if (disk->flags & EVMS_MEDIA_CHANGED) { + rc = DELETE(disk); + } + } + + /* all the devices that indicated media + * change should be gone, both from kernel + * memory and global device list. we now + * need to remove any references to these + * devices from the disk list. + * + * when removable media is installed, it + * will get detected in the device manager's + * rediscovery as a new device and added to + * the discover list. 
+ */ + for (lnode = disk_list; lnode; lnode = next_lnode) { + struct evms_list_node *glnode; + int lnode_still_there; + + next_lnode = lnode->next; + + lnode_still_there = FALSE; + for (glnode = evms_global_device_list; + glnode; glnode = glnode->next) { + if (glnode->item == lnode->item) { + lnode_still_there = TRUE; + break; + } + } + if (lnode_still_there == FALSE) { + rc = evms_cs_remove_item_from_list(&disk_list, + lnode->item); + if (rc) { + LOG_ERROR + ("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n", + __FUNCTION__, rc, lnode->item, + &disk_list); + } + } + } + + /* build the in-kernel rediscover packet */ + + /* allocate the space for the drive_array in + * the struct evms_rediscover_pkt packet. to do this + * we need to count the number of disk nodes, + * then allocate the necessary space. + */ + /* count the disk nodes */ + for (lnode = disk_list; lnode; lnode = lnode->next) + kernel_rd_pckt.drive_count++; + /* allocate the space */ + if (kernel_rd_pckt.drive_count) { + kernel_rd_pckt.drive_array = + kmalloc(kernel_rd_pckt.drive_count * + sizeof (u64), GFP_KERNEL); + if (!kernel_rd_pckt.drive_array) { + rc = -ENOMEM; + LOG_ERROR + ("%s: error(%d) allocating rediscover drive array.\n", + __FUNCTION__, rc); + } + } + /* populate the drive array + * + * this also frees the disk_list which is useful + * if we had an error allocating the drive array. + */ + for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) { + next_lnode = lnode->next; + + /* remove this disk from the disk list */ + disk = (struct evms_logical_node *) lnode->item; + rc = evms_cs_remove_item_from_list(&disk_list, disk); + if (!rc) { + /* add this disk to rediscover + * packet + */ + kernel_rd_pckt.drive_array[i] = + NODE_TO_DEV_HANDLE(disk); + } + } + /* perform the rediscovery operation */ + if (!rc) { + static int evms_discover_volumes(struct + evms_rediscover_pkt *); + rc = evms_discover_volumes(&kernel_rd_pckt); + if (kernel_rd_pckt.drive_count) { + kfree(kernel_rd_pckt.drive_array); + } + } + LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__); + } + + return (rc); +} + +/************************************************/ +/* START -- REVALIDATE DISK */ +/************************************************/ + +static int +evms_revalidate_disk(kdev_t dev) +{ + int rc = 0; + struct evms_logical_volume *volume = NULL; + + /* check user access */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EACCES; + if (!rc) { + int minor; + /* get the minor */ + minor = MINOR(dev); + /* insure this minor points to a valid volume */ + volume = &evms_logical_volumes[minor]; + if (volume->node == NULL) { + rc = -ENXIO; + } + } + if (!rc) { + /* go revalidate the change media */ + rc = evms_cs_kernel_ioctl(volume->node, + EVMS_REVALIDATE_DISK, + (unsigned long) NULL); + } + return (rc); +} + +/************************************************/ +/* END -- REVALIDATE DISK */ +/************************************************/ + +/************************************************/ +/* START -- OPEN */ +/************************************************/ + +static int +evms_open(struct inode *inode, struct file *file) +{ + int rc = 0, minor = 0; + struct evms_logical_volume *volume = NULL; + + /* check user access */ + if (!capable(CAP_SYS_ADMIN)) + rc = -EACCES; + if (!rc) { + if (!inode) + rc = -EINVAL; + } + rc = evms_check_for_device_changes(inode, file); + if (!rc) { + /* get the minor */ + minor = MINOR(inode->i_rdev); + if (minor) { + /* insure this minor points to a valid volume */ + volume = 
&evms_logical_volumes[minor]; + if (volume->node == NULL) { + rc = -ENXIO; + } + } + } + /* go "open" the volume */ + if (!rc && minor) { + atomic_inc(&volume->opens); + rc = IOCTL(volume->node, inode, file, + EVMS_OPEN_VOLUME, (unsigned long) NULL); + if (rc) { + LOG_ERROR + ("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n", + rc, volume->name); + atomic_dec(&volume->opens); + } + } + return (rc); +} + +/************************************************/ +/* END -- OPEN */ +/************************************************/ + +/************************************************/ +/* START -- RELEASE */ +/************************************************/ + +static int +evms_release(struct inode *inode, struct file *file) +{ + int rc = 0, minor = 0; + struct evms_logical_volume *volume = NULL; + + if (!inode) + rc = -EINVAL; + if (!rc) { + /* get the minor */ + minor = MINOR(inode->i_rdev); + if (minor) { + /* insure this minor points to a valid volume */ + volume = &evms_logical_volumes[minor]; + if (volume->node == NULL) { + rc = -ENXIO; + } + } + } + /* go "close" the volume */ + if (!rc && minor) { + rc = IOCTL(volume->node, inode, file, + EVMS_CLOSE_VOLUME, (unsigned long) NULL); + if (rc) { + LOG_ERROR + ("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n", + rc, volume->name); + } else { + atomic_dec(&volume->opens); + } + } + return (rc); +} + +/************************************************/ +/* END -- RELEASE */ +/************************************************/ + +static struct block_device_operations evms_fops = { +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14) + owner:THIS_MODULE, +#endif + open:evms_open, + release:evms_release, + ioctl:evms_ioctl, + check_media_change:evms_check_media_change, + revalidate:evms_revalidate_disk +}; + +/**********************************************************/ +/* END -- FOPS functions definitions */ +/**********************************************************/ + +/**********************************************************/ +/* START -- RUNTIME support functions */ +/**********************************************************/ + +static void +evms_do_request_fn(request_queue_t * q) +{ + LOG_WARNING("This function should not be called.\n"); +} + +#ifdef CONFIG_SMP +static request_queue_t * +evms_find_queue(kdev_t dev) +{ + request_queue_t *rq = NULL; + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[MINOR(dev)]; + if (volume->node) + rq = &volume->request_queue; + return (rq); +} +#endif + +/* + * Function: evms_make_request_fn + * + */ +static int +evms_make_request_fn(request_queue_t * q, int rw, struct buffer_head *bh) +{ + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[MINOR(bh->b_rdev)]; + wait_event(volume->wait_queue, (!volume->quiesced)); + if (volume->node) { + switch (rw) { + case READ: + case READA: + atomic_inc(&volume->requests_in_progress); + R_IO(volume->node, bh); + atomic_dec(&volume->requests_in_progress); + return 0; + case WRITE: + atomic_inc(&volume->requests_in_progress); + W_IO(volume->node, bh); + atomic_dec(&volume->requests_in_progress); + return 0; + default: + buffer_IO_error(bh); + return 0; + } + } else { + LOG_ERROR("request for unknown logical volume [minor(%d)].\n", + MINOR(bh->b_rdev)); + buffer_IO_error(bh); + } + return 0; +} + +/**********************************************************/ +/* END -- RUNTIME support functions */ +/**********************************************************/ + +/**********************************************************/ +/* 
START -- INIT/DISCOVERY support functions */ +/**********************************************************/ + +#ifdef LOCAL_DEBUG +static void +display_discover_list(struct evms_logical_node *discover_list, char *text) +{ + struct evms_logical_node *node; + + LOG_DETAILS("discover list:(%s)\n", text); + for (node = discover_list; node; node = node->next) { + LOG_DETAILS("\nnode info:\n"); + LOG_DETAILS("node.....................(0x%p)\n", node); + LOG_DETAILS("name.....................(%s)\n", node->name); + LOG_DETAILS("plugin id................(0x%x)\n", + node->plugin->id); + LOG_DETAILS("size.....................("PFU64")\n", + node->total_vsectors); + LOG_DETAILS("flags....................(0x%x)\n", node->flags); + LOG_DETAILS("iflags...................(0x%x)\n", node->iflags); + LOG_DETAILS("sector size..............(%d)\n", + node->hardsector_size); + LOG_DETAILS("block size...............(%d)\n", + node->block_size); + LOG_DETAILS("sys id...................(0x%x)\n", + node->system_id); + + if (node->feature_header) { + struct evms_feature_header *fh; + + fh = node->feature_header; + LOG_DETAILS("\nfeature header:\n"); + LOG_DETAILS("signature................(0x%x)\n", + fh->signature); + LOG_DETAILS("crc......................(0x%x)\n", + fh->crc); + LOG_DETAILS("feature header version...(%d.%d.%d)\n", + fh->version.major, fh->version.minor, + fh->version.patchlevel); + LOG_DETAILS("engine version...........(%d.%d.%d)\n", + fh->engine_version.major, + fh->engine_version.minor, + fh->engine_version.patchlevel); + LOG_DETAILS("flags....................(0x%x)\n", + fh->flags); + LOG_DETAILS("feature id...............(0x%x)\n", + fh->feature_id); + LOG_DETAILS("sequence#................("PFU64")\n", + fh->sequence_number); + LOG_DETAILS("alignment padding........("PFU64")\n", + fh->alignment_padding); + LOG_DETAILS("feature data1 lsn........("PFU64")\n", + fh->feature_data1_start_lsn); + LOG_DETAILS("feature data1 size.......("PFU64")\n", + fh->feature_data1_size); + LOG_DETAILS("feature data2 lsn........("PFU64")\n", + fh->feature_data2_start_lsn); + LOG_DETAILS("feature data2 size.......("PFU64")\n", + fh->feature_data2_size); + LOG_DETAILS("volume sn................("PFU64")\n", + fh->volume_serial_number); + LOG_DETAILS("volume minor#............(%d)\n", + fh->volume_system_id); + LOG_DETAILS("object depth.............(%d)\n", + fh->object_depth); + LOG_DETAILS("object name..............(%s)\n", + fh->object_name); + LOG_DETAILS("volume name..............(%s)\n", + fh->volume_name); + } + + if (node->volume_info) { + struct evms_volume_info *vi; + + vi = node->volume_info; + LOG_DETAILS("\nvolume info:\n"); + LOG_DETAILS("volume name..............(%s)\n", + vi->volume_name); + LOG_DETAILS("volume sn................("PFU64")\n", + vi->volume_sn); + LOG_DETAILS("volume minor#............(%d)\n", + vi->volume_minor); + } + } + if (discover_list) { + LOG_DETAILS("\n"); + } +} +#endif + +/* + * Function: evms_discover_logical_disks + * Description: Construct the logical disk list by calling all registered device managers. + */ +static void +evms_discover_logical_disks(struct evms_logical_node **disk_list) +{ + struct evms_registered_plugin *p; + LOG_EXTRA("discovering logical disks...\n"); + for (p = registered_plugin_head; p; p = p->next) { + if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) { + DISCOVER(p, disk_list); + } + } +} + +/* + * Function: evms_discover_logical_partitions + * Description: Construct the logical partition list by calling all registered partition managers. 
+ */ +static void +evms_discover_logical_partitions(struct evms_logical_node **discover_list) +{ + int rc, done; + + struct evms_registered_plugin *p; + LOG_EXTRA("discovering logical partitions...\n"); + do { + done = TRUE; + for (p = registered_plugin_head; p; p = p->next) { + if (GetPluginType(p->plugin->id) == + EVMS_SEGMENT_MANAGER) { + rc = DISCOVER(p, discover_list); + /* RC > 0 means the plugin + * added something to the + * discover list. This also + * means we must loop thru + * these plugins another time. + * RC == 0 means nothing was + * added to the discover list + * by this plugin. + * RC < 0 means the plugin + * encountered some error and + * nothing was added to the list. + * NOTE: If a plugin has both + * added something new to the + * discover list and encountered + * an error, RC > 0 must be + * returned. + */ + if (rc > 0) + done = FALSE; + } + } + } while (done == FALSE); + + /* send the end of discovery signal to each + * partition manager plugin. + */ + for (p = registered_plugin_head; p; p = p->next) + if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER) + if (p->plugin->fops->end_discover) + rc = END_DISCOVER(p, discover_list); +} + +/* + * Function: evms_discover_volume_groups + * Description: Find volume groups within the logical partitions list + */ +static void +evms_discover_volume_groups(struct evms_logical_node **discover_list) +{ + int rc, done; + + struct evms_registered_plugin *p; + LOG_EXTRA("discovering logical volume groups...\n"); + do { + done = TRUE; + for (p = registered_plugin_head; p; p = p->next) { + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) { + rc = DISCOVER(p, discover_list); + /* RC > 0 means the plugin + * added something to the + * discover list. This also + * means we must loop thru + * these plugins another time. + * RC == 0 means nothing was + * added to the discover list + * by this plugin. + * RC < 0 means the plugin + * encountered some error and + * nothing was added to the list. + * NOTE: If a plugin has both + * added something new to the + * discover list and encountered + * an error, RC > 0 must be + * returned. + */ + if (rc > 0) + done = FALSE; + } + } + } while (done == FALSE); + + /* send the end of discovery signal to each volume + * group plugin. + */ + for (p = registered_plugin_head; p; p = p->next) + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) + if (p->plugin->fops->end_discover) + rc = END_DISCOVER(p, discover_list); +} + +/* + * + * convert all the feature header fields into cpu native format + * from the on-disk Little Endian format. From this point forward + * all plugins can deal with feature headers natively. 
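 *
 * A minimal sketch of the intended calling pattern, assumed from the way
 * edef_load_feature_header() below uses this routine rather than copied
 * from it: until this conversion has run, every multi-byte field read
 * from disk is still little-endian, so raw fields must only be examined
 * through the byte-swap helpers.
 *
 *	// fh was just read from disk: all fields are still little-endian
 *	if (le32_to_cpup(&fh->signature) != EVMS_FEATURE_HEADER_SIGNATURE)
 *		return -ENODATA;		// not a feature header
 *	le_feature_header_to_cpu(fh);
 *	// from here on, fh->* can be used directly on any architecture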
+ */ +void +le_feature_header_to_cpu(struct evms_feature_header *fh) +{ + fh->signature = le32_to_cpup(&fh->signature); + fh->crc = le32_to_cpup(&fh->crc); + fh->version.major = le32_to_cpup(&fh->version.major); + fh->version.minor = le32_to_cpup(&fh->version.minor); + fh->version.patchlevel = le32_to_cpup(&fh->version.patchlevel); + fh->engine_version.major = le32_to_cpup(&fh->engine_version.major); + fh->engine_version.minor = le32_to_cpup(&fh->engine_version.minor); + fh->engine_version.patchlevel = + le32_to_cpup(&fh->engine_version.patchlevel); + fh->flags = le32_to_cpup(&fh->flags); + fh->feature_id = le32_to_cpup(&fh->feature_id); + fh->sequence_number = le64_to_cpup(&fh->sequence_number); + fh->alignment_padding = le64_to_cpup(&fh->alignment_padding); + fh->feature_data1_start_lsn = + le64_to_cpup(&fh->feature_data1_start_lsn); + fh->feature_data1_size = le64_to_cpup(&fh->feature_data1_size); + fh->feature_data2_start_lsn = + le64_to_cpup(&fh->feature_data2_start_lsn); + fh->feature_data2_size = le64_to_cpup(&fh->feature_data2_size); + fh->volume_serial_number = le64_to_cpup(&fh->volume_serial_number); + fh->volume_system_id = le32_to_cpup(&fh->volume_system_id); + fh->object_depth = le32_to_cpup(&fh->object_depth); +} + +static int +edef_load_feature_header(struct evms_logical_node *node) +{ + int i, rc = 0, rc_array[2] = { 0, 0 }; + unsigned long size_in_bytes; + u64 size_in_sectors, starting_sector = 0; + struct evms_feature_header *fh = NULL, *fh1 = NULL, *fh2 = NULL; + char *location_name = NULL; + struct evms_version version = { + EVMS_FEATURE_HEADER_MAJOR, + EVMS_FEATURE_HEADER_MINOR, + EVMS_FEATURE_HEADER_PATCHLEVEL + }; + + if (!node->feature_header) { + size_in_sectors = evms_cs_size_in_vsectors(sizeof (*fh)); + size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT; + fh1 = kmalloc(size_in_bytes, GFP_KERNEL); + if (fh1) { + fh2 = kmalloc(size_in_bytes, GFP_KERNEL); + if (!fh2) { + kfree(fh1); + rc = -ENOMEM; + } + } else { + rc = -ENOMEM; + } + + for (i = 0; i < 2; i++) { + if (i == 0) { + starting_sector = + node->total_vsectors - size_in_sectors; + fh = fh1; + location_name = evms_primary_string; + } else { + starting_sector--; + fh = fh2; + location_name = evms_secondary_string; + } + /* read header into buffer */ + rc = INIT_IO(node, + 0, starting_sector, size_in_sectors, fh); + if (rc) { + LOG_ERROR + ("error(%d) probing for %s feature header(at "PFU64") on '%s'.\n", + rc, location_name, starting_sector, + node->name); + rc_array[i] = rc; + continue; + } + /* validate header signature */ + if (cpu_to_le32(fh->signature) != + EVMS_FEATURE_HEADER_SIGNATURE) { + rc = -ENODATA; + rc_array[i] = rc; + continue; + } + /* validate header CRC */ + if (fh->crc != EVMS_MAGIC_CRC) { + u32 org_crc, final_crc; + org_crc = cpu_to_le32(fh->crc); + fh->crc = 0; + final_crc = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, fh, + sizeof (*fh)); + if (final_crc != org_crc) { + LOG_ERROR + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at "PFU64") on '%s'.\n", + org_crc, final_crc, location_name, + starting_sector, node->name); + rc = -EINVAL; + rc_array[i] = rc; + continue; + } + } else { + LOG_WARNING + ("CRC disabled in %s feature header(at "PFU64") on '%s'.\n", + location_name, starting_sector, + node->name); + } + /* convert the feature header from the + * on-disk format (Little Endian) to + * native cpu format. 
+ */ + le_feature_header_to_cpu(fh); + /* verify the system data version */ + rc = evms_cs_check_version(&version, &fh->version); + if (rc) { + LOG_ERROR + ("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n", + fh->version.major, fh->version.minor, + fh->version.patchlevel, location_name, + node->name); + rc_array[i] = rc; + } + } + + /* getting same return code for both copies? */ + if (rc_array[0] == rc_array[1]) { + rc = rc_array[0]; + /* if no errors on both copies, + * check the sequence numbers. + * use the highest sequence number. + */ + if (!rc) { + /* compare sequence numbers */ + if (fh1->sequence_number == + fh2->sequence_number) { + fh = fh1; + } else { + LOG_WARNING + ("%s feature header sequence number("PFU64") mismatches %s feature header sequence number("PFU64") on '%s'!\n", + evms_primary_string, + fh1->sequence_number, + evms_secondary_string, + fh2->sequence_number, node->name); + if (fh1->sequence_number > + fh2->sequence_number) { + fh = fh1; + location_name = + evms_primary_string; + /* indicate bad sequence number of secondary */ + rc_array[1] = -1; + } else { + fh = fh2; + location_name = + evms_secondary_string; + /* indicate bad sequence number of primary */ + rc_array[0] = -1; + } + } + } + /* getting different return codes for each copy */ + } else + /* either primary or secondary copy is + * valid, so use the valid copy. + */ + if ((rc_array[0] == 0) || (rc_array[1] == 0)) { + char *warn_name = NULL; + + /* indicate success */ + rc = 0; + /* set variables based on which copy is valid */ + if (rc_array[0] == 0) { + /* use primary (rear) copy if its good */ + fh = fh1; + location_name = evms_primary_string; + warn_name = evms_secondary_string; + } else { + /* use secondary (front) copy if its good */ + fh = fh2; + location_name = evms_secondary_string; + warn_name = evms_primary_string; + } + /* warn the user about the invalid copy */ + LOG_WARNING + ("warning: error(%d) probing/verifying the %s feature header on '%s'.\n", + rc_array[0] + rc_array[1], warn_name, node->name); + } else + /* both copies had a different error, + * and one was a fatal error, so + * indicate fatal error. + */ + if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) { + rc = -EINVAL; + } + + /* on error, set fh to NULL */ + if (rc) + fh = NULL; + + /* deallocate metadata buffers appropriately */ + if (fh != fh1) + kfree(fh1); + if (fh != fh2) + kfree(fh2); + + /* save validated feature header pointer */ + if (!rc) { + node->feature_header = fh; + if (rc_array[0] != rc_array[1]) { + LOG_DETAILS + ("using %s feature header on '%s'.\n", + location_name, node->name); + } + } + + /* if no signature found, adjust return code */ + if (rc == -ENODATA) { + rc = 0; + LOG_DEBUG("no feature header found on '%s'.\n", + node->name); + } + } + return (rc); +} + +static int +edef_find_first_features(struct evms_logical_node **discover_list) +{ + int rc; + struct evms_logical_node *node, *tmp_list_head; + + tmp_list_head = *discover_list; + *discover_list = NULL; + + while (tmp_list_head) { + struct evms_list_node **evms_node; + + node = tmp_list_head; + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, + node); + if (rc) + BUG(); + + /* check for duplicate pointers + * search for the node in global list + */ + evms_node = + evms_cs_lookup_item_in_list(&evms_global_feature_node_list, + node); + /* already present? 
*/
+		if (*evms_node) {
+			/* yes, already present */
+			rc = -ENODATA;	/* don't process this node further */
+			LOG_DETAILS("deleting duplicate reference to '%s'.\n",
+				    node->name);
+			/* forget this node */
+			node = NULL;
+		} else {
+			/* load the feature header if present */
+			rc = edef_load_feature_header(node);
+			/* does this node have a feature header?
+			 * it won't if there was no header to load
+			 * OR
+			 * there was a fatal error attempting to read it.
+			 */
+			if (node->feature_header) {
+				/* check for object flag */
+				if (node->feature_header->flags &
+				    EVMS_VOLUME_DATA_OBJECT) {
+					LOG_DEFAULT
+					    ("object detected, deleting '%s'.\n",
+					     node->name);
+					rc = -EINVAL;
+				} else
+				    /* check for stop-data flag */
+				if (node->feature_header->flags &
+				    EVMS_VOLUME_DATA_STOP) {
+					LOG_DEFAULT
+					    ("stop data detected, deleting '%s'.\n",
+					     node->name);
+					rc = -EINVAL;
+				} else {
+					/* we have a valid feature header.
+					 * initialize appropriate node fields
+					 * to indicate this.
+					 */
+					node->flags |= EVMS_VOLUME_FLAG;
+					node->iflags |= EVMS_FEATURE_BOTTOM;
+					node->volume_info =
+					    kmalloc(sizeof
+						    (struct evms_volume_info),
+						    GFP_KERNEL);
+					if (node->volume_info) {
+						/* set up volume
+						 * info struct
+						 */
+						memset(node->volume_info, 0,
+						       sizeof
+						       (struct
+							evms_volume_info));
+						node->volume_info->volume_sn =
+						    node->feature_header->
+						    volume_serial_number;
+						node->volume_info->
+						    volume_minor =
+						    node->feature_header->
+						    volume_system_id;
+						strcpy(node->volume_info->
+						       volume_name,
+						       node->feature_header->
+						       volume_name);
+						/* register(add) node to
+						 * the global list.
+						 */
+						rc = evms_cs_add_item_to_list
+						    (&evms_global_feature_node_list,
+						     node);
+					} else {
+						rc = -ENOMEM;
+					}
+				}
+			}
+		}
+		/* if any errors, delete the node */
+		if (rc) {
+			if (node) {
+				DELETE(node);
+			}
+		} else
+			/* on successful processing of this node
+			 * place it back on the discover list.
+			 */
+			evms_cs_add_logical_node_to_list(discover_list, node);
+	}
+	return (0);
+}
+
+/* These defines describe the node types that can be isolated.
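 * Each value selects one predicate in edef_isolate_nodes_by_type(),
 * which moves the matching nodes from a source list to a target list.
 * For example (quoted from edef_process_evms_volumes() further below),
 * all nodes belonging to one EVMS volume are split out by serial number
 * with:
 *
 *	rc = edef_isolate_nodes_by_type
 *	    (ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, &evms_volumes_list,
 *	     &volume_node_list, 0, volume_sn);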
*/ +#define ISOLATE_ASSOCIATIVE_FEATURES 0 +#define ISOLATE_COMPATIBILITY_VOLUMES 1 +#define ISOLATE_EVMS_VOLUMES 2 +#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3 +#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4 +static int +edef_isolate_nodes_by_type(unsigned int type, + struct evms_logical_node **src_list, + struct evms_logical_node **trg_list, + u32 compare32, u64 compare64) +{ + struct evms_logical_node *node, *next_node; + int rc = 0, found_node; + struct evms_feature_header *fh = NULL; + + for (node = *src_list; node; node = next_node) { + next_node = node->next; + + if (node->feature_header) + fh = node->feature_header; + found_node = FALSE; + switch (type) { + case ISOLATE_ASSOCIATIVE_FEATURES: + if (fh) { + if (GetPluginType(fh->feature_id) == + EVMS_ASSOCIATIVE_FEATURE) + found_node = TRUE; + } + break; + case ISOLATE_COMPATIBILITY_VOLUMES: + if (!(node->flags & EVMS_VOLUME_FLAG)) + found_node = TRUE; + break; + case ISOLATE_EVMS_VOLUMES: + if (node->flags & EVMS_VOLUME_FLAG) + found_node = TRUE; + break; + /* EVMS volumes with same serial # */ + case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER: + if (node->volume_info->volume_sn == compare64) + found_node = TRUE; + break; + case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH: + if (fh) + if (fh->object_depth == compare64) + if (fh->feature_id == compare32) + found_node = TRUE; + break; + } + if (found_node == TRUE) { + rc = evms_cs_remove_logical_node_from_list(src_list, + node); + if (rc) + break; + rc = evms_cs_add_logical_node_to_list(trg_list, node); + if (rc) + break; + } + } + return (rc); +} + +static int +edef_apply_feature(struct evms_logical_node *node, + struct evms_logical_node **volume_node_list) +{ + struct evms_registered_plugin *p; + int rc = -1; + + for (p = registered_plugin_head; p; p = p->next) { + if (p->plugin->id == node->feature_header->feature_id) { + rc = DISCOVER(p, volume_node_list); + break; + } + } + return (rc); +} + +static int +edef_get_feature_plugin_header(u32 id, struct evms_plugin_header **header) +{ + int rc = -ENOPKG; + struct evms_registered_plugin *p; + + for (p = registered_plugin_head; p; p = p->next) { + if (p->plugin->id == id) { + *header = p->plugin; + rc = 0; + break; + } + } + if (rc) { + LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id); + } + return (rc); +} + +typedef struct evms_volume_build_info_s { + int node_count; + int feature_header_count; + int feature_count; + int associative_feature_count; + u64 max_depth; + struct evms_plugin_header *plugin; + struct evms_logical_node *feature_node_list; +} evms_volume_build_info_t; + +/* + * edef_evaluate_volume_node_list: + * does: + * 1) put all nodes from feature list back on volume list + * 2) loads the node's feature headers + * 3) counts the node list's entries + * 4) builds the feature node list + * 5) counts the feature headers for associative features + * 6) sets feature count to >1 if >1 features to be processed + */ +static int +edef_evaluate_volume_node_list(struct evms_logical_node **volume_node_list, + evms_volume_build_info_t * vbi, + int volume_complete) +{ + int rc; + struct evms_logical_node *node; + + vbi->node_count = + vbi->feature_count = + vbi->associative_feature_count = vbi->max_depth = 0; + vbi->plugin = NULL; + + /* put all feature nodes back on the volume list */ + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES, + &vbi->feature_node_list, + volume_node_list, 0, 0); + if (rc) + return (rc); + + /* load all the feature headers */ + if (!volume_complete) { + for (node = *volume_node_list; node; node = 
node->next) { + rc = edef_load_feature_header(node); + if (rc) + return (rc); + } + } + + /* find the 1st max depth object: + * record the depth + * record the plugin + */ + for (node = *volume_node_list; node; node = node->next) { + struct evms_plugin_header *plugin; + struct evms_feature_header *fh = node->feature_header; + + /* count the nodes */ + vbi->node_count++; + + /* no feature header found, continue to next node */ + if (!fh) + continue; + + /* check the depth */ + if (fh->object_depth > vbi->max_depth) { + /* record new max depth */ + vbi->max_depth = fh->object_depth; + /* find the plugin header for this feature id */ + rc = edef_get_feature_plugin_header(fh->feature_id, + &plugin); + if (rc) + return (rc); + /* check for >1 plugins */ + if (vbi->plugin != plugin) { + vbi->feature_count++; + vbi->plugin = plugin; + } + } + /* check for "associative" feature indicator */ + if (GetPluginType(vbi->plugin->id) == EVMS_ASSOCIATIVE_FEATURE) + vbi->associative_feature_count++; + } + /* build a list of max depth nodes for this feature */ + if (vbi->max_depth) { + rc = edef_isolate_nodes_by_type + (ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, volume_node_list, + &vbi->feature_node_list, vbi->plugin->id, vbi->max_depth); + if (rc) + return (rc); + if (!vbi->plugin) + return (-ENODATA); + if (!vbi->feature_node_list) + return (-ENODATA); + } + + return (rc); +} + +/* function: edef_check_feature_conditions + * + * This routine verifies the state of volume based on the features + * headers and nodes in the current discovery list. All detected + * errors are considered fatal. + */ +static int +edef_check_feature_conditions(evms_volume_build_info_t * vbi) +{ + int rc = 0; + + if (vbi->associative_feature_count) { + if (vbi->node_count > 1) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("associative ERROR: > 1 nodes(%d) remaining to be processed!\n", + vbi->node_count); + } else if (vbi->max_depth != 1) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("associative ERROR: associative feature found at node depth("PFU64") != 1!\n", + vbi->max_depth); + } else + rc = -EVMS_ASSOCIATIVE_FEATURE; + } + if (!rc) { + if (!vbi->max_depth) { + if (vbi->node_count > 1) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n", + vbi->node_count); + } + } else if (vbi->max_depth == 1) { + if (vbi->feature_count > 1) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("max depth 1 ERROR: > 1 features remaining to be processed!\n"); + } + } + } + return (rc); +} + +/* function: edef_apply_features + * + * This routine applies none, one, or more features to an EVMS + * volume. The system data structure is first verified and then + * features are applied and verified recursively until the + * entire volume has been constructed. Fatal errors result in + * all nodes in the volume discovery list being deleted. + */ +static int +edef_apply_features(struct evms_logical_node **volume_node_list) +{ + int rc = 1, done, top_feature_applying; + evms_volume_build_info_t vbi; + + vbi.feature_node_list = NULL; + rc = edef_evaluate_volume_node_list(volume_node_list, &vbi, FALSE); + + /* ensure we don't go into the next loop + * without having a target plugin to + * pass control to. + */ + if (!rc) { + if (!vbi.plugin) { + rc = -ENODATA; + } + } + + /* this loop should ONLY get used when + * there are features to process. + */ + done = (rc) ? 
TRUE : FALSE; + while (!done) { + rc = edef_check_feature_conditions(&vbi); + if (rc) + break; + top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE; + rc = vbi.plugin->fops->discover(&vbi.feature_node_list); + if (!rc) { + rc = edef_evaluate_volume_node_list(volume_node_list, + &vbi, + top_feature_applying); + if (top_feature_applying == TRUE) { + if (vbi.node_count > 1) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("ERROR: detected > 1 node at volume completion!\n"); + } + done = TRUE; + } else { + if (!vbi.plugin) { + rc = -EVMS_VOLUME_FATAL_ERROR; + LOG_ERROR + ("ERROR: depth("PFU64"): expected another feature!\n", + vbi.max_depth); + done = TRUE; + } + } + } else { /* rc != 0 */ + rc = -EVMS_VOLUME_FATAL_ERROR; + done = TRUE; + } + } + if (rc) + /* put all feature nodes back on the volume list */ + if (edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES, + &vbi.feature_node_list, + volume_node_list, 0, 0)) + BUG(); + return (rc); +} + +static int +edef_delete_node(struct evms_logical_node **node_list, + struct evms_logical_node *node, int return_code, + char *log_text) +{ + int rc; + + rc = evms_cs_remove_logical_node_from_list(node_list, node); + if (!rc) { + LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n", + log_text, return_code, + node->volume_info->volume_name, node->name); + rc = DELETE(node); + if (rc) { + LOG_ERROR("error(%d) while deleting node(%s)\n", + rc, node->name); + } + } else { + LOG_WARNING + ("%s error(%d): node gone, assumed deleted by plugin.\n", + log_text, return_code); + /* plugin must have cleaned up the node. + * So just reset the return code and leave. + */ + rc = 0; + } + + return (rc); +} + +static int +edef_process_evms_volumes(struct evms_logical_node **discover_list, + struct evms_logical_node **associative_feature_list) +{ + int rc = 0; + struct evms_logical_node *node, *evms_volumes_list, *volume_node_list; + u64 volume_sn; + + /* put all EVMS volumes on their own list */ + evms_volumes_list = NULL; + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES, + discover_list, + &evms_volumes_list, 0, 0); + + /* apply features to each EVMS volume */ + /* one volume at a time on each pass */ + while (evms_volumes_list) { + node = evms_volumes_list; + /* put all nodes for one EVMS volume on separate list */ + volume_node_list = NULL; + volume_sn = node->volume_info->volume_sn; + rc = edef_isolate_nodes_by_type + (ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, &evms_volumes_list, + &volume_node_list, 0, volume_sn); + if (rc) + break; + /* go apply all the volume features now */ + rc = edef_apply_features(&volume_node_list); + switch (rc) { + case 0: /* SUCCESS */ + /* remove volume just processed */ + node = volume_node_list; + rc = evms_cs_remove_logical_node_from_list + (&volume_node_list, node); + if (rc) + break; + /* put volume on global list */ + rc = evms_cs_add_logical_node_to_list(discover_list, + node); + break; + case -EVMS_ASSOCIATIVE_FEATURE: + /* put all "associative" features on their own list */ + rc = edef_isolate_nodes_by_type + (ISOLATE_ASSOCIATIVE_FEATURES, &volume_node_list, + associative_feature_list, 0, 0); + break; + default: /* FATAL ERROR */ + /* delete each node remaining in the list */ + if (volume_node_list) { + LOG_ERROR + ("encountered fatal error building volume '%s'\n", + volume_node_list->volume_info-> + volume_name); + } + while (volume_node_list) { + node = volume_node_list; + edef_delete_node(&volume_node_list, + node, rc, "EVMS feature"); + } + rc = 0; + break; + } + if (rc) + break; + } + return (rc); +} + +static 
int +edef_process_associative_volumes(struct evms_logical_node + **associative_feature_list, + struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *node; + + while (*associative_feature_list) { + node = *associative_feature_list; + /* remove this node from associative feature list */ + rc = evms_cs_remove_logical_node_from_list + (associative_feature_list, node); + if (rc) + break; + /* put volume on global list */ + rc = evms_cs_add_logical_node_to_list(discover_list, node); + if (rc) + break; + rc = edef_load_feature_header(node); + if (rc) + break; + rc = edef_apply_feature(node, discover_list); + if (rc) + edef_delete_node(discover_list, node, rc, + "Associative feature"); + } + return (rc); +} + +static int +edef_check_for_incomplete_volumes(struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *next_node, *node; + + /* check to see if any incomplete volumes are left around */ + /* if so, delete them. */ + /* complete volumes should not have feature_headers */ + /* hanging off them, if we find any, we know the volume */ + /* is incomplete. */ + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + + if (node->feature_header) { + edef_delete_node(discover_list, node, rc, + "Unexpected feature header"); + } + } + return (rc); +} + +/* + * Function: evms_discover_evms_features + * Description: Find features for nodes on the logical partitions list + */ +static int +evms_discover_evms_features(struct evms_logical_node **discover_list) +{ + struct evms_logical_node *associative_feature_list; + int rc = 0; + + LOG_EXTRA("discovering evms volume features...\n"); + + /* initialize "associative" features list */ + associative_feature_list = NULL; + + /* find the bottom features */ + rc = edef_find_first_features(discover_list); +#ifdef LOCAL_DEBUG + display_discover_list(*discover_list, "after 1st features hdr"); +#endif + if (!rc) + /* process EVMS volumes here */ + rc = edef_process_evms_volumes(discover_list, + &associative_feature_list); +#ifdef LOCAL_DEBUG + display_discover_list(*discover_list, "after evms volumes"); +#endif + if (!rc) + /* process "associative" features here */ + rc = edef_process_associative_volumes(&associative_feature_list, + discover_list); +#ifdef LOCAL_DEBUG + display_discover_list(*discover_list, "after associatives"); +#endif + if (!rc) + /* check for incomplete volumes */ + rc = edef_check_for_incomplete_volumes(discover_list); + + return (rc); +} + +/* + * function: eelv_assign_volume_minor + * + * This is a support function for evms_export_logical_volumes. + * This routine assigns a specific minor number to a volume. It + * also performs the remaining steps to make this volume visible + * and usable to the kernel. 
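 *
 * A worked example with a hypothetical volume size: "usable to the
 * kernel" means filling in the 2.4 global block-device arrays for this
 * minor in the body below: blksize_size[] and hardsect_size[] are in
 * bytes, while blk_size[] is in 1 KiB units, which is why total_vsectors
 * is shifted right by one.  A volume of 2097152 512-byte sectors (1 GiB)
 * would therefore register blk_size[EVMS_MAJOR][minor] = 1048576.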
+ * + */ +static void +eelv_assign_volume_minor(struct evms_logical_node *node, int minor) +{ + struct evms_logical_volume *volume; + + /* initialize the logical_node entry in the volume array */ + volume = &evms_logical_volumes[minor]; + volume->node = node; + volume->name = + kmalloc(strlen(EVMS_GET_NODE_NAME(node)) + 1, GFP_KERNEL); + if (!volume->name) + BUG(); + strcpy(volume->name, EVMS_GET_NODE_NAME(node)); + + /* copy flags from top level node into volume structure */ + volume->flags = node->flags; + + /* check for read-only volume */ + if (volume->flags & EVMS_VOLUME_READ_ONLY) { + set_device_ro(MKDEV(EVMS_MAJOR, minor), 1); + } + + /* adjust volume size based on hardsector size */ + node->total_vsectors &= + ~((node->hardsector_size >> EVMS_VSECTOR_SIZE_SHIFT) - 1); + + /* initialize the global device arrays */ + blksize_size[EVMS_MAJOR][minor] = node->block_size; + hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size; + blk_size[EVMS_MAJOR][minor] = (int) (node->total_vsectors >> 1); + + /* register this volume with devfs */ + volume->devfs_handle = + devfs_register(evms_dir_devfs_handle, + volume->name, + DEVFS_FL_DEFAULT, + EVMS_MAJOR, minor, + S_IFBLK | S_IRUGO | S_IWUGO, &evms_fops, NULL); + + evms_volumes++; + + LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n", + EVMS_MAJOR, minor, EVMS_DEV_NODE_PATH, volume->name); +} + +/* + * function: eelv_check_for_duplicity + * + * This is a support function for evms_export_logical_volumes. + * This routine compares the serial number in the top most node + * in the volume to the list of currently exported volumes. If + * this volumes serial number is found in the list then we know + * this volume is a duplicate and it is then delete. + * + */ +static void +eelv_check_for_duplicity(struct evms_logical_node **discover_list) +{ + struct evms_logical_node *next_node, *node; + struct evms_logical_volume *lv; + int i, is_dup; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + + is_dup = FALSE; + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + lv = &evms_logical_volumes[i]; + /* only check exported volumes */ + if (lv->node) { + char *type_ptr = NULL; + + /* check for duplicate pointer */ + if (node == lv->node) { + is_dup = TRUE; + type_ptr = "pointer"; + /* check for duplicate node */ + } else if (!strcmp(node->name, lv->node->name)) { + is_dup = TRUE; + type_ptr = "node"; + } + if (is_dup == TRUE) { + evms_cs_remove_logical_node_from_list + (discover_list, node); + LOG_DETAILS + ("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n", + type_ptr, EVMS_MAJOR, i, + EVMS_GET_NODE_NAME(node)); + /* forget duplicate */ + break; + } + } + } + } +} + +/* + * function: eelv_reassign_soft_deleted_volume_minors + * + * This is a support function for evms_export_logical_volumes. + * This routine reassigns minor numbers to rediscovered "soft" + * deleted volumes. + * + */ +static void +eelv_reassign_soft_deleted_volume_minors(struct evms_logical_node + **discover_list) +{ + struct evms_logical_node *next_node, *node; + struct evms_logical_volume *lv; + int i, node_removed; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + + node_removed = FALSE; + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + lv = &evms_logical_volumes[i]; + /* only check soft deleted volumes: + * they have a non-NULL name. 
+ */ + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) { + if (!strcmp(EVMS_GET_NODE_NAME(node), lv->name)) { + /* reassign requested minor */ + evms_cs_remove_logical_node_from_list + (discover_list, node); + node_removed = TRUE; + LOG_DEFAULT("Re"); + /* free the previously used name */ + kfree(lv->name); + lv->name = NULL; + /* clear the EVMS_VOLUME_SOFT_DELETED flag */ + lv->flags = 0; + eelv_assign_volume_minor(node, i); + break; + } + } + } + } +} + +/* + * function: eelv_assign_evms_volume_minors + * + * This is a support function for evms_export_logical_volumes. + * This routine assigns minor numbers to new evms volumes. If + * the specified minor is already in use, the requested minor + * is set to 0, and will be assigned next available along with + * any remaining volumes at the end of evms_export_logical_volumes. + * + */ +static void +eelv_assign_evms_volume_minors(struct evms_logical_node **discover_list) +{ + struct evms_logical_node *next_node, *node, *lv_node; + unsigned int requested_minor, node_removed; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + + node_removed = FALSE; + /* only process evms volumes */ + if (node->flags & EVMS_VOLUME_FLAG) { + requested_minor = node->volume_info->volume_minor; + /* is there a requested minor? */ + if (requested_minor) { + int lv_flags = 0; + + /* check range of requested minor */ + if (requested_minor >= MAX_EVMS_VOLUMES) + lv_node = node; + else { + struct evms_logical_volume *lv; + lv = &evms_logical_volumes + [requested_minor]; + lv_node = lv->node; + lv_flags = lv->flags; + } + if ((!lv_node) + && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED))) { + /* assign requested minor */ + evms_cs_remove_logical_node_from_list + (discover_list, node); + node_removed = TRUE; + eelv_assign_volume_minor(node, + requested_minor); + } else { + LOG_WARNING + ("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n", + node->volume_info->volume_name, + requested_minor); + /* + * requested minor is already + * in use, defer assignment + * until later. + */ + node->volume_info->volume_minor = 0; + } + } + } + } +} + +/* + * function: eelv_assign_remaining_evms_volume_minors + * + * This is a support function for evms_export_logical_volumes. + * This routine assigns minor numbers to new evms volumes that + * have no/conflicting minor assignments. This function will + * search from high(255) minor values down, for the first available + * minor. Searching high to low minimizes the possibility of + * conflicting evms volumes causing "compatibility" minor + * assignments to shift from expected assignments. + * + */ +static void +eelv_assign_remaining_evms_volume_minors(struct evms_logical_node + **discover_list) +{ + struct evms_logical_node *next_node, *node; + int requested_minor, node_removed; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + + node_removed = FALSE; + /* only process evms volumes */ + /* all remaining evms volumes should now + * have a minor value of 0, meaning they + * had no minor assignment, or their minor + * assignment conflicted with an existing + * minor assignment. 
+ */ + if (node->flags & EVMS_VOLUME_FLAG) { + evms_cs_remove_logical_node_from_list(discover_list, + node); + node_removed = TRUE; + /* find next available minor number */ + for (requested_minor = 255; + (evms_logical_volumes[requested_minor].node || + evms_logical_volumes[requested_minor].name) && + requested_minor; requested_minor--) ; + /* check range of assigned minor */ + if (!requested_minor) { + LOG_CRITICAL + ("no more minor numbers available for evms volumes!!!!\n"); + DELETE(node); + } else + /* assign requested minor */ + eelv_assign_volume_minor(node, requested_minor); + } + } +} + +/* + * function: eelv_assign_remaining_volume_minors + * + * This is a support function for evms_export_logical_volumes. + * This routine assigns minor numbers to all remaining unassigned + * volumes. Minor numbers are assigned on an availability + * basis. The first free minor number is used in the assignment. + * + */ +static void +eelv_assign_remaining_volume_minors(struct evms_logical_node **discover_list) +{ + struct evms_logical_node *node; + int minor; + + while (*discover_list) { + node = *discover_list; + evms_cs_remove_logical_node_from_list(discover_list, node); + + /* find next available minor number */ + for (minor = 1; + (evms_logical_volumes[minor].node || + evms_logical_volumes[minor].name) && + minor < MAX_EVMS_VOLUMES; minor++) ; + + if (minor >= MAX_EVMS_VOLUMES) { + LOG_CRITICAL + ("no more minor numbers available for compatibility volumes!!!!\n"); + DELETE(node); + } else + /* assign minor */ + eelv_assign_volume_minor(node, minor); + } +} + +/* + * function: eelv_check_for_unreassign_soft_deleted_volume + * + * This is a support function for evms_export_logical_volumes. + * This routine reports any "soft deleted" volumes that were not + * found after a rediscovery. + */ +static void +eelv_check_for_unreassign_soft_deleted_volume(void) +{ + struct evms_logical_volume *lv; + int i; + + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + lv = &evms_logical_volumes[i]; + /* only check soft deleted volumes: + * they have a NULL node ptr & + * they have a non-NULL name. + */ + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) { + if (is_open(i)) + lv->flags |= EVMS_VOLUME_CORRUPT; + LOG_ERROR + ("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n", + ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"), + EVMS_MAJOR, i, lv->name); + if (lv->flags & EVMS_VOLUME_CORRUPT) { + LOG_ERROR + (" flagging volume(%u,%u,%s) as CORRUPT!\n", + EVMS_MAJOR, i, lv->name); + } else { + LOG_ERROR + (" releasing minor(%d) used by volume(%s)!\n", + i, lv->name); + /* clear logical volume structure + * for this volume so it may be + * reused. + */ + kfree(lv->name); + lv->name = NULL; + lv->flags = 0; + } + } + } +} + +static void +eelv_unquiesce_volumes(void) +{ + int i; + + /* check each volume array entry */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[i]; + /* is this volume "quiesced" ? 
*/ + if (volume->quiesced) { + int rc = 1; + if (volume->node) { + /* "unquiesce" it */ + struct inode inode; + struct evms_quiesce_vol_pkt qv; + + qv.command = qv.status = 0; + qv.do_vfs = 0; + qv.minor = i; + rc = evms_quiesce_volume(volume, &inode, NULL, + &qv); + } + /* Wake up any waiters */ + if (rc) { + /* clear the flag */ + volume->quiesced = 0; + /* wake up the waiters */ + if (waitqueue_active(&volume->wait_queue)) + wake_up(&volume->wait_queue); +#ifdef VFS_PATCH_PRESENT + /* unquiesce VFS if quiesced */ + if (volume->vfs_quiesced) { + /* VFS function call to unlock the filesystem */ + unlockfs(MKDEV(EVMS_MAJOR, i)); + volume->vfs_quiesced = FALSE; + } +#endif + } + } + } +} + +/* + * Function: evms_export_logical_volumes + * + * This function is called from evms_discover_volumes. It + * check for duplicate volumes, assigns minor values to evms + * volumes, and assigns minor values to the remaining volumes. + * In addition to assigning minor values to each volume this + * function also completes the final steps necessary to allow + * the volumes to be using by the operating system. + */ +static void +evms_export_logical_volumes(struct evms_logical_node **discover_list) +{ + LOG_EXTRA("exporting EVMS logical volumes...\n"); + + eelv_check_for_duplicity(discover_list); + + eelv_reassign_soft_deleted_volume_minors(discover_list); + + eelv_assign_evms_volume_minors(discover_list); + + eelv_assign_remaining_evms_volume_minors(discover_list); + + eelv_assign_remaining_volume_minors(discover_list); + + eelv_check_for_unreassign_soft_deleted_volume(); + + /* "unquiesce" any "quiesced" volumes */ + eelv_unquiesce_volumes(); +} + +static int +edv_populate_discover_list(struct evms_list_node *src_list, + struct evms_logical_node **trg_list, + struct evms_rediscover_pkt *discover_parms) +{ + int rc = 0, i, move_node, use_all_disks = FALSE; + struct evms_list_node *src_node; + struct evms_logical_node *disk_node = NULL; + + /* if no discover parameters are specified */ + /* copy ALL the disk nodes into the */ + /* discovery list. 
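 *
 * A minimal sketch of the two call shapes that reach this path; the
 * variable names are hypothetical and the packet layout is taken from
 * the initializer used in evms_check_for_device_changes() above:
 *
 *	// full (re)discovery: no parameters, every disk node is considered
 *	evms_discover_volumes(NULL);
 *
 *	// targeted rediscovery: only the listed device handles
 *	u64 handle = NODE_TO_DEV_HANDLE(disk_node);
 *	struct evms_rediscover_pkt pkt = { 0, 0, NULL };
 *	pkt.drive_count = 1;
 *	pkt.drive_array = &handle;
 *	evms_discover_volumes(&pkt);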
*/ + if ((discover_parms == NULL) || + (discover_parms->drive_count == REDISCOVER_ALL_DEVICES)) + use_all_disks = TRUE; + + /* copy the disk nodes specified in the */ + /* discover_parms over to a discover list */ + src_node = src_list; + while (src_node) { + move_node = use_all_disks; + if (move_node == FALSE) + /* check the rediscovery array */ + for (i = 0; i < discover_parms->drive_count; i++) { + disk_node = + DEV_HANDLE_TO_NODE(discover_parms-> + drive_array[i]); + if (disk_node == src_node->item) { + move_node = TRUE; + break; + } + } + /* check to see if we want this node */ + if (move_node == TRUE) + evms_cs_add_logical_node_to_list(trg_list, + (struct + evms_logical_node *) + src_node->item); + /* advance to next struct evms_list_node */ + src_node = src_node->next; + } + return (rc); +} + +static int +evms_discover_volumes(struct evms_rediscover_pkt *discover_parms) +{ + int rc = 0; + struct evms_logical_node *discover_list = NULL; + + evms_discover_logical_disks(&discover_list); + if (evms_global_device_list) { + /* move the appropriate disk nodes, based on */ + /* on the discover parameters, onto the */ + /* discover list for the partition managers */ + /* to process */ + edv_populate_discover_list(evms_global_device_list, + &discover_list, discover_parms); + } + if (discover_list) { +#ifdef LOCAL_DEBUG + display_discover_list(discover_list, "after dev mgrs"); +#endif + evms_discover_logical_partitions(&discover_list); + } + if (discover_list) { +#ifdef LOCAL_DEBUG + display_discover_list(discover_list, "after seg mgrs"); +#endif + evms_discover_volume_groups(&discover_list); + } + if (discover_list) { +#ifdef LOCAL_DEBUG + display_discover_list(discover_list, "after reg mgrs"); +#endif + evms_discover_evms_features(&discover_list); + } + if (discover_list) { +#ifdef LOCAL_DEBUG + display_discover_list(discover_list, "after features"); +#endif + evms_export_logical_volumes(&discover_list); + evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY); + } + return (rc); +} + +/* function: evms_notify_reboot + * + * this function gets called at shutdown time and is used + * to remove any evms controlled volumes from memory, thus + * allowing any plugins needing to flush internal caches + * to do so. + */ +int +evms_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ + int i; + struct evms_logical_volume *volume; + + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + LOG_DEFAULT("stopping all evms controlled volumes.\n"); + + /* quiesce all volumes */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_quiesce_vol_pkt qv; + struct inode inode; + + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + qv.command = 1; // quiesce + qv.minor = i; // + qv.status = 0; // reset status + qv.do_vfs = 0; + evms_quiesce_volume(volume, &inode, NULL, &qv); + } + /* delete all volumes + * + * to ensure this work under the + * most circumstances, a "soft" + * delete will be done. this will + * handle the strange case of a + * volume still being mounted. + */ + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_delete_vol_pkt dv; + + volume = &evms_logical_volumes[i]; + if (!volume->node) + continue; + /* only delete quiesced volumes */ + if (!volume->quiesced) + continue; + /* delete the volume from memory. + * do a 'soft' delete if volume + * is mounted, and 'hard' delete + * if it is not. 
+ */ + dv.command = is_open(i); + dv.minor = i; + dv.status = 0; + evms_delete_volume(volume, &dv); + } + } + return NOTIFY_DONE; +} + +static struct notifier_block evms_notifier = { + .notifier_call = evms_notify_reboot, + .next = NULL, + .priority = INT_MAX, /* before any real devices */ +}; + +/* + * Function: find_root_fs_dev + * If "root=/dev/evms/???" was specified on the kernel command line, and devfs + * is not enabled, we need to determine the appropriate minor number for the + * specified volume for the root fs. + */ +static void +find_root_fs_dev(void) +{ +#ifndef MODULE + char root_name[64] = { 0 }; + char *name; + int i; + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,18) + strncpy(root_name, root_device_name, 63); +#else + get_root_device_name(root_name); +#endif + + if (!strncmp(root_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME) + 1)) { + name = &root_name[strlen(EVMS_DIR_NAME) + 1]; + + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + if (evms_logical_volumes[i].name && + !strncmp(name, evms_logical_volumes[i].name, + strlen(evms_logical_volumes[i].name))) { + ROOT_DEV = MKDEV(EVMS_MAJOR, i); + return; + } + } + } +#endif +} + +/* + * Function: bh_cache_ctor + * this function initializes the b_wait field in the buffer heads + * in our private buffer head pool. + */ +static void +io_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + io_notify_t *io_notify = (io_notify_t *) foo; + memset(io_notify, 0, sizeof (*io_notify)); + } +} + +/* + * Function: bh_cache_ctor + * this function initializes the b_wait field in the buffer heads + * in our private buffer head pool. + */ +static void +bh_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags) +{ + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + struct buffer_head *bh = (struct buffer_head *) foo; + memset(bh, 0, sizeof (*bh)); + init_waitqueue_head(&bh->b_wait); + } +} + +/* + * Function: evms_init_module + * This function runs once at system initialization. + */ +static int __init +evms_init_module(void) +{ + int rc = 0, i; + int *evms_blocksizes; + + LOG_DEFAULT("EVMS v%d.%d.%d initializing .... 
info level(%d).\n", + EVMS_MAJOR_VERSION, + EVMS_MINOR_VERSION, + EVMS_PATCHLEVEL_VERSION, evms_info_level); + + /* initialize memory management counters */ + evms_allocs = (atomic_t) ATOMIC_INIT(0); + evms_logical_nodes = (atomic_t) ATOMIC_INIT(0); + + /* initialize the io_notify_entry pool */ + if (!rc) + evms_io_notify_pool = evms_cs_create_pool(sizeof (io_notify_t), + "EVMS IO Notify", + io_notify_cache_ctor, + NULL); + + /* initialize the "public" buffer_head pool */ + if (!rc) + evms_bh_pool = evms_cs_create_pool(sizeof (struct buffer_head), + "EVMS BH", + bh_cache_ctor, NULL); + + /* allocate the logical volume array */ + if (!rc) + evms_logical_volumes = + kmalloc(sizeof (struct evms_logical_volume) * + MAX_EVMS_VOLUMES, GFP_KERNEL); + if (!evms_logical_volumes) { + rc = -ENOMEM; + } + + /* initialize the logical volume array entries */ + if (!rc) { + memset(evms_logical_volumes, 0, + sizeof (struct evms_logical_volume) * MAX_EVMS_VOLUMES); + for (i = 1; i < MAX_EVMS_VOLUMES; i++) { + struct evms_logical_volume *volume; + + volume = &evms_logical_volumes[i]; + init_waitqueue_head(&volume->wait_queue); + volume->requests_in_progress = + (atomic_t) ATOMIC_INIT(0); +#ifdef CONFIG_SMP + blk_init_queue(&volume->request_queue, + evms_do_request_fn); + blk_queue_make_request(&volume->request_queue, + evms_make_request_fn); +#endif + } + } + + /* allocate EVMS' blk_size array */ + if (!rc) { + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES * + sizeof (int), GFP_KERNEL); + if (!evms_blocksizes) { + rc = -ENOMEM; + LOG_CRITICAL + ("can't allocate memory for EVMS blk_size\n"); + } else { + memset(evms_blocksizes, 0, + MAX_EVMS_VOLUMES * sizeof (int)); + blk_size[EVMS_MAJOR] = evms_blocksizes; + } + } + + /* allocate EVMS' blksize_size array */ + if (!rc) { + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES * + sizeof (int), GFP_KERNEL); + if (!evms_blocksizes) { + rc = -ENOMEM; + LOG_CRITICAL + ("can't allocate memory for EVMS blksize_size\n"); + } else { + memset(evms_blocksizes, 0, + MAX_EVMS_VOLUMES * sizeof (int)); + blksize_size[EVMS_MAJOR] = evms_blocksizes; + } + } + + /* allocate EVMS' hardsect_size array */ + if (!rc) { + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES * + sizeof (int), GFP_KERNEL); + if (!evms_blocksizes) { + rc = -ENOMEM; + LOG_CRITICAL + ("can't allocate memory for EVMS hardsect_size\n"); + } else { + memset(evms_blocksizes, 0, + MAX_EVMS_VOLUMES * sizeof (int)); + hardsect_size[EVMS_MAJOR] = evms_blocksizes; + } + } + + /* Register the block device */ + if (!rc) { + rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME, + &evms_fops); + if (rc) { + LOG_CRITICAL + ("error calling devfs_register_blkdev() err=%u\n", + rc); + rc = -EINVAL; + } + } + + /* Register with devfs */ + if (!rc) { + evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL); + // A NULL return cannot be fatal. 
+ // Devfs just might not be running + if (!evms_dir_devfs_handle) { + LOG_EXTRA + ("NULL return from devfs_mk_dir() for \"%s\"\n", + EVMS_DIR_NAME); + LOG_EXTRA("Is devfs enabled?\n"); + } else { + evms_blk_devfs_handle = + devfs_register(evms_dir_devfs_handle, EVMS_DEV_NAME, + DEVFS_FL_DEFAULT, EVMS_MAJOR, 0, + S_IFBLK | S_IRUGO | S_IWUGO, + &evms_fops, NULL); + if (!evms_blk_devfs_handle) { + LOG_DETAILS + ("NULL return from devfs_register() for \"%s\"\n", + EVMS_DEV_NAME); + } + } + } + + if (!rc) { + read_ahead[EVMS_MAJOR] = 4096; +#ifdef CONFIG_SMP + blk_dev[EVMS_MAJOR].queue = evms_find_queue; +#else + blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR), + evms_do_request_fn); + blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR), + evms_make_request_fn); +#endif +#ifdef CONFIG_PROC_FS + evms_cs_get_evms_proc_dir(); + if (evms_proc_dir) { + create_proc_read_entry("info", 0, evms_proc_dir, + evms_info_read_proc, NULL); + create_proc_read_entry("plugins", 0, evms_proc_dir, + evms_plugins_read_proc, NULL); + create_proc_read_entry("volumes", 0, evms_proc_dir, + evms_volumes_read_proc, NULL); + } + evms_table_header = register_sysctl_table(dev_dir_table, 1); +#endif + /* Register for reboot notification */ + register_reboot_notifier(&evms_notifier); + +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) + /* Register evms 32bit ioctl handlers */ + lock_kernel(); + register_ioctl32_conversion(EVMS_GET_INFO_LEVEL,NULL); + register_ioctl32_conversion(EVMS_SET_INFO_LEVEL,NULL); + register_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32, + evms_rediscover); + register_ioctl32_conversion(EVMS_DELETE_VOLUME,NULL); + register_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32, + evms_plugin_ioctl); + register_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT,NULL); + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK,NULL); + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO,NULL); + register_ioctl32_conversion(EVMS_SECTOR_IO_32, evms_sector_io); + register_ioctl32_conversion(EVMS_GET_MINOR,NULL); + register_ioctl32_conversion(EVMS_GET_VOLUME_DATA,NULL); + register_ioctl32_conversion(EVMS_GET_PLUGIN,NULL); + register_ioctl32_conversion(EVMS_COMPUTE_CSUM_32, + evms_compute_csum); + register_ioctl32_conversion(EVMS_GET_BMAP,NULL); + register_ioctl32_conversion(EVMS_GET_IOCTL_VERSION,NULL); + register_ioctl32_conversion(EVMS_GET_VERSION,NULL); + register_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO,NULL); + register_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS,NULL); + register_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO,NULL); + unlock_kernel(); +#endif + + } + + return rc; +} + +/* + * Function: evms_exit_module + * This function runs once when the EVMS core module is unloaded. 
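+ * Teardown runs in roughly the reverse order of evms_init_module():
+ * the 32-bit ioctl translations (where registered), the devfs entries,
+ * the block device registration and its queue, the
+ * blk_size/blksize_size/hardsect_size arrays, the logical volume
+ * array, both private pools, and the /proc and sysctl entries are all
+ * released here.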
+ */ +static void __exit +evms_exit_module(void) +{ + LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n", + EVMS_MAJOR_VERSION, + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION); + +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) + /* Un-Register evms 32bit ioctl handlers */ + lock_kernel(); + unregister_ioctl32_conversion(EVMS_GET_INFO_LEVEL); + unregister_ioctl32_conversion(EVMS_SET_INFO_LEVEL); + unregister_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32); + unregister_ioctl32_conversion(EVMS_DELETE_VOLUME); + unregister_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32); + unregister_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT); + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK); + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO); + unregister_ioctl32_conversion(EVMS_SECTOR_IO_32); + unregister_ioctl32_conversion(EVMS_GET_MINOR); + unregister_ioctl32_conversion(EVMS_GET_VOLUME_DATA); + unregister_ioctl32_conversion(EVMS_GET_PLUGIN); + unregister_ioctl32_conversion(EVMS_COMPUTE_CSUM_32); + unregister_ioctl32_conversion(EVMS_GET_BMAP); + unregister_ioctl32_conversion(EVMS_GET_IOCTL_VERSION); + unregister_ioctl32_conversion(EVMS_GET_VERSION); + unregister_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO); + unregister_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS); + unregister_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO); + unlock_kernel(); +#endif + + /* unregister with devfs + */ + devfs_unregister(evms_dir_devfs_handle); + /* clean up the queue for the block device + */ + blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR, 0))); + /* unregister block device + */ + devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME); + /* deallocate device arrays + */ + kfree(blk_size[EVMS_MAJOR]); + blk_size[EVMS_MAJOR] = NULL; + kfree(blksize_size[EVMS_MAJOR]); + blksize_size[EVMS_MAJOR] = NULL; + kfree(hardsect_size[EVMS_MAJOR]); + hardsect_size[EVMS_MAJOR] = NULL; + read_ahead[EVMS_MAJOR] = 0; + /* deallocate logical volumes array + */ + kfree(evms_logical_volumes); + /* destroy buffer head pool + */ + evms_cs_destroy_pool(evms_bh_pool); + /* destroy io notify pool + */ + evms_cs_destroy_pool(evms_io_notify_pool); +#ifdef CONFIG_PROC_FS + if (evms_proc_dir) { + remove_proc_entry("volumes", evms_proc_dir); + remove_proc_entry("plugins", evms_proc_dir); + remove_proc_entry("info", evms_proc_dir); + remove_proc_entry("evms", NULL); + } + unregister_sysctl_table(evms_table_header); +#endif +} + +/* + * Function: evms_init_discover + * If EVMS is statically built into the kernel, this function will be called + * to perform an initial volume discovery. 
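+ * It is hooked in through the __initcall() registration below.
+ * Besides discovery, it resolves the root volume's minor number when
+ * the root device major is EVMS_MAJOR.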
+ */ +int __init +evms_init_discover(void) +{ + /* go find volumes */ + evms_discover_volumes(NULL); + + /* Check if the root fs is on EVMS */ + if (MAJOR(ROOT_DEV) == EVMS_MAJOR) { + find_root_fs_dev(); + } + + return 0; +} + +/* + * a placeholder for cluster enablement + */ +void +evms_cluster_init(int nodeid, int clusterid) +{ + /* dummy */ + return; +} + +EXPORT_SYMBOL(evms_cluster_init); + +/* + * a placeholder for cluster enablement + */ +int +evms_cluster_shutdown(void) +{ + /* dummy */ + return -1; +} + +EXPORT_SYMBOL(evms_cluster_shutdown); + +static int __init +evms_boot_info_level(char *str) +{ + int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10); + if (evms_boot_info_level) { + evms_info_level = evms_boot_info_level; + } + return 1; +} + +__setup("evms_info_level=", evms_boot_info_level); +module_init(evms_init_module); +module_exit(evms_exit_module); +__initcall(evms_init_discover); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + +/**********************************************************/ +/* END -- INIT/DISCOVERY support functions */ +/**********************************************************/ diff -Naur linux-2002-09-30/drivers/evms/evms_bbr.c evms-2002-09-30/drivers/evms/evms_bbr.c --- linux-2002-09-30/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/evms_bbr.c Wed Sep 25 15:04:22 2002 @@ -0,0 +1,1817 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* linux/driver/evms/evms_bbr.c + * + * EVMS - Bad Block Relocation (BBR) Feature Plugin + * + * BBR feature is designed to remap I/O write failures to another safe location + * on disk. Note that most disk drives have BBR built into them, this means + * that our software BBR will be only activated when all hardware BBR + * replacement sectors have been used. + */ + +#define LOG_PREFIX "bbr: " + +#include +#include +#include +#include +#include + +#include +#include + +/* API prototypes. */ +static int bbr_discover(struct evms_logical_node ** discover_list); +static int bbr_delete(struct evms_logical_node * node); +static void bbr_read(struct evms_logical_node * node, struct buffer_head * bh); +static void bbr_write(struct evms_logical_node * node, struct buffer_head * bh); +static int bbr_ioctl(struct evms_logical_node * bbr_node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg); +static int bbr_direct_ioctl(struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg); +static int bbr_init_io(struct evms_logical_node * bbr_node, + int io_flag, + u64 startLSN, + u64 nr_sects, + void * bufptr); + +/* Other function prototypes. 
*/ +static int bbr_create_pools(void); +static void bbr_destroy_pools(void); +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id); +static void bbr_io_handler(void * void_data); +static void bbr_free_private(struct bbr_private * bbr_id); +static inline void bbr_list_add(struct bbr_private * bbr_id); + +/* List of all BBR nodes. */ +static struct bbr_private * bbr_instances = NULL; + +/* Data pertaining to the I/O thread. */ +static struct evms_thread * bbr_io_thread = NULL; +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED; +static struct list_head bbr_io_list = LIST_HEAD_INIT(bbr_io_list); + +/* Global pools for bbr_io_buf's and bbr_remap's. */ +kmem_cache_t * bbr_io_buf_slab; +mempool_t * bbr_io_buf_pool; +kmem_cache_t * bbr_remap_slab; +mempool_t * bbr_remap_pool; + +/* Plugin function table and header. */ +static struct evms_plugin_fops function_table = { + .discover = bbr_discover, + .delete = bbr_delete, + .read = bbr_read, + .write = bbr_write, + .init_io = bbr_init_io, + .ioctl = bbr_ioctl, + .direct_ioctl = bbr_direct_ioctl +}; + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_FEATURE, + EVMS_BBR_FEATURE_ID), + .version = { + .major = EVMS_BBR_VERSION_MAJOR, + .minor = EVMS_BBR_VERSION_MINOR, + .patchlevel = EVMS_BBR_VERSION_PATCHLEVEL + }, + .required_services_version = { + .major = EVMS_BBR_COMMON_SERVICES_MAJOR, + .minor = EVMS_BBR_COMMON_SERVICES_MINOR, + .patchlevel = EVMS_BBR_COMMON_SERVICES_PATCHLEVEL + }, + .fops = &function_table +}; + +/** + * le_meta_data_to_cpu + * + * Convert bbr meta data from on-disk (LE) format + * to the native cpu endian format. + */ +void le_meta_data_to_cpu(struct evms_bbr_metadata * md) +{ + md->signature = le32_to_cpup(&md->signature); + md->crc = le32_to_cpup(&md->crc); + md->block_size = le32_to_cpup(&md->block_size); + md->flags = le32_to_cpup(&md->flags); + md->sequence_number = le64_to_cpup(&md->sequence_number); + md->start_sect_bbr_table = le64_to_cpup(&md->start_sect_bbr_table); + md->nr_sects_bbr_table = le64_to_cpup(&md->nr_sects_bbr_table); + md->start_replacement_sect = le64_to_cpup(&md->start_replacement_sect); + md->nr_replacement_blks = le64_to_cpup(&md->nr_replacement_blks); +} + +/** + * le_bbr_table_sector_to_cpu + * + * Convert bbr meta data from on-disk (LE) format + * to the native cpu endian format. 
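+ * The header fields and every entry's bad_sect/replacement_sect pair
+ * are converted in place. The reverse helper,
+ * cpu_bbr_table_sector_to_le(), fills a caller-supplied destination
+ * buffer instead.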
+ */ +void le_bbr_table_sector_to_cpu(struct evms_bbr_table * p) +{ + int i; + p->signature = le32_to_cpup(&p->signature); + p->crc = le32_to_cpup(&p->crc); + p->sequence_number = le32_to_cpup(&p->sequence_number); + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt); + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) { + p->entries[i].bad_sect = + le64_to_cpup(&p->entries[i].bad_sect); + p->entries[i].replacement_sect = + le64_to_cpup(&p->entries[i].replacement_sect); + } +} + +/** + * cpu_bbr_table_sector_to_le + * + * Convert bbr meta data from cpu endian format to on-disk (LE) format + */ +void cpu_bbr_table_sector_to_le(struct evms_bbr_table * p, + struct evms_bbr_table * le) +{ + int i; + le->signature = cpu_to_le32p(&p->signature); + le->crc = cpu_to_le32p(&p->crc); + le->sequence_number = cpu_to_le32p(&p->sequence_number); + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt); + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) { + le->entries[i].bad_sect = + cpu_to_le64p(&p->entries[i].bad_sect); + le->entries[i].replacement_sect = + cpu_to_le64p(&p->entries[i].replacement_sect); + } +} + +#ifdef EVMS_BBR_DEBUG +static void print_meta_data(struct evms_bbr_metadata * md) +{ + LOG_DEBUG("BBR Metadata Sector:\n" + " signature 0x%08X\n" + " crc 0x%08X\n" + " block_size %u\n" + " start_sect_bbr_table "PFU64"\n" + " nr_sects_bbr_table "PFU64"\n" + " start_replacement_sect "PFU64"\n" + " nr_replacement_blks "PFU64"\n", + md->signature, md->crc, md->block_size, + md->start_sect_bbr_table, md->nr_sects_bbr_table, + md->start_replacement_sect, md->nr_replacement_blks); +} + +static void print_bbr_table_sector(struct evms_bbr_table * p) +{ + int i; + LOG_DEBUG("BBR Table Sector:\n" + " sig 0x%08X\n" + " crc 0x%08X\n" + " sequence %u\n" + " in_use_cnt %u\n" + " Table Entries:\n", + p->signature, p->crc, p->sequence_number, p->in_use_cnt); + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) { + LOG_DEBUG(" [%d] bad_sect: "PFU64" replacement_sect: "PFU64"\n", + i, p->entries[i].bad_sect, + p->entries[i].replacement_sect); + } +} + +void print_binary_tree(struct bbr_runtime_remap * node) +{ + if (node) { + LOG_DEFAULT("["PFU64","PFU64"]\n", node->remap.bad_sect, + node->remap.replacement_sect); + print_binary_tree(node->left); + print_binary_tree(node->right); + } +} + +static void print_remap_list(struct bbr_private * bbr_id) +{ + if (bbr_id->remap_root) { + LOG_DEFAULT("%s for %s\n", __FUNCTION__, bbr_id->node->name); + print_binary_tree(bbr_id->remap_root); + } +} +#endif + +/** + * validate_bbr_table_sector + * + * Check the specified BBR table sector for a valid signature and CRC. 
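+ * The stored CRC is saved aside, the crc field is zeroed, the CRC is
+ * recomputed over the whole sector and compared, and the stored value
+ * is then put back. A sector whose crc field is zero is rejected.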
+ */ +static int validate_bbr_table_sector(struct evms_bbr_table * p) +{ + int rc = 0; + int org_crc, final_crc; + + if ( le32_to_cpup(&p->signature) != EVMS_BBR_TABLE_SIGNATURE ) { + LOG_ERROR("BBR table signature doesn't match!\n"); + LOG_ERROR("Sector has (0x%08X) expected(0x%08X)\n", + le32_to_cpup(&p->signature), + EVMS_BBR_TABLE_SIGNATURE); + rc = -EINVAL; + } else { + if (p->crc) { + org_crc = le32_to_cpup(&p->crc); + p->crc = 0; + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p, + sizeof(*p)); + if ( final_crc != org_crc ) { + LOG_ERROR("CRC failed!\n"); + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n", + org_crc, final_crc); + rc = -EINVAL; + } + p->crc = cpu_to_le32p(&org_crc); + } else { + LOG_ERROR("BBR table sector has no CRC!\n"); + rc = -EINVAL; + } + } + if (rc) + BBR_DEBUG_PRINT_TABLE_SECTOR(p); + le_bbr_table_sector_to_cpu(p); + return rc; +} + +/** + * update_invalid_bbr_table_sector + * + * If one copy of a BBR table sector is bad, replace it with the valid copy. + */ +void update_invalid_bbr_table_sector(struct evms_logical_node * node, + struct evms_bbr_table * valid, + struct evms_bbr_table * invalid, + u64 lsn) +{ + int rc; + struct evms_bbr_table * tmp_bbr_table; + + /* Correct the invalid bbr table sector */ + memcpy(invalid, valid, sizeof(struct evms_bbr_table)); + + /* Allocate memory for I/O */ + tmp_bbr_table = kmalloc(sizeof(struct evms_bbr_table), GFP_KERNEL); + if (tmp_bbr_table) { + memset(tmp_bbr_table, 0, sizeof(struct evms_bbr_table)); + cpu_bbr_table_sector_to_le(valid, tmp_bbr_table); + LOG_WARNING("Correcting BBR table sector "PFU64"\n", lsn); + rc = INIT_IO(node, 1, lsn, 1, tmp_bbr_table); + if (rc) { + LOG_ERROR("Could not correct BBR table sector "PFU64".\n", + lsn); + } + kfree(tmp_bbr_table); + } +} + +/** + * validate_bbr_table + * + * Validate the entire range of sectors in the BBR table. + */ +static u32 validate_bbr_table(struct evms_bbr_metadata * md, + struct evms_bbr_table * p) +{ + u32 i, nr_sects; + + nr_sects = md->nr_sects_bbr_table; + + for ( i = 0; i < nr_sects; i++, p++ ) { + if ( validate_bbr_table_sector(p) ) + break; + } + + if ( i != nr_sects ) { + LOG_SERIOUS("Stopped BBR table validation at sector %u.\n", i); + nr_sects = i; + } + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects); + return nr_sects; +} + +/** + * validate_bbr_tables + * @node: BBR node to validate. + * @MD1: Primary metadata sector. + * @MD2: Secondary metadata sector. + * @p1: Primary BBR table. + * @p2: Secondary BBR table. + * + * Validate both copies of the BBR table. If one of them is invalid, + * try to correct the errors using the valid copy. + */ +static u32 validate_bbr_tables(struct evms_logical_node * node, + struct evms_bbr_metadata * MD1, + struct evms_bbr_metadata * MD2, + struct evms_bbr_table * p1, + struct evms_bbr_table * p2) +{ + u32 i, rc1, rc2, nr_sects; + + nr_sects = MD1->nr_sects_bbr_table; + if ( nr_sects != MD2->nr_sects_bbr_table ) { + nr_sects = (nr_sects < MD2->nr_sects_bbr_table) ? + nr_sects : MD2->nr_sects_bbr_table; + LOG_SERIOUS("Size of BBR tables don't match. Using %u\n", + nr_sects); + } + + for ( i = 0; i < nr_sects; i++, p1++, p2++ ) { + rc1 = validate_bbr_table_sector(p1); + if (rc1) { + LOG_WARNING("Invalid BBR table sector at "PFU64".\n", + MD1->start_sect_bbr_table + i); + } + rc2 = validate_bbr_table_sector(p2); + if (rc2) { + LOG_WARNING("Invalid BBR table sector at "PFU64".\n", + MD2->start_sect_bbr_table + i); + } + + /* Correct BBR table errors. */ + if (rc1 && rc2) { + /* Cannot fix. 
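+ * Both copies of this table sector failed validation, so
+ * the usable table is truncated at this index; the caller
+ * will mark the BBR node read-only in that case.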
*/ + break; + } else if (rc1) { + update_invalid_bbr_table_sector(node, p2, p1, + MD1->start_sect_bbr_table + i); + continue; + } else if (rc2) { + update_invalid_bbr_table_sector(node, p1, p2, + MD2->start_sect_bbr_table + i); + continue; + } + + if ( p1->sequence_number != p2->sequence_number ) { + LOG_WARNING("Sequence numbers for BBR table index %u don't match.\n", i); + LOG_WARNING("MD1 sequence_nr=%u, MD2 sequence_nr_2=%u\n", + p1->sequence_number, p2->sequence_number); + if ( p1->sequence_number < p2->sequence_number ) { + update_invalid_bbr_table_sector(node, p2, p1, + MD1->start_sect_bbr_table + i); + } else { + update_invalid_bbr_table_sector(node, p1, p2, + MD2->start_sect_bbr_table + i); + } + } + } + if ( i != nr_sects ) { + LOG_SERIOUS("Stopped validation at sector %u\n", i); + nr_sects = i; + } + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects); + return nr_sects; +} + +/** + * validate_meta_data + * + * Check the specified BBR metadata sector for a valid signature and CRC. + */ +static int validate_meta_data(struct evms_bbr_metadata * md) +{ + int org_crc, final_crc; + + BBR_DEBUG_PRINT_META_DATA(md); + + if ( le32_to_cpup(&md->signature) != EVMS_BBR_SIGNATURE ) { + LOG_SERIOUS("BBR signature doesn't match!\n"); + LOG_SERIOUS("Found: 0x%08X Expecting: 0x%08X\n", + le32_to_cpup(&md->signature), EVMS_BBR_SIGNATURE); + return -EINVAL; + } + + if (md->crc) { + org_crc = le32_to_cpup(&md->crc); + md->crc = 0; + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md, + sizeof(*md)); + if ( final_crc != org_crc ) { + LOG_ERROR("CRC failed!\n"); + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n", + org_crc, final_crc); + return -EINVAL; + } + md->crc = cpu_to_le32p(&org_crc); + } else { + LOG_WARNING("Metadata sector has no CRC!\n"); + } + + le_meta_data_to_cpu(md); + return 0; +} + +/** + * bbr_load_meta_data + * @node: BBR node to read metadata from. + * @lsn: Sector to read metadata from. + * @md: Pointer to return metadata structure. + * @bbr_table: Pointer to return BBR table. + * + * Load one copy of the BBR metadata. If the metadata is valid, load the + * corresponding copy of the BBR table. + */ +static int load_meta_data(struct evms_logical_node * node, + u64 lsn, + struct evms_bbr_metadata ** md, + struct evms_bbr_table ** bbr_table) +{ + int rc; + + *md = NULL; + *bbr_table = NULL; + + if (!lsn) { + LOG_WARNING("No sector specified for BBR metadata on %s.\n", + node->name); + return -ENODATA; + } + + /* Allocate a buffer for the metadata sector. */ + *md = kmalloc(sizeof(struct evms_bbr_metadata), GFP_KERNEL); + if (!*md) { + LOG_ERROR("kmalloc error creating metadata buffer for %s.\n", + node->name); + return -ENOMEM; + } + + /* Read the metadata sector. */ + rc = INIT_IO(node, 0, lsn, 1, *md); + if (rc) { + LOG_ERROR("init_io error on %s.\n", node->name); + kfree(*md); + *md = NULL; + return rc; + } + + /* Validate the metadata sector. */ + rc = validate_meta_data(*md); + if (rc) { + LOG_ERROR("Error validating metadata for %s.\n", node->name); + kfree(*md); + *md = NULL; + return rc; + } + + /* Allocate a buffer for the BBR table. */ + *bbr_table = kmalloc((*md)->nr_sects_bbr_table << + EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL); + if (!*bbr_table) { + LOG_ERROR("kmalloc error creating BBR table buffer for %s.\n", + node->name); + kfree(*md); + *md = NULL; + return -ENOMEM; + } + + /* Read the BBR table but don't validate here. 
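+ * Validation is left to the caller, which may have both copies
+ * of the table available and can then repair one copy from the
+ * other.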
*/ + rc = INIT_IO(node, 0, (*md)->start_sect_bbr_table, + (*md)->nr_sects_bbr_table, *bbr_table); + if (rc) { + LOG_ERROR("init_io error on %s.\n", node->name); + kfree(*md); + *md = NULL; + kfree(*bbr_table); + *bbr_table = NULL; + } + + return rc; +} + +/** + * bbr_load_feature_data + * @node: BBR node + * @ID: Return pointer to BBR private data. + * + * Load both copies of the BBR metadata and table. If one is invalid, try + * to correct is using the valid copy. When a valid copy is found, create + * a private data structure for the specified node. + */ +static int load_feature_data(struct evms_logical_node * node, + struct bbr_private ** ID) +{ + struct evms_bbr_metadata * md1 = NULL; + struct evms_bbr_metadata * md2 = NULL; + struct evms_bbr_table * table1 = NULL; + struct evms_bbr_table * table2 = NULL; + u64 lba_table1 = 0, lba_table2 = 0; + u32 nr_sects = 0; + int rc = 0, rc1, rc2; + + *ID = NULL; + + /* Load metadata 1 */ + rc1 = load_meta_data(node, + node->feature_header->feature_data1_start_lsn, + &md1, &table1); + /* Load metadata 2 */ + rc2 = load_meta_data(node, + node->feature_header->feature_data2_start_lsn, + &md2, &table2); + + if (rc1 && rc2) { + /* Both copies are bad? Cannot continue. */ + rc = -ENODATA; + } else if (rc1 || rc2) { + /* One copy is bad. Use the good copy. */ + if (rc1) { + lba_table2 = md2->start_sect_bbr_table; + kfree(table1); + kfree(md1); + table1 = table2; + table2 = NULL; + md1 = md2; + md2 = NULL; + } else { + lba_table1 = md1->start_sect_bbr_table; + } + + nr_sects = validate_bbr_table(md1, table1); + if ( nr_sects == 0 ) { + rc = -ENODATA; + } + } else { + lba_table1 = md1->start_sect_bbr_table; + lba_table2 = md2->start_sect_bbr_table; + nr_sects = validate_bbr_tables(node, md1, md2, table1, table2); + if ( nr_sects == 0 ) { + rc = -ENODATA; + } + } + + if (!rc && nr_sects) { + *ID = kmalloc(sizeof(struct bbr_private), GFP_KERNEL); + if (*ID) { + memset(*ID, 0, sizeof(struct bbr_private)); + (*ID)->source = node; + (*ID)->blksize_in_sects = md1->block_size >> + EVMS_VSECTOR_SIZE_SHIFT; + (*ID)->remap_root = NULL; + (*ID)->lba_table1 = lba_table1; + (*ID)->lba_table2 = lba_table2; + (*ID)->bbr_table = table1; + (*ID)->nr_sects_bbr_table = nr_sects; + if ( nr_sects < md1->nr_sects_bbr_table ) { + LOG_WARNING("Making BBR node read-only\n"); + (*ID)->flag |= EVMS_VOLUME_READ_ONLY; + } + (*ID)->nr_replacement_blks = nr_sects * + EVMS_BBR_ENTRIES_PER_SECT; + (*ID)->start_replacement_sect = md1->start_replacement_sect; + (*ID)->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0); + (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED; + if ( !bbr_remap_pool || !bbr_io_buf_pool ) { + rc = bbr_create_pools(); + } + if (!rc) { + atomic_set(&(*ID)->in_use_replacement_blks, + bbr_table_to_remap_list(*ID)); + } + } else { + rc = -ENOMEM; + } + } + + if (!rc) { + if (!bbr_io_thread) { + const char * name = "evms_bbr_io"; + bbr_io_thread = evms_cs_register_thread(bbr_io_handler, + NULL, name); + if (!bbr_io_thread) { + rc = -EINVAL; + } + } + } + + /* If error, free table1. */ + if (rc) { + if (table1) { + kfree(table1); + } + if (*ID) { + (*ID)->bbr_table = NULL; + bbr_free_private(*ID); + (*ID) = NULL; + } + } + + /* Will never use md1, md2 and table2 again */ + if (md1) { + kfree(md1); + } + if (md2) { + kfree(md2); + } + if (table2) { + kfree(table2); + } + + return rc; +} + +/** + * bbr_binary_tree_insert + * + * Insert a node into the binary tree. 
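+ * The tree is keyed on remap.bad_sect; keys that are not strictly
+ * greater than the current node's descend to the left, and no
+ * rebalancing is done, so the tree's shape depends on insertion order.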
+ */ +void bbr_binary_tree_insert(struct bbr_runtime_remap ** root, + struct bbr_runtime_remap * newnode) +{ + struct bbr_runtime_remap ** node = root; + while (node && *node) { + if ( newnode->remap.bad_sect > (*node)->remap.bad_sect ) { + node = &((*node)->right); + } else { + node = &((*node)->left); + } + } + + newnode->left = newnode->right = NULL; + *node = newnode; +} + +/** + * bbr_binary_search + * + * Search for a node that contains bad_sect = lsn. + */ +struct bbr_runtime_remap * bbr_binary_search(struct bbr_runtime_remap * root, + u64 lsn) +{ + struct bbr_runtime_remap * node = root; + while (node) { + if (node->remap.bad_sect == lsn) { + break; + } + if ( lsn > node->remap.bad_sect ) { + node = node->right; + } else { + node = node->left; + } + } + return node; +} + +/** + * bbr_binary_tree_destroy + * + * Destroy the binary tree. + */ +void bbr_binary_tree_destroy(struct bbr_runtime_remap * root, + struct bbr_private * bbr_id) +{ + struct bbr_runtime_remap ** link = NULL; + struct bbr_runtime_remap * node = root; + + while (node) { + if (node->left) { + link = &(node->left); + node = node->left; + continue; + } + if (node->right) { + link = &(node->right); + node = node->right; + continue; + } + + mempool_free(node, bbr_remap_pool); + if (node == root) { + /* If root is deleted, we're done. */ + break; + } + + /* Back to root. */ + node = root; + *link = NULL; + } +} + +static void bbr_free_remap(struct bbr_private * bbr_id) +{ + unsigned long flags; + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags); + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id); + bbr_id->remap_root = NULL; + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags); +} + +/** + * bbr_insert_remap_entry + * + * Create a new remap entry and add it to the binary tree for this node. + */ +static int bbr_insert_remap_entry(struct bbr_private * bbr_id, + struct evms_bbr_table_entry * new_bbr_entry) +{ + struct bbr_runtime_remap * newnode = NULL; + unsigned long flags; + int rc; + + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO); + if (!newnode) { + rc = -ENOMEM; + LOG_SERIOUS("Could not allocate from remap pool! (rc=%d)\n", rc); + return rc; + } + newnode->remap.bad_sect = new_bbr_entry->bad_sect; + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect; + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags); + bbr_binary_tree_insert(&bbr_id->remap_root, newnode); + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags); + return 0; +} + +/** + * bbr_table_to_remap_list + * + * The on-disk bbr table is sorted by the replacement sector LBA. In order to + * improve run time performance, the in memory remap list must be sorted by + * the bad sector LBA. This function is called at discovery time to initialize + * the remap list. This function assumes that at least one copy of meta data + * is valid. + */ +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id) +{ + u32 in_use_blks = 0; + int i, j; + struct evms_bbr_table * p; + + + for ( i = 0, p = bbr_id->bbr_table; + i < bbr_id->nr_sects_bbr_table; + i++, p++ ) { + if (!p->in_use_cnt) { + break; + } + in_use_blks += p->in_use_cnt; + for ( j = 0; j < p->in_use_cnt; j++ ) { + bbr_insert_remap_entry(bbr_id, &p->entries[j]); + } + } + + return in_use_blks; +} + +/** + * bbr_search_remap_entry + * + * Search remap entry for the specified sector. If found, return a pointer to + * the table entry. Otherwise, return NULL. 
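+ * The walk is done under bbr_id->bbr_id_lock, the same lock taken
+ * when new remap entries are inserted, so lookups are serialized
+ * against remapping activity.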
+ */ +static struct evms_bbr_table_entry * bbr_search_remap_entry(struct bbr_private * bbr_id, + u64 lsn) +{ + struct bbr_runtime_remap * p; + unsigned long flags; + + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags); + p = bbr_binary_search(bbr_id->remap_root, lsn); + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags); + if (p) { + return (&p->remap); + } else { + return NULL; + } +} + +/** + * bbr_remap + * + * If *lsn is in the remap table, return TRUE and modify *lsn, + * else, return FALSE. + */ +static inline int bbr_remap(struct bbr_private * bbr_id, + u64 * lsn) +{ + struct evms_bbr_table_entry *e; + + if ( atomic_read(&bbr_id->in_use_replacement_blks) && + ! (bbr_id->flag & BBR_STOP_REMAP) ) { + e = bbr_search_remap_entry(bbr_id, *lsn); + if (e) { + *lsn = e->replacement_sect; + LOG_EXTRA("%s replacement sector (LSN="PFU64")\n", + __FUNCTION__, *lsn); + return TRUE; + } + } + return FALSE; +} + +/** + * bbr_remap_probe + * + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap + * table return TRUE, Else, return FALSE. + */ +static inline int bbr_remap_probe(struct bbr_private * bbr_id, + u64 lsn, u64 nr_sects) +{ + u64 tmp, cnt; + + if ( atomic_read(&bbr_id->in_use_replacement_blks) && + ! (bbr_id->flag & BBR_STOP_REMAP) ) { + for ( cnt = 0, tmp = lsn; + cnt < nr_sects; + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) { + if ( bbr_remap(bbr_id,&tmp) ) { + return TRUE; + } + } + } + return FALSE; +} + +static void *bbr_slab_pool_alloc(int gfp_mask, void * data) +{ + return kmem_cache_alloc(data, gfp_mask); +} + +static void bbr_slab_pool_free(void *ptr, void * data) +{ + kmem_cache_free(data, ptr); +} + +static int bbr_create_pools(void) +{ + /* Create a memory pool for the remap list. */ + if (!bbr_remap_slab) { + bbr_remap_slab = kmem_cache_create("BBR_Remap_Slab", + sizeof(struct bbr_runtime_remap), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!bbr_remap_slab) { + panic("Unable to create BBR remap cache."); + } + } + if (!bbr_remap_pool) { + bbr_remap_pool = mempool_create(64, bbr_slab_pool_alloc, + bbr_slab_pool_free, + bbr_remap_slab); + if (!bbr_remap_pool) { + panic("Unable to create BBR remap pool."); + } + } + + /* Create a memory pool for the BBR I/O anchors. */ + if (!bbr_io_buf_slab) { + bbr_io_buf_slab = kmem_cache_create("BBR_IO_Buf_Slab", + sizeof(struct bbr_io_buffer), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!bbr_io_buf_slab) { + panic("Unable to create BBR I/O buffer cache."); + } + } + if (!bbr_io_buf_pool) { + bbr_io_buf_pool = mempool_create(256, bbr_slab_pool_alloc, + bbr_slab_pool_free, + bbr_io_buf_slab); + if (!bbr_io_buf_pool) { + panic("Unable to create BBR I/O buffer pool."); + } + } + + return 0; +} + +static void bbr_destroy_pools(void) +{ + if (bbr_io_buf_pool) { + mempool_destroy(bbr_io_buf_pool); + bbr_io_buf_pool = NULL; + } + if (bbr_io_buf_slab) { + kmem_cache_destroy(bbr_io_buf_slab); + bbr_io_buf_slab = NULL; + } + if (bbr_remap_pool) { + mempool_destroy(bbr_remap_pool); + bbr_remap_pool = NULL; + } + if (bbr_remap_slab) { + kmem_cache_destroy(bbr_remap_slab); + bbr_remap_slab = NULL; + } +} + +/** + * bbr_discover + * + * Search through the discover list looking for object with BBR metadata. + * Remove them from the list and replace with a new BBR node. 
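+ * The exported BBR object is smaller than the child it consumes: two
+ * sectors plus the sizes of both feature-data areas recorded in the
+ * feature header are subtracted from the child's total_vsectors.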
+ */ +static int bbr_discover(struct evms_logical_node ** discover_list) +{ + struct evms_logical_node * node, * next_node; + struct evms_logical_node * bbr_node = NULL; + struct bbr_private * bbr_id; + int bad_blocks, rc = 0; + + MOD_INC_USE_COUNT; + + next_node = *discover_list; + while (next_node) { + node = next_node; + next_node = node->next; + + /* The node must have a BBR feature-header. */ + if ( ! node->feature_header || + node->feature_header->feature_id != plugin_header.id ) { + continue; + } + + rc = load_feature_data(node, &bbr_id); + if (rc) { + /* Error loading feature data. + * This node belongs to us, but metadata is invalid, + * - remove it from the discovery list + * - delete it + * - clear error code then continue. + * Will consider creating a read only BBR node in + * the future. + */ + LOG_SERIOUS("Error in node (%s) with "PFU64" sectors.\n", + node->name, node->total_vsectors); + evms_cs_remove_logical_node_from_list(discover_list, + node); + DELETE(node); + rc = 0; + continue; + } + + rc = evms_cs_allocate_logical_node(&bbr_node); + if (rc) { + LOG_SERIOUS("Could not allocate logical node! rc=%d\n", rc); + bbr_free_private(bbr_id); + continue; + } + + MOD_INC_USE_COUNT; + bbr_node->volume_info = node->volume_info; + bbr_node->flags |= node->flags; + bbr_node->plugin = &plugin_header; + strcpy(bbr_node->name, + node->feature_header->object_name); + bbr_node->hardsector_size = node->hardsector_size; + bbr_node->total_vsectors = node->total_vsectors - 2 - + node->feature_header->feature_data1_size - + node->feature_header->feature_data2_size; + bbr_node->block_size = node->block_size; + bbr_node->private = bbr_id; + bbr_id->node = bbr_node; + + /* Free the feature header */ + kfree(node->feature_header); + node->feature_header = NULL; + evms_cs_remove_logical_node_from_list(discover_list, node); + + /* If bad blocks exist, give warning */ + bad_blocks = atomic_read(&bbr_id->in_use_replacement_blks); + if (bad_blocks) { + BBR_DEBUG_PRINT_REMAP_LIST(bbr_id); + LOG_WARNING("%s has %d bad blocks.\n", + bbr_id->source->name, bad_blocks); + LOG_WARNING("There are "PFU64" total replacement blocks.\n", + bbr_id->nr_replacement_blks); + LOG_WARNING("There are "PFU64" remaining replacement blocks.\n", + bbr_id->nr_replacement_blks - + bad_blocks); + } + + evms_cs_add_logical_node_to_list(discover_list, bbr_node); + bbr_list_add(bbr_id); + } + + MOD_DEC_USE_COUNT; + return rc; +} + +static inline void bbr_list_add(struct bbr_private * bbr_id) +{ + bbr_id->next = bbr_instances; + bbr_instances = bbr_id; +} + +static void bbr_list_remove(struct bbr_private * bbr_id) +{ + struct bbr_private ** p; + + for ( p = &bbr_instances; *p; p = &(*p)->next ) { + if ( *p == bbr_id ) { + *p = (*p)->next; + break; + } + } +} + +static struct bbr_private * bbr_find_private(char * object_name) +{ + struct bbr_private * p; + + for ( p = bbr_instances; p; p = p->next ) { + if ( ! strncmp(p->node->name, object_name, + EVMS_VOLUME_NAME_SIZE) ) { + return p; + } + } + return NULL; +} + +static void bbr_free_private(struct bbr_private * bbr_id) +{ + if (bbr_id->remap_root) { + bbr_free_remap(bbr_id); + } + if (bbr_id->bbr_table) { + kfree(bbr_id->bbr_table); + } + bbr_list_remove(bbr_id); + kfree(bbr_id); +} + +/** + * bbr_delete + * + * Delete the specified BBR node and the node it is built on. If the last BBR + * node is deleted, shut down the I/O thread. 
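+ * The slab caches and mempools are also torn down once the list of
+ * BBR instances becomes empty.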
+ */ +static int bbr_delete(struct evms_logical_node * bbr_node) +{ + struct bbr_private * bbr_id; + int rc; + + bbr_id = bbr_node->private; + + rc = DELETE(bbr_id->source); + if (!rc) { + /* Now cleanup and go away */ + bbr_free_private(bbr_id); + evms_cs_deallocate_logical_node(bbr_node); + if (!bbr_instances) { + bbr_destroy_pools(); + if (bbr_io_thread) { + evms_cs_unregister_thread(bbr_io_thread); + bbr_io_thread = NULL; + } + } + MOD_DEC_USE_COUNT; + } + return rc; +} + +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id, + struct buffer_head * bh, + int rw) +{ + struct bbr_io_buffer * bbr_io_buf; + + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO); + if (bbr_io_buf) { + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer)); + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list); + bbr_io_buf->bbr_id = bbr_id; + bbr_io_buf->bh = bh; + bbr_io_buf->rw = rw; + } else { + LOG_WARNING("Could not allocate from BBR I/O buffer pool!\n"); + } + return bbr_io_buf; +} + +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf) +{ + mempool_free(bbr_io_buf, bbr_io_buf_pool); +} + +/** + * bbr_io_remap_error + * @bbr_id: Private data for the BBR node. + * @rw: READ or WRITE. + * @starting_lsn: Starting sector of request to remap. + * @count: Number of sectors in the request. + * @buffer: Data buffer for the request. + * + * For the requested range, try to write each sector individually. For each + * sector that fails, find the next available remap location and write the + * data to that new location. Then update the table and write both copies + * of the table to disk. Finally, update the in-memory mapping and do any + * other necessary bookkeeping. + */ +static int bbr_io_remap_error(struct bbr_private * bbr_id, + int rw, + u64 starting_lsn, + u64 count, + char * buffer ) +{ + struct evms_bbr_table * bbr_table; + unsigned long table_sector_index; + unsigned long table_sector_offset; + unsigned long index; + u64 lsn, new_lsn; + int rc; + + if ( rw == READ ) { + /* Nothing can be done about read errors. */ + return -EIO; + } + + /* For each sector in the request. */ + for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) { + rc = INIT_IO(bbr_id->source, rw, starting_lsn + lsn, 1, buffer); + while (rc) { + if ( bbr_id->flag & BBR_STOP_REMAP ) { + /* Can't allow new remaps if the + * engine told us to stop. + */ + LOG_ERROR("Object %s: Bad sector ("PFU64"), but remapping is turned off.\n", + bbr_id->node->name, starting_lsn+lsn); + return -EIO; + } + + /* Find the next available relocation sector. */ + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks); + if ( new_lsn >= bbr_id->nr_replacement_blks ) { + /* No more replacement sectors available. */ + return -EIO; + } + new_lsn += bbr_id->start_replacement_sect; + + /* Write the data to its new location. */ + LOG_WARNING("Object %s: Trying to remap bad sector ("PFU64") to sector ("PFU64")\n", + bbr_id->node->name, starting_lsn + lsn, + new_lsn); + rc = INIT_IO(bbr_id->source, rw, new_lsn, 1, buffer); + if (rc) { + /* This replacement sector is bad. + * Try the next one. + */ + LOG_ERROR("Object %s: Replacement sector ("PFU64") is bad. Skipping.\n", + bbr_id->node->name, new_lsn); + atomic_inc(&bbr_id->in_use_replacement_blks); + continue; + } + + /* Add this new entry to the on-disk table. 
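+ * The entry's slot is derived from the replacement LSN:
+ * sector offset = (new_lsn - start) / EVMS_BBR_ENTRIES_PER_SECT,
+ * entry index = (new_lsn - start) % EVMS_BBR_ENTRIES_PER_SECT.
+ * For example, if EVMS_BBR_ENTRIES_PER_SECT were 31 (illustrative
+ * value only), replacement block 40 would land in table sector 1,
+ * entry 9.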
*/ + table_sector_index = new_lsn - + bbr_id->start_replacement_sect; + table_sector_offset = table_sector_index / + EVMS_BBR_ENTRIES_PER_SECT; + index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT; + + bbr_table = &bbr_id->bbr_table[table_sector_offset]; + bbr_table->entries[index].bad_sect = starting_lsn + lsn; + bbr_table->entries[index].replacement_sect = new_lsn; + bbr_table->in_use_cnt++; + bbr_table->sequence_number++; + bbr_table->crc = 0; + bbr_table->crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, + bbr_table, + sizeof(struct evms_bbr_table)); + + /* Write the table to disk. */ + cpu_bbr_table_sector_to_le(bbr_table, bbr_table); + if ( bbr_id->lba_table1 ) { + rc = INIT_IO(bbr_id->source, WRITE, + bbr_id->lba_table1 + + table_sector_offset, + 1, bbr_table); + } + if ( bbr_id->lba_table2 ) { + rc |= INIT_IO(bbr_id->source, WRITE, + bbr_id->lba_table2 + + table_sector_offset, + 1, bbr_table); + } + le_bbr_table_sector_to_cpu(bbr_table); + + if (rc) { + /* Error writing one of the tables to disk. */ + LOG_ERROR("Object %s: Error updating BBR tables on disk.\n", + bbr_id->node->name); + return rc; + } + + /* Insert a new entry in the remapping binary-tree. */ + rc = bbr_insert_remap_entry(bbr_id, + &bbr_table->entries[index]); + if (rc) { + LOG_ERROR("Object %s: Error adding new entry to remap tree.\n", + bbr_id->node->name); + return rc; + } + + atomic_inc(&bbr_id->in_use_replacement_blks); + } + } + + return 0; +} + +/** + * bbr_io_process_request + * + * For each sector in this request, check if the sector has already + * been remapped. If so, process all previous sectors in the request, + * followed by the remapped sector. Then reset the starting lsn and + * count, and keep going with the rest of the request as if it were + * a whole new request. If any of the INIT_IO's return an error, + * call the remapper to relocate the bad sector(s). + */ +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf) +{ + struct bbr_private * bbr_id = bbr_io_buf->bbr_id; + u64 starting_lsn = bbr_io_buf->bh->b_rsector; + u64 count = bbr_io_buf->bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + u64 lsn, remapped_lsn; + char * buffer = bbr_io_buf->bh->b_data; + int rc = 0, rw = bbr_io_buf->rw; + + /* For each sector in this request, check if this sector has already + * been remapped. If so, process all previous sectors in this request, + * followed by the remapped sector. Then reset the starting lsn and + * count and keep going with the rest of the request as if it were + * a whole new request. + */ + for ( lsn = 0; lsn < count && !(bbr_id->flag & BBR_STOP_REMAP); lsn++ ) { + remapped_lsn = starting_lsn + lsn; + rc = bbr_remap(bbr_id, &remapped_lsn); + if (!rc) { + /* This sector is fine. */ + continue; + } + + /* Process all sectors in the request up to this one. */ + if ( lsn > 0 ) { + rc = INIT_IO(bbr_id->source, rw, + starting_lsn, lsn, buffer); + if (rc) { + /* If this I/O failed, then one of the sectors + * in this request needs to be relocated. + */ + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn, + lsn, buffer); + if (rc) { + return rc; + } + } + buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT); + } + + /* Process the remapped sector. */ + rc = INIT_IO(bbr_id->source, rw, remapped_lsn, 1, buffer); + if (rc) { + /* BUGBUG - Need more processing if this caused an + * an error. If this I/O failed, then the existing + * remap is now bad, and we need to find a new remap. 
+ * Can't use bbr_io_remap_error(), because the existing + * map entry needs to be changed, not added again, and + * the original table entry also needs to be changed. + */ + return rc; + } + + buffer += EVMS_VSECTOR_SIZE; + starting_lsn += (lsn + 1); + count -= (lsn + 1); + lsn = -1; + } + + /* Check for any remaining sectors after the last split. This could + * potentially be the whole request, but that should be a rare case + * because requests should only be processed by the thread if we know + * an error occurred or they contained one or more remapped sectors. + */ + if ( count ) { + rc = INIT_IO(bbr_id->source, rw, starting_lsn, count, buffer); + if (rc) { + /* If this I/O failed, then one of the sectors in this + * request needs to be relocated. + */ + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn, + count, buffer); + if (rc) { + return rc; + } + } + } + + return 0; +} + +/** + * bbr_io_handler + * + * This is the handler for the bbr_io_thread. It continuously loops, + * taking I/O requests off its list and processing them. If nothing + * is on the list, the thread goes back to sleep until specifically + * woken up. + * + * I/O requests should only be sent to this thread if we know that: + * a) the request contains at least one remapped sector. + * or + * b) the request caused an error on the normal I/O path. + * This function uses synchronous I/O, so sending a request to this + * thread that doesn't need special processing will cause severe + * performance degredation. + */ +static void bbr_io_handler(void * void_data) +{ + struct bbr_io_buffer * bbr_io_buf; + struct buffer_head * bh; + unsigned long flags; + int rc = 0; + + while (1) { + /* Process bbr_io_list, one entry at a time. */ + spin_lock_irqsave(&bbr_io_list_lock, flags); + if (list_empty(&bbr_io_list)) { + /* No more items on the list. */ + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + break; + } + bbr_io_buf = list_entry(bbr_io_list.next, + struct bbr_io_buffer, bbr_io_list); + list_del(&bbr_io_buf->bbr_io_list); + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + + rc = bbr_io_process_request(bbr_io_buf); + + /* Clean up and complete the original I/O. */ + bh = bbr_io_buf->bh; + if (bh->b_end_io) { + free_bbr_io_buf(bbr_io_buf); + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL); + bh->b_end_io(bh, rc ? 0 : 1); + } else { + /* A request that originated from bbr_init_io. */ + bbr_io_buf->rc = rc; + complete(bbr_io_buf->complete); + } + } +} + +/** + * bbr_schedule_io + * + * Place the specified bbr_io_buf on the thread's processing list. + */ +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf) +{ + unsigned long flags; + + spin_lock_irqsave(&bbr_io_list_lock, flags); + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list); + spin_unlock_irqrestore(&bbr_io_list_lock, flags); + evms_cs_wakeup_thread(bbr_io_thread); +} + +/** + * bbr_read + * + * If there are any remapped sectors on this object, send this request over + * to the thread for processing. Otherwise send it down the stack normally. + */ +static void bbr_read(struct evms_logical_node * bbr_node, + struct buffer_head * bh ) +{ + struct bbr_private * bbr_id = bbr_node->private; + struct bbr_io_buffer * bbr_io_buf; + + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + bbr_node->total_vsectors ) { + /* Request is off the end of the object. */ + bh->b_end_io(bh, 0); + return; + } + + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + bbr_id->flag & BBR_STOP_REMAP || + ! 
bbr_remap_probe(bbr_id, bh->b_rsector, + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) { + /* No existing remaps, this request doesn't contain any + * remapped sectors, or the engine told us not to remap. + */ + R_IO(bbr_id->source, bh); + return; + } + + /* This request has at least one remapped sector. */ + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ); + if (!bbr_io_buf) { + /* Can't get memory to track the I/O. */ + bh->b_end_io(bh, 0); + return; + } + + evms_cs_volume_request_in_progress(bbr_io_buf->bh->b_rdev, +1, NULL); + bbr_schedule_io(bbr_io_buf); +} + +/** + * bbr_write_callback + * + * This is the callback for normal write requests. Check for an error + * during the I/O, and send to the thread for processing if necessary. + */ +static void bbr_write_callback(struct buffer_head * bh, + int uptodate) +{ + struct bbr_io_buffer * bbr_io_buf = bh->b_private; + + bh->b_end_io = bbr_io_buf->org_end_io; + bh->b_private = bbr_io_buf->org_private; + bh->b_rsector = bbr_io_buf->org_rsector; + bh->b_rdev = bbr_io_buf->org_dev; + + if (!(bbr_io_buf->bbr_id->flag & BBR_STOP_REMAP) && + !uptodate) { + LOG_ERROR("Object %s: Write failure on sector ("PFU64"). Scheduling for retry.\n", + bbr_io_buf->bbr_id->node->name, (u64)bbr_io_buf->bh->b_rsector); + bbr_schedule_io(bbr_io_buf); + } else { + free_bbr_io_buf(bbr_io_buf); + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL); + bh->b_end_io(bh, uptodate); + } +} + +/** + * bbr_write + * + * If there are any remapped sectors on this object, send the request over + * to the thread for processing. Otherwise, register for callback + * notification, and send the request down normally. + */ +static void bbr_write(struct evms_logical_node * bbr_node, + struct buffer_head * bh) +{ + struct bbr_private * bbr_id = bbr_node->private; + struct bbr_io_buffer * bbr_io_buf; + + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + bbr_node->total_vsectors || + bbr_id->flag & EVMS_VOLUME_READ_ONLY ) { + /* Request is off the end of the object, or this + * is a read-only object. + */ + bh->b_end_io(bh, 0); + return; + } + + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE); + if (!bbr_io_buf) { + /* Can't get memory to track the I/O. */ + bh->b_end_io(bh, 0); + return; + } + + evms_cs_volume_request_in_progress(bh->b_rdev, +1, NULL); + + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + bbr_id->flag & BBR_STOP_REMAP || + ! bbr_remap_probe(bbr_id, bh->b_rsector, + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) { + /* No existing remaps, this request contains no remapped + * sectors, or the engine said to stop remapping. + */ + bbr_io_buf->org_end_io = bh->b_end_io; + bbr_io_buf->org_private = bh->b_private; + bbr_io_buf->org_rsector = bh->b_rsector; + bbr_io_buf->org_dev = bh->b_rdev; + bh->b_end_io = bbr_write_callback; + bh->b_private = bbr_io_buf; + W_IO(bbr_id->source, bh); + } else { + /* This request contains at least one remapped sector. */ + bbr_schedule_io(bbr_io_buf); + } +} + +/** + * bbr_init_io_schedule_io + * @bbr_id: Private data for the BBR node. + * @rw: READ or WRITE. + * @lsn: Starting sector for the request. + * @count: Number of sectors in the request. + * @buffer: Data buffer for the request. + * + * During init_io, failures must still be handled by the I/O thread. Create + * a bbr_io_buf, and schedule it to be handled by the thread. Then wait until + * the request is complete. 
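+ * The buffer_head built on the stack is only a carrier for the
+ * sector, count and data buffer; its b_end_io is left NULL so that
+ * bbr_io_handler() completes the request through the bbr_io_buf's
+ * completion instead of a callback.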
+ */ +static int bbr_init_io_schedule_io(struct bbr_private * bbr_id, + int rw, + u64 lsn, + u64 count, + void * buffer) +{ + struct bbr_io_buffer * bbr_io_buf; + struct buffer_head bh; + struct completion complete; + int rc = 0; + + if ( rw != WRITE ) { + /* Nothing can be done about read failures. */ + return -EIO; + } + + LOG_ERROR("Object %s: init_io write failure (sector "PFU64": count "PFU64"). Scheduling for retry.\n", + bbr_id->node->name, lsn, count); + bbr_io_buf = allocate_bbr_io_buf(bbr_id, &bh, rw); + if (!bbr_io_buf) { + return -ENOMEM; + } + + memset(&bh, 0, sizeof(struct buffer_head)); + init_waitqueue_head(&bh.b_wait); + bh.b_rsector = lsn; + bh.b_size = count << EVMS_VSECTOR_SIZE_SHIFT; + bh.b_data = buffer; + bh.b_end_io = NULL; + + /* Schedule the I/O and wait for it to finish. */ + bbr_io_buf->complete = &complete; + init_completion(bbr_io_buf->complete); + bbr_schedule_io(bbr_io_buf); + wait_for_completion(bbr_io_buf->complete); + + rc = bbr_io_buf->rc; + free_bbr_io_buf(bbr_io_buf); + + return rc; +} + +/** + * bbr_init_io + * @bbr_node: BBR node. + * @rw: READ or WRITE. + * @lsn: Starting sector for I/O request. + * @count: Number of sectors in the I/O request. + * @buffer: Data buffer for the I/O request. + * + * Synchronous I/O requests. + */ +static int bbr_init_io(struct evms_logical_node * bbr_node, + int rw, + u64 start_lsn, + u64 count, + void * buffer ) +{ + struct bbr_private * bbr_id = bbr_node->private; + u64 lsn; + int rc = 0; + + if ( start_lsn + count > bbr_node->total_vsectors ) { + /* Request is off the end of the object. */ + return -EINVAL; + } + + if ( rw == WRITE && (bbr_id->flag & EVMS_VOLUME_READ_ONLY) ) { + /* Can't write to a read-only object. */ + return -EINVAL; + } + + if ( bbr_id->flag & BBR_STOP_REMAP || + atomic_read(&bbr_id->in_use_replacement_blks) == 0 || + ! bbr_remap_probe(bbr_id, start_lsn, count) ) { + /* Normal case (no existing remaps). */ + rc = INIT_IO(bbr_id->source, rw, start_lsn, count, buffer); + if (rc && ! (bbr_id->flag & BBR_STOP_REMAP) ) { + /* Init_io error. Send request over to + * thread for further processing. + */ + rc = bbr_init_io_schedule_io(bbr_id, rw, start_lsn, + count, buffer); + } + } else { + /* At least one sector in this request needs to be remapped. + * Test and send each one down individually. + */ + for ( lsn = start_lsn; + lsn < start_lsn + count; + lsn++, buffer += EVMS_VSECTOR_SIZE ) { + bbr_remap(bbr_id, &lsn); + rc = INIT_IO(bbr_id->source, rw, lsn, 1, buffer); + if (rc) { + /* Init_io error. Send request + * to thread for processing. + */ + rc = bbr_init_io_schedule_io(bbr_id, rw, + lsn, 1, buffer); + if (rc) { + break; + } + } + } + } + + return rc; +} + +/** + * bbr_direct_ioctl_sector_io + * + * Process an I/O from the engine on an active BBR object. 
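+ * The transfer goes through a one-sector kernel bounce buffer:
+ * copy_from_user() before each WRITE and copy_to_user() after each
+ * READ, one EVMS_VSECTOR_SIZE sector per iteration.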
+ */ +static int bbr_direct_ioctl_sector_io(struct bbr_private * bbr_id, + struct evms_notify_bbr * notify) +{ + char * buffer, * user_buffer; + u64 lsn; + int rc = 0; + + buffer = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO); + if (!buffer) { + return -ENOMEM; + } + + user_buffer = (char*)notify->buffer; + + for ( lsn = 0; + lsn < notify->nr_sect; + lsn++, user_buffer += EVMS_VSECTOR_SIZE ) { + if ( notify->rw == WRITE ) { + if ( copy_from_user(buffer, user_buffer, + EVMS_VSECTOR_SIZE) ) { + rc = -EFAULT; + break; + } + } + + rc = bbr_init_io(bbr_id->node, notify->rw, + notify->start_sect + lsn, 1, buffer); + if (rc) { + break; + } + + if ( notify->rw == READ ) { + if ( copy_to_user(user_buffer, buffer, + EVMS_VSECTOR_SIZE) ) { + rc = -EFAULT; + break; + } + } + } + + kfree(buffer); + return rc; +} + +/** + * bbr_direct_ioctl + * @inode: N/A + * @file: N/A + * @cmd: N/A + * @arg: Pointer to an evms_plugin_ioctl_pkt. + * + * BBR-specific ioctls from the engine. Currently handles: + * BBR_STOP_REMAP_CMD + * BBR_GET_INFO_CMD + * BBR_SECTOR_IO_CMD + */ +static int bbr_direct_ioctl(struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + int rc = 0; + struct bbr_private * bbr_id; + struct evms_plugin_ioctl_pkt pkt, * user_pkt; + struct evms_notify_bbr notify, * user_notify; + + MOD_INC_USE_COUNT; + + user_pkt = (struct evms_plugin_ioctl_pkt *)arg; + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) { + MOD_DEC_USE_COUNT; + return -EFAULT; + } + + if ( pkt.feature_id != plugin_header.id ) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + user_notify = (struct evms_notify_bbr *)pkt.feature_ioctl_data; + if ( copy_from_user(¬ify, user_notify, sizeof(notify)) ) { + rc = -EFAULT; + } else { + bbr_id = bbr_find_private(notify.object_name); + if (!bbr_id) { + rc = -ENODEV; + } else { + + switch(pkt.feature_command) { + + case BBR_STOP_REMAP_CMD: + bbr_id->flag |= BBR_STOP_REMAP; + /* Fall through. */ + + case BBR_GET_INFO_CMD: + notify.count = atomic_read(&bbr_id->in_use_replacement_blks); + if ( copy_to_user(&user_notify->count, + ¬ify.count, + sizeof(user_notify->count))) { + rc = -EFAULT; + } + break; + + case BBR_SECTOR_IO_CMD: + rc = bbr_direct_ioctl_sector_io(bbr_id, + ¬ify); + break; + + default: + rc = -ENOSYS; + } + } + } + + pkt.status = rc; + copy_to_user(user_pkt, &pkt, sizeof(pkt)); + MOD_DEC_USE_COUNT; + return rc; +} + +/** + * bbr_ioctl + * @bbr_node: BBR node. + * @inode: N/A + * @file: N/A + * @cmd: ioctl command to process. + * @arg: ioctl-specific data pointer. + * + * IOCTL handler. Currently BBR handles plugin-specific ioctls, as well as + * EVMS_GET_BMAP. All others are passed to the child node. 
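+ * For EVMS_GET_BMAP the requested rsector is translated through the
+ * remap tree in place before the ioctl is passed on to the child.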
+ */ +static int bbr_ioctl (struct evms_logical_node * bbr_node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + struct bbr_private * bbr_id = bbr_node->private; + struct evms_get_bmap_pkt * bmap; + int rc = 0; + + switch (cmd) { + case EVMS_PLUGIN_IOCTL: + rc = bbr_direct_ioctl(inode, file, cmd, arg); + break; + + case EVMS_GET_BMAP: + bmap = (struct evms_get_bmap_pkt *)arg; + bbr_remap(bbr_id, &bmap->rsector); + /* fall thru */ + + default: + rc = IOCTL(bbr_id->source, inode, file, cmd, arg); + } + return rc; +} + +static int __init bbr_init(void) +{ + return evms_cs_register_plugin(&plugin_header); +} + +static void __exit bbr_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(bbr_init); +module_exit(bbr_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + diff -Naur linux-2002-09-30/drivers/evms/evms_drivelink.c evms-2002-09-30/drivers/evms/evms_drivelink.c --- linux-2002-09-30/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/evms_drivelink.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,1274 @@ +/* -*- linux-c -*- + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * linux/drivers/evms/drvlink.c + + * + * EVMS Drive Linking Feature. + * + * This feature provides the ability to link multiple storage objects + * together as a single virtual storage object. 
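+ * The child objects are chained end to end, in the order recorded in
+ * the on-disk ordering table, to form one larger object.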
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG_PREFIX "drivelink: " + +/* prototypes for mandatory plugin interface functions */ +static int drivelink_discover(struct evms_logical_node **); +static int drivelink_delete(struct evms_logical_node *); +static void drivelink_read(struct evms_logical_node *, struct buffer_head *); +static void drivelink_write(struct evms_logical_node *, struct buffer_head *); +static int drivelink_ioctl(struct evms_logical_node *, + struct inode *, + struct file *, unsigned int, unsigned long); +static int drivelink_init_io(struct evms_logical_node *, + int, u64, u64, void *); + +/* plugin function table definition */ +static struct evms_plugin_fops fops = { + .discover = drivelink_discover, + .delete = drivelink_delete, + .read = drivelink_read, + .write = drivelink_write, + .init_io = drivelink_init_io, + .ioctl = drivelink_ioctl +}; + +/* plugin header definition */ +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_FEATURE, + EVMS_DRIVELINK_FEATURE_ID), + .version = { + .major = 2, + .minor = 0, + .patchlevel = 1 + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &fops +}; + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Discover function & Support routines */ +/********************************************************/ + +/** + * le_feature_data_to_cpu: + * @md: drivelink metadata + * + * convert feature data from on-disk (Little Endian) format + * to the native cpu endian format. +**/ +static void +le_feature_data_to_cpu(struct evms_drivelink_metadata *md) +{ + int i; + + md->signature = le32_to_cpup(&md->signature); + md->crc = le32_to_cpup(&md->crc); + md->version.major = le32_to_cpup(&md->version.major); + md->version.minor = le32_to_cpup(&md->version.minor); + md->version.patchlevel = le32_to_cpup(&md->version.patchlevel); + md->flags = le32_to_cpup(&md->flags); + md->sequence_number = le64_to_cpup(&md->sequence_number); + md->child_serial_number = le64_to_cpup(&md->child_serial_number); + md->parent_serial_number = le64_to_cpup(&md->parent_serial_number); + md->child_count = le64_to_cpup(&md->child_count); + for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) { + struct evms_dl_ordering_table_entry *child_entry; + + child_entry = &md->ordering_table[i]; + child_entry->child_serial_number = + le64_to_cpup(&child_entry->child_serial_number); + child_entry->child_vsize = + le64_to_cpup(&child_entry->child_vsize); + } +} + +/** + * load_feature_data: load a feature header from disk + * @node: storage object + * @md: ptr to drivelink metadata + * + * loads and verifies redundant copies of drivelink metadata. @md is modified + * and returned to the caller. + * + * Return value: 0 on success + * Otherwise error code +**/ +static int +load_feature_data(struct evms_logical_node *node, + struct evms_drivelink_metadata **md) +{ + int i, rc = 0, rc_array[2] = { 0, 0 }, size_in_bytes; + u64 real_metadata_size, feature_data_size; + u64 starting_sector; + struct evms_drivelink_metadata *cur_md, *md1, *md2 = NULL; + char *location_name; + + /* verify the feature metadata size from the */ + /* feature header agrees with the real size */ + /* of the current metadata structure. 
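+ * (both sizes are counted in 512-byte vsectors; a copy whose
+ * recorded size differs from the computed size is treated as
+ * invalid)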
*/ + real_metadata_size = evms_cs_size_in_vsectors(sizeof (**md)); + + /* allocate a buffer large enough to hold all */ + /* sectors containing the feature's metadata */ + size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE; + md1 = kmalloc(size_in_bytes, GFP_KERNEL); + if (md1) { + md2 = kmalloc(size_in_bytes, GFP_KERNEL); + if (!md2) { + kfree(md1); + rc = -ENOMEM; + } + } else { + rc = -ENOMEM; + } + if (!rc) { + for (i = 0; i < 2; i++) { + if (i == 0) { + starting_sector = + node->feature_header-> + feature_data1_start_lsn; + feature_data_size = + node->feature_header->feature_data1_size; + cur_md = md1; + location_name = evms_primary_string; + } else { + starting_sector = + node->feature_header-> + feature_data2_start_lsn; + feature_data_size = + node->feature_header->feature_data2_size; + cur_md = md2; + location_name = evms_secondary_string; + } + /* check that real metadata size matches the */ + /* feature data size */ + if (real_metadata_size != feature_data_size) { + LOG_ERROR + ("%s feature data size("PFU64" bytes) doesn't match expected size("PFU64" bytes).\n", + location_name, + feature_data_size << + EVMS_VSECTOR_SIZE_SHIFT, + real_metadata_size << + EVMS_VSECTOR_SIZE_SHIFT); + rc = -EINVAL; + rc_array[i] = rc; + continue; + } + /* load the node's feature data */ + rc = INIT_IO(node, + 0, + starting_sector, + feature_data_size, cur_md); + if (rc) { + LOG_ERROR + ("error(%d) probing for %s feature data at sector("PFU64") on '%s'.\n", + rc, location_name, starting_sector, + node->name); + rc_array[i] = rc; + continue; + } + /* check for valid metadata signature */ + if (le32_to_cpup(&cur_md->signature) != + EVMS_DRIVELINK_SIGNATURE) { + rc = -ENODATA; + LOG_SERIOUS + ("error(%d) invalid signature in %s feature data on '%s'\n", + rc, location_name, node->name); + rc_array[i] = rc; + continue; + } + /* validate feature data CRC */ + if (cur_md->crc != EVMS_MAGIC_CRC) { + int org_crc, final_crc; + org_crc = le32_to_cpup(&cur_md->crc); + cur_md->crc = 0; + final_crc = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, + cur_md, + sizeof (*cur_md)); + if (final_crc != org_crc) { + LOG_ERROR + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n", + org_crc, final_crc, location_name, + node->name); + rc = -EINVAL; + rc_array[i] = rc; + continue; + } + } else { + LOG_WARNING + ("CRC disabled in %s feature data on '%s'.\n", + location_name, node->name); + } + /* convert feature data from on-disk + * format (Little Endian) to native + * cpu endian format. + */ + le_feature_data_to_cpu(cur_md); + /* check for valid structure version */ + rc = evms_cs_check_version(&metadata_ver, + &cur_md->version); + if (rc) { + LOG_SERIOUS + ("error(%d) obsolete version detected: actual(%d,%d,%d), requires(%d,%d,%d) in %s feature data on '%s'\n", + rc, cur_md->version.major, + cur_md->version.minor, + cur_md->version.patchlevel, + DRIVELINK_METADATA_MAJOR, + DRIVELINK_METADATA_MINOR, + DRIVELINK_METADATA_PATCHLEVEL, + location_name, node->name); + rc_array[i] = rc; + } + } + /* getting same return code for both copies? */ + if (rc_array[0] == rc_array[1]) { + rc = rc_array[0]; + /* if no errors on both copies, + * check the sequence numbers. + * use the highest sequence number. 
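+			 * e.g. if the primary copy carries sequence
+			 * number 7 and the secondary carries 6, the
+			 * primary copy is the one used.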
+ */ + if (!rc) { + /* compare sequence numbers */ + if (md1->sequence_number == + md2->sequence_number) { + cur_md = md1; + } else { + LOG_WARNING + ("sequence number mismatches between front("PFU64") and rear("PFU64") feature data copies on node(%s)!\n", + md2->sequence_number, + md1->sequence_number, node->name); + if (md1->sequence_number > + md2->sequence_number) + cur_md = md1; + else + cur_md = md2; + LOG_WARNING + ("using %s feature data copy!\n", + (cur_md == + md1) ? evms_primary_string : + evms_secondary_string); + } + } + /* getting different return codes for each copy */ + } else if (rc_array[0] == 0) { + /* use 1st (rear) copy if its good */ + rc = 0; + cur_md = md1; + } else if (rc_array[1] == 0) { + /* use 2nd (front) copy if its good */ + rc = 0; + cur_md = md2; + } else if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) { + /* fail if either give a fatal error */ + rc = -EINVAL; + cur_md = NULL; + } + + /* deallocate metadata buffers appropriately */ + if (rc || (cur_md == md1)) + kfree(md2); + if (rc || (cur_md == md2)) + kfree(md1); + + /* save validated feature header pointer */ + if (!rc) + *md = cur_md; + } + return (rc); +} + +/** + * find_parent_node_for_child_node: finds or creates a parent node for this child node + * @child_node: input, child node + * @md: input, on-disk metadata + * @parent_node: output, parent node + * @dl_private: output, runtime metadata + * @discover_list: input/output, list of objects being discovered + * + * finds or creates a parent node for the specified child node. if the parent node is + * created, create and initialize the parent's private data area. + * + * Return value: 0 on success + * Otherwise error code. +**/ +static int +find_parent_node_for_child_node(struct evms_logical_node *child_node, + struct evms_drivelink_metadata *md, + struct evms_logical_node **parent_node, + struct runtime_data **dl_private, + struct evms_logical_node **discover_list) +{ + int rc = 0, parent_found = FALSE; + struct evms_logical_node *parent = NULL; + struct runtime_data *rd = NULL; + + /* find the parent node for this child */ + for (parent = *discover_list; parent; parent = parent->next) { + /* only parent nodes will have null feature headers */ + if (!parent->feature_header) { + rd = (struct runtime_data *) parent->private; + if (rd->parent_sn == md->parent_serial_number) { + parent_found = TRUE; + break; + } + } + } + /* if no parent node found, create it */ + if (parent_found == FALSE) { + rc = evms_cs_allocate_logical_node(&parent); + if (!rc) { + /* transpose info from child to parent */ + parent->flags |= child_node->flags; + strcpy(parent->name, + child_node->feature_header->object_name); + /* copy evms system data to parent */ + parent->volume_info = child_node->volume_info; + /* initialize the plugin id field */ + parent->plugin = &plugin_header; + /* allocate parent's instance data */ + parent->private = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!parent->private) + rc = -ENOMEM; + } + if (!rc) { + /* initialize some instance data fields */ + rd = (struct runtime_data *) parent->private; + rd->block_size = 0; + rd->parent_sn = md->parent_serial_number; + rd->child_count = md->child_count; + /* allocate the child table */ + rd->child_table = kmalloc(sizeof(struct runtime_entry) * + rd->child_count, GFP_KERNEL); + if (!rd->child_table) + rc = -ENOMEM; + } + if (!rc) { + memset(rd->child_table, 0, + sizeof(struct runtime_entry) * rd->child_count); + /* add the parent node to the discover list */ + rc = 
evms_cs_add_logical_node_to_list(discover_list, + parent); + MOD_INC_USE_COUNT; + } + /* if any errors encountered, try to clean up */ + if (rc) { + LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n", + rc, child_node->name); + if (parent) { + DELETE(parent); + parent = NULL; + rd = NULL; + } + } + } + + *dl_private = rd; + *parent_node = parent; + + return (rc); +} + +/** + * compute_child_index: compute the index for a specific child node + * @node: the child node + * @md: the drivelink on-disk metadata + * + * compute and return and 0-based index value of this child node's position + * in the parent node's ordering table. + * + * Return value: -1 on error + * otherwise the index of the specified child. +**/ +static int +compute_child_index(struct evms_logical_node *node, + struct evms_drivelink_metadata *md) +{ + int i, position = -1; + + for (i = 0; i < md->child_count; i++) { + if (md->ordering_table[i].child_serial_number == + md->child_serial_number) { + position = i; + break; + } + } + if (position == -1) { + LOG_SERIOUS("%s: child not found from '%s'\n", + __FUNCTION__, node->name); + } + return (position); +} + +/** + * process_child_nodes: perform the discovery operation on each child node + * @discover_list: the list of potential child objects + * + * search the discovery list of drivelink child nodes. for each node found, + * perform the discovery operation on it. + * + * Return value: 0 on success + * otherwise error code +**/ +static int +process_child_nodes(struct evms_logical_node **discover_list) +{ + int rc = 0, index = -1; + struct evms_logical_node *node, *next_node, *parent; + struct evms_drivelink_metadata *md; + struct runtime_data *rd; + struct runtime_entry *child_entry = NULL; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + if ((!node->feature_header) || + (node->feature_header->feature_id != plugin_header.id)) { + continue; + } + + rc = evms_cs_remove_logical_node_from_list(discover_list, node); + if (rc) + BUG(); + /* we need to load the feature data to */ + /* find the parent's serial number this */ + /* child node belongs to. */ + md = NULL; + rc = load_feature_data(node, &md); + if (!rc) { + /* find the parent node for this child */ + parent = NULL; + rc = find_parent_node_for_child_node(node, md, + &parent, &rd, + discover_list); + } + if (!rc) { + /* determine position of child in drive link object */ + index = compute_child_index(node, md); + if (index == -1) + rc = index; + } + if (!rc) { + /* check for multiple child index requests */ + child_entry = + (struct runtime_entry *) &rd->child_table[index]; + /* check to see if this child index is + * already in use. + */ + if (child_entry->child_node) { + LOG_SERIOUS + ("attempt to put '%s' in child index(%d). 
Already occupied by '%s'.\n", + node->name, index, + child_entry->child_node->name); + rc = -1; + } + } + if (!rc) { + /* fill in child info in parent */ + + /* check the sector size for this node */ + if (node->hardsector_size > parent->hardsector_size) + parent->hardsector_size = node->hardsector_size; + /* check the block size for this node */ + if (node->block_size > parent->block_size) + parent->block_size = node->block_size; + /* set the child node */ + child_entry->child_node = node; + /* set the metadata for this node */ + child_entry->child_metadata = md; + } + + /* on error, clean up accordingly */ + if (rc) { + if (md) + kfree(md); + LOG_SERIOUS("%s: rc(%d) from '%s'\n", + __FUNCTION__, rc, node->name); + LOG_SERIOUS("deleting child node '%s'.\n", node->name); + rc = DELETE(node); + if (rc) { + LOG_SERIOUS + ("error(%d) attempting to delete '%s'.\n", + rc, node->name); + } + } + } + + /* errors are handled internal to this function */ + /* by deleting the failed node. This will get */ + /* picked up by finalize_parent_nodes as a */ + /* missing child node */ + return (0); +} + +#define TEST_CHILD_PRESENCE 0 +#define TEST_CHILD_COUNT 1 +#define TEST_CHILD_PARENTS_SERIAL_NUM 2 +#define TEST_CHILD_POSITION 3 +#define TEST_CHILD_METADATA 4 + +/** + * test_parent_node: verify that a parent is complete + * @node: specified parent node + * + * verify that the parent node has all of its child nodes accounted for. + * + * Return value: 0 on success + * otherwise error code +**/ +static int +test_parent_node(struct evms_logical_node *node) +{ + int i, rc = 0; + struct runtime_data *rd; + struct runtime_entry *child_entry; + + rd = (struct runtime_data *) node->private; + for (i = 0; i < rd->child_count; i++) { + child_entry = (struct runtime_entry *) &rd->child_table[i]; + + /* insure each child entry is filled */ + if (!child_entry->child_node) { + node->flags |= + EVMS_VOLUME_SET_READ_ONLY | EVMS_VOLUME_PARTIAL; + LOG_ERROR("%s: missing child(%d).\n", __FUNCTION__, i); + } else + /* insure child count is the same */ + /* in each child's metadata */ + if (child_entry->child_metadata->child_count != rd->child_count) { + rc = -EVMS_FEATURE_FATAL_ERROR; + LOG_ERROR("%s: child count wrong for node '%s'\n", + __FUNCTION__, node->name); + } else + /* insure parent serial number is */ + /* the same in each child's metadata */ + if (child_entry->child_metadata->parent_serial_number != + rd->parent_sn) { + rc = -EVMS_FEATURE_FATAL_ERROR; + LOG_ERROR + ("%s: incorrect [is("PFU64"), should be("PFU64")] child serial number for node '%s'\n", + __FUNCTION__, + child_entry->child_metadata->parent_serial_number, + rd->parent_sn, node->name); + } else + /* insure each is in the correct entry */ + if (child_entry->child_metadata->ordering_table[i]. + child_serial_number != + child_entry->child_metadata->child_serial_number) { + rc = -EVMS_FEATURE_FATAL_ERROR; + LOG_ERROR + ("%s: child reports different index for node '%s'\n", + __FUNCTION__, node->name); + } else { + struct runtime_entry *other_child_entry; + int j, rc2; + /* compare the children's metadata */ + + /* look for another present child to + * compare against. + */ + other_child_entry = NULL; + for (j = 0; j < rd->child_count; j++) { + /* skip comparing to ourselves */ + if (j == i) { + continue; + } + /* is this child is present? 
*/ + if (rd->child_table[j].child_node) { + /* yes, use it */ + other_child_entry = &rd->child_table[j]; + break; + } + } + /* if we can't find another valid + * child node's metadata to compare + * against, just skip this test. + */ + if (!other_child_entry) { + continue; + } + rc2 = + memcmp(other_child_entry->child_metadata-> + ordering_table, + child_entry->child_metadata->ordering_table, + sizeof (child_entry->child_metadata-> + ordering_table)); + if (rc2) { + rc = -EVMS_FEATURE_FATAL_ERROR; + LOG_ERROR + ("%s: mismatching child metadata for nodes '%s' and '%s'\n", + __FUNCTION__, + rd->child_table[i - 1].child_node->name, + child_entry->child_node->name); + } + } + /* stop if fatal error encountered */ + if (rc == -EVMS_FEATURE_FATAL_ERROR) { + break; + } + } + return (rc); +} + +/** + * perform_final_adjustments: do final tweaks to parent node + * @node: parent node + * + * This function does the following: + * sets the vsize (in vsectors) field in each child node + * sets the voffset (in vsectors) field in each child node + * frees each child node's metadata + * sets the parent's total size field +**/ +static void +perform_final_adjustments(struct evms_logical_node *node) +{ + int i; + struct runtime_data *rd; + struct runtime_entry *child_entry = NULL; + struct evms_drivelink_metadata *ref_data = NULL; + + rd = (struct runtime_data *) node->private; + /* find a valid copy of the ordering table. + * since all the ordering tables are the same + * we can just pick one to use for all the + * child computations. + */ + for (i = 0; i < rd->child_count; i++) { + child_entry = (struct runtime_entry *) &rd->child_table[i]; + if (child_entry->child_node) { + ref_data = child_entry->child_metadata; + break; + } + } + /* if we got this far, there should + * always be at least one valid child. + */ + if (!ref_data) + BUG(); + /* compute the parent's usable size, + * and construct the table used to + * remap parent I/Os to child I/Os */ + for (i = 0; i < rd->child_count; i++) { + child_entry = (struct runtime_entry *) &rd->child_table[i]; + /* set the LBA count for this child node */ + child_entry->vsize = ref_data->ordering_table[i].child_vsize; + /* set the start LBA value for this child node */ + child_entry->voffset = node->total_vsectors; + /* keep a running total of size in sectors */ + node->total_vsectors += child_entry->vsize; + /* free the metadata for this child node */ + if (ref_data != child_entry->child_metadata) { + kfree(child_entry->child_metadata); + } + child_entry->child_metadata = NULL; + /* free the feature header for this child node */ + if (child_entry->child_node) { + kfree(child_entry->child_node->feature_header); + child_entry->child_node->feature_header = NULL; + } + } + /* free the reference data */ + kfree(ref_data); +} + +/** + * finalize_parent_nodes: verify and prepare parent nodes + * @discover_list: list of potential drivelink parent objects + * + * verify the completeness of each parent node. if not complete, purge the in-memory + * structs for this object and all its children. If complete, perform final tweaks + * to allow this node to useable. 
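+ * (test_parent_node() performs the verification; perform_final_adjustments()
+ * then builds each child's voffset/vsize remap entry and the parent's
+ * total size.)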
+ * + * Return value: 0 on success + * otherwise error code +**/ +static int +finalize_parent_nodes(struct evms_logical_node **discover_list) +{ + int rc = 0, rc2; + struct evms_logical_node *node, *next_node; + + for (node = *discover_list; node; node = next_node) { + next_node = node->next; + /* only check parent nodes */ + if (!node->feature_header) { + /* valid the children of this parent */ + rc = test_parent_node(node); + if (!rc) { + /* compute parent size and + * child remap table. + */ + perform_final_adjustments(node); + } else { + /* fatal error encountered. + * cleanup from this node and + * delete it from memory. + */ + evms_cs_remove_logical_node_from_list + (discover_list, node); + rc2 = DELETE(node); + if (rc2) { + LOG_SERIOUS + ("error(%d) attempting to delete '%s'.\n", + rc2, node->name); + } + } + } + } + return (rc); +} + +/** + * drivelink_discover: discover drivelinked storage objects + * @discover_list: the list of objects to inspect + * + * perform the drivelink discover process on the objects in the discovery list + * + * Return value: 0 on success + * otherwise error code +**/ +static int +drivelink_discover(struct evms_logical_node **discover_list) +{ + int rc = 0; + + MOD_INC_USE_COUNT; + rc = process_child_nodes(discover_list); + if (!rc) + rc = finalize_parent_nodes(discover_list); + + MOD_DEC_USE_COUNT; + return (rc); +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Delete function */ +/********************************************************/ + +/** + * drivelink_delete: purges a drivelink object and its children from memory + * @node: the drivelink object to delete + * + * purge the drivelink object, its private data, and all its children from memory. + * + * Return value: 0 on success + * otherwise error code +**/ +static int +drivelink_delete(struct evms_logical_node *node) +{ + int i, rc = 0; + struct runtime_data *rd; + struct runtime_entry *child_entry; + + LOG_DETAILS("deleting '%s'.\n", node->name); + + rd = (struct runtime_data *) node->private; + if (rd) { + for (i = 0; i < rd->child_count; i++) { + child_entry = &rd->child_table[i]; + /* delete the child node */ + if (child_entry->child_node) { + rc = DELETE(child_entry->child_node); + if (rc) + break; + child_entry->child_node = NULL; + } + /* delete the child's metadata */ + if (child_entry->child_metadata) { + kfree(child_entry->child_metadata); + child_entry->child_metadata = NULL; + } + } + if (!rc) { + /* delete the child table */ + if (rd->child_table) { + kfree(rd->child_table); + rd->child_table = NULL; + } + /* delete the instance data */ + kfree(rd); + node->private = NULL; + } + } + if (!rc) { + evms_cs_deallocate_logical_node(node); + MOD_DEC_USE_COUNT; + } + + return (rc); +} + +/** + * which_child: find the child node targetted by a IO to this drivelink object + * @parent: parent drivelink object + * @rsector: relative sector on the parent object + * @max_io_sects: largest IO size on the child, starting from rsector position + * + * This function find the child node a parent rsector maps to. + * It then adjusts the rsector value to be child relative and + * optionally computes the max # of sectors that can be access + * from this starting point on the child. + * + * Return value: + * The child node, the child relative rsector and max io size are + * returned to the caller. On error, the returned child node will + * be NULL. 
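+ *
+ * Example (illustrative sizes): with two children of 100 and 200
+ * sectors, an incoming rsector of 150 falls in the second child and
+ * is returned with *rsector adjusted to 50 and, when requested,
+ * *max_io_sects set to 150 (200 - 50).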
+**/ +static struct evms_logical_node * +which_child(struct evms_logical_node *parent, + u64 * rsector, u64 * max_io_sects) +{ + int i; + struct evms_logical_node *child = NULL; + struct runtime_data *rd; + struct runtime_entry *child_entry = NULL; + + rd = (struct runtime_data *) parent->private; + for (i = 0; i < rd->child_count; i++) { + child_entry = (struct runtime_entry *) &rd->child_table[i]; + + if (*rsector >= child_entry->vsize) { + *rsector -= child_entry->vsize; + } else { + /* get the child node */ + child = child_entry->child_node; + /* compute the sector count if requested */ + if (max_io_sects) + /* this is only used for INIT I/O + * to return the largest sector + * count size for this child based + * on first sector in the I/O. + */ + *max_io_sects = child_entry->vsize - *rsector; + break; + } + } + return (child); +} + +/** + * drivelink_io_error: log an IO error for drivelink + * @node: drivelink object + * @bh: buffer head targetting this object + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. +**/ +static void +drivelink_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh) +{ + LOG_SERIOUS("%s error on '%s' remapping rsector("PFU64").\n", + (io_flag) ? "WRITE" : "READ", + node->name, (u64) bh->b_rsector); + + bh->b_end_io(bh, 0); +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Read function & Support routines */ +/********************************************************/ + +/** + * drivelink_read: handles IO read operations to drivelink objects + * @node: drivelink object + * @bh: buffer head targetting this object + * + * handles IO read operations to the drivelink objects. internally remaps the + * drivelink relative requests to the child relative requests and then routes + * it to the child for further processing. +**/ +static void +drivelink_read(struct evms_logical_node *node, struct buffer_head *bh) +{ + struct evms_logical_node *child; + u64 io_size, rsector; + + rsector = bh->b_rsector; + child = which_child(node, &rsector, &io_size); + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) { + bh->b_rsector = rsector; + R_IO(child, bh); + } else { + drivelink_io_error(node, READ, bh); + } +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Write function & Support routines */ +/********************************************************/ + +/** + * drivelink_read_write: handles IO write operations to drivelink objects + * @node: drivelink object + * @bh: buffer head targetting this object + * + * handles IO write operations to the drivelink objects. internally remaps the + * drivelink relative requests to the child relative requests and then routes + * it to the child for further processing. 
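+ *
+ * As with the read path, a request is only issued if it fits entirely
+ * within one child; a request that would span a child boundary is
+ * failed through drivelink_io_error() rather than split (only the
+ * init_io path splits requests).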
+**/ +static void +drivelink_write(struct evms_logical_node *node, struct buffer_head *bh) +{ + struct evms_logical_node *child; + u64 io_size, rsector; + + rsector = bh->b_rsector; + child = which_child(node, &rsector, &io_size); + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) { + bh->b_rsector = rsector; + W_IO(child, bh); + } else { + drivelink_io_error(node, WRITE, bh); + } +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Init I/O function */ +/********************************************************/ + +/** + * drivelink_init_io: performs synchronous IO to drivelink objects + * @node: drivelink object + * @io_flag: read/write flag + * @sect_nr: starting sector, object relative (512 byte units) + * @num_sects: count of sectors + * @buf_addr: buffer address to read from/write to + * + * This function must determine which child or children a + * specified I/O request must be passed to. Also if, when, + * and how a request must be broken up. + * + * Return value: 0 on success + * otherwise error code +**/ +static int +drivelink_init_io(struct evms_logical_node *node, int io_flag, + u64 sect_nr, + u64 num_sects, + void *buf_addr) +{ + int rc = 0; + + if (!node) + rc = -EINVAL; + else { + u64 starting_sector, remaining_sectors; + void *io_buf; + struct runtime_data *rd; + + if ((sect_nr + num_sects) > node->total_vsectors) { + LOG_SERIOUS + ("attempted out of bound("PFU64") %s on '%s' at sector("PFU64"), count("PFU64").\n", + node->total_vsectors, (io_flag) ? "WRITE" : "READ", + node->name, sect_nr, num_sects); + rc = -EINVAL; + } else { + rd = (struct runtime_data *) node->private; + /* make working copies of input parameters */ + starting_sector = sect_nr; + remaining_sectors = num_sects; + io_buf = buf_addr; + /* loop until all I/O is performed */ + while (remaining_sectors) { + u64 io_start, io_size; + struct evms_logical_node *child; + + /* compute the child relative io_start + * and max io_size. + */ + io_start = starting_sector; + child = which_child(node, &io_start, &io_size); + /* adjust io_size based on + * original remaining sectors + * in this io. + */ + if (io_size > remaining_sectors) + io_size = remaining_sectors; + if (child) { + rc = INIT_IO(child, + io_flag, + io_start, io_size, io_buf); + } else { + /* if partial volume, return 0's + * for missing children. + */ + if (io_flag == READ) { + memset(io_buf, 0, + io_size << + EVMS_VSECTOR_SIZE_SHIFT); + } + } + if (!rc) { + /* adjust working copies */ + starting_sector += io_size; + remaining_sectors -= io_size; + io_buf += io_size << + EVMS_VSECTOR_SIZE_SHIFT; + } else + break; + } + } + } + + return (rc); +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* IOCTL function & Support routines */ +/********************************************************/ + +/** + * drivelink_ioctl_cmd_plugin_ioctl: drivelink support for the 'plugin ioctl' command + * @node: drivelink object + * @inode: VFS supplied parameter + * @file: VFS supplied parameter + * @cmd: the specific ioctl command + * @arg: the specific ioctl arguments + * + * this function handles 'plugin ioctl' commands. currently there is no specific + * commands for this plugin. however, this plugin must broadcast some commands so + * lower layers can receive them. 
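+ * the packet's feature_id selects the routing: if it matches this
+ * plugin's id the command is handled locally (the switch below is
+ * currently empty), otherwise the ioctl is reissued unchanged to each
+ * child node, stopping at the first failure.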
+ * + * Return value: 0 on success + * otherwise error code +**/ +static int +drivelink_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node, + struct inode *inode, struct file *file, + unsigned long cmd, unsigned long arg) +{ + int i, rc = 0; + struct runtime_data *rd; + struct evms_plugin_ioctl_pkt tmp, *user_parms; + + user_parms = (struct evms_plugin_ioctl_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + rd = (struct runtime_data *) node->private; + /* is this cmd targetted at this feature ? */ + if (tmp.feature_id == node->plugin->id) { + switch (tmp.feature_command) { + default: + break; + } + } else { /* broadcast this cmd to all children */ + for (i = 0; i < rd->child_count; i++) { + struct evms_logical_node *child_node; + + child_node = rd->child_table[i].child_node; + if (child_node) { + rc = IOCTL(child_node, inode, file, + cmd, arg); + if (rc) + break; + } + } + } + /* copy info to userspace */ + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +/** + * drivelink_ioctl_cmd_broadcast: broadcast ioctls to your children + * @node: drivelink object + * @inode: VFS supplied parameter + * @file: VFS supplied parameter + * @cmd: the specific ioctl command + * @arg: the specific ioctl arguments + * + * broadcast the specified ioctl command and arguments to all this objects + * children. OR (logical opeation) the return values from all the children + * and return the OR'd value to the caller. + * + * Return value: 0 on success + * otherwise error code +**/ +static int +drivelink_ioctl_cmd_broadcast(struct evms_logical_node *node, + struct inode *inode, struct file *file, + unsigned long cmd, unsigned long arg) +{ + int i, rc = 0; + struct runtime_data *rd; + + rd = (struct runtime_data *) node->private; + /* broadcast this cmd to all children */ + for (i = 0; i < rd->child_count; i++) { + struct evms_logical_node *child_node; + + child_node = rd->child_table[i].child_node; + if (child_node) { + rc |= IOCTL(child_node, inode, file, cmd, arg); + } + } + return (rc); +} + +/** + * drivelink_ioctl: main ioctl entry point and handler + * @node: drivelink object + * @inode: VFS supplied parameter + * @file: VFS supplied parameter + * @cmd: a specific ioctl command + * @arg: a specific ioctl argument + * + * handles specific ioctl command internally and routes other ioctls commands to + * the appropriate entry points. 
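+ * HDIO_GETGEO, for example, reports a synthesized 255-head, 63-sector
+ * geometry, so a 2097152-sector (1 GiB) object reports
+ * 2097152 / 255 / 63 = 130 cylinders.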
+ * + * Returns: 0 on success + * otherwise error code + **/ +static int +drivelink_ioctl(struct evms_logical_node *node, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct runtime_data *rd = NULL; + struct hd_geometry hdgeo; + + if ((!node) || (!inode)) + rc = -EINVAL; + + if (!rc) { + rd = (struct runtime_data *) node->private; + switch (cmd) { + case HDIO_GETGEO: + hdgeo.heads = 255; + hdgeo.sectors = 63; + hdgeo.cylinders = + ((unsigned int) node->total_vsectors) / + hdgeo.heads / hdgeo.sectors; + hdgeo.start = 0; + if (copy_to_user((int *) arg, &hdgeo, sizeof (hdgeo))) + rc = -EFAULT; + break; + case EVMS_QUIESCE_VOLUME: + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_OPEN_VOLUME: + case EVMS_CLOSE_VOLUME: + case EVMS_CHECK_DEVICE_STATUS: + rc = drivelink_ioctl_cmd_broadcast(node, inode, file, + cmd, arg); + break; + case EVMS_PLUGIN_IOCTL: + rc = drivelink_ioctl_cmd_plugin_ioctl(node, inode, file, + cmd, arg); + break; + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap; + u64 io_start, io_size; + struct evms_logical_node *child; + + bmap = (struct evms_get_bmap_pkt *) arg; + io_start = bmap->rsector; + child = which_child(node, &io_start, &io_size); + if (child) { + if (node->block_size != + child->block_size) { + bmap->status = -EPERM; + } else { + bmap->rsector = io_start; + rc = IOCTL(child, + inode, + file, cmd, arg); + } + } + } + break; + default: + rc = -EINVAL; + break; + } + } + return (rc); +} + +/********************************************************/ +/* Required Module Entry Point: */ +/* drivelink_init */ +/********************************************************/ + +/** + * drivelink_init: register this module for use within the EVMS framework + * + * Return value: 0 on success + * otherwise error code. +**/ +int __init +drivelink_init(void) +{ + return evms_cs_register_plugin(&plugin_header); +} + +/** + * drivelink_exit: unregister this module from use within the EVMS framework + * + * Return value: 0 on success + * otherwise error code. +**/ +void __exit +drivelink_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(drivelink_init); +module_exit(drivelink_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/evms_ecr.c evms-2002-09-30/drivers/evms/evms_ecr.c --- linux-2002-09-30/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/evms_ecr.c Fri Aug 16 16:19:56 2002 @@ -0,0 +1,213 @@ +/* -*- linux-c -*- */ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* linux/driver/evms/evms_ecr.c + * + * EVMS - Cluster enablement (ECR) module + * + */ + + +#include +#include +#include +#include +#include +#include + +#define LOG_PREFIX "ecr: " + + +/* + * ecr_group_join + */ +ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table, + ecr_cred_t * cred, size_t size, ecr_instance_t *instance) +{ + /* dummy */ + return ECR_FAIL; +} + + + + +/* + * ecr_group_leave + */ +void ecr_group_leave(ecr_group_t group) +{ + /* dummy */ + return; +} + + + +/* + * ecr_group_send + */ +int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message, + size_t size, ecr_instance_t *instance, + void callback(int ret, ecr_instance_t *instance)) +{ + /* dummy */ + return ECR_FAIL; +} + + + +/* + * ecr_group_send_wait + */ +int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message, + size_t size, int *ret) +{ + /* dummy */ + *ret = ECR_FAIL; + return ECR_FAIL; +} + + + +/* + * ecr_group_broadcast + */ +int ecr_group_broadcast(ecr_group_t group, void *message, size_t size, + ecr_instance_t *instance, + void callback(u_char ret, ecr_instance_t *instance)) +{ + /* dummy */ + return ECR_FAIL; +} + + + +/* + * ecr_group_broadcast_wait + */ +int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size, + u_char *ret) +{ + /* dummy */ + *ret = ECR_FAIL; + return ECR_FAIL; +} + + + +/* + * ecr_group_atomic_execute + */ +int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size, + ecr_instance_t *instance, + void callback(ecr_instance_t *instance)) +{ + /* dummy */ + return ECR_FAIL; +} + + + +/* + * ecr_group_atomic_execute_wait + */ +int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size) +{ + /* dummy */ + return ECR_FAIL; +} + + + +/* + * ecr_group_success_response + */ +void ecr_group_success_response(ecr_message_t *handle) +{ + /* dummy */ + return; +} + + + + +/* + * ecr_group_failure_response + */ +void ecr_group_failure_response(ecr_message_t *handle, int ret) +{ + /* dummy */ + return; +} + + + +/* + * ecr_lock_create + */ +ecr_lock_t ecr_lock_create(char *lockname) +{ + /* dummy */ + return ECR_FAIL; +} + +/* + * ecr_lock + */ +int ecr_lock(ecr_lock_t lock, u64 start, u64 length, + ecr_lock_mode_t mode, u_char flag) +{ + /* dummy */ + return ECR_FAIL; +} + + + +/* + * ecr_unlock + */ +int ecr_unlock(ecr_lock_t lock, u64 start, u64 length) +{ + /* dummy */ + return ECR_FAIL; +} + + +/********************************************************/ +/* Required Module Entry Point: */ +/* ecr_init() */ +/********************************************************/ + +static int __init ecr_init(void) +{ + /* dummy */ + return 0; +} + +static void __exit ecr_exit(void) +{ + return; +} + +module_init(ecr_init); +module_exit(ecr_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + diff -Naur linux-2002-09-30/drivers/evms/evms_passthru.c evms-2002-09-30/drivers/evms/evms_passthru.c --- linux-2002-09-30/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/evms_passthru.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,298 @@ +/* -*- linux-c -*- */ + +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * linux/drivers/evms/evms_passthru.c + * + * EVMS System Data Manager + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define EVMS_PASSTHRU_ID 0 +#define LOG_PREFIX "passthru: " + +static int passthru_mgr_discover(struct evms_logical_node **); +static int passthru_mgr_delete(struct evms_logical_node *); +static void passthru_mgr_read(struct evms_logical_node *, struct buffer_head *); +static void passthru_mgr_write(struct evms_logical_node *, struct buffer_head *); +static int passthru_mgr_ioctl(struct evms_logical_node *, + struct inode *, + struct file *, unsigned int, unsigned long); +static int passthru_mgr_init_io(struct evms_logical_node *, + int, u64, u64, void *); + +static struct evms_plugin_fops fops = { + .discover = passthru_mgr_discover, + .delete = passthru_mgr_delete, + .read = passthru_mgr_read, + .write = passthru_mgr_write, + .init_io = passthru_mgr_init_io, + .ioctl = passthru_mgr_ioctl +}; + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_FEATURE, + EVMS_PASSTHRU_ID), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1 + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &fops +}; + +/*******************************/ +/* discovery support functions */ +/*******************************/ + +static int +process_passthru_data(struct evms_logical_node **pp) +{ + int rc, size_in_sectors; + struct evms_logical_node *node, *new_node; + + node = *pp; + + size_in_sectors = + evms_cs_size_in_vsectors(sizeof (struct evms_feature_header)); + + /* allocate "parent" node */ + rc = evms_cs_allocate_logical_node(&new_node); + if (!rc) { + /* initialize "parent" node */ + new_node->private = node; + new_node->flags = node->flags; + new_node->plugin = &plugin_header; + new_node->system_id = node->system_id; + new_node->block_size = node->block_size; + new_node->hardsector_size = node->hardsector_size; + new_node->total_vsectors = node->total_vsectors; + new_node->total_vsectors -= + (size_in_sectors << 1) + + node->feature_header->alignment_padding; + new_node->volume_info = node->volume_info; + strcpy(new_node->name, node->name); + if (strlen(node->feature_header->object_name)) + strcat(new_node->name, + node->feature_header->object_name); + else + strcat(new_node->name, "_Passthru"); + + /* return "parent" node to caller */ + *pp = new_node; + + MOD_INC_USE_COUNT; + + LOG_DETAILS("feature header found on '%s', created '%s'.\n", + node->name, new_node->name); + /* we're done with the passthru feature headers + * so lets delete them now. 
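+	 * (both on-disk copies of the header, plus any alignment
+	 * padding, were already excluded from the new node's
+	 * total_vsectors above.)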
+ */ + kfree(node->feature_header); + node->feature_header = NULL; + } else { + /* on any fatal error, delete the node */ + int rc2 = DELETE(node); + if (rc2) { + LOG_DEFAULT + ("error(%d) attempting to delete node(%p,%s).\n", + rc2, node, node->name); + } + } + return (rc); +} + +/********** Required Plugin Functions **********/ + +/* + * Function: passthru_mgr_discover + * + */ +static int +passthru_mgr_discover(struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *node, *tmp_list_head; + + MOD_INC_USE_COUNT; + tmp_list_head = *discover_list; + *discover_list = NULL; + + while (tmp_list_head) { + node = tmp_list_head; + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head, + node); + if (!rc) + rc = process_passthru_data(&node); + if (!rc) + if (node) + rc = evms_cs_add_logical_node_to_list + (discover_list, node); + } + MOD_DEC_USE_COUNT; + return (rc); +} + +/* + * Function: passthru_mgr_delete + * + */ +static int +passthru_mgr_delete(struct evms_logical_node *node) +{ + int rc; + struct evms_logical_node *p; + + LOG_DETAILS("deleting '%s'.\n", node->name); + + p = node->private; + rc = DELETE(p); + if (!rc) { + evms_cs_deallocate_logical_node(node); + MOD_DEC_USE_COUNT; + } + return (rc); +} + +/* + * function: passthru_io_error + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. + * + */ +static void +passthru_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh) +{ + LOG_SERIOUS + ("attempt to %s beyond boundary("PFU64") on (%s), rsector("PFU64").\n", + (io_flag) ? 
"WRITE" : "READ", node->total_vsectors - 1, + node->name, (u64) bh->b_rsector); + + bh->b_end_io(bh, 0); +} + +/* + * Function: passthru_mgr_read + */ +static void +passthru_mgr_read(struct evms_logical_node *node, struct buffer_head *bh) +{ + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + node->total_vsectors) { + R_IO(((struct evms_logical_node *) (node->private)), bh); + } else + passthru_io_error(node, READ, bh); +} + +/* + * Function: passthru_mgr_write + * + */ +static void +passthru_mgr_write(struct evms_logical_node *node, struct buffer_head *bh) +{ + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + node->total_vsectors) { + W_IO(((struct evms_logical_node *) (node->private)), bh); + } else + passthru_io_error(node, WRITE, bh); +} + +/* + * Function: passthru_mgr_ioctl + * + */ +static int +passthru_mgr_ioctl(struct evms_logical_node *node, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc; + + if ((!node) || (!inode)) + rc = -EINVAL; + else + rc = IOCTL(((struct evms_logical_node *) (node->private)), + inode, file, cmd, arg); + return (rc); +} + +static int +passthru_mgr_init_io(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + int rc; + if ((sect_nr + num_sects) <= node->total_vsectors) { + rc = INIT_IO(((struct evms_logical_node *) (node-> + private)), + io_flag, sect_nr, num_sects, buf_addr); + } else + rc = -EINVAL; + return (rc); +} + +/* + * Function: passthru_init + * + */ +int __init +evms_passthru_manager_init(void) +{ + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +void __exit +evms_passthru_manager_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(evms_passthru_manager_init); +module_exit(evms_passthru_manager_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/gpt_part.c evms-2002-09-30/drivers/evms/gpt_part.c --- linux-2002-09-30/drivers/evms/gpt_part.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/gpt_part.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,1018 @@ +/* -*- linux-c -*- */ +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ + +/* linux/driver/evms/gpt_part.c + * + * EVMS - EFI GPT segment manager plugin + * + * This plugin provides support for the GUID Partition Table format specified + * by the Extensible Firmware Interface documentation ... 
version 1.02 + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* prefix used in logging messages */ +#define LOG_PREFIX "gpt_part: " + +/** + * struct gpt_private - Private data structure for this plugin + * @source_object: object this IO will get remapped to + * @start_sect: source object relative starting address in 512 byte units + * @nr_sect: partition size in 512 bytes units + * @type: partition type or filesystem format indicator + * + * private copy of the just the fields we require to remap IO requests + * to the underlying object. + **/ +struct gpt_private { + struct evms_logical_node *source_disk; + u64 start_sect; + u64 nr_sects; + unsigned char type; +}; + +#define GPT_DISKMAGIC 0x5452415020494645 // "EFI PART" +#define GPT_PNAME_SIZE 36 // max unicode partition name size + +/** + * struct guid - GUID structure + * @time_low: timestamp - low order 32 bits + * @time_mid: timestamp - mid 16 bits + * @time_high: timestamp - high 16 bits + * @clock_seq_high: clock - high order 8 bits + * @clock_seq_low: clock - low order 8 bits + * @node: spatial reference - unique id (ie. mac address of nic) + * + * GUID structure + **/ +struct guid { + u32 time_low; + u16 time_mid; + u16 time_high; + u8 clock_seq_high; + u8 clock_seq_low; + u8 node[6]; +}; + +/** + * struct gpt_partition - GPT partition record definition + * @type: partition type + * @part_id: partition record id + * @start: address of 1st block of partition + * @end: address of last block of partition + * @attributes: bit field reserved by EFI spec + * @name: unicode name of partition + * + * GPT partition record definition + **/ +struct gpt_partition { + struct guid type; + struct guid part_id; + u64 start; + u64 end; + u64 attributes; + u16 name[GPT_PNAME_SIZE]; +}; + +/** + * struct gpt_header - GPT header + * @signature: EFI compatible header signature + * @version: spec revision number + * @size: size (bytes) of gpt header + * @crc: crc of gpt header + * @reserve: reserved by spec ... must be zero + * @my_lba: lba of gpt header + * @alternate_lba: lba of 2nd copy of gpt header + * @start_useable: lba of 1st block of useable area on disk + * @end_useable: lba of last block of useable area on disk + * @disk_id: GUID - identifies this disk + * @ptable_lba: lba of partition table + * @ptable_count: number of entries in the partition table + * @ptable_entry_size: size of partition table entry + * @ptable_crc: crc of partition table + * + * GPT header + **/ +struct gpt_header { + u64 signature; + u32 version; + u32 size; + u32 crc; + u32 reserve; + u64 my_lba; + u64 alternate_lba; + u64 start_useable; + u64 end_useable; + struct guid disk_id; + u64 ptable_lba; + u32 ptable_count; + u32 ptable_entry_size; + u32 ptable_crc; +}; + +struct guid EFI_SYSTEM_PARTITION = { + 0xC12A7328, + 0xF81F, + 0x11D2, + 0xBA, + 0x4B, + {0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B} +}; + +struct guid BASIC_DATA_PARTITION = { + 0xEBD0A0A2, + 0xB9E5, + 0x4433, + 0x87, + 0xC0, + {0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7} +}; + +struct guid LEGACY_MBR_PARTITION = { + 0x024DEE41, + 0x33E7, + 0x11D3, + 0x9D, + 0x69, + {0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F} +}; + +struct guid GPT_SWAP_PARTITION = { + 0x0657FD6D, + 0xA4AB, + 0x43C4, + 0x84, + 0xE5, + {0x09, 0x33, 0xC8, 0x4B, 0x4F, 0x4F} +}; + +struct guid UNUSED_GPT_PARTITION = { + 0, 0, 0, 0, 0, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +static int exported_nodes; /* total # of exported segments + * produced during this discovery. 
+ */ + +/* Prototypes */ +static int partition_discover(struct evms_logical_node **); +static int partition_delete(struct evms_logical_node *); +static void partition_read(struct evms_logical_node *, struct buffer_head *); +static void partition_write(struct evms_logical_node *, struct buffer_head *); +static int partition_ioctl(struct evms_logical_node *, + struct inode *, + struct file *, unsigned int, unsigned long); +static int partition_init_io(struct evms_logical_node *, + int, u64, u64, void *); + +static struct evms_plugin_fops fops = { + .discover = partition_discover, + .delete = partition_delete, + .read = partition_read, + .write = partition_write, + .init_io = partition_init_io, + .ioctl = partition_ioctl +}; + +#define EVMS_GPT_PARTITION_MANAGER_ID 3 + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_SEGMENT_MANAGER, + EVMS_GPT_PARTITION_MANAGER_ID), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1 + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &fops +}; + +/***************************************************/ +/* List Support - Typedefs, Variables, & Functions */ +/***************************************************/ + +/* Typedefs */ + +struct segment_list_node { + struct evms_logical_node *segment; + struct segment_list_node *next; +}; + +struct disk_list_node { + struct evms_logical_node *disk; + struct segment_list_node *segment_list; + struct disk_list_node *next; +}; + +/* Variables */ + +static struct disk_list_node *my_disk_list; + +/* Functions */ + +/* + * Function: Convert a GPT header from disk format to the arch specific + * format. + */ +static void +disk_gpt_header_to_cpu(struct gpt_header *gh) +{ + gh->signature = le64_to_cpu(gh->signature); + gh->version = le32_to_cpu(gh->version); + gh->size = le32_to_cpu(gh->size); + gh->crc = le32_to_cpu(gh->crc); + gh->reserve = le32_to_cpu(gh->reserve); + gh->my_lba = le64_to_cpu(gh->my_lba); + gh->alternate_lba = le64_to_cpu(gh->alternate_lba); + gh->start_useable = le64_to_cpu(gh->start_useable); + gh->end_useable = le64_to_cpu(gh->end_useable); + gh->disk_id.time_low = le32_to_cpu(gh->disk_id.time_low); + gh->disk_id.time_mid = le16_to_cpu(gh->disk_id.time_mid); + gh->disk_id.time_high = le16_to_cpu(gh->disk_id.time_high); + gh->ptable_lba = le64_to_cpu(gh->ptable_lba); + gh->ptable_count = le32_to_cpu(gh->ptable_count); + gh->ptable_entry_size = le32_to_cpu(gh->ptable_entry_size); + gh->ptable_crc = le32_to_cpu(gh->ptable_crc); +} + +static int +matching_guids(struct guid *g1, struct guid *g2) +{ + if ((le32_to_cpu(g1->time_low) == g2->time_low) && + (le16_to_cpu(g1->time_mid) == g2->time_mid) && + (le16_to_cpu(g1->time_high) == g2->time_high) && + (g1->clock_seq_high == g2->clock_seq_high) && + (g1->clock_seq_low == g2->clock_seq_low)) { + return 1; + } + return 0; +} +static inline int +isa_basic_data_gpt_partition_record(struct gpt_partition *p) +{ + return (matching_guids(&p->type, &BASIC_DATA_PARTITION)); +} +static inline int +isa_legacy_mbr_gpt_partition_record(struct gpt_partition *p) +{ + return (matching_guids(&p->type, &LEGACY_MBR_PARTITION)); +} +static inline int +isa_esp_gpt_partition_record(struct gpt_partition *p) +{ + return (matching_guids(&p->type, &EFI_SYSTEM_PARTITION)); +} +static inline int +isa_gpt_swap_partition_record(struct gpt_partition *p) +{ + return (matching_guids(&p->type, &GPT_SWAP_PARTITION)); +} +static inline int +isa_unused_gpt_partition_record(struct gpt_partition *p) +{ + 
return (matching_guids(&p->type, &UNUSED_GPT_PARTITION)); +} + +static struct disk_list_node ** +lookup_disk(struct evms_logical_node *disk) +{ + struct disk_list_node **ldln; + + ldln = &my_disk_list; + while (*ldln) { + if ((*ldln)->disk == disk) + break; + ldln = &(*ldln)->next; + } + return (ldln); +} + +static struct segment_list_node ** +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment) +{ + struct segment_list_node **lsln; + + lsln = &disk->segment_list; + while (*lsln) { + if ((*lsln)->segment == segment) + break; + lsln = &(*lsln)->next; + } + return (lsln); +} + +static struct evms_logical_node * +find_segment_on_disk(struct evms_logical_node *disk, + u64 start_sect, u64 nr_sects) +{ + struct evms_logical_node *rc = NULL; + struct disk_list_node **ldln; + struct segment_list_node **lsln; + struct gpt_private *gpt_prv; + + ldln = lookup_disk(disk); + if (*ldln) { + /* disk found in list */ + /* attempt to find segment */ + + lsln = &(*ldln)->segment_list; + while (*lsln) { + gpt_prv = (*lsln)->segment->private; + if (gpt_prv->start_sect == start_sect) + if (gpt_prv->nr_sects == nr_sects) + break; + lsln = &(*lsln)->next; + } + if (*lsln) + rc = (*lsln)->segment; + } + return (rc); +} + +/* function description: add_segment_to_disk + * + * this function attempts to add a segment to the segment + * list of a disk. if the specified disk is not found, it + * will be added to the global disk list. this function will + * return a pointer to the matching segment in the disk's + * segment list. the caller must compare the returned pointer + * to the specified segment to see if the + * specified segment was already present in the disk's segment + * list. if the return pointer matches the specified segment, + * then the specified segment was added to the list. if the + * return segment pointer to does not match the specified + * segment pointer, then the specified segment pointer was + * a duplicate and can be thrown away. 
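+ * (as implemented below, the result is returned as an int instead:
+ * 0 when the segment was added, -1 when an equivalent segment was
+ * already present, or -ENOMEM on allocation failure.)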
+ */ +static int +add_segment_to_disk(struct evms_logical_node *disk, + struct evms_logical_node *segment) +{ + int rc = 0; + struct disk_list_node **ldln, *new_disk; + struct segment_list_node **lsln, *new_segment; + + ldln = lookup_disk(disk); + if (*ldln == NULL) { + /* disk not in list, add disk */ + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL); + if (new_disk) { + memset(new_disk, 0, sizeof (*new_disk)); + new_disk->disk = disk; + *ldln = new_disk; + } else { + rc = -ENOMEM; + } + } + if (!rc) { + /* attempt to add segment */ + lsln = lookup_segment(*ldln, segment); + if (*lsln == NULL) { + /* segment not in list, add segment */ + new_segment = + kmalloc(sizeof (*new_segment), GFP_KERNEL); + if (new_segment) { + memset(new_segment, 0, sizeof (*new_segment)); + new_segment->segment = segment; + *lsln = new_segment; + } else { + rc = -ENOMEM; + } + } else + rc = -1; + } + return (rc); +} + +static int +remove_segment_from_disk(struct evms_logical_node *disk, + struct evms_logical_node *segment, + struct evms_logical_node **empty_disk) +{ + int rc = 0; + struct disk_list_node **ldln, *tmp_disk_node; + struct segment_list_node **lsln, *tmp_segment_node; + + *empty_disk = NULL; + ldln = lookup_disk(disk); + if (*ldln == NULL) { + rc = -1; + } else { + /* disk found in list */ + /* attempt to add segment */ + lsln = lookup_segment(*ldln, segment); + if (*lsln == NULL) { + rc = -2; + } else { + tmp_segment_node = *lsln; + /* remove segment from list */ + *lsln = (*lsln)->next; + /* free the segment list node */ + kfree(tmp_segment_node); + + if ((*ldln)->segment_list == NULL) { + tmp_disk_node = *ldln; + *empty_disk = tmp_disk_node->disk; + /* remove disk from list */ + *ldln = (*ldln)->next; + /* free the disk list node */ + kfree(tmp_disk_node); + } + } + } + return (rc); +} + +/* + * Function: add_segment + */ +static int +process_segment(struct evms_logical_node **discover_list, + struct evms_logical_node *node, + u64 start_sect, + u64 nr_sects, + int type, int part_num, int evms_top_segment) +{ + struct gpt_private *gpt_prv = NULL; + struct evms_logical_node *segment; + int rc = 0; + + segment = find_segment_on_disk(node, start_sect, nr_sects); + if (segment) { + LOG_DETAILS("exporting segment '%s'.\n", segment->name); + } else { + gpt_prv = kmalloc(sizeof (*gpt_prv), GFP_KERNEL); + if (gpt_prv) { + gpt_prv->source_disk = node; + gpt_prv->start_sect = start_sect; + gpt_prv->nr_sects = nr_sects; + gpt_prv->type = type; + rc = evms_cs_allocate_logical_node(&segment); + } else { + rc = -ENOMEM; + } + if (!rc) { + segment->plugin = &plugin_header; + segment->system_id = (unsigned int) type; + segment->total_vsectors = nr_sects; + segment->block_size = node->block_size; + segment->hardsector_size = node->hardsector_size; + segment->private = gpt_prv; + segment->flags = node->flags; + if (evms_top_segment) + segment->iflags |= EVMS_TOP_SEGMENT; + strcpy(segment->name, node->name); + if (GetPluginType(node->plugin->id) == + EVMS_SEGMENT_MANAGER) { + strcat(segment->name, "."); + } + sprintf(segment->name + strlen(segment->name), "%d", + part_num); + LOG_DETAILS("creating segment '%s'.\n", segment->name); + rc = add_segment_to_disk(node, segment); + if (rc) { + LOG_ERROR + ("%s: error(%d) adding segment '%s'!\n", + __FUNCTION__, rc, segment->name); + rc = 0; + } else { + MOD_INC_USE_COUNT; + } + } + if (rc) { + if (gpt_prv) + kfree(gpt_prv); + if (segment) + evms_cs_deallocate_logical_node(segment); + } + } + if (!rc) { + evms_cs_add_logical_node_to_list(discover_list, segment); + 
exported_nodes++; + } + return rc; +} + +void +print_mem(void *buffer, int length) +{ + int i, done; + unsigned char *bufptr; + + bufptr = (unsigned char *) buffer; + i = done = 0; + while (!done) { + if ((i % 16) == 0) + printk(KERN_INFO "\n0x%p->", buffer + i); + printk(KERN_INFO "%02x ", bufptr[i]); + if (++i >= length) + done++; + } + printk(KERN_INFO "\n"); +} + +/* + * Function: get GPT Partition Table - reads partition table + * into memory and performs crc check. + * + */ +static struct gpt_partition * +get_gpt_partition_table(struct evms_logical_node *node, struct gpt_header *gh) +{ + int rc; + struct gpt_partition *pt; + u32 sector_count, calculated_crc; + + sector_count = + evms_cs_size_in_vsectors(gh->ptable_count * gh->ptable_entry_size); + + pt = kmalloc(sector_count * EVMS_VSECTOR_SIZE, GFP_KERNEL); + if (pt) { + + rc = INIT_IO(node, 0, gh->ptable_lba, sector_count, pt); + if (!rc) { + + calculated_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, + pt, + gh-> + ptable_count * + gh-> + ptable_entry_size); + + if (~calculated_crc != gh->ptable_crc) { + rc = -ENODATA; + } + + } + } else { + rc = -ENOMEM; + } + + if (rc) { + if (pt) + kfree(pt); + pt = NULL; + } + + return (pt); +} + +/* + * Function: Validate GPT Header - runs basic checks to + * sanity check a gpt header. + * + */ +static int +isa_valid_gpt_header(struct evms_logical_node *node, u64 lsn, + struct gpt_header *gh) +{ + u32 crc; + u32 calculated_crc; + u64 sector_count; + + /* signature */ + if (le64_to_cpu(gh->signature) != GPT_DISKMAGIC) + return 0; + + /* crc */ + crc = le32_to_cpu(gh->crc); + gh->crc = 0; + calculated_crc = + ~(evms_cs_calculate_crc(EVMS_INITIAL_CRC, gh, le32_to_cpu(gh->size))); + gh->crc = cpu_to_le32(crc); + + if (calculated_crc != crc) + return 0; + + /* spec says lba reported by header must match actual location on disk */ + if (lsn != le64_to_cpu(gh->my_lba)) + return 0; + + /* sanity check partition table info found in header */ + if (gh->ptable_count == 0 || gh->ptable_entry_size == 0) + return 0; + + sector_count = + evms_cs_size_in_vsectors(le64_to_cpu(gh->ptable_count) * + le64_to_cpu(gh->ptable_entry_size)); + + if ((le64_to_cpu(gh->ptable_lba) + sector_count - 1) >= + node->total_vsectors - 1) + return 0; + + return 1; +} + +/* + * Function: get GPT Partition Table Header + * + */ +static struct gpt_header * +get_gpt_header(struct evms_logical_node *node, u64 lsn) +{ + int rc; + struct gpt_header *gh = NULL; + + gh = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL); + if (gh) { + rc = INIT_IO(node, 0, lsn, 1, gh); + if (!rc) { + if (isa_valid_gpt_header(node, lsn, gh)) { + disk_gpt_header_to_cpu(gh); + } else { + rc = -ENODATA; + } + + } + if (rc) { + kfree(gh); + gh = NULL; + } + } + + return (gh); +} + +/* + * Function: Get GPT Information + * + */ +static int +get_gpt_info(struct evms_logical_node *node, + struct gpt_header **gh, struct gpt_partition **ptable) +{ + struct gpt_header *gh1 = NULL, *gh2 = NULL; + + *gh = NULL; + *ptable = NULL; + + gh1 = get_gpt_header(node, 1); // offset past protective mbr + + if (gh1) { + *gh = gh1; + gh2 = get_gpt_header(node, gh1->alternate_lba); + if (gh2) + kfree(gh2); + else + LOG_WARNING + ("alternate guid partition table header is invalid, using primary copy.\n"); + } else { + gh2 = get_gpt_header(node, node->total_vsectors - 1); + if (gh2) { + *gh = gh2; + LOG_WARNING + ("primary guid partition table header is invalid, using alternate copy\n"); + } else { + LOG_DETAILS("no gpt header discovered on node %s\n", + node->name); + return 0; + } + } + + 
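+	/* at this point *gh is whichever header copy validated (the
+	 * primary at LBA 1, else the alternate at the last LBA); read
+	 * the partition table it references and verify its crc before
+	 * trusting it.
+	 */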
*ptable = get_gpt_partition_table(node, *gh); + if (!*ptable) { + kfree(*gh); + *gh = NULL; + return 0; + } + + return 1; +} + +/* + * Function: Probe for GPT segments on logical node + * + */ +static int +probe_for_segments(struct evms_logical_node **discover_list, + struct evms_logical_node *node) +{ + int rc; + int nextminor = 1; + int evms_top_segment; + u32 i; + u64 pstart,pend; + struct gpt_header *gh = NULL; + struct gpt_partition *ptable = NULL; + struct gpt_partition *part = NULL; + + /* no need to inspect our own nodes */ + if (node->plugin->id == plugin_header.id) + return 0; + + /* nor nodes marked as EVMS_TOP_SEGMENT */ + if (node->iflags & EVMS_TOP_SEGMENT) + return 0; + + /* look for guid partition table & header */ + if (!get_gpt_info(node, &gh, &ptable)) { + if (gh) + kfree(gh); + if (ptable) + kfree(ptable); + return 0; + } + + /* walk the guid partition table, producing segment storage objects */ + for (i = 0, part = ptable; i < gh->ptable_count; i++, part++) { + + if (!isa_unused_gpt_partition_record(part)) { + + pstart = le64_to_cpu(part->start); + pend = le64_to_cpu(part->end); + + LOG_DETAILS + ("gpt partition start="PFU64" end="PFU64"\n", + pstart, (pend - pstart + 1)); + + /* stop other seg mgrs from recursive discovery on a gpt system partition */ + if (isa_esp_gpt_partition_record(part)) + evms_top_segment = 1; + else + evms_top_segment = 0; + + rc = process_segment(discover_list, + node, + pstart, + (pend - pstart + 1), + 0, nextminor, evms_top_segment); + + if (!rc) { + ++nextminor; + } + } + + } + + /* remove node we just consumed */ + evms_cs_remove_logical_node_from_list(discover_list, node); + + kfree(ptable); + kfree(gh); + return 1; +} + +/* + * Function: partition_discover + * + */ +static int +partition_discover(struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *node, *next_node; + + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + + /* initialize global variable */ + exported_nodes = 0; + + /* examine each node on the discover list */ + next_node = *discover_list; + while (next_node) { + node = next_node; + next_node = node->next; + probe_for_segments(discover_list, node); + } + + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n", + __FUNCTION__, exported_nodes, rc); + if (exported_nodes) + rc = exported_nodes; + MOD_DEC_USE_COUNT; + return (rc); +} + +/* + * Function: partition_delete + * + */ +static int +partition_delete(struct evms_logical_node *segment) +{ + int rc = 0; + struct gpt_private *gpt_prv; + struct evms_logical_node *empty_disk = NULL; + + LOG_DETAILS("deleting segment '%s'.\n", segment->name); + + if (!segment) { + rc = -ENODEV; + } else { + gpt_prv = segment->private; + if (gpt_prv) { + /* remove the segment from the + * disk's segment list + */ + rc = remove_segment_from_disk(gpt_prv->source_disk, + segment, &empty_disk); + /* free the local instance data */ + kfree(gpt_prv); + } + /* free the segment node */ + evms_cs_deallocate_logical_node(segment); + MOD_DEC_USE_COUNT; + /* if the last segment on the disk was + * deleted, delete the disk node too + */ + if (empty_disk) + DELETE(empty_disk); + } + return (rc); +} + +/* + * function: partition_io_error + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. 
+ * + */ +static void +partition_io_error(struct evms_logical_node *node, int io_flag, + struct buffer_head *bh) +{ + LOG_SERIOUS + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n", + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name, + bh->b_rsector); + + bh->b_end_io(bh, 0); +} + +/* + * Function: partition_read + * + */ +static void +partition_read(struct evms_logical_node *partition, struct buffer_head *bh) +{ + struct gpt_private *gpt_prv = partition->private; + + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + partition->total_vsectors) { + bh->b_rsector += gpt_prv->start_sect; + R_IO(gpt_prv->source_disk, bh); + } else + partition_io_error(partition, READ, bh); +} + +/* + * Function: partition_write + * + */ +static void +partition_write(struct evms_logical_node *partition, struct buffer_head *bh) +{ + struct gpt_private *gpt_prv = partition->private; + + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <= + partition->total_vsectors) { + bh->b_rsector += gpt_prv->start_sect; + W_IO(gpt_prv->source_disk, bh); + } else + partition_io_error(partition, WRITE, bh); +} + +/* + * Function: partition_init_io + * + */ +static int +partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + int rc; + struct gpt_private *gpt_prv = partition->private; + + if ((sect_nr + num_sects) <= partition->total_vsectors) { + rc = INIT_IO(gpt_prv->source_disk, io_flag, + sect_nr + gpt_prv->start_sect, num_sects, + buf_addr); + } else { + LOG_SERIOUS + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n", + (io_flag) ? "WRITE" : "READ", partition->name, + (gpt_prv->nr_sects - 1), sect_nr, num_sects); + rc = -EINVAL; + } + + return (rc); +} + +/* + * Function: partition_ioctl + * + */ +static int +partition_ioctl(struct evms_logical_node *partition, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + struct gpt_private *gpt_prv; + struct hd_geometry hd_geo; + int rc; + + rc = 0; + gpt_prv = partition->private; + if (!inode) + return -EINVAL; + switch (cmd) { + case HDIO_GETGEO: + { + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg); + if (rc) + break; + if (copy_from_user + (&hd_geo, (void *) arg, + sizeof (struct hd_geometry))) + rc = -EFAULT; + if (rc) + break; + hd_geo.start = gpt_prv->start_sect; + if (copy_to_user + ((void *) arg, &hd_geo, + sizeof (struct hd_geometry))) + rc = -EFAULT; + } + break; + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = + (struct evms_get_bmap_pkt *) arg; + bmap->rsector += gpt_prv->start_sect; + /* intentionally fall thru to + * default ioctl down to device + * manager. 
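+			 *
+			 * Both cases translate partition-relative sectors to
+			 * disk-relative ones by adding gpt_prv->start_sect:
+			 * HDIO_GETGEO asks the underlying disk for geometry
+			 * and overrides only the start field, while
+			 * EVMS_GET_BMAP adjusts rsector and lets the device
+			 * manager resolve the physical device.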
+ */ + } + default: + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg); + } + return rc; +} + +/* + * Function: gpt_module_init + * + */ +static int __init +gpt_module_init(void) +{ + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +/* + * Function: gpt module exit + */ +static void __exit +gpt_module_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(gpt_module_init); +module_exit(gpt_module_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/ldev_mgr.c evms-2002-09-30/drivers/evms/ldev_mgr.c --- linux-2002-09-30/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/ldev_mgr.c Fri Sep 13 16:45:06 2002 @@ -0,0 +1,1500 @@ +/* -*- linux-c -*- */ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* linux/driver/evms/ldev_mgr.c + * + * EVMS - Local Device (Hard Drive) Manager + * + * This plugin walks the gendisk list and creates logical disk structures for each + * local ide or scsi device. 
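+ *
+ * Discovery walks the kernel's gendisk list via walk_gendisk(), creates
+ * one EVMS logical disk node for each unit not already registered, sizes
+ * it with BLKGETSIZE64 (falling back to BLKGETSIZE), and registers it
+ * with the EVMS common services layer.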
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include /* must be included by all block drivers */ +#include +#include +#include +#include "../scsi/scsi.h" +#include "../scsi/sd.h" +#include +#include +#include + +#define LOG_PREFIX "ldev_mgr: " + +#define EVMS_LOCAL_DEVICE_MANAGER_ID 1 + +/** + * struct ldev_private - private data used by this plugin + * @major: major device number + * @minor: minor device number + * @bdev: block_device record for this device + * @gd: gendisk entry for this device + * @media_changed: media changed status field + * + * private data maintained for each device by this plugin + **/ +struct ldev_private { + int major, minor; + struct block_device *bdev; + struct gendisk *gd; + int media_changed; +}; + +/* prototypes for mandatory plugin interface functions */ +static int discover_disks(struct evms_logical_node **); +static int ldev_mgr_delete(struct evms_logical_node *); +static void ldev_mgr_read(struct evms_logical_node *, struct buffer_head *); +static void ldev_mgr_write(struct evms_logical_node *, struct buffer_head *); +static int ldev_mgr_ioctl(struct evms_logical_node *, + struct inode *, + struct file *, unsigned int, unsigned long); +static int ldev_init_io(struct evms_logical_node *, + int, u64, u64, void *); +static int ldev_mgr_direct_ioctl(struct inode *, + struct file *, unsigned int, unsigned long); + +/* plugin function table definition */ +static struct evms_plugin_fops fops = { + .discover = discover_disks, + .delete = ldev_mgr_delete, + .read = ldev_mgr_read, + .write = ldev_mgr_write, + .init_io = ldev_init_io, + .ioctl = ldev_mgr_ioctl, + .direct_ioctl = ldev_mgr_direct_ioctl +}; + +/* plugin header definition */ +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_DEVICE_MANAGER, + EVMS_LOCAL_DEVICE_MANAGER_ID), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1 + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &fops +}; + +#define TYPE_NONE 0 +#define TYPE_GENERIC 1 +#define TYPE_IDE 2 +#define TYPE_SCSI 3 + +#define INDEX_ALPHA 0 +#define INDEX_NUMERIC 1 + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Discover function & Support routines */ +/********************************************************/ + +#define MAX_NAME_BASE_SIZE 10 +#define MAX_NAME_MODIFIER_SIZE 4 +/** + * struct blk_device_info - block device info + * @devnode_name_base: base name (ie. hd or sd) for device + * @null1: guaranteed end-of-string NULL + * @devnode_name_modifier: name suffix (ie. ag for sdag) for device + * @null2: guaranteed end-of-string NULL + * @devnode_name_index: numeric device index (ie. 
1 for hda1) + * @devnode_name_type: indicates numeric or alpha modifier + * @devnode_type: device type, IDE, SCSI, or GENERIC + * + * generic block device naming descriptor structure + **/ +struct blk_device_info { + char devnode_name_base[MAX_NAME_BASE_SIZE]; + char null1; + char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE]; + char null2; + int devnode_name_index; + int devnode_name_type; + int device_type; +}; + +static struct blk_device_info *blk_dev_info = NULL; + +#define BLK_DEV_INFO(a,b,c,d,e) \ + strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \ + blk_dev_info[a].null1 = 0; \ + strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \ + blk_dev_info[a].null2 = 0; \ + blk_dev_info[a].devnode_name_index = 0; \ + blk_dev_info[a].device_type = d; \ + blk_dev_info[a].devnode_name_type = e; + +static void +init_blk_dev_info(struct blk_device_info *blk_dev_info) +{ + BLK_DEV_INFO(IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA); + BLK_DEV_INFO(IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA); + + BLK_DEV_INFO(SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA); + BLK_DEV_INFO(SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA); + + BLK_DEV_INFO(XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(CYCLADES_MAJOR, "double", "0", TYPE_GENERIC, + INDEX_NUMERIC); + + BLK_DEV_INFO(MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC, 
+ INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC, + INDEX_NUMERIC); + + BLK_DEV_INFO(I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA); + BLK_DEV_INFO(MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA); + + BLK_DEV_INFO(96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(108, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC); + + BLK_DEV_INFO(VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC); + BLK_DEV_INFO(LOOP_MAJOR, "loop", "0", TYPE_GENERIC, INDEX_NUMERIC); +} + +static int +is_in_device_list(struct gendisk *gd, int major, int minor) +{ + int found, done, rc; + struct evms_logical_node *device = NULL; + struct ldev_private *ldev_prv; + + done = found = FALSE; + while (done == FALSE) { + rc = evms_cs_find_next_device(device, &device); + if (rc || !device) + done = TRUE; + else { + ldev_prv = device->private; + if (ldev_prv->gd == gd) + if (ldev_prv->major == major) + if (ldev_prv->minor == minor) + done = found = TRUE; + } + } + return (found); +} + +static void +build_devnode_name(char *name_buf, int major) +{ + char buf[11], *modifier, *buf_ptr; + int int_mod, done; + struct blk_device_info *bdi; + + bdi = &blk_dev_info[major]; + + /* convert the base name modifier to an integer */ + modifier = bdi->devnode_name_modifier; + int_mod = 0; + while (*modifier) { + if (bdi->devnode_name_type == INDEX_ALPHA) { + int_mod *= 26; + int_mod += *modifier - 'a'; + } else { + int_mod *= 10; + int_mod += *modifier - '0'; + } + modifier++; + if 
(*modifier) { + int_mod++; + } + } + /* add in device_index_value */ + int_mod += bdi->devnode_name_index; + bdi->devnode_name_index++; + + /* convert integer modifier back to ALPHA/NUMERIC chars */ + memset(buf, 0, sizeof (buf)); + /* fill the buffer from the rear to front with the + * ascii version of the modifier, leaving space for + * NULL terminator at the end. + */ + buf_ptr = &buf[sizeof (buf) - 2]; + done = FALSE; + do { + if (bdi->devnode_name_type == INDEX_ALPHA) { + *buf_ptr = (int_mod % 26) + 'a'; + int_mod /= 26; + } else { + *buf_ptr = (int_mod % 10) + '0'; + int_mod /= 10; + } + if (int_mod) { + int_mod--; + } else { + done = TRUE; + } + buf_ptr--; + } while (!done); + + /* find beginning of modifier in buffer */ + modifier = buf; + while (!*modifier) + modifier++; + + /* build the final device devnode name */ + sprintf(name_buf, "%s%s", bdi->devnode_name_base, modifier); +} + +static int +ldev_mgr_lock_device(struct ldev_private *ldev_prv) +{ + int rc; + struct block_device *bdev; + + bdev = bdget(MKDEV(ldev_prv->major, ldev_prv->minor)); + if (!bdev) + return -ENOMEM; + rc = blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW); + if (rc) + return rc; + ldev_prv->bdev = bdev; + return 0; +} + +static void +ldev_mgr_unlock_device(struct ldev_private *ldev_prv) +{ + struct block_device *bdev = ldev_prv->bdev; + ldev_prv->bdev = NULL; + if (!bdev) { + LOG_ERROR("error: NULL bdev field detected!\n"); + BUG(); + } + blkdev_put(bdev, BDEV_RAW); +} + +#define DEVICE_KNOWN 1234 +#define DEVICE_UNINITIALIZED 1235 +#define DEVICE_MEDIA_NOT_PRESENT 1236 +static int +create_logical_disk(struct evms_logical_node **disk_list, + struct gendisk *gd, int device_index) +{ + int rc = 0, major, minor; + struct evms_logical_node *new_disk = NULL; + struct ldev_private *ldev_prv = NULL; + char device_name[EVMS_VOLUME_NAME_SIZE + 1]; + + major = gd->major; + minor = device_index << gd->minor_shift; + + /* skip uninitialized devices */ + if (!blk_size[major]) + rc = DEVICE_UNINITIALIZED; + else if (!blk_size[major][minor]) + rc = DEVICE_UNINITIALIZED; + if (!rc) { + /* construct the devnode name for this device */ + build_devnode_name(device_name, major); + + /* skip devices we already know about */ + if (is_in_device_list(gd, major, minor) == TRUE) + rc = DEVICE_KNOWN; + } + /* allocate the new node */ + if (!rc) { + rc = evms_cs_allocate_logical_node(&new_disk); + } + /* allocate new nodes's instance data */ + if (!rc) { + ldev_prv = kmalloc(sizeof(struct ldev_private), GFP_KERNEL); + if (!ldev_prv) + rc = -ENOMEM; + } + /* initialize the new node */ + if (!rc) { + memset(ldev_prv, 0, sizeof(struct ldev_private)); + new_disk->plugin = &plugin_header; + + /* initialize the instance data */ + new_disk->private = ldev_prv; + ldev_prv->gd = gd; + ldev_prv->major = major; + ldev_prv->minor = minor; + rc = ldev_mgr_lock_device(ldev_prv); + if (rc) { + LOG_ERROR("error(%d): unable to lock device(%d,%d)!\n", + rc, major, minor); + } + } + if (!rc) { + /* determine hardsector size */ + new_disk->hardsector_size = 512; + if (hardsect_size[major]) { + new_disk->hardsector_size = hardsect_size[major][minor]; + } + /* save the block size */ + new_disk->block_size = 1024; + if (blksize_size[major]) { + new_disk->block_size = blksize_size[major][minor]; + } + /* obtain the device size in sectors + * + * try 64bit size first, if that fails + * fall back on the 32bit size. 
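+	 * BLKGETSIZE64 reports the size in bytes (hence the shift down to
+	 * 512-byte sectors below), while the BLKGETSIZE fallback already
+	 * reports a count of 512-byte sectors in an unsigned long, which
+	 * caps it at 2 TB on 32-bit kernels. For illustration, the
+	 * equivalent user-space calls (fd being a hypothetical open
+	 * block device) would be:
+	 *
+	 *   unsigned long long bytes; ioctl(fd, BLKGETSIZE64, &bytes); -- bytes
+	 *   unsigned long sects;      ioctl(fd, BLKGETSIZE, &sects);   -- 512-byte sectors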
+ */ + /* try 64bit size */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18) + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE64, + (ulong) & new_disk->total_vsectors); + if (!rc) { + /* convert bytes to 512 byte sectors */ + new_disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT; + } else +#endif + { + /* try 32bit size */ + ulong dev_size = 0; + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE, + (ulong) & dev_size); + new_disk->total_vsectors = dev_size; + } + if (!rc && !new_disk->total_vsectors) { + rc = -ENOSPC; + } + } + if (!rc) { + /* remember removable devices */ + if (gd->flags) + if (gd->flags[device_index] & GENHD_FL_REMOVABLE) + new_disk->flags |= EVMS_DEVICE_REMOVABLE; + + /* save the devnode name for this device */ + strcpy(new_disk->name, device_name); + + /* register this device with evms */ + evms_cs_register_device(new_disk); + MOD_INC_USE_COUNT; + + /* append this record the linked list */ + evms_cs_add_logical_node_to_list(disk_list, new_disk); + LOG_DETAILS + ("added logical disk(%s) for physical disk(%u,%u,%s), size("PFU64") in 512 byte units\n", + new_disk->name, major, minor, new_disk->name, + new_disk->total_vsectors); + + } + /* reset the "benign" error codes for the caller */ + switch (rc) { + case DEVICE_UNINITIALIZED: + case DEVICE_KNOWN: + case DEVICE_MEDIA_NOT_PRESENT: + rc = 0; + case 0: + break; + default: + LOG_ERROR + ("error(%d): creating logical disk for device(%d,%d).\n", + rc, major, minor); + if (new_disk) { + evms_cs_deallocate_logical_node(new_disk); + } + if (ldev_prv) { + kfree(ldev_prv); + } + break; + } + return (rc); +} + +static int +create_logical_generic_disks(struct evms_logical_node **disk_list, + struct gendisk *gd) +{ + int rc, i; + + /* This is a generic device */ + + rc = 0; + LOG_DEBUG("major name = %s\n", gd->major_name); + LOG_DEBUG("number of real devices = %i\n", gd->nr_real); + for (i = 0; i < gd->nr_real; i++) { + LOG_DEBUG("device %d:\n", i); + rc = create_logical_disk(disk_list, gd, i); + if (rc) + break; + } + return (rc); +} + +static int +create_logical_ide_disks(struct evms_logical_node **disk_list, + struct gendisk *gd) +{ + int rc = 0, i; + ide_hwif_t *ide_hwif; + ide_drive_t *drive; + + /* This is an IDE device */ + LOG_DEBUG("found IDE major : %i - searching for disks\n", gd->major); + + ide_hwif = gd->real_devices; /* IDE internal data */ + for (i = 0; i < MAX_DRIVES; i++) { + drive = &(ide_hwif->drives[i]); + if (drive->present && (drive->media == ide_disk)) { + /* force the name index value on ide drives */ + blk_dev_info[gd->major].devnode_name_index = i; + rc = create_logical_disk(disk_list, gd, i); + } + if (rc) + break; + } + return (rc); +} + +static int +create_logical_scsi_disks(struct evms_logical_node **disk_list, + struct gendisk *gd) +{ + int rc = 0, i; + Scsi_Disk *SDisks; + Scsi_Device *SDev; + + /* This is an SCSI device */ + LOG_DEBUG("found SCSI major : %i - searching for disks\n", gd->major); + LOG_DEBUG("scsi: major name = %s\n", gd->major_name); + LOG_DEBUG("scsi: number of real devices = %i\n", gd->nr_real); + SDisks = gd->real_devices; /* SCSI internal data */ + for (i = 0; i < gd->nr_real; i++) { + SDev = SDisks[i].device; + LOG_DEBUG + ("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n", + SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity); + rc = create_logical_disk(disk_list, gd, i); + if (rc) + break; + } + return (rc); +} + +static int +create_logical_disks(struct gendisk *gd, void *p_disk_list) +{ + int rc = 0; + struct evms_logical_node **disk_list = p_disk_list; + + /* create 
logical disks from all IDE & SCSI devices */ + switch (blk_dev_info[gd->major].device_type) { + case TYPE_IDE: + rc = create_logical_ide_disks(disk_list, gd); + break; + case TYPE_SCSI: + rc = create_logical_scsi_disks(disk_list, gd); + break; + case TYPE_GENERIC: + rc = create_logical_generic_disks(disk_list, gd); + break; + default: + LOG_DEBUG("unrecognized device major : %i\n", gd->major); + break; + } + + return (rc); +} + +static int +discover_disks(struct evms_logical_node **disk_list) +{ + int rc = 0; + + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__); + + if (blk_dev_info == NULL) { + /* allocate space for device info array */ + blk_dev_info = kmalloc(sizeof (struct blk_device_info) + * (MAX_BLKDEV + 1), GFP_KERNEL); + if (blk_dev_info) { + /* initialize device info array */ + memset(blk_dev_info, 0, + sizeof (struct blk_device_info) * (MAX_BLKDEV + 1)); + init_blk_dev_info(blk_dev_info); + } else { + rc = -ENOMEM; + } + } + if (!rc) + /* create logical disks from the raw devices */ + rc = walk_gendisk(create_logical_disks, disk_list); + + /* free blk_dev_info table and null the ptr to it */ + kfree(blk_dev_info); + blk_dev_info = NULL; + + LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__); + MOD_DEC_USE_COUNT; + return (rc); +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Delete function */ +/********************************************************/ + +static int +ldev_mgr_delete(struct evms_logical_node *disk) +{ + struct ldev_private *ldev_prv; + + /* reset any evms volume related info from + * the device node, because we can't predict + * how this node will be used in the future. + */ + + /* removed the feature header if its been used + */ + if (disk->feature_header) { + kfree(disk->feature_header); + disk->feature_header = NULL; + } + /* remove the volume_info structure and flag + * if this has been used directly by an evms + * feature. + */ + evms_cs_deallocate_volume_info(disk); + /* reset the flags field to the appropriate state + */ + disk->flags &= ~EVMS_VOLUME_FLAG; + + /* disk nodes only get deleted when: + * 1) there are no references to the disk node + * in memory. + * 2) the device is removable + * 3) the device reported a media change + * + * All three of these conditions must be true + * before the disk node can be deleted. + * evms_check_for_device_changes should set + * and ensure these conditions before issuing + * deletes. + * + * Newly installed removable media will be + * picked up in this modules discover code. + * + * OR disk nodes can will be deleted if the + * devices they represent go away, for example + * in the case of a hotunplugged device or a + * required driver having been unloaded. + */ + if (disk->flags & (EVMS_MEDIA_CHANGED | EVMS_DEVICE_UNAVAILABLE)) { + LOG_DETAILS("deleting '%s'.\n", disk->name); + + evms_cs_unregister_device(disk); + MOD_DEC_USE_COUNT; + ldev_prv = disk->private; + ldev_mgr_unlock_device(ldev_prv); + if (ldev_prv) { + kfree(ldev_prv); + } + evms_cs_deallocate_logical_node(disk); + } + return 0; +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Read function */ +/********************************************************/ + +/* + * function: ldev_mgr_io_error + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. 
Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. + * + */ +static void +ldev_mgr_io_error(struct evms_logical_node *disk, int io_flag, struct buffer_head *bh, int rc) +{ + if (rc == -EOVERFLOW) { + LOG_SERIOUS + ("attempt to %s beyond boundary("PFU64") on (%s), rsector(%ld).\n", + (io_flag) ? "WRITE" : "READ", disk->total_vsectors - 1, + disk->name, bh->b_rsector); + } else if (rc == -ENXIO) { + LOG_SERIOUS("attempt to access a non-existent device(%s).\n", + disk->name); + } + bh->b_end_io(bh, 0); +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Read function */ +/********************************************************/ + +static void +ldev_mgr_read(struct evms_logical_node *disk, struct buffer_head *bh) +{ + int rc = 0; + request_queue_t *q; + struct ldev_private *ldev_prv; + + ldev_prv = disk->private; + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= + disk->total_vsectors) { + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor); + q = blk_get_queue(bh->b_rdev); + if (q) { + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE; + q->make_request_fn(q, READ, bh); + return; + } else { + rc = -ENXIO; + disk->flags |= EVMS_DEVICE_UNAVAILABLE; + } + } else { + rc = -EOVERFLOW; + } + if (rc) { + ldev_mgr_io_error(disk, READ, bh, rc); + } +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Write function */ +/********************************************************/ + +static void +ldev_mgr_write(struct evms_logical_node *disk, struct buffer_head *bh) +{ + int rc = 0; + request_queue_t *q; + struct ldev_private *ldev_prv; + + ldev_prv = disk->private; + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= + disk->total_vsectors) { + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor); + q = blk_get_queue(bh->b_rdev); + if (q) { + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE; + q->make_request_fn(q, WRITE, bh); + return; + } else { + rc = -ENXIO; + disk->flags |= EVMS_DEVICE_UNAVAILABLE; + } + } else { + rc = -EOVERFLOW; + } + if (rc) { + ldev_mgr_io_error(disk, WRITE, bh, rc); + } +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* Init_io function & Support routines */ +/********************************************************/ + +/* + * function: allocate_bh + * + * This function obtains a buffer head from the private + * buffer head pool (pre-allocated at EVMS initial + * discovery time). + * + * NOTE: All access to the buffer head pool are protected + * by a private spinlock. + * + */ +static inline struct buffer_head * +allocate_bh(void) +{ + struct buffer_head *bh = + evms_cs_allocate_from_pool(evms_bh_pool, FALSE); + if (bh) { + init_waitqueue_head(&bh->b_wait); + } + return (bh); +} + +/* + * function: deallocate_bh + * + * This function returns a buffer head to the private + * buffer head pool (pre-allocated at EVMS initial + * discovery time). + * + * NOTE: All access to the buffer head pool are protected + * by a private spinlock. 
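+ *
+ * The spinlock referred to here lives inside the common-services pool
+ * routines (evms_cs_allocate_from_pool and evms_cs_deallocate_to_pool),
+ * so no additional locking is taken in this plugin; completion of the
+ * pooled buffer heads is tracked separately by the bh_cb counting
+ * scheme below.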
+ * + */ +static inline void +deallocate_bh(struct buffer_head *bh) +{ + evms_cs_deallocate_to_pool(evms_bh_pool, bh); +} + +/* this is the buffer head control block structure definition */ +typedef struct bh_cb_s { + int rc; + atomic_t blks_allocated; + wait_queue_head_t cb_wait; +} bh_cb_t; + +/* + * function: __wait_on_bh_cb + * + * This is a worker function to wait_on_bh_cb. + * This function waits for a set of private buffer heads + * associated to the specified buffer head control block + * to return from I/O completion. On completion of the + * last buffer head, the calling function is awakened + * and continues running. + * + * This is the worker function to the function wait_on_bh_cb. + * + */ +static void +__wait_on_bh_cb(bh_cb_t * bh_cb) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&bh_cb->cb_wait, &wait); + do { + run_task_queue(&tq_disk); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&bh_cb->blks_allocated)) + break; + schedule(); + } while (atomic_read(&bh_cb->blks_allocated)); +#ifdef O1_SCHEDULER + set_task_state(tsk, TASK_RUNNING); +#else + tsk->state = TASK_RUNNING; +#endif + remove_wait_queue(&bh_cb->cb_wait, &wait); +} + +/* + * function: wait_on_bh_cb + * + * This function waits for a set of private buffer heads + * associated to the specified buffer head control block + * to return from I/O completion. On completion of the + * last buffer head, the calling function is awakened + * and continues running. + * + */ +static void +wait_on_bh_cb(bh_cb_t * bh_cb) +{ + if (atomic_read(&bh_cb->blks_allocated)) + __wait_on_bh_cb(bh_cb); + else + /* if we ended up with no buffer heads on + * this pass, lets wait a until a few buffer + * heads have been freed and try again. This + * should provide a reasonable delay. + */ + schedule(); +} + +/* + * function: end_bh_cb_io + * + * This is the I/O completion function that is called for + * each private buffer head obtained from the buffer head + * pool. Control is return thru this routine so we can track + * all outstanding requests to know when to awaken the caller, + * and to regain control after all I/Os have been performed. + * + */ +static void +end_bh_cb_io_sync(struct buffer_head *bh, int uptodate) +{ + bh_cb_t *bh_cb = (bh_cb_t *) bh->b_private; + + /* record that errors occurred */ + if (!uptodate) { + bh_cb->rc = -EIO; + } + mark_buffer_uptodate(bh, uptodate); + unlock_buffer(bh); + + deallocate_bh(bh); + atomic_dec(&bh_cb->blks_allocated); + if (!atomic_read(&bh_cb->blks_allocated)) + if (waitqueue_active(&bh_cb->cb_wait)) + wake_up(&bh_cb->cb_wait); +} + +/* + * function: ldev_partial_sector_init_io + * + * This function is a support function for ldev_init_io, + * which handles the cases of performing I/O to only a part + * of non-standard sized hardsector. This function is not + * designed to be called directly, but via ldev_init_io. 
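+ *
+ * In outline it is a read-modify-write on the containing hardsector:
+ *   1. read the whole hardsector into a scratch buffer
+ *   2. for READ, copy the requested 512-byte LSNs out to the caller;
+ *      for WRITE, merge the caller's data into the scratch buffer and
+ *      write the whole hardsector back out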
+ * + */ +static int +ldev_partial_sector_init_io(struct evms_logical_node *node, + int io_flag, + bh_cb_t * bh_cb, + u64 next_lsn, + u64 sector_lsn, + u64 io_size, + void *bufptr, unsigned char **sector_buf) +{ + int rc = 0; + struct ldev_private *ldev_prv = node->private; + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor); + struct buffer_head *bh; + + if (*sector_buf == NULL) { + /* allocate buffer for incoming sector */ + *sector_buf = kmalloc(node->hardsector_size, GFP_KERNEL); + if (!*sector_buf) + return -ENOMEM; + } + /* allocate a buffer head from the pool */ + while ((bh = allocate_bh()) == NULL) + /* yielding the cpu is playing it + * safe. it might be wiser to just + * spin. requires more thought. + */ + schedule(); + + /* set up the buffer head for this sector */ + bh->b_end_io = end_bh_cb_io_sync; + bh->b_size = node->hardsector_size; + bh->b_rdev = dev; + bh->b_rsector = next_lsn - sector_lsn; + bh->b_data = *sector_buf; + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */ + bh->b_state = 0; + set_bit(BH_Dirty, &bh->b_state); + set_bit(BH_Lock, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Mapped, &bh->b_state); + bh->b_private = (void *) bh_cb; + atomic_inc(&bh_cb->blks_allocated); + + /* drive the buffer head down */ + /* to the device */ + generic_make_request(READ, bh); + + /* wait for all bh's I/O's to end */ + wait_on_bh_cb(bh_cb); + + /* copy data to/from user */ + if (io_flag != WRITE) + /* READ */ + memcpy(bufptr, + *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT), + io_size << EVMS_VSECTOR_SIZE_SHIFT); + else { + /* WRITE */ + memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT), + bufptr, io_size << EVMS_VSECTOR_SIZE_SHIFT); + + /* allocate a buffer head from the pool */ + while ((bh = allocate_bh()) == NULL) + /* yielding the cpu is playing it + * safe. it might be wiser to just + * spin. requires more thought. + */ + schedule(); + + /* set up the buffer head for this sector */ + bh->b_end_io = end_bh_cb_io_sync; + bh->b_size = node->hardsector_size; + bh->b_rdev = dev; + bh->b_rsector = next_lsn - sector_lsn; + bh->b_data = *sector_buf; + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */ + bh->b_state = 0; + set_bit(BH_Dirty, &bh->b_state); + set_bit(BH_Lock, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Mapped, &bh->b_state); + bh->b_private = (void *) bh_cb; + atomic_inc(&bh_cb->blks_allocated); + + /* drive the buffer head down */ + /* to the device */ + generic_make_request(WRITE, bh); + + /* wait for all bh's I/O's to end */ + wait_on_bh_cb(bh_cb); + } + return (rc); +} + +/* + * function: ldev_init_io + * + * This function provides support for synchronous I/O + * operations to the underlying devices. These I/O + * operations are NOT buffered in any way including the + * operating system's buffer cache. + * + * This function can work with any hardsector size that + * is a power of 2. 
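+ *
+ * The power-of-two restriction lets the offset within a hardsector be
+ * computed with a mask instead of a modulo: with a 4096-byte hardsector,
+ * lsns_per_hardsector is 8 and (starting_lsn & 7) is the offset into
+ * that hardsector. For example, starting_lsn = 21 gives an offset of 5,
+ * so the first 3 LSNs go through the partial-sector path, whole
+ * hardsectors are streamed by the main loop, and any leftover tail is
+ * again handled as a partial sector.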
+ * + * node : logical node of the target logical disk + * io_flag : 0 = read, 1 = write, 2 = read-a-head + * starting_lsn : the 0-based (disk relative) logical + * : (512 byte) sector number (lsn) + * num_lsns : the total number of lsns in this I/O + * bufptr : address of the memory to read/write the data + * + */ +static int +ldev_init_io(struct evms_logical_node *node, + int io_flag, + u64 starting_lsn, u64 num_lsns, void *bufptr) +{ + int rc = 0, lsns_per_hardsector, lsns_per_blocksize; + unchar *sector_buf = NULL, *cur_bufptr; + u64 next_lsn, remaining_lsns, sector_lsn; + struct ldev_private *ldev_prv = node->private; + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor); + bh_cb_t bh_cb; + + LOG_EVERYTHING + ("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn("PFU64"), num_lsns("PFU64"), bufptr(0x%p)\n", + __FUNCTION__, ldev_prv->major, ldev_prv->minor, io_flag, + starting_lsn, num_lsns, bufptr); + + /* check for valid device */ + if (!blk_size[ldev_prv->major][ldev_prv->minor]) { + node->flags |= EVMS_DEVICE_UNAVAILABLE; + return (-ENXIO); + } + /* check for 0 length request */ + if (num_lsns == 0) { + LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__); + return (-EINVAL); + } + /* check for out of bound request */ + if ((starting_lsn + num_lsns) > node->total_vsectors) { + LOG_ERROR + ("%s: attempted %s beyond logical disk boundary("PFU64" LSNs), requesting LSN("PFU64"), total LSNs("PFU64").\n", + __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ", + node->total_vsectors, starting_lsn, num_lsns); + return (-EINVAL); + } + /* check for invalid io_flag value */ + switch (io_flag) { + case READ: /* read... */ + case WRITE: /* write... */ + case READA: /* reada... */ + break; + default: + return (-EINVAL); + } + + /* compute some per device info once up-front */ + lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE; + lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE; + + /* initialize the buffer head control block */ + memset(&bh_cb, 0, sizeof (bh_cb_t)); + init_waitqueue_head(&bh_cb.cb_wait); + bh_cb.blks_allocated = (atomic_t)ATOMIC_INIT(0); + + /* only update the local copy of variables */ + cur_bufptr = bufptr; + next_lsn = starting_lsn; + remaining_lsns = num_lsns; + + /* check for a mid-sector starting offset + * + * if found, perform I/O on part of that + * sector + */ + sector_lsn = next_lsn & (lsns_per_hardsector - 1); + if (sector_lsn) { + u64 io_size; + + /* determine bytes in IO to this sector */ + io_size = lsns_per_hardsector - sector_lsn; + if (io_size > remaining_lsns) + io_size = remaining_lsns; + + /* perform the partial sector io */ + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb, + next_lsn, + sector_lsn, io_size, + cur_bufptr, §or_buf); + + if (!rc) { + /* update progress in local variables */ + cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT; + next_lsn += io_size; + remaining_lsns -= io_size; + } + } + + /* continue if no errors found */ + if (!rc) { + /* perform I/O on all the complete sectors + * in this request. + * + * loop until there are no more complete sectors + * to process. + */ + while (remaining_lsns >= lsns_per_hardsector) { + /* this inner loop attempts to drive as many + * bytes (in sector size multiples) down to + * the device as possible using the available + * buffer heads in the pool. 
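+			 * Each buffer head covers either a full block_size
+			 * chunk (while at least that much remains) or a
+			 * single hardsector; when allocate_bh() returns NULL
+			 * the pool is exhausted, so the inner loop breaks and
+			 * wait_on_bh_cb() drains the outstanding I/O to
+			 * recycle buffer heads before the outer loop resumes.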
+ */ + while (remaining_lsns >= lsns_per_hardsector) { + struct buffer_head *bh; + + /* allocate a buffer head from the pool */ + bh = allocate_bh(); + if (bh == NULL) + break; + + /* set up the buffer head for this I/O */ + bh->b_end_io = end_bh_cb_io_sync; + bh->b_size = + (remaining_lsns >= lsns_per_blocksize) ? + node->block_size : node->hardsector_size; + bh->b_data = cur_bufptr; + bh->b_rdev = dev; + bh->b_rsector = next_lsn; + bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */ + bh->b_state = 0; + set_bit(BH_Dirty, &bh->b_state); + set_bit(BH_Lock, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Mapped, &bh->b_state); + bh->b_private = (void *) &bh_cb; + atomic_inc(&bh_cb.blks_allocated); + + /* drive the buffer head down */ + /* to the device */ + generic_make_request(io_flag, bh); + + /* update progress in local variables */ + cur_bufptr += bh->b_size; + next_lsn += + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + remaining_lsns -= + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + } + /* wait for all bh's I/O's to end */ + wait_on_bh_cb(&bh_cb); + } + } + + /* continue if no errors found */ + if (!rc) + /* check for a mid-sector ending offset + * + * if found, perform I/O on part of that + * sector + */ + if (remaining_lsns) + /* perform the partial sector io */ + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb, + next_lsn, + 0, remaining_lsns, + cur_bufptr, + §or_buf); + + /* free the sector buffer if it was allocated */ + if (sector_buf) + kfree(sector_buf); + + /* coalesce return codes */ + rc |= bh_cb.rc; + + LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc); + + return (rc); +} + +static int +ldev_mgr_direct_ioctl(struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct ldev_private *ldev_prv; + struct evms_plugin_ioctl_pkt tmp, *user_parms; + struct ldev_plugin_ioctl pi_data; + struct evms_logical_node *disk; + + MOD_INC_USE_COUNT; + + user_parms = (struct evms_plugin_ioctl_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + /* validate its meant for us */ + if (tmp.feature_id != plugin_header.id) { + rc = -EINVAL; + } + } + + if (!rc) { + /* copy feature ioctl data to kernel space */ + if (copy_from_user(&pi_data, tmp.feature_ioctl_data, + sizeof (pi_data))) { + rc = -EFAULT; + } + } + + if (!rc) { + /* find the disk node specified by the disk_handle */ + int done = FALSE; + disk = NULL; + while (!done) { + rc = evms_cs_find_next_device(disk, + &disk); + if (rc) { + break; + } + if (!disk) { + rc = -ENODATA; + break; + } + if (disk == + DEV_HANDLE_TO_NODE(pi_data.disk_handle)) { + done = TRUE; + } + } + } + + if (!rc) { + /* perform feature command */ + ldev_prv = (struct ldev_private *) disk->private; + switch (tmp.feature_command) { + kdev_t save_dev; + case LDEV_MGR_BROADCAST_IOCTL_CMD: + save_dev = inode->i_rdev; + inode->i_rdev = + MKDEV(ldev_prv->major, ldev_prv->minor); + rc = ldev_prv->bdev->bd_op->ioctl(inode, file, + pi_data.cmd, + pi_data.arg); + inode->i_rdev = save_dev; + break; + default: + rc = -EINVAL; + break; + } + } + + /* return status value */ + tmp.status = rc; + copy_to_user((struct evms_plugin_ioctl_pkt *) arg, &tmp, sizeof (tmp)); + MOD_DEC_USE_COUNT; + return rc; +} + +/********************************************************/ +/* Required Plugin Function Table Entry Point: */ +/* IOCTL function & Support routines */ 
+/********************************************************/ + +static int +ldev_mgr_ioctl(struct evms_logical_node *disk, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct ldev_private *ldev_prv = disk->private; + kdev_t save_dev; + struct block_device *save_bdev; + + if (!inode || !disk) + return -EINVAL; + + save_dev = inode->i_rdev; + inode->i_rdev = MKDEV(ldev_prv->major, ldev_prv->minor); + save_bdev = inode->i_bdev; + inode->i_bdev = ldev_prv->bdev; + /* check device availability */ + if (!blk_get_queue(MKDEV(ldev_prv->major, ldev_prv->minor))) { + disk->flags |= EVMS_DEVICE_UNAVAILABLE; + } + switch (cmd) { + case EVMS_QUIESCE_VOLUME: + case EVMS_PLUGIN_IOCTL: + break; + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = + (struct evms_get_bmap_pkt *) arg; + bmap->dev = MKDEV(ldev_prv->major, ldev_prv->minor); + bmap->status = 0; + } + break; + case EVMS_OPEN_VOLUME: + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + rc = -ENXIO; + } else { + rc = ldev_prv->bdev->bd_op->open(inode, file); + } + break; + case EVMS_CLOSE_VOLUME: + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + rc = -ENXIO; + } else { + rc = ldev_prv->bdev->bd_op->release(inode, file); + } + break; + case EVMS_CHECK_MEDIA_CHANGE: + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + rc = -ENXIO; + } else { + /* once we detect that media changed + * is 'set', don't send any more ioctls + * down to the device, until the + * media change has been 'reset' by a + * revalidate disk ioctl. when already + * 'set', just return a 1 w/o actually + * performing another ioctl call to the + * device. + */ + if (ldev_prv->media_changed == TRUE) { + rc = 1; + break; + } + rc = ldev_prv->bdev->bd_op-> + check_media_change(MKDEV + (ldev_prv->major, + ldev_prv->minor)); + if (rc == 1) { + ldev_prv->media_changed = TRUE; + disk->flags |= EVMS_MEDIA_CHANGED; + } + } + break; + case EVMS_REVALIDATE_DISK: + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + rc = -ENXIO; + } else { + /* don't actually send this ioctl down + * to the device, until we know that + * previous check media change ioctl + * has occurred. + * + * when we do actually send the ioctl + * down, reset the local media_changed + * flag. + */ + if (ldev_prv->media_changed == FALSE) + break; + rc = ldev_prv->bdev->bd_op-> + revalidate(MKDEV + (ldev_prv->major, ldev_prv->minor)); + ldev_prv->media_changed = FALSE; + } + break; + case EVMS_GET_DISK_LIST: + rc = evms_cs_add_item_to_list((struct evms_list_node **) arg, + disk); + if (rc > 0) + rc = 0; + break; + case EVMS_CHECK_DEVICE_STATUS: + if (arg) { + int *status = (int *) arg; + *status |= disk->flags; + } + break; + case EVMS_UPDATE_DEVICE_INFO: + /* determine hardsector size */ + disk->hardsector_size = 512; + if (hardsect_size[ldev_prv->major]) { + disk->hardsector_size = hardsect_size[ldev_prv->major][ldev_prv->minor]; + } + /* save the block size */ + disk->block_size = 1024; + if (blksize_size[ldev_prv->major]) { + disk->block_size = blksize_size[ldev_prv->major][ldev_prv->minor]; + } + /* device size in sectors + * + * try 64bit size first, if that fails + * fall back on the 32bit size. 
+ */ + /* try 64bit size */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18) + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE64, + (ulong) & disk->total_vsectors); + if (!rc) { + /* convert bytes to 512 byte sectors */ + disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT; + } else +#endif + { + /* try 32bit size */ + ulong dev_size = 0; + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE, + (ulong) & dev_size); + disk->total_vsectors = dev_size; + } + break; + default: + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) { + rc = -ENXIO; + } else { + rc = ldev_prv->bdev->bd_op->ioctl(inode, file, cmd, + arg); + } + break; + } + inode->i_bdev = save_bdev; + inode->i_rdev = save_dev; + + return (rc); +} + +/********************************************************/ +/* Required Module Entry Point: */ +/* ldev_mgr_init */ +/********************************************************/ + +static int __init +ldev_mgr_init(void) +{ + return evms_cs_register_plugin(&plugin_header); +} + +static void __exit +ldev_mgr_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(ldev_mgr_init); +module_exit(ldev_mgr_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/lvm_vge.c evms-2002-09-30/drivers/evms/lvm_vge.c --- linux-2002-09-30/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/lvm_vge.c Fri Sep 13 16:45:06 2002 @@ -0,0 +1,3734 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/drivers/evms/lvm_vge.c + * + * EVMS Linux LVM Region Manager + */ + +#define LOG_PREFIX "lvm: " + +#include +#include +#include +#include +#include + +#include +#include + +/* Plugin API prototypes. */ +static int lvm_discover(struct evms_logical_node ** evms_node_list); +static int lvm_discover_end(struct evms_logical_node ** evms_node_list); +static int lvm_delete_node(struct evms_logical_node * logical_node); +static void lvm_read(struct evms_logical_node * node, struct buffer_head * bh); +static void lvm_write(struct evms_logical_node * node, struct buffer_head * bh); +static int lvm_init_io(struct evms_logical_node * node, + int io_flag, + u64 sect_nr, + u64 num_sects, + void * buf_addr); +static int lvm_ioctl(struct evms_logical_node * logical_node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg); +static int lvm_direct_ioctl(struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long args); + +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector, + u64 snap_sector); + +/* LVM Plugin function table and header. 
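+ * The fops table below is how the EVMS common services layer dispatches
+ * discovery, I/O, and ioctl requests into this region manager; the
+ * required_services_version field in the header guards against loading
+ * on an incompatible common services level.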
*/ +static struct evms_plugin_fops lvm_fops = { + .discover = lvm_discover, + .end_discover = lvm_discover_end, + .delete = lvm_delete_node, + .read = lvm_read, + .write = lvm_write, + .init_io = lvm_init_io, + .ioctl = lvm_ioctl, + .direct_ioctl = lvm_direct_ioctl +}; + +static struct evms_plugin_header lvm_plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_REGION_MANAGER, + 0x01), + .version = { + .major = EVMS_LVM_VERSION_MAJOR, + .minor = EVMS_LVM_VERSION_MINOR, + .patchlevel = EVMS_LVM_VERSION_PATCH + }, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0 + }, + .fops = &lvm_fops +}; + +static struct lvm_volume_group * lvm_group_list = NULL; +static struct proc_dir_entry * lvm_proc = NULL; + + +/********** Miscellaneous Functions **********/ + + +/** + * remap sector + * @node: + * @org_sector: Logical sector to remap. + * @size: Size (in sectors) or request to remap. + * @new_sector: Remapped sector. + * @new_size: New size (in sectors). + * @pe_start_sector: Starting sector of PE - needed for snapshotting. + * @pv_entry: New node for which new_sector is relative. + * + * Common function to remap LV lba to PV lba in appropriate PE. This + * function needs to deal with requests that span PEs and/or stripes. If + * this occurs, the request will simply be chopped off at the boundary of + * the first PE/stripe. It is up to the calling function to loop + * accordingly to finish the full remapping. This function is now partially + * 64-bit enabled. The striping section contains code that currently cannot + * eliminate at least one mod operation on 64 bit values. + **/ +static int remap_sector(struct evms_logical_node * node, + u64 org_sector, + u64 size, + u64 * new_sector, + u64 * new_size, + u64 * pe_start_sector, + struct lvm_physical_volume ** pv_entry) +{ + struct lvm_logical_volume * volume = node->private; + struct le_table_entry * le_entry; + u32 le, offset_in_le; + + *new_size = size; + + if ( volume->stripes > 1 ) { + /* Volume is striped. Reset the size if the request crosses + * a stripe boundary. Striping in LVM is not 64-bit enabled. + */ + u32 column, columns, sectors_per_column; + u32 sector_in_column, stripe_in_column, le_in_column; + u32 offset_in_stripe, stripe_in_le; + u32 org_sector32 = org_sector; + + sectors_per_column = volume->stripes * volume->pe_size; + column = org_sector32 / sectors_per_column; + sector_in_column = org_sector32 % sectors_per_column; + stripe_in_column = sector_in_column / volume->stripe_size; + le_in_column = stripe_in_column % volume->stripes; + columns = volume->num_le / volume->stripes; + le = column + (columns * le_in_column); + + offset_in_stripe = org_sector32 % volume->stripe_size; + stripe_in_le = stripe_in_column / volume->stripes; + offset_in_le = offset_in_stripe + + stripe_in_le * volume->stripe_size; + + if ( offset_in_stripe + size > volume->stripe_size ) { + *new_size = volume->stripe_size - offset_in_stripe; + } + } else { + /* Linear volume. Just find LE and offset. Reset the size if + * the request crosses an LE boundary. This path is 64-bit safe. 
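+		 *
+		 * For example, with a 4 MB PE (pe_size = 8192 sectors,
+		 * pe_size_shift = 13), org_sector 20000 maps to
+		 * le = 20000 >> 13 = 2 and offset_in_le = 20000 & 8191 = 3616.
+		 * A 6000-sector request starting there would cross into LE 3,
+		 * so new_size is clipped to 8192 - 3616 = 4576 and the caller
+		 * loops to remap the remainder.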
+ */ + le = org_sector >> volume->pe_size_shift; + offset_in_le = org_sector & (volume->pe_size - 1); + + if ( offset_in_le + size > volume->pe_size ) { + *new_size = volume->pe_size - offset_in_le; + } + } + + le_entry = &volume->le_map[le]; + *pe_start_sector = le_entry->pe_sector_offset; + *new_sector = le_entry->pe_sector_offset + offset_in_le; + *pv_entry = le_entry->owning_pv; + + return 0; +} + +/** + * add_group_to_list + * + * Add a volume group to the end of the LVM global group list. + **/ +static int add_group_to_list(struct lvm_volume_group * group) +{ + struct lvm_volume_group ** p_group; + + for ( p_group = &lvm_group_list; + *p_group; p_group = &(*p_group)->next_group ) { + ; + } + + *p_group = group; + group->next_group = NULL; + return 0; +} + +/** + * remove_group_from_list + * + * Remove an LVM volume group from the global LVM list. + **/ +static int remove_group_from_list(struct lvm_volume_group * group) +{ + struct lvm_volume_group ** p_group; + + for ( p_group = &lvm_group_list; + *p_group; p_group = &(*p_group)->next_group ) { + if ( *p_group == group ) { + *p_group = (*p_group)->next_group; + group->next_group = NULL; + break; + } + } + + return 0; +} + +/** + * find_group_by_uuid + * + * Use the vg_uuid to find the desired volume group. + **/ +static int find_group_by_uuid(u8 * vg_uuid, + struct lvm_volume_group ** group) +{ + struct lvm_volume_group * gp; + + for ( gp = lvm_group_list; gp; gp = gp->next_group ) { + if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) { + *group = gp; + return 0; + } + } + *group = NULL; + return -EINVAL; +} + +/** + * find_pv_by_number + * + * Search the PV list of the specified volume group, looking for the + * specified PV number. If found, return a pointer to that PV. + **/ +static struct lvm_physical_volume * +find_pv_by_number(u32 pv_number, + struct lvm_volume_group * group) +{ + struct lvm_physical_volume * pv_entry; + + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) { + if ( pv_entry->pv_number == pv_number ) { + return pv_entry; + } + } + return NULL; +} + +/** + * translate_lv_name + * @lvm_lv_name: Input LVM-style name. + * @evms_node_name: Output EVMS-style name. + * + * In LVM, volumes have names based on their dev-node, which follow the + * pattern /dev/group_name/volume_name. In EVMS, the same volume needs + * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from + * the lv_disk_t needs to be translated before copying to the associated + * node. evms_node_name must point to a NAME_LEN sized buffer. + **/ +static int translate_lv_name(char * lvm_lv_name, char * evms_node_name) +{ + char * ptr; + + memset(evms_node_name, 0, NAME_LEN); + + /* Make sure the string starts with /dev/, and skip over it. */ + ptr = strstr(lvm_lv_name, DEV_DIRECTORY); + if ( ptr != lvm_lv_name ) { + LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name); + return -EINVAL; + } + ptr = &ptr[strlen(DEV_DIRECTORY)]; + + /* ptr now points to "group_name/volume_name". + * Use this to create the name for the EVMS node. + */ + strcpy(evms_node_name, LVM_DEV_DIRECTORY); + strncat(evms_node_name, ptr, NAME_LEN - strlen(evms_node_name) - 1); + + return 0; +} + +/** + * check_pv_for_lv + * + * Run through all LE maps of all LVs in this group, and make sure the + * specified PV is not being pointed to by any LEs. 
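+ * A non-zero return means at least one LV still maps extents onto this
+ * PV, i.e. the PV is still in use and cannot safely be released.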
+ **/ +static int check_pv_for_lv(struct lvm_physical_volume * pv_entry, + struct lvm_volume_group * group) +{ + struct lvm_logical_volume * volume; + int i, j; + + for ( i = 1; i <= MAX_LV; i++ ) { + if ( (volume = group->volume_list[i]) ) { + for ( j = 0; j < volume->num_le; j++ ) { + if ( volume->le_map[j].owning_pv == pv_entry ) { + return -EINVAL; + } + } + } + } + return 0; +} + + +/********** Metadata I/O Functions **********/ + + +/** + * endian_convert_pv + * + * Endian-neutral conversion for PV structures. + **/ +static inline void endian_convert_pv(struct pv_disk * pv) +{ + pv->version = le16_to_cpup(&pv->version); + pv->pv_on_disk.base = le32_to_cpup(&pv->pv_on_disk.base); + pv->pv_on_disk.size = le32_to_cpup(&pv->pv_on_disk.size); + pv->vg_on_disk.base = le32_to_cpup(&pv->vg_on_disk.base); + pv->vg_on_disk.size = le32_to_cpup(&pv->vg_on_disk.size); + pv->pv_uuidlist_on_disk.base = + le32_to_cpup(&pv->pv_uuidlist_on_disk.base); + pv->pv_uuidlist_on_disk.size = + le32_to_cpup(&pv->pv_uuidlist_on_disk.size); + pv->lv_on_disk.base = le32_to_cpup(&pv->lv_on_disk.base); + pv->lv_on_disk.size = le32_to_cpup(&pv->lv_on_disk.size); + pv->pe_on_disk.base = le32_to_cpup(&pv->pe_on_disk.base); + pv->pe_on_disk.size = le32_to_cpup(&pv->pe_on_disk.size); + pv->pv_major = le32_to_cpup(&pv->pv_major); + pv->pv_number = le32_to_cpup(&pv->pv_number); + pv->pv_status = le32_to_cpup(&pv->pv_status); + pv->pv_allocatable = le32_to_cpup(&pv->pv_allocatable); + pv->pv_size = le32_to_cpup(&pv->pv_size); + pv->lv_cur = le32_to_cpup(&pv->lv_cur); + pv->pe_size = le32_to_cpup(&pv->pe_size); + pv->pe_total = le32_to_cpup(&pv->pe_total); + pv->pe_allocated = le32_to_cpup(&pv->pe_allocated); + pv->pe_start = le32_to_cpup(&pv->pe_start); +} + +/** + * read_pv + * + * Read in the PV structure from the specified node. If it contains a + * valid PV signature, allocate a new struct pv_disk and copy the data. + **/ +static int read_pv(struct evms_logical_node * node, struct pv_disk ** pv) +{ + struct pv_disk * pv_buffer; + int rc = -ENOMEM; + + *pv = NULL; + + /* Buffer for reading the PV metadata. */ + pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO); + if (!pv_buffer) { + LOG_CRITICAL("Error allocating PV metadata buffer for %s\n", + node->name); + goto out; + } + + /* Read the first two sectors. */ + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE), + evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer); + if (rc) { + LOG_SERIOUS("Error reading PV metadata from %s\n", node->name); + goto out_kfree; + } + + /* Endian-neutral conversion of PV metadata. */ + endian_convert_pv(pv_buffer); + + /* Check for an LVM signature and make sure the sizes match. + * Versions 1 and 2 are both valid now. Thanks LVM! :) + */ + if ( !(pv_buffer->id[0] == 'H' && + pv_buffer->id[1] == 'M' && + (pv_buffer->version == 1 || pv_buffer->version == 2) && + pv_buffer->pv_size == node->total_vsectors) ) { + LOG_EXTRA("%s is not an LVM PV\n", node->name); + rc = -EINVAL; + goto out_kfree; + } + + /* This is a valid PV. Allocate a new pv_disk. */ + *pv = kmalloc(sizeof(struct pv_disk), GFP_NOIO); + if (!*pv) { + LOG_CRITICAL("Error allocating new PV for %s\n", node->name); + rc = -ENOMEM; + goto out_kfree; + } + + /* Copy the metadata. 
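+ * Only sizeof(struct pv_disk) bytes of the LVM_PV_DISK_SIZE read buffer
+ * are kept; the rest of the on-disk PV area is not needed once the
+ * signature and size checks above have passed.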
*/ + memcpy(*pv, pv_buffer, sizeof(struct pv_disk)); + +out_kfree: + kfree(pv_buffer); +out: + return rc; +} + +/** + * endian_convert_vg + * + * Endian-neutral conversion for VG structures + **/ +static inline void endian_convert_vg(struct vg_disk * vg) +{ + vg->vg_number = le32_to_cpup(&vg->vg_number); + vg->vg_access = le32_to_cpup(&vg->vg_access); + vg->vg_status = le32_to_cpup(&vg->vg_status); + vg->lv_max = le32_to_cpup(&vg->lv_max); + vg->lv_cur = le32_to_cpup(&vg->lv_cur); + vg->lv_open = le32_to_cpup(&vg->lv_open); + vg->pv_max = le32_to_cpup(&vg->pv_max); + vg->pv_cur = le32_to_cpup(&vg->pv_cur); + vg->pv_act = le32_to_cpup(&vg->pv_act); + vg->dummy = le32_to_cpup(&vg->dummy); + vg->vgda = le32_to_cpup(&vg->vgda); + vg->pe_size = le32_to_cpup(&vg->pe_size); + vg->pe_total = le32_to_cpup(&vg->pe_total); + vg->pe_allocated = le32_to_cpup(&vg->pe_allocated); + vg->pvg_total = le32_to_cpup(&vg->pvg_total); +} + +/** + * read_vg + * + * Read in the VG structure from the specified node. Allocate a new + * struct vg_disk and copy the data. + **/ +static int read_vg(struct evms_logical_node * node, + struct pv_disk * pv, + struct vg_disk ** vg) +{ + struct vg_disk * vg_buffer; + unsigned long vg_sectors; + int rc = -ENOMEM; + + /* Allocate a buffer to read the VG metadata. */ + vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size); + vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO); + if (!vg_buffer) { + LOG_CRITICAL("Error allocating VG metadata buffer for %s\n", + node->name); + goto out; + } + + /* Read the VG metadata. */ + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base), + vg_sectors, vg_buffer); + if (rc) { + LOG_SERIOUS("Error reading VG metadata from %s\n", node->name); + goto out_kfree; + } + + /* Endian-neutral conversion of VG metadata. */ + endian_convert_vg(vg_buffer); + + /* Allocate a new struct vg_disk. */ + *vg = kmalloc(sizeof(struct vg_disk), GFP_NOIO); + if (!*vg) { + LOG_CRITICAL("Error allocating new VG for %s\n", node->name); + rc = -ENOMEM; + goto out_kfree; + } + + /* Copy the metadata. */ + memcpy(*vg, vg_buffer, sizeof(struct vg_disk)); + +out_kfree: + kfree(vg_buffer); +out: + return rc; +} + +/** + * read_uuid_list + **/ +static int read_uuid_list(struct evms_logical_node * node, + struct pv_disk * pv, + struct lvm_volume_group * group) +{ + u64 start_sector; + unsigned long total_sectors; + unsigned char * uuid_buffer; + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE; + unsigned long uuid_list_size; + int i, rc = 0; + + if (group->uuid_list) { + LOG_EXTRA("Already read PV UUIDs for group %s\n", + group->vg_name); + goto out; + } + + start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base); + total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size); + uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, + buffer_size); + + /* Allocate a buffer to perform the I/Os. */ + uuid_buffer = kmalloc(buffer_size, GFP_NOIO); + if (!uuid_buffer) { + LOG_CRITICAL("Error allocating buffer for UUID list in group %s\n", + group->vg_name); + rc = -ENOMEM; + goto out; + } + + /* Allocate memory for the UUID array for this group. 
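+ * uuid_list_size was rounded up to a multiple of the I/O buffer size
+ * above, so the whole-buffer memcpy in the loop below cannot run past
+ * the end of this allocation.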
*/ + group->uuid_list = vmalloc(uuid_list_size); + if (!group->uuid_list) { + LOG_CRITICAL("Error allocating UUID list for group %s\n", + group->vg_name); + rc = -ENOMEM; + goto out_kfree; + } + memset(group->uuid_list, 0, uuid_list_size); + + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) { + rc = INIT_IO(node, 0, start_sector + i, + IO_BUFFER_SECTORS, uuid_buffer); + if (rc) { + LOG_SERIOUS("Error reading PV UUID list from %s\n", + node->name); + goto out_vfree; + } + /* Copy the I/O buffer into the UUID array. */ + memcpy(&(group->uuid_list[i * EVMS_VSECTOR_SIZE]), + uuid_buffer, buffer_size); + } + + /* Clear out the unused portion at the end of the uuid_list. */ + memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0, + uuid_list_size - pv->pv_uuidlist_on_disk.size); + +out_kfree: + kfree(uuid_buffer); +out: + return rc; + +out_vfree: + vfree(group->uuid_list); + group->uuid_list = NULL; + goto out_kfree; +} + +/** + * endian_convert_lv + * + * Endian-neutral conversion for LV structures + **/ +static inline void endian_convert_lv(struct lv_disk * lv) +{ + lv->lv_access = le32_to_cpup(&lv->lv_access); + lv->lv_status = le32_to_cpup(&lv->lv_status); + lv->lv_open = le32_to_cpup(&lv->lv_open); + lv->lv_dev = le32_to_cpup(&lv->lv_dev); + lv->lv_number = le32_to_cpup(&lv->lv_number); + lv->lv_mirror_copies = le32_to_cpup(&lv->lv_mirror_copies); + lv->lv_recovery = le32_to_cpup(&lv->lv_recovery); + lv->lv_schedule = le32_to_cpup(&lv->lv_schedule); + lv->lv_size = le32_to_cpup(&lv->lv_size); + lv->lv_snapshot_minor = le32_to_cpup(&lv->lv_snapshot_minor); + lv->lv_chunk_size = le16_to_cpup(&lv->lv_chunk_size); + lv->dummy = le16_to_cpup(&lv->dummy); + lv->lv_allocated_le = le32_to_cpup(&lv->lv_allocated_le); + lv->lv_stripes = le32_to_cpup(&lv->lv_stripes); + lv->lv_stripesize = le32_to_cpup(&lv->lv_stripesize); + lv->lv_badblock = le32_to_cpup(&lv->lv_badblock); + lv->lv_allocation = le32_to_cpup(&lv->lv_allocation); + lv->lv_io_timeout = le32_to_cpup(&lv->lv_io_timeout); + lv->lv_read_ahead = le32_to_cpup(&lv->lv_read_ahead); +} + +static inline void endian_convert_lvs(struct lvm_volume_group * group) +{ + int i; + for ( i = 0; i < group->vg->lv_max; i++ ) { + endian_convert_lv(&(group->lv_array[i])); + } +} + +/** + * read_lv + * + * Read in the LV structures for the specified group. Do the read from + * the first PV in the group. If that one fails, keep trying on the + * remaining PVs until one works. This function will allocate a buffer + * for the group to read in the structures. + **/ +static int read_lv(struct lvm_volume_group * group) +{ + struct lvm_physical_volume * pv_entry = group->pv_list; + unsigned char * lv_buffer = NULL; + u64 start_sector; + unsigned long total_sectors, lv_array_size = 0; + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE; + int i, rc = 1; + + if (group->lv_array) { + return 0; + } + + if (!pv_entry) { + LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n", + group->vg_name); + return -EINVAL; + } + + /* Allocate a buffer to do the actual I/Os. */ + lv_buffer = kmalloc(buffer_size, GFP_NOIO); + if (!lv_buffer) { + LOG_CRITICAL("Error allocating buffer for LV structs for Group %s\n", + group->vg_name); + return -ENOMEM; + } + + /* Read in the LV structures 4k at a time. If one PV returns errors, + * start over with the next PV in the group. 
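+ * rc starts out non-zero so the retry loop below is entered; it only
+ * ends up zero once every buffer of the LV array has been read
+ * successfully from a single PV.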
+ */ + while (rc && pv_entry) { + start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base); + total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size); + lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, + buffer_size); + + /* Allocate the buffer for this group to + * hold the entire LV array. + */ + if (group->lv_array) { + vfree(group->lv_array); + group->lv_array = NULL; + } + group->lv_array = vmalloc(lv_array_size); + if (!group->lv_array) { + LOG_CRITICAL("Error allocating lv_array buffer for Group %s\n", + group->vg_name); + rc = -ENOMEM; + goto out_kfree; + } + memset(group->lv_array, 0, lv_array_size); + + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) { + rc = INIT_IO(pv_entry->logical_node, 0, + start_sector + i, IO_BUFFER_SECTORS, + lv_buffer); + if (rc) { + LOG_SERIOUS("Error reading LV metadata from %s in Group %s\n", + pv_entry->logical_node->name, + group->vg_name); + + /* Try the next PV if the current one + * caused any errors. + */ + pv_entry = pv_entry->next; + break; + } + /* Copy the I/O buffer into the lv_array. */ + memcpy(&(((char *)(group->lv_array))[i * EVMS_VSECTOR_SIZE]), + lv_buffer, buffer_size); + } + } + + if (rc) { + LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n", + group->vg_name); + goto out_vfree; + } + + /* Clear out the unused portion at the end of the lv_array. */ + memset(&(((char *)(group->lv_array))[pv_entry->pv->lv_on_disk.size]), + 0, lv_array_size - pv_entry->pv->lv_on_disk.size); + + /* Endian-neutral conversion of the LV metadata. */ + endian_convert_lvs(group); + +out_kfree: + kfree(lv_buffer); + return rc; + +out_vfree: + vfree(group->lv_array); + group->lv_array = NULL; + goto out_kfree; +} + +/** + * endian_convert_pe_map + * + * Endian-neutral conversion for PE structures + **/ +static inline void endian_convert_pe_map(struct lvm_physical_volume * pv_entry) +{ + int i; + for ( i = 0; i < pv_entry->pv->pe_total; i++ ) { + pv_entry->pe_map[i].lv_num = + le16_to_cpup(&pv_entry->pe_map[i].lv_num); + pv_entry->pe_map[i].le_num = + le16_to_cpup(&pv_entry->pe_map[i].le_num); + } +} + +/** + * read_pe_map + * + * Read in the PE map for the specified PV. This function will allocate a + * buffer to read in the data. + **/ +static int read_pe_map(struct lvm_physical_volume * pv_entry) +{ + struct evms_logical_node * node = pv_entry->logical_node; + struct pv_disk * pv = pv_entry->pv; + unsigned char * pe_buffer; + u64 start_sector; + unsigned long total_sectors, pe_map_size; + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE; + int i, rc = -ENOMEM; + + if (pv_entry->pe_map) { + return 0; + } + + start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base); + total_sectors = evms_cs_size_in_vsectors(pv->pe_total * + sizeof(struct pe_disk)); + pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size); + + /* Allocate a buffer for performing the I/O. */ + pe_buffer = kmalloc(buffer_size, GFP_NOIO); + if (!pe_buffer) { + LOG_CRITICAL("Error allocating buffer for PE maps for %s\n", + node->name); + goto out; + } + + /* Allocate a buffer to hold the PE map for this PV. 
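+ * The full map (pe_total entries of struct pe_disk) can be large, so it
+ * is vmalloc'd; the small bounce buffer above stays kmalloc'd.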
*/ + pv_entry->pe_map = vmalloc(pe_map_size); + if (!pv_entry->pe_map) { + LOG_CRITICAL("Error allocating PE map for %s\n", node->name); + goto out_kfree; + } + memset(pv_entry->pe_map, 0, pe_map_size); + + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) { + rc = INIT_IO(node, 0, start_sector + i, + IO_BUFFER_SECTORS, pe_buffer); + if (rc) { + LOG_SERIOUS("Error reading PE maps from %s.\n", + node->name); + goto out_vfree; + } + /* Copy the data to the actual PE map. */ + memcpy(&(((char *)(pv_entry->pe_map))[i * EVMS_VSECTOR_SIZE]), + pe_buffer, buffer_size); + } + + /* Clear out the unused portion at the end of the PE map. */ + memset(&(((char *)(pv_entry->pe_map))[total_sectors * EVMS_VSECTOR_SIZE]), + 0, pe_map_size - total_sectors * EVMS_VSECTOR_SIZE); + + /* Endian-neutral conversion of the PE metadata. */ + endian_convert_pe_map(pv_entry); + +out_kfree: + kfree(pe_buffer); +out: + return rc; + +out_vfree: + vfree(pv_entry->pe_map); + pv_entry->pe_map = NULL; + goto out_kfree; +} + + +/********** Snapshot Manipulation Functions **********/ + + +/** + * snapshot_check_quiesce_original + * + * For this snapshot LV, check that both it and its original are quiesced. + **/ +static int +snapshot_check_quiesce_original(struct lvm_logical_volume * snap_volume) +{ + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org; + + if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) { + return -EINVAL; + } + + if ( org_volume && !(org_volume->lv_access & EVMS_LV_QUIESCED) ) { + return -EINVAL; + } + + return 0; +} + +/** + * snapshot_check_quiesce_all + * + * Go through the list of all snapshots for an original volume, and make + * sure everyone is in a quiesced state. + **/ +static int snapshot_check_quiesce_all(struct lvm_logical_volume * org_volume) +{ + struct lvm_logical_volume * snap; + + if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) { + return -EINVAL; + } + + for ( snap = org_volume->snapshot_next; + snap; snap = snap->snapshot_next ) { + if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) { + return -EINVAL; + } + } + + return 0; +} + +/** + * invalidate_snapshot_volume + * + * In the event a snapshot volume becomes full or corrupted, its metadata + * must be altered in order to prevent it from being used again. Write some + * invalid data into the first entry of the COW table. If this volume is + * not fully deleted by the user/engine, this invalid COW entry will be + * detected by build_snapshot_maps(), and will cause the volume to be + * deleted before being exported to EVMS during discover. This is obviously + * a hack, but it is the same hack currently used by LVM. We're just trying + * to be compatible. :) + **/ +static int invalidate_snapshot_volume(struct lvm_logical_volume * snap_volume) +{ + struct evms_logical_node tmp_node; + + tmp_node.private = snap_volume; + tmp_node.total_vsectors = snap_volume->lv_size; + + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) { + LOG_WARNING("Volume %s is not a snapshot. 
Cannot invalidate\n", + snap_volume->name); + return -EINVAL; + } + + LOG_WARNING("Invalidating full/corrupt snapshot %s\n", + snap_volume->name); + LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n"); + + if (snap_volume->cow_table) { + snap_volume->cow_table[0].pv_org_rsector = + cpu_to_le64(((u64)1)); + if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) { + LOG_SERIOUS("Unable to invalidate snapshot %s\n", + snap_volume->name); + } + } else { + LOG_SERIOUS("Unable to invalidate snapshot %s\n", + snap_volume->name); + } + + snap_volume->lv_status &= ~LV_ACTIVE; + return 0; +} + +/** + * remove_snapshot_from_chain + * + * Remove a snapshot volume from its original's chain of snapshots. This + * does not delete the snapshot volume. At runtime, we cannot delete + * volumes at the region-manager level, because EVMS may have this volume + * exported, and there is no way to notify EVMS of the deletion. It will + * eventually need to be deleted in the engine, which will then tell the + * EVMS kernel services to delete the volume in the kernel. + **/ +static int remove_snapshot_from_chain(struct lvm_logical_volume * snap_volume) +{ + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org; + struct lvm_logical_volume ** p_volume; + + if (org_volume) { + for ( p_volume = &org_volume->snapshot_next; + *p_volume; + p_volume = &(*p_volume)->snapshot_next ) { + if ( *p_volume == snap_volume ) { + *p_volume = snap_volume->snapshot_next; + break; + } + } + } + + snap_volume->snapshot_org = NULL; + snap_volume->snapshot_next = NULL; + return 0; +} + +/** + * snapshot_hash + * + * The snapshot hash tables are NEVER going to have 4 billion entries, so + * we can safely cast the org_sector to 32 bits and just mod it by the + * hash table size. + **/ +static u32 snapshot_hash(u64 org_sector, + struct lvm_logical_volume * snap_volume) +{ + return (((u32)org_sector) % snap_volume->hash_table_size); +} + +/** + * snapshot_search_hash_chain + * + * Search the hash chain that is anchored at the specified head pointer. + * If the sector number is found, the result pointer is set to that entry + * in the chain, and a 1 is returned. If the sector is not found, the + * result pointer is set to the previous entry and 0 is returned. If the + * result pointer is NULL, this means either the list is empty, or the + * specified sector should become the first list item. + **/ +static int snapshot_search_hash_chain(u64 org_sector, + struct snapshot_map_entry * head, + struct snapshot_map_entry ** result) +{ + struct snapshot_map_entry * curr = head; + struct snapshot_map_entry * prev = head; + while ( curr && curr->org_sector < org_sector ) { + prev = curr; + curr = curr->next; + } + if (!curr) { + /* Either an empty chain or went off the end of the chain. */ + *result = prev; + return 0; + } else if ( curr->org_sector != org_sector ) { + *result = curr->prev; + return 0; + } else { + /* Found the desired sector. */ + *result = curr; + return 1; + } +} + +/** + * insert_snapshot_map_entry + * + * Insert a new entry into a snapshot hash chain, immediately following the + * specified entry. This function should not be used to add an entry into + * an empty list, or as the first entry in an existing list. For that case, + * use insert_snapshot_map_entry_at_head(). 
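+ * Typical usage: snapshot_search_hash_chain() returns 0 with *result set
+ * to the predecessor entry, which is then passed in here as the base
+ * (see add_cow_entry_to_snapshot_map() and snapshot_copy_data()).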
+ **/ +static int insert_snapshot_map_entry(struct snapshot_map_entry * entry, + struct snapshot_map_entry * base) +{ + entry->next = base->next; + entry->prev = base; + base->next = entry; + if (entry->next) { + entry->next->prev = entry; + } + return 0; +} + +/** + * insert_snapshot_map_entry_at_head + * + * Insert a new entry into a snapshot chain as the first entry. + **/ +static int insert_snapshot_map_entry_at_head(struct snapshot_map_entry * entry, + struct snapshot_map_entry ** head) +{ + entry->next = *head; + entry->prev = NULL; + *head = entry; + if (entry->next) { + entry->next->prev = entry; + } + return 0; +} + +/** + * add_cow_entry_to_snapshot_map + * + * Convert a cow table entry (from the on-disk data) into an appropriate + * entry for the snapshot map. Insert this new entry into the appropriate + * map for the specified volume. + * + * The cow_entry passed into this function must have already been + * endian-converted from disk-order to cpu-order. + **/ +static int add_cow_entry_to_snapshot_map(struct lv_COW_table_disk * cow_entry, + struct lvm_logical_volume * volume) +{ + struct snapshot_map_entry * new_entry, * target_entry; + struct snapshot_map_entry ** hash_table, * chain_head; + u32 hash_value; + + if ( cow_entry->pv_org_number == 0 ) { + return -EINVAL; + } + + new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector, + cow_entry->pv_snap_rsector); + if (!new_entry) { + return -ENOMEM; + } + + new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number, + volume->group); + if (!new_entry->snap_pv) { + kfree(new_entry); + return -EINVAL; + } + + hash_value = snapshot_hash(new_entry->org_sector, volume); + hash_table = volume->snapshot_map[cow_entry->pv_org_number]; + chain_head = hash_table[hash_value]; + if ( snapshot_search_hash_chain(new_entry->org_sector, + chain_head, &target_entry) ) { + /* In general, we should not find this entry in the snapshot + * map already. However, it could happen on a re-discover, but + * the build_snapshot_maps function should weed out those cases. + * In either event, we can simply ignore duplicates. + */ + LOG_WARNING("Detected a duplicate snapshot map entry\n"); + LOG_WARNING("Snap PV "PFU64":"PFU64", Org PV "PFU64":"PFU64"\n", + cow_entry->pv_snap_number, + cow_entry->pv_snap_rsector, + cow_entry->pv_org_number, + cow_entry->pv_org_rsector); + kfree(new_entry); + } else { + if (target_entry) { + insert_snapshot_map_entry(new_entry, target_entry); + } else { + insert_snapshot_map_entry_at_head(new_entry, + &hash_table[hash_value]); + } + } + + return 0; +} + +/** + * snapshot_remap_sector + * + * Perform a sector remap on a snapshot volume. This should be called from + * the I/O read path, after the LE-to-PE translation has already been + * performed. First, determine the base sector of the chunk containing the + * specified sector, and save the remainder. Then, perform a search through + * the snapshot map for the specified volume. If an match is found, change + * the PV and sector numbers to the new values. If no match is found, leave + * the values alone, meaning the read should proceed down the original + * volume. + **/ +static void +snapshot_remap_sector(struct lvm_logical_volume * snap_volume, + u64 pe_start_sector, + u64 * sector, + struct lvm_physical_volume ** pv_entry) +{ + struct snapshot_map_entry ** hash_table; + struct snapshot_map_entry * chain_head, * result; + u32 hash_value; + u64 chunk_sector, remainder; + + if ( ! 
(snap_volume->lv_access & LV_SNAPSHOT) ) { + return; + } + + chunk_sector = ((*sector - pe_start_sector) & + ((u64)(~(snap_volume->chunk_size - 1)))) + + pe_start_sector; + remainder = *sector - chunk_sector; + hash_value = snapshot_hash(chunk_sector, snap_volume); + hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number]; + chain_head = hash_table[hash_value]; + + if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) { + *pv_entry = result->snap_pv; + *sector = result->snap_sector + remainder; + } +} + +/** + * snapshot_read_write_chunk + * + * This function takes care of reading one chunk of data from the + * original, and writing it to the snapshot. Since the original now has + * a fixed sized buffer for this data, we may have to loop to get the + * whole chunk copied. + **/ +static int snapshot_read_write_chunk(struct lvm_logical_volume * org_volume, + struct lvm_physical_volume * org_pv, + u64 chunk_sector, + struct lvm_logical_volume * snap_volume, + struct lvm_physical_volume ** snap_pv, + u64 * snap_sector) +{ + u32 io_size = snap_volume->chunk_size; + u64 snap_pe_start_sector, size; + int i, iterations = 1; + + if ( org_volume->chunk_size < snap_volume->chunk_size ) { + iterations = snap_volume->chunk_size / org_volume->chunk_size; + io_size = org_volume->chunk_size; + } + + remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1, + snap_sector, &size, &snap_pe_start_sector, snap_pv); + + /* Check for an incomplete volume. */ + if (!*snap_sector || !*snap_pv) { + invalidate_snapshot_volume(snap_volume); + return -1; + } + + for ( i = 0; i < iterations; i++ ) { + + /* Read the chunk from the original volume. This is a physical + * read, not logical. Thus, stripe boundary considerations are + * unnecessary. Also, chunks are always aligned with PEs, so PE + * boundary considerations are unnecessary. + */ + if ( INIT_IO(org_pv->logical_node, 0, + chunk_sector + i * io_size, io_size, + org_volume->chunk_data_buffer) ) { + return 1; + } + + /* Write this chunk to the snapshot volume. This does duplicate + * the local init_io code, but we need to have the remapped + * sector later on, so this is slightly more efficient. Snapshot + * volumes cannot be striped, so there is no need to consider + * stripe-boundary conditions. And just like the read in the + * previous line, chunks are always aligned with PEs, so we + * don't have to consider PE-boundary conditions. + */ + if ( INIT_IO((*snap_pv)->logical_node, 1, + *snap_sector + i * io_size, io_size, + org_volume->chunk_data_buffer) ) { + /* An error writing the chunk to the snapshot is the + * same situation as the snapshot being full. + */ + invalidate_snapshot_volume(snap_volume); + return -1; + } + } + + return 0; +} + +/** + * snapshot_copy_data + * + * On a write to a snapshotted volume, check all snapshots to see if the + * specified chunk has already been remapped. If it has not, read the + * original data from the volume, write the data to the next available + * chunk on the snapshot, update the COW table, write the COW table to + * the snapshot, and insert a new entry into the snapshot map. + * + * Now converted to copy data to a single snapshot. The looping is left + * up to lvm_write. 
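+ *
+ * Return values, as implemented below: 0 when the chunk was copied or no
+ * copy was needed, -EIO if the chunk could not be read from the original,
+ * -ENOMEM if a new map entry could not be allocated. Failures writing to
+ * the snapshot invalidate the snapshot instead of failing the write.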
+ **/ +static int snapshot_copy_data(struct lvm_logical_volume * org_volume, + struct lvm_logical_volume * snap_volume, + u64 pe_start_sector, + u64 org_sector, + struct lvm_physical_volume * org_pv) +{ + struct lvm_physical_volume * snap_pv; + struct snapshot_map_entry ** hash_table, * chain_head; + struct snapshot_map_entry * target_entry, * new_map_entry; + u64 chunk_sector, snap_sector; + u32 hash_value; + int rc = 0; + + /* Lock out this snapshot while we are remapping. */ + down(&snap_volume->snap_semaphore); + + /* Make sure the snapshot has not been deactivated. */ + if ( ! (snap_volume->lv_status & LV_ACTIVE) ) { + goto out; + } + + /* Search the hash table to see if this sector has already been + * remapped on this snapshot. + */ + chunk_sector = ((org_sector - pe_start_sector) & + ((u64)(~(snap_volume->chunk_size - 1)))) + + pe_start_sector; + hash_value = snapshot_hash(chunk_sector, snap_volume); + hash_table = snap_volume->snapshot_map[org_pv->pv_number]; + chain_head = hash_table[hash_value]; + + if ( snapshot_search_hash_chain(chunk_sector, + chain_head, &target_entry) ) { + /* Chunk is already remapped. */ + goto out; + } + + /* Is there room on the snapshot to remap this chunk? */ + if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) { + /* At this point, the snapshot is full. Any further + * writes to the original will cause the snapshot to + * become "corrupt" because they can't be remapped. + * Take this snapshot permanently offline. + */ + goto out_invalidate; + } + + rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector, + snap_volume, &snap_pv, &snap_sector); + if (rc) { + rc = (rc > 0) ? -EIO : 0; + goto out; + } + + /* Fill in the appropriate COW table entry and write that + * metadata sector back to the snapshot volume. Since we are + * only writing one sector, there are no boundary conditions. + * Must endian-convert each entry as it is added. + */ + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number = + cpu_to_le64((u64)(org_pv->pv_number)); + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector = + cpu_to_le64p(&chunk_sector); + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number = + cpu_to_le64((u64)(snap_pv->pv_number)); + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector = + cpu_to_le64p(&snap_sector); + + if ( lvm_init_io(snap_volume->volume_node, 4, + snap_volume->current_cow_sector, + 1, snap_volume->cow_table) ) { + /* The data was written to the snapshot, but + * writing the metadata failed. + */ + goto out_invalidate; + } + + snap_volume->next_cow_entry++; + if ( snap_volume->next_cow_entry >= + (EVMS_VSECTOR_SIZE / sizeof (struct lv_COW_table_disk)) ) { + snap_volume->next_cow_entry = 0; + snap_volume->current_cow_sector++; + memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE); + if ( lvm_init_io(snap_volume->volume_node, 4, + snap_volume->current_cow_sector, + 1, snap_volume->cow_table) ) { + /* Can't clear out the next sector of metadata. */ + goto out_invalidate; + } + } + snap_volume->next_free_chunk += snap_volume->chunk_size; + + /* Create a new snapshot map entry and add it in the appropriate + * place in the map. 
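+ * target_entry still holds the result of the hash-chain search performed
+ * above; a NULL value means the new entry becomes the head of its chain.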
+ */ + new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector); + if (!new_map_entry) { + rc = -ENOMEM; + goto out_invalidate; + } + new_map_entry->snap_pv = snap_pv; + if (target_entry) { + insert_snapshot_map_entry(new_map_entry, target_entry); + } else { + insert_snapshot_map_entry_at_head(new_map_entry, + &(hash_table[hash_value])); + } + +out: + up(&snap_volume->snap_semaphore); + return rc; + +out_invalidate: + invalidate_snapshot_volume(snap_volume); + goto out; +} + +/** + * get_snapshot_stats + **/ +static int get_snapshot_stats(struct lvm_snapshot_stat_ioctl * snap_stats) +{ + struct lvm_logical_volume * volume; + struct lvm_volume_group * group; + + /* Make sure the parameters are in range. */ + if ( snap_stats->lv_number < 1 || snap_stats->lv_number > MAX_LV ) { + return 1; + } + + /* Make sure the specified group and volume exist, and that + * this is a snapshot volume. + */ + find_group_by_uuid(snap_stats->vg_uuid, &group); + if ( ! group || + ! (volume = group->volume_list[snap_stats->lv_number]) || + ! (volume->lv_access & LV_SNAPSHOT) ) { + return 1; + } + + /* Return the starting LBA of the next available chunk. */ + snap_stats->next_free_chunk = volume->next_free_chunk; + snap_stats->lv_status = volume->lv_status; + + return 0; +} + + +/********** Memory Allocation/Deallocation Functions **********/ + + +/** + * deallocate_physical_volume + * + * Free the memory used by this physical volume. Do not delete the EVMS + * node in this function, since this could be called during an error + * path when we want to save the logical node. + **/ +static int deallocate_physical_volume(struct lvm_physical_volume * pv_entry) +{ + if (pv_entry->pv) { + kfree(pv_entry->pv); + pv_entry->pv = NULL; + } + + if (pv_entry->pe_map) { + vfree(pv_entry->pe_map); + pv_entry->pe_map = NULL; + } + + kfree(pv_entry); + return 0; +} + +/** + * allocate_physical_volume + * + * Create a new struct lvm_physical_volume for the specified volume group. + * Initialize the new PV with the evms node and lvm pv information. + **/ +static struct lvm_physical_volume * +allocate_physical_volume(struct evms_logical_node * node, struct pv_disk * pv) +{ + struct lvm_physical_volume * new_pv; + + new_pv = kmalloc(sizeof(struct lvm_physical_volume), GFP_NOIO); + if (!new_pv) { + LOG_CRITICAL("Error allocating physical volume for %s.\n", + node->name); + kfree(pv); + goto out; + } + + /* Initialize the PV. */ + memset(new_pv, 0, sizeof(struct lvm_physical_volume)); + new_pv->logical_node = node; + new_pv->pv = pv; + new_pv->pv_number = pv->pv_number; + +out: + return new_pv; +} + +/** + * allocate_snapshot_map_entry + * + * Allocate memory for a new entry in the snapshot map and fill in the + * sector values. The PV pointer is not filled in here, but can easily + * be found by using the find_pv_by_number function. + **/ +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector, + u64 snap_sector) +{ + struct snapshot_map_entry * new_entry; + + new_entry = kmalloc(sizeof(struct snapshot_map_entry), GFP_NOIO); + if (!new_entry) { + goto out; + } + memset(new_entry, 0, sizeof(struct snapshot_map_entry)); + new_entry->org_sector = org_sector; + new_entry->snap_sector = snap_sector; +out: + return new_entry; +} + +/** + * deallocate_snapshot_map + * + * This function will delete one hash table, which is part of the whole + * snapshot remapping structure. Each hash table is an array of pointers + * to linked lists of struct snapshot_map_entry's. 
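+ * The table itself was vmalloc'd in allocate_logical_volume(), so it is
+ * released with vfree(); the individual entries were kmalloc'd and are
+ * kfree'd one at a time.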
+ **/ +static int deallocate_snapshot_map(struct snapshot_map_entry ** table, + u32 table_size) +{ + struct snapshot_map_entry * entry, * next; + int i; + + if (table) { + for ( i = 0; i < table_size; i++ ) { + for ( entry = table[i]; entry; entry = next ) { + next = entry->next; + kfree(entry); + } + } + vfree(table); + } + return 0; +} + +/** + * deallocate_logical_volume + * + * Delete the in-memory representation of a single LVM logical volume, + * including its PE map and any snapshot data. Do not alter the parent + * volume group, except to remove this volume from its volume list. + **/ +static int deallocate_logical_volume(struct lvm_logical_volume * volume) +{ + struct lvm_volume_group * group = volume->group; + struct lvm_logical_volume * org_volume, * snap_volume; + int i; + + if ( volume->lv_access & LV_SNAPSHOT ) { + /* This volume is a snapshot. Remove it from the linked + * list of volumes that are snapshotting the original. + * First, the original volume must be quiesced. + */ + org_volume = volume->snapshot_org; + + if ( snapshot_check_quiesce_original(volume) ) { + return -EINVAL; + } + + remove_snapshot_from_chain(volume); + + /* If the snapshot that was just removed was the last/only + * volume snapshotting the original, then mark the original + * as no longer being snapshotted. + */ + if ( org_volume && !org_volume->snapshot_next ) { + org_volume->lv_access &= ~LV_SNAPSHOT_ORG; + } + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) { + /* If this volume is a snapshot original, all of its snapshots + * must also be deleted. However, Those deletions need to be + * taken care of by the engine. So just check that they have + * all been quiesced before removing the original. + */ + if ( snapshot_check_quiesce_all(volume) ) { + return -EINVAL; + } + + /* In case there are any snapshots remaining, we must clear out + * their pointers to this original to prevent errors when those + * snapshots are accessed or deleted. + */ + for ( snap_volume = volume->snapshot_next; + snap_volume; snap_volume = snap_volume->snapshot_next ) { + snap_volume->snapshot_org = NULL; + } + } + + if (volume->name) { + LOG_DEBUG("Deleting volume %s\n", volume->name); + } + + /* Free all the memory. This includes the LE-to-PE map, any snapshot + * hash tables, the COW table, and chunk data buffer. + */ + if (volume->le_map) { + vfree(volume->le_map); + volume->le_map = NULL; + } + if (volume->snapshot_map) { + for ( i = 1; i <= group->pv_count; i++ ) { + deallocate_snapshot_map(volume->snapshot_map[i], + volume->hash_table_size); + } + kfree(volume->snapshot_map); + volume->snapshot_map = NULL; + } + if (volume->cow_table) { + kfree(volume->cow_table); + volume->cow_table = NULL; + } + if (volume->chunk_data_buffer) { + kfree(volume->chunk_data_buffer); + volume->chunk_data_buffer = NULL; + } + + /* Remove this volume from the group's list. */ + if ( group && group->volume_list[volume->lv_number] == volume ) { + group->volume_list[volume->lv_number] = NULL; + group->volume_count--; + } + + kfree(volume); + return 0; +} + +/** + * allocate_logical_volume + * + * Allocate space for a new LVM logical volume, including space for the + * LE-to-PE map and any necessary snapshot data. + **/ +static struct lvm_logical_volume * +allocate_logical_volume(struct lv_disk * lv, struct lvm_volume_group * group) +{ + struct lvm_logical_volume * new_volume; + u32 table_entries_per_chunk, table_chunks; + int i; + + /* Allocate space for the new logical volume. 
*/ + new_volume = kmalloc(sizeof(struct lvm_logical_volume), GFP_NOIO); + if (!new_volume) { + LOG_CRITICAL("Error allocating new logical volume %s\n", + lv->lv_name); + goto out; + } + memset(new_volume, 0, sizeof(struct lvm_logical_volume)); + + /* Allocate space for the LE to PE mapping table. */ + new_volume->le_map = vmalloc(lv->lv_allocated_le * + sizeof(struct le_table_entry)); + if (!new_volume->le_map) { + LOG_CRITICAL("Error creating LE map for logical volume %s\n", + lv->lv_name); + goto error; + } + memset(new_volume->le_map, 0, + lv->lv_allocated_le * sizeof(struct le_table_entry)); + + /* Initialize the rest of the new volume. + * Need the +1 on lv_number to match the PE Map entries on the PV. + */ + new_volume->lv_number = lv->lv_number + 1; + new_volume->lv_size = lv->lv_size; + new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED; + new_volume->lv_status = lv->lv_status | LV_ACTIVE; + new_volume->lv_minor = MINOR(lv->lv_dev); + new_volume->stripes = lv->lv_stripes; + new_volume->stripe_size = lv->lv_stripesize; + new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize); + new_volume->pe_size = group->vg->pe_size; + new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size); + new_volume->num_le = lv->lv_allocated_le; + new_volume->group = group; + /* Different naming scheme for EVMS nodes. */ + if ( translate_lv_name(lv->lv_name, new_volume->name) ) { + goto error; + } + + if ( new_volume->lv_access & LV_SNAPSHOT ) { + /* This volume is a snapshot, initialize the remaining data, + * and allocate space for the remapping structures, and one + * sector's worth of COW tables. + */ + new_volume->chunk_size = lv->lv_chunk_size; + new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size; + new_volume->snap_org_minor = lv->lv_snapshot_minor; + new_volume->next_cow_entry = 0; + new_volume->current_cow_sector = 0; + table_entries_per_chunk = (new_volume->chunk_size << + EVMS_VSECTOR_SIZE_SHIFT) / + sizeof(struct lv_COW_table_disk); + table_chunks = (new_volume->num_chunks + + table_entries_per_chunk - 1) / + table_entries_per_chunk; + new_volume->next_free_chunk = table_chunks * + new_volume->chunk_size; + new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size / + MAX_HASH_CHAIN_ENTRIES) + 1; + + new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO); + if (!new_volume->cow_table) { + LOG_CRITICAL("Error allocating COW table for logical volume %s\n", + lv->lv_name); + goto error; + } + memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE); + + new_volume->snapshot_map = kmalloc((group->pv_count + 1) * + sizeof(struct snapshot_map_entry **), + GFP_NOIO); + if (!new_volume->snapshot_map) { + LOG_CRITICAL("Error allocating snapshot map for logical volume %s\n", + lv->lv_name); + goto error; + } + + new_volume->snapshot_map[0] = NULL; + for ( i = 1; i <= group->pv_count; i++ ) { + new_volume->snapshot_map[i] = + vmalloc(new_volume->hash_table_size * + sizeof(struct snapshot_map_entry *)); + if (!new_volume->snapshot_map[i]) { + LOG_CRITICAL("Error allocating snapshot sub-map for logical volume %s\n", + lv->lv_name); + goto error; + } + memset(new_volume->snapshot_map[i], 0, + new_volume->hash_table_size * + sizeof(struct snapshot_map_entry *)); + } + init_MUTEX(&new_volume->snap_semaphore); + } else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) { + /* This volume is a snapshot original, allocate space to use for + * copying snapshot chunks. This will now be a fixed size + * instead of being based on the chunk size of the snapshots. 
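+ * CHUNK_DATA_BUFFER_SIZE is in sectors. When a snapshot's chunk size is
+ * larger than this buffer, snapshot_read_write_chunk() simply copies the
+ * chunk in multiple passes.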
+ */ + new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE; + new_volume->chunk_data_buffer = + kmalloc(new_volume->chunk_size << + EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO); + if (!new_volume->chunk_data_buffer) { + LOG_SERIOUS("Error allocating snapshot chunk buffer for logical volume %s\n", + lv->lv_name); + goto error; + } + memset(new_volume->chunk_data_buffer, 0, + new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT); + } + +out: + return new_volume; +error: + deallocate_logical_volume(new_volume); + new_volume = NULL; + goto out; +} + +/** + * deallocate_volume_group + * + * Delete the entire in-memory representation of an LVM volume group, + * including all PVs and logical volumes. If this group is on LVM's + * volume group list, remove it. + **/ +static int deallocate_volume_group(struct lvm_volume_group * group) +{ + struct lvm_physical_volume * pv_entry, * next_pv; + int i; + + LOG_DEBUG("Deleting volume group %s\n", group->vg_name); + + /* Remove the group from the global list. */ + remove_group_from_list(group); + + /* Delete the LV metadata array. */ + if (group->lv_array) { + vfree(group->lv_array); + group->lv_array = NULL; + } + + /* Delete the PV UUID list. */ + if (group->uuid_list) { + vfree(group->uuid_list); + group->uuid_list = NULL; + } + + /* Delete all logical volumes. */ + for ( i = 1; i <= MAX_LV; i++ ) { + if (group->volume_list[i]) { + deallocate_logical_volume(group->volume_list[i]); + group->volume_list[i] = NULL; + } + } + + /* Delete all PVs from the group's list. */ + for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) { + next_pv = pv_entry->next; + if (pv_entry->logical_node) { + /* Send a delete command down to the segment manager. */ + LOG_DEBUG("Deleting PV %s from group %s\n", + pv_entry->logical_node->name, group->vg_name); + DELETE(pv_entry->logical_node); + pv_entry->logical_node = NULL; + } + deallocate_physical_volume(pv_entry); + } + + /* Delete the VG metadata. */ + if (group->vg) { + kfree(group->vg); + group->vg = NULL; + } + + kfree(group); + return 0; +} + +/** + * allocate_volume_group + * + * Allocate space for a new LVM volume group and all of its sub-fields. + * Initialize the appropriate fields. + * vg parameter should already have an allocate/initialized struct vg_disk. + **/ +static struct lvm_volume_group * allocate_volume_group(struct vg_disk * vg, + u8 * vg_name) +{ + struct lvm_volume_group * new_group; + + /* The volume group itself. */ + new_group = kmalloc(sizeof(struct lvm_volume_group), GFP_NOIO); + if (!new_group) { + kfree(vg); + goto out; + } + + /* Initialize the new group. */ + memset(new_group, 0, sizeof(struct lvm_volume_group)); + memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN); + strncpy(new_group->vg_name, vg_name, NAME_LEN - 1); + new_group->vg = vg; + /* Default sector and block sizes. */ + new_group->hard_sect_size = 512; + new_group->block_size = 1024; + new_group->flags = EVMS_VG_DIRTY; + + LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name); + +out: + return new_group; +} + +/** + * remove_pv_from_group + * + * In the engine, when a PV is removed from a group (on a vgreduce), that + * same PV must be removed from that group in the kernel. Otherwise, when + * the rediscover occurs, that PV will still appear in the group, and + * will cause segfaults when we try to read metadata from it. 
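+ *
+ * Silently returns 0 if the PV number is out of range or the group/PV
+ * cannot be found (there is nothing to remove); returns -EINVAL if the
+ * PV still owns LEs belonging to some LV in the group.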
+ **/ +static int remove_pv_from_group(int pv_number, unsigned char * vg_uuid) +{ + struct lvm_volume_group * group; + struct lvm_physical_volume * pv_entry; + struct lvm_physical_volume ** p_pv_entry; + + /* Make sure the numbers are in range. */ + if ( pv_number < 0 || pv_number > MAX_PV ) { + return 0; + } + + /* Make sure the group exists. */ + find_group_by_uuid(vg_uuid, &group); + if (!group) { + return 0; + } + + /* Make sure the PV is in this group. */ + pv_entry = find_pv_by_number(pv_number, group); + if (!pv_entry) { + LOG_WARNING("Did not find PV %d in group %s\n", + pv_number, group->vg_name); + return 0; + } + + /* Make sure the PV is not in use by any volumes. */ + if ( check_pv_for_lv(pv_entry, group) ) { + LOG_SERIOUS("PV %d in group %s still contains LVs\n", + pv_number, group->vg_name); + return -EINVAL; + } + + /* Take this PV out of the group's list. */ + for ( p_pv_entry = &group->pv_list; + *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) { + if ( *p_pv_entry == pv_entry ) { + *p_pv_entry = (*p_pv_entry)->next; + pv_entry->next = NULL; + break; + } + } + + group->pv_count--; + + /* There is no way that this PV was the last in this group, so the + * group never needs to be deleted at this point. The only way this + * group will exist in the kernel is if there are volumes exported from + * it. If this was the last PV, then those volumes must be on that PV, + * and it wouldn't be allowed to be removed from the group (above). + */ + + /* Free up the memory for this PV. Just drop the node. */ + deallocate_physical_volume(pv_entry); + + LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name); + return 0; +} + + +/********** Consistency Checking Functions **********/ + + +/** + * clear_le_entries_for_missing_pv + * + * In the event that a PV turns up missing during a rediscover, we + * need to erase any LE map entries that might point to it. + **/ +static void +clear_le_entries_for_missing_pv(struct lvm_volume_group * group, + struct lvm_physical_volume * pv_entry) +{ + struct lvm_logical_volume * volume; + int i, j; + + for ( i = 1; i <= MAX_LV; i++ ) { + if (group->volume_list[i]) { + volume = group->volume_list[i]; + for ( j = 0; j < volume->num_le; j++ ) { + if ( volume->le_map[j].owning_pv == pv_entry ) { + volume->le_map[j].owning_pv = NULL; + volume->le_map[j].pe_sector_offset = 0; + } + } + } + } +} + +/** + * check_volume_groups + * + * This function performs some simple consistency checks on all dirty + * volume groups. Any groups that have no PVs are deleted. If any metadata + * structures (PV or VG) are missing, they are read in from disk. + **/ +static int check_volume_groups(void) +{ + struct lvm_volume_group * group, * next_group; + struct lvm_physical_volume * pv_entry, * next_pv; + int rc = 0; + + for ( group = lvm_group_list; group; group = next_group ) { + next_group = group->next_group; + + LOG_DEBUG("Checking Group %s\n", group->vg_name); + + /* If a group has no PVs, it can be safely deleted, + * because we can't find any volumes on it. + */ + if (!group->pv_count) { + LOG_WARNING("No PVs found for Group %s.\n", + group->vg_name); + if (!group->volume_count) { + deallocate_volume_group(group); + } + continue; + } + + /* Make sure all metadata for the PVs is present. On a + * rediscover, it may be missing, because we delete it at the + * end of discovery. If any is missing, read it in from disk. + * This is only necessary in the kernel. It can't happen in + * the engine. 
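+ * next_pv is saved at the top of the loop below because
+ * remove_pv_from_group() can unlink and free the current entry.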
+ */ + for ( pv_entry = group->pv_list; + pv_entry; pv_entry = next_pv ) { + next_pv = pv_entry->next; + if (!pv_entry->pv) { + LOG_DEBUG("Re-reading PV metadata for %s\n", + pv_entry->logical_node->name); + rc = read_pv(pv_entry->logical_node, + &pv_entry->pv); + if (rc) { + /* What happens if we can't re-read the + * PV metadata? This PV must be removed + * from the group. Need to also clear + * all LE entries in all LVs that are + * pointing to this PV before it can be + * removed from the list. + */ + LOG_SERIOUS("PV metadata is missing or cannot be read from %s\n", + pv_entry->logical_node->name); + clear_le_entries_for_missing_pv(group, + pv_entry); + remove_pv_from_group(pv_entry->pv_number, + group->vg_uuid); + continue; + } + pv_entry->pv_number = pv_entry->pv->pv_number; + + /* Check for a "stale" PV. This case should be + * already be covered, as long as the Engine is + * calling the PV_REMOVE ioctl when it does a + * vgreduce or a pvremove. If this is the last + * PV in the group, the group will be deleted. + */ + if (!pv_entry->pv_number) { + remove_pv_from_group(0, group->vg_uuid); + continue; + } + } + + if (!pv_entry->pe_map) { + LOG_DEBUG("Re-reading PE maps for %s\n", + pv_entry->logical_node->name); + rc = read_pe_map(pv_entry); + if (rc) { + LOG_WARNING("Error reading PE maps for %s\n", + pv_entry->logical_node->name); + LOG_WARNING("Any volumes residing on %s will be incomplete!\n", + pv_entry->logical_node->name); + } + } + } + + /* Make sure the metadata for the VG is present. If it's + * missing, read it in from the first PV in the VG. + */ + if (!group->vg && group->pv_count) { + LOG_DEBUG("Re-reading VG metadata for Group %s\n", + group->vg_name); + pv_entry = group->pv_list; + rc = read_vg(pv_entry->logical_node, + pv_entry->pv, &group->vg); + if (rc) { + /* What happens if we can't re-read the + * VG metadata? It's definitely bad + * news. Should we delete the VG? + */ + continue; + } + } + + /* Display a warning if the number of PVs found for the group + * doesn't match the number of PVs recorded for the VG. + */ + if ( group->vg && group->pv_count != group->vg->pv_cur ) { + LOG_WARNING("Group %s is incomplete.\n", + group->vg_name); + LOG_WARNING(" Only %d of %d PVs found.\n", + group->pv_count, group->vg->pv_cur); + LOG_WARNING(" Volumes in this group may be incomplete.\n"); + } + } + + return 0; +} + +/** + * check_le_maps + * + * Make sure all volumes in this group have valid LE-to-PE maps. Any + * volume that doesn't is marked as incomplete. This is safe for + * re-discovery because only new volumes could have corrupted LE maps. + **/ +static int check_le_maps(struct lvm_volume_group * group) +{ + struct lvm_logical_volume * volume; + int i, j, count; + + for ( i = 1; i <= MAX_LV; i++ ) { + volume = group->volume_list[i]; + if (!volume) { + continue; + } + + if (!volume->le_map) { + /* No point in keeping the volume around if it has + * no LE map at all. + */ + LOG_SERIOUS("Volume %s has no LE map.\n", volume->name); + deallocate_logical_volume(volume); + continue; + } + + /* If any entries in the LE map are missing, mark this volume + * as incomplete. 
+ */ + for ( j = 0, count = 0; j < volume->num_le; j++ ) { + if ( !volume->le_map[j].owning_pv || + !volume->le_map[j].pe_sector_offset) { + count++; + } + } + if (count) { + LOG_SERIOUS("Volume %s has incomplete LE map.\n", + volume->name); + LOG_SERIOUS(" Missing %d out of %d LEs.\n", + count, volume->num_le); + volume->lv_access |= EVMS_LV_INCOMPLETE; + } + } + return 0; +} + +/** + * check_snapshot_map + * + * For snapshot volumes, make sure the snapshot map is intact, and that + * any existing entries in the map are in the correct order and there + * are no duplicate entries. + **/ +static int check_snapshot_map(struct lvm_logical_volume * snap_volume) +{ + struct snapshot_map_entry ** table, * curr; + int i, j; + + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) { + return 0; + } + if (!snap_volume->snapshot_map) { + snap_volume->lv_access |= EVMS_LV_INVALID; + return -EINVAL; + } + + for ( i = 1; i <= snap_volume->group->pv_count; i++ ) { + if (!snap_volume->snapshot_map[i]) { + snap_volume->lv_access |= EVMS_LV_INVALID; + return -EINVAL; + } + table = snap_volume->snapshot_map[i]; + for ( j = 0; j < snap_volume->hash_table_size; j++ ) { + for ( curr = table[j]; curr; curr = curr->next ) { + if ( curr->next && + curr->org_sector >= + curr->next->org_sector) { + snap_volume->lv_access |= + EVMS_LV_INVALID; + return -EINVAL; + } + } + } + } + return 0; +} + +/** + * check_logical_volumes + * + * Perform a consistency check on all of the logical volumes that have been + * discovered. Any volume that has any inconsistencies will be marked as + * incomplete or invalid, depending on the severity of the problem. At the + * end, all invalid volumes are deleted. If the deleted_incompletes + * parameter is set, those will also be deleted. + **/ +static int check_logical_volumes(int final_discovery) +{ + struct lvm_volume_group * group; + struct lvm_logical_volume * volume, * snap, * next; + int count, i, j; + + /* Check every valid, dirty volume group. */ + for ( group = lvm_group_list; group; group = group->next_group ) { + if ( ! (group->flags & EVMS_VG_DIRTY) ) { + continue; + } + /* Check every valid volume in this group. */ + for ( i = 1; i <= MAX_LV; i++ ) { + volume = group->volume_list[i]; + if (!volume) { + continue; + } + + LOG_DEBUG("Checking logical volume %s\n", volume->name); + + if (!volume->group) { + volume->group = group; + } + + /* All LE-map entries must have valid values. The I/O + * paths now detect missing LE entries. + */ + if (volume->le_map) { + for ( j = 0, count = 0; + j < volume->num_le; j++ ) { + if ( !volume->le_map[j].owning_pv || + !volume->le_map[j].pe_sector_offset ) { + count++; + } + } + if (count) { + LOG_SERIOUS("Volume %s has incomplete LE map.\n", + volume->name); + LOG_SERIOUS(" Missing %d out of %d LEs.\n", + count, volume->num_le); + volume->lv_access |= EVMS_LV_INCOMPLETE; + } else { + /* In case this volume was previously + * marked incomplete. + */ + volume->lv_access &= + ~EVMS_LV_INCOMPLETE; + } + } else { + /* This should only ever happen due to + * memory corruption. + */ + LOG_SERIOUS("Volume %s has no LE map.\n", + volume->name); + volume->lv_access |= EVMS_LV_INVALID; + } + + if ( volume->lv_access & LV_SNAPSHOT_ORG ) { + /* For a snapshot original, check all snapshots + * in the chain, to make sure they point back to + * the original. Also, make sure there is memory + * for the chunk buffer. 
+ */ + for ( snap = volume->snapshot_next, count = 0; + snap; + snap = snap->snapshot_next, count++ ) { + if ( snap->snapshot_org != volume ) { + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", + volume->name); + snap->snapshot_org = NULL; + snap->lv_access |= + EVMS_LV_INVALID; + } + } + if (!count) { + LOG_WARNING("No snapshots found for volume %s\n", + volume->name); + if (final_discovery) { + volume->lv_access &= + ~LV_SNAPSHOT_ORG; + } + } else if (!volume->chunk_data_buffer) { + volume->lv_access |= EVMS_LV_INVALID; + } + } else if ( volume->lv_access & LV_SNAPSHOT ) { + /* For a snapshot volume, make sure it points + * back to its original. Also make sure there is + * memory for the cow table, and that any + * existing snapshot entries in the snapshot map + * are correctly ordered. + */ + /* Is there a COW table? */ + if (!volume->cow_table) { + LOG_SERIOUS("Snapshot volume %s has no COW table\n", + volume->name); + volume->lv_access |= EVMS_LV_INVALID; + } + /* Is the snapshot map in order? */ + if ( check_snapshot_map(volume) ) { + LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n", + volume->name); + volume->lv_access |= EVMS_LV_INVALID; + } + /* Is there an original volume? This is only + * a real problem during final discovery. + */ + if (!volume->snapshot_org) { + LOG_SERIOUS("Snapshot volume %s not pointing at an original\n", + volume->name); + if (final_discovery) { + volume->lv_access |= + EVMS_LV_INVALID; + } + } + /* Is the original the correct one? */ + else if ( volume->snap_org_minor != + volume->snapshot_org->lv_minor ) { + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n", + volume->name); + volume->lv_access |= EVMS_LV_INVALID; + } + } + /* Delete any invalid volumes from use. Delete + * incomplete volumes as well if this is not final + * discovery. If a snapshot original is bad, delete all + * of its snapshots. + */ + if ( volume->lv_access & EVMS_LV_INVALID || + (!final_discovery && + (volume->lv_access & EVMS_LV_INCOMPLETE) && + (volume->lv_access & EVMS_LV_NEW)) ) { + if ( volume->lv_access & LV_SNAPSHOT_ORG ) { + for ( snap = volume->snapshot_next; + snap; snap = next ) { + next = snap->snapshot_next; + snap->snapshot_next = NULL; + snap->snapshot_org = NULL; + invalidate_snapshot_volume(snap); + deallocate_logical_volume(snap); + } + volume->snapshot_next = NULL; + } else if ( volume->lv_access & LV_SNAPSHOT ) { + invalidate_snapshot_volume(volume); + } + deallocate_logical_volume(volume); + } + } + } + + return 0; +} + + +/********** Volume Group Discovery Functions **********/ + + +/** + * find_group_for_pv + * + * This is a discover-time function. It reads the VG metadata info for the + * specified node, and locates the appropriate group that owns that + * node. If that group does not already exist, it is created and + * initialized. + **/ +static int find_group_for_pv(struct evms_logical_node * node, + struct pv_disk * pv, + struct lvm_volume_group ** group) +{ + struct vg_disk * vg; + int rc; + + *group = NULL; + + /* Check for an unassigned PV. */ + if ( pv->vg_name[0] == 0 ) { + return 0; + } + + /* Read the VG on-disk info for this PV. If this succeeds, it + * allocates a new VG metadata structure. + */ + rc = read_vg(node, pv, &vg); + if (rc) { + return rc; + } + + /* Use the UUID from the VG metadata to determine if this group + * has already been discovered and constructed. + */ + find_group_by_uuid(vg->vg_uuid, group); + + if (!*group) { + /* Create a new group entry and add to the global list. 
*/ + *group = allocate_volume_group(vg, pv->vg_name); + if (!*group) { + return -ENOMEM; + } + add_group_to_list(*group); + } else if (!(*group)->vg) { + /* On a rediscover, the VG metadata for an existing group might + * be missing. Fill it in if necessary. This check is also not + * necessary in the engine, since the metadata is never deleted. + */ +/* Should we re-copy vg_name? (vg_uuid can not be allowed to change). + * Or should vg_name changes be done through direct ioctl only? + */ + (*group)->vg = vg; + } else { + kfree(vg); + } + + /* Read in the UUID list for this group, if it isn't present. */ + rc = read_uuid_list(node, pv, *group); + if (rc) { + LOG_WARNING("Error reading UUID list for group %s.\n", + (*group)->vg_name); + LOG_WARNING("May not be able to verify PV UUIDs for group %s\n", + (*group)->vg_name); + } + + /* In the kernel, any time we even see a PV for a group, that group + * must be marked dirty so its volumes will be re-exported. + */ + (*group)->flags |= EVMS_VG_DIRTY; + + return 0; +} + +/** + * check_for_duplicate_pv + * + * Search the list of PVs in the specified volume group. If the + * specified node already exists in the list, we can discard it. + **/ +static int check_for_duplicate_pv(struct evms_logical_node * node, + struct pv_disk * pv, + struct lvm_volume_group * group) +{ + struct lvm_physical_volume * pv_entry; + + /* For re-discovery, we need to search all existing PVs in this VG to + * make sure we didn't get a duplicate from the plugin below us. The + * plugins below us should be re-exporting the same node on + * re-discovery, instead of creating a new node to represent the same + * objects, so just check the memory location. + */ + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) { + if ( pv_entry->logical_node == node ) { + + /* We found a duplicate. Just ignore the duplicate. */ + LOG_DEBUG("PV %s is already in Group %s.\n", + node->name, group->vg_name); + + /* Even if the node was a duplicate, we may need to + * fill in the pv entry for this partition, since we + * always delete those at the end of discovery. + */ + if (!pv_entry->pv) { + pv_entry->pv = pv; + pv_entry->pv_number = pv->pv_number; + } else { + kfree(pv); + } + + return 1; + } + } + + /* No duplicate was found. */ + return 0; +} + +/** + * verify_pv_uuid + * + * Verify that the specified PV belongs in the specified group by + * searching for the PV's UUID in the group's list. + **/ +static int verify_pv_uuid(struct lvm_physical_volume * pv_entry, + struct lvm_volume_group * group) +{ + int i; + + /* Obviously the UUID list must be present in order to search. */ + if (!group->uuid_list) { + LOG_WARNING("UUID list is missing from group %s.\n", + group->vg_name); + LOG_WARNING("Cannot verify UUID for PV %s\n", + pv_entry->logical_node->name); + return 0; + } + + /* Start with the UUID entry for this PV's number. */ + if ( ! memcmp(pv_entry->pv->pv_uuid, + &(group->uuid_list[(pv_entry->pv_number - 1) * NAME_LEN]), + UUID_LEN) ) { + return 0; + } + + /* If it wasn't found there, then search the entire group's list. */ + for ( i = 0; i < group->vg->pv_cur; i++ ) { + if ( ! memcmp(pv_entry->pv->pv_uuid, + &(group->uuid_list[i * NAME_LEN]), UUID_LEN) ) { + /* Found the UUID. 
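+ * It was found at a different index than the one recorded in
+ * pv_number, so warn the administrator, but still accept the PV.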
*/ + LOG_WARNING("Detected UUID mismatch for PV %s!\n", + pv_entry->logical_node->name); + LOG_WARNING("PV %s is recorded as being at index %d,\n", + pv_entry->logical_node->name, + pv_entry->pv_number); + LOG_WARNING(" but Group %s has it recorded at index %d.\n", + group->vg_name, i + 1); + LOG_WARNING("Run the EVMS Engine to correct the problem.\n"); + LOG_WARNING("If you have any snapshot regions in group %s\n", + group->vg_name); + LOG_WARNING(" it is recommended that you delete them immediately!\n"); + return 0; + } + } + + LOG_SERIOUS("Could not find UUID for PV %s in group %s\n", + pv_entry->logical_node->name, group->vg_name); + return -EINVAL; +} + +/** + * add_pv_to_group + * + * Adds the physical volume to the appropriate volume group. The PV + * passed into this function MUST be part of a valid VG. + **/ +static int add_pv_to_group(struct lvm_physical_volume * pv_entry, + struct lvm_volume_group * group) +{ + int rc; + + /* Make sure this PV's UUID is listed in the group. */ + rc = verify_pv_uuid(pv_entry, group); + if (rc) { + LOG_SERIOUS("PV %s does not belong in group %s!\n", + pv_entry->logical_node->name, group->vg_name); + return rc; + } + + /* Add this PV to the beginning of its group's list. */ + pv_entry->next = group->pv_list; + group->pv_list = pv_entry; + group->pv_count++; + + /* Update the group's block and hardsector sizes as appropriate. */ + group->block_size = max(pv_entry->logical_node->block_size, + group->block_size); + group->hard_sect_size = max(pv_entry->logical_node->hardsector_size, + group->hard_sect_size); + + /* Check for the Partial or Removable flag on the PV. */ + if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) { + group->flags |= EVMS_VG_PARTIAL_PVS; + } + if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) { + group->flags |= EVMS_VG_REMOVABLE_PVS; + } + + LOG_DETAILS("PV %s added to Group %s\n", + pv_entry->logical_node->name, group->vg_name); + + return 0; +} + +/** + * discover_volume_groups + * + * Examine the list of logical nodes. Any node that contains a valid PV + * structure is consumed and added to the appropriate volume group. PVs + * which do not belong to any group are deleted. Everything else is left + * on the discovery list. + **/ +static int discover_volume_groups(struct evms_logical_node ** evms_node_list) +{ + struct evms_logical_node * node, * next_node; + struct pv_disk * pv; + struct lvm_volume_group * group; + struct lvm_physical_volume * pv_entry; + int rc; + + LOG_EXTRA("Searching for PVs in the node list.\n"); + + /* Run through the discovery list. */ + for ( node = *evms_node_list; node; node = next_node ) { + /* Save the next node. We may remove this one from the list. */ + next_node = node->next; + + /* Read the PV metadata. This will also create a new struct pv_disk + * if it finds the correct LVM signatures. + */ + rc = read_pv(node, &pv); + if (rc) { + /* This node is not an LVM PV, or an error occurred. + * Just leave the node on the discovery list. + */ + continue; + } + + rc = find_group_for_pv(node, pv, &group); + if (rc) { + /* Error getting the group for this PV. */ + kfree(pv); + continue; + } + + if (!group) { + /* This node is an unassigned PV. */ + LOG_DETAILS("PV %s is unassigned.\n", node->name); + kfree(pv); + continue; + } + + rc = check_for_duplicate_pv(node, pv, group); + if (rc) { + /* This node is already in the group. This check is also + * only in the kernel because the engine has no notion + * of rediscover, and thus can never get a duplicate. 
+ */ + evms_cs_remove_logical_node_from_list(evms_node_list, + node); + continue; + } + + /* Allocate a PV entry for this node. */ + pv_entry = allocate_physical_volume(node, pv); + if (!pv_entry) { + continue; + } + + /* Add this PV to the appropriate volume group. */ + rc = add_pv_to_group(pv_entry, group); + if (rc) { + deallocate_physical_volume(pv_entry); + continue; + } + + rc = read_pe_map(pv_entry); + if (rc) { + LOG_WARNING("Error reading PE maps for node %s\n", + node->name); + LOG_WARNING("Any volumes residing on this node will be incomplete!\n"); + } + + evms_cs_remove_logical_node_from_list(evms_node_list, node); + } + + LOG_EXTRA("Group discovery complete.\n"); + return 0; +} + + +/********** Logical Volume Discovery Functions **********/ + + +/** + * build_le_maps + * + * After all logical volumes have been discovered, the mappings from + * logical extents to physical extents must be constructed. Each PV + * contains a map on-disk of its PEs. Each PE map entry contains the + * logical volume number and the logical extent number on that volume. + * Our internal map is the reverse of this map for each volume, listing + * the PV node and sector offset for every logical extent on the volume. + **/ +static int build_le_maps(struct lvm_volume_group * group) +{ + struct lvm_logical_volume ** volume_list = group->volume_list; + struct lvm_physical_volume * pv_entry; + struct evms_logical_node * node; + struct pv_disk * pv; + struct pe_disk * pe_map; + u64 offset; + u32 lv_number, le_number, first_pe_sector; + int i; + + LOG_DEBUG("Building LE maps for new volumes in group %s.\n", + group->vg_name); + + /* For every PV in this VG. */ + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) { + node = pv_entry->logical_node; + pv = pv_entry->pv; + pe_map = pv_entry->pe_map; + + /* Version 1 metadata uses pe_on_disk.base + .size to find start + * of first PE. Version 2 uses pe_start. + */ + if (pv->version == 1) { + first_pe_sector = + evms_cs_size_in_vsectors(pv->pe_on_disk.base + + pv->pe_on_disk.size); + } else { + first_pe_sector = pv->pe_start; + if (!first_pe_sector) { + first_pe_sector = + evms_cs_size_in_vsectors(pv->pe_on_disk.base + + pv->pe_on_disk.size); + } + } + + /* For every entry in the PE map, calculate the PE's sector offset + * and update the correct LV's PE map. LV number of 0 marks an unused PE. + * For re-discovery, only compute entries for new volumes. If a PV + * is read-only, all LVs on that PV will also be read-only. + */ + for ( i = 0; i < pv->pe_total; i++ ) { + lv_number = pe_map[i].lv_num; + if ( lv_number && + volume_list[lv_number] && + volume_list[lv_number]->lv_access & + (EVMS_LV_NEW | EVMS_LV_INCOMPLETE) ) { + le_number = pe_map[i].le_num; + offset = i * pv->pe_size + first_pe_sector; + volume_list[lv_number]->le_map[le_number].owning_pv = + pv_entry; + volume_list[lv_number]->le_map[le_number].pe_sector_offset = + offset; + if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) { + volume_list[lv_number]->lv_access &= + ~LV_WRITE; + } + } + } + } + + return 0; +} + +/** + * build_snapshot_maps + * + * For every volume in this group that is a snapshot, read all of the + * existing entries in the COW table, and build up the snapshot mapping + * structures accordingly. + * + * For reference, the COW tables attached to the snapshot volumes will + * always be in disk-order (little-endian), so that it can always be + * immediately written to disk. Therefore, endian conversions are necessary + * any time the COW table is accessed. 
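Each on-disk COW entry is four little-endian u64 fields, so with the usual 512-byte EVMS vsector one sector holds 512 / 32 = 16 entries; that quotient is the max_entries bound used below. A sketch of the per-sector conversion, with the struct shown using the field names referenced here (the real definition lives in the LVM snapshot headers) and COW_ENTRIES_PER_SECTOR and cow_sector_to_cpu introduced only for illustration.

        struct lv_COW_table_disk {              /* on-disk: little-endian u64s */
                u64 pv_org_number;
                u64 pv_org_rsector;
                u64 pv_snap_number;
                u64 pv_snap_rsector;
        };

        #define COW_ENTRIES_PER_SECTOR \
                (EVMS_VSECTOR_SIZE / sizeof(struct lv_COW_table_disk))

        /* Sketch: convert one sector's worth of entries into CPU byte order. */
        static void cow_sector_to_cpu(struct lv_COW_table_disk *dst,
                                      const struct lv_COW_table_disk *src)
        {
                unsigned int j;

                for (j = 0; j < COW_ENTRIES_PER_SECTOR; j++) {
                        dst[j].pv_org_number   = le64_to_cpu(src[j].pv_org_number);
                        dst[j].pv_org_rsector  = le64_to_cpu(src[j].pv_org_rsector);
                        dst[j].pv_snap_number  = le64_to_cpu(src[j].pv_snap_number);
                        dst[j].pv_snap_rsector = le64_to_cpu(src[j].pv_snap_rsector);
                }
        }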
This function will make a local + * copy of each COW table sector, and convert the local copy before + * building the snapshot maps. + **/ +static int build_snapshot_maps(struct lvm_volume_group * group) +{ + struct lvm_logical_volume * volume; + struct evms_logical_node tmp_node; + struct lv_COW_table_disk cow_table[EVMS_VSECTOR_SIZE / + sizeof(struct lv_COW_table_disk)]; + unsigned long max_entries = EVMS_VSECTOR_SIZE / + sizeof(struct lv_COW_table_disk); + int i, j; + + /* Check every volume in the group to see if it is a snapshot. Also + * check to make sure it is a new volume in the case of re-discovery. + */ + for ( i = 1; i <= MAX_LV; i++ ) { + + /* The volume must exist, must be new, and must be a snapshot. + */ + volume = group->volume_list[i]; + if ( !volume || + !(volume->lv_access & EVMS_LV_NEW) || + !(volume->lv_access & LV_SNAPSHOT)) { + continue; + } + + /* Set up a temporary EVMS node. */ + tmp_node.private = volume; + + LOG_DEBUG("Building snapshot map for volume %s\n", + volume->name); + + while (1) { + /* Read in one sector's worth of COW tables. */ + if ( lvm_init_io(&tmp_node, 0, + volume->current_cow_sector, + 1, volume->cow_table) ) { + goto error; + } + + /* Endian-conversion of this COW table + * to a local table. + */ + for ( j = 0; j < max_entries; j++ ) { + cow_table[j].pv_org_number = + le64_to_cpu(volume->cow_table[j].pv_org_number); + cow_table[j].pv_org_rsector = + le64_to_cpu(volume->cow_table[j].pv_org_rsector); + cow_table[j].pv_snap_number = + le64_to_cpu(volume->cow_table[j].pv_snap_number); + cow_table[j].pv_snap_rsector = + le64_to_cpu(volume->cow_table[j].pv_snap_rsector); + } + + /* Translate every valid COW table entry into + * a snapshot map entry. + */ + for ( volume->next_cow_entry = 0; + volume->next_cow_entry < max_entries && + cow_table[volume->next_cow_entry].pv_org_number; + volume->next_cow_entry++ ) { + /* org_rsector must be a valid sector number, + * i.e. it can't be within a PVs metadata. This + * is how we detect invalidated snapshots. + */ + if ( cow_table[volume->next_cow_entry].pv_org_rsector < 10 || + cow_table[volume->next_cow_entry].pv_org_number > group->pv_count || + add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]), volume) ) { + /* This volume either has an invalid COW entry, + * or had an error adding that COW entry to the + * snapshot map. This snapshot is done. + */ + goto error; + } + volume->next_free_chunk += volume->chunk_size; + } + + /* Move on to the next sector if necessary. */ + if ( volume->next_cow_entry == max_entries ) { + volume->current_cow_sector++; + } else { + break; + } + } + } + +out: + return 0; +error: + invalidate_snapshot_volume(volume); + deallocate_logical_volume(volume); + goto out; +} + +/** + * link_snapshot_volumes + * + * This function examines the list of logical volumes in this group and + * sets up the necessary pointers to link snapshots and their originals. + * A singly-linked list is created starting with the original volume. Also, + * all snapshot volumes point directly back to their original. This + * function should not be run until all volumes have been discovered. + * In the case of re-discovery, all of these links/lists get rebuilt as if + * they were not already there. Currently this should not pose a problem. + **/ +static int link_snapshot_volumes(struct lvm_volume_group * group) +{ + struct lvm_logical_volume * org_volume, * snap_volume; + u32 org_minor, buffer_size = 0; + int i, j; + + for ( i = 1; i <= MAX_LV; i++ ) { + + /* Only process snapshot-originals. 
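The linking step amounts to head-insertion into a per-original list keyed by minor number; resetting snapshot_next on the original first is what prevents a stale pointer from a previous discovery pass from turning the list circular. Reduced to its core (org and snap are shorthand for the org_volume/snap_volume variables below):

        /* Sketch: attach every snapshot whose snap_org_minor matches the original. */
        org->snapshot_next = NULL;                      /* discard any stale chain */
        for (j = 1; j <= MAX_LV; j++) {
                snap = group->volume_list[j];
                if (snap && (snap->lv_access & LV_SNAPSHOT) &&
                    snap->snap_org_minor == org->lv_minor) {
                        snap->snapshot_org  = org;
                        snap->snapshot_next = org->snapshot_next;   /* head insert */
                        org->snapshot_next  = snap;
                }
        }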
*/ + org_volume = group->volume_list[i]; + if ( !org_volume || !(org_volume->lv_access & LV_SNAPSHOT_ORG) ) { + continue; + } + + /* For snapshot-originals, look for all other volumes that + * claim to be snapshotting it. For each one that is found, + * insert it at the start of the original's list of snapshots. + * Need to start with a NULL snapshot_next, otherwise could + * wind up with circular lists. + */ + org_minor = org_volume->lv_minor; + org_volume->snapshot_next = NULL; + + for ( j = 1; j <= MAX_LV; j++ ) { + snap_volume = group->volume_list[j]; + if ( snap_volume && + snap_volume->lv_access & LV_SNAPSHOT && + (snap_volume->snap_org_minor == org_minor) ) { + snap_volume->snapshot_org = org_volume; + snap_volume->snapshot_next = + org_volume->snapshot_next; + org_volume->snapshot_next = snap_volume; + if ( snap_volume->chunk_size > buffer_size ) { + buffer_size = snap_volume->chunk_size; + } + LOG_DEBUG("Linking snapshot (%s) to original (%s)\n", + snap_volume->name, org_volume->name); + } + } + + /* If no snapshots were found for a volume that claims to be + * under snapshot, mark the group dirty. If this is final + * discovery, the original will have the snapshot flag turned + * off in check_logical_volumes(). + */ + if (!org_volume->snapshot_next) { + LOG_WARNING("No snapshots found for original (%s)\n", + org_volume->name); + group->flags |= EVMS_VG_DIRTY; + } + } + return 0; +} + +/** + * discover_volumes_in_group + **/ +static int discover_volumes_in_group(struct lvm_volume_group * group) +{ + struct lv_disk * lv_array = group->lv_array; + struct lvm_logical_volume * new_volume; + int i; + + /* Search through the LV structs for valid LV entries. */ + for ( i = 0; i < group->vg->lv_max; i++ ) { + + /* Only discover valid, active volumes. */ + if ( !lv_array[i].lv_name[0] || + lv_array[i].lv_number >= MAX_LV ) { + continue; + } + + /* Make sure this volume isn't already in the list. */ + if (group->volume_list[lv_array[i].lv_number + 1]) { + continue; + } + + /* Create a new logical volume and place it in the appropriate + * spot in this VG's volume list. + */ + new_volume = allocate_logical_volume(&(lv_array[i]), group); + if (!new_volume) { + /* This volume will be missing, but other + * volumes in this group can still be built. + */ + LOG_CRITICAL("Error allocating LV %s in Group %s\n", + lv_array[i].lv_name, group->vg_name); + continue; + } + + group->volume_list[new_volume->lv_number] = new_volume; + group->volume_count++; + group->flags |= EVMS_VG_DIRTY; + + LOG_DEBUG("Discovered volume %s in group %s.\n", + new_volume->name, group->vg_name); + } + + return 0; +} + +/** + * discover_logical_volumes + * + * After all PVs have been claimed and added to the appropriate VG list, + * the volumes for each VG must be constructed. For each group, read all + * the LV structs off the first PV in the list. Search this list of + * structs for valid LVs. For each valid LV, create a new volume and add + * it to the group. + **/ +static int discover_logical_volumes(int final_discovery) +{ + struct lvm_volume_group *group; + int rc; + + /* Look for volumes in each valid VG entry. We even need to check ones + * that aren't dirty - We could have deleted an incomplete volume on + * the previous pass, and need to rediscover it in case this is final + * discovery and we now want to export it. + */ + for ( group = lvm_group_list; group; group = group->next_group ) { + + if ( ! group->vg || + (! final_discovery && + ! 
(group->flags & EVMS_VG_DIRTY)) ) { + continue; + } + + LOG_DEBUG("Searching for volumes in group %s\n", + group->vg_name); + + /* Read in the LV array from disk if necessary. */ + rc = read_lv(group); + if (rc) { + LOG_WARNING("Unable to read LV metadata for group %s\n", + group->vg_name); + LOG_WARNING("No regions can be discovered for group %s\n", + group->vg_name); + continue; + } + + /* Assemble each volume in the group. */ + discover_volumes_in_group(group); + + /* Build the LE map for each LV discovered in this group. This + * must be done after all LVS in the group are discovered. + */ + build_le_maps(group); + check_le_maps(group); + + /* Set up all of the initial snapshot maps. Only the kernel + * keeps track of the snapshot maps. + */ + build_snapshot_maps(group); + + /* Set up the pointers to link snapshot volumes + * with their originals. + */ + link_snapshot_volumes(group); + } + + return 0; +} + +/** + * export_volumes + * + * The last thing the plugin must do is take each newly constructed volume + * and place it on the evms logical node list. A zero return-code from + * this function means nothing new was added to the list, and a positive + * return code means that many new items were added to the list. + **/ +static int export_volumes(struct evms_logical_node ** evms_node_list, + int final_discover) +{ + struct lvm_volume_group * group; + struct evms_logical_node * new_node; + struct lvm_logical_volume * volume; + int i, count = 0; + + LOG_EXTRA("Exporting volumes\n"); + + /* For every valid, dirty volume group. */ + for ( group = lvm_group_list; group; group = group->next_group ) { + if ( ! (group->flags & EVMS_VG_DIRTY) ) { + continue; + } + + /* Export every valid volume in the group. For re-discovery, + * we re-export the same logical node. + */ + for ( i = 1; i <= MAX_LV; i++ ) { + volume = group->volume_list[i]; + if (!volume) { + continue; + } + + /* For new volumes, create a new EVMS node and + * initialize the appropriate fields. + */ + if ( volume->lv_access & EVMS_LV_NEW ) { + if ( evms_cs_allocate_logical_node(&new_node) ) { + continue; + } + MOD_INC_USE_COUNT; + + volume->volume_node = new_node; + volume->lv_access &= (~EVMS_LV_QUIESCED & + ~EVMS_LV_NEW); + new_node->hardsector_size = + group->hard_sect_size; + new_node->block_size = group->block_size; + new_node->plugin = &lvm_plugin_header; + new_node->private = volume; + memcpy(new_node->name, volume->name, NAME_LEN); + + /* Snapshot volumes should report the + * size of their original. + */ + new_node->total_vsectors = + (volume->lv_access & LV_SNAPSHOT) ? + volume->snapshot_org->lv_size : + volume->lv_size; + + /* Is the volume read-only? */ + if ( ! (volume->lv_access & LV_WRITE) ) { + new_node->flags |= + EVMS_VOLUME_READ_ONLY; + LOG_DEBUG("LVM volume %s is read-only\n", + volume->name); + } + + /* Is the volume incomplete? */ + if ( volume->lv_access & EVMS_LV_INCOMPLETE ) { + new_node->flags |= + (EVMS_VOLUME_READ_ONLY | + EVMS_VOLUME_PARTIAL); + LOG_DEBUG("LVM volume %s is incomplete\n", + volume->name); + } + + /* Does the volume group contain any partial or + * removable PVs? + */ + if ( group->flags & EVMS_VG_PARTIAL_PVS ) { + new_node->flags |= EVMS_VOLUME_PARTIAL; + } + if ( group->flags & EVMS_VG_REMOVABLE_PVS ) { + new_node->flags |= + EVMS_DEVICE_REMOVABLE; + } + } + + /* Export the node, only if it hasn't been exported + * during this full EVMS discover. + */ + if ( ! (volume->lv_access & EVMS_LV_EXPORTED) ) { + if ( ! 
evms_cs_add_logical_node_to_list(evms_node_list, + volume->volume_node) ) { + LOG_DETAILS("Exporting LVM volume %s\n", + volume->name); + volume->lv_access |= EVMS_LV_EXPORTED; + count++; + } + } + + if (final_discover) { + volume->lv_access &= ~EVMS_LV_EXPORTED; + } + } + + /* The group is clean now. */ + group->flags &= ~EVMS_VG_DIRTY; + } + + return count; +} + +/** + * lvm_cleanup + * + * This function runs through the entire lvm data structure, removing + * all items that are not needed at runtime. Currently, this is just the + * struct vg_disk structure and the struct pv_disk structure for each PV. + * Also, any groups that don't contain any volumes are deleted. All of the + * other volume_group, logical_volume and evms_logical_node structures will + * be kept around at run-time. + **/ +static int lvm_cleanup(void) +{ + struct lvm_volume_group * group, * next_group; + struct lvm_physical_volume * pv_entry; + + for ( group = lvm_group_list; group; group = next_group ) { + next_group = group->next_group; + + /* Delete groups with no volumes. */ + if (!group->volume_count) { + LOG_WARNING("Group %s contains no logical volumes. Deleting.\n", + group->vg_name); + remove_group_from_list(group); + deallocate_volume_group(group); + /* Need to go back to the start of the list, + * just to be safe. :) + */ + next_group = lvm_group_list; + continue; + } + + /* Delete data structures that aren't used at runtime. */ + if (group->vg) { + kfree(group->vg); + group->vg = NULL; + } + + for ( pv_entry = group->pv_list; + pv_entry; pv_entry = pv_entry->next) { + if (pv_entry->pv) { + kfree(pv_entry->pv); + pv_entry->pv = NULL; + } + if (pv_entry->pe_map) { + vfree(pv_entry->pe_map); + pv_entry->pe_map = NULL; + } + } + if (group->lv_array) { + vfree(group->lv_array); + group->lv_array = NULL; + } + if (group->uuid_list) { + vfree(group->uuid_list); + group->uuid_list = NULL; + } + } + return 0; +} + +/** + * lvm_get_bmap + * + * Support for the BMAP ioctl used by LILO to translate filesystem blocks + * to disk blocks to map kernel images for boot time. + **/ +static int lvm_get_bmap(struct evms_logical_node * node, + struct evms_get_bmap_pkt * bmap, + struct evms_logical_node ** pv_node) +{ + struct lvm_logical_volume * volume = node->private; + struct lvm_physical_volume * pv_entry; + u64 pe_start_sector, new_sector = 0, new_size = 0; + int rc = 0; + + /* No kernel images allowed on snapshot LVs. */ + if ( volume->lv_access & LV_SNAPSHOT ) { + return -EINVAL; + } + + /* Range check. */ + if ( bmap->rsector >= volume->lv_size ) { + return -EINVAL; + } + + rc = remap_sector(node, bmap->rsector, 1, &new_sector, + &new_size, &pe_start_sector, &pv_entry); + + if (rc || !pv_entry || !new_sector) { + return -EINVAL; + } + + bmap->rsector = new_sector; + *pv_node = pv_entry->logical_node; + + return 0; +} + +/** + * lvm_global_proc_read + * + * A callback function for the lvm-global proc-fs entry. This will print + * general info about all LVM VGs, PVs, and LVs. 
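PROCPRINT is not defined in this file; judging from how sz, page, off and count are finalized at the out: label below, it presumably accumulates bounded sprintf() output into the proc page, along the lines of this hypothetical expansion (the real macro lives in an EVMS header and probably also guards against overrunning the page):

        /* Hypothetical expansion of PROCPRINT -- an assumption, not this patch's code. */
        #define PROCPRINT(fmt, args...) \
                (sz += sprintf(page + sz, fmt , ##args))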
+ **/ +static int lvm_global_proc_read(char * page, char ** start, off_t off, + int count, int * eof, void * data) +{ + struct lvm_volume_group * group; + struct lvm_physical_volume * pv_entry; + struct lvm_logical_volume * volume, * snap; + int vgs = 0, lvs = 0, pvs = 0; + int i, sz = 0; + + PROCPRINT("Enterprise Volume Management System: LVM Plugin\n"); + PROCPRINT("Plugin ID: %x.%x.%x\n", + GetPluginOEM(lvm_plugin_header.id), + GetPluginType(lvm_plugin_header.id), + GetPluginID(lvm_plugin_header.id)); + PROCPRINT("Plugin Version: %d.%d.%d\n", + lvm_plugin_header.version.major, + lvm_plugin_header.version.minor, + lvm_plugin_header.version.patchlevel); + PROCPRINT("Required EVMS Services Version: %d.%d.%d\n", + lvm_plugin_header.required_services_version.major, + lvm_plugin_header.required_services_version.minor, + lvm_plugin_header.required_services_version.patchlevel); + + /* Count all existing items. */ + for ( group = lvm_group_list; group; group = group->next_group ) { + lvs += group->volume_count; + pvs += group->pv_count; + vgs++; + } + + PROCPRINT("\n"); + PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs); + + /* Print out specifics about each VG. */ + for ( group = lvm_group_list; group; group = group->next_group ) { + PROCPRINT("\n"); + PROCPRINT("VG: %s [%d PV, %d LV]\n", + group->vg_name, group->pv_count, group->volume_count); + PROCPRINT("PVs:\n"); + for ( pv_entry = group->pv_list; + pv_entry; pv_entry = pv_entry->next ) { + if (pv_entry->logical_node) { + PROCPRINT("\t%s\t%10Ld KB\n", + pv_entry->logical_node->name, + (long long)pv_entry->logical_node->total_vsectors / 2); + } + } + PROCPRINT("LVs:\n"); + for ( i = 1; i <= MAX_LV; i++ ) { + if (group->volume_list[i]) { + volume = group->volume_list[i]; + PROCPRINT("\t%s\t%10Ld KB / %5d LEs", + volume->name, + (long long)volume->lv_size / 2, + volume->num_le); + if ( volume->lv_access & LV_SNAPSHOT ) { + PROCPRINT("\tSnapshot of : "); + if (volume->snapshot_org) { + PROCPRINT("%s : ", + volume->snapshot_org->name); + } else { + PROCPRINT("(unknown) : "); + } + PROCPRINT("%ld%% full : ", + (long)(volume->next_free_chunk) * + 100 / (long)(volume->lv_size)); + if ( volume->lv_status & LV_ACTIVE ) { + PROCPRINT("active"); + } else { + PROCPRINT("disabled"); + } + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) { + PROCPRINT("\tSnapshotted by : "); + for ( snap = volume->snapshot_next; + snap; + snap = snap->snapshot_next ) { + PROCPRINT("%s ", snap->name); + } + } + PROCPRINT("\n"); + } + } + } + +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? count : sz; +} + + +/********** Required EVMS Plugin Functions **********/ + + +/** + * lvm_discover + * + * This is the entry point into the LVM discovery process. It is a three + * phase process. First, the list of nodes are examined for PVs, and the + * appropriate volume groups are created. Then each volume group is + * examined to find all available logical volumes. Finally, each LVM + * logical volume has a new EVMS node created for it, and added to the + * list of nodes. 
+ **/ +static int lvm_discover(struct evms_logical_node ** evms_node_list) +{ + int rc; + + MOD_INC_USE_COUNT; + LOG_EXTRA("Beginning discovery.\n"); + + discover_volume_groups(evms_node_list); + + check_volume_groups(); + + discover_logical_volumes(FALSE); + + check_logical_volumes(FALSE); + + rc = export_volumes(evms_node_list, FALSE); + + LOG_EXTRA("Discovery complete.\n"); + MOD_DEC_USE_COUNT; + return rc; +} + +/** + * lvm_discover_end + * + * The discovery process at the region-manager level is now iterative, + * much like the EVMS feature level. This allows the ability to stack + * LVM on top of MD, or vice-versa. To accomplish this correctly, and + * also to accomplish partial volume discovery, a second discover + * entry point is needed, so EVMS can tell the region managers that + * discovery is over, and to finish up any discovery that is not yet + * complete. When this function is called, it should be assumed that + * the node list has had nothing new added to it since the last call + * of the regular discover function. Therefore, when this function is + * called, we do not need to try to discovery any additional volume + * groups. We will, however, look for logical volumes once more. This + * gives us the ability to export (read-only) volumes that have + * partially corrupted LE maps due to missing PVs in their VG. + **/ +static int lvm_discover_end(struct evms_logical_node ** evms_node_list) +{ + int rc; + + MOD_INC_USE_COUNT; + LOG_EXTRA("Beginning final discovery\n"); + + discover_volume_groups(evms_node_list); + + check_volume_groups(); + + discover_logical_volumes(TRUE); + + check_logical_volumes(TRUE); + + rc = export_volumes(evms_node_list, TRUE); + + lvm_cleanup(); + + LOG_EXTRA("Final discovery complete.\n"); + MOD_DEC_USE_COUNT; + return rc; +} + +/** + * lvm_delete_node + * + * This function deletes the in-memory representation of an LVM logical volume. + **/ +static int lvm_delete_node(struct evms_logical_node * logical_node) +{ + struct lvm_logical_volume * volume = logical_node->private; + struct lvm_volume_group * group = volume->group; + + LOG_DEBUG("Deleting LVM node %s\n", logical_node->name); + + if ( deallocate_logical_volume(volume) ) { + return -EINVAL; + } + + /* If we just removed the last volume from this group, the entire group + * must also be deleted. + */ + if ( group && group->volume_count == 0 ) { + remove_group_from_list(group); + deallocate_volume_group(group); + } + + /* Free the logical node. */ + evms_cs_deallocate_logical_node(logical_node); + MOD_DEC_USE_COUNT; + return 0; +} + +/** + * lvm_read + **/ +static void lvm_read(struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct lvm_logical_volume * volume = node->private; + struct lvm_physical_volume * pv_entry; + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + u64 new_sector, new_size, pe_start_sector; + + /* If this volume is a snapshot, lock the volume, and do + * the LE-PE translation on its original volume. + */ + if ( volume->lv_access & LV_SNAPSHOT ) { + down(&volume->snap_semaphore); + if (!volume->snapshot_org) { + goto out_error; + } + node = volume->snapshot_org->volume_node; + } + + /* Make sure the volume is active and readable. */ + if ( !(volume->lv_access & LV_READ && + volume->lv_status & LV_ACTIVE) ) { + goto out_error; + } + + /* Check if I/O goes past end of logical volume. Must use the + * node, not the volume, so snapshots will work correctly. 
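The boundary check must use node->total_vsectors rather than volume->lv_size because export_volumes() (above) sizes a snapshot's node from its original, not from the snapshot's own length; the relevant assignment is repeated here for reference:

        /* From export_volumes(): a snapshot node advertises the original's size. */
        new_node->total_vsectors = (volume->lv_access & LV_SNAPSHOT) ?
                                   volume->snapshot_org->lv_size :
                                   volume->lv_size;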
+ */ + if ( bh->b_rsector + size > node->total_vsectors ) { + goto out_error; + } + + /* Logical-to-Physical remapping. Check for incomplete volumes. + * Check intermediate boundary conditions as well. + */ + if ( remap_sector(node, bh->b_rsector, size, &new_sector, + &new_size, &pe_start_sector, &pv_entry) || + !pe_start_sector || !pv_entry || + size != new_size ) { + goto out_error; + } + + /* For snapshot volumes, check if this sector's chunk has been + * remapped. If it has, new_sector and pv_entry will be changed + * accordingly. If not, they remain the same. + */ + if ( volume->lv_access & LV_SNAPSHOT ) { + snapshot_remap_sector(volume, pe_start_sector, + &new_sector, &pv_entry); + } + + bh->b_rsector = new_sector; + R_IO(pv_entry->logical_node, bh); + +out: + /* Unlock the snapshot. */ + if ( volume->lv_access & LV_SNAPSHOT ) { + up(&volume->snap_semaphore); + } + return; + +out_error: + bh->b_end_io(bh, 0); + goto out; +} + +/** + * lvm_write + **/ +static void lvm_write(struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct lvm_logical_volume * volume = node->private; + struct lvm_logical_volume * snap_volume; + struct lvm_physical_volume * pv_entry; + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + u64 new_sector, new_size, pe_start_sector; + + /* Make sure the volume is active and writable. */ + if ( !(volume->lv_access & LV_WRITE && + volume->lv_status & LV_ACTIVE) ) { + goto out_error; + } + + /* Check if I/O goes past end of logical volume. */ + if ( bh->b_rsector + size > node->total_vsectors ) { + goto out_error; + } + + /* Logical-to-Physical remapping. Check for incomplete volumes. + * Check intermediate boundary conditions as well. + */ + if ( remap_sector(node, bh->b_rsector, size, &new_sector, + &new_size, &pe_start_sector, &pv_entry) || + !pe_start_sector || !pv_entry || + size != new_size ) { + goto out_error; + } + + /* Copy-on-write for snapshotting. */ + if ( volume->lv_access & LV_SNAPSHOT_ORG ) { + /* Originals can be snapshotted multiple times. */ + for ( snap_volume = volume->snapshot_next; + snap_volume; snap_volume = snap_volume->snapshot_next ) { + if ( snapshot_copy_data(volume, snap_volume, + pe_start_sector, new_sector, + pv_entry) ) { + goto out_error; + } + } + } + + bh->b_rsector = new_sector; + W_IO(pv_entry->logical_node, bh); +out: + return; +out_error: + bh->b_end_io(bh, 0); + goto out; +} + +/** + * lvm_init_io + * + * Init_io on a snapshot volume treats it like a regular volume. + **/ +static int lvm_init_io(struct evms_logical_node * node, + int io_flag, + u64 sect_nr, + u64 num_sects, + void * buf_addr) +{ + struct lvm_logical_volume * volume = node->private; + struct lvm_physical_volume * pv_entry; + u64 pe_start_sector, new_sector, new_size; + int rc = 0; + + /* Only allow internal writes to snapshots (io_flag==4). Disallow + * writes to snapshot originals. + */ + if ( io_flag == WRITE && + volume->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG) ) { + return -EINVAL; + } + + /* The node for a snapshot reports the size of the original. If a + * request comes in in that range, just return. + */ + else if ( volume->lv_access & LV_SNAPSHOT && + sect_nr >= volume->lv_size && + sect_nr < node->total_vsectors ) { + if ( io_flag == READ ) { + memset(buf_addr, 0, + num_sects << EVMS_VSECTOR_SIZE_SHIFT); + } + return 0; + } + + /* Regular range check. 
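lvm_init_io() therefore sorts incoming requests into a few cases before any remapping; the checks above and below are restated together here as a sketch (io_flag value 4 is this plugin's convention for an EVMS-internal write to the snapshot store):

        /* Sketch of the request classification at the top of lvm_init_io(). */
        if (io_flag == WRITE &&
            (volume->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG)))
                return -EINVAL;                 /* only internal snapshot writes allowed */
        if ((volume->lv_access & LV_SNAPSHOT) &&
            sect_nr >= volume->lv_size && sect_nr < node->total_vsectors) {
                if (io_flag == READ)            /* area exists only on the original */
                        memset(buf_addr, 0, num_sects << EVMS_VSECTOR_SIZE_SHIFT);
                return 0;
        }
        if (sect_nr + num_sects > volume->lv_size)
                return -EINVAL;                 /* past the end of the LV proper */
        if (io_flag == 4)                       /* internal snapshot-store write */
                io_flag = WRITE;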
*/ + else if ( sect_nr + num_sects > volume->lv_size ) { + return -EINVAL; + } + + if ( io_flag == 4 ) { + io_flag = WRITE; + } + + /* Init IO needs to deal with the possibility of a request that spans + * PEs or stripes. This is possible because there is no limit on + * num_sects. To handle this, we loop through remap_sector and + * INIT_IO until num_sects reaches zero. + */ + while (num_sects) { + if ( remap_sector(node, sect_nr, num_sects, &new_sector, + &new_size, &pe_start_sector, &pv_entry) ) { + return -EIO; + } + + /* If the volume is incomplete, clear the buffer (on a read). */ + if (!pe_start_sector || !pv_entry) { + if ( io_flag == READ ) { + memset(buf_addr, 0, + new_size << EVMS_VSECTOR_SIZE_SHIFT); + } + } else { + rc = INIT_IO(pv_entry->logical_node, io_flag, + new_sector, new_size, buf_addr); + } + num_sects -= new_size; + sect_nr += new_size; + buf_addr = (void *)(((unsigned long) buf_addr) + + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT)); + } + + return rc; +} + +/** + * lvm_ioctl + **/ +static int lvm_ioctl(struct evms_logical_node * logical_node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + struct lvm_logical_volume * volume = logical_node->private; + int rc = 0; + + LOG_ENTRY_EXIT("Ioctl %d\n", cmd); + + switch (cmd) { + + case HDIO_GETGEO: + { + /* Fixed geometry for all LVM volumes. */ + unsigned char heads = 64; + unsigned char sectors = 32; + short cylinders; + long start = 0; + struct hd_geometry * hd = (struct hd_geometry *)arg; + cylinders = logical_node->total_vsectors; + cylinders = (cylinders / heads) / sectors; + + if (!hd) { + return -EINVAL; + } + + if ( copy_to_user((char *)(&hd->heads), + &heads, sizeof(heads)) || + copy_to_user((char *)(&hd->sectors), + &sectors, sizeof(sectors)) || + copy_to_user((short *)(&hd->cylinders), + &cylinders, sizeof(cylinders)) || + copy_to_user((long *)(&hd->start), + &start, sizeof(start)) ) { + return -EFAULT; + } + } + break; + + case EVMS_QUIESCE_VOLUME: + { + struct evms_quiesce_vol_pkt * tmp = + (struct evms_quiesce_vol_pkt *)arg; + if (tmp->command) { + volume->lv_access |= EVMS_LV_QUIESCED; + } else { + volume->lv_access &= ~EVMS_LV_QUIESCED; + } + } + break; + + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt * bmap = + (struct evms_get_bmap_pkt *)arg; + struct evms_logical_node * pv_node; + + rc = lvm_get_bmap(logical_node, bmap, &pv_node); + if (!rc) { + rc = IOCTL(pv_node, inode, file, cmd, + (unsigned long) bmap); + } + } + break; + + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_OPEN_VOLUME: + case EVMS_CLOSE_VOLUME: + case EVMS_CHECK_DEVICE_STATUS: + { + /* These ioctls all need to + * be broadcast to all PVs. + */ + struct lvm_volume_group * group = volume->group; + struct lvm_physical_volume * pv_entry; + for ( pv_entry = group->pv_list; + pv_entry; pv_entry = pv_entry->next ) { + rc |= IOCTL(pv_entry->logical_node, inode, + file, cmd, arg); + } + } + break; + + default: + /* Currently LVM does not send any ioctls down to the + * PVs. Which PV would they go to? What would we do with + * the return codes? + */ + rc = -EINVAL; + } + + return rc; +} + +/** + * lvm_direct_ioctl + * + * This function provides a method for user-space to communicate directly + * with a plugin in the kernel.
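The pattern used below is two copy_from_user() hops: first the generic evms_plugin_ioctl_pkt, validated against this plugin's id, then the feature-specific payload it points to, with the status copied back at the end. Stripped of the command bodies, it looks roughly like this sketch:

        /* Sketch: the two-level copy/validate pattern of a plugin direct ioctl. */
        struct evms_plugin_ioctl_pkt pkt;

        if (copy_from_user(&pkt, (void *)args, sizeof(pkt)))
                return -EFAULT;
        if (pkt.feature_id != lvm_plugin_header.id)
                return -EINVAL;                 /* not ours: let another plugin try */
        /* ... copy_from_user() the structure at pkt.feature_ioctl_data,
         *     act on pkt.feature_command, then: */
        pkt.status = rc;
        copy_to_user((void *)args, &pkt, sizeof(pkt));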
+ **/ +static int lvm_direct_ioctl(struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long args) +{ + struct evms_plugin_ioctl_pkt pkt, * user_pkt; + struct lvm_pv_remove_ioctl pv_remove, * user_pv_remove; + struct lvm_snapshot_stat_ioctl snap_stats, * user_snap_stats; + int rc = 0; + + MOD_INC_USE_COUNT; + + user_pkt = (struct evms_plugin_ioctl_pkt *)args; + + /* Copy user's parameters to kernel space. */ + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) { + MOD_DEC_USE_COUNT; + return -EFAULT; + } + + /* Make sure this is supposed to be our ioctl. */ + if ( pkt.feature_id != lvm_plugin_header.id ) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + switch (pkt.feature_command) { + + case EVMS_LVM_PV_REMOVE_IOCTL: + user_pv_remove = + (struct lvm_pv_remove_ioctl *)pkt.feature_ioctl_data; + if ( copy_from_user(&pv_remove, user_pv_remove, + sizeof(pv_remove)) ) { + rc = -EINVAL; + break; + } + rc = remove_pv_from_group(pv_remove.pv_number, + pv_remove.vg_uuid); + break; + + case EVMS_LVM_SNAPSHOT_STAT_IOCTL: + user_snap_stats = + (struct lvm_snapshot_stat_ioctl *)pkt.feature_ioctl_data; + if ( copy_from_user(&snap_stats, user_snap_stats, + sizeof(snap_stats)) ) { + rc = -EINVAL; + break; + } + rc = get_snapshot_stats(&snap_stats); + if ( copy_to_user(user_snap_stats, &snap_stats, + sizeof(snap_stats)) ) { + rc = -EINVAL; + break; + } + break; + + default: + rc = -EINVAL; + break; + } + + pkt.status = rc; + copy_to_user(user_pkt, &pkt, sizeof(pkt)); + MOD_DEC_USE_COUNT; + return rc; +} + +/** + * lvm_vge_init + **/ +int __init lvm_vge_init(void) +{ + struct proc_dir_entry *pde; + + lvm_group_list = NULL; + lvm_proc = NULL; + + /* Register the global proc-fs entries. */ + pde = evms_cs_get_evms_proc_dir(); + if (pde) { + lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde); + if (lvm_proc) { + create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG, + lvm_proc, lvm_global_proc_read, + NULL); + } + } + + /* Register this plugin with EVMS. */ + return evms_cs_register_plugin(&lvm_plugin_header); +} + +/** + * lvm_vge_exit + **/ +void __exit lvm_vge_exit(void) +{ + struct lvm_volume_group * group, * next_group; + struct proc_dir_entry * pde; + int i; + + /* If LVM is called for module_exit, that means the reference + * count must be zero, which means there should be no volumes, + * and thus no volume groups. But, check anyway and delete + * any volumes and groups that are still hanging around. + */ + if (lvm_group_list) { + LOG_SERIOUS("Called for module_exit, but group list is not empty!\n"); + } + + for ( group = lvm_group_list; group; group = next_group ) { + next_group = group->next_group; + + LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n", + group->vg_name); + + for ( i = 1; i <= MAX_LV; i++ ) { + if (group->volume_list[i]) { + lvm_delete_node(group->volume_list[i]->volume_node); + } + } + } + + /* Unregister the proc-fs entries. */ + pde = evms_cs_get_evms_proc_dir(); + if (pde) { + remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc); + remove_proc_entry(LVM_PROC_NAME, pde); + } + + /* Unregister this plugin from EVMS. 
*/ + evms_cs_unregister_plugin(&lvm_plugin_header); +} + +module_init(lvm_vge_init); +module_exit(lvm_vge_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + diff -Naur linux-2002-09-30/drivers/evms/md_core.c evms-2002-09-30/drivers/evms/md_core.c --- linux-2002-09-30/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_core.c Sun Sep 29 23:25:48 2002 @@ -0,0 +1,3633 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * linux/drivers/evms/md_core.c + * + * EVMS Linux MD Region Manager + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG_PREFIX "md core: " + +/* + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' + * is 100 KB/sec, so the extra system load does not show up that much. + * Increase it if you want to have more _guaranteed_ speed. Note that + * the RAID driver will use the maximum available bandwith if the IO + * subsystem is idle. There is also an 'absolute maximum' reconstruction + * speed limit - in case reconstruction slows down your system despite + * idle IO detection. + * + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. + */ + +static MD_LIST_HEAD(all_raid_disks); +static MD_LIST_HEAD(pending_raid_disks); + +static int sysctl_speed_limit_min = 100; +static int sysctl_speed_limit_max = 100000; + + +static mdk_personality_t *pers[MAX_PERSONALITY]; + +static int md_blocksizes[MAX_MD_DEVS]; +static int md_hardsect_sizes[MAX_MD_DEVS]; +int evms_md_size[MAX_MD_DEVS]; +static struct evms_thread *evms_md_recovery_thread = NULL; + +/* + * Enables to iterate over all existing md arrays + */ +static LIST_HEAD(all_mddevs); +static LIST_HEAD(incomplete_mddevs); +static LIST_HEAD(running_mddevs); + +/* + * The mapping between kdev and mddev is not necessary a simple + * one! Eg. HSM uses several sub-devices to implement Logical + * Volumes. All these sub-devices map to the same mddev. 
+ */ +struct dev_mapping evms_mddev_map[MAX_MD_DEVS]; + + +/* Support functions for discovery */ +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node); +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node); +static int evms_md_import_device (struct evms_logical_node **discover_list, + struct evms_logical_node *node); +static void evms_md_autostart_arrays(struct evms_logical_node **discover_list); +static void evms_md_run_devices (struct evms_logical_node **discover_list); +static int evms_md_run_array (struct evms_logical_node ** discover_list, + mddev_t *mddev); +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list, + mddev_t *mddev); +static int evms_md_create_logical_node(struct evms_logical_node **discover_list, + mddev_t *mddev, uint flags); +static int evms_md_read_disk_sb (mdk_rdev_t * rdev); +static int evms_md_analyze_sbs (mddev_t * mddev); +static mddev_t * alloc_mddev (kdev_t dev); +static void free_mddev(mddev_t * mddev); +static void evms_md_create_recovery_thread(void); +static void evms_md_destroy_recovery_thread(void); +static int do_md_run (mddev_t * mddev); +static int do_md_stop (mddev_t * mddev, int ro); + +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node); +static void kick_rdev_from_array (mdk_rdev_t * rdev); +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev); +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb); + +/* Plugin API prototypes */ +static int md_discover( struct evms_logical_node ** discover_list ); +static int md_end_discover( struct evms_logical_node ** discover_list ); +static int md_delete( struct evms_logical_node * node); +static void md_read( struct evms_logical_node * node, + struct buffer_head * bh); +static void md_write( struct evms_logical_node * node, + struct buffer_head * bh); +static int md_sync_io( struct evms_logical_node *node, + int rw, + u64 sect_nr, + u64 num_sects, + void *data); +static int md_ioctl( struct evms_logical_node *node, + struct inode *inode, + struct file *file, + unsigned int cmd, + unsigned long arg); +static int md_ioctl_cmd_broadcast( + struct evms_logical_node *node, + struct inode *inode, + struct file *file, + unsigned long cmd, + unsigned long arg); + +static int md_direct_ioctl( + struct inode *inode, + struct file *file, + unsigned int cmd, + unsigned long arg); + +/* global MD data structures */ +static struct evms_plugin_fops md_fops = { + .discover = md_discover, + .end_discover = md_end_discover, + .delete = md_delete, + .read = md_read, + .write = md_write, + .init_io = md_sync_io, + .ioctl = md_ioctl, + .direct_ioctl = md_direct_ioctl +}; + +static struct evms_plugin_header md_plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_REGION_MANAGER, + EVMS_MD_ID), + .version = { + .major = EVMS_MD_MAJOR_VERSION, + .minor = EVMS_MD_MINOR_VERSION, + .patchlevel = EVMS_MD_PATCHLEVEL_VERSION + }, + .required_services_version = { + .major = EVMS_MD_COMMON_SERVICES_MAJOR, + .minor = EVMS_MD_COMMON_SERVICES_MINOR, + .patchlevel = EVMS_MD_COMMON_SERVICES_PATCHLEVEL + }, + .fops = &md_fops +}; + +/* global variables */ +static int exported_nodes; /* total # of exported devices + * produced during this discovery. 
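evms_mddev_map is a per-minor lookup table: slot MINOR(dev) carries the owning mddev plus a private data pointer, which is what keeps kdev_to_mddev()-style lookups constant time. A sketch of the lookup side, assuming the dev_mapping fields (mddev, data) used elsewhere in this file; the helper name minor_to_mddev is illustrative only.

        /* Sketch: constant-time kdev -> mddev resolution via the minor number. */
        static inline mddev_t *minor_to_mddev(kdev_t dev)
        {
                return (MAJOR(dev) == MD_MAJOR) ?
                        evms_mddev_map[MINOR(dev)].mddev : NULL;
        }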
+ */ +static struct evms_logical_node **cur_discover_list = NULL; + +/**********************************************************/ +/* SYSCTL - EVMS/RAID folder */ +/**********************************************************/ + +#ifdef CONFIG_PROC_FS +static struct ctl_table_header *md_table_header; + +static ctl_table md_table[] = { + {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min", + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec}, + {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max", + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec}, + {0} +}; + +static ctl_table md_dir_table[] = { + {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table}, + {0} +}; + +static ctl_table evms_dir_table[] = { + {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table}, + {0} +}; + +static ctl_table dev_dir_table[] = { + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table}, + {0} +}; +#endif +/********** Required EVMS Plugin Functions **********/ + +/* + * Function: md_discover + * We should only export complete MD device nodes + */ +static int md_discover( struct evms_logical_node ** discover_list ) +{ + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + + /* initialize global variable */ + exported_nodes = 0; + cur_discover_list = discover_list; + evms_md_autostart_arrays(discover_list); + + LOG_ENTRY_EXIT("%s: EXIT (exported nodes: %d)\n", __FUNCTION__,exported_nodes); + cur_discover_list = NULL; + MOD_DEC_USE_COUNT; + return(exported_nodes); +} + +static mddev_t * evms_md_find_incomplete_array(int level) +{ + mddev_t *mddev; + struct list_head *tmp,*tmp2; + mdk_rdev_t *rdev; + + ITERATE_INCOMPLETE_MDDEV(mddev,tmp) { + ITERATE_RDEV(mddev, rdev, tmp2) { + if (rdev->sb && rdev->sb->level == level) + return mddev; + } + } + return NULL; +} + +/* + * Function: md_end_discover + */ +static int md_end_discover( struct evms_logical_node ** discover_list ) +{ + int rc = 0; + struct list_head *tmp; + mdk_rdev_t *rdev; + mddev_t *mddev; + struct evms_logical_node *node; + int done = FALSE; + + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + rc = md_discover(discover_list); + + do { + done = TRUE; + if ( (mddev = evms_md_find_incomplete_array(5)) != NULL) { + evms_md_run_incomplete_array(discover_list, mddev); + done = FALSE; + continue; + } + if ( (mddev = evms_md_find_incomplete_array(1)) != NULL) { + evms_md_run_incomplete_array(discover_list, mddev); + done = FALSE; + continue; + } + if ( (mddev = evms_md_find_incomplete_array(0)) != NULL) { + evms_md_run_incomplete_array(discover_list, mddev); + done = FALSE; + continue; + } + if ( (mddev = evms_md_find_incomplete_array(-1)) != NULL) { + evms_md_run_incomplete_array(discover_list, mddev); + done = FALSE; + continue; + } + + } while (!done); + + + /* + * At this point, delete all mddevs which did not start. + */ + ITERATE_MDDEV(mddev,tmp) { + if (mddev->pers == NULL) { + LOG_WARNING("%s: deleting md%d\n", __FUNCTION__, mdidx(mddev)); + free_mddev(mddev); + } + } + + + /* + * At this point, delete all rdevs which do not belong to any of discovered MD arrays. 
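The ITERATE_* helpers used throughout this file are not defined in this patch hunk; in the stock MD driver they are thin wrappers over the kernel list macros, roughly as below. Treat this as an assumption to read the loops by (see md_k.h for the real definitions), not as this patch's code.

        /* Hypothetical shape of the rdev iteration helper. */
        #define ITERATE_RDEV(mddev, rdev, tmp)                                  \
                for (tmp = (mddev)->disks.next;                                 \
                     rdev = list_entry(tmp, mdk_rdev_t, same_set),              \
                     tmp = tmp->next, tmp->prev != &(mddev)->disks;             \
                     )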
+ */ + ITERATE_RDEV_ALL(rdev, tmp) { + if (!rdev->mddev) { + node = rdev->node; + if (node) { + if (node->plugin->id == md_plugin_header.id) + evms_md_export_rdev(rdev, FALSE); + else + evms_md_export_rdev(rdev, TRUE); + } + } + } + + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__); + MOD_DEC_USE_COUNT; + return rc; +} + + +/* + * Function: md_delete_node + */ +static int md_delete( struct evms_logical_node * node) +{ + struct evms_md *evms_md; + mddev_t *mddev; + + evms_md = node->private; + mddev = evms_md->mddev; + LOG_DEFAULT("md_delete() [%s]\n", evms_md_partition_name(node)); + + if (mddev) + do_md_stop(mddev,0); + if (evms_md) { + if (evms_md->instance_plugin_hdr.fops) + kfree(evms_md->instance_plugin_hdr.fops); + kfree(evms_md); + } + + evms_cs_deallocate_logical_node(node); + return 0; +} + + +/* + * Function: md_read + */ +static void md_read( struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct evms_md *evms_md; + mddev_t *mddev; + + evms_md = node->private; + mddev = evms_md->mddev; + if (evms_md_check_boundary(node, bh)) return; + if (mddev && mddev->pers) + mddev->pers->read(node, bh); +} + + +/* + * Function: md_write + */ +static void md_write( struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct evms_md *evms_md; + mddev_t *mddev; + + evms_md = node->private; + mddev = evms_md->mddev; + if (evms_md_check_boundary(node, bh)) return; + if (mddev->ro) { + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name); + bh->b_end_io(bh, 0); + return; + } + if (mddev && mddev->pers) + mddev->pers->write(node, bh); +} + +/* + * Function: md_sync_io + */ +static int md_sync_io( + struct evms_logical_node *node, + int rw, + u64 sect_nr, + u64 num_sects, + void *buf_addr) +{ + struct evms_md *evms_md; + mddev_t *mddev; + int rc = 0; + + evms_md = node->private; + mddev = evms_md->mddev; + + if (sect_nr + num_sects > node->total_vsectors) { + LOG_ERROR("%s: attempt to %s beyond MD device(%s) boundary("PFU64") with sect_nr("PFU64") and num_sects("PFU64")\n", + __FUNCTION__, + rw ? "WRITE" : "READ", + node->name, + node->total_vsectors, + sect_nr,num_sects); + rc = -EINVAL; + } + + if ((mddev->ro) && (rw != READ)) { + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name); + return -EINVAL; + } + + if (!rc && mddev && mddev->pers) { + /* + * Check if the personality can handle synchronous I/O, + * otherwise use the generic function. 
+ */ + if (mddev->pers->sync_io) + rc = mddev->pers->sync_io(mddev, rw, sect_nr, num_sects, buf_addr); + else + rc = evms_md_sync_io(node, rw, sect_nr, num_sects, buf_addr); + } else + rc = -EINVAL; + return rc; +} + +/** + * md_end_sync_request - End IO handler for synchronous I/O functions + **/ +static void md_end_sync_request(struct buffer_head *bh, int uptodate) +{ + struct evms_md_sync_cb * cb = (struct evms_md_sync_cb *) bh->b_private; + + if (!uptodate) + cb->rc |= -EIO; + /* we are done with the bh */ + evms_cs_deallocate_to_pool(evms_bh_pool, bh); + + if (atomic_dec_and_test(&cb->io_count)) { + if (waitqueue_active(&cb->wait)) + wake_up(&cb->wait); + } +} + +/** + * md_sync_request_submit_bh - submit a page-size bh + * @node - target MD node + * @bh - pointer to the buffer head + * @sector - the sector number + * @data - pointer to buffer + * @rw - READ/WRITE + * @cb - MD synchronous I/O control block + **/ +static inline void md_sync_request_submit_bh( + struct evms_logical_node *node, + struct buffer_head *bh, + unsigned long sector, + char *data, + int rw, + struct evms_md_sync_cb *cb) +{ + + bh->b_this_page = (struct buffer_head *)1; + bh->b_rsector = sector; + bh->b_size = PAGE_SIZE; + bh->b_state = 0; + set_bit(BH_Dirty, &bh->b_state); + set_bit(BH_Lock, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Mapped, &bh->b_state); + atomic_set(&bh->b_count, 1); + bh->b_data = data; + bh->b_page = virt_to_page(data); + bh->b_list = BUF_LOCKED; + bh->b_end_io = md_end_sync_request; + bh->b_private = cb; + atomic_inc(&cb->io_count); + if (rw == READ) + R_IO(node,bh); + else + W_IO(node,bh); +} + +/** + * evms_md_allocate_bh + * + * Note that this function will not return unless we got a free bh + **/ +static inline struct buffer_head *evms_md_allocate_bh(void) +{ + struct buffer_head *bh; + + while ((bh = evms_cs_allocate_from_pool(evms_bh_pool, FALSE)) == NULL) + schedule(); /* just yield for a someone to deallocate a bh */ + init_waitqueue_head(&bh->b_wait); + bh->b_count = (atomic_t)ATOMIC_INIT(0); + return(bh); +} + +/** + * md_partial_sync_io - + * This function handles synchronous I/O when sector is not page aligned + * @node - evms node for the MD array + * @rw - READ/WRITE + * @sector - the sector + * @nsects - on input, the total sectors for the request + * @nsects - on output, number of sectors completed + * @data - data buffer + **/ +int evms_md_partial_sync_io( + struct evms_logical_node *node, + int rw, + u64 sector, + u32 *nsects, + void *data) +{ + int rc; + u32 offset, size; + struct buffer_head *bh; + struct evms_md_sync_cb cb; + char *page; + + size = (u32)(*nsects << EVMS_VSECTOR_SIZE_SHIFT); + + /* calculate byte offset */ + offset = (u32)((sector & (EVMS_MD_SECTS_PER_PAGE-1)) << EVMS_VSECTOR_SIZE_SHIFT); + if (!offset && (*nsects >= EVMS_MD_SECTS_PER_PAGE)) { + *nsects = 0; + return 0; /* Nothing to do */ + } + + page = NULL; + rc = 0; + + page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!page) { + LOG_ERROR("%s: no memory!\n", __FUNCTION__); + rc = -ENOMEM; + } + + bh = evms_md_allocate_bh(); + + if (!rc) { + memset(&cb, 0, sizeof(cb)); + init_waitqueue_head(&cb.wait); + cb.io_count = (atomic_t)ATOMIC_INIT(0); + md_sync_request_submit_bh( + node, bh, + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK), + page, READ, &cb); + wait_disk_event(cb.wait, !atomic_read(&cb.io_count)); + rc |= cb.rc; + } + + if (!rc) { + size = (size <= (PAGE_SIZE - offset)) ? 
size : (PAGE_SIZE - offset); + + switch (rw) { + case READ: + /* copy data and return */ + memcpy(data, page+offset, size); + break; + case WRITE: + /* copy data and then write */ + memcpy(page+offset, data, size); + + bh = evms_md_allocate_bh(); + + md_sync_request_submit_bh( + node, bh, + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK), + page, WRITE, &cb); + wait_disk_event(cb.wait, !atomic_read(&cb.io_count)); + rc |= cb.rc; + break; + default: + rc = -EINVAL; + } + } + + if (page) + kfree(page); + + if (!rc) + *nsects = (u64)(size >> EVMS_VSECTOR_SIZE_SHIFT); + else + *nsects = 0; + return rc; +} + +/** + * evms_md_sync_io - This function handles synchronous I/O + **/ +int evms_md_sync_io( + struct evms_logical_node *node, + int rw, + u64 sector, + u64 total_nr_sects, + void *data ) +{ + int rc = 0; + u64 total_nr_pages, size; + u32 nsects; + struct buffer_head *bh; + struct evms_md_sync_cb cb; + + if (sector % EVMS_MD_SECTS_PER_PAGE) { + nsects = total_nr_sects; + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data); + if (!rc) { + total_nr_sects -= nsects; + sector += nsects; + data += (nsects << EVMS_VSECTOR_SIZE_SHIFT); + if (total_nr_sects == 0) + return rc; + } else { + return rc; + } + } + + total_nr_pages = total_nr_sects / EVMS_MD_SECTS_PER_PAGE; + size = total_nr_sects << EVMS_VSECTOR_SIZE_SHIFT; + + memset(&cb, 0, sizeof(cb)); + init_waitqueue_head(&cb.wait); + cb.io_count = (atomic_t)ATOMIC_INIT(0); + + while (!rc && total_nr_pages) { + + bh = evms_md_allocate_bh(); + + md_sync_request_submit_bh(node, bh,(unsigned long)sector, data, rw, &cb); + + sector += EVMS_MD_SECTS_PER_PAGE; + size -= PAGE_SIZE; + total_nr_pages--; + data += PAGE_SIZE; + } + if (!rc) { + wait_disk_event(cb.wait, !atomic_read(&cb.io_count)); + rc |= cb.rc; + } + + if (!rc && size) { + nsects = size >> EVMS_VSECTOR_SIZE_SHIFT; + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data); + } + + return(rc); +} + +/* + * Function: md_ioctl + */ +static int md_ioctl( + struct evms_logical_node * node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + struct evms_md * evms_md = node->private; + mddev_t *mddev; + int rc = 0; + + if ((!inode) || (!evms_md) ) + rc = -EINVAL; + + if (!rc) { + switch (cmd) { + /* + * We have a problem here : there is no easy way to give a CHS + * virtual geometry. We currently pretend that we have a 2 heads + * 4 sectors (with a BIG number of cylinders...). This drives + * dosfs just mad... 
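The arithmetic behind that virtual geometry is simply total_vsectors / (heads * sectors) = total_vsectors / 8; the worked number below uses a hypothetical 128 MB array as the example.

        /* Example: a 262,144-vsector (128 MB) array with the fake 2x4 geometry. */
        unsigned int cylinders = 262144 / (2 /* heads */ * 4 /* sectors */);  /* == 32768 */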
;-) + */ + + case HDIO_GETGEO: + { + struct hd_geometry hdgeo; + hdgeo.heads = 2; + hdgeo.sectors = 4; + hdgeo.cylinders = ((unsigned int)node->total_vsectors) / + hdgeo.heads / hdgeo.sectors; + hdgeo.start = 0; + if (copy_to_user((int *)arg, + &hdgeo, + sizeof(hdgeo))) + rc = -EFAULT; + } + break; + case EVMS_QUIESCE_VOLUME: + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_OPEN_VOLUME: + case EVMS_CLOSE_VOLUME: + case EVMS_CHECK_DEVICE_STATUS: + rc = md_ioctl_cmd_broadcast( + node, inode, file, cmd, arg); + break; + case EVMS_PLUGIN_IOCTL: + rc = md_direct_ioctl( + inode, file, cmd, arg); + break; + default: + mddev = evms_md->mddev; + if (mddev == NULL) { + rc = -ENODEV; + } else if (mddev->pers->evms_ioctl == NULL) { + rc = -ENOSYS; + } else { + rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg); + } + } + } + return(rc); +} + +static int md_ioctl_cmd_broadcast( + struct evms_logical_node *node, + struct inode *inode, + struct file *file, + unsigned long cmd, + unsigned long arg) +{ + int rc = 0; + struct evms_md *evms_md; + mddev_t *mddev; + struct list_head *tmp; + mdk_rdev_t *rdev; + + evms_md = node->private; + mddev = evms_md->mddev; + + /* broadcast this cmd to all children */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + if (!rdev->virtual_spare) { + rc |= IOCTL(rdev->node, inode, file, cmd, arg); + } + } + return (rc); +} + + +static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev) +{ + mdk_rdev_t *rdev; + mdp_disk_t *disk = NULL; + int i; + + if (evms_md_find_rdev(mddev,dev)) + return -EEXIST; + + LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__); + if ((rdev = kmalloc(sizeof(*rdev),GFP_KERNEL)) == NULL) + return -ENOMEM; + + memset(rdev, 0, sizeof(*rdev)); + + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) { + disk = mddev->sb->disks + i; + if (!disk->major && !disk->minor) + break; + if (disk_removed(disk)) + break; + } + if (i == MD_SB_DISKS) { + LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev)); + kfree(rdev); + return -EBUSY; + } + + if (disk_removed(disk)) { + /* + * reuse slot + */ + if (disk->number != i) { + MD_BUG(); + kfree(rdev); + return -EINVAL; + } + } else { + disk->number = i; + } + + disk->raid_disk = disk->number; + disk->major = MAJOR(dev); + disk->minor = MINOR(dev); + + mark_disk_spare(disk); + + rdev->mddev = mddev; + rdev->dev = dev; + rdev->desc_nr = disk->number; + rdev->virtual_spare = 1; + + /* bind rdev to mddev array */ + list_add(&rdev->all, &all_raid_disks); + list_add(&rdev->same_set, &mddev->disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + mddev->sb->nr_disks++; + mddev->sb->spare_disks++; + mddev->sb->working_disks++; + mddev->nb_dev++; + + mddev->sb_dirty = 1; + + evms_md_update_sb(mddev); + + return 0; +} + +static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev) +{ + mdk_rdev_t *rdev = NULL; + mdp_disk_t *disk; + int rc = 0; + + disk = evms_md_find_disk(mddev,dev); + if (!disk) + return -ENODEV; + + rdev = evms_md_find_rdev(mddev,dev); + + if (rdev && !rdev->faulty) { + /* + * The disk is active in the array, + * must ask the personality to do it + */ + if (mddev->pers && mddev->pers->diskop) { + /* Assume spare, try to remove it first. 
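The removal path tries the cheaper spare-removal diskop first and only falls back to removing an active member, then scrubs the descriptor and rdev and rewrites the superblock. Condensed from the code below (not a separate implementation):

        /* Sketch: ordering of a hot-remove in evms_md_remove_disk(). */
        rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
        if (rc)         /* not a spare: try it as an active member */
                rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
        if (!rc) {
                remove_descriptor(disk, mddev->sb);
                if (rdev)
                        kick_rdev_from_array(rdev);
                mddev->sb_dirty = 1;
                evms_md_update_sb(mddev);
        }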
*/ + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE); + if (rc) + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK); + } else + rc = -ENOSYS; + } + + if (!rc) { + remove_descriptor(disk,mddev->sb); + if (rdev) + kick_rdev_from_array(rdev); + mddev->sb_dirty = 1; + evms_md_update_sb(mddev); + + } + return rc; +} + + +/* + * Function: md_direct_ioctl + * + * This function provides a method for user-space to communicate directly + * with a plugin in the kernel. + */ +static int md_direct_ioctl( + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long args ) +{ + struct evms_plugin_ioctl_pkt argument; + kdev_t md_kdev; + mddev_t *mddev = NULL; + struct evms_md_ioctl ioctl_arg; + struct evms_md_kdev device; + struct evms_md_array_info array_info, *usr_array_info; + int rc = 0; + + MOD_INC_USE_COUNT; + + // Copy user's parameters to kernel space + if ( copy_from_user(&argument, (struct evms_plugin_ioctl_pkt*)args, sizeof(argument)) ) { + MOD_DEC_USE_COUNT; + return -EFAULT; + } + + // Make sure this is supposed to be our ioctl. + if ( argument.feature_id != md_plugin_header.id ) { + MOD_DEC_USE_COUNT; + return -EINVAL; + } + + // Copy user's md ioclt parmeters to kernel space + if ( copy_from_user(&ioctl_arg, + (struct evms_md_ioctl*)argument.feature_ioctl_data, + sizeof(ioctl_arg)) ) + rc = -EFAULT; + else { + if (ioctl_arg.mddev_idx < MAX_MD_DEVS) { + md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx); + mddev = kdev_to_mddev(md_kdev); + if (mddev == NULL) + rc = -ENODEV; + } else + rc = -ENODEV; + } + + if (!rc) { + switch(argument.feature_command) { + case EVMS_MD_PERS_IOCTL_CMD: + if (mddev->pers->md_pers_ioctl == NULL) { + MOD_DEC_USE_COUNT; + return -ENOSYS; + } + rc = mddev->pers->md_pers_ioctl(mddev, + ioctl_arg.cmd, + ioctl_arg.arg); + copy_to_user((struct evms_md_ioctl*)argument.feature_ioctl_data, + &ioctl_arg, + sizeof(ioctl_arg)); + break; + + case EVMS_MD_ADD: + if ( copy_from_user(&device, + (struct evms_md_kdev *)ioctl_arg.arg, + sizeof(device)) ) + rc = -EFAULT; + else + rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor)); + break; + + case EVMS_MD_REMOVE: + if ( copy_from_user(&device, + (struct evms_md_kdev *)ioctl_arg.arg, + sizeof(device)) ) + rc = -EFAULT; + else + rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor)); + break; + + case EVMS_MD_ACTIVATE: + rc = -ENOSYS; + break; + + case EVMS_MD_DEACTIVATE: + rc = -ENOSYS; + break; + + case EVMS_MD_GET_ARRAY_INFO: + + usr_array_info = (struct evms_md_array_info *)ioctl_arg.arg; + if ( copy_from_user(&array_info, usr_array_info, + sizeof(array_info)) ) + rc = -EFAULT; + else { + array_info.state = 0; + if (mddev->curr_resync) + array_info.state |= EVMS_MD_ARRAY_SYNCING; + copy_to_user(&usr_array_info->state, &array_info.state, + sizeof(usr_array_info->state)); + if (copy_to_user(array_info.sb, mddev->sb, + sizeof(mdp_super_t))) + rc = -EFAULT; + } + break; + default: + rc = -ENOSYS; + break; + } + } + + argument.status = rc; + copy_to_user((struct evms_plugin_ioctl_pkt*)args, &argument, sizeof(argument)); + MOD_DEC_USE_COUNT; + return rc; +} + + + + +void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data) +{ + unsigned int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (evms_mddev_map[minor].mddev != NULL) { + MD_BUG(); + return; + } + evms_mddev_map[minor].mddev = mddev; + evms_mddev_map[minor].data = data; +} + +void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev) +{ + unsigned 
int minor = MINOR(dev); + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return; + } + if (evms_mddev_map[minor].mddev != mddev) { + MD_BUG(); + return; + } + evms_mddev_map[minor].mddev = NULL; + evms_mddev_map[minor].data = NULL; +} + +static mddev_t * alloc_mddev (kdev_t dev) +{ + mddev_t *mddev; + + if (MAJOR(dev) != MD_MAJOR) { + MD_BUG(); + return 0; + } + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL); + if (!mddev) + return NULL; + + memset(mddev, 0, sizeof(*mddev)); + + mddev->__minor = MINOR(dev); + init_MUTEX(&mddev->reconfig_sem); + init_MUTEX(&mddev->recovery_sem); + init_MUTEX(&mddev->resync_sem); + INIT_LIST_HEAD(&mddev->disks); + INIT_LIST_HEAD(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->incomplete_mddevs); + INIT_LIST_HEAD(&mddev->running_mddevs); + mddev->active = (atomic_t)ATOMIC_INIT(0); + mddev->recovery_active = (atomic_t)ATOMIC_INIT(0); + + /* + * The 'base' mddev is the one with data NULL. + * personalities can create additional mddevs + * if necessary. + */ + evms_md_add_mddev_mapping(mddev, dev, 0); + list_add(&mddev->all_mddevs, &all_mddevs); + + MOD_INC_USE_COUNT; + evms_md_create_recovery_thread(); + + return mddev; +} + +mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == nr) + return rdev; + } + return NULL; +} + + +mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->dev == dev) + return rdev; + } + return NULL; +} + +mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, struct evms_logical_node * node) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->node == node) + return rdev; + } + return NULL; +} + +static MD_LIST_HEAD(device_names); + +static char * org_partition_name (kdev_t dev) +{ + struct gendisk *hd; + static char nomem [] = ""; + dev_name_t *dname; + struct list_head *tmp = device_names.next; + + while (tmp != &device_names) { + dname = list_entry(tmp, dev_name_t, list); + if (dname->dev == dev) + return dname->name; + tmp = tmp->next; + } + + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL); + + if (!dname) + return nomem; + /* + * ok, add this new device name to the list + */ + hd = get_gendisk (dev); + dname->name = NULL; + if (hd) + dname->name = disk_name (hd, MINOR(dev), dname->namebuf); + if (!dname->name) { + sprintf (dname->namebuf, "[dev %s]", kdevname(dev)); + dname->name = dname->namebuf; + } + + dname->dev = dev; + MD_INIT_LIST_HEAD(&dname->list); + list_add(&dname->list, &device_names); + + return dname->name; +} + + +#define EVMS_MD_NULL_PARTITION_NAME "" +char * evms_md_partition_name (struct evms_logical_node *node) +{ + if (node && node->name) + return node->name; + else + return EVMS_MD_NULL_PARTITION_NAME; +} + +static char * get_partition_name (mdk_rdev_t *rdev) +{ + if (rdev->node) + return evms_md_partition_name(rdev->node); + else + return org_partition_name(rdev->dev); +} + +/* + * Function: evms_md_calc_dev_sboffset + * return the LSN for md super block. + */ +static u64 evms_md_calc_dev_sboffset (struct evms_logical_node *node,mddev_t *mddev, int persistent) +{ + u64 size = 0; + + size = node->total_vsectors; + if (persistent) { + size = MD_NEW_SIZE_SECTORS(size); + } + return size; /* size in sectors */ +} + +/* + * Function: evms_md_calc_dev_size + * return data size (in blocks) for an "extended" device. 
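+ * Roughly: take the node's total sectors (minus the superblock + * reservation when persistent superblocks are used), convert to + * 1K blocks, and round the result down to a chunk_size boundary.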
+ */ +static unsigned long evms_md_calc_dev_size (struct evms_logical_node *node, + mddev_t *mddev, + int persistent) +{ + unsigned long size; + u64 size_in_sectors; + + size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent); + size = size_in_sectors >> 1; + if (!mddev->sb) { + MD_BUG(); + return size; + } + if (mddev->sb->chunk_size) + size &= ~(mddev->sb->chunk_size/1024 - 1); + return size; +} + +static unsigned int zoned_raid_size (mddev_t *mddev) +{ + unsigned int mask; + mdk_rdev_t * rdev; + struct list_head *tmp; + + if (!mddev->sb) { + MD_BUG(); + return -EINVAL; + } + /* + * do size and offset calculations. + */ + mask = ~(mddev->sb->chunk_size/1024 - 1); + + ITERATE_RDEV(mddev,rdev,tmp) { + rdev->size &= mask; + evms_md_size[mdidx(mddev)] += rdev->size; + } + return 0; +} + +/* + * We check wether all devices are numbered from 0 to nb_dev-1. The + * order is guaranteed even after device name changes. + * + * Some personalities (raid0, linear) use this. Personalities that + * provide data have to be able to deal with loss of individual + * disks, so they do their checking themselves. + */ +int evms_md_check_ordering (mddev_t *mddev) +{ + int i, c; + mdk_rdev_t *rdev; + struct list_head *tmp; + + /* + * First, all devices must be fully functional + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n", + mdidx(mddev), get_partition_name(rdev)); + goto abort; + } + } + + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + c++; + } + if (c != mddev->nb_dev) { + MD_BUG(); + goto abort; + } + if (mddev->nb_dev != mddev->sb->raid_disks) { + LOG_ERROR("%s: [md%d] array needs %d disks, has %d, aborting.\n", + __FUNCTION__, mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev); + goto abort; + } + /* + * Now the numbering check + */ + for (i = 0; i < mddev->nb_dev; i++) { + c = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == i) + c++; + } + if (!c) { + LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i); + goto abort; + } + if (c > 1) { + LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i); + goto abort; + } + } + return 0; +abort: + return 1; +} + +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb) +{ + if (disk_active(disk)) { + sb->working_disks--; + } else { + if (disk_spare(disk)) { + sb->spare_disks--; + sb->working_disks--; + } else { + sb->failed_disks--; + } + } + sb->nr_disks--; + disk->major = disk->minor = 0; + mark_disk_removed(disk); +} + +#define BAD_MINOR \ +"%s: invalid raid minor (%x)\n" + +#define NO_SB \ +"disabled device %s, could not read superblock.\n" + +#define BAD_CSUM \ +"invalid superblock checksum on %s\n" + + +static int alloc_array_sb (mddev_t * mddev) +{ + if (mddev->sb) { + MD_BUG(); + return 0; + } + + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL); + if (!mddev->sb) { + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__); + return -ENOMEM; + } + md_clear_page(mddev->sb); + return 0; +} + +static int alloc_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) + MD_BUG(); + + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL); + if (!rdev->sb) { + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__); + return -EINVAL; + } + md_clear_page(rdev->sb); + + return 0; +} + +/* + * Function: free_disk_sb + * + */ +static void free_disk_sb (mdk_rdev_t * rdev) +{ + if (rdev->sb) { + free_page((unsigned long) rdev->sb); + rdev->sb = NULL; + rdev->sb_offset = 0; + rdev->size = 0; + } else { + if (!rdev->virtual_spare && !rdev->faulty) + 
MD_BUG(); + } +} + +/* + * Function: evms_md_read_disk_sb + * Read the MD superblock. + */ +static int evms_md_read_disk_sb (mdk_rdev_t * rdev) +{ + int rc = 0; + struct evms_logical_node *node = rdev->node; + u64 sb_offset_in_sectors; + + if (!rdev->sb) { + MD_BUG(); + return -EINVAL; + } + if (node->total_vsectors <= MD_RESERVED_SECTORS) { + LOG_DETAILS("%s is too small, total_vsectors("PFU64")\n", + evms_md_partition_name(node), node->total_vsectors); + return -EINVAL; + } + + /* + * Calculate the position of the superblock, + * it's at the end of the disk + */ + sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1); + rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1); + LOG_DEBUG("(read) %s's sb offset("PFU64") total_vsectors("PFU64")\n", + evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors); + + /* + * Read superblock + */ + rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb); + + return rc; +} + +static unsigned int calc_sb_csum (mdp_super_t * sb) +{ + unsigned int disk_csum, csum; + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + csum = csum_partial((void *)sb, MD_SB_BYTES, 0); + sb->sb_csum = disk_csum; + return csum; +} + + + +/* + * Check one RAID superblock for generic plausibility + */ + +static int check_disk_sb (mdk_rdev_t * rdev) +{ + mdp_super_t *sb; + int ret = -EINVAL; + + sb = rdev->sb; + if (!sb) { + MD_BUG(); + goto abort; + } + + if (sb->md_magic != MD_SB_MAGIC) { + goto abort; + } + + if (sb->md_minor >= MAX_MD_DEVS) { + LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor); + goto abort; + } + if (calc_sb_csum(sb) != sb->sb_csum) { + LOG_ERROR(BAD_CSUM, get_partition_name(rdev)); + goto abort; + } + + switch (sb->level) { + case -1: + case 0: + case 1: + case 5: + break; + default: + LOG_ERROR("%s: EVMS MD does not support MD level %d\n", __FUNCTION__, sb->level); + goto abort; + } + ret = 0; +abort: + return ret; +} + +static kdev_t dev_unit(kdev_t dev) +{ + unsigned int mask; + struct gendisk *hd = get_gendisk(dev); + + if (!hd) + return 0; + mask = ~((1 << hd->minor_shift) - 1); + + return MKDEV(MAJOR(dev), MINOR(dev) & mask); +} + +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev,rdev,tmp) + if (dev_unit(rdev->dev) == dev_unit(dev)) + return rdev; + + return NULL; +} + +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + ITERATE_RDEV(mddev1,rdev,tmp) + if (match_dev_unit(mddev2, rdev->dev)) + return 1; + + return 0; +} + + +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev) +{ + mdk_rdev_t *same_pdev; + + if (rdev->mddev) { + MD_BUG(); + return; + } + + same_pdev = match_dev_unit(mddev, rdev->dev); + if (same_pdev) + LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. 
True\n" + " protection against single-disk failure might be compromised.\n", + mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev)); + + list_add(&rdev->same_set, &mddev->disks); + rdev->mddev = mddev; + mddev->nb_dev++; + if (rdev->sb && disk_active(&rdev->sb->this_disk)) + mddev->nr_raid_disks++; + LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev); +} + +static void unbind_rdev_from_array (mdk_rdev_t * rdev) +{ + if (!rdev->mddev) { + MD_BUG(); + return; + } + list_del(&rdev->same_set); + MD_INIT_LIST_HEAD(&rdev->same_set); + rdev->mddev->nb_dev--; + if (rdev->sb && disk_active(&rdev->sb->this_disk)) + rdev->mddev->nr_raid_disks--; + LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev); + rdev->mddev = NULL; +} + + +/* + * Function: evms_md_export_rdev + * EVMS MD version of export_rdev() + * Discard this MD "extended" device + */ +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node) +{ + LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev)); + if (rdev->mddev) + MD_BUG(); + free_disk_sb(rdev); + list_del(&rdev->all); + MD_INIT_LIST_HEAD(&rdev->all); + if (rdev->pending.next != &rdev->pending) { + LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev)); + list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + if (rdev->node && delete_node) { + if (cur_discover_list) { + LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__, + get_partition_name(rdev)); + evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node); + } + LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev)); + DELETE(rdev->node); + rdev->node = NULL; + } + rdev->dev = 0; + rdev->faulty = 0; + kfree(rdev); +} + + +static void kick_rdev_from_array (mdk_rdev_t * rdev) +{ + LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev)); + unbind_rdev_from_array(rdev); + evms_md_export_rdev(rdev, TRUE); +} + +static void export_array (mddev_t *mddev) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + mdp_super_t *sb = mddev->sb; + + LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev)); + if (mddev->sb) { + mddev->sb = NULL; + free_page((unsigned long) sb); + } + + LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev)); + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->mddev) { + MD_BUG(); + continue; + } + kick_rdev_from_array(rdev); + } + if (mddev->nb_dev) + MD_BUG(); +} + +static void free_mddev (mddev_t *mddev) +{ + struct evms_logical_node *node; + struct evms_md *evms_md; + + if (!mddev) { + MD_BUG(); + return; + } + + node = mddev->node; + + export_array(mddev); + evms_md_size[mdidx(mddev)] = 0; + + + /* + * Make sure nobody else is using this mddev + * (careful, we rely on the global kernel lock here) + */ + while (atomic_read(&mddev->resync_sem.count) != 1) + schedule(); + while (atomic_read(&mddev->recovery_sem.count) != 1) + schedule(); + + evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); + list_del(&mddev->all_mddevs); + INIT_LIST_HEAD(&mddev->all_mddevs); + if (!list_empty(&mddev->running_mddevs)) { + list_del(&mddev->running_mddevs); + INIT_LIST_HEAD(&mddev->running_mddevs); + } + if (!list_empty(&mddev->incomplete_mddevs)) { + list_del(&mddev->incomplete_mddevs); + INIT_LIST_HEAD(&mddev->incomplete_mddevs); + } + + kfree(mddev); + if (node) { + evms_md = node->private; + evms_md->mddev = NULL; + } + MOD_DEC_USE_COUNT; + evms_md_destroy_recovery_thread(); +} + + +static void 
print_desc(mdp_disk_t *desc) +{ + printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number, + desc->raid_disk,desc->state); +} + +static void print_sb(mdp_super_t *sb) +{ + int i; + + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", + sb->major_version, sb->minor_version, sb->patch_version, + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, + sb->ctime); + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level, + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor, + sb->layout, sb->chunk_size); + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n", + sb->utime, sb->state, sb->active_disks, sb->working_disks, + sb->failed_disks, sb->spare_disks, + sb->sb_csum, sb->events_lo); + + for (i = 0; i < MD_SB_DISKS; i++) { + mdp_disk_t *desc; + + desc = sb->disks + i; + if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) { + printk(" D %2d: ", i); + print_desc(desc); + } + } + printk(" THIS: "); + print_desc(&sb->this_disk); + +} + +static void print_rdev(mdk_rdev_t *rdev) +{ + printk("rdev %s: SZ:%08ld F:%d DN:%d ", + get_partition_name(rdev), + rdev->size, rdev->faulty, rdev->desc_nr); + if (rdev->sb) { + printk("rdev superblock:\n"); + print_sb(rdev->sb); + } else + printk("no rdev superblock!\n"); +} + +void evms_md_print_devices (void) +{ + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + printk("\n"); + printk(": **********************************\n"); + printk(": * <COMPLETE RAID STATE PRINTOUT> *\n"); + printk(": **********************************\n"); + ITERATE_MDDEV(mddev,tmp) { + printk("md%d: ", mdidx(mddev)); + + ITERATE_RDEV(mddev,rdev,tmp2) + printk("<%s>", get_partition_name(rdev)); + + if (mddev->sb) { + printk(" array superblock:\n"); + print_sb(mddev->sb); + } else + printk(" no array superblock.\n"); + + ITERATE_RDEV(mddev,rdev,tmp2) + print_rdev(rdev); + } + printk(": **********************************\n"); + printk("\n"); +} + +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb_equal(): out of memory!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) +{ + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) + + return 1; + + return 0; +} + +/* + * Function: evms_md_find_rdev_all + * EVMS MD version of find_rdev_all() + * Search entire all_raid_disks for "node" + * Return the MD "extended" device if found.
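+ * (implemented below as a simple linear scan of all_raid_disks)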
+ */ +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node) +{ + struct list_head *tmp; + mdk_rdev_t *rdev; + + tmp = all_raid_disks.next; + while (tmp != &all_raid_disks) { + rdev = list_entry(tmp, mdk_rdev_t, all); + if (rdev->node == node) + return rdev; + tmp = tmp->next; + } + return NULL; +} + +/* + * Function: evms_md_find_mddev_all + */ +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node) +{ + struct list_head *tmp; + mddev_t *mddev; + + ITERATE_MDDEV(mddev,tmp) { + if (mddev->node == node) + return mddev; + } + return NULL; +} + + +/* + * Function: evms_md_write_disk_sb + * EVMS MD version of write_disk_sb + */ +static int evms_md_write_disk_sb(mdk_rdev_t * rdev) +{ + unsigned long size; + u64 sb_offset_in_sectors; + + if (!rdev->sb) { + MD_BUG(); + return 1; + } + if (rdev->faulty) { + MD_BUG(); + return 1; + } + if (rdev->sb->md_magic != MD_SB_MAGIC) { + MD_BUG(); + return 1; + } + + sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1); + if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) { + LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n", + get_partition_name(rdev), + rdev->sb_offset, + (unsigned long)(sb_offset_in_sectors >> 1)); + goto skip; + } + /* + * If the disk went offline meanwhile and it's just a spare, then + * its size has changed to zero silently, and the MD code does + * not yet know that it's faulty. + */ + size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1); + if (size != rdev->size) { + LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n", + get_partition_name(rdev), rdev->size, size); + goto skip; + } + + LOG_DETAILS("(write) %s's sb offset: "PFU64"\n",get_partition_name(rdev), sb_offset_in_sectors); + + INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb); + +skip: + return 0; +} + +static int evms_md_sync_sbs(mddev_t * mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + mdp_disk_t * disk; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->virtual_spare || rdev->faulty) + continue; + + /* copy everything from the master */ + memcpy(rdev->sb, mddev->sb, sizeof(mdp_super_t)); + + /* this_disk is unique, copy it from the master */ +// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr]; + // use the SB disk array since if update occurred on normal shutdown + // the rdevs may be out of date. 
+ disk = evms_md_find_disk(mddev, rdev->dev); + if (disk) { + rdev->sb->this_disk = *disk; + } + + rdev->sb->sb_csum = calc_sb_csum(rdev->sb); + } + return 0; +} + +static int evms_md_update_sb_sync(mddev_t * mddev, int clean) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int rc = 0; + int found = FALSE; + + ITERATE_RDEV(mddev,rdev,tmp) { + + if (rdev->virtual_spare || rdev->faulty) + continue; + + if ((rc = evms_md_read_disk_sb(rdev))) { + LOG_ERROR("%s: error reading superblock on %s!\n", + __FUNCTION__, evms_md_partition_name(rdev->node)); + break; + } + + if ((rc = check_disk_sb(rdev))) { + LOG_ERROR("%s: %s has invalid sb!\n", + __FUNCTION__, evms_md_partition_name(rdev->node)); + break; + } + + rdev->desc_nr = rdev->sb->this_disk.number; + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor); + + /* copy master superblock from the first good rdev */ + if (!found) { + found = TRUE; + memcpy(mddev->sb, rdev->sb, sizeof(mdp_super_t)); + if (clean) + mddev->sb->state |= 1 << MD_SB_CLEAN; + else + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + } + } + if (!rc && found) { + evms_md_update_sb(mddev); + } else { + LOG_SERIOUS("%s: BUG! BUG! superblocks will not be updated!\n", __FUNCTION__); + } + return rc; + +} + +int evms_md_update_sb(mddev_t * mddev) +{ + int err, count = 100; + struct list_head *tmp; + mdk_rdev_t *rdev; + + +repeat: + mddev->sb->utime = CURRENT_TIME; + if ((++mddev->sb->events_lo)==0) + ++mddev->sb->events_hi; + + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) { + /* + * oops, this 64-bit counter should never wrap. + * Either we are in around ~1 trillion A.C., assuming + * 1 reboot per second, or we have a bug: + */ + MD_BUG(); + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff; + } + evms_md_sync_sbs(mddev); + + /* + * do not write anything to disk if using + * nonpersistent superblocks + */ + if (mddev->sb->not_persistent) + return 0; + + LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev)); + + err = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (!rdev->virtual_spare && !rdev->faulty) { + LOG_DETAILS(" %s [events: %x]", + get_partition_name(rdev), + rdev->sb->events_lo); + err += evms_md_write_disk_sb(rdev); + } else { + if (rdev->faulty) + LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev)); + if (rdev->virtual_spare) + LOG_DETAILS(" skipping virtual spare.\n"); + } + } + if (err) { + if (--count) { + LOG_WARNING("errors occurred during superblock update, repeating\n"); + goto repeat; + } + LOG_ERROR("excessive errors occurred during superblock update, exiting\n"); + } + return 0; +} + +/* + * Function: evms_md_import_device + * Ensure that node is not yet imported.
+ * Read and validate the MD super block on this device + * Add to the global MD "extended" devices list (all_raid_disks) + * + */ +static int evms_md_import_device (struct evms_logical_node **discover_list, + struct evms_logical_node *node) +{ + int err; + mdk_rdev_t *rdev; + + LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node)); + + if (evms_md_find_rdev_all(node)) { + LOG_DEBUG("%s exists\n", evms_md_partition_name(node)); + return -EEXIST; + } + + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL); + if (!rdev) { + LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node)); + return -ENOMEM; + } + memset(rdev, 0, sizeof(*rdev)); + + if ((err = alloc_disk_sb(rdev))) + goto abort_free; + + rdev->node = node; /* set this for evms_md_read_disk_sb() */ + + rdev->desc_nr = -1; + rdev->faulty = 0; + + if (!node->total_vsectors) { + LOG_ERROR("%s has zero size!\n", evms_md_partition_name(node)); + err = -EINVAL; + goto abort_free; + } + + if ((err = evms_md_read_disk_sb(rdev))) { + LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node)); + goto abort_free; + } + if ((err = check_disk_sb(rdev))) { + LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node)); + goto abort_free; + } + rdev->desc_nr = rdev->sb->this_disk.number; + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor); + LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr); + list_add(&rdev->all, &all_raid_disks); + MD_INIT_LIST_HEAD(&rdev->pending); + + if (rdev->faulty && rdev->sb) + free_disk_sb(rdev); + + return 0; + +abort_free: + if (rdev->sb) { + free_disk_sb(rdev); + } + kfree(rdev); + return err; +} + + + +/* + * Function: evms_md_analyze_sbs + * EVMS MD version of analyze_sbs() + */ +static int evms_md_analyze_sbs (mddev_t * mddev) +{ + int out_of_date = 0, i; + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev, *rdev2, *freshest; + mdp_super_t *sb; + + LOG_ENTRY_EXIT("Analyzing all superblocks...\n"); + /* + * Verify the RAID superblock on each real device + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + MD_BUG(); + goto abort; + } + if (!rdev->sb) { + MD_BUG(); + goto abort; + } + if (check_disk_sb(rdev)) + goto abort; + } + + /* + * The superblock constant part has to be the same + * for all disks in the array. + */ + sb = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + if (!sb) { + sb = rdev->sb; + continue; + } + if (!sb_equal(sb, rdev->sb)) { + LOG_WARNING("kick out %s\n",get_partition_name(rdev)); + kick_rdev_from_array(rdev); + continue; + } + } + + /* + * OK, we have all disks and the array is ready to run. Let's + * find the freshest superblock, that one will be the superblock + * that represents the whole array. + */ + if (!mddev->sb) + if (alloc_array_sb(mddev)) + goto abort; + sb = mddev->sb; + freshest = NULL; + + ITERATE_RDEV(mddev,rdev,tmp) { + __u64 ev1, ev2; + /* + * if the checksum is invalid, use the superblock + * only as a last resort. 
(decrease it's age by + * one event) + */ + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { + if (rdev->sb->events_lo || rdev->sb->events_hi) + if ((rdev->sb->events_lo--)==0) + rdev->sb->events_hi--; + } + LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo); + + if (!freshest) { + freshest = rdev; + continue; + } + /* + * Find the newest superblock version + */ + ev1 = md_event(rdev->sb); + ev2 = md_event(freshest->sb); + if (ev1 != ev2) { + out_of_date = 1; + if (ev1 > ev2) + freshest = rdev; + } + } + if (out_of_date) { + LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest)); + } + memcpy (sb, freshest->sb, sizeof(*sb)); + + /* + * at this point we have picked the 'best' superblock + * from all available superblocks. + * now we validate this superblock and kick out possibly + * failed disks. + */ + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Kick all non-fresh devices + */ + __u64 ev1, ev2; + ev1 = md_event(rdev->sb); + ev2 = md_event(sb); + if (ev1 < ev2) { + if (ev1) { + LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev)); + kick_rdev_from_array(rdev); + continue; + } else { + LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev)); + } + } + } + + /* + * Remove unavailable and faulty devices ... + * + * note that if an array becomes completely unrunnable due to + * missing devices, we do not write the superblock back, so the + * administrator has a chance to fix things up. The removal thus + * only happens if it's nonfatal to the contents of the array. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + int found; + mdp_disk_t *desc; + + desc = sb->disks + i; + + /* + * We kick faulty devices/descriptors immediately. + * + * Note: multipath devices are a special case. Since we + * were able to read the superblock on the path, we don't + * care if it was previously marked as faulty, it's up now + * so enable it. + */ + if (disk_faulty(desc) && mddev->sb->level != -4) { + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr != desc->number) + continue; + LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev)); + kick_rdev_from_array(rdev); + found = 1; + break; + } + if (!found) { + LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n", + __FUNCTION__ ,mdidx(mddev), desc->number); + } + /* + * Don't call remove_descriptor(), + * let the administrator remove it from the user-land */ + /* remove_descriptor(desc, sb); */ + continue; + } else if (disk_faulty(desc)) { + /* + * multipath entry marked as faulty, unfaulty it + */ + kdev_t dev; + + dev = MKDEV(desc->major, desc->minor); + + rdev = evms_md_find_rdev(mddev, dev); + if (rdev) + mark_disk_spare(desc); + else { + LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n", + __FUNCTION__ ,mdidx(mddev), desc->number); + /* + * Don't call remove_descriptor(), + * let the administrator remove it from the user-land */ + /* remove_descriptor(desc, sb); */ + } + } + + /* + * Is this device present in the rdev ring? + */ + found = 0; + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * Multi-path IO special-case: since we have no + * this_disk descriptor at auto-detect time, + * we cannot check rdev->number. + * We can check the device though. 
+ */ + if ((sb->level == -4) && (rdev->dev == + MKDEV(desc->major,desc->minor))) { + found = 1; + break; + } + if (rdev->desc_nr == desc->number) { + found = 1; + break; + } + } + if (found) + continue; + + LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n", + mdidx(mddev), desc->number); + remove_descriptor(desc, sb); + } + + /* + * Kick all rdevs that are not in the + * descriptor array: + */ + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) + kick_rdev_from_array(rdev); + } + + /* + * Do a final reality check. + */ + if (mddev->sb->level != -4) { + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->desc_nr == -1) { + MD_BUG(); + goto abort; + } + /* + * is the desc_nr unique? + */ + ITERATE_RDEV(mddev,rdev2,tmp2) { + if ((rdev2 != rdev) && + (rdev2->desc_nr == rdev->desc_nr)) { + MD_BUG(); + goto abort; + } + } + } + } + +#define OLD_VERSION KERN_ALERT \ +"md%d: unsupported raid array version %d.%d.%d\n" + +#define NOT_CLEAN_IGNORE KERN_ERR \ +"md%d: raid array is not clean -- starting background reconstruction\n" + + /* + * Check if we can support this RAID array + */ + if (sb->major_version != MD_MAJOR_VERSION || + sb->minor_version > MD_MINOR_VERSION) { + + LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n", + mdidx(mddev), + sb->major_version, + sb->minor_version, + sb->patch_version); + goto abort; + } + + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) || + (sb->level == 4) || (sb->level == 5))) + LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n", + mdidx(mddev), sb->level); + + LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n"); + return 0; +abort: + LOG_WARNING("ABORT analyze_sbs()!!!\n"); + return 1; +} + + +static int device_size_calculation (mddev_t * mddev) +{ + int data_disks = 0, persistent; + //unsigned int readahead; + mdp_super_t *sb = mddev->sb; + struct list_head *tmp; + mdk_rdev_t *rdev; + + /* + * Do device size calculation. Bail out if too small. 
+ * (we have to do this after having validated chunk_size, + * because device size has to be modulo chunk_size) + */ + persistent = !mddev->sb->not_persistent; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + if (rdev->size) { + LOG_DEFAULT("%s: already calculated %s\n", __FUNCTION__, get_partition_name(rdev)); + continue; + } + rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent); + if (rdev->size < sb->chunk_size / 1024) { + LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n", + get_partition_name(rdev), rdev->size, sb->chunk_size / 1024); + return -EINVAL; + } + } + + switch (sb->level) { + case -4: + data_disks = 1; + break; + case -3: + data_disks = 1; + break; + case -2: + data_disks = 1; + break; + case -1: + zoned_raid_size(mddev); + data_disks = 1; + break; + case 0: + zoned_raid_size(mddev); + data_disks = sb->raid_disks; + break; + case 1: + data_disks = 1; + break; + case 4: + case 5: + data_disks = sb->raid_disks-1; + break; + default: + LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level); + goto abort; + } + if (!evms_md_size[mdidx(mddev)]) + evms_md_size[mdidx(mddev)] = sb->size * data_disks; + + return 0; +abort: + return 1; +} + + +#define TOO_BIG_CHUNKSIZE KERN_ERR \ +"too big chunk_size: %d > %d\n" + +#define TOO_SMALL_CHUNKSIZE KERN_ERR \ +"too small chunk_size: %d < %ld\n" + +#define BAD_CHUNKSIZE KERN_ERR \ +"no chunksize specified, see 'man raidtab'\n" + +static int do_md_run (mddev_t * mddev) +{ + int pnum, err; + int chunk_size; + struct list_head *tmp; + mdk_rdev_t *rdev; + + + if (!mddev->nb_dev) { + MD_BUG(); + return -EINVAL; + } + + if (mddev->pers) + return -EBUSY; + + /* + * Resize disks to align partitions size on a given + * chunk size. + */ + evms_md_size[mdidx(mddev)] = 0; + + /* + * Analyze all RAID superblock(s) + */ + if (evms_md_analyze_sbs(mddev)) { + MD_BUG(); + return -EINVAL; + } + + mddev->chunk_size = chunk_size = mddev->sb->chunk_size; + pnum = level_to_pers(mddev->sb->level); + + if ((pnum != MULTIPATH) && (pnum != RAID1)) { + if (!chunk_size) { + /* + * 'default chunksize' in the old md code used to + * be PAGE_SIZE, baaad. + * we abort here to be on the safe side. We dont + * want to continue the bad practice. + */ + printk(BAD_CHUNKSIZE); + return -EINVAL; + } + if (chunk_size > MAX_CHUNK_SIZE) { + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE); + return -EINVAL; + } + /* + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE + */ + if ( (1 << ffz(~chunk_size)) != chunk_size) { + MD_BUG(); + return -EINVAL; + } + if (chunk_size < PAGE_SIZE) { + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE); + return -EINVAL; + } + } else + if (chunk_size) + printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level); + + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + if (!pers[pnum]) + { +#ifdef CONFIG_KMOD + char module_name[80]; + sprintf (module_name, "md-personality-%d", pnum); + request_module (module_name); + if (!pers[pnum]) +#endif + { + printk(KERN_ERR "personality %d is not loaded!\n", + pnum); + return -EINVAL; + } + } + if (device_size_calculation(mddev)) + return -EINVAL; + + /* + * Drop all container device buffers, from now on + * the only valid external interface is through the md + * device. 
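+ * (invalidate_device() below drops any buffers cached for each + * member device.)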
+ * Also find largest hardsector size + */ + md_hardsect_sizes[mdidx(mddev)] = 512; + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) + continue; + invalidate_device(rdev->dev, 1); +/* if (get_hardsect_size(rdev->dev) + > md_hardsect_sizes[mdidx(mddev)]) + md_hardsect_sizes[mdidx(mddev)] = + get_hardsect_size(rdev->dev); */ + if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) { + md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size; + } + + } + md_blocksizes[mdidx(mddev)] = 1024; + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)]) + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)]; + + mddev->pers = pers[pnum]; + + err = mddev->pers->run(mddev); + if (err) { + LOG_WARNING("%s: pers->run() failed.\n", __FUNCTION__); + mddev->pers = NULL; + return -EINVAL; + } + mddev->sb->state &= ~(1 << MD_SB_CLEAN); + + evms_md_update_sb(mddev); + + if (incomplete_mddev(mddev)) { + LOG_DEFAULT("%s: [md%d] was incomplete!\n", __FUNCTION__, mdidx(mddev)); + list_del(&mddev->incomplete_mddevs); + INIT_LIST_HEAD(&mddev->incomplete_mddevs); + } + + list_add(&mddev->running_mddevs, &running_mddevs); + + return (0); +} + +#undef TOO_BIG_CHUNKSIZE +#undef BAD_CHUNKSIZE + + +#define OUT(x) do { err = (x); goto out; } while (0) + + +#define STILL_MOUNTED KERN_WARNING \ +"md%d still mounted.\n" +#define STILL_IN_USE \ +"md%d still in use.\n" + +static int do_md_stop (mddev_t * mddev, int ro) +{ + int err = 0, resync_interrupted = 0, clean = 0; + kdev_t dev = mddev_to_kdev(mddev); + + if (atomic_read(&mddev->active)>1) { + printk(STILL_IN_USE, mdidx(mddev)); + OUT(-EBUSY); + } + + if (mddev->pers) { + /* + * It is safe to call stop here, it only frees private + * data. Also, it tells us if a device is unstoppable + * (eg. resyncing is in progress) + */ + if (mddev->pers->stop_resync) + if (mddev->pers->stop_resync(mddev)) + resync_interrupted = 1; + + if (mddev->recovery_running) + evms_cs_interrupt_thread(evms_md_recovery_thread); + + /* + * This synchronizes with signal delivery to the + * resync or reconstruction thread. It also nicely + * hangs the process if some reconstruction has not + * finished. + */ + down(&mddev->recovery_sem); + up(&mddev->recovery_sem); + + invalidate_device(dev, 1); + + if (ro) { + if (mddev->ro) + OUT(-ENXIO); + mddev->ro = 1; + mddev->node->plugin = &md_plugin_header; + } else { + if (mddev->ro) + set_device_ro(dev, 0); + if (mddev->pers->stop(mddev)) { + if (mddev->ro) + set_device_ro(dev, 1); + OUT(-EBUSY); + } + if (mddev->ro) + mddev->ro = 0; + } + if (mddev->sb) { + /* + * mark it clean only if there was no resync + * interrupted. 
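+ * Marking the array clean after an interrupted resync would let + * the next start skip a reconstruction that is still needed.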
+ */ + if (!mddev->recovery_running && !resync_interrupted) { + LOG_DEBUG("%s: marking sb clean...\n", __FUNCTION__); + clean = 1; + } + evms_md_update_sb_sync(mddev, clean); + } + if (ro) + set_device_ro(dev, 1); + } + + /* + * Free resources if final stop + */ + if (!ro) { + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev)); + free_mddev(mddev); + + } else + printk (KERN_INFO + "md%d switched to read-only mode.\n", mdidx(mddev)); +out: + return err; +} + + +static int evms_md_run_array (struct evms_logical_node ** discover_list, mddev_t *mddev) +{ + mdk_rdev_t *rdev; + struct list_head *tmp; + int err = 0; + uint flags = 0; + + if (mddev->disks.prev == &mddev->disks) { + MD_BUG(); + return -EINVAL; + } + + LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) ); + + ITERATE_RDEV(mddev,rdev,tmp) { + LOG_DETAILS(" <%s>\n", get_partition_name(rdev)); + } + + err = do_md_run (mddev); + if (!err) { + /* + * remove all nodes consumed by this md device from the discover list + */ + ITERATE_RDEV(mddev,rdev,tmp) { + LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev)); + evms_cs_remove_logical_node_from_list(discover_list,rdev->node); + flags |= rdev->node->flags; + } + err = evms_md_create_logical_node(discover_list,mddev,flags); + if (!err) { + exported_nodes++; + } + } else { + LOG_WARNING("%s: could not start [md%d] containing: \n",__FUNCTION__,mdidx(mddev)); + ITERATE_RDEV(mddev,rdev,tmp) { + LOG_WARNING(" (%s, desc_nr=%d)\n", get_partition_name(rdev), rdev->desc_nr); + } + LOG_WARNING("%s: will try restart [md%d] again later.\n",__FUNCTION__,mdidx(mddev)); + + mddev->sb_dirty = 0; + } + return err; +} + +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list, mddev_t *mddev) +{ + mdk_rdev_t *rdev; + + LOG_DEFAULT("%s [md%d]\n", + __FUNCTION__, mdidx(mddev)); + if (evms_md_run_array(discover_list,mddev) == 0) { + /* + * We succeeded running this MD device. + * Now read MD superblock on this newly created MD node. + */ + if (mddev->node && + (evms_md_import_device(discover_list,mddev->node) == 0)) { + /* + * Yes, there is a superblock on this MD node. + * We probably have a MD stacking case here. + */ + rdev = evms_md_find_rdev_all(mddev->node); + if (rdev) { + list_add(&rdev->pending, &pending_raid_disks); + evms_md_run_devices(discover_list); + } else { + LOG_WARNING("%s: imported %s but no rdev was found!\n", + __FUNCTION__, + evms_md_partition_name(mddev->node)); + } + } + } + if (incomplete_mddev(mddev)) { + list_del(&mddev->incomplete_mddevs); + INIT_LIST_HEAD(&mddev->incomplete_mddevs); + } +} + +/* + * lets try to run arrays based on all disks that have arrived + * until now. (those are in the ->pending list) + * + * the method: pick the first pending disk, collect all disks with + * the same UUID, remove all from the pending list and put them into + * the 'same_array' list. Then order this list based on superblock + * update time (freshest comes first), kick out 'old' disks and + * compare superblocks. If everything's fine then run it. 
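+ * + * In outline, the loop below does roughly: + * while (pending_raid_disks is not empty) + * rdev0 = first pending rdev + * move every pending rdev whose UUID matches rdev0 to 'candidates' + * look up (or allocate) the mddev for rdev0->sb->md_minor + * bind the candidates to that mddev + * run the array if enough members have arrived, otherwise + * park it on the incomplete_mddevs list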
+ * + * If "unit" is allocated, then bump its reference count + */ +static void evms_md_run_devices (struct evms_logical_node **discover_list) +{ + struct list_head candidates; + struct list_head *tmp; + mdk_rdev_t *rdev0, *rdev; + mddev_t *mddev; + kdev_t md_kdev; + + + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + while (pending_raid_disks.next != &pending_raid_disks) { + rdev0 = list_entry(pending_raid_disks.next, + mdk_rdev_t, pending); + MD_INIT_LIST_HEAD(&candidates); + ITERATE_RDEV_PENDING(rdev,tmp) { + if (uuid_equal(rdev0, rdev)) { + if (!sb_equal(rdev0->sb, rdev->sb)) { + LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\ + get_partition_name(rdev),get_partition_name(rdev0)); + continue; + } + list_del(&rdev->pending); + list_add(&rdev->pending, &candidates); + } + } + + /* + * now we have a set of devices, with all of them having + * mostly sane superblocks. It's time to allocate the + * mddev. + */ + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor); + mddev = kdev_to_mddev(md_kdev); + if (mddev && (!incomplete_mddev(mddev))) { + LOG_DETAILS("md%d already running, cannot run %s\n", + mdidx(mddev), get_partition_name(rdev0)); + + ITERATE_RDEV(mddev,rdev,tmp) { + /* + * This is EVMS re-discovery! + * Remove all nodes consumed by this md device from the discover list + */ + evms_cs_remove_logical_node_from_list(discover_list,rdev->node); + } + + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + if (evms_md_find_mddev_all(rdev->node)) + /* + * We have found an MD superblock on top of a running MD array. + * Delete rdev but keep the MD array. + */ + evms_md_export_rdev(rdev, FALSE); + else + evms_md_export_rdev(rdev, TRUE); + } + continue; + } + + if (!mddev) { + mddev = alloc_mddev(md_kdev); + if (mddev == NULL) { + LOG_ERROR("cannot allocate memory for md drive.\n"); + break; + } + LOG_DETAILS("created md%d\n", mdidx(mddev)); + } else { + LOG_DETAILS("%s: found INCOMPLETE md%d\n", __FUNCTION__, mdidx(mddev)); + } + + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { + bind_rdev_to_array(rdev, mddev); + list_del(&rdev->pending); + MD_INIT_LIST_HEAD(&rdev->pending); + } + + if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) || + (mddev->nb_dev == rdev0->sb->nr_disks)) { + evms_md_run_array(discover_list,mddev); + } else { + LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n", + mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks); + list_add(&mddev->incomplete_mddevs, &incomplete_mddevs); + ITERATE_RDEV(mddev,rdev,tmp) { + evms_cs_remove_logical_node_from_list(discover_list,rdev->node); + } + } + } + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__); +} + +void evms_md_recover_arrays(void) +{ + if (!evms_md_recovery_thread) { + MD_BUG(); + return; + } + evms_cs_wakeup_thread(evms_md_recovery_thread); +} + +int evms_md_error_dev( + mddev_t *mddev, + kdev_t dev) +{ + mdk_rdev_t * rdev; + + rdev = evms_md_find_rdev(mddev, dev); + if (rdev) { + return evms_md_error(mddev,rdev->node); + } else { + LOG_ERROR("%s: could not find %s in md%d\n", + __FUNCTION__, org_partition_name(dev), mdidx(mddev)); + return 0; + } +} + +int evms_md_error( + mddev_t *mddev, + struct evms_logical_node *node) +{ + mdk_rdev_t * rrdev; + + /* check for NULL first */ + if (!mddev) { + MD_BUG(); + return 0; + } + LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n", + mdidx(mddev), node->name, + __builtin_return_address(0),__builtin_return_address(1), + __builtin_return_address(2),__builtin_return_address(3)); + + rrdev = evms_md_find_rdev_from_node(mddev, 
node); + if (!rrdev || rrdev->faulty) + return 0; + if (!mddev->pers->error_handler + || mddev->pers->error_handler(mddev,node) <= 0) { + free_disk_sb(rrdev); + rrdev->faulty = 1; + } else + return 1; + /* + * if recovery was running, stop it now. + */ + if (mddev->pers->stop_resync) + mddev->pers->stop_resync(mddev); + if (mddev->recovery_running) + evms_cs_interrupt_thread(evms_md_recovery_thread); + evms_md_recover_arrays(); + + return 0; +} + +int evms_register_md_personality (int pnum, mdk_personality_t *p) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + if (pers[pnum]) { + MD_BUG(); + return -EBUSY; + } + + pers[pnum] = p; + LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum); + return 0; +} + +int evms_unregister_md_personality (int pnum) +{ + if (pnum >= MAX_PERSONALITY) { + MD_BUG(); + return -EINVAL; + } + + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name); + pers[pnum] = NULL; + return 0; +} + +mdp_disk_t *evms_md_get_spare(mddev_t *mddev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + mdk_rdev_t *rdev; + int i, j; + + for (i = 0, j = 0; j < mddev->nb_dev; i++) { + rdev = evms_md_find_rdev_nr(mddev, i); + if (rdev == NULL) + continue; + j++; + if (rdev->faulty) + continue; + if (!rdev->sb) { + if (!rdev->virtual_spare) + MD_BUG(); + continue; + } + disk = &sb->disks[rdev->desc_nr]; + if (disk_faulty(disk)) { + MD_BUG(); + continue; + } + if (disk_active(disk)) + continue; + return disk; + } + return NULL; +} + +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev) +{ + mdp_super_t *sb = mddev->sb; + mdp_disk_t *disk; + int i; + + for (i=0; i < MD_SB_DISKS; i++) { + disk = &sb->disks[i]; + if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev))) + return disk; + } + return NULL; +} + +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK]; +void evms_md_sync_acct( + kdev_t dev, + unsigned long nr_sectors) +{ + unsigned int major = MAJOR(dev); + unsigned int index; + + index = disk_index(dev); + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + return; + + sync_io[major][index] += nr_sectors; +} + +static int is_mddev_idle(mddev_t *mddev) +{ + mdk_rdev_t * rdev; + struct list_head *tmp; + int idle; + unsigned long curr_events; + + idle = 1; + ITERATE_RDEV(mddev,rdev,tmp) { + int major = MAJOR(rdev->dev); + int idx = disk_index(rdev->dev); + + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR)) + continue; + + curr_events = kstat.dk_drive_rblk[major][idx] + + kstat.dk_drive_wblk[major][idx] ; + curr_events -= sync_io[major][idx]; + if ((curr_events - rdev->last_events) > 32) { + rdev->last_events = curr_events; + idle = 0; + } + } + return idle; +} + +MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait); + +void evms_md_done_sync(mddev_t *mddev, int blocks, int ok) +{ + /* another "blocks" (512byte) blocks have been synced */ + atomic_sub(blocks, &mddev->recovery_active); + wake_up(&mddev->recovery_wait); + if (!ok) { + // stop recovery, signal do_sync .... 
+ } +} + +#define SYNC_MARKS 10 +#define SYNC_MARK_STEP (3*HZ) +int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare) +{ + mddev_t *mddev2; + unsigned int max_sectors, currspeed, + j, window, err, serialize; + unsigned long mark[SYNC_MARKS]; + unsigned long mark_cnt[SYNC_MARKS]; + int last_mark,m; + struct list_head *tmp; + unsigned long last_check; + + + err = down_interruptible(&mddev->resync_sem); + if (err) + goto out_nolock; + +recheck: + serialize = 0; + ITERATE_MDDEV(mddev2,tmp) { + if (mddev2 == mddev) + continue; + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) { + LOG_DEFAULT("delaying resync of md%d until md%d " + "has finished resync (they share one or more physical units)\n", + mdidx(mddev), mdidx(mddev2)); + serialize = 1; + break; + } + } + if (serialize) { + interruptible_sleep_on(&evms_resync_wait); + if (md_signal_pending(current)) { + md_flush_signals(); + err = -EINTR; + goto out; + } + goto recheck; + } + + mddev->curr_resync = 1; + + max_sectors = mddev->sb->size<<1; + + LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev)); + LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n", + sysctl_speed_limit_min); + LOG_DEFAULT("using maximum available idle IO bandwith " + "(but not more than %d KB/sec) for reconstruction.\n", + sysctl_speed_limit_max); + + /* + * Resync has low priority. + */ +#ifdef O1_SCHEDULER + set_user_nice(current,19); +#else + current->nice = 19; +#endif + + is_mddev_idle(mddev); /* this also initializes IO event counters */ + for (m = 0; m < SYNC_MARKS; m++) { + mark[m] = jiffies; + mark_cnt[m] = 0; + } + last_mark = 0; + mddev->resync_mark = mark[last_mark]; + mddev->resync_mark_cnt = mark_cnt[last_mark]; + + /* + * Tune reconstruction: + */ + window = MD_READAHEAD*(PAGE_SIZE/512); + LOG_DEFAULT("using %dk window, over a total of %d blocks.\n", + window/2,max_sectors/2); + + atomic_set(&mddev->recovery_active, 0); + init_waitqueue_head(&mddev->recovery_wait); + last_check = 0; + for (j = 0; j < max_sectors;) { + int sectors; + + sectors = mddev->pers->sync_request(mddev, j); + + if (sectors < 0) { + err = sectors; + goto out; + } + atomic_add(sectors, &mddev->recovery_active); + j += sectors; + mddev->curr_resync = j; + + if (last_check + window > j) + continue; + + last_check = j; + + run_task_queue(&tq_disk); + + repeat: + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) { + /* step marks */ + int next = (last_mark+1) % SYNC_MARKS; + + mddev->resync_mark = mark[next]; + mddev->resync_mark_cnt = mark_cnt[next]; + mark[next] = jiffies; + mark_cnt[next] = j - atomic_read(&mddev->recovery_active); + last_mark = next; + } + + + if (md_signal_pending(current)) { + /* + * got a signal, exit. + */ + mddev->curr_resync = 0; + LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n"); + md_flush_signals(); + err = -EINTR; + goto out; + } + + /* + * this loop exits only if either when we are slower than + * the 'hard' speed limit, or the system was IO-idle for + * a jiffy. + * the system might be non-idle CPU-wise, but we only care + * about not overloading the IO subsystem. 
(things like an + * e2fsck being done on the RAID array should execute fast) + */ + if (md_need_resched(current)) + schedule(); + + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; + + if (currspeed > sysctl_speed_limit_min) { +#ifdef O1_SCHEDULER + set_user_nice(current,19); +#else + current->nice = 19; +#endif + + if ((currspeed > sysctl_speed_limit_max) || + !is_mddev_idle(mddev)) { +#ifdef O1_SCHEDULER + set_current_state(TASK_INTERRUPTIBLE); +#else + current->state = TASK_INTERRUPTIBLE; +#endif + md_schedule_timeout(HZ/4); + goto repeat; + } + } else +#ifdef O1_SCHEDULER + set_user_nice(current,-20); +#else + current->nice = -20; +#endif + } + LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev)); + err = 0; + /* + * this also signals 'finished resyncing' to md_stop + */ +out: + wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0); + up(&mddev->resync_sem); +out_nolock: + mddev->curr_resync = 0; + wake_up(&evms_resync_wait); + return err; +} + + + +/* + * This is a kernel thread which syncs a spare disk with the active array + * + * the amount of foolproofing might seem to be a tad excessive, but an + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs + * of my root partition with the first 0.5 gigs of my /home partition ... so + * i'm a bit nervous ;) + */ +void evms_md_do_recovery(void *data) +{ + int err; + mddev_t *mddev; + mdp_super_t *sb; + mdp_disk_t *spare; + struct list_head *tmp; + + LOG_DEFAULT("recovery thread got woken up ...\n"); +restart: + ITERATE_MDDEV(mddev,tmp) { + + sb = mddev->sb; + if (!sb) + continue; + if (mddev->recovery_running) + continue; + if (sb->active_disks == sb->raid_disks) + continue; + if (!sb->spare_disks) { + LOG_ERROR(" [md%d] no spare disk to reconstruct array! " + "-- continuing in degraded mode\n", mdidx(mddev)); + continue; + } + + spare = NULL; + + if (!spare) { + /* + * now here we get the spare and resync it. + */ + spare = evms_md_get_spare(mddev); + } + if (!spare) + continue; + + LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n", + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor))); + if (!mddev->pers->diskop) + continue; + + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) + continue; + + down(&mddev->recovery_sem); + mddev->recovery_running = 1; + err = evms_md_do_sync(mddev, spare); + if (err == -EIO) { + LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n", + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor))); + if (!disk_faulty(spare)) { + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE); + mark_disk_faulty(spare); + mark_disk_nonsync(spare); + mark_disk_inactive(spare); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + } + } else + if (disk_faulty(spare)) + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + if (err == -EINTR || err == -ENOMEM) { + /* + * Recovery got interrupted, or ran out of mem ... + * signal back that we have finished using the array. 
+ */ + mddev->pers->diskop(mddev, &spare, + DISKOP_SPARE_INACTIVE); + up(&mddev->recovery_sem); + mddev->recovery_running = 0; + continue; + } else { + mddev->recovery_running = 0; + up(&mddev->recovery_sem); + } + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + evms_md_update_sb(mddev); + goto restart; + } + LOG_DEFAULT("recovery thread finished ...\n"); + +} + +static void evms_md_create_recovery_thread(void) +{ + static char * name = "evms_mdrecoveryd"; + + if (!evms_md_recovery_thread) { + /* Create MD recovery thread */ + evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name); + if (!evms_md_recovery_thread) + LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__); + } +} + +static void evms_md_destroy_recovery_thread(void) +{ + if (evms_md_recovery_thread && !MOD_IN_USE) { + /* Destroy MD recovery thread */ + evms_cs_unregister_thread(evms_md_recovery_thread); + evms_md_recovery_thread = NULL; + } +} + +/** + * evms_md_create_logical_node + **/ +static int evms_md_create_logical_node( + struct evms_logical_node **discover_list, + mddev_t *mddev, + uint flags) +{ + int rc; + struct evms_md *evms_md = NULL; + struct evms_logical_node *newnode = NULL; + struct evms_plugin_header *hdr = NULL; + struct evms_plugin_fops *fops = NULL; + + rc = evms_cs_allocate_logical_node(&newnode); + if (!rc) { + evms_md = kmalloc(sizeof(*evms_md), GFP_KERNEL); + if (!evms_md) { + rc = -ENOMEM; + } else { + + memset(evms_md,0,sizeof(*evms_md)); + evms_md->mddev = mddev; + + fops = kmalloc(sizeof(*fops), GFP_KERNEL); + if (fops) { + /* copy MD plugin header + * copy function table + * replace read and write function pointers. 
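+ * That way reads and writes on this node go straight to the + * personality, while the rest of the table stays generic MD.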
+ */ + evms_md->instance_plugin_hdr = md_plugin_header; + memcpy(fops, &md_fops, sizeof(*fops)); + fops->read = mddev->pers->read; + fops->write = mddev->pers->write; + evms_md->instance_plugin_hdr.fops = fops; + hdr = &evms_md->instance_plugin_hdr; + } else { + LOG_WARNING("%s: No memory to copy function table\n",__FUNCTION__); + rc = 0; /* clear rc and continue */ + hdr = &md_plugin_header; + } + } + } + + if (!rc && hdr) { + memset(newnode,0,sizeof(*newnode)); + newnode->plugin = hdr; + newnode->total_vsectors = (u64)evms_md_size[mdidx(mddev)] * 2; + newnode->block_size = md_blocksizes[mdidx(mddev)]; + newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)]; + sprintf(newnode->name,"md/md%d",mdidx(mddev)); + newnode->private = evms_md; + newnode->flags = flags; + + rc = evms_cs_add_logical_node_to_list(discover_list, newnode); + if (rc) { + LOG_ERROR("%s: could not add md node %s\n", __FUNCTION__, newnode->name); + } else { + LOG_DEBUG("%s: added [%s] to discover list (total_vsectors="PFU64")\n", + __FUNCTION__, newnode->name, newnode->total_vsectors); + } + } + + if (!rc) { + mddev->node = newnode; + } else { + if (evms_md) { + if (fops) + kfree(fops); + kfree(evms_md); + } + if (newnode) + evms_cs_deallocate_logical_node(newnode); + } + return rc; +} + + +/* + * Function: evms_md_autostart_arrays + * Discover MD "extended" devices + * Add MD "extended" devices to pending list for further processing + */ +static void evms_md_autostart_arrays (struct evms_logical_node **discover_list) +{ + struct evms_logical_node *node, *next_node; + mdk_rdev_t *rdev; + int rc=0; + + LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n"); + + /* examine each node on the discover list */ + next_node = *discover_list; + while(next_node) { + node = next_node; + next_node = node->next; + + rc = evms_md_import_device(discover_list, node); + if (rc && (rc != -EEXIST)) { + LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node)); + continue; + } + + /* + * Sanity checks: + */ + rdev = evms_md_find_rdev_all(node); + if (!rdev) { + LOG_ERROR("find_rdev_all() failed\n"); + continue; + } + if (rdev->faulty) { + MD_BUG(); + continue; + } + + if (!rc) { + list_add(&rdev->pending, &pending_raid_disks); + } else if (rc == -EEXIST) { + struct evms_logical_node *md_node; + /* + * Must be in a re-discovery process here. 
+ * Find the EVMS MD node that this rdev is a member of + */ + if (rdev->mddev) { + md_node = rdev->mddev->node; + if (md_node) { + rc = evms_cs_add_logical_node_to_list(discover_list,md_node); + switch (rc) { + case 0: + exported_nodes++; + LOG_DETAILS("Added MD node (%s) to discover list\n", + md_node->name); + break; + case 1: /* already on the list */ + case 2: /* already on the list */ + break; + default: + LOG_WARNING("could not add md node (%s), rc=%d\n", + md_node->name, rc); + } + } else { + LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n", + rdev->mddev->__minor); + } + } else { + LOG_ERROR("This device [%s] does not belong to any array!\n", + get_partition_name(rdev)); + evms_md_export_rdev(rdev, TRUE); + } + evms_cs_remove_logical_node_from_list(discover_list,node); + } + } + + evms_md_run_devices(discover_list); + LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes); +} + +#ifdef CONFIG_PROC_FS +static int status_resync(char * page, off_t * offset, int count, mddev_t * mddev) +{ + int sz = 0; + off_t off = *offset; + unsigned long max_blocks, resync, res, dt, db, rt; + + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2; + max_blocks = mddev->sb->size; + + /* + * Should not happen. + */ + if (!max_blocks) { + MD_BUG(); + return 0; + } + res = (resync/1024)*1000/(max_blocks/1024 + 1); + { + int i, x = res/50, y = 20-x; + PROCPRINT("["); + for (i = 0; i < x; i++) + PROCPRINT("="); + sz += sprintf(page + sz, ">"); + for (i = 0; i < y; i++) + PROCPRINT("."); + PROCPRINT("] "); + } + if (!mddev->recovery_running) + /* + * true resync + */ + PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + else + /* + * recovery ... + */ + PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)", + res/10, res % 10, resync, max_blocks); + + /* + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = ((jiffies - mddev->resync_mark) / HZ); + if (!dt) dt++; + db = resync - (mddev->resync_mark_cnt/2); + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100; + + PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6); + + PROCPRINT(" speed=%ldK/sec", db/dt); + +out: + *offset = off; + return sz; +} + +static int evms_md_status_read_proc(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int sz = 0, j, size; + struct list_head *tmp, *tmp2; + mdk_rdev_t *rdev; + mddev_t *mddev; + + PROCPRINT("Enterprise Volume Management System: MD Status\n"); + PROCPRINT("Personalities : "); + for (j = 0; j < MAX_PERSONALITY; j++) + if (pers[j]) + PROCPRINT("[%s] ", pers[j]->name); + + PROCPRINT("\n"); + + + ITERATE_MDDEV(mddev,tmp) { + PROCPRINT("md%d : %sactive", mdidx(mddev), + mddev->pers ? 
"" : "in"); + if (mddev->pers) { + if (mddev->ro) + PROCPRINT(" (read-only)"); + PROCPRINT(" %s", mddev->pers->name); + } + + size = 0; + ITERATE_RDEV(mddev,rdev,tmp2) { + PROCPRINT(" %s[%d]", + rdev->node->name, rdev->desc_nr); + if (rdev->faulty) { + PROCPRINT("(F)"); + continue; + } + size += rdev->size; + } + + if (mddev->nb_dev) { + if (mddev->pers) + PROCPRINT("\n "PFU64" blocks", + mddev->node->total_vsectors >> 1); + else + PROCPRINT("\n %d blocks", size); + } + + if (!mddev->pers) { + PROCPRINT("\n"); + continue; + } + + sz += mddev->pers->status (page+sz, mddev); + + PROCPRINT("\n "); + if (mddev->curr_resync) { + sz += status_resync (page+sz, &off, count, mddev); + } else { + if (atomic_read(&mddev->resync_sem.count) != 1) + PROCPRINT(" resync=DELAYED"); + } + + PROCPRINT("\n"); + } + *eof = 1; +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? count : sz; +} +#endif + +/* Function: md_core_init + */ +int __init md_core_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *evms_proc_dir; +#endif + +#ifdef CONFIG_PROC_FS + evms_proc_dir = evms_cs_get_evms_proc_dir(); + if (evms_proc_dir) { + create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL); + } + md_table_header = register_sysctl_table(dev_dir_table, 1); +#endif + + return evms_cs_register_plugin(&md_plugin_header); +} + +static void __exit md_core_exit(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *evms_proc_dir; + + evms_proc_dir = evms_cs_get_evms_proc_dir(); + if (evms_proc_dir) { + remove_proc_entry("mdstat", evms_proc_dir); + } + unregister_sysctl_table(md_table_header); +#endif + evms_cs_unregister_plugin(&md_plugin_header); +} + +module_init(md_core_init); +module_exit(md_core_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + +/* + * In order to have the coexistence of this EVMS plugin and the orginal MD + * module, the symbols exported by this plugin are prefixed with "evms_" + */ + +MD_EXPORT_SYMBOL(evms_md_size); +MD_EXPORT_SYMBOL(evms_register_md_personality); +MD_EXPORT_SYMBOL(evms_unregister_md_personality); + /* Export the following function for use with rdev->node in evms_md_k.h */ +MD_EXPORT_SYMBOL(evms_md_partition_name); + /* Export the following function for use with disks[] in md_p.h */ +MD_EXPORT_SYMBOL(evms_md_error); +MD_EXPORT_SYMBOL(evms_md_error_dev); +MD_EXPORT_SYMBOL(evms_md_update_sb); +MD_EXPORT_SYMBOL(evms_md_find_rdev_nr); +MD_EXPORT_SYMBOL(evms_md_find_rdev); +MD_EXPORT_SYMBOL(evms_md_find_rdev_from_node); +MD_EXPORT_SYMBOL(evms_md_print_devices); +MD_EXPORT_SYMBOL(evms_mddev_map); +MD_EXPORT_SYMBOL(evms_md_check_ordering); +MD_EXPORT_SYMBOL(evms_md_partial_sync_io); +MD_EXPORT_SYMBOL(evms_md_sync_io); +MD_EXPORT_SYMBOL(evms_md_do_sync); +MD_EXPORT_SYMBOL(evms_md_sync_acct); +MD_EXPORT_SYMBOL(evms_md_done_sync); +MD_EXPORT_SYMBOL(evms_md_recover_arrays); +MD_EXPORT_SYMBOL(evms_md_get_spare); + diff -Naur linux-2002-09-30/drivers/evms/md_linear.c evms-2002-09-30/drivers/evms/md_linear.c --- linux-2002-09-30/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_linear.c Thu Aug 15 13:50:12 2002 @@ -0,0 +1,285 @@ +/* + linear.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + or + + + Linear mode management functions. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include + + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define LOG_PREFIX "md linear: " +static int linear_run (mddev_t *mddev) +{ + linear_conf_t *conf; + struct linear_hash *table; + mdk_rdev_t *rdev; + int size, i, j, nb_zone; + unsigned int curr_offset; + + MOD_INC_USE_COUNT; + + conf = kmalloc (sizeof (*conf), GFP_KERNEL); + if (!conf) + goto out; + mddev->private = conf; + + if (evms_md_check_ordering(mddev)) { + printk("linear: disks are not ordered, aborting!\n"); + goto out; + } + + /* + * Find the smallest device. + */ + + conf->smallest = NULL; + curr_offset = 0; + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + dev_info_t *disk = conf->disks + j; + disk->node = rdev->node; + disk->dev = rdev->dev; + disk->size = rdev->size; + disk->offset = curr_offset; + + curr_offset += disk->size; + + if (!conf->smallest || (disk->size < conf->smallest->size)) + conf->smallest = disk; + } + + nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size + + ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0); + + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone, + GFP_KERNEL); + if (!conf->hash_table) + goto out; + + /* + * Here we generate the linear hash table + */ + table = conf->hash_table; + i = 0; + size = 0; + for (j = 0; j < mddev->nb_dev; j++) { + dev_info_t *disk = conf->disks + j; + + if (size < 0) { + table[-1].dev1 = disk; + } + size += disk->size; + + while (size>0) { + table->dev0 = disk; + table->dev1 = NULL; + size -= conf->smallest->size; + table++; + } + } + if (table-conf->hash_table != nb_zone) + BUG(); + LOG_DETAILS("%s: nr_zones=%d, smallest=%lu\n", + __FUNCTION__, conf->nr_zones, conf->smallest->size); + return 0; + +out: + if (conf) + kfree(conf); + MOD_DEC_USE_COUNT; + return 1; +} + +static int linear_stop (mddev_t *mddev) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + + kfree(conf->hash_table); + kfree(conf); + + MOD_DEC_USE_COUNT; + + return 0; +} + +/* + * Function: linear_map + */ +static int linear_map( + mddev_t *mddev, + struct evms_logical_node **node, + struct buffer_head *bh) +{ + linear_conf_t *conf = mddev_to_conf(mddev); + struct linear_hash *hash; + dev_info_t *tmp_dev; + unsigned long block; + + block = (bh->b_rsector >> 1); + hash = conf->hash_table + (block / conf->smallest->size); + if (block >= (hash->dev0->size + hash->dev0->offset)) { + if (!hash->dev1) { + LOG_ERROR("%s: hash->dev1==NULL for block %ld\n", __FUNCTION__, block); + return -ENXIO; + } + tmp_dev = hash->dev1; + } else + tmp_dev = hash->dev0; + + if ( (block + (bh->b_size >> 10)) > (tmp_dev->size + tmp_dev->offset) + || block < tmp_dev->offset) { + LOG_ERROR("%s: Block %ld out of bounds on node %s size %ld offset %ld\n", + __FUNCTION__, + block, + tmp_dev->node->name, + tmp_dev->size, + tmp_dev->offset); + return -ENXIO; + } + bh->b_rsector -= (tmp_dev->offset << 1); + *node = tmp_dev->node; + return 0; +} + +static void linear_read( + struct evms_logical_node *md_node, + struct buffer_head *bh) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); 
+ struct evms_logical_node *node; + + if (evms_md_check_boundary(md_node, bh)) return; + + if (!linear_map(mddev, &node, bh)) { + R_IO(node, bh); + } else { + bh->b_end_io(bh, 0); + } +} + +static void linear_write( + struct evms_logical_node *md_node, + struct buffer_head *bh) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); + struct evms_logical_node *node; + + if (evms_md_check_boundary(md_node, bh)) return; + + if (!linear_map(mddev, &node, bh)) { + W_IO(node, bh); + } else { + bh->b_end_io(bh, 0); + } +} + +static int linear_status (char *page, mddev_t *mddev) +{ + int sz = 0; + +#undef MD_DEBUG +#ifdef MD_DEBUG + int j; + linear_conf_t *conf = mddev_to_conf(mddev); + + sz += sprintf(page+sz, " "); + for (j = 0; j < conf->nr_zones; j++) + { + sz += sprintf(page+sz, "[%s", + partition_name(conf->hash_table[j].dev0->dev)); + + if (conf->hash_table[j].dev1) + sz += sprintf(page+sz, "/%s] ", + partition_name(conf->hash_table[j].dev1->dev)); + else + sz += sprintf(page+sz, "] "); + } + sz += sprintf(page+sz, "\n"); +#endif + sz += sprintf(page+sz, " %dk rounding", mddev->chunk_size/1024); + return sz; +} + +static int linear_evms_ioctl ( + mddev_t * mddev, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + int rc = 0; + struct evms_logical_node *node; + + switch (cmd) { + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg; + struct buffer_head *bh = + evms_cs_allocate_from_pool(evms_bh_pool, FALSE); + if (bh) { + bh->b_rsector = (unsigned long)bmap->rsector; + bh->b_size = node->block_size; + rc = linear_map(mddev, &node, bh); + if (!rc) { + bmap->rsector = (u64)bh->b_rsector; + if (node) + rc = IOCTL(node, inode, file, cmd, arg); + else + rc = -ENODEV; + } + evms_cs_deallocate_to_pool(evms_bh_pool, bh); + } else + rc = -ENOMEM; + break; + } + + default: + rc = -EINVAL; + } + return rc; +} + +static mdk_personality_t linear_personality = { + .name = "evms_linear", + .read = linear_read, + .write = linear_write, + .run = linear_run, + .stop = linear_stop, + .status = linear_status, + .evms_ioctl = linear_evms_ioctl +}; + +static int md__init linear_init (void) +{ + return evms_register_md_personality (LINEAR, &linear_personality); +} + +static void linear_exit (void) +{ + evms_unregister_md_personality (LINEAR); +} + + +module_init(linear_init); +module_exit(linear_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/md_raid0.c evms-2002-09-30/drivers/evms/md_raid0.c --- linux-2002-09-30/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_raid0.c Thu Aug 15 13:50:12 2002 @@ -0,0 +1,448 @@ +/* + raid0.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + or + + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + + + RAID-0 management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
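[Editorial sketch, not part of the patch] The linear personality above simply concatenates its members, and linear_map() resolves a block to the member that covers it; the hash table built in linear_run() only makes that lookup O(1) by pre-indexing zones of the smallest member's size. A userspace sketch of the underlying lookup, with illustrative device names and sizes:

    #include <stdio.h>

    struct lin_dev {
        const char *name;
        unsigned long size;     /* length in 1K blocks */
        unsigned long offset;   /* start within the concatenated array */
    };

    static int linear_lookup(const struct lin_dev *devs, int ndevs,
                             unsigned long block,
                             int *dev_idx, unsigned long *dev_block)
    {
        for (int i = 0; i < ndevs; i++) {
            if (block >= devs[i].offset &&
                block < devs[i].offset + devs[i].size) {
                *dev_idx = i;
                *dev_block = block - devs[i].offset;    /* bh->b_rsector -= offset<<1 */
                return 0;
            }
        }
        return -1;      /* out of bounds, the -ENXIO case above */
    }

    int main(void)
    {
        struct lin_dev devs[] = {
            { "sda5", 1000, 0 },
            { "sdb2", 4000, 1000 },
            { "sdc1", 2000, 5000 },
        };
        int idx;
        unsigned long off;

        if (!linear_lookup(devs, 3, 4500, &idx, &off))
            printf("block 4500 -> %s block %lu\n", devs[idx].name, off);
        return 0;
    }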
+*/ + +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define LOG_PREFIX "md raid0: " + +static int create_strip_zones (mddev_t *mddev) +{ + int i, c, j, j1, j2; + unsigned long current_offset, curr_zone_offset, rdev_size_in_sects; + raid0_conf_t *conf = mddev_to_conf(mddev); + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev; + + /* + * The number of 'same size groups' + */ + conf->nr_strip_zones = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) { + LOG_DEBUG(" looking at %s\n", evms_md_partition_name(rdev1->node)); + c = 0; + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) { + LOG_DEBUG(" comparing %s(%ld sectors) with %s(%ld sectors)\n", + evms_md_partition_name(rdev1->node), rdev1->size << 1, + evms_md_partition_name(rdev2->node), rdev2->size << 1); + if (rdev2 == rdev1) { + LOG_DEBUG(" END\n"); + break; + } + if (rdev2->size == rdev1->size) + { + /* + * Not unique, dont count it as a new + * group + */ + LOG_DEBUG(" EQUAL\n"); + c = 1; + break; + } + LOG_DEBUG(" NOT EQUAL\n"); + } + if (!c) { + LOG_DEBUG(" ==> UNIQUE\n"); + conf->nr_strip_zones++; + LOG_DEBUG(" %d zones\n",conf->nr_strip_zones); + } + } + LOG_DEBUG(" FINAL %d zones\n",conf->nr_strip_zones); + + conf->strip_zone = vmalloc(sizeof(struct strip_zone)* + conf->nr_strip_zones); + if (!conf->strip_zone) + return 1; + + + conf->smallest = NULL; + current_offset = 0; + curr_zone_offset = 0; + + for (i = 0; i < conf->nr_strip_zones; i++) + { + struct strip_zone *zone = conf->strip_zone + i; + + LOG_DEBUG(" zone %d\n", i); + zone->dev_offset = current_offset; + smallest = NULL; + c = 0; + + ITERATE_RDEV_ORDERED(mddev,rdev,j) { + + LOG_DEBUG(" checking %s ...",evms_md_partition_name(rdev->node)); + rdev_size_in_sects = rdev->size << 1; + if (rdev_size_in_sects > current_offset) + { + LOG_DEBUG(" contained as device %d\n", c); + zone->node[c] = rdev->node; + c++; + if (!smallest || (rdev_size_in_sects < (smallest->size <<1) )) { + smallest = rdev; + LOG_DEBUG(" (%ld) is smallest!.\n", rdev_size_in_sects); + } + } else + LOG_DEBUG(" nope.\n"); + } + + zone->nb_dev = c; + zone->size_in_sects = ((smallest->size <<1) - current_offset) * c; + LOG_DEBUG(" zone->nb_dev: %d, size: %ld\n", + zone->nb_dev,zone->size_in_sects); + + if (!conf->smallest || (zone->size_in_sects < conf->smallest->size_in_sects)) + conf->smallest = zone; + + zone->zone_offset = curr_zone_offset; + curr_zone_offset += zone->size_in_sects; + + current_offset = smallest->size << 1; + LOG_DEBUG(" current zone offset: %ld\n",current_offset); + } + LOG_DEBUG(" done.\n"); + return 0; +} + +static int raid0_run (mddev_t *mddev) +{ + unsigned long cur=0, i=0, size, zone0_size, nb_zone; + unsigned long mddev_size_in_sects = evms_md_size[mdidx(mddev)] << 1; + raid0_conf_t *conf; + + MOD_INC_USE_COUNT; + + conf = vmalloc(sizeof (raid0_conf_t)); + if (!conf) + goto out; + mddev->private = (void *)conf; + + if (evms_md_check_ordering(mddev)) { + LOG_ERROR("disks are not ordered, aborting!\n"); + goto out_free_conf; + } + + if (create_strip_zones (mddev)) + goto out_free_conf; + + LOG_DETAILS("evms_md_size is %ld sectors.\n", mddev_size_in_sects); + LOG_DETAILS("conf->smallest->size_in_sects is %ld sectors.\n", conf->smallest->size_in_sects); + nb_zone = mddev_size_in_sects / conf->smallest->size_in_sects + + (mddev_size_in_sects % conf->smallest->size_in_sects ? 
1 : 0); + LOG_DETAILS("nb_zone is %ld.\n", nb_zone); + conf->nr_zones = nb_zone; + + LOG_DEBUG("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash)); + + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone); + if (!conf->hash_table) + goto out_free_zone_conf; + size = conf->strip_zone[cur].size_in_sects; + + i = 0; + while (cur < conf->nr_strip_zones) { + conf->hash_table[i].zone0 = conf->strip_zone + cur; + + /* + * If we completely fill the slot + */ + if (size >= conf->smallest->size_in_sects) { + conf->hash_table[i++].zone1 = NULL; + size -= conf->smallest->size_in_sects; + + if (!size) { + if (++cur == conf->nr_strip_zones) + continue; + size = conf->strip_zone[cur].size_in_sects; + } + continue; + } + if (++cur == conf->nr_strip_zones) { + /* + * Last dev, set unit1 as NULL + */ + conf->hash_table[i].zone1=NULL; + continue; + } + + /* + * Here we use a 2nd dev to fill the slot + */ + zone0_size = size; + size = conf->strip_zone[cur].size_in_sects; + conf->hash_table[i++].zone1 = conf->strip_zone + cur; + size -= (conf->smallest->size_in_sects - zone0_size); + } + return 0; + +out_free_zone_conf: + vfree(conf->strip_zone); + conf->strip_zone = NULL; + +out_free_conf: + vfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return 1; +} + +static int raid0_stop (mddev_t *mddev) +{ + raid0_conf_t *conf = mddev_to_conf(mddev); + + vfree (conf->hash_table); + conf->hash_table = NULL; + vfree (conf->strip_zone); + conf->strip_zone = NULL; + vfree (conf); + mddev->private = NULL; + + MOD_DEC_USE_COUNT; + return 0; +} + + +/* + * Function: raid0_map + * + * Return 0 for success, else error + * + */ + +static inline int raid0_map( + mddev_t *mddev, + unsigned long lsn, + unsigned long size, + struct evms_logical_node **node, + unsigned long *new_lsn, + unsigned long *new_size) +{ + unsigned int sect_in_chunk, chunksize_bits, chunk_size_in_sects; + raid0_conf_t *conf = mddev_to_conf(mddev); + struct raid0_hash *hash; + struct strip_zone *zone; + unsigned long chunk; + + chunk_size_in_sects = mddev->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT; + chunksize_bits = ffz(~chunk_size_in_sects); + hash = conf->hash_table + (lsn / conf->smallest->size_in_sects); + + /* Sanity check */ + if (!hash) + goto bad_hash; + + if (!hash->zone0) + goto bad_zone0; + + if (lsn >= (hash->zone0->size_in_sects + hash->zone0->zone_offset)) { + if (!hash->zone1) + goto bad_zone1; + zone = hash->zone1; + } else + zone = hash->zone0; + + sect_in_chunk = lsn & (chunk_size_in_sects - 1); + chunk = (lsn - zone->zone_offset) / (zone->nb_dev << chunksize_bits); + *node = zone->node[(lsn >> chunksize_bits) % zone->nb_dev]; + + *new_lsn = ((chunk << chunksize_bits) + zone->dev_offset) + sect_in_chunk; + + *new_size = (size <= chunk_size_in_sects - sect_in_chunk) ? + size : chunk_size_in_sects - sect_in_chunk; + + return 0; + +bad_hash: + LOG_ERROR("%s: bug: hash==NULL for lsn %lu\n", __FUNCTION__, lsn); + goto outerr; +bad_zone0: + LOG_ERROR("%s: bug: hash->zone0==NULL for lsn %lu\n", __FUNCTION__, lsn); + goto outerr; +bad_zone1: + LOG_ERROR("%s: bug: hash->zone1==NULL for lsn %lu\n", __FUNCTION__, lsn); +outerr: + return -EINVAL; +} + +void raid0_error(int rw, struct evms_logical_node *node, struct buffer_head *bh) +{ + LOG_ERROR(" %s FAILED on node(%s) rsector(%lu) size(%d)\n", + (rw == READ) ? 
"READ" : "WRITE", + node->name, + bh->b_rsector, + bh->b_size); + + bh->b_end_io(bh, 0); +} + +static inline void raid0_rw ( + struct evms_logical_node *md_node, + struct buffer_head *bh, + int rw) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); + struct evms_logical_node *node; + unsigned long new_lsn, size_in_sects, new_size; + + if (evms_md_check_boundary(md_node, bh)) return; + size_in_sects = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + if (!raid0_map(mddev, bh->b_rsector, size_in_sects, &node, &new_lsn, &new_size)) { + if (new_size == size_in_sects) { + /* + * This is the normal case: + * the request is entirely within the stripe boundary + */ + bh->b_rsector = new_lsn; + if (rw == READ) { + R_IO(node, bh); + } else { + W_IO(node, bh); + } + return; + } else { + /* + * BUGBUG! + * Need more processing here (ie. break up the request) + */ + LOG_ERROR("This version of EVMS RAID0 does not support I/O requests that are:\n"); + LOG_ERROR(" - larger than the stripe size\n"); + LOG_ERROR(" - cross the stripe boundary\n"); + } + } + raid0_error(rw, node, bh); +} + +static void raid0_read( + struct evms_logical_node *md_node, + struct buffer_head *bh) +{ + raid0_rw(md_node, bh, READ); +} + +static void raid0_write( + struct evms_logical_node *md_node, + struct buffer_head *bh) +{ + raid0_rw(md_node, bh, WRITE); +} + +static int raid0_status (char *page, mddev_t *mddev) +{ + int sz = 0; +#undef MD_DEBUG +#ifdef MD_DEBUG + int j, k; + raid0_conf_t *conf = mddev_to_conf(mddev); + + sz += sprintf(page + sz, " "); + for (j = 0; j < conf->nr_zones; j++) { + sz += sprintf(page + sz, "[z%d", + conf->hash_table[j].zone0 - conf->strip_zone); + if (conf->hash_table[j].zone1) + sz += sprintf(page+sz, "/z%d] ", + conf->hash_table[j].zone1 - conf->strip_zone); + else + sz += sprintf(page+sz, "] "); + } + + sz += sprintf(page + sz, "\n"); + + for (j = 0; j < conf->nr_strip_zones; j++) { + sz += sprintf(page + sz, " z%d=[", j); + for (k = 0; k < conf->strip_zone[j].nb_dev; k++) + sz += sprintf (page+sz, "%s/", conf->strip_zone[j].node[k]->name); + sz--; + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n", + conf->strip_zone[j].zone_offset, + conf->strip_zone[j].dev_offset, + conf->strip_zone[j].size_in_sects); + } +#endif + sz += sprintf(page + sz, " %dk chunks", mddev->chunk_size/1024); + return sz; +} + +static int raid0_evms_ioctl ( + mddev_t * mddev, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + int rc = 0; + struct evms_logical_node *node; + + switch (cmd) { + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg; + unsigned long new_lsn, new_size; + unsigned long size = mddev->node->block_size >> EVMS_VSECTOR_SIZE_SHIFT; + rc = raid0_map(mddev, + (unsigned long)bmap->rsector, + size, + &node, + &new_lsn, + &new_size); + if (!rc) { + if (node) { + bmap->rsector = (u64)new_lsn; + rc = IOCTL(node, inode, file, cmd, arg); + } else + rc = -ENODEV; + } + break; + } + + default: + rc = -EINVAL; + } + return rc; +} + +static mdk_personality_t raid0_personality = { + .name = "evms_raid0", + .read = raid0_read, + .write = raid0_write, + .run = raid0_run, + .stop = raid0_stop, + .status = raid0_status, + .evms_ioctl = raid0_evms_ioctl +}; + +static int md__init raid0_init (void) +{ + return evms_register_md_personality (RAID0, &raid0_personality); +} + +static void raid0_exit (void) +{ + evms_unregister_md_personality (RAID0); +} + +module_init(raid0_init); +module_exit(raid0_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); 
+#endif diff -Naur linux-2002-09-30/drivers/evms/md_raid1.c evms-2002-09-30/drivers/evms/md_raid1.c --- linux-2002-09-30/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_raid1.c Mon Sep 30 00:02:48 2002 @@ -0,0 +1,1935 @@ +/* + * md_raid1.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * RAID-1 management functions. + * + * Better read-balancing code written by Mika Kuoppala , 2000 + * + * Fixes to reconstruction by Jakob Østergaard" + * Various fixes by Neil Brown + * + * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified + * by Cuong (Mike) Tran , January 2002. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include + +#define MAJOR_NR MD_MAJOR +#define MD_DRIVER +#define MD_PERSONALITY + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + +#define LOG_PREFIX "md raid1: " +/* + * The following can be used to debug the driver + */ +#define RAID1_DEBUG 0 + +#if RAID1_DEBUG +#define PRINTK(x...) LOG_DEFAULT(x) +#define inline +#define __inline__ +#else +#define PRINTK(x...) do { } while (0) +#endif + + +static mdk_personality_t raid1_personality; +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED; +struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail; + +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt) +{ + /* return a linked list of "cnt" struct buffer_heads. 
+ * don't take any off the free list unless we know we can + * get all we need, otherwise we could deadlock + */ + struct buffer_head *bh=NULL; + + while(cnt) { + struct buffer_head *t; + md_spin_lock_irq(&conf->device_lock); + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt) + while (cnt) { + t = conf->freebh; + conf->freebh = t->b_next; + t->b_next = bh; + bh = t; + t->b_state = 0; + conf->freebh_cnt--; + cnt--; + } + md_spin_unlock_irq(&conf->device_lock); + if (cnt == 0) + break; + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO); + if (t) { + t->b_next = bh; + bh = t; + cnt--; + } else { + PRINTK("raid1: waiting for %d bh\n", cnt); + conf->freebh_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freebh_blocked || + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2); + conf->freebh_blocked = 0; + } + } + return bh; +} + +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh) +{ + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + while (bh) { + struct buffer_head *t = bh; + bh=bh->b_next; + if (t->b_pprev == NULL) + kmem_cache_free(bh_cachep, t); + else { + t->b_next= conf->freebh; + conf->freebh = t; + conf->freebh_cnt++; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + wake_up(&conf->wait_buffer); +} + +static int raid1_grow_bh(raid1_conf_t *conf, int cnt) +{ + /* allocate cnt buffer_heads, possibly less if kmalloc fails */ + int i = 0; + + while (i < cnt) { + struct buffer_head *bh; + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL); + if (!bh) break; + + md_spin_lock_irq(&conf->device_lock); + bh->b_pprev = &conf->freebh; + bh->b_next = conf->freebh; + conf->freebh = bh; + conf->freebh_cnt++; + md_spin_unlock_irq(&conf->device_lock); + + i++; + } + return i; +} + +static void raid1_shrink_bh(raid1_conf_t *conf) +{ + /* discard all buffer_heads */ + + md_spin_lock_irq(&conf->device_lock); + while (conf->freebh) { + struct buffer_head *bh = conf->freebh; + conf->freebh = bh->b_next; + kmem_cache_free(bh_cachep, bh); + conf->freebh_cnt--; + } + md_spin_unlock_irq(&conf->device_lock); +} + + +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh = NULL; + + do { + md_spin_lock_irq(&conf->device_lock); + if (!conf->freer1_blocked && conf->freer1) { + r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + r1_bh->next_r1 = NULL; + r1_bh->state = (1 << R1BH_PreAlloc); + r1_bh->bh_req.b_state = 0; + } + md_spin_unlock_irq(&conf->device_lock); + if (r1_bh) + return r1_bh; + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO); + if (r1_bh) { + memset(r1_bh, 0, sizeof(*r1_bh)); + return r1_bh; + } + conf->freer1_blocked = 1; + wait_disk_event(conf->wait_buffer, + !conf->freer1_blocked || + conf->freer1_cnt > NR_RESERVED_BUFS/2 + ); + conf->freer1_blocked = 0; + } while (1); +} + +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh) +{ + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + + r1_bh->mirror_bh_list = NULL; + + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freer1; + conf->freer1 = r1_bh; + conf->freer1_cnt++; + spin_unlock_irqrestore(&conf->device_lock, flags); + /* don't need to wakeup wait_buffer because + * raid1_free_bh below will do that + */ + } else { + kfree(r1_bh); + } + raid1_free_bh(conf, bh); +} + +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + 
while (i < cnt) { + struct raid1_bh *r1_bh; + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) + break; + memset(r1_bh, 0, sizeof(*r1_bh)); + set_bit(R1BH_PreAlloc, &r1_bh->state); + r1_bh->mddev = conf->mddev; + + raid1_free_r1bh(r1_bh); + i++; + } + return i; +} + +static void raid1_shrink_r1bh(raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freer1) { + struct raid1_bh *r1_bh = conf->freer1; + conf->freer1 = r1_bh->next_r1; + conf->freer1_cnt--; + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + + + +static inline void raid1_free_buf(struct raid1_bh *r1_bh) +{ + unsigned long flags; + struct buffer_head *bh = r1_bh->mirror_bh_list; + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev); + r1_bh->mirror_bh_list = NULL; + + spin_lock_irqsave(&conf->device_lock, flags); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + spin_unlock_irqrestore(&conf->device_lock, flags); + raid1_free_bh(conf, bh); +} + +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf) +{ + struct raid1_bh *r1_bh; + + md_spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock); + r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + r1_bh->next_r1= NULL; + md_spin_unlock_irq(&conf->device_lock); + return r1_bh; +} + +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt) +{ + int i = 0; + + md_spin_lock_irq(&conf->device_lock); + while (i < cnt) { + struct raid1_bh *r1_bh; + struct page *page; + + page = alloc_page(GFP_KERNEL); + if (!page) + break; + + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL); + if (!r1_bh) { + __free_page(page); + break; + } + memset(r1_bh, 0, sizeof(*r1_bh)); + r1_bh->bh_req.b_page = page; + r1_bh->bh_req.b_data = page_address(page); + r1_bh->next_r1 = conf->freebuf; + conf->freebuf = r1_bh; + i++; + } + md_spin_unlock_irq(&conf->device_lock); + return i; +} + +static void raid1_shrink_buffers (raid1_conf_t *conf) +{ + md_spin_lock_irq(&conf->device_lock); + while (conf->freebuf) { + struct raid1_bh *r1_bh = conf->freebuf; + conf->freebuf = r1_bh->next_r1; + __free_page(r1_bh->bh_req.b_page); + kfree(r1_bh); + } + md_spin_unlock_irq(&conf->device_lock); +} + +/* + * evms_raid1_map + * EVMS raid1 version of raid1_map() + */ +static int evms_raid1_map (mddev_t *mddev, struct evms_logical_node **node, kdev_t *rdev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int i; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. 
+ */ + + for (i = 0; i < MD_SB_DISKS; i++) { + if (conf->mirrors[i].operational) { + *node = conf->mirrors[i].node; + *rdev = conf->mirrors[i].dev; + return (0); + } + } + + LOG_ERROR("huh, no more operational devices?\n"); + return (-1); +} + +static void raid1_reschedule_retry (struct raid1_bh *r1_bh) +{ + unsigned long flags; + mddev_t *mddev = r1_bh->mddev; + raid1_conf_t *conf = mddev_to_conf(mddev); + + md_spin_lock_irqsave(&retry_list_lock, flags); + if (evms_raid1_retry_list == NULL) + evms_raid1_retry_tail = &evms_raid1_retry_list; + *evms_raid1_retry_tail = r1_bh; + evms_raid1_retry_tail = &r1_bh->next_r1; + r1_bh->next_r1 = NULL; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + evms_cs_wakeup_thread(conf->thread); +} + + +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector < conf->start_active) + conf->cnt_done--; + else if (sector >= conf->start_future && conf->phase == phase) + conf->cnt_future--; + else if (!--conf->cnt_pending) + wake_up(&conf->wait_ready); + + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf) +{ + unsigned long flags; + spin_lock_irqsave(&conf->segment_lock, flags); + if (sector >= conf->start_ready) + --conf->cnt_ready; + else if (sector >= conf->start_active) { + if (!--conf->cnt_active) { + conf->start_active = conf->start_ready; + wake_up(&conf->wait_done); + } + } + spin_unlock_irqrestore(&conf->segment_lock, flags); +} + +/* + * raid1_end_bh_io() is called when we have finished servicing a mirrored + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate) +{ + struct buffer_head *bh = r1_bh->master_bh; + + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev), + test_bit(R1BH_SyncPhase, &r1_bh->state)); + + bh->b_end_io(bh, uptodate); + raid1_free_r1bh(r1_bh); +} + +void raid1_end_read_request (struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL); + if (uptodate) { + set_bit (R1BH_Uptodate, &r1_bh->state); + raid1_end_bh_io(r1_bh, uptodate); + } else { + evms_md_error_dev(r1_bh->mddev, bh->b_dev); + LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr); + raid1_reschedule_retry(r1_bh); + } +} + +void raid1_end_write_request (struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL); + if (!uptodate) + evms_md_error_dev(r1_bh->mddev, bh->b_dev); + else + set_bit (R1BH_Uptodate, &r1_bh->state); + + /* + * Let's see if all mirrored write operations have finished + * already. + */ + if (atomic_dec_and_test(&r1_bh->remaining)) + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state)); +} + +/* + * This routine returns the disk from which the requested read should + * be done. It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * + * TODO: now if there are 2 mirrors in the same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. 
Also atomic sequential + * reads should be somehow balanced. + */ + +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + const unsigned long this_sector = bh->b_rsector; + int disk = new_disk; + unsigned long new_distance; + unsigned long current_distance; + + /* + * Check if it is sane at all to balance + */ + + if (conf->resync_mirrors || conf->mddev->recovery_running) + goto rb_out; + + + /* make sure that disk is operational */ + while( !conf->mirrors[new_disk].operational) { + if (new_disk <= 0) new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) { + /* + * This means no working disk was found + * Nothing much to do, lets not change anything + * and hope for the best... + */ + + new_disk = conf->last_used; + + goto rb_out; + } + } + disk = new_disk; + /* now disk == new_disk == starting point for search */ + + /* + * Don't touch anything for sequential reads. + */ + + if (this_sector == conf->mirrors[new_disk].head_position) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, lets give another disk a change. + * This is for kicking those idling disks so that + * they would find work near some hotspot. + */ + + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) { + conf->sect_count = 0; + + do { + if (new_disk<=0) + new_disk = conf->raid_disks; + new_disk--; + if (new_disk == disk) + break; + } while ((conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational)); + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + /* Find the disk which is closest */ + + do { + if (disk <= 0) + disk = conf->raid_disks; + disk--; + + if ((conf->mirrors[disk].write_only) || + (!conf->mirrors[disk].operational)) + continue; + + new_distance = abs(this_sector - + conf->mirrors[disk].head_position); + + if (new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = disk; + } + } while (disk != conf->last_used); + +rb_out: + conf->mirrors[new_disk].head_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + + return new_disk; +} + +static void raid1_read(struct evms_logical_node *md_node, struct buffer_head *bh) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror; + struct buffer_head *bh_req; + struct raid1_bh * r1_bh; + + if (evms_md_check_boundary(md_node, bh)) return; + + if (!buffer_locked(bh)) + BUG(); + + r1_bh = raid1_alloc_r1bh (conf); + + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + r1_bh->mddev = mddev; + r1_bh->cmd = READ; + r1_bh->master_bh = bh; + + mirror = conf->mirrors + raid1_read_balance(conf, bh); + + bh_req = &r1_bh->bh_req; + memcpy(bh_req, bh, sizeof(*bh)); + bh_req->b_blocknr = bh->b_rsector; + bh_req->b_dev = mirror->dev; + bh_req->b_end_io = raid1_end_read_request; + bh_req->b_private = r1_bh; + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL); + R_IO(mirror->node, bh_req); +} + +static void raid1_write( + struct evms_logical_node 
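[Editorial sketch, not part of the patch] A userspace sketch of the seek-distance heuristic described above: among operational, read-capable mirrors, raid1_read_balance() prefers the one whose recorded head position is closest to the requested sector. The sect_limit rotation and the resync special cases are omitted, and all names here are illustrative.

    #include <stdio.h>

    struct mirror {
        int operational;
        int write_only;
        unsigned long head_position;    /* sector following the last read issued */
    };

    static int pick_read_mirror(const struct mirror *m, int nmirrors,
                                unsigned long sector, int last_used)
    {
        int best = last_used;
        unsigned long best_dist = (unsigned long)-1;

        for (int i = 0; i < nmirrors; i++) {
            unsigned long dist;

            if (!m[i].operational || m[i].write_only)
                continue;
            dist = sector > m[i].head_position ?
                sector - m[i].head_position : m[i].head_position - sector;
            if (dist < best_dist) {
                best_dist = dist;
                best = i;
            }
        }
        return best;
    }

    int main(void)
    {
        struct mirror m[2] = {
            { 1, 0, 10000 },    /* mirror 0: head parked near the request */
            { 1, 0, 500000 },   /* mirror 1: head parked far away */
        };

        printf("read sector 10240 from mirror %d\n",
               pick_read_mirror(m, 2, 10240, 0));   /* -> mirror 0 */
        return 0;
    }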
*md_node, + struct buffer_head *bh) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); + raid1_conf_t *conf = mddev_to_conf(mddev); + struct raid1_bh * r1_bh; + struct buffer_head *bhl; + struct buffer_head *mbh; + int i, sum_bhs; + + if (evms_md_check_boundary(md_node, bh)) return; + + if (!buffer_locked(bh)) + BUG(); + + r1_bh = raid1_alloc_r1bh (conf); + + spin_lock_irq(&conf->segment_lock); + wait_event_lock_irq(conf->wait_done, + bh->b_rsector < conf->start_active || + bh->b_rsector >= conf->start_future, + conf->segment_lock); + if (bh->b_rsector < conf->start_active) + conf->cnt_done++; + else { + conf->cnt_future++; + if (conf->phase) + set_bit(R1BH_SyncPhase, &r1_bh->state); + } + spin_unlock_irq(&conf->segment_lock); + + /* + * i think the read and write branch should be separated completely, + * since we want to do read balancing on the read side for example. + * Alternative implementations? :) --mingo + */ + + r1_bh->mddev = mddev; + r1_bh->cmd = WRITE; + r1_bh->master_bh = bh; + + bhl = raid1_alloc_bh(conf, conf->raid_disks); + + for (i=0, sum_bhs=0; + (sum_bhs < conf->raid_disks) && (i < MD_SB_DISKS); + i++) { + if (!conf->mirrors[i].operational) + continue; + + /* + * We should use a private pool (size depending on NR_REQUEST), + * to avoid writes filling up the memory with bhs + * + * Such pools are much faster than kmalloc anyways (so we waste + * almost nothing by not using the master bh when writing and + * win alot of cleanness) but for now we are cool enough. --mingo + * + * It's safe to sleep here, buffer heads cannot be used in a shared + * manner in the write branch. Look how we lock the buffer at the + * beginning of this function to grok the difference ;) + */ + mbh = bhl; + if (mbh == NULL) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_next = NULL; + mbh->b_this_page = (struct buffer_head *)1; + + /* + * prepare mirrored mbh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_rsector; + mbh->b_rdev = bh->b_rdev; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_rsector; + mbh->b_state = (1<b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = raid1_end_write_request; + mbh->b_private = conf->mirrors[i].node; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + sum_bhs++; + } + + if (bhl) raid1_free_bh(conf,bhl); + if (!sum_bhs) { + /* Gag - all mirrors non-operational.. */ + raid1_end_bh_io(r1_bh, 0); + return; + } + atomic_set(&r1_bh->remaining, sum_bhs); + + /* + * We have to be a bit careful about the semaphore above, thats + * why we start the requests separately. Since kmalloc() could + * fail, sleep and make_request() can sleep too, this is the + * safer solution. Imagine, end_request decreasing the semaphore + * before we could have set it up ... We could play tricks with + * the semaphore (presetting it and correcting at the end if + * sum_bhs is not 'n' but we have to do end_request by hand if + * all requests finish until we had a chance to set up the + * semaphore correctly ... lots of races). 
+ */ + bhl = r1_bh->mirror_bh_list; + while(bhl) { + struct evms_logical_node *node; + + mbh = bhl; + bhl = mbh->b_next; + node = (struct evms_logical_node *)mbh->b_private; + mbh->b_private = r1_bh; + + evms_cs_volume_request_in_progress(mbh->b_rdev, 1, NULL); + W_IO(node, mbh); + } +} + + +static int raid1_status (char *page, mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + int sz = 0, i; + + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += sprintf (page+sz, "%s", + conf->mirrors[i].operational ? "U" : "_"); + sz += sprintf (page+sz, "]"); + return sz; +} + +#define LAST_DISK KERN_ALERT \ +"EVMS raid1: only one disk left and IO error.\n" + +#define NO_SPARE_DISK KERN_ALERT \ +"EVMS raid1: no spare disk left, degrading mirror level by one.\n" + +#define DISK_FAILED KERN_ALERT \ +"EVMS raid1: Disk failure on %s, disabling device. \n" \ +" Operation continuing on %d devices\n" + +#define START_SYNCING KERN_ALERT \ +"EVMS raid1: start syncing spare disk.\n" + +#define ALREADY_SYNCING KERN_INFO \ +"EVMS raid1: syncing already in progress.\n" + +static void mark_disk_bad (mddev_t *mddev, int failed) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info *mirror = conf->mirrors+failed; + mdp_super_t *sb = mddev->sb; + + mirror->operational = 0; + mark_disk_faulty(sb->disks+mirror->number); + mark_disk_nonsync(sb->disks+mirror->number); + mark_disk_inactive(sb->disks+mirror->number); + if (!mirror->write_only) + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + evms_cs_wakeup_thread(conf->thread); + if (!mirror->write_only) + conf->working_disks--; + LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks); +} + +static int raid1_error ( + mddev_t *mddev, + struct evms_logical_node *node) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + struct mirror_info * mirrors = conf->mirrors; + int disks = MD_SB_DISKS; + int i; + + /* Find the drive. + * If it is not operational, then we have already marked it as dead + * else if it is the last working disks, ignore the error, let the + * next level up know. + * else mark the drive as failed + */ + + for (i = 0; i < disks; i++) + if (mirrors[i].node==node && mirrors[i].operational) + break; + if (i == disks) + return 0; + + if (i < conf->raid_disks && conf->working_disks == 1) { + /* Don't fail the drive, act as though we were just a + * normal single drive + */ + + return 1; + } + mark_disk_bad(mddev, i); + return 0; +} + +#undef LAST_DISK +#undef NO_SPARE_DISK +#undef DISK_FAILED +#undef START_SYNCING + + +static void print_raid1_conf (raid1_conf_t *conf) +{ + int i; + struct mirror_info *tmp; + + LOG_DEFAULT("RAID1 conf printout:\n"); + if (!conf) { + LOG_DEFAULT("(conf==NULL)\n"); + return; + } + LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n", + conf->working_disks,conf->raid_disks, conf->nr_disks); + + for (i = 0; i < conf->nr_disks; i++) { + tmp = conf->mirrors + i; + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare,tmp->operational, + tmp->number,tmp->raid_disk,tmp->used_slot, + evms_md_partition_name(tmp->node)); + } +} + +static void close_sync(raid1_conf_t *conf) +{ + mddev_t *mddev = conf->mddev; + /* If reconstruction was interrupted, we need to close the "active" and "pending" + * holes. + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0 + */ + /* this is really needed when recovery stops too... 
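[Editorial sketch, not part of the patch] The write path in raid1_write()/raid1_end_write_request() above clones one master request to every operational mirror and completes the master only when the last clone finishes. A small userspace sketch of that accounting, using C11 atomics in place of the kernel's atomic_dec_and_test(); everything here is illustrative.

    #include <stdio.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    struct master_req {
        atomic_int remaining;   /* clones still in flight */
        bool uptodate;          /* sticky success flag, like R1BH_Uptodate */
    };

    /* Called once per mirror as its clone of the write completes. */
    static void clone_done(struct master_req *r, bool ok)
    {
        if (ok)
            r->uptodate = true;
        if (atomic_fetch_sub(&r->remaining, 1) == 1)    /* that was the last clone */
            printf("master request completed, uptodate=%d\n", r->uptodate);
    }

    int main(void)
    {
        struct master_req r;

        r.uptodate = false;
        atomic_init(&r.remaining, 3);   /* three operational mirrors */

        clone_done(&r, true);
        clone_done(&r, false);  /* one mirror failed; the write still succeeds */
        clone_done(&r, true);
        return 0;
    }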
*/ + spin_lock_irq(&conf->segment_lock); + conf->start_active = conf->start_pending; + conf->start_ready = conf->start_pending; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future; + conf->start_future = mddev->sb->size+1; + conf->cnt_pending = conf->cnt_future; + conf->cnt_future = 0; + conf->phase = conf->phase ^1; + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock); + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0; + conf->phase = 0; + conf->cnt_future = conf->cnt_done;; + conf->cnt_done = 0; + spin_unlock_irq(&conf->segment_lock); + wake_up(&conf->wait_done); +} + +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state) +{ + int err = 0; + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1; + raid1_conf_t *conf = mddev->private; + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid1_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID1 configuration ... + * (this can only be in the first conf->working_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->mirrors + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ +/* if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + */ /* fall through */ + + case DISKOP_HOT_SPARE_ACTIVE: + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + ##### Actually it can be sooner now that we have improved MD ##### + This support required for expanding number of active mirrors. 
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_SPARE: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } else if (!tmp->spare){ + MD_BUG(); + err = 1; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + if (conf->working_disks <= 1) { + err = -EBUSY; + goto abort; + } + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->mirrors + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + err = -ENOSYS; + goto abort; + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + close_sync(conf); + sdisk = conf->mirrors + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->nr_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + close_sync(conf); + sdisk = conf->mirrors + spare_disk; + if (failed_disk < 0) { + // preset failed disk to itself if no failed disk. + failed_disk = spare_disk; + // try to find spare earlier in array + for (i = conf->raid_disks; i < spare_disk; i++) { + tmp = conf->mirrors + i; + if ((tmp->spare) || !tmp->used_slot) { + failed_disk = i; + break; + } + } + } + fdisk = conf->mirrors + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number); + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... + */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. 
(this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + if (sdisk->dev == MKDEV(0,0)) + sdisk->used_slot = 0; + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. + */ + + conf->working_disks++; + + break; + + /* Activate a spare disk without a failed disk */ + case DISKOP_HOT_SPARE_ACTIVE: + sdisk = conf->mirrors + spare_disk; + sdisk->spare = 0; + sdisk->write_only = 0; + conf->working_disks++; + conf->raid_disks++; + if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS) + LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__); + break; + + case DISKOP_HOT_REMOVE_SPARE: + rdisk = conf->mirrors + removed_disk; + + if (removed_disk < conf->raid_disks) { + MD_BUG(); + err = 1; + goto abort; + } + + LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n", + __FUNCTION__, evms_md_partition_name(rdisk->node), + conf->mddev->__minor, conf->nr_disks-1); + + rdisk->dev = MKDEV(0,0); + rdisk->node = NULL; + rdisk->used_slot = 0; + conf->nr_disks--; + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->mirrors + removed_disk; + + LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n", + __FUNCTION__, evms_md_partition_name(rdisk->node), + conf->mddev->__minor, conf->nr_disks-1); + + rdisk->dev = MKDEV(0,0); + rdisk->node = NULL; + rdisk->used_slot = 0; + rdisk->operational = 0; + conf->working_disks--; + conf->nr_disks--; + sb->raid_disks--; //decrement raid disks. md_core now increments + //when activating new spare, don't assume add spare here + break; + default: + MD_BUG(); + err = 1; + goto abort; + } +abort: + md_spin_unlock_irq(&conf->device_lock); + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE) + /* should move to "END_REBUILD" when such exists */ + raid1_shrink_buffers(conf); + + print_raid1_conf(conf); + return err; +} + + +#define IO_ERROR KERN_ALERT \ +"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n" + +#define REDIRECT_SECTOR KERN_ERR \ +"EVMS raid1: %s: redirecting sector %lu to another mirror\n" + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working mirrors. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. 
+ */ +static void end_sync_write(struct buffer_head *bh, int uptodate); +static void end_sync_read(struct buffer_head *bh, int uptodate); + +static void raid1d (void *data) +{ + struct raid1_bh *r1_bh; + struct buffer_head *bh; + unsigned long flags; + mddev_t *mddev; + mdk_rdev_t *rdev; + kdev_t dev; + struct evms_logical_node *node; + raid1_conf_t *conf = (raid1_conf_t *) data; + + for (;;) { + mddev = conf->mddev; + if (mddev->sb_dirty) { + LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n"); + mddev->sb_dirty = 0; + evms_md_update_sb(mddev); + } + md_spin_lock_irqsave(&retry_list_lock, flags); + r1_bh = evms_raid1_retry_list; + if (!r1_bh) + break; + evms_raid1_retry_list = r1_bh->next_r1; + md_spin_unlock_irqrestore(&retry_list_lock, flags); + + mddev = r1_bh->mddev; + bh = &r1_bh->bh_req; + switch(r1_bh->cmd) { + case SPECIAL: + /* have to allocate lots of bh structures and + * schedule writes + */ + if (test_bit(R1BH_Uptodate, &r1_bh->state)) { + int i, sum_bhs = 0; + int disks = MD_SB_DISKS; + struct buffer_head *bhl, *mbh; + + conf = mddev_to_conf(mddev); + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */ + for (i = 0; i < disks ; i++) { + if (!conf->mirrors[i].operational) + continue; + if (i==conf->last_used) + /* we read from here, no need to write */ + continue; + if (i < conf->raid_disks + && !conf->resync_mirrors + && !conf->mirrors[i].write_only) + /* don't need to write this, + * we are just rebuilding */ + continue; + mbh = bhl; + if (!mbh) { + MD_BUG(); + break; + } + bhl = mbh->b_next; + mbh->b_this_page = (struct buffer_head *)1; + + + /* + * prepare mirrored bh (fields ordered for max mem throughput): + */ + mbh->b_blocknr = bh->b_blocknr; + mbh->b_dev = conf->mirrors[i].dev; + mbh->b_rsector = bh->b_blocknr; + mbh->b_state = (1<b_count, 1); + mbh->b_size = bh->b_size; + mbh->b_page = bh->b_page; + mbh->b_data = bh->b_data; + mbh->b_list = BUF_LOCKED; + mbh->b_end_io = end_sync_write; + mbh->b_private = conf->mirrors[i].node; + + mbh->b_next = r1_bh->mirror_bh_list; + r1_bh->mirror_bh_list = mbh; + + sum_bhs++; + } + atomic_set(&r1_bh->remaining, sum_bhs); + if (bhl) raid1_free_bh(conf, bhl); + mbh = r1_bh->mirror_bh_list; + + if (!sum_bhs) { + /* nowhere to write this too... 
I guess we + * must be done + */ + sync_request_done(bh->b_blocknr, conf); + evms_md_done_sync(mddev, + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT, + 0); + raid1_free_buf(r1_bh); + } else { + while (mbh) { + + node = (struct evms_logical_node *)mbh->b_private; + mbh->b_private = r1_bh; + + W_IO(node, mbh); + evms_md_sync_acct(mbh->b_dev, + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT); + mbh = mbh->b_next; + } + } + } else { + /* There is no point trying a read-for-reconstruct + * as reconstruct is about to be aborted + */ + rdev = evms_md_find_rdev(mddev,bh->b_dev); + if (rdev) + LOG_ERROR(IO_ERROR, + evms_md_partition_name(rdev->node), + bh->b_blocknr); + evms_md_done_sync(mddev, bh->b_size>>EVMS_VSECTOR_SIZE_SHIFT, 0); + } + + break; + case READ: + case READA: + + dev = bh->b_dev; + evms_raid1_map(mddev, &node, &bh->b_dev); + if (bh->b_dev == dev) { + rdev = evms_md_find_rdev(mddev,dev); + if (rdev) + LOG_ERROR(" unrecoverable read error on %s at LBA(%lu)\n", + evms_md_partition_name(rdev->node), + r1_bh->master_bh->b_rsector); + raid1_end_bh_io(r1_bh, 0); + } else { + /* retry I/O on new device */ + bh->b_rdev = r1_bh->master_bh->b_rdev; + bh->b_rsector = bh->b_blocknr; + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, 1, NULL); + R_IO(node, bh); + } + break; + } + } + md_spin_unlock_irqrestore(&retry_list_lock, flags); +} +#undef IO_ERROR +#undef REDIRECT_SECTOR + +/* + * Private kernel thread to reconstruct mirrors after an unclean + * shutdown. + */ +static void raid1syncd (void *data) +{ + raid1_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_mirrors) + return; + if (conf->resync_mirrors == 2) + return; + down(&mddev->recovery_sem); + if (!evms_md_do_sync(mddev, NULL)) { + /* + * Only if everything went Ok. + */ + conf->resync_mirrors = 0; + } + + close_sync(conf); + + up(&mddev->recovery_sem); + raid1_shrink_buffers(conf); +} + +/* + * perform a "sync" on one "block" + * + * We need to make sure that no normal I/O request - particularly write + * requests - conflict with active sync requests. + * This is achieved by conceptually dividing the device space into a + * number of sections: + * DONE: 0 .. a-1 These blocks are in-sync + * ACTIVE: a.. b-1 These blocks may have active sync requests, but + * no normal IO requests + * READY: b .. c-1 These blocks have no normal IO requests - sync + * request may be happening + * PENDING: c .. d-1 These blocks may have IO requests, but no new + * ones will be added + * FUTURE: d .. end These blocks are not to be considered yet. IO may + * be happening, but not sync + * + * We keep a + * phase which flips (0 or 1) each time d moves and + * a count of: + * z = active io requests in FUTURE since d moved - marked with + * current phase + * y = active io requests in FUTURE before d moved, or PENDING - + * marked with previous phase + * x = active sync requests in READY + * w = active sync requests in ACTIVE + * v = active io requests in DONE + * + * Normally, a=b=c=d=0 and z= active io requests + * or a=b=c=d=END and v= active io requests + * Allowed changes to a,b,c,d: + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase + * B: y==0 -> c=d + * C: b=c, w+=x, x=0 + * D: w==0 -> a=b + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0 + * + * At start of sync we apply A. + * When y reaches 0, we apply B then A then being sync requests + * When sync point reaches c-1, we wait for y==0, and W==0, and + * then apply apply B then A then D then C. 
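+ * (Illustrative walk-through with a window of 64 sectors: at the start
+ * a=b=c=d=0, so A gives d=64; once the I/O that was in flight before the
+ * move drains (y==0), B copies c=d and a further A pushes d to 128, and
+ * sync reads can start in READY.  The while-loop in raid1_sync_request()
+ * below performs each later advance in one step, after waiting for the
+ * ACTIVE and PENDING counts to drain.)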
+ * Finally, we apply E
+ *
+ * The sync request simply issues a "read" against a working drive
+ * This is marked so that on completion the raid1d thread is woken to
+ * issue suitable write requests
+ */
+
+static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
+{
+    raid1_conf_t *conf = mddev_to_conf(mddev);
+    struct mirror_info *mirror;
+    struct raid1_bh *r1_bh;
+    struct buffer_head *bh;
+    int bsize;
+    int disk;
+    int block_nr;
+
+    spin_lock_irq(&conf->segment_lock);
+    if (!sector_nr) {
+        /* initialize ...*/
+        int buffs;
+        conf->start_active = 0;
+        conf->start_ready = 0;
+        conf->start_pending = 0;
+        conf->start_future = 0;
+        conf->phase = 0;
+        /* we want enough buffers to hold twice the window of 128*/
+        buffs = 128 *2 / (PAGE_SIZE>>9);
+        buffs = raid1_grow_buffers(conf, buffs);
+        if (buffs < 2)
+            goto nomem;
+
+        conf->window = buffs*(PAGE_SIZE>>9)/2;
+        conf->cnt_future += conf->cnt_done+conf->cnt_pending;
+        conf->cnt_done = conf->cnt_pending = 0;
+        if (conf->cnt_ready || conf->cnt_active)
+            MD_BUG();
+    }
+    while (sector_nr >= conf->start_pending) {
+        PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
+            sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
+            conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
+        wait_event_lock_irq(conf->wait_done,
+                    !conf->cnt_active,
+                    conf->segment_lock);
+        wait_event_lock_irq(conf->wait_ready,
+                    !conf->cnt_pending,
+                    conf->segment_lock);
+        conf->start_active = conf->start_ready;
+        conf->start_ready = conf->start_pending;
+        conf->start_pending = conf->start_future;
+        conf->start_future = conf->start_future+conf->window;
+        // Note: falling off the end is not a problem
+        conf->phase = conf->phase ^1;
+        conf->cnt_active = conf->cnt_ready;
+        conf->cnt_ready = 0;
+        conf->cnt_pending = conf->cnt_future;
+        conf->cnt_future = 0;
+        wake_up(&conf->wait_done);
+    }
+    conf->cnt_ready++;
+    spin_unlock_irq(&conf->segment_lock);
+
+
+    /* If reconstructing, and >1 working disc,
+     * could dedicate one to rebuild and others to
+     * service read requests ..
+     */
+    disk = conf->last_used;
+    /* make sure disk is operational */
+    while (!conf->mirrors[disk].operational) {
+        if (disk <= 0) disk = conf->raid_disks;
+        disk--;
+        if (disk == conf->last_used)
+            break;
+    }
+    conf->last_used = disk;
+
+    mirror = conf->mirrors+conf->last_used;
+
+    r1_bh = raid1_alloc_buf (conf);
+    r1_bh->mddev = mddev;
+    r1_bh->cmd = SPECIAL;
+    bh = &r1_bh->bh_req;
+
+    block_nr = sector_nr;
+    bsize = 512;
+    while (!(block_nr & 1) && bsize < PAGE_SIZE
+            && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
+        block_nr >>= 1;
+        bsize <<= 1;
+    }
+    bh->b_size = bsize;
+    bh->b_list = BUF_LOCKED;
+    bh->b_dev = mirror->dev;
+    bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
+    if (!bh->b_page)
+        BUG();
+    if (!bh->b_data)
+        BUG();
+    if (bh->b_data != page_address(bh->b_page))
+        BUG();
+    bh->b_end_io = end_sync_read;
+    bh->b_private = r1_bh;
+    bh->b_blocknr = sector_nr;
+    bh->b_rsector = sector_nr;
+    init_waitqueue_head(&bh->b_wait);
+
+    R_IO(mirror->node, bh);
+    evms_md_sync_acct(bh->b_dev, bsize/512);
+
+    return (bsize >> 9);
+
+nomem:
+    raid1_shrink_buffers(conf);
+    spin_unlock_irq(&conf->segment_lock);
+    return -ENOMEM;
+}
+
+static void end_sync_read(struct buffer_head *bh, int uptodate)
+{
+    struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
+
+    /* we have read a block, now it needs to be re-written,
+     * or re-read if the read failed.
+ * We don't do much here, just schedule handling by raid1d + */ + if (!uptodate) + evms_md_error_dev(r1_bh->mddev, bh->b_dev); + else + set_bit(R1BH_Uptodate, &r1_bh->state); + raid1_reschedule_retry(r1_bh); +} + +static void end_sync_write(struct buffer_head *bh, int uptodate) +{ + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private); + + if (!uptodate) + evms_md_error_dev(r1_bh->mddev, bh->b_dev); + if (atomic_dec_and_test(&r1_bh->remaining)) { + mddev_t *mddev = r1_bh->mddev; + unsigned long sect = bh->b_blocknr; + int size = bh->b_size; + + raid1_free_buf(r1_bh); + sync_request_done(sect, mddev_to_conf(mddev)); + evms_md_done_sync(mddev, size>>EVMS_VSECTOR_SIZE_SHIFT, uptodate); + } +} + +#define INVALID_LEVEL KERN_WARNING \ +"EVMS raid1: md%d: raid level not set to mirroring (%d)\n" + +#define NO_SB KERN_ERR \ +"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n" + +#define ERRORS KERN_ERR \ +"EVMS raid1: disabled mirror %s (errors detected)\n" + +#define NOT_IN_SYNC KERN_ERR \ +"EVMS raid1: disabled mirror %s (not in sync)\n" + +#define INCONSISTENT KERN_ERR \ +"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n" + +#define ALREADY_RUNNING KERN_ERR \ +"EVMS raid1: disabled mirror %s (mirror %d already operational)\n" + +#define OPERATIONAL KERN_INFO \ +"EVMS raid1: device %s operational as mirror %d\n" + +#define MEM_ERROR KERN_ERR \ +"EVMS raid1: couldn't allocate memory for md%d\n" + +#define SPARE KERN_INFO \ +"EVMS raid1: spare disk %s\n" + +#define NONE_OPERATIONAL KERN_ERR \ +"EVMS raid1: no operational mirrors for md%d\n" + +#define ARRAY_IS_ACTIVE KERN_INFO \ +"EVMS raid1: raid set md%d active with %d out of %d mirrors\n" + +#define THREAD_ERROR KERN_ERR \ +"EVMS raid1: couldn't allocate thread for md%d\n" + +#define START_RESYNC KERN_WARNING \ +"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n" + +static int raid1_run (mddev_t *mddev) +{ + raid1_conf_t *conf; + int i, j, disk_idx; + struct mirror_info *disk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *descriptor; + mdk_rdev_t *rdev; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + LOG_EXTRA("%s ENTRY\n", __FUNCTION__); + if (sb->level != 1) { + LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level); + goto out; + } + /* + * copy the already verified devices into our private RAID1 + * bookkeeping area. 
[whatever we allocate in raid1_run(), + * should be freed in raid1_stop()] + */ + + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL); + mddev->private = conf; + if (!conf) { + LOG_ERROR(MEM_ERROR, mdidx(mddev)); + goto out; + } + memset(conf, 0, sizeof(*conf)); + + ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev->faulty) { + LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node)); + } else { + if (!rdev->sb) { + MD_BUG(); + continue; + } + } + if (rdev->desc_nr == -1) { + MD_BUG(); + continue; + } + descriptor = &sb->disks[rdev->desc_nr]; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor)) { + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->node = rdev->node; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + continue; + } + if (disk_active(descriptor)) { + if (!disk_sync(descriptor)) { + LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node)); + continue; + } + if ((descriptor->number > MD_SB_DISKS) || + (disk_idx > sb->raid_disks)) { + + LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node)); + continue; + } + if (disk->operational) { + LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx); + continue; + } + LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->node = rdev->node; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 1; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node)); + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->node = rdev->node; + disk->dev = rdev->dev; + disk->sect_limit = MAX_WORK_PER_DISK; + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + disk->head_position = 0; + } + } + conf->raid_disks = sb->raid_disks; + conf->nr_disks = sb->nr_disks; + conf->mddev = mddev; + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&conf->wait_buffer); + init_waitqueue_head(&conf->wait_done); + init_waitqueue_head(&conf->wait_ready); + + if (!conf->working_disks) { + LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev)); + goto out_free_conf; + } + + + /* pre-allocate some buffer_head structures. + * As a minimum, 1 r1bh and raid_disks buffer_heads + * would probably get us by in tight memory situations, + * but a few more is probably a good idea. 
+ * For now, try NR_RESERVED_BUFS r1bh and + * NR_RESERVED_BUFS*raid_disks bufferheads + * This will allow at least NR_RESERVED_BUFS concurrent + * reads or writes even if kmalloc starts failing + */ + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS || + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks) + < NR_RESERVED_BUFS*conf->raid_disks) { + LOG_ERROR(MEM_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + for (i = 0; i < MD_SB_DISKS; i++) { + + descriptor = sb->disks+i; + disk_idx = descriptor->raid_disk; + disk = conf->mirrors + disk_idx; + + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) && + !disk->used_slot) { + + disk->number = descriptor->number; + disk->raid_disk = disk_idx; + disk->dev = MKDEV(0,0); + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + disk->head_position = 0; + } + } + + /* + * find the first working one and use it as a starting point + * to read balancing. + */ + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++) + /* nothing */; + conf->last_used = j; + + + if (conf->working_disks != sb->raid_disks) { + LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n", + mdidx(mddev)); + start_recovery = 1; + } + + { + const char * name = "evms_raid1d"; + + conf->thread = evms_cs_register_thread(raid1d, conf, name); + if (!conf->thread) { + LOG_ERROR(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) && + (conf->working_disks > 1)) { + const char * name = "evms_raid1syncd"; + + conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name); + if (!conf->resync_thread) { + LOG_ERROR(THREAD_ERROR, mdidx(mddev)); + goto out_free_conf; + } + + LOG_WARNING(START_RESYNC, mdidx(mddev)); + conf->resync_mirrors = 1; + evms_cs_wakeup_thread(conf->resync_thread); + } + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. 
+ */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks+i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->mirrors[j].operational) + continue; + if (sb->disks[i].number == conf->mirrors[j].number) + mark_disk_sync(sb->disks+i); + } + } + sb->active_disks = conf->working_disks; + + if (start_recovery) + evms_md_recover_arrays(); + + + LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks); + /* + * Ok, everything is just fine now + */ + return 0; + +out_free_conf: + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; +out: + MOD_DEC_USE_COUNT; + return -EIO; +} + +#undef INVALID_LEVEL +#undef NO_SB +#undef ERRORS +#undef NOT_IN_SYNC +#undef INCONSISTENT +#undef ALREADY_RUNNING +#undef OPERATIONAL +#undef SPARE +#undef NONE_OPERATIONAL +#undef ARRAY_IS_ACTIVE + +static int raid1_stop_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + LOG_EXTRA("%s ENTRY\n", __FUNCTION__); + if (conf->resync_thread) { + if (conf->resync_mirrors) { + conf->resync_mirrors = 2; + evms_cs_interrupt_thread(conf->resync_thread); + LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n"); + return 1; + } + return 0; + } + return 0; +} + +static int raid1_restart_resync (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + LOG_EXTRA("%s ENTRY\n", __FUNCTION__); + if (conf->resync_mirrors) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + conf->resync_mirrors = 1; + evms_cs_wakeup_thread(conf->resync_thread); + return 1; + } + return 0; +} + +static int raid1_stop (mddev_t *mddev) +{ + raid1_conf_t *conf = mddev_to_conf(mddev); + + LOG_EXTRA("%s ENTRY\n", __FUNCTION__); + evms_cs_unregister_thread(conf->thread); + if (conf->resync_thread) + evms_cs_unregister_thread(conf->resync_thread); + raid1_shrink_r1bh(conf); + raid1_shrink_bh(conf); + raid1_shrink_buffers(conf); + kfree(conf); + mddev->private = NULL; + MOD_DEC_USE_COUNT; + return 0; +} + +static int raid1_evms_ioctl ( + mddev_t * mddev, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + int i, rc = 0; + struct evms_logical_node *node = NULL; + raid1_conf_t *conf = mddev_to_conf(mddev); + + switch (cmd) { + case EVMS_GET_BMAP: + { + for (i = 0; i < MD_SB_DISKS; i++) { + if (conf->mirrors[i].operational) { + node = conf->mirrors[i].node; + break; + } + } + + if (node) + rc = IOCTL(node, inode, file, cmd, arg); + else + rc = -ENODEV; + + break; + } + + default: + rc = -EINVAL; + } + return rc; +} + +static mdk_personality_t raid1_personality = { + .name = "evms_raid1", + .read = raid1_read, + .write = raid1_write, + .run = raid1_run, + .stop = raid1_stop, + .status = raid1_status, + .error_handler = raid1_error, + .diskop = raid1_diskop, + .stop_resync = raid1_stop_resync, + .restart_resync = raid1_restart_resync, + .sync_request = raid1_sync_request, + .evms_ioctl = raid1_evms_ioctl +}; + +static int md__init raid1_init (void) +{ + return evms_register_md_personality (RAID1, &raid1_personality); +} + +static void raid1_exit (void) +{ + evms_unregister_md_personality (RAID1); +} + +module_init(raid1_init); +module_exit(raid1_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/md_raid5.c evms-2002-09-30/drivers/evms/md_raid5.c --- linux-2002-09-30/drivers/evms/md_raid5.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_raid5.c Thu Sep 26 14:40:58 2002 @@ -0,0 +1,2283 @@ +/* + * 
md_raid5.c : Multiple Devices driver for Linux + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * Copyright (C) 1999, 2000 Ingo Molnar + * + * RAID-5 management functions. + * + * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified + * by Cuong (Mike) Tran , January 2002. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define LOG_PREFIX "md raid5: " + +static mdk_personality_t raid5_personality; + +/* + * Stripe cache + */ + +#define NR_STRIPES 256 +#define IO_THRESHOLD 1 +#define HASH_PAGES 1 +#define HASH_PAGES_ORDER 0 +#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *)) +#define HASH_MASK (NR_HASH - 1) +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK]) + +/* + * The following can be used to debug the driver + */ +#define RAID5_DEBUG 0 +#define RAID5_PARANOIA 1 +#if RAID5_PARANOIA && CONFIG_SMP +#define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG() +#else +#define CHECK_DEVLOCK() +#endif + +static void print_raid5_conf(raid5_conf_t * conf); + +static inline void +__release_stripe(raid5_conf_t * conf, struct stripe_head *sh) +{ + if (atomic_dec_and_test(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + if (atomic_read(&conf->active_stripes) == 0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) { + if (test_bit(STRIPE_DELAYED, &sh->state)) + list_add_tail(&sh->lru, &conf->delayed_list); + else + list_add_tail(&sh->lru, &conf->handle_list); + evms_cs_wakeup_thread(conf->thread); + } else { + if (test_and_clear_bit + (STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + evms_cs_wakeup_thread(conf->thread); + } + list_add_tail(&sh->lru, &conf->inactive_list); + atomic_dec(&conf->active_stripes); + if (!conf->inactive_blocked || + atomic_read(&conf->active_stripes) < + (NR_STRIPES * 3 / 4)) + wake_up(&conf->wait_for_stripe); + } + } +} +static void +release_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + __release_stripe(conf, sh); + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static void +remove_hash(struct stripe_head *sh) +{ + + if (sh->hash_pprev) { + if (sh->hash_next) + sh->hash_next->hash_pprev = sh->hash_pprev; + *sh->hash_pprev = sh->hash_next; + sh->hash_pprev = NULL; + } +} + +static __inline__ void +insert_hash(raid5_conf_t * conf, struct stripe_head *sh) +{ + struct stripe_head **shp = &stripe_hash(conf, sh->sector); + + CHECK_DEVLOCK(); + if ((sh->hash_next = *shp) != NULL) + (*shp)->hash_pprev = &sh->hash_next; + *shp = sh; + sh->hash_pprev = shp; +} + +/* find an idle stripe, make sure it is unhashed, and return it. 
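+ * (Idle stripes sit on conf->inactive_list; "unhashed" means removed from
+ * conf->stripe_hashtbl, the NR_HASH-bucket chained hash that stripe_hash()
+ * indexes by sector, so __find_stripe() can never return a stripe that has
+ * been handed out for reuse under a different sector.)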
*/ +static struct stripe_head * +get_free_stripe(raid5_conf_t * conf) +{ + struct stripe_head *sh = NULL; + struct list_head *first; + + CHECK_DEVLOCK(); + if (list_empty(&conf->inactive_list)) + goto out; + first = conf->inactive_list.next; + sh = list_entry(first, struct stripe_head, lru); + list_del_init(first); + remove_hash(sh); + atomic_inc(&conf->active_stripes); + out: + return sh; +} + +static void +shrink_buffers(struct stripe_head *sh, int num) +{ + struct buffer_head *bh; + int i; + + for (i = 0; i < num; i++) { + bh = sh->bh_cache[i]; + if (!bh) + return; + sh->bh_cache[i] = NULL; + free_page((unsigned long) bh->b_data); + kfree(bh); + } +} + +static int +grow_buffers(struct stripe_head *sh, int num, int b_size, int priority) +{ + struct buffer_head *bh; + int i; + + for (i = 0; i < num; i++) { + struct page *page; + bh = kmalloc(sizeof (struct buffer_head), priority); + if (!bh) + return 1; + memset(bh, 0, sizeof (struct buffer_head)); + init_waitqueue_head(&bh->b_wait); + if ((page = alloc_page(priority))) + bh->b_data = page_address(page); + else { + kfree(bh); + return 1; + } + bh->b_count = (atomic_t)ATOMIC_INIT(0); + bh->b_page = page; + sh->bh_cache[i] = bh; + + } + return 0; +} + +static struct buffer_head *raid5_build_block(struct stripe_head *sh, int i); + +static inline void +init_stripe(struct stripe_head *sh, unsigned long sector) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + + if (atomic_read(&sh->count) != 0) + BUG(); + if (test_bit(STRIPE_HANDLE, &sh->state)) + BUG(); + + CHECK_DEVLOCK(); + + remove_hash(sh); + + sh->sector = sector; + sh->size = conf->buffer_size; + sh->state = 0; + + for (i = disks; i--;) { + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] || + buffer_locked(sh->bh_cache[i])) { + LOG_ERROR("sector=%lx i=%d %p %p %p %d\n", + sh->sector, i, sh->bh_read[i], + sh->bh_write[i], sh->bh_written[i], + buffer_locked(sh->bh_cache[i])); + BUG(); + } + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state); + raid5_build_block(sh, i); + } + insert_hash(conf, sh); +} + +/* the buffer size has changed, so unhash all stripes + * as active stripes complete, they will go onto inactive list + */ +static void +shrink_stripe_cache(raid5_conf_t * conf) +{ + int i; + CHECK_DEVLOCK(); + if (atomic_read(&conf->active_stripes)) + BUG(); + for (i = 0; i < NR_HASH; i++) { + struct stripe_head *sh; + while ((sh = conf->stripe_hashtbl[i])) + remove_hash(sh); + } +} + +static struct stripe_head * +__find_stripe(raid5_conf_t * conf, unsigned long sector) +{ + struct stripe_head *sh; + + CHECK_DEVLOCK(); + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next) + if (sh->sector == sector) + return sh; + return NULL; +} + +static struct stripe_head * +get_active_stripe(raid5_conf_t * conf, unsigned long sector, int size) +{ + struct stripe_head *sh; + + md_spin_lock_irq(&conf->device_lock); + + do { + if (conf->buffer_size == 0 || + (size && size != conf->buffer_size)) { + /* either the size is being changed (buffer_size==0) or + * we need to change it. + * If size==0, we can proceed as soon as buffer_size gets set. + * If size>0, we can proceed when active_stripes reaches 0, or + * when someone else sets the buffer_size to size. 
+ * If someone sets the buffer size to something else, we will need to + * assert that we want to change it again + */ + if (size == 0) + wait_event_lock_irq(conf->wait_for_stripe, + conf->buffer_size, + conf->device_lock); + else { + while (conf->buffer_size != size + && atomic_read(&conf->active_stripes)) { + conf->buffer_size = 0; + wait_event_lock_irq(conf-> + wait_for_stripe, + atomic_read(&conf-> + active_stripes) + == 0 + || conf-> + buffer_size, + conf->device_lock); + } + + if (conf->buffer_size != size) { + shrink_stripe_cache(conf); + if (size == 0) + BUG(); + conf->buffer_size = size; + } + } + } + if (size == 0) + sector -= sector & ((conf->buffer_size >> 9) - 1); + + sh = __find_stripe(conf, sector); + if (!sh) { + if (!conf->inactive_blocked) + sh = get_free_stripe(conf); + if (!sh) { + conf->inactive_blocked = 1; + wait_event_lock_irq(conf->wait_for_stripe, + !list_empty(&conf-> + inactive_list) + && + (atomic_read + (&conf->active_stripes) < + (NR_STRIPES * 3 / 4) + || !conf-> + inactive_blocked), + conf->device_lock); + conf->inactive_blocked = 0; + } else + init_stripe(sh, sector); + } else { + if (atomic_read(&sh->count)) { + if (!list_empty(&sh->lru)) + BUG(); + } else { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + if (list_empty(&sh->lru)) + BUG(); + list_del_init(&sh->lru); + } + } + } while (sh == NULL); + + if (sh) + atomic_inc(&sh->count); + + md_spin_unlock_irq(&conf->device_lock); + return sh; +} + +static int +grow_stripes(raid5_conf_t * conf, int num, int priority) +{ + struct stripe_head *sh; + + while (num--) { + sh = kmalloc(sizeof (struct stripe_head), priority); + if (!sh) + return 1; + memset(sh, 0, sizeof (*sh)); + sh->raid_conf = conf; + sh->lock = SPIN_LOCK_UNLOCKED; + + if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) { + shrink_buffers(sh, conf->raid_disks); + kfree(sh); + return 1; + } + /* we just created an active stripe so... */ + sh->count = (atomic_t)ATOMIC_INIT(1); + atomic_inc(&conf->active_stripes); + INIT_LIST_HEAD(&sh->lru); + release_stripe(sh); + } + return 0; +} + +static void +shrink_stripes(raid5_conf_t * conf, int num) +{ + struct stripe_head *sh; + + while (num--) { + spin_lock_irq(&conf->device_lock); + sh = get_free_stripe(conf); + spin_unlock_irq(&conf->device_lock); + if (!sh) + break; + if (atomic_read(&sh->count)) + BUG(); + shrink_buffers(sh, conf->raid_disks); + kfree(sh); + atomic_dec(&conf->active_stripes); + } +} + +static void +raid5_end_read_request(struct buffer_head *bh, int uptodate) +{ + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + + for (i = 0; i < disks; i++) + if (bh == sh->bh_cache[i]) + break; + + if (i == disks) { + BUG(); + return; + } + + if (uptodate) { + struct buffer_head *buffer; + spin_lock_irqsave(&conf->device_lock, flags); + /* we can return a buffer if we bypassed the cache or + * if the top buffer is not in highmem. 
If there are + * multiple buffers, leave the extra work to + * handle_stripe + */ + buffer = sh->bh_read[i]; + if (buffer && (!PageHighMem(buffer->b_page) + || buffer->b_page == bh->b_page) + ) { + sh->bh_read[i] = buffer->b_reqnext; + buffer->b_reqnext = NULL; + } else + buffer = NULL; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (sh->bh_page[i] == NULL) + set_bit(BH_Uptodate, &bh->b_state); + if (buffer) { + if (buffer->b_page != bh->b_page) + memcpy(buffer->b_data, bh->b_data, bh->b_size); + evms_cs_volume_request_in_progress(buffer->b_rdev, -1, NULL); + buffer->b_end_io(buffer, 1); + } + } else { + /* I/O error */ + if (sh->node[i]) + evms_md_error(conf->mddev, sh->node[i]); + else + LOG_WARNING + ("NODE was not set, skipping evms_md_error()\n"); + clear_bit(BH_Uptodate, &bh->b_state); + } + /* must restore b_page before unlocking buffer... */ + if (sh->bh_page[i]) { + bh->b_page = sh->bh_page[i]; + bh->b_data = page_address(bh->b_page); + sh->bh_page[i] = NULL; + clear_bit(BH_Uptodate, &bh->b_state); + } + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); + if (sh->node[i]) { + sh->node[i] = NULL; + } else { + LOG_WARNING(" evms node was not set.\n"); + } + +} + +static void +raid5_end_write_request(struct buffer_head *bh, int uptodate) +{ + struct stripe_head *sh = bh->b_private; + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks, i; + unsigned long flags; + + for (i = 0; i < disks; i++) + if (bh == sh->bh_cache[i]) + break; + + if (i == disks) { + BUG(); + return; + } + + md_spin_lock_irqsave(&conf->device_lock, flags); + if (!uptodate) { + /* I/O error */ + if (sh->node[i]) + evms_md_error(conf->mddev, sh->node[i]); + else + LOG_WARNING + (" NODE was not set, skipping evms_md_error()\n"); + } + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); + md_spin_unlock_irqrestore(&conf->device_lock, flags); + if (sh->node[i]) { + sh->node[i] = NULL; + } else { + LOG_WARNING(" evms node was not set.\n"); + } +} + +static struct buffer_head * +raid5_build_block(struct stripe_head *sh, int i) +{ + raid5_conf_t *conf = sh->raid_conf; + struct buffer_head *bh = sh->bh_cache[i]; + unsigned long block = sh->sector / (sh->size >> 9); + + init_buffer(bh, raid5_end_read_request, sh); + bh->b_dev = conf->disks[i].dev; + bh->b_blocknr = block; + + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped); + bh->b_size = sh->size; + bh->b_list = BUF_LOCKED; + return bh; +} + +static int +raid5_error(mddev_t * mddev, struct evms_logical_node * node) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + struct disk_info *disk; + int i; + + LOG_WARNING("%s: called\n", __FUNCTION__); + + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) { + if (disk->node == node) { + if (disk->operational) { + disk->operational = 0; + mark_disk_faulty(sb->disks + disk->number); + mark_disk_nonsync(sb->disks + disk->number); + mark_disk_inactive(sb->disks + disk->number); + sb->active_disks--; + sb->working_disks--; + sb->failed_disks++; + mddev->sb_dirty = 1; + conf->working_disks--; + conf->failed_disks++; + evms_cs_wakeup_thread(conf->thread); + LOG_WARNING + ("Disk failure on %s, disabling device." 
+ " Operation continuing on %d devices\n", + evms_md_partition_name(disk->node), + conf->working_disks); + } + return 0; + } + } + /* + * handle errors in spares (during reconstruction) + */ + if (conf->spare) { + disk = conf->spare; + if (disk->node == node) { + LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n", + evms_md_partition_name(disk->node)); + if (!conf->spare->operational) { + /* probably a SET_DISK_FAULTY ioctl */ + return -EIO; + } + disk->operational = 0; + disk->write_only = 0; + conf->spare = NULL; + mark_disk_faulty(sb->disks + disk->number); + mark_disk_nonsync(sb->disks + disk->number); + mark_disk_inactive(sb->disks + disk->number); + sb->spare_disks--; + sb->working_disks--; + sb->failed_disks++; + + mddev->sb_dirty = 1; + evms_cs_wakeup_thread(conf->thread); + + return 0; + } + } + MD_BUG(); + return -EIO; +} + +/* + * Input: a 'big' sector number, + * Output: index of the data and parity disk, and the sector # in them. + */ +static unsigned long +raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks, + unsigned int data_disks, unsigned int *dd_idx, + unsigned int *pd_idx, raid5_conf_t * conf) +{ + unsigned long stripe; + unsigned long chunk_number; + unsigned int chunk_offset; + unsigned long new_sector; + int sectors_per_chunk = conf->chunk_size >> 9; + + /* First compute the information on this sector */ + + /* + * Compute the chunk number and the sector offset inside the chunk + */ + chunk_number = r_sector / sectors_per_chunk; + chunk_offset = r_sector % sectors_per_chunk; + + /* + * Compute the stripe number + */ + stripe = chunk_number / data_disks; + + /* + * Compute the data disk and parity disk indexes inside the stripe + */ + *dd_idx = chunk_number % data_disks; + + /* + * Select the parity disk based on the user selected algorithm. 
+ */ + if (conf->level == 4) + *pd_idx = data_disks; + else + switch (conf->algorithm) { + case ALGORITHM_LEFT_ASYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + *pd_idx = stripe % raid_disks; + if (*dd_idx >= *pd_idx) + (*dd_idx)++; + break; + case ALGORITHM_LEFT_SYMMETRIC: + *pd_idx = data_disks - stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + *pd_idx = stripe % raid_disks; + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks; + break; + default: + LOG_ERROR(" unsupported algorithm %d\n", + conf->algorithm); + } + + /* + * Finally, compute the new sector number + */ + new_sector = stripe * sectors_per_chunk + chunk_offset; + return new_sector; +} + +#define check_xor() do { \ + if (count == MAX_XOR_BLOCKS) { \ + evms_md_xor_block(count, bh_ptr); \ + count = 1; \ + } \ + } while(0) + +static void +compute_block(struct stripe_head *sh, int dd_idx) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, count, disks = conf->raid_disks; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh; + + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size); + bh_ptr[0] = sh->bh_cache[dd_idx]; + count = 1; + for (i = disks; i--;) { + if (i == dd_idx) + continue; + bh = sh->bh_cache[i]; + if (buffer_uptodate(bh)) + bh_ptr[count++] = bh; + else + LOG_ERROR("%s: %d, stripe %lu, %d not present\n", + __FUNCTION__, dd_idx, sh->sector, i); + + check_xor(); + } + if (count != 1) + evms_md_xor_block(count, bh_ptr); + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state); +} + +static void +compute_parity(struct stripe_head *sh, int method) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS]; + struct buffer_head *chosen[MD_SB_DISKS]; + + memset(chosen, 0, sizeof (chosen)); + + count = 1; + bh_ptr[0] = sh->bh_cache[pd_idx]; + switch (method) { + case READ_MODIFY_WRITE: + if (!buffer_uptodate(sh->bh_cache[pd_idx])) + BUG(); + for (i = disks; i--;) { + if (i == pd_idx) + continue; + if (sh->bh_write[i] && buffer_uptodate(sh->bh_cache[i])) { + bh_ptr[count++] = sh->bh_cache[i]; + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + check_xor(); + } + } + break; + case RECONSTRUCT_WRITE: + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size); + for (i = disks; i--;) + if (i != pd_idx && sh->bh_write[i]) { + chosen[i] = sh->bh_write[i]; + sh->bh_write[i] = sh->bh_write[i]->b_reqnext; + chosen[i]->b_reqnext = sh->bh_written[i]; + sh->bh_written[i] = chosen[i]; + } + break; + case CHECK_PARITY: + break; + } + if (count > 1) { + evms_md_xor_block(count, bh_ptr); + count = 1; + } + + for (i = disks; i--;) + if (chosen[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + char *bdata; + bdata = bh_kmap(chosen[i]); + memcpy(bh->b_data, bdata, sh->size); + bh_kunmap(chosen[i]); + set_bit(BH_Lock, &bh->b_state); + mark_buffer_uptodate(bh, 1); + } + + switch (method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: + for (i = disks; i--;) + if (i != pd_idx) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + break; + case READ_MODIFY_WRITE: + for (i = disks; i--;) + if (chosen[i]) { + bh_ptr[count++] = sh->bh_cache[i]; + check_xor(); + } + } + if (count != 1) + evms_md_xor_block(count, bh_ptr); + + if (method != CHECK_PARITY) { + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1); + 
set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state); + } else + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0); +} + +static void +add_stripe_bh(struct stripe_head *sh, struct buffer_head *bh, int dd_idx, + int rw) +{ + struct buffer_head **bhp; + raid5_conf_t *conf = sh->raid_conf; + + spin_lock(&sh->lock); + spin_lock_irq(&conf->device_lock); + bh->b_reqnext = NULL; + if (rw == READ) + bhp = &sh->bh_read[dd_idx]; + else + bhp = &sh->bh_write[dd_idx]; + while (*bhp) { + LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n", + rw, sh->sector); + bhp = &(*bhp)->b_reqnext; + } + *bhp = bh; + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +} + +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe and then examine the state of various bits + * to see what needs to be done. + * Possible results: + * return some read request which now have data + * return some write requests which are safely on disc + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + * + * Parity calculations are done inside the stripe lock + * buffers are taken off read_list or write_list, and bh_cache buffers + * get BH_Lock set before the stripe lock is released. + * + */ + +static void +handle_stripe(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int disks = conf->raid_disks; + struct buffer_head *return_ok = NULL, *return_fail = NULL; + int action[MD_SB_DISKS]; + int i; + int syncing; + int locked = 0, uptodate = 0, to_read = 0, to_write = 0, failed = + 0, written = 0; + int failed_num = 0; + struct buffer_head *bh; + + memset(action, 0, sizeof (action)); + + spin_lock(&sh->lock); + clear_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + + syncing = test_bit(STRIPE_SYNCING, &sh->state); + /* Now to look around and see what can be done */ + + for (i = disks; i--;) { + bh = sh->bh_cache[i]; + /* maybe we can reply to a read */ + if (buffer_uptodate(bh) && sh->bh_read[i]) { + struct buffer_head *rbh, *rbh2; + spin_lock_irq(&conf->device_lock); + rbh = sh->bh_read[i]; + sh->bh_read[i] = NULL; + spin_unlock_irq(&conf->device_lock); + while (rbh) { + char *bdata; + bdata = bh_kmap(rbh); + memcpy(bdata, bh->b_data, bh->b_size); + bh_kunmap(rbh); + rbh2 = rbh->b_reqnext; + rbh->b_reqnext = return_ok; + return_ok = rbh; + rbh = rbh2; + } + } + + /* now count some things */ + if (buffer_locked(bh)) + locked++; + if (buffer_uptodate(bh)) + uptodate++; + + if (sh->bh_read[i]) + to_read++; + if (sh->bh_write[i]) + to_write++; + if (sh->bh_written[i]) + written++; + if (!conf->disks[i].operational) { + failed++; + failed_num = i; + } + } + /* check if the array has lost two devices and, if so, some requests might + * need to be failed + */ + if (failed > 1 && to_read + to_write) { + for (i = disks; i--;) { + /* fail all writes first */ + if (sh->bh_write[i]) + to_write--; + while ((bh = sh->bh_write[i])) { + sh->bh_write[i] = bh->b_reqnext; + bh->b_reqnext = return_fail; + return_fail = bh; + } + /* fail any reads if this device is non-operational */ + if (!conf->disks[i].operational) { + spin_lock_irq(&conf->device_lock); + if (sh->bh_read[i]) + to_read--; + while ((bh = sh->bh_read[i])) { + sh->bh_read[i] = bh->b_reqnext; + bh->b_reqnext = return_fail; + return_fail = bh; + } + spin_unlock_irq(&conf->device_lock); + } + } + } + if (failed > 1 && syncing) { + evms_md_done_sync(conf->mddev, + (sh->size >> 9) - sh->sync_redone, 0); + clear_bit(STRIPE_SYNCING, &sh->state); + syncing = 
0; + } + + /* might be able to return some write requests if the parity block + * is safe, or on a failed drive + */ + bh = sh->bh_cache[sh->pd_idx]; + if (written && + ((conf->disks[sh->pd_idx].operational && !buffer_locked(bh) + && buffer_uptodate(bh)) + || (failed == 1 && failed_num == sh->pd_idx)) + ) { + /* any written block on a uptodate or failed drive can be returned */ + for (i = disks; i--;) + if (sh->bh_written[i]) { + bh = sh->bh_cache[i]; + if (!conf->disks[sh->pd_idx].operational || + (!buffer_locked(bh) + && buffer_uptodate(bh))) { + /* maybe we can return some write requests */ + struct buffer_head *wbh, *wbh2; + wbh = sh->bh_written[i]; + sh->bh_written[i] = NULL; + while (wbh) { + wbh2 = wbh->b_reqnext; + wbh->b_reqnext = return_ok; + return_ok = wbh; + wbh = wbh2; + } + } + } + } + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + */ + if (to_read || (syncing && (uptodate + failed < disks))) { + for (i = disks; i--;) { + bh = sh->bh_cache[i]; + if (!buffer_locked(bh) && !buffer_uptodate(bh) && + (sh->bh_read[i] || syncing + || (failed && sh->bh_read[failed_num]))) { + /* we would like to get this block, possibly + * by computing it, but we might not be able to + */ + if (uptodate == disks - 1) { + compute_block(sh, i); + uptodate++; + } else if (conf->disks[i].operational) { + set_bit(BH_Lock, &bh->b_state); + action[i] = READ + 1; + /* if I am just reading this block and we don't have + a failed drive, or any pending writes then sidestep the cache */ + if (sh->bh_page[i]) + BUG(); + if (sh->bh_read[i] + && !sh->bh_read[i]->b_reqnext + && !syncing && !failed + && !to_write) { + sh->bh_page[i] = + sh->bh_cache[i]->b_page; + sh->bh_cache[i]->b_page = + sh->bh_read[i]->b_page; + sh->bh_cache[i]->b_data = + sh->bh_read[i]->b_data; + } + locked++; + if (syncing) + evms_md_sync_acct(conf-> + disks[i].dev, + bh-> + b_size >> 9); + } + } + } + set_bit(STRIPE_HANDLE, &sh->state); + } + + /* now to consider writing and what else, if anything should be read */ + if (to_write) { + int rmw = 0, rcw = 0; + for (i = disks; i--;) { + /* would I have to read this buffer for read_modify_write */ + bh = sh->bh_cache[i]; + if ((sh->bh_write[i] || i == sh->pd_idx) && + (!buffer_locked(bh) || sh->bh_page[i]) && + !buffer_uptodate(bh)) { + if (conf->disks[i].operational +/* && !(conf->resync_parity && i == sh->pd_idx) */ + ) + rmw++; + else + rmw += 2 * disks; /* cannot read it */ + } + /* Would I have to read this buffer for reconstruct_write */ + if (!sh->bh_write[i] && i != sh->pd_idx && + (!buffer_locked(bh) || sh->bh_page[i]) && + !buffer_uptodate(bh)) { + if (conf->disks[i].operational) + rcw++; + else + rcw += 2 * disks; + } + } + set_bit(STRIPE_HANDLE, &sh->state); + if (rmw < rcw && rmw > 0) + /* prefer read-modify-write, but need to get some data */ + for (i = disks; i--;) { + bh = sh->bh_cache[i]; + if ((sh->bh_write[i] || i == sh->pd_idx) && + !buffer_locked(bh) && !buffer_uptodate(bh) + && conf->disks[i].operational) { + if (test_bit + (STRIPE_PREREAD_ACTIVE, + &sh->state)) { + set_bit(BH_Lock, &bh->b_state); + action[i] = READ + 1; + locked++; + } else { + set_bit(STRIPE_DELAYED, + &sh->state); + set_bit(STRIPE_HANDLE, + &sh->state); + } + } + } + if (rcw <= rmw && rcw > 0) + /* want reconstruct write, but need to get some data */ + for (i = disks; i--;) { + bh = sh->bh_cache[i]; + if (!sh->bh_write[i] && i != sh->pd_idx && + !buffer_locked(bh) && !buffer_uptodate(bh) + && conf->disks[i].operational) { + if (test_bit + 
(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + set_bit(BH_Lock, &bh->b_state); + action[i] = READ + 1; + locked++; + } else { + set_bit(STRIPE_DELAYED, + &sh->state); + set_bit(STRIPE_HANDLE, + &sh->state); + } + } + } + /* now if nothing is locked, and if we have enough data, we can start a write request */ + if (locked == 0 && (rcw == 0 || rmw == 0)) { + compute_parity(sh, + rcw == + 0 ? RECONSTRUCT_WRITE : + READ_MODIFY_WRITE); + /* now every locked buffer is ready to be written */ + for (i = disks; i--;) + if (buffer_locked(sh->bh_cache[i])) { + locked++; + action[i] = WRITE + 1; + if (!conf->disks[i].operational + || (i == sh->pd_idx && failed == 0)) + set_bit(STRIPE_INSYNC, + &sh->state); + } + if (test_and_clear_bit + (STRIPE_PREREAD_ACTIVE, &sh->state)) { + atomic_dec(&conf->preread_active_stripes); + if (atomic_read(&conf->preread_active_stripes) < + IO_THRESHOLD) + evms_cs_wakeup_thread(conf->thread); + } + } + } + + /* maybe we need to check and possibly fix the parity for this stripe + * Any reads will already have been scheduled, so we just see if enough data + * is available + */ + if (syncing && locked == 0 && + !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) { + set_bit(STRIPE_HANDLE, &sh->state); + if (failed == 0) { + if (uptodate != disks) + BUG(); + compute_parity(sh, CHECK_PARITY); + uptodate--; + bh = sh->bh_cache[sh->pd_idx]; + if ((*(u32 *) bh->b_data) == 0 && + !memcmp(bh->b_data, bh->b_data + 4, + bh->b_size - 4)) { + /* parity is correct (on disc, not in buffer any more) */ + set_bit(STRIPE_INSYNC, &sh->state); + } + } + if (!test_bit(STRIPE_INSYNC, &sh->state)) { + struct disk_info *spare; + if (failed == 0) + failed_num = sh->pd_idx; + /* should be able to compute the missing block and write it to spare */ + if (!buffer_uptodate(sh->bh_cache[failed_num])) { + if (uptodate + 1 != disks) + BUG(); + compute_block(sh, failed_num); + uptodate++; + } + if (uptodate != disks) + BUG(); + bh = sh->bh_cache[failed_num]; + set_bit(BH_Lock, &bh->b_state); + action[failed_num] = WRITE + 1; + locked++; + set_bit(STRIPE_INSYNC, &sh->state); + if (conf->disks[failed_num].operational) + evms_md_sync_acct(conf->disks[failed_num].dev, + bh->b_size >> 9); + else if ((spare = conf->spare)) + evms_md_sync_acct(spare->dev, bh->b_size >> 9); + + } + } + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { + evms_md_done_sync(conf->mddev, + (sh->size >> 9) - sh->sync_redone, 1); + clear_bit(STRIPE_SYNCING, &sh->state); + } + + spin_unlock(&sh->lock); + + while ((bh = return_ok)) { + return_ok = bh->b_reqnext; + bh->b_reqnext = NULL; + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL); + bh->b_end_io(bh, 1); + } + while ((bh = return_fail)) { + return_fail = bh->b_reqnext; + bh->b_reqnext = NULL; + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL); + bh->b_end_io(bh, 0); + } + for (i = disks; i--;) + if (action[i]) { + struct buffer_head *bh = sh->bh_cache[i]; + struct disk_info *spare = conf->spare; + struct evms_logical_node *node = NULL; + int skip = 0; + if (action[i] == READ + 1) + bh->b_end_io = raid5_end_read_request; + else + bh->b_end_io = raid5_end_write_request; + if (conf->disks[i].operational) { + bh->b_dev = conf->disks[i].dev; + node = conf->disks[i].node; + } else if (spare && action[i] == WRITE + 1) { + bh->b_dev = spare->dev; + node = spare->node; + } else + skip = 1; + if (!skip) { + atomic_inc(&sh->count); + //bh->b_rdev = bh->b_dev; + bh->b_rsector = + bh->b_blocknr * (bh->b_size >> 9); + sh->node[i] = node; + if (action[i] == READ + 
1) + R_IO(node, bh); + else + W_IO(node, bh); + } else { + clear_bit(BH_Lock, &bh->b_state); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static inline void +raid5_activate_delayed(raid5_conf_t * conf) +{ + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { + while (!list_empty(&conf->delayed_list)) { + struct list_head *l = conf->delayed_list.next; + struct stripe_head *sh; + sh = list_entry(l, struct stripe_head, lru); + list_del_init(l); + clear_bit(STRIPE_DELAYED, &sh->state); + if (!test_and_set_bit + (STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + list_add_tail(&sh->lru, &conf->handle_list); + } + } +} +static void +raid5_unplug_device(void *data) +{ + raid5_conf_t *conf = (raid5_conf_t *) data; + unsigned long flags; + + spin_lock_irqsave(&conf->device_lock, flags); + + raid5_activate_delayed(conf); + + conf->plugged = 0; + evms_cs_wakeup_thread(conf->thread); + + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static inline void +raid5_plug_device(raid5_conf_t * conf) +{ + spin_lock_irq(&conf->device_lock); + if (list_empty(&conf->delayed_list)) + if (!conf->plugged) { + conf->plugged = 1; + queue_task(&conf->plug_tq, &tq_disk); + } + spin_unlock_irq(&conf->device_lock); +} + +static inline void +raid5_rw(struct evms_logical_node * md_node, struct buffer_head *bh, int rw) +{ + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node); + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + unsigned long new_sector; + struct stripe_head *sh; + unsigned long sectors_per_chunk = conf->chunk_size >> 9; + unsigned long sect_in_chunk = bh->b_rsector & (sectors_per_chunk - 1); + + if (evms_md_check_boundary(md_node, bh)) + return; + if ((sect_in_chunk + (bh->b_size >> 9)) > sectors_per_chunk) { + bh->b_end_io(bh, 0); + return; + } + + new_sector = raid5_compute_sector(bh->b_rsector, + raid_disks, data_disks, &dd_idx, + &pd_idx, conf); + + sh = get_active_stripe(conf, new_sector, bh->b_size); + if (sh) { + sh->pd_idx = pd_idx; + + add_stripe_bh(sh, bh, dd_idx, rw); + + raid5_plug_device(conf); + + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL); + handle_stripe(sh); + release_stripe(sh); + } else { + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL); + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state)); + } +} + +static void +raid5_read(struct evms_logical_node * md_node, struct buffer_head *bh) +{ + raid5_rw(md_node, bh, READ); +} + +static void +raid5_write(struct evms_logical_node * md_node, struct buffer_head *bh) +{ + raid5_rw(md_node, bh, WRITE); +} + +static int +raid5_sync_request(mddev_t * mddev, unsigned long sector_nr) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + struct stripe_head *sh; + int sectors_per_chunk = conf->chunk_size >> 9; + unsigned long stripe = sector_nr / sectors_per_chunk; + int chunk_offset = sector_nr % sectors_per_chunk; + int dd_idx, pd_idx; + unsigned long first_sector; + int raid_disks = conf->raid_disks; + int data_disks = raid_disks - 1; + int redone = 0; + int bufsize; + + sh = get_active_stripe(conf, sector_nr, 0); + bufsize = sh->size; + redone = sector_nr - sh->sector; + first_sector = + raid5_compute_sector(stripe * data_disks * sectors_per_chunk + + chunk_offset, raid_disks, data_disks, &dd_idx, + &pd_idx, conf); + sh->pd_idx = pd_idx; + spin_lock(&sh->lock); + set_bit(STRIPE_SYNCING, &sh->state); + clear_bit(STRIPE_INSYNC, 
&sh->state); + sh->sync_redone = redone; + spin_unlock(&sh->lock); + + handle_stripe(sh); + release_stripe(sh); + + return (bufsize >> 9) - redone; +} + +/* + * This is our raid5 kernel thread. + * + * We scan the hash table for stripes which can be handled now. + * During the scan, completed stripes are saved for us by the interrupt + * handler, so that they will not have to wait for our next wakeup. + */ +static void +raid5d(void *data) +{ + struct stripe_head *sh; + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + int handled; + + handled = 0; + + if (mddev->sb_dirty) { + mddev->sb_dirty = 0; + evms_md_update_sb(mddev); + } + md_spin_lock_irq(&conf->device_lock); + while (1) { + struct list_head *first; + + if (list_empty(&conf->handle_list) && + atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD && + !conf->plugged && !list_empty(&conf->delayed_list)) + raid5_activate_delayed(conf); + + if (list_empty(&conf->handle_list)) + break; + + first = conf->handle_list.next; + sh = list_entry(first, struct stripe_head, lru); + + list_del_init(first); + atomic_inc(&sh->count); + if (atomic_read(&sh->count) != 1) + BUG(); + md_spin_unlock_irq(&conf->device_lock); + + handled++; + handle_stripe(sh); + release_stripe(sh); + + md_spin_lock_irq(&conf->device_lock); + } + + md_spin_unlock_irq(&conf->device_lock); + +} + +/* + * Private kernel thread for parity reconstruction after an unclean + * shutdown. Reconstruction on spare drives in case of a failed drive + * is done by the generic mdsyncd. + */ +static void +raid5syncd(void *data) +{ + raid5_conf_t *conf = data; + mddev_t *mddev = conf->mddev; + + if (!conf->resync_parity) + return; + if (conf->resync_parity == 2) + return; + down(&mddev->recovery_sem); + if (evms_md_do_sync(mddev, NULL)) { + up(&mddev->recovery_sem); + LOG_WARNING("resync aborted!\n"); + return; + } + conf->resync_parity = 0; + up(&mddev->recovery_sem); + LOG_DEFAULT("resync finished.\n"); +} + +static int +raid5_run(mddev_t * mddev) +{ + raid5_conf_t *conf; + int i, j, raid_disk, memory; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *desc; + mdk_rdev_t *rdev; + struct disk_info *disk; + struct md_list_head *tmp; + int start_recovery = 0; + + MOD_INC_USE_COUNT; + + if (sb->level != 5 && sb->level != 4) { + LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n", + __FUNCTION__, mdidx(mddev), sb->level); + MOD_DEC_USE_COUNT; + return -EIO; + } + + mddev->private = kmalloc(sizeof (raid5_conf_t), GFP_KERNEL); + if ((conf = mddev->private) == NULL) + goto abort; + memset(conf, 0, sizeof (*conf)); + conf->mddev = mddev; + + if ((conf->stripe_hashtbl = + (struct stripe_head **) md__get_free_pages(GFP_ATOMIC, + HASH_PAGES_ORDER)) == + NULL) + goto abort; + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE); + + conf->device_lock = MD_SPIN_LOCK_UNLOCKED; + md_init_waitqueue_head(&conf->wait_for_stripe); + INIT_LIST_HEAD(&conf->handle_list); + INIT_LIST_HEAD(&conf->delayed_list); + INIT_LIST_HEAD(&conf->inactive_list); + conf->active_stripes = (atomic_t)ATOMIC_INIT(0); + conf->preread_active_stripes = (atomic_t)ATOMIC_INIT(0); + conf->buffer_size = PAGE_SIZE; /* good default for rebuild */ + + conf->plugged = 0; + conf->plug_tq.sync = 0; + conf->plug_tq.routine = &raid5_unplug_device; + conf->plug_tq.data = conf; + + ITERATE_RDEV(mddev, rdev, tmp) { + /* + * This is important -- we are using the descriptor on + * the disk only to get a pointer to the descriptor on + * the main superblock, which might be more recent. 
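+ * (Here sb is the in-core superblock, mddev->sb, and rdev->desc_nr indexes
+ * its disks[] array; that copy is used rather than the per-disk descriptor.)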
+ */ + desc = sb->disks + rdev->desc_nr; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc)) { + LOG_ERROR("%s: disabled device %s (errors detected)\n", + __FUNCTION__, + evms_md_partition_name(rdev->node)); + if (!rdev->faulty) { + MD_BUG(); + goto abort; + } + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->node = rdev->node; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + continue; + } + if (disk_active(desc)) { + if (!disk_sync(desc)) { + LOG_ERROR + ("%s: disabled device %s (not in sync)\n", + __FUNCTION__, + evms_md_partition_name(rdev->node)); + MD_BUG(); + goto abort; + } + if (raid_disk > sb->raid_disks) { + LOG_ERROR + ("%s: disabled device %s (inconsistent descriptor)\n", + __FUNCTION__, + evms_md_partition_name(rdev->node)); + continue; + } + if (disk->operational) { + LOG_ERROR + ("%s: disabled device %s (device %d already operational)\n", + __FUNCTION__, + evms_md_partition_name(rdev->node), + raid_disk); + continue; + } + LOG_DEFAULT + ("%s: device %s operational as raid disk %d\n", + __FUNCTION__, evms_md_partition_name(rdev->node), + raid_disk); + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->node = rdev->node; + disk->operational = 1; + disk->used_slot = 1; + + conf->working_disks++; + } else { + /* + * Must be a spare disk .. + */ + LOG_DEFAULT(" spare disk %s\n", + evms_md_partition_name(rdev->node)); + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = rdev->dev; + disk->node = rdev->node; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 1; + disk->used_slot = 1; + } + } + + for (i = 0; i < MD_SB_DISKS; i++) { + desc = sb->disks + i; + raid_disk = desc->raid_disk; + disk = conf->disks + raid_disk; + + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) && + !conf->disks[raid_disk].used_slot) { + + disk->number = desc->number; + disk->raid_disk = raid_disk; + disk->dev = MKDEV(0, 0); + disk->node = NULL; + + disk->operational = 0; + disk->write_only = 0; + disk->spare = 0; + disk->used_slot = 1; + } + } + + conf->raid_disks = sb->raid_disks; + /* + * faied_disks: 0 for a fully functional array, 1 for a degraded array. + */ + conf->failed_disks = conf->raid_disks - conf->working_disks; + conf->mddev = mddev; + conf->chunk_size = sb->chunk_size; + conf->level = sb->level; + conf->algorithm = sb->layout; + conf->max_nr_stripes = NR_STRIPES; + + /* + * If chunk_size is validated in md_core.c, why do it again? 
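+ * (The local test below accepts chunk_size only when it is a power of two
+ * and at least PAGE_SIZE: ffz(~x) is the bit index of the lowest set bit
+ * of x, so (1 << ffz(~x)) == x holds exactly when x has a single bit set.
+ * For example a 64KB chunk passes, while 96KB fails because
+ * 1 << ffz(~(96*1024)) is 32KB.)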
+ * And the check in md_core is: + * chunk_size has to be a power of 2 and multiples of PAGE_SIZE + */ + + if (!conf->chunk_size || + ((1 << ffz(~conf->chunk_size)) != conf->chunk_size) || + (conf->chunk_size < PAGE_SIZE)) { + LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__, + conf->chunk_size, mdidx(mddev)); + goto abort; + } + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) { + LOG_ERROR(" unsupported parity algorithm %d for md%d\n", + conf->algorithm, mdidx(mddev)); + goto abort; + } + if (conf->failed_disks > 1) { + LOG_ERROR + (" not enough operational devices for md%d (%d/%d failed)\n", + mdidx(mddev), conf->failed_disks, conf->raid_disks); + goto abort; + } + + if (conf->working_disks != sb->raid_disks) { + LOG_WARNING + (" md%d, not all disks are operational -- trying to recover array\n", + mdidx(mddev)); + start_recovery = 1; + } + + { + const char *name = "evms_raid5d"; + + conf->thread = evms_cs_register_thread(raid5d, conf, name); + if (!conf->thread) { + LOG_ERROR("%s: couldn't allocate thread for md%d\n", + __FUNCTION__, mdidx(mddev)); + goto abort; + } + } + + memory = conf->max_nr_stripes * (sizeof (struct stripe_head) + + conf->raid_disks * + ((sizeof (struct buffer_head) + + PAGE_SIZE))) / 1024; + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) { + LOG_ERROR("%s: couldn't allocate %dkB for buffers\n", + __FUNCTION__, memory); + shrink_stripes(conf, conf->max_nr_stripes); + goto abort; + } else + LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__, + memory, mdidx(mddev)); + + /* + * Regenerate the "device is in sync with the raid set" bit for + * each device. + */ + for (i = 0; i < MD_SB_DISKS; i++) { + mark_disk_nonsync(sb->disks + i); + for (j = 0; j < sb->raid_disks; j++) { + if (!conf->disks[j].operational) + continue; + if (sb->disks[i].number == conf->disks[j].number) + mark_disk_sync(sb->disks + i); + } + } + sb->active_disks = conf->working_disks; + + if (sb->active_disks == sb->raid_disks) { + LOG_DETAILS + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, + sb->raid_disks, conf->algorithm); + } else { + LOG_WARNING + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n", + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks, + sb->raid_disks, conf->algorithm); + } + + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) { + const char *name = "evms_raid5syncd"; + + conf->resync_thread = + evms_cs_register_thread(raid5syncd, conf, name); + if (!conf->resync_thread) { + LOG_ERROR("%s: couldn't allocate thread for md%d\n", + __FUNCTION__, mdidx(mddev)); + goto abort; + } + + LOG_WARNING + ("%s: raid set md%d not clean; reconstructing parity\n", + __FUNCTION__, mdidx(mddev)); + conf->resync_parity = 1; + evms_cs_wakeup_thread(conf->resync_thread); + } + + print_raid5_conf(conf); + if (start_recovery) + evms_md_recover_arrays(); + print_raid5_conf(conf); + + /* Ok, everything is just fine now */ + return (0); + abort: + if (conf) { + print_raid5_conf(conf); + if (conf->stripe_hashtbl) + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); + } + mddev->private = NULL; + LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__, + mdidx(mddev)); + MOD_DEC_USE_COUNT; + return -EIO; +} + +static int +raid5_stop_resync(mddev_t * mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + struct evms_thread *thread; + + if (conf == NULL) { + return 0; + } + + thread = 
conf->resync_thread; + + if (thread) { + if (conf->resync_parity) { + conf->resync_parity = 2; + evms_cs_interrupt_thread(thread); + LOG_WARNING + ("%s: parity resync was not fully finished, restarting next time.\n", + __FUNCTION__); + return 1; + } + return 0; + } + return 0; +} + +static int +raid5_restart_resync(mddev_t * mddev) +{ + raid5_conf_t *conf = mddev_to_conf(mddev); + + if (conf->resync_parity) { + if (!conf->resync_thread) { + MD_BUG(); + return 0; + } + LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__); + conf->resync_parity = 1; + evms_cs_wakeup_thread(conf->resync_thread); + return 1; + } else + LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__); + return 0; +} + +static int +raid5_stop(mddev_t * mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + + if (conf != NULL) { + if (conf->resync_thread) + evms_cs_unregister_thread(conf->resync_thread); + evms_cs_unregister_thread(conf->thread); + shrink_stripes(conf, conf->max_nr_stripes); + free_pages((unsigned long) conf->stripe_hashtbl, + HASH_PAGES_ORDER); + kfree(conf); + mddev->private = NULL; + } + MOD_DEC_USE_COUNT; + return 0; +} + +#if RAID5_DEBUG +static void +print_sh(struct stripe_head *sh) +{ + int i; + + LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector, + sh->size, sh->pd_idx, sh->state); + LOG_DEFAULT("sh %lu, count %d.\n", sh->sector, + atomic_read(&sh->count)); + LOG_DEFAULT("sh %lu, ", sh->sector); + for (i = 0; i < MD_SB_DISKS; i++) { + if (sh->bh_cache[i]) + LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i], + sh->bh_cache[i]->b_state); + } + LOG_DEFAULT("\n"); +} + +static void +printall(raid5_conf_t * conf) +{ + struct stripe_head *sh; + int i; + + md_spin_lock_irq(&conf->device_lock); + for (i = 0; i < NR_HASH; i++) { + sh = conf->stripe_hashtbl[i]; + for (; sh; sh = sh->hash_next) { + if (sh->raid_conf != conf) + continue; + print_sh(sh); + } + } + md_spin_unlock_irq(&conf->device_lock); +} +#endif + +static int +raid5_status(char *page, mddev_t * mddev) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + mdp_super_t *sb = mddev->sb; + int sz = 0, i; + + sz += + sprintf(page + sz, " level %d, %dk chunk, algorithm %d", sb->level, + sb->chunk_size >> 10, sb->layout); + sz += + sprintf(page + sz, " [%d/%d] [", conf->raid_disks, + conf->working_disks); + for (i = 0; i < conf->raid_disks; i++) + sz += + sprintf(page + sz, "%s", + conf->disks[i].operational ? 
"U" : "_"); + sz += sprintf(page + sz, "]"); +#if RAID5_DEBUG +#define D(x) \ + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x)) + printall(conf); +#endif + return sz; +} + +static void +print_raid5_conf(raid5_conf_t * conf) +{ + int i; + struct disk_info *tmp; + + LOG_DEFAULT("RAID5 conf printout:\n"); + if (!conf) { + LOG_DEFAULT("(conf==NULL)\n"); + return; + } + LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks, + conf->working_disks, conf->failed_disks); + +#if RAID5_DEBUG + for (i = 0; i < MD_SB_DISKS; i++) { +#else + for (i = 0; i < conf->working_disks + conf->failed_disks; i++) { +#endif + tmp = conf->disks + i; + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n", + i, tmp->spare, tmp->operational, + tmp->number, tmp->raid_disk, tmp->used_slot, + evms_md_partition_name(tmp->node)); + } +} + +static int +raid5_diskop(mddev_t * mddev, mdp_disk_t ** d, int state) +{ + int err = 0; + int i, failed_disk = -1, spare_disk = -1, removed_disk = -1; + raid5_conf_t *conf = mddev->private; + struct disk_info *tmp, *sdisk, *fdisk, *rdisk; + mdp_super_t *sb = mddev->sb; + mdp_disk_t *failed_desc, *spare_desc; + mdk_rdev_t *spare_rdev, *failed_rdev; + + print_raid5_conf(conf); + md_spin_lock_irq(&conf->device_lock); + /* + * find the disk ... + */ + switch (state) { + + case DISKOP_SPARE_ACTIVE: + + /* + * Find the failed disk within the RAID5 configuration ... + * (this can only be in the first conf->raid_disks part) + */ + for (i = 0; i < conf->raid_disks; i++) { + tmp = conf->disks + i; + if ((!tmp->operational && !tmp->spare) || + !tmp->used_slot) { + failed_disk = i; + break; + } + } + /* + * When we activate a spare disk we _must_ have a disk in + * the lower (active) part of the array to replace. + */ + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + /* fall through */ + + case DISKOP_SPARE_WRITE: + case DISKOP_SPARE_INACTIVE: + + /* + * Find the spare disk ... (can only be in the 'high' + * area of the array) + */ + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->spare && tmp->number == (*d)->number) { + spare_disk = i; + break; + } + } + if (spare_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_SPARE: + + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (tmp->operational) { + err = -EBUSY; + goto abort; + } else if (!tmp->spare) { + MD_BUG(); + err = 1; + goto abort; + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_REMOVE_DISK: + for (i = 0; i < MD_SB_DISKS; i++) { + tmp = conf->disks + i; + if (tmp->used_slot && (tmp->number == (*d)->number)) { + if (i < conf->raid_disks) { + if (conf->working_disks != + conf->raid_disks) { + /* + * Can't remove a disk from an + * array that is running in + * degrade mode. + */ + err = -EBUSY; + goto abort; + } + if (sb->spare_disks == 0) { + /* + * Must have a spare ready + * before removing an active + * disk. 
+ */ + err = -EBUSY; + goto abort; + } + } + removed_disk = i; + break; + } + } + if (removed_disk == -1) { + MD_BUG(); + err = 1; + goto abort; + } + break; + + case DISKOP_HOT_ADD_DISK: + err = -ENOSYS; + goto abort; + break; + } + + switch (state) { + /* + * Switch the spare disk to write-only mode: + */ + case DISKOP_SPARE_WRITE: + if (conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + sdisk->operational = 1; + sdisk->write_only = 1; + conf->spare = sdisk; + break; + /* + * Deactivate a spare disk: + */ + case DISKOP_SPARE_INACTIVE: + sdisk = conf->disks + spare_disk; + sdisk->operational = 0; + sdisk->write_only = 0; + /* + * Was the spare being resynced? + */ + if (conf->spare == sdisk) + conf->spare = NULL; + break; + /* + * Activate (mark read-write) the (now sync) spare disk, + * which means we switch it's 'raid position' (->raid_disk) + * with the failed disk. (only the first 'conf->raid_disks' + * slots are used for 'real' disks and we must preserve this + * property) + */ + case DISKOP_SPARE_ACTIVE: + if (!conf->spare) { + MD_BUG(); + err = 1; + goto abort; + } + sdisk = conf->disks + spare_disk; + fdisk = conf->disks + failed_disk; + + spare_desc = &sb->disks[sdisk->number]; + failed_desc = &sb->disks[fdisk->number]; + + if (spare_desc != *d) { + MD_BUG(); + err = 1; + goto abort; + } + + if (spare_desc->raid_disk != sdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (sdisk->raid_disk != spare_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (failed_desc->raid_disk != fdisk->raid_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + if (fdisk->raid_disk != failed_disk) { + MD_BUG(); + err = 1; + goto abort; + } + + /* + * do the switch finally + */ + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number); + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number); + + /* There must be a spare_rdev, but there may not be a + * failed_rdev. That slot might be empty... + */ + spare_rdev->desc_nr = failed_desc->number; + if (failed_rdev) + failed_rdev->desc_nr = spare_desc->number; + + xchg_values(*spare_desc, *failed_desc); + xchg_values(*fdisk, *sdisk); + + /* + * (careful, 'failed' and 'spare' are switched from now on) + * + * we want to preserve linear numbering and we want to + * give the proper raid_disk number to the now activated + * disk. (this means we switch back these values) + */ + + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk); + xchg_values(sdisk->raid_disk, fdisk->raid_disk); + xchg_values(spare_desc->number, failed_desc->number); + xchg_values(sdisk->number, fdisk->number); + + *d = failed_desc; + + //if (sdisk->dev == MKDEV(0,0)) + if (sdisk->node == NULL) + sdisk->used_slot = 0; + + /* + * this really activates the spare. + */ + fdisk->spare = 0; + fdisk->write_only = 0; + + /* + * if we activate a spare, we definitely replace a + * non-operational disk slot in the 'low' area of + * the disk array. 
+ */ + conf->failed_disks--; + conf->working_disks++; + conf->spare = NULL; + + break; + + case DISKOP_HOT_REMOVE_SPARE: + rdisk = conf->disks + removed_disk; + + if (rdisk->spare && (removed_disk < conf->raid_disks)) { + MD_BUG(); + err = 1; + goto abort; + } + if (conf->spare != NULL) { + if (conf->spare->number == removed_disk) { + conf->spare = NULL; + } + } + + rdisk->dev = MKDEV(0, 0); + rdisk->node = NULL; + rdisk->used_slot = 0; + + break; + + case DISKOP_HOT_REMOVE_DISK: + rdisk = conf->disks + removed_disk; + if (rdisk->operational) { + /* We're removing a running disk in the array. */ + conf->working_disks--; + conf->failed_disks++; + } + rdisk->dev = MKDEV(0, 0); + rdisk->node = NULL; + rdisk->used_slot = 0; + rdisk->operational = 0; + break; + + default: + MD_BUG(); + err = 1; + goto abort; + } + abort: + md_spin_unlock_irq(&conf->device_lock); + print_raid5_conf(conf); + return err; +} + +static int +raid5_bmap(mddev_t * mddev, + u64 * rsector, + struct evms_logical_node ** node) +{ + raid5_conf_t *conf = (raid5_conf_t *) mddev->private; + const unsigned int raid_disks = conf->raid_disks; + const unsigned int data_disks = raid_disks - 1; + unsigned int dd_idx, pd_idx; + + *rsector = (u64) raid5_compute_sector( (unsigned long) *rsector, + raid_disks, data_disks, + &dd_idx, &pd_idx, conf); + *node = conf->disks[dd_idx].node; + return 0; /* always successful */ +} + +static int +raid5_evms_ioctl(mddev_t * mddev, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + struct evms_logical_node *node; + + switch (cmd) { + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg; + rc = raid5_bmap(mddev, &bmap->rsector, &node); + if (!rc) { + if (node) + rc = IOCTL(node, inode, file, cmd, arg); + else + rc = -ENODEV; + } + break; + } + + default: + rc = -EINVAL; + } + return rc; +} + +#define MAX_IO_SIZE 128 +static int +raid5_pers_ioctl(mddev_t * mddev, int cmd, void *args) +{ + + int rc = 0; + struct r5_sync_io init_io_args; + void *data; + int io_size = MAX_IO_SIZE; + + LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd); + switch (cmd) { + case EVMS_MD_RAID5_INIT_IO: + + if (copy_from_user + (&init_io_args, (struct r5_sync_io *) args, + sizeof (init_io_args))) { + return -EFAULT; + } + /* allocate an I/O buffer of up to 64 KB (MAX_IO_SIZE sectors) */ + if (init_io_args.nr_sects < MAX_IO_SIZE) + io_size = init_io_args.nr_sects; + + /* allocate a buffer large enough to hold io_size sectors */ + data = kmalloc(io_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL); + if (!data) { + rc = -ENOMEM; + } else { + u64 io_sector_offset, io_remaining; + u64 io_bytes; + u_char *user_buffer_ptr; + + io_remaining = init_io_args.nr_sects; + io_sector_offset = 0; + user_buffer_ptr = init_io_args.data; + while (io_remaining) { + /* compute the io_size for this pass */ + io_size = (io_remaining >= MAX_IO_SIZE) ? 
+ MAX_IO_SIZE : io_remaining; + + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT; + if (init_io_args.rw == WRITE) { + if (copy_from_user(data, + user_buffer_ptr, + io_bytes)) + rc = -EFAULT; + } + if (rc) + break; + + rc = evms_md_sync_io(mddev->node, + init_io_args.rw, + init_io_args.lsn + + io_sector_offset, io_size, + data); + + if (rc) + break; + + if (init_io_args.rw != WRITE) { + if (copy_to_user(user_buffer_ptr, + data, io_bytes)) + rc = -EFAULT; + } + if (rc) + break; + + user_buffer_ptr += io_bytes; + io_sector_offset += io_size; + io_remaining -= io_size; + } + } + break; + + default: + rc = -ENOSYS; + } + + return rc; +} + +static mdk_personality_t raid5_personality = { + .name = "evms_raid5", + .read = raid5_read, + .write = raid5_write, + .run = raid5_run, + .stop = raid5_stop, + .status = raid5_status, + .error_handler = raid5_error, + .diskop = raid5_diskop, + .stop_resync = raid5_stop_resync, + .restart_resync = raid5_restart_resync, + .sync_request = raid5_sync_request, + .evms_ioctl = raid5_evms_ioctl, + .md_pers_ioctl = raid5_pers_ioctl +}; + +static int md__init +raid5_init(void) +{ + return evms_register_md_personality(RAID5, &raid5_personality); +} + +static void +raid5_exit(void) +{ + evms_unregister_md_personality(RAID5); +} + +module_init(raid5_init); +module_exit(raid5_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/md_xor.c evms-2002-09-30/drivers/evms/md_xor.c --- linux-2002-09-30/drivers/evms/md_xor.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/md_xor.c Fri Mar 1 11:50:58 2002 @@ -0,0 +1,149 @@ +/* + * md_xor.c : Multiple Devices driver for Linux + * + * Copyright (C) 1996, 1997, 1998, 1999, 2000, + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson. + * + * Dispatch optimized RAID-5 checksumming functions. + * + * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified + * by Cuong (Mike) Tran , January 2002. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define BH_TRACE 0 +#include +#include +#include +#include + +#define LOG_PREFIX "md raid5: " +/* The xor routines to use. */ +static struct xor_block_template *active_template; + +void +evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr) +{ + unsigned long *p0, *p1, *p2, *p3, *p4; + unsigned long bytes = bh_ptr[0]->b_size; + + p0 = (unsigned long *) bh_ptr[0]->b_data; + p1 = (unsigned long *) bh_ptr[1]->b_data; + if (count == 2) { + active_template->do_2(bytes, p0, p1); + return; + } + + p2 = (unsigned long *) bh_ptr[2]->b_data; + if (count == 3) { + active_template->do_3(bytes, p0, p1, p2); + return; + } + + p3 = (unsigned long *) bh_ptr[3]->b_data; + if (count == 4) { + active_template->do_4(bytes, p0, p1, p2, p3); + return; + } + + p4 = (unsigned long *) bh_ptr[4]->b_data; + active_template->do_5(bytes, p0, p1, p2, p3, p4); +} + +/* Set of all registered templates. 
*/ +static struct xor_block_template *template_list; + +#define BENCH_SIZE (PAGE_SIZE) + +static void +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) +{ + int speed; + unsigned long now; + int i, count, max; + + tmpl->next = template_list; + template_list = tmpl; + + /* + * Count the number of XORs done during a whole jiffy, and use + * this to calculate the speed of checksumming. We use a 2-page + * allocation to have guaranteed color L1-cache layout. + */ + max = 0; + for (i = 0; i < 5; i++) { + now = jiffies; + count = 0; + while (jiffies == now) { + mb(); + tmpl->do_2(BENCH_SIZE, b1, b2); + mb(); + count++; + mb(); + } + if (count > max) + max = count; + } + + speed = max * (HZ * BENCH_SIZE / 1024); + tmpl->speed = speed; + + LOG_DEFAULT(" %-10s: %5d.%03d MB/sec\n", tmpl->name, + speed / 1000, speed % 1000); +} + +static int +calibrate_xor_block(void) +{ + void *b1, *b2; + struct xor_block_template *f, *fastest; + + b1 = (void *) md__get_free_pages(GFP_KERNEL, 2); + if (! b1) { + LOG_ERROR("Yikes! No memory available.\n"); + return -ENOMEM; + } + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE; + + LOG_DEFAULT("measuring checksumming speed\n"); + sti(); + +#define xor_speed(templ) do_xor_speed((templ), b1, b2) + + XOR_TRY_TEMPLATES; + +#undef xor_speed + + free_pages((unsigned long)b1, 2); + + fastest = template_list; + for (f = fastest; f; f = f->next) + if (f->speed > fastest->speed) + fastest = f; + +#ifdef XOR_SELECT_TEMPLATE + fastest = XOR_SELECT_TEMPLATE(fastest); +#endif + + active_template = fastest; + LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n", + fastest->name, fastest->speed / 1000, fastest->speed % 1000); + + return 0; +} + +MD_EXPORT_SYMBOL(evms_md_xor_block); + +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + +module_init(calibrate_xor_block); diff -Naur linux-2002-09-30/drivers/evms/os2lvm_vge.c evms-2002-09-30/drivers/evms/os2lvm_vge.c --- linux-2002-09-30/drivers/evms/os2lvm_vge.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/os2lvm_vge.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,2394 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* + * linux/drivers/evms/os2lvm_vge.c + * + * EVMS OS/2 LVM Emulator + * + * This Volume Group Emulator will take the type 0x35 partitions created by + * OS/2 versions 4.5 and later and build them into volumes. It emulates + * the Drive Linking and Bad Block Relocation features and therefore + * provides binary compatibility with the OS/2 version. Of course, if + * you select to mkfs a file system OS/2 doesn't support, you're on your + * own... + * + * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks, + * this VGE has a dependency on dospart.c to report a list of the + * candidate partitions. 
This module will then take the appropriate partitions + * from the list and use them to build the OS/2-style volumes. + * + * Change Activity: + * + * 7/01/2001 John Stiles getting started. + * 9/14/2001 John Stiles original version. + * 11/01/2001 John Stiles new naming scheme. + * 11/21/2001 John Stiles i/o path changes. + */ + +#define EVMS_DEBUG 1 +#define EVMS_OS2_DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define LOG_PREFIX "os2lvm: " + +// Global Structure and Type definitions +struct transfer_record { + int Write_Flag; /* 0 = read, 1 = write */ + struct os2_dl_entry *Partition_Data; + struct buffer_head *bh; + struct transfer_record *next; +}; + +struct tracking_record { /* structure used to track IO requests that must be broken into two pieces due to drive linking */ + unsigned int io_in_progress; + int up_to_date; + struct buffer_head *org_bh; /* Original IO */ + struct buffer_head *link1_bh; /* First child. */ + struct os2_dl_entry *link1_data; + struct transfer_record *link1_transfer_rec; + int link1_bbr_attempted; + struct buffer_head *link2_bh; /* Second child */ + struct os2_dl_entry *link2_data; + struct transfer_record *link2_transfer_rec; + int link2_bbr_attempted; +}; + +// Prototypes for local VGE functions +static int discover_os2lvm_partitions(struct evms_logical_node **); +static struct evms_logical_node *find_os2_volume(u32); +static int add_os2link(struct os2_dl_entry *, + struct evms_logical_node *); +static struct os2_dl_entry + *find_link_data(struct os2_dl_entry **, u32); +static int find_drive_link(struct evms_logical_node *, + struct os2_dl_entry **, u64 *, u64 *); +static int validate_signaturesector(struct evms_logical_node *, + LVM_Signature_Sector *, u32); +static int validate_drivelinksector(void *, int, u32); +static int validate_bbrtablesector(void *, int, u32); +static u32 check_for_os2_bbr_relocations(char *); +static int check_os2_volumes(struct evms_logical_node **); +static int OS2_ioctl_cmd_broadcast(struct evms_logical_node *node, + struct inode *inode, struct file *file, + unsigned long cmd, unsigned long arg); +static int os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node, + struct inode *inode, struct file *file, + unsigned long cmd, unsigned long arg); +static void BBR_Worker(void *); +static void OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record, + struct buffer_head *bh, + int uptodate, int *redrive); +static void BBR_Transfer_IO(struct transfer_record * Transfer_Record); +static void OS2_DL_Callback(struct buffer_head *bh, int uptodate); +static int Sector_Is_Remapped(struct os2_dl_entry * io_dlentry, + u64 Source_Sector, u64 * Replacement_Sector); +static void Invalidate_Mapping(struct os2_dl_entry * io_dlentry, + u64 Source_Sector, + int Replacement_Sector_Is_Bad); +static int Create_New_BBR_Table_Entry(struct os2_dl_entry * + io_dlentry, u64 starting_lsn, + unsigned int count, void *buffer); +static void Clone_Bufferhead(struct buffer_head *Source, + struct buffer_head *Child); + +// Prototypes for local memory allocation/deallocation functions +static struct os2_dl_entry *new_os2_drive_link(LVM_Signature_Sector *, + struct + evms_logical_node *); +static char *new_os2_link_data(u32, u32, u32, struct evms_logical_node *); +static char *new_os2_bbr_data(u32, u32, u32, struct evms_logical_node *); +static struct evms_logical_node *new_os2volume(u32, char *); +static int delete_os2lvm_volume(struct evms_logical_node *); +static int 
delete_os2_drive_link(struct os2_dl_entry *, int); + +// Prototypes for Function Table interface +static int discover_os2lvm(struct evms_logical_node **); +static int delete_os2lvm(struct evms_logical_node *); +static void read_os2lvm(struct evms_logical_node *, struct buffer_head *); +static void write_os2lvm(struct evms_logical_node *, struct buffer_head *); +static int init_io_os2lvm(struct evms_logical_node *, int, u64, u64, void *); +static int ioctl_os2lvm(struct evms_logical_node *, struct inode *, + struct file *, unsigned int, unsigned long); +static int do_os2_bbr_io(struct os2_dl_entry *, int, u64, u64, + void *); + +// Global data structures +static struct evms_logical_node *os2lvm_nodes = NULL; +static struct evms_thread *BBR_Worker_Thread = NULL; +static spinlock_t BBR_Queue_Lock = SPIN_LOCK_UNLOCKED; +static const char *BBR_Worker_Name = "evms_os2_bbr_io"; +static struct transfer_record *BBR_IO_List_Head = NULL; +static struct transfer_record *BBR_IO_List_Tail = NULL; +static struct evms_pool_mgmt *BBR_Transfer_Pool = NULL; +static char *BBR_Transfer_Pool_Name = "OS-2 Transfer Pool"; +static char *DL_Tracking_Pool_Name = "OS-2 Tracking Pool"; +static struct evms_pool_mgmt *DL_Tracking_Pool = NULL; + +// Required plug-in Function Table definition +static struct evms_plugin_fops function_table = { + .discover = discover_os2lvm, + .delete = delete_os2lvm, + .read = read_os2lvm, + .write = write_os2lvm, + .init_io = init_io_os2lvm, + .ioctl = ioctl_os2lvm +}; + +// Required plug-in Header definition +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_REGION_MANAGER, + 2), + .version = { + .major = 1, + .minor = 1, + .patchlevel = 1 + }, + .required_services_version = { + .major = EVMS_COMMON_SERVICES_MAJOR, + .minor = EVMS_COMMON_SERVICES_MINOR, + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL + }, + .fops = &function_table +}; + +// Required Plugin Functions + +/* + * Function: discover_os2lvm + * + * This is the entry point into the discovery process. + */ +static int +discover_os2lvm(struct evms_logical_node **evms_partition_list) +{ + int rc; + + MOD_INC_USE_COUNT; + + if (!BBR_Transfer_Pool) { + BBR_Transfer_Pool = + evms_cs_create_pool(sizeof (struct transfer_record), + BBR_Transfer_Pool_Name, NULL, NULL); + if (!BBR_Transfer_Pool) { + MOD_DEC_USE_COUNT; + return -ENOMEM; + } + } + + if (!DL_Tracking_Pool) { + DL_Tracking_Pool = + evms_cs_create_pool(sizeof (struct tracking_record), + DL_Tracking_Pool_Name, NULL, NULL); + if (!DL_Tracking_Pool) { + MOD_DEC_USE_COUNT; + return -ENOMEM; + } + } + + rc = discover_os2lvm_partitions(evms_partition_list); + + if (!rc) { + rc = check_os2_volumes(evms_partition_list); + } + + MOD_DEC_USE_COUNT; + return rc; +} + +/* + * Function: delete_os2lvm + * + * This is the entry point for deleting a node. 
+ */ +static int +delete_os2lvm(struct evms_logical_node *logical_node) +{ + LOG_EXTRA("Deleting volume: %s\n", logical_node->name); + + return delete_os2lvm_volume(logical_node); +} + +/* + * Function: read_os2lvm + */ +static void +read_os2lvm(struct evms_logical_node *node, struct buffer_head *bh) +{ + int rc; + u64 sector_count; + u64 rsector; + struct buffer_head *Link1 = NULL; + struct buffer_head *Link2 = NULL; + struct tracking_record *Tracking_Record = NULL; + struct os2_dl_entry *cur_dlentry = NULL; + struct transfer_record *Transfer_Record; + + rsector = bh->b_rsector; + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count); + bh->b_rsector = rsector; + switch (rc) { + case 1: + if (cur_dlentry->bbr_is_active) { + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + /* Transfer the IO to the BBR Worker Thread. */ + Transfer_Record->Write_Flag = 0; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = bh; + Transfer_Record->next = NULL; + BBR_Transfer_IO(Transfer_Record); + } else + R_IO(cur_dlentry->link_partition, bh); + break; + case 2: + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */ + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */ + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1); + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1); + + /* Initialize the tracking record so we can associate the two new I/Os with the original. */ + Tracking_Record->io_in_progress = 2; + Tracking_Record->up_to_date = 0; + Tracking_Record->org_bh = bh; + + /* Create the I/O to the first link. */ + Clone_Bufferhead(bh, Link1); + Link1->b_private = Tracking_Record; + Link1->b_end_io = OS2_DL_Callback; + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT; + Tracking_Record->link1_bh = Link1; + Tracking_Record->link1_data = cur_dlentry; + Tracking_Record->link1_bbr_attempted = 0; + Tracking_Record->link1_transfer_rec = NULL; + + /* Create the I/O to the second link */ + Clone_Bufferhead(bh, Link2); + Link2->b_private = Tracking_Record; + Link2->b_end_io = OS2_DL_Callback; + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT; + Link2->b_rsector = 0; + Link2->b_size = + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT); + Tracking_Record->link2_bh = Link2; + Tracking_Record->link2_data = cur_dlentry->next; + Tracking_Record->link2_bbr_attempted = 0; + Tracking_Record->link2_transfer_rec = NULL; + + /* Process the I/O to the first link. */ + if (cur_dlentry->bbr_is_active) { + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + /* Transfer the IO to the BBR Worker Thread. */ + Transfer_Record->Write_Flag = 0; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = Tracking_Record->link1_bh; + Transfer_Record->next = NULL; + BBR_Transfer_IO(Transfer_Record); + } else + R_IO(cur_dlentry->link_partition, + Tracking_Record->link1_bh); + + /* Process the I/O to the second link. */ + cur_dlentry = cur_dlentry->next; + if (cur_dlentry->bbr_is_active) { + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + /* Transfer the IO to the BBR Worker Thread. 
*/ + Transfer_Record->Write_Flag = 0; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = Tracking_Record->link2_bh; + Transfer_Record->next = NULL; + BBR_Transfer_IO(Transfer_Record); + } else + R_IO(cur_dlentry->link_partition, + Tracking_Record->link2_bh); + + break; + default: + LOG_SERIOUS("READ error, request exceeds volume size.\n"); + bh->b_end_io(bh, 0); + break; + } +} + +/* + * Function: write_os2lvm + */ +static void +write_os2lvm(struct evms_logical_node *node, struct buffer_head *bh) +{ + int rc; + u64 rsector; + u64 sector_count; + struct buffer_head *Link1 = NULL; + struct buffer_head *Link2 = NULL; + struct tracking_record *Tracking_Record = NULL; + struct os2_dl_entry *cur_dlentry = NULL; + struct transfer_record *Transfer_Record; + + rsector = bh->b_rsector; + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT; + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count); + bh->b_rsector = rsector; + switch (rc) { + case 1: + /* Set up a Transfer Record. If there are Bad Blocks on the partition that this I/O is + directed to, then we will need the Transfer Record to put the I/O in the queue for the + BBR Worker Thread. If there are no bad blocks, then we will need the Transfer Record + for the OS2_BBR_Write_Callback function. This function expects the Transfer Record to + be pre-allocated and available because it is running on an interrupt thread and should + not do memory allocation. If there is an error during the write, then the + OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O + to the BBR worker thread for further processing. If there are no errors during the I/O, + then the OS2_BBR_Write_Callback will deallocate the Transfer Record. */ + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + Transfer_Record->Write_Flag = 1; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = bh; + Transfer_Record->next = NULL; + if (cur_dlentry->bbr_is_active) { + /* Transfer the IO to the BBR Worker Thread. */ + BBR_Transfer_IO(Transfer_Record); + } else { + evms_cs_register_for_end_io_notification + (Transfer_Record, bh, OS2_BBR_Write_Callback); + W_IO(cur_dlentry->link_partition, bh); + } + break; + case 2: + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */ + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */ + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1); + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1); + + /* Initialize the tracking record so we can associate the two new I/Os with the original. */ + Tracking_Record->io_in_progress = 2; + Tracking_Record->up_to_date = 0; + Tracking_Record->org_bh = bh; + + /* Create the I/O to the first link. 
*/ + Clone_Bufferhead(bh, Link1); + Link1->b_private = Tracking_Record; + Link1->b_end_io = OS2_DL_Callback; + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT; + Tracking_Record->link1_bh = Link1; + Tracking_Record->link1_data = cur_dlentry; + + /* Create the I/O to the second link */ + Clone_Bufferhead(bh, Link2); + Link2->b_private = Tracking_Record; + Link2->b_end_io = OS2_DL_Callback; + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT; + Link2->b_rsector = 0; + Link2->b_size = + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT); + Tracking_Record->link2_bh = Link2; + Tracking_Record->link2_data = cur_dlentry->next; + + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + Transfer_Record->Write_Flag = 1; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = Tracking_Record->link1_bh; + Transfer_Record->next = NULL; + Tracking_Record->link1_transfer_rec = Transfer_Record; + /* Process the I/O to the first link. */ + if (cur_dlentry->bbr_is_active) { + /* Transfer the IO to the BBR Worker Thread. */ + Tracking_Record->link1_bbr_attempted = 1; + BBR_Transfer_IO(Transfer_Record); + } else { + Tracking_Record->link1_bbr_attempted = 0; + W_IO(cur_dlentry->link_partition, + Tracking_Record->link1_bh); + } + + /* Process the I/O to the second link. */ + cur_dlentry = cur_dlentry->next; + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */ + Transfer_Record->Write_Flag = 1; + Transfer_Record->Partition_Data = cur_dlentry; + Transfer_Record->bh = Tracking_Record->link2_bh; + Transfer_Record->next = NULL; + Tracking_Record->link2_transfer_rec = Transfer_Record; + if (cur_dlentry->bbr_is_active) { + /* Transfer the IO to the BBR Worker Thread. */ + Tracking_Record->link2_bbr_attempted = 1; + BBR_Transfer_IO(Transfer_Record); + } else { + Tracking_Record->link2_bbr_attempted = 0; + W_IO(cur_dlentry->link_partition, + Tracking_Record->link2_bh); + } + + break; + default: + LOG_SERIOUS("WRITE error, request exceeds volume size.\n"); + bh->b_end_io(bh, 0); + break; + } +} + +static int +os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node, + struct inode *inode, + struct file *file, + unsigned long cmd, unsigned long arg) +{ + int rc = 0; + os2_volume_runtime_entry_t *Node_Data; + struct os2_dl_entry *curlink, *nextlink; + struct evms_plugin_ioctl_pkt tmp, *user_parms; + + user_parms = (struct evms_plugin_ioctl_pkt *) arg; + /* copy user's parameters to kernel space */ + if (copy_from_user(&tmp, user_parms, sizeof (tmp))) + rc = -EFAULT; + + if (!rc) { + Node_Data = (os2_volume_runtime_entry_t *) node->private; + /* is this cmd targetted at this feature ? 
*/ + if (tmp.feature_id == node->plugin->id) { + switch (tmp.feature_command) { + default: + break; + } + } else { /* broadcast this cmd to all children */ + curlink = Node_Data->drive_link; + + /* broadcast this cmd to all children */ + while (curlink) { + nextlink = curlink->next; + + rc = IOCTL(curlink->link_partition, inode, file, + cmd, arg); + + if (rc) { + break; + } + curlink = nextlink; + } + + } + /* copy info to userspace */ + if (copy_to_user(user_parms, &tmp, sizeof (tmp))) + rc = -EFAULT; + } + return (rc); +} + +static int +OS2_ioctl_cmd_broadcast(struct evms_logical_node *node, + struct inode *inode, + struct file *file, unsigned long cmd, unsigned long arg) +{ + int rc = 0; + os2_volume_runtime_entry_t *Node_Data; + struct os2_dl_entry *curlink, *nextlink; + + Node_Data = (os2_volume_runtime_entry_t *) node->private; + curlink = Node_Data->drive_link; + + /* broadcast this cmd to all children */ + while (curlink) { + nextlink = curlink->next; + + rc |= IOCTL(curlink->link_partition, inode, file, cmd, arg); + + curlink = nextlink; + } + + return (rc); +} + +/* + * Function: ioctl_os2lvm + */ +static int +ioctl_os2lvm(struct evms_logical_node *logical_node, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + u64 Sectors_Per_Cylinder; + u64 Total_Sectors; + struct evms_logical_node *partition_node; + + partition_node = + ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link-> + link_partition; + + if (!inode) + return -EINVAL; + + LOG_EVERYTHING("Ioctl %d\n", cmd); + + switch (cmd) { + case HDIO_GETGEO: + { + // Return fake geometry + struct hd_geometry *hd = (struct hd_geometry *) arg; + short cylinders; + unsigned char heads = 255; + unsigned char sectors = + OS2LVM_SYNTHETIC_SECTORS_PER_TRACK; + long start = 0; + + /* OS/2 always created a fake geometry using the maximum cylinder size. */ + Sectors_Per_Cylinder = heads * sectors; + for (cylinders = 0, Total_Sectors = 0; + Total_Sectors < + ((os2_volume_runtime_entry_t *) logical_node-> + private)->size_in_sectors; cylinders++) + Total_Sectors += Sectors_Per_Cylinder; + + cylinders--; + + if (copy_to_user + ((short *) (&hd->cylinders), &cylinders, + sizeof (cylinders)) + || copy_to_user((char *) (&hd->heads), &heads, + sizeof (heads)) + || copy_to_user((char *) (&hd->sectors), §ors, + sizeof (sectors)) + || copy_to_user((long *) (&hd->start), &start, + sizeof (start))) { + return -EFAULT; + } + } + break; + + case EVMS_GET_BMAP: + // No kernel images allowed on OS/2 volumes right now. 
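+		/*
+		 * Editor's note (illustrative addition, not part of the original
+		 * patch): the raid5 plugin above services EVMS_GET_BMAP by
+		 * remapping the volume-relative sector and then forwarding the
+		 * ioctl to the resolved child node.  If OS/2 volumes ever needed
+		 * to support boot-loader block maps, a hypothetical handler could
+		 * follow the same pattern here, reusing find_drive_link() for the
+		 * remap.  The never-compiled sketch below assumes the simple case
+		 * where the sector falls entirely within one drive link
+		 * (find_drive_link() returns 1).
+		 */
+#if 0
+		{
+			struct evms_get_bmap_pkt *bmap =
+			    (struct evms_get_bmap_pkt *) arg;
+			struct os2_dl_entry *link = NULL;
+			u64 nsects = 1;
+
+			/* Convert the volume-relative sector into a sector that
+			 * is relative to the owning drive link, then let the
+			 * child node finish the translation. */
+			if (find_drive_link(logical_node, &link,
+					    &bmap->rsector, &nsects) == 1)
+				rc = IOCTL(link->link_partition, inode, file,
+					   cmd, arg);
+			else
+				rc = -EINVAL;
+			break;
+		}
+#endif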
+ rc = -EINVAL; + break; + + case EVMS_QUIESCE_VOLUME: + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_OPEN_VOLUME: + case EVMS_CLOSE_VOLUME: + case EVMS_CHECK_DEVICE_STATUS: + rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd, + arg); + break; + case EVMS_PLUGIN_IOCTL: + rc = os2_ioctl_cmd_plugin_ioctl(logical_node, inode, file, cmd, + arg); + break; + default: + rc = -EINVAL; + break; + } + + return rc; +} + +/* + * Function: init_io_os2lvm + */ +static int +init_io_os2lvm(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + int rc = 0; + u64 sector_count; + struct evms_logical_node *partition_node; + struct os2_dl_entry *cur_dlentry = NULL; + + sector_count = num_sects; + rc = find_drive_link(node, &cur_dlentry, §_nr, §or_count); + switch (rc) { + case 1: + partition_node = cur_dlentry->link_partition; + if (cur_dlentry->bbr_is_active) + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr, + num_sects, buf_addr); + else { + rc = INIT_IO(partition_node, io_flag, sect_nr, + num_sects, buf_addr); + if (rc && io_flag) { + cur_dlentry->bbr_is_active = 1; + rc = do_os2_bbr_io(cur_dlentry, io_flag, + sect_nr, num_sects, + buf_addr); + } + } + break; + case 2: + partition_node = cur_dlentry->link_partition; + if (cur_dlentry->bbr_is_active) + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr, + sector_count, buf_addr); + else { + rc = INIT_IO(partition_node, io_flag, sect_nr, + sector_count, buf_addr); + if (rc && io_flag) { + cur_dlentry->bbr_is_active = 1; + rc = do_os2_bbr_io(cur_dlentry, io_flag, + sect_nr, sector_count, + buf_addr); + } + } + + if (!rc) { + cur_dlentry = cur_dlentry->next; + partition_node = cur_dlentry->link_partition; + num_sects -= sector_count; + buf_addr += sector_count << OS2_SECTOR_SHIFT; + rc = 1; + if (cur_dlentry->bbr_is_active) + rc = do_os2_bbr_io(cur_dlentry, io_flag, 0, + num_sects, buf_addr); + else { + rc = INIT_IO(partition_node, io_flag, 0, + num_sects, buf_addr); + if (rc && io_flag) { + cur_dlentry->bbr_is_active = 1; + rc = do_os2_bbr_io(cur_dlentry, io_flag, + 0, num_sects, + buf_addr); + } + + } + } + break; + default: + LOG_SERIOUS("INITIO error, request exceeds volume size.\n"); + break; + } + + return rc; +} + +/* + * Function: do_os2_bbr_io + * + * Check the Bad Block Relocation list for relocated sectors. If any are found, + * this function will do the i/o directly. + * Return values: 0 == i/o done, 1 == unable to complete i/o + */ +static int +do_os2_bbr_io(struct os2_dl_entry * io_dlentry, int rw, /* 0=read, 1=write */ + u64 starting_lsn, /* disk LBA */ + u64 count, /* # of sectors */ + void *buffer) +{ /* buffer address */ + u64 lsn, remapped_lsn; + int rc; + + // For each sector in this request, check if this sector has already + // been remapped. If so, process all previous sectors in this request, + // followed by the remapped sector. Then reset the starting lsn and + // count and keep going with the rest of the request as if it were + // a whole new request. + for (lsn = 0; lsn < count; lsn++) { + remapped_lsn = starting_lsn + lsn; + rc = Sector_Is_Remapped(io_dlentry, remapped_lsn, + &remapped_lsn); + if (rc) { + // Process all sectors in the request up to this one. + if (lsn > 0) { + rc = INIT_IO(io_dlentry->link_partition, rw, + starting_lsn, lsn, buffer); + if (rc) { + /* If this is a read, then we are done. 
*/ + if (!rw) { + return 1; + } + + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */ + if (!Create_New_BBR_Table_Entry + (io_dlentry, starting_lsn, lsn, + buffer)) { + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */ + return 1; + } + } + buffer += (lsn * OS2_BYTES_PER_SECTOR); + } + // Process the remapped sector. + rc = INIT_IO(io_dlentry->link_partition, rw, + remapped_lsn, 1, buffer); + if (rc) { + /* If this is a read, then we are done. */ + if (!rw) { + return 1; + } + + /* Get the original sector that was remapped. */ + remapped_lsn = starting_lsn + lsn; + + /* Invalidate the current remapping. */ + Invalidate_Mapping(io_dlentry, remapped_lsn, 1); + + /* Try to remap the bad sector to another replacement sector. */ + if (!Create_New_BBR_Table_Entry + (io_dlentry, remapped_lsn, 1, buffer)) { + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */ + return 1; + } + + } + + buffer += OS2_BYTES_PER_SECTOR; + + starting_lsn += (lsn + 1); + count -= (lsn + 1); + lsn = -1; + } + + } + + /* Are there any sectors left to process? */ + if (count > 0) { + rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn, + count, buffer); + if (rc) { + /* If this is a read, then we are done. */ + if (!rw) { + return 1; + } + + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */ + if (!Create_New_BBR_Table_Entry + (io_dlentry, starting_lsn, count, buffer)) { + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */ + return 1; + } + + } + + } + + return 0; +} + +/* + * Function: os2lvm_vge_init + */ +int __init +os2lvm_vge_init(void) +{ + /* Should I be allocating the pools and BBR Worker Thread here? */ + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +void __exit +os2lvm_vge_exit(void) +{ + /* BUGBUG - Is there where I need to kill the BBR Worker Thread and free any memory I am still holding? */ + + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(os2lvm_vge_init); +module_exit(os2lvm_vge_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + +// Local VGE Functions + +/* + * Function: discover_os2lvm_partitions + * + * Examine the list of logical partitions. Any type 0x35 partition that contains + * a valid OS/2 signature sector is consumed and added to the appropriate logical + * volume. + */ +static int +discover_os2lvm_partitions(struct evms_logical_node **evms_partition_list) +{ + struct evms_logical_node *evms_partition; + struct evms_logical_node *next_partition; + struct evms_logical_node *new_volume; + u64 sectornum = 0; + u32 volumeserial; + char *sigsect; + char *volumename; + char driveletter[8]; + LVM_Signature_Sector *sigsector; + struct os2_dl_entry *new_dlentry; + + LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n"); + sigsect = kmalloc(OS2_BYTES_PER_SECTOR, GFP_KERNEL); + if (!sigsect) { + LOG_SERIOUS("Could not allocate Signature sector data\n"); + return -ENOMEM; + } + + for (evms_partition = *evms_partition_list; evms_partition; + evms_partition = next_partition) { + // Save the next node. We may remove this one from the list. + next_partition = evms_partition->next; + + // The node must not have the OS/2 vge id. 
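+		// Editor's note: in other words, skip any node that this plugin
+		// itself produced on an earlier discovery pass, so already-built
+		// OS/2 volumes are not re-consumed as candidate partitions.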
+ if (evms_partition->plugin->id == plugin_header.id) { + continue; + } + + LOG_EXTRA("Examining partition serial %s\n", + evms_partition->name); + + // Have to go to the last accessible sector of the partition and + // read it in. It should be the LVM Signature Sector. + sectornum = evms_partition->total_vsectors - 1; + if (INIT_IO(evms_partition, 0, sectornum, 1, sigsect)) { + // On an I/O error, continue on to the next partition. + // This means that the volume it belongs to will be incomplete + // and later deleted in the completeness check. + LOG_SERIOUS("I/O error on Signature sector read\n"); + continue; + } + sigsector = (LVM_Signature_Sector *) sigsect; + + // Validate the Signature Sector + if (validate_signaturesector + (evms_partition, sigsector, OS2_BYTES_PER_SECTOR)) { + LOG_EXTRA("Signature sector is not valid\n"); + continue; + } +// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector. However, if the partition +// is not marked as a type 0x35, then this Signature Sector may be erroneous. The problem here is that +// there is currently no way to find out if this partition was marked as a type 0x35. Also, if we +// should reject this partition due to some problem with the drive linking or BBR metadata, should we +// leave the partition in the evms partition list or not? If the partition was marked as a type 0x35 +// and the Signature Sector was valid, then I would say that we should remove it from the evms partition +// partition list. If the partition is not marked as a type 0x35 but the Signature Sector is valid, then +// we could have a stray Signature Sector, in which case the partition should remain in the evms partition +// list. The OS/2 LVM Signature Sector does have additional information that could be used to resolve +// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but +// we can not get the starting LBA of the partition to compare against. If we leave the partition in +// the evms partition list when we should not, then an extraneous compatibility volume could result. + // Build the Metadata for this partition + if (! + (new_dlentry = + new_os2_drive_link(sigsector, evms_partition))) { + continue; + } + // Search for the parent Volume for this partition + volumeserial = sigsector->Volume_Serial_Number; + if (!(new_volume = find_os2_volume(volumeserial))) { + + // If not found, allocate a new Volume + LOG_EVERYTHING("Parent not found, allocate new.\n"); + if (sigsector->Drive_Letter != '\0') { + driveletter[0] = sigsector->Drive_Letter; + driveletter[1] = '\0'; + volumename = driveletter; + } else + volumename = sigsector->Volume_Name; + + if (! + (new_volume = + new_os2volume(volumeserial, volumename))) { + delete_os2_drive_link(new_dlentry, 0); + new_dlentry = NULL; + continue; + } + } + // Now remove the partition from the List + evms_cs_remove_logical_node_from_list(evms_partition_list, + evms_partition); + + if (((os2_volume_runtime_entry_t *) new_volume->private)-> + complete) { + // Volume is complete, delete this duplicate + delete_os2_drive_link(new_dlentry, 0); + LOG_EVERYTHING("Deleting duplicate node.\n"); + ((os2_volume_runtime_entry_t *) new_volume->private)->Export_Needed = 1; //We must export this volume again! 
+ } else /* Add this partition to its parent Volume */ + add_os2link(new_dlentry, new_volume); + + } + + kfree(sigsect); + LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n"); + + return 0; +} + +/* + * Function: find_os2_volume + * + * Search for the OS/2 volume that matches the volume serial. + */ +static struct evms_logical_node * +find_os2_volume(u32 volumeserial) +{ + os2_volume_runtime_entry_t *cur_volume; + struct evms_logical_node *cur_node; + + cur_node = os2lvm_nodes; + + while (cur_node) { + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private; + if (cur_volume->Volume_Serial_Number == volumeserial) { + LOG_EVERYTHING("%s: found volser match.\n", + __FUNCTION__); + return cur_node; + } + LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__); + cur_node = cur_volume->next_os2lvm_node; + } + + return NULL; +} + +/* + * Function: add_os2link + * + * Add the Drive Link metadata to the parent OS/2 volume. + */ +static int +add_os2link(struct os2_dl_entry * newlink, + struct evms_logical_node *parent_volume) +{ + os2_volume_runtime_entry_t *parent_metadata = + (os2_volume_runtime_entry_t *) parent_volume->private; + struct os2_dl_entry *curlink = + parent_metadata->drive_link, *nextlink; + + if (curlink) { + nextlink = curlink->next; + while (nextlink) { + curlink = nextlink; + nextlink = curlink->next; + } + curlink->next = newlink; + } else { + parent_metadata->drive_link = newlink; + } + parent_metadata->drive_link_count++; + parent_metadata->size_in_sectors += newlink->sector_count; + parent_volume->total_vsectors += newlink->sector_count; + return 0; +} + +/* + * Function: find_link_data + * + * Find the Drive Link metadata that matches the partition serial number. + * Remove it from the link_list passed in. + */ +static struct os2_dl_entry * +find_link_data(struct os2_dl_entry ** link_list, u32 partitionser) +{ + struct os2_dl_entry *curlink = *link_list, *prevlink = NULL; + + while (curlink) { + if (curlink->partition_serial == partitionser) { + if (prevlink) { + prevlink->next = curlink->next; + } else { + *link_list = curlink->next; + } + curlink->next = NULL; + return curlink; + } + prevlink = curlink; + curlink = prevlink->next; + } + + return NULL; +} + +/* + * Function: find_drive_link + * + * Walk the linked list of drive links to find the proper + * target partition. Returns the metadata associated with + * the drive link. + * Return values: 1 == data contained in 1 partition, 2 == data crosses 2 partitions, + * 0 == target partition not found + */ +static int +find_drive_link(struct evms_logical_node *node, + struct os2_dl_entry ** dlentry, + u64 * sector, u64 * num_sectors) +{ + u64 last_link_sector, cur_last_sector; + struct os2_dl_entry *curlink = + ((os2_volume_runtime_entry_t *) node->private)->drive_link, + *nextlink; + + while (curlink) { + nextlink = curlink->next; + last_link_sector = + curlink->start_sector + curlink->sector_count; + if (*sector < last_link_sector) { + *dlentry = curlink; + cur_last_sector = *sector + *num_sectors; + *sector -= curlink->start_sector; + LOG_EVERYTHING + ("I/O start_RBA == "PFU64" , sector_count == "PFU64"\n", + *sector, *num_sectors); + if (cur_last_sector <= last_link_sector) + return 1; + else { + if ((*dlentry)->next) + *num_sectors -= + cur_last_sector - last_link_sector; + else + return 0; + } + return 2; + } + + curlink = nextlink; + } + + return 0; +} + +// Allocation/Deallocation Functions + +/* + * Function: new_os2_drive_link + * + * Allocate space for a new OS/2 drive link structure. 
+ * Initialize the appropriate fields. + * Note: since the BBR info applies to each link, the BBR structures + * are also initialized here. + */ +static struct os2_dl_entry * +new_os2_drive_link(LVM_Signature_Sector * signature_sector, + struct evms_logical_node *evms_partition) +{ + int i; + u32 feature, feature_size, sectoroffset; + struct os2_dl_entry *new_dlentry; + + new_dlentry = + kmalloc(sizeof (struct os2_dl_entry), GFP_KERNEL); + if (!new_dlentry) { + LOG_SERIOUS("Could not allocate drivelink metadata\n"); + return NULL; + } + memset(new_dlentry, 0, sizeof (struct os2_dl_entry)); + new_dlentry->sector_count = + signature_sector->Partition_Size_To_Report_To_User; + new_dlentry->partition_serial = + signature_sector->partition_serial; + new_dlentry->bbr_is_active = 0; // initialize to not active + new_dlentry->link_partition = evms_partition; + init_MUTEX(&(new_dlentry->bbr_table_lock)); + + sectoroffset = signature_sector->Partition_Start; + LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset); + for (i = 0; i < OS2LVM_MAX_FEATURES_PER_VOLUME; i++) { + feature = signature_sector->LVM_Feature_Array[i].Feature_ID; + if (feature) { + feature_size = + signature_sector->LVM_Feature_Array[i]. + Feature_Data_Size; + LOG_EVERYTHING("Entry %d in Feature Table is valid,\n", + i + 1); + LOG_EVERYTHING("Feature Data size is %i sectors.\n", + feature_size); + if (feature == DRIVE_LINKING_FEATURE_ID) { + if (!new_dlentry->link_data) { + new_dlentry->dl_lsn1 = + signature_sector-> + LVM_Feature_Array[i]. + Location_Of_Primary_Feature_Data - + sectoroffset; + new_dlentry->dl_lsn2 = + signature_sector-> + LVM_Feature_Array[i]. + Location_Of_Secondary_Feature_Data - + sectoroffset; + new_dlentry->link_data = + new_os2_link_data(new_dlentry-> + dl_lsn1, + new_dlentry-> + dl_lsn2, + feature_size, + evms_partition); + if (new_dlentry->link_data == NULL) { + delete_os2_drive_link + (new_dlentry, 0); + new_dlentry = NULL; + } + } else { + LOG_WARNING + ("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n"); + delete_os2_drive_link(new_dlentry, 0); + new_dlentry = NULL; + } + } else if (feature == BBR_FEATURE_ID) { + if (!new_dlentry->bbr_data) { + new_dlentry->bbr_lsn1 = + signature_sector-> + LVM_Feature_Array[i]. + Location_Of_Primary_Feature_Data; + new_dlentry->bbr_lsn2 = + signature_sector-> + LVM_Feature_Array[i]. + Location_Of_Secondary_Feature_Data; + new_dlentry->bbr_feature_size = + feature_size; + new_dlentry->bbr_data = + new_os2_bbr_data(new_dlentry-> + bbr_lsn1, + new_dlentry-> + bbr_lsn2, + feature_size, + evms_partition); + if (new_dlentry->bbr_data == NULL) { + delete_os2_drive_link + (new_dlentry, 0); + new_dlentry = NULL; + } else if (signature_sector-> + LVM_Feature_Array[i]. + Feature_Active) { + new_dlentry->bbr_is_active = + check_for_os2_bbr_relocations + (new_dlentry->bbr_data); + } + } else { + LOG_WARNING + ("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n"); + delete_os2_drive_link(new_dlentry, 0); + new_dlentry = NULL; + } + } else { + LOG_WARNING + ("os2lvm_vge: Unknown Feature entry %d found.\n", + feature); + delete_os2_drive_link(new_dlentry, 0); + new_dlentry = NULL; + } + + if (signature_sector->LVM_Feature_Array[i]. 
+ Feature_Active) { + LOG_EVERYTHING("Feature is active.\n"); + } + } + } + + if (new_dlentry && + ((!new_dlentry->bbr_data) || (!new_dlentry->link_data)) + ) { + LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n"); + delete_os2_drive_link(new_dlentry, 0); + new_dlentry = NULL; + } + return new_dlentry; +} + +/* + * Function: new_os2_link_data + * + * Allocate space for OS/2 drive link information. + * Read in and validate the information from disk. + * Note: assumes 512 byte sectors. + */ +static char * +new_os2_link_data(u32 linksector1, + u32 linksector2, + u32 linknumsectors, struct evms_logical_node *link_partition) +{ + char *new_data1; /* Buffer used to hold the primary copy of the drive linking data. */ + char *new_data2; /* Buffer used to hold the secondary copy of the drive linking data. */ + char *p1; /* Used to access individual sectors of data within new_data1. */ + char *p2; /* Used to access individual sectors of data within new_data2. */ + int memsize = linknumsectors * OS2_BYTES_PER_SECTOR; + u32 i, seq1, seq2; + + /* Allocate Memory for the buffers to hold the drive linking data. */ + LOG_EVERYTHING("Drive Linking Feature entry found.\n"); + new_data1 = kmalloc(memsize, GFP_KERNEL); + if (!new_data1) { + LOG_SERIOUS("Could not allocate Primary Link data\n"); + return NULL; + } + new_data2 = kmalloc(memsize, GFP_KERNEL); + if (!new_data2) { + LOG_SERIOUS("Could not allocate Secondary Link data\n"); + kfree(new_data1); + return NULL; + } + + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1); + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", + linksector2); + + /* Read the drive linking data into memory. */ + if (INIT_IO(link_partition, 0, linksector1, linknumsectors, new_data1)) { + LOG_SERIOUS("I/O error reading Primary Feature Data.\n"); + seq1 = 0; + p1 = NULL; + } else { + /* Set up access to the buffer. Extract the Master Sequence Number from the buffer. */ + p1 = new_data1; + seq1 = ((struct link_table_first_sector *) p1)->Sequence_Number; + } + + if (INIT_IO(link_partition, 0, linksector2, linknumsectors, new_data2)) { + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n"); + seq2 = 0; + p2 = NULL; + } else { + /* Set up access to the second buffer. Extract its copy of the Master Sequence Number. */ + p2 = new_data2; + seq2 = ((struct link_table_sector *) p2)->Sequence_Number; + } + + /* Validate both copies of the drive linking data one sector at a time. */ + for (i = 0; i < linknumsectors; + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) { + if ((seq1 > 0) + && validate_drivelinksector((struct link_table_sector *) p1, i, + seq1)) { + LOG_SERIOUS + ("The primary copy of the drive link data is invalid! Sector %i is not valid\n", + i); + seq1 = 0; + } + + if ((seq2 > 0) + && validate_drivelinksector((struct link_table_sector *) p2, i, + seq2)) { + LOG_SERIOUS + ("The secondary copy of the drive link data is invalid! Sector %i is not valid\n", + i); + seq2 = 0; + } + + } + + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1); + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2); + + /* Choose which copy of the drive linking data to use. If both sequence numbers are 0, then both copies + of the drive linking data are bad. If both are equal and non-zero, then both copies are good and it + really doesn't matter which one you choose. Otherwise, choose the copy with the highest sequence number. 
*/ + if (seq2 > seq1) { + kfree(new_data1); + return new_data2; + } else { + kfree(new_data2); + if (!seq1) { + kfree(new_data1); + new_data1 = NULL; + } + } + return new_data1; +} + +/* + * Function: new_os2_bbr_data + * + * Allocate space for OS/2 bad block relocation information. + * Read in and validate the information from disk. + * Note: assumes 512 byte sectors. + */ +static char * +new_os2_bbr_data(u32 bbrsector1, + u32 bbrsector2, + u32 bbrnumsectors, struct evms_logical_node *bbr_partition) +{ + char *new_data1; /* Buffer to hold the primary copy of the BBR data. */ + char *new_data2; /* Buffer to hold the secondary copy of the BBR data. */ + char *p1; /* Used to examine the individual sectors of BBR data within new_data1. */ + char *p2; /* Used to examine the individual sectors of BBR data within new_data2. */ + int memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR; + u32 i, seq1, seq2; + + LOG_EVERYTHING("BBR Feature entry found.\n"); + + /* Allocate memory for the buffers. */ + new_data1 = kmalloc(memsize, GFP_KERNEL); + if (!new_data1) { + LOG_SERIOUS("Could not allocate Primary BBR data\n"); + return NULL; + } + new_data2 = kmalloc(memsize, GFP_KERNEL); + if (!new_data2) { + LOG_SERIOUS("Could not allocate Secondary BBR data\n"); + kfree(new_data1); + return NULL; + } + + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1); + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2); + + /* Read in both copies of the BBR data. */ + if (INIT_IO(bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1)) { + LOG_SERIOUS("I/O error reading Primary Feature Data.\n"); + seq1 = 0; + p1 = NULL; + } else { + /* Establish access to the first sector of the BBR data. Extract the Master Sequence Number + for this copy of the BBR data. */ + p1 = new_data1; + seq1 = ((LVM_BBR_Table_First_Sector *) p1)->Sequence_Number; + } + + if (INIT_IO(bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2)) { + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n"); + seq2 = 0; + p2 = NULL; + } else { + /* Establish access to the first sector of the second copy of the BBR data. Extract the + Master Sequence Number for this copy of the BBR data. */ + p2 = new_data2; + seq2 = ((LVM_BBR_Table_Sector *) p2)->Sequence_Number; + } + + /* Validate both copies of the BBR Data, one sector at a time. */ + for (i = 0; i < bbrnumsectors; + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) { + if ((seq1 > 0) && validate_bbrtablesector(p1, i, seq1)) { + LOG_SERIOUS + ("The primary BBR data is invalid! Sector %i is not valid\n", + i); + seq1 = 0; + } + + if ((seq2 > 0) && validate_bbrtablesector(p2, i, seq2)) { + LOG_SERIOUS + ("The secondary BBR data is invalid! Sector %i is not valid\n", + i); + seq2 = 0; + } + + } + + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1); + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2); + + /* Choose which copy of the BBR Data to use based upon the sequence number. If both sequence numbers + are 0, then there is no valid BBR data. If both are non-zero and equal, then it really doesn't + matter which copy is used. Otherwise, choose the copy with the highest sequence number. */ + if (seq2 > seq1) { + kfree(new_data1); + return new_data2; + } else { + kfree(new_data2); + if (!seq1) { + kfree(new_data1); + new_data1 = NULL; + } + } + return new_data1; +} + +/* + * Function: new_os2volume + * + * Allocate space for a new OS/2 logical volume. + * Initialize the appropriate fields. 
+ */
+static struct evms_logical_node *
+new_os2volume(u32 volumeserial, char *volume_name)
+{
+ struct evms_logical_node *new_node;
+ os2_volume_runtime_entry_t *cur_volume;
+
+ if (evms_cs_allocate_logical_node(&new_node)) {
+ LOG_SERIOUS("Could not allocate new volume\n");
+ return NULL;
+ }
+ new_node->private = kmalloc(sizeof (os2_volume_runtime_entry_t), GFP_KERNEL);
+ if (!new_node->private) {
+ LOG_SERIOUS("Could not allocate volume metadata\n");
+ evms_cs_deallocate_logical_node(new_node);
+ return NULL;
+ }
+ memset(new_node->private, 0, sizeof (os2_volume_runtime_entry_t));
+ new_node->plugin = &plugin_header;
+ new_node->system_id = LVM_PARTITION_INDICATOR;
+ sprintf(new_node->name, "os2/%s", volume_name);
+ cur_volume = (os2_volume_runtime_entry_t *) new_node->private;
+ cur_volume->Volume_Serial_Number = volumeserial;
+ cur_volume->Export_Needed = 1;
+
+ if (os2lvm_nodes == NULL)
+ os2lvm_nodes = new_node;
+
+ // This is the first node discovered. Start the BBR thread.
+ if (!BBR_Worker_Thread) {
+ BBR_Worker_Thread =
+ evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
+ if (!BBR_Worker_Thread) {
+ kfree(new_node->private);
+ evms_cs_deallocate_logical_node(new_node);
+ os2lvm_nodes = NULL;
+ return NULL;
+ }
+ } else {
+ cur_volume =
+ (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
+ while (cur_volume->next_os2lvm_node)
+ cur_volume =
+ (os2_volume_runtime_entry_t *) cur_volume->next_os2lvm_node->private;
+ cur_volume->next_os2lvm_node = new_node;
+ }
+
+ MOD_INC_USE_COUNT;
+
+ return new_node;
+}
+
+/*
+ * Function: delete_os2lvm_volume
+ *
+ * This function deletes the in-memory representation of an OS/2
+ * logical volume.
+ */
+static int
+delete_os2lvm_volume(struct evms_logical_node *logical_node)
+{
+ struct os2_dl_entry *curdrvlink =
+ ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link,
+ *nextdrvlink;
+ os2_volume_runtime_entry_t *cur_volume, *next_volume;
+
+ while (curdrvlink) {
+ nextdrvlink = curdrvlink->next;
+ delete_os2_drive_link(curdrvlink, 1);
+ curdrvlink = nextdrvlink;
+ }
+
+ cur_volume = (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
+ if (os2lvm_nodes == logical_node)
+ os2lvm_nodes = cur_volume->next_os2lvm_node;
+ else {
+ while (cur_volume->next_os2lvm_node) {
+ next_volume =
+ (os2_volume_runtime_entry_t *) cur_volume->next_os2lvm_node->private;
+ if (cur_volume->next_os2lvm_node == logical_node) {
+ cur_volume->next_os2lvm_node =
+ next_volume->next_os2lvm_node;
+ break;
+ }
+ /* advance to the next volume so the walk terminates
+ when the node is farther down the list (or not found) */
+ cur_volume = next_volume;
+ }
+ }
+
+ if (os2lvm_nodes == NULL) {
+ // Just deleted the last os2 node. Stop the BBR thread.
+ if (BBR_Worker_Thread) {
+ evms_cs_unregister_thread(BBR_Worker_Thread);
+ BBR_Worker_Thread = NULL;
+ }
+ }
+
+ kfree(logical_node->private);
+ evms_cs_deallocate_logical_node(logical_node);
+
+ MOD_DEC_USE_COUNT;
+
+ return 0;
+}
+
+/*
+ * Function: delete_os2_drive_link
+ *
+ * This function deletes the drive link runtime structure and any
+ * other structures it points to.
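+ * Freeing the underlying partition node is optional (controlled by the
+ * delete_link_partition argument) so callers that still need the partition
+ * node can keep it.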
+ */ +static int +delete_os2_drive_link(struct os2_dl_entry * drive_link, + int delete_link_partition) +{ + if (drive_link->link_data) + kfree(drive_link->link_data); + if (drive_link->bbr_data) + kfree(drive_link->bbr_data); + if (delete_link_partition) + DELETE(drive_link->link_partition); + kfree(drive_link); + + return 0; +} + +// Consistency Checking Functions + +/* + * Function: validate_signaturesector + * + * This function checks the OS/2 LVM Signature Sector + */ +static int +validate_signaturesector(struct evms_logical_node *evms_partition, + LVM_Signature_Sector * signature_sector, + u32 sectorsize) +{ + u32 crc_hold, crc_new; + + /* In order for a signature sector to be considered valid, its signature and CRC must + be correct. Also, OS/2 stores the starting LBA of the partition and the size of + the partition that this signature sector corresponds to. These should be checked + as well. However, since the starting LBA of the partition that this belongs to is + not available to us as part of an struct evms_logical_node, we can only check the size + of the partition against what is stored in the signature sector. */ + + /* The signature used is in two parts. Test the first part. */ + if (signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE) { + LOG_EVERYTHING("Primary LVM Signature failed.\n"); + return 1; + } + + /* Test the second part of the signature. */ + if (signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE) { + LOG_EVERYTHING("Secondary LVM Signature failed.\n"); + return 1; + } + + /* Calculate the CRC and compare it against the stored CRC. */ + crc_hold = signature_sector->Signature_Sector_CRC; + signature_sector->Signature_Sector_CRC = 0; + crc_new = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, (void *) signature_sector, + sectorsize); + if (crc_hold != crc_new) { + LOG_EVERYTHING("Signature sector crc failed.\n"); + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, + crc_new); + return 1; + } + // The partition size must == that found in the Signature Sector + if (evms_partition->total_vsectors != + signature_sector->Partition_Sector_Count) { + LOG_EXTRA("Partition size is not valid\n"); + return 1; + } + + return 0; +} + +/* + * Function: validate_drivelinksector + * + * This function checks the OS/2 LVM Drivelink Feature Sector + */ +static int +validate_drivelinksector(void *Sector_To_Validate, + int Sector_Index, u32 Master_Sequence_Number) +{ + u32 crc_hold, crc_new; + struct link_table_first_sector *First_Sector = + (struct link_table_first_sector *) Sector_To_Validate; + struct link_table_sector *Link_Sector = + (struct link_table_sector *) Sector_To_Validate; + + /* The OS/2 drive linking data covers several sectors. The format of the first sector is slightly + different from the following sectors because it contains additional information about how many + drive links are actually in use. The following sectors just contain portions of the drive link + table. Each sector of OS/2 drive linking data contains a signature, crc, and sequence number + which must be validated. */ + + if (Sector_Index == 0) { + + /* Link Table Master Signature Check */ + if (LINK_TABLE_MASTER_SIGNATURE != + First_Sector->Link_Table_Signature) { + LOG_EVERYTHING + ("Link Table Master Signature Test failed.\n"); + return 1; + } + + /* We will NOT check the sequence number here as the first sector of drive link data is the + source of the Master_Sequence_Number which was passed in to us. 
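+	   Every following sector must carry this same sequence number; a mismatch,
+	   bad signature, or bad CRC in any sector causes the caller to discard this
+	   entire copy of the drive linking data.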
*/ + + /* Set up for the CRC Check */ + crc_hold = First_Sector->Link_Table_CRC; + First_Sector->Link_Table_CRC = 0; + } else { + /* Link Table Internal Signature Check */ + if (LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature) { + LOG_EVERYTHING + ("Link Table Internal Signature Test failed.\n"); + return 1; + } + + /* Check the sequence number. */ + if (Master_Sequence_Number != Link_Sector->Sequence_Number) { + LOG_EVERYTHING + ("Link Table Internal Sequence Number Test failed.\n"); + return 1; + } + + /* Set up for the CRC Check */ + crc_hold = Link_Sector->Link_Table_CRC; + Link_Sector->Link_Table_CRC = 0; + } + + crc_new = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate, + OS2_BYTES_PER_SECTOR); + if (crc_hold != crc_new) { + LOG_EVERYTHING("Link Table crc failed.\n"); + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, + crc_new); + return 1; + } + + return 0; +} + +/* + * Function: validate_bbrtablesector + * + * This function checks the OS/2 LVM Bad Block Relocation Feature Sector + */ +static int +validate_bbrtablesector(void *Sector_To_Validate, + int Sector_Index, u32 Master_Sequence_Number) +{ + u32 crc_hold, crc_new; + LVM_BBR_Table_First_Sector *First_Sector = + (LVM_BBR_Table_First_Sector *) Sector_To_Validate; + LVM_BBR_Table_Sector *BBR_Sector = + (LVM_BBR_Table_Sector *) Sector_To_Validate; + + /* The OS/2 bad block relocation (BBR) data covers several sectors. The format of the first sector + is different from the following sectors because it contains additional information about how many + relocations are actually in use and the size and location of the block of replacement sectors. + The following sectors just contain portions of the BBR remap table. Each sector of OS/2 BBR data + contains a signature, crc, and sequence number which must be validated. */ + + if (Sector_Index == 0) { + + /* BBR Table Master Signature Check */ + if (BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature) { + LOG_EVERYTHING + ("BBR Table Master Signature Test failed.\n"); + return 1; + } + + /* We will NOT check the sequence number here as the first sector of BBR data is the + source of the Master_Sequence_Number which was passed in to us. */ + + /* Set up for the CRC Check */ + crc_hold = First_Sector->CRC; + First_Sector->CRC = 0; + + } else { + /* BBR Table Internal Signature Check */ + if (BBR_TABLE_SIGNATURE != BBR_Sector->Signature) { + LOG_EVERYTHING + ("BBR Table Internal Signature Test failed.\n"); + return 1; + } + + /* Check the sequence number. */ + if (Master_Sequence_Number != BBR_Sector->Sequence_Number) { + LOG_EVERYTHING + ("BBR Table Internal Sequence Number Test failed.\n"); + return 1; + } + + /* Set up for the CRC Check */ + crc_hold = BBR_Sector->CRC; + BBR_Sector->CRC = 0; + } + + crc_new = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate, + OS2_BYTES_PER_SECTOR); + if (crc_hold != crc_new) { + LOG_EVERYTHING("BBRTable crc failed.\n"); + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold, + crc_new); + return 1; + } + + return 0; +} + +/* + * Function: check_for_os2_bbr_relocations + * + * This function checks the OS/2 LVM Bad Block Relocation Tables + * for any active relocation sectors. The bbr table is reformatted in memory + * to make searches faster. 
+ * Return values: 0 == no active relocations, 1 == contains active relocations + */ +static u32 +check_for_os2_bbr_relocations(char *bbr_data_ptr) +{ + LVM_BBR_Feature *feature_data = (LVM_BBR_Feature *) bbr_data_ptr; + + if (feature_data->control.Table_Entries_In_Use) { + LOG_EVERYTHING("There are %d active relocations.\n", + feature_data->control.Table_Entries_In_Use); + return 1; + } + + return 0; +} + +/* + * Function: check_os2_volumes + * + * This function performs a consistency check on all existing OS/2 + * Logical Volumes. The list of constituent partitions ( links ) + * is checked and ordered according to the Link Table. If any link + * is missing or inconsistent, the entire volume will be deleted. + */ +static int +check_os2_volumes(struct evms_logical_node **node_list) +{ + os2_volume_runtime_entry_t *cur_volume; + os2_volume_runtime_entry_t *previous_volume; + struct evms_logical_node *cur_node; + struct evms_logical_node *previous_node = NULL; + struct os2_dl_entry *link_list, *link_hold; + struct link_table_first_sector *psector1; + int i, rc = 0; + u32 numlinks, countlinks, linkser; + u32 Master_Sequence_Number; /* Used to check whether or not all of the copies of Drive Linking data match. */ + u64 partition_offset; + char *sect_ptr; + + LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n"); + + cur_node = os2lvm_nodes; + + while (cur_node) { + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private; + link_list = NULL; + if (!cur_volume->complete) { /* need to verify this one */ + cur_volume->complete = 1; + LOG_EVERYTHING("Checking volume %s\n", cur_node->name); + + // Reset fields for sort operation + cur_volume->size_in_sectors = 0; + numlinks = cur_volume->drive_link_count; + cur_volume->drive_link_count = 0; + cur_node->total_vsectors = 0; + link_list = cur_volume->drive_link; + cur_volume->drive_link = NULL; + + // Access the link data to order the drive links + psector1 = + (struct link_table_first_sector *) link_list-> + link_data; + Master_Sequence_Number = psector1->Sequence_Number; + + if (numlinks != psector1->Links_In_Use) { + LOG_SERIOUS + ("Link Count mismatch vol=%i, table=%i\n", + numlinks, psector1->Links_In_Use); + cur_volume->complete = 0; + countlinks = 0; + } else { + if (numlinks > LINKS_IN_FIRST_SECTOR) { + countlinks = LINKS_IN_FIRST_SECTOR; + numlinks -= LINKS_IN_FIRST_SECTOR; + } else { + countlinks = numlinks; + numlinks = 0; + } + + } + + partition_offset = 0; + for (i = 0; + (i < countlinks) && (cur_volume->complete == 1); + i++) { + linkser = + psector1->Link_Table[i]. 
+ partition_serial; + if ((link_hold = + find_link_data(&link_list, linkser))) { + // Add this partition to its parent Volume + add_os2link(link_hold, cur_node); + LOG_EVERYTHING + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n", + partition_offset, + link_hold->sector_count); + link_hold->start_sector = + partition_offset; + partition_offset += + link_hold->sector_count; + } else { + LOG_SERIOUS + ("Link Table entry %i metadata missing\n", + i); + cur_volume->complete = 0; + break; + } + } + + sect_ptr = (char *) psector1; + + while (numlinks && (cur_volume->complete == 1)) { + if (numlinks > LINKS_IN_NEXT_SECTOR) { + countlinks = LINKS_IN_NEXT_SECTOR; + numlinks -= LINKS_IN_NEXT_SECTOR; + } else { + countlinks = numlinks; + numlinks = 0; + } + sect_ptr += OS2_BYTES_PER_SECTOR; + if (Master_Sequence_Number != + ((struct link_table_sector *) sect_ptr)-> + Sequence_Number) { + cur_volume->complete = 0; + LOG_SERIOUS + ("Bad Sequence Number for Drive Linking Metadata!\n"); + } else { + for (i = 0; i < countlinks; i++) { + linkser = + ((struct link_table_sector *) + sect_ptr)->Link_Table[i]. + partition_serial; + if ((link_hold = + find_link_data(&link_list, + linkser))) { + // Add this partition to its parent Volume + add_os2link(link_hold, + cur_node); + LOG_EVERYTHING + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n", + partition_offset, + link_hold-> + sector_count); + link_hold-> + start_sector = + partition_offset; + partition_offset += + link_hold-> + sector_count; + } else { + LOG_SERIOUS + ("Link Table entry %i metadata missing\n", + i); + cur_volume->complete = + 0; + break; + } + } + } + } + } + + /* If the volume is complete we can export it for use. */ + if (cur_volume->complete && (link_list == NULL)) { + + // Link new volume into the node list + if (cur_volume->Export_Needed && + (!evms_cs_add_logical_node_to_list + (node_list, cur_node)) + ) { + rc++; + cur_volume->Export_Needed = 0; + } + + previous_node = cur_node; + cur_node = cur_volume->next_os2lvm_node; + } else { + /* Remove the volume from os2lvm_nodes list and delete it. */ + if (previous_node != NULL) { + + previous_volume = + (os2_volume_runtime_entry_t *) + previous_node->private; + previous_volume->next_os2lvm_node = + cur_volume->next_os2lvm_node; + cur_volume->next_os2lvm_node = NULL; + + delete_os2lvm_volume(cur_node); + + cur_node = previous_volume->next_os2lvm_node; + } else { + previous_node = cur_volume->next_os2lvm_node; + delete_os2lvm_volume(cur_node); + cur_node = previous_node; + previous_node = NULL; + os2lvm_nodes = cur_node; + } + + /* If any items remain in link_list, delete those as well. */ + while (link_list) { + link_hold = link_list->next; + delete_os2_drive_link(link_list, 1); + link_list = link_hold; + } + + } + + } + + LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n"); + + return rc; +} + +/* BBR_Transfer_IO + * + * Transfer the responsibility for completing the specified IO from + * the thread that requested it to the BBR Worker Thread + */ +static void +BBR_Transfer_IO(struct transfer_record * Transfer_Record) +{ + unsigned long flags; + int Wake_Worker_Thread = 0; /* Assume that the worker is already awake. */ + + spin_lock_irqsave(&BBR_Queue_Lock, flags); + + /* The BBR IO List is a singly linked list. BBR_IO_List_Head points + to the first item in the list, and BBR_IO_List_Tail points to the + last item in the list. 
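+	   The worker thread is only woken when the list goes from empty to
+	   non-empty; requests appended to a non-empty list are picked up by the
+	   worker before it goes back to sleep.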
*/ + Transfer_Record->next = NULL; + if (!BBR_IO_List_Tail) { /* Empty list */ + BBR_IO_List_Head = Transfer_Record; + Wake_Worker_Thread = 1; /* Wake up the worker thread. */ + } else /* Items already in the list. */ + BBR_IO_List_Tail->next = Transfer_Record; + + BBR_IO_List_Tail = Transfer_Record; + + spin_unlock_irqrestore(&BBR_Queue_Lock, flags); + if (Wake_Worker_Thread) + evms_cs_wakeup_thread(BBR_Worker_Thread); + + return; +} + +/* OS2_DL_Callback + * + * This is the callback function used when an I/O request has to be broken + * into two parts because it crosses a drive link boundary. + * + */ +static void +OS2_DL_Callback(struct buffer_head *bh, int uptodate) +{ + + struct tracking_record *Tracking_Record; + struct buffer_head *Original; + + Tracking_Record = bh->b_private; + + /* Is this a read or a write? */ + if (Tracking_Record->link1_transfer_rec || + Tracking_Record->link2_transfer_rec) { + /* We have a write here. Was it successful? */ + if (!uptodate) { + /* Have we tried BBR yet? */ + if ((bh == Tracking_Record->link1_bh) && + (!Tracking_Record->link1_bbr_attempted)) { + /* Attempt BBR. */ + BBR_Transfer_IO(Tracking_Record-> + link1_transfer_rec); + Tracking_Record->link1_bbr_attempted = 1; + return; + } else if ((bh == Tracking_Record->link2_bh) && + (!Tracking_Record->link2_bbr_attempted)) { + /* Attempt BBR. */ + BBR_Transfer_IO(Tracking_Record-> + link2_transfer_rec); + Tracking_Record->link2_bbr_attempted = 1; + return; + } + + } + + } + + Tracking_Record->io_in_progress -= 1; + if (Tracking_Record->io_in_progress) { + Tracking_Record->up_to_date = uptodate; + } + Original = Tracking_Record->org_bh; + + if (!Tracking_Record->io_in_progress) { + uptodate &= Tracking_Record->up_to_date; + /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2. + If the transfer records were used because of BBR, then the BBR worker thread will have + disposed of the transfer records. If the transfer records were not used, then we must + dispose of them here to prevent memory leaks. */ + if (Tracking_Record->link1_transfer_rec && + (!Tracking_Record->link1_bbr_attempted)) { + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, + Tracking_Record-> + link1_transfer_rec); + } + if (Tracking_Record->link2_transfer_rec && + (!Tracking_Record->link2_bbr_attempted)) { + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, + Tracking_Record-> + link2_transfer_rec); + } + evms_cs_deallocate_to_pool(evms_bh_pool, + Tracking_Record->link1_bh); + evms_cs_deallocate_to_pool(evms_bh_pool, + Tracking_Record->link2_bh); + evms_cs_deallocate_to_pool(DL_Tracking_Pool, Tracking_Record); + Original->b_end_io(Original, uptodate); + } + + return; +} + +/* OS2_BBR_Write_Callback + * + * This is the callback for normal write requests. Check for an error + * during the I/O, and send to the worker thread for processing if necessary. + */ +static void +OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record, + struct buffer_head *bh, int uptodate, int *redrive) +{ + if (!uptodate) { + BBR_Transfer_IO(Transfer_Record); + *redrive = TRUE; + } else { + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Transfer_Record); + } + + return; +} + +/* Worker thread to handle: + + I/O to drive/partitions/objects where bad blocks are known to exist + I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven. 
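+   Requests are queued by BBR_Transfer_IO() and drained here one at a time:
+   each is reissued through do_os2_bbr_io(), the original end_io callback is
+   run with the completion status, and the transfer record is returned to
+   BBR_Transfer_Pool.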
+ +*/ +static void +BBR_Worker(void *Not_Used) +{ + unsigned long flags; + struct transfer_record *Current_IO; + int complete; + + for (;;) { + // Process bbr_io_list, one entry at a time. + spin_lock_irqsave(&BBR_Queue_Lock, flags); + + /* Is there any work for us? */ + if (!BBR_IO_List_Head) { + spin_unlock_irqrestore(&BBR_Queue_Lock, flags); + break; /* List empty - nothing to do. */ + } + + /* Get the IO to perform. */ + Current_IO = BBR_IO_List_Head; + BBR_IO_List_Head = Current_IO->next; + if (!BBR_IO_List_Head) + BBR_IO_List_Tail = BBR_IO_List_Head; + + spin_unlock_irqrestore(&BBR_Queue_Lock, flags); + + /* Now lets process the I/O request. */ + complete = do_os2_bbr_io(Current_IO->Partition_Data, + Current_IO->Write_Flag, + Current_IO->bh->b_rsector, + Current_IO->bh-> + b_size >> EVMS_VSECTOR_SIZE_SHIFT, + Current_IO->bh->b_data); + + /* We need to do the callback. */ + Current_IO->bh->b_end_io(Current_IO->bh, (complete == 0)); + + /* Now cleanup */ + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Current_IO); + } + + return; /* Go to sleep. */ + +} + +/* + * Sector_Is_Remapped + * + * This function returns 1 if the specified sector has been remapped, 0 if it has not + * + * If the sector has been remapped, then the new sector is returned in Replacement_Sector + * + */ +static int +Sector_Is_Remapped(struct os2_dl_entry * io_dlentry, + u64 Source_Sector, u64 * Replacement_Sector) +{ + LVM_BBR_Feature *Feature_Data = + (LVM_BBR_Feature *) io_dlentry->bbr_data; + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */ + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */ + unsigned int BBR_Table_Entries_In_Use = + Feature_Data->control.Table_Entries_In_Use; + struct bbr_table_entry * table_entry; + unsigned int guard1; + + /* Default value is no remap. */ + *Replacement_Sector = Source_Sector; + + do { + guard1 = io_dlentry->guard1; /* Lamport's Theorem */ + + for (BBR_Table_Index = 0; + BBR_Table_Index < BBR_Table_Entries_In_Use; + BBR_Table_Index++) { + Sector_Index = + BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR; + table_entry = + &(Feature_Data->remap[Sector_Index]. + BBR_Table[BBR_Table_Index - + (Sector_Index * + BBR_TABLE_ENTRIES_PER_SECTOR)]); + if (table_entry->BadSector == (u32)Source_Sector) { + *Replacement_Sector = + (u64)table_entry->ReplacementSector; + break; + } + } + + } while (guard1 != io_dlentry->guard2); /* Lamport's Theorem */ + + if (*Replacement_Sector != Source_Sector) + return 1; + else + return 0; +} + +/* + * Invalidate_Mapping + * + * This function either frees a replacement sector to be reused, or it + * marks the replacement sector as bad. + * + */ +static void +Invalidate_Mapping(struct os2_dl_entry * dlentry, + u64 Source_Sector, int Replacement_Sector_Is_Bad) +{ + LVM_BBR_Feature *Feature_Data = (LVM_BBR_Feature *) dlentry->bbr_data; + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */ + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */ + unsigned int BBR_Table_Entries_In_Use = + Feature_Data->control.Table_Entries_In_Use; + struct bbr_table_entry * table_entry = NULL; + + /* Lock for the BBR Table. */ + down(&(dlentry->bbr_table_lock)); + + /* Find the entry to invalidate. 
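+	   The search mirrors Sector_Is_Remapped(): walk the in-use table entries
+	   until one whose BadSector field matches Source_Sector is found.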
*/ + for (BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use; + BBR_Table_Index++) { + Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR; + table_entry = + &(Feature_Data->remap[Sector_Index]. + BBR_Table[BBR_Table_Index - + (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]); + if (table_entry->BadSector == Source_Sector) { + break; + } + } + + /* Now that we have found the entry, we must invalidate it. */ + if (Replacement_Sector_Is_Bad) { + table_entry->BadSector = (u32) - 1; + } + /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported + the tracking of bad blocks. We don't support that under Linux, so there is no else case here. */ + + /* Unlock the BBR Table */ + up(&(dlentry->bbr_table_lock)); + + return; +} + +/* + * Create_New_struct bbr_table_entry + * + * Finds bad blocks within the range specified, allocates replacement sectors, + * writes the data to the replacement sectors, and updates the BBR metadata on + * disk to reflect the new mapping. Returns 1 if successful, 0 otherwise. + * + */ +static int +Create_New_BBR_Table_Entry(struct os2_dl_entry * dlentry, + u64 starting_lsn, unsigned int count, void *buffer) +{ + u64 lsn; + struct bbr_table_entry *Table_Entry; + unsigned int Sector_Index; + unsigned int Table_Index; + int rc; + int rc2; + u32 New_Sequence_Number; + LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature *) dlentry->bbr_data; + + for (lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) { + rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer); + while (rc) { + + /* Lock for the BBR Table. */ + down(&(dlentry->bbr_table_lock)); + + /* Increment the second guard value. This will cause those reading the BBR Table to spin. */ + dlentry->guard2++; + + /* Ensure that the bbr active flag is set. */ + dlentry->bbr_is_active = 1; + + /* Allocate a replacement sector */ + if (BBR_Data->control.Table_Entries_In_Use < + BBR_Data->control.Table_Size) { + Sector_Index = + BBR_Data->control.Table_Entries_In_Use / + BBR_TABLE_ENTRIES_PER_SECTOR; + Table_Index = + BBR_Data->control.Table_Entries_In_Use % + BBR_TABLE_ENTRIES_PER_SECTOR; + BBR_Data->control.Table_Entries_In_Use = + BBR_Data->control.Table_Entries_In_Use + 1; + Table_Entry = + (struct bbr_table_entry *) & (BBR_Data-> + remap[Sector_Index]. + BBR_Table + [Table_Index]); + Table_Entry->BadSector = lsn; + } else { + /* There are no more replacement sectors available! Time to bail ... */ + up(&(dlentry->bbr_table_lock)); + return 0; + } + + /* Now that we have a replacement sector, increment the first guard value. This will free any + threads reading the BBR Table. */ + dlentry->guard1++; + + /* Release the lock now that we have a replacement sector. */ + up(&(dlentry->bbr_table_lock)); + + /* Test the replacement sector. */ + rc = INIT_IO(dlentry->link_partition, 1, + Table_Entry->ReplacementSector, 1, buffer); + if (rc) { + /* The replacement sector was bad. Lets mark it bad in the table and try again. */ + Table_Entry->BadSector = (u32) - 1; + } + + } /* End of processing for the current sector. */ + + } /* end of loop to test each sector in the I/O and remap any bad ones found. */ + + /* Need to write the modified BBR Table back to disk. This includes updating the sequence numbers and CRCs. */ + + /* Lock for the BBR Table. */ + down(&(dlentry->bbr_table_lock)); + + /* Increment the sequence numbers. 
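+	   Both on-disk copies of the table share one master sequence number, and
+	   every sector of the table carries the same value, so a torn or stale copy
+	   can be detected at discovery time and the copy with the higher sequence
+	   number is preferred (see new_os2_bbr_data).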
*/ + New_Sequence_Number = BBR_Data->control.Sequence_Number + 1; + BBR_Data->control.Sequence_Number = New_Sequence_Number; + for (Sector_Index = 0; + Sector_Index < BBR_Data->control.Sectors_Per_Table; + Sector_Index++) { + BBR_Data->remap[Sector_Index].Sequence_Number = + New_Sequence_Number; + } + + /* Calculate the new CRC values. */ + BBR_Data->control.CRC = 0; + BBR_Data->control.CRC = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, &(BBR_Data->control), + OS2_BYTES_PER_SECTOR); + for (Sector_Index = 0; + Sector_Index < BBR_Data->control.Sectors_Per_Table; + Sector_Index++) { + BBR_Data->remap[Sector_Index].CRC = 0; + BBR_Data->remap[Sector_Index].CRC = + evms_cs_calculate_crc(EVMS_INITIAL_CRC, + &(BBR_Data->remap[Sector_Index]), + OS2_BYTES_PER_SECTOR); + } + + /* Now we must write the table back to the partition from whence it came. */ + + /* Write the first copy. */ + rc = INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn1, + dlentry->bbr_feature_size, BBR_Data); + + /* Write the second copy. */ + rc2 = + INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn2, + dlentry->bbr_feature_size, BBR_Data); + + /* If both copies failed to reach the disk, then fail the I/O. */ + if (rc && rc2) { + rc = 0; + } else + rc = 1; + + /* Unlock the BBR Table */ + up(&(dlentry->bbr_table_lock)); + + /* Indicate success. */ + return rc; +} + +/* + * Clone_Bufferhead + * + * Prepares a usable copy of an existing bufferhead. + * + */ +static void +Clone_Bufferhead(struct buffer_head *Source, struct buffer_head *Child) +{ + Child->b_next = NULL; + Child->b_blocknr = Source->b_blocknr; + Child->b_size = Source->b_size; + Child->b_list = BUF_LOCKED; + Child->b_dev = Source->b_dev; + Child->b_count = (atomic_t) ATOMIC_INIT(0); + atomic_set(&Child->b_count, atomic_read(&Source->b_count)); + Child->b_rdev = Source->b_rdev; + Child->b_state = Source->b_state; + Child->b_flushtime = 0; + Child->b_next_free = NULL; + Child->b_prev_free = NULL; + Child->b_this_page = (struct buffer_head *) 1; + Child->b_reqnext = NULL; + Child->b_pprev = NULL; + Child->b_data = Source->b_data; + Child->b_page = Source->b_page; + Child->b_end_io = Source->b_end_io; + Child->b_private = Source->b_private; + Child->b_rsector = Source->b_rsector; + Child->b_inode_buffers.next = NULL; + Child->b_inode_buffers.prev = NULL; + return; +} diff -Naur linux-2002-09-30/drivers/evms/s390_part.c evms-2002-09-30/drivers/evms/s390_part.c --- linux-2002-09-30/drivers/evms/s390_part.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/s390_part.c Fri Sep 13 16:09:55 2002 @@ -0,0 +1,1445 @@ +/* -*- linux-c -*- */ +/* + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ +/* + * linux/drivers/evms/s390_part.c + * + * EVMS S/390 partition manager + * + * Partial code extracted from + * + * linux/fs/partitions/ibm.c + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* prefix used in logging messages */ +#define LOG_PREFIX "s390_part: " + +/* Private instance data structure for node we produced */ +struct local_instance_data { + struct evms_logical_node *source_disk; + u64 start_sect; /* starting LBA */ + u64 nr_sects; /* number of sectors */ + unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */ +}; + +static int exported_nodes; /* total # of exported segments + * produced during this discovery. + */ + +/* Prototypes */ +static int s390_partition_discover(struct evms_logical_node **); +static int s390_partition_delete(struct evms_logical_node *); +static void s390_partition_read(struct evms_logical_node *, + struct buffer_head *); +static void s390_partition_write(struct evms_logical_node *, + struct buffer_head *); +static int s390_partition_ioctl(struct evms_logical_node *, + struct inode *, + struct file *, unsigned int, unsigned long); +static int s390_partition_init_io(struct evms_logical_node *, + int, u64, u64, void *); + +static struct evms_plugin_fops fops = { + .discover = s390_partition_discover, + .delete = s390_partition_delete, + .read = s390_partition_read, + .write = s390_partition_write, + .init_io = s390_partition_init_io, + .ioctl = s390_partition_ioctl +}; + +#define EVMS_S390_PARTITION_MANAGER_ID 2 + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_SEGMENT_MANAGER, + EVMS_S390_PARTITION_MANAGER_ID), + .version = { + .major = 1, + .minor = 0, + .patchlevel = 0}, + .required_services_version = { + .major = 0, + .minor = 5, + .patchlevel = 0}, + .fops = &fops +}; + +/***************************************************/ +/* List Support - Typedefs, Variables, & Functions */ +/***************************************************/ + +/* Typedefs */ + +/* structure to keep status on + * each disk. + */ +#define S390_DISK_OK 0 +#define S390_DISK_FAILED 1 +#define S390_FAILED_SKIP_COUNT 1024 +struct disk_object { + int flags; + atomic_t skipped_ios; + atomic_t pending_ios; + atomic_t total_ios; + atomic_t failed_ios; + struct evms_logical_node *disk; +}; + +/* structure to keep status + * on each device. + */ +struct device_object { + unsigned char label[8]; + int total_paths; + struct evms_list_node *disk_object_list; + struct evms_list_node *segment_list; +}; + +/* structure used to track in-flight IOs, + * and to handle failover scenarios. 
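+ * One of these is allocated from s390_io_track_pool per request; dsko records
+ * the path currently driving the I/O, and paths_tried is a bitmask of the
+ * paths already attempted, so a failed request can be redriven on a
+ * different path.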
+ */ +struct s390_io { + struct device_object *devo; + struct disk_object *dsko; + struct evms_logical_node *segment; + int rw_flag; + int paths_tried; + struct buffer_head *bh; + struct s390_io *next; +}; +static spinlock_t s390_redrive_list_lock = SPIN_LOCK_UNLOCKED; +static struct s390_io *s390_redrive_list = NULL; +static struct evms_thread *s390_io_redrive_thread; +static struct evms_pool_mgmt *s390_io_track_pool = NULL; + +/* Variables */ + +static struct evms_list_node *my_device_object_list; + +static struct evms_list_node ** +lookup_device_object(struct evms_logical_node *disk) +{ + struct evms_list_node **devoln; + + devoln = &my_device_object_list; + while (*devoln) { + struct evms_list_node **dskoln; + struct device_object *devo; + devo = (struct device_object *) (*devoln)->item; + dskoln = &devo->disk_object_list; + while (*dskoln) { + struct disk_object *dsko; + dsko = (struct disk_object *) (*dskoln)->item; + if (dsko->disk == disk) { + return (devoln); + } + dskoln = &(*dskoln)->next; + } + devoln = &(*devoln)->next; + } + return (devoln); +} + +static struct evms_list_node ** +lookup_label(unsigned char *label, struct evms_list_node **devoln) +{ + if (!devoln) { + devoln = &my_device_object_list; + } else { + devoln = &(*devoln)->next; + } + while (*devoln) { + struct device_object *devo; + struct disk_object *dsko; + devo = (struct device_object *) (*devoln)->item; + dsko = (struct disk_object *) devo->disk_object_list->item; + LOG_DEBUG("comparing labels: new(%s), %s(%s)\n", + label, dsko->disk->name, devo->label); + if (!strncmp(devo->label, label, 6)) { + LOG_DEBUG("matching label found!\n"); + break; + } + devoln = &(*devoln)->next; + } + return (devoln); +} + +static struct evms_logical_node * +find_segment_on_disk(struct evms_logical_node *disk, + u64 start_sect, u64 nr_sects) +{ + struct evms_logical_node *rc = NULL; + struct evms_list_node **devoln; + + /* find disk object */ + devoln = lookup_device_object(disk); + if (*devoln) { + /* disk object found in list */ + /* attempt to find segment */ + struct evms_list_node **sln; + struct device_object *devo; + + devo = (struct device_object *) (*devoln)->item; + sln = &devo->segment_list; + while (*sln) { + struct evms_logical_node *segment; + struct local_instance_data *lid; + + segment = (struct evms_logical_node *) (*sln)->item; + lid = segment->private; + if (lid->start_sect == start_sect) { + if (lid->nr_sects == nr_sects) { + rc = segment; + break; + } + } + sln = &(*sln)->next; + } + } + return (rc); +} + +static int +add_segment_to_disk(struct evms_logical_node *disk, + unsigned char *label, struct evms_logical_node *segment) +{ + int rc = 0; + struct evms_list_node **devoln; + struct device_object *devo; + + devoln = lookup_device_object(disk); + if (*devoln == NULL) { + struct disk_object *dsko = NULL; + /* device object not in list, add device object */ + devo = kmalloc(sizeof (*devo), GFP_KERNEL); + if (devo) { + memset(devo, 0, sizeof (*devo)); + strncpy(devo->label, label, 6); + rc = evms_cs_add_item_to_list(devoln, devo); + } else { + rc = -ENOMEM; + } + + /* create a disk object */ + if (!rc) { + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL); + if (!dsko) { + rc = -ENOMEM; + } + } + if (!rc) { + memset(dsko, 0, sizeof (*dsko)); + /* add disk to disk object */ + dsko->disk = disk; + /* add disk object to disk object list + * in device object */ + rc = evms_cs_add_item_to_list(&devo->disk_object_list, + dsko); + } + if (!rc) { + devo->total_paths++; + } else { + /* on error clean up allocations */ + if 
(dsko) { + kfree(dsko); + } + if (*devoln) { + evms_cs_remove_item_from_list(devoln, devo); + } + if (devo) + kfree(devo); + } + } else { + devo = (struct device_object *) (*devoln)->item; + } + if (!rc) { + /* attempt to add segment */ + rc = evms_cs_add_item_to_list(&devo->segment_list, segment); + } + return (rc); +} + +static int +remove_segment_from_disk(struct evms_logical_node *disk, + struct evms_logical_node *segment, + struct evms_list_node **empty_disk_object_list) +{ + int rc = -1; + struct evms_list_node **devoln; + + *empty_disk_object_list = NULL; + devoln = lookup_device_object(disk); + if (*devoln) { + /* device object found in list */ + /* attempt to remove segment */ + struct device_object *devo; + devo = (struct device_object *) (*devoln)->item; + rc = evms_cs_remove_item_from_list(&devo->segment_list, + segment); + if (!rc) { + if (devo->segment_list == NULL) { + /* return disk object list to caller */ + *empty_disk_object_list = + devo->disk_object_list; + /* remove device object from list */ + rc = evms_cs_remove_item_from_list(devoln, + devo); + /* free device object */ + kfree(devo); + } + } + } + return (rc); +} + +/* function: s390_load_balance + * + * this function is used to route an IO to the appropriate + * paths of a multipath device. + * + * appropriate paths are determine used load balancing + * techniques. load balancing is accomplished by monitoring + * pending or in-flight IOs to each path. when a new IO + * request is received, all paths are examined, and the path + * with the fewest IOs pending is selected to receive the + * new request. + * + * this routine also utilizes some failed path recovery + * logic. + * + * if a failed path has been skipped for a given number + * (timeout value) of IO requests. it is then tried again, + * and if the path has become functional again, it returned + * to the active state and it becomes available for load + * balancing. + * + * if a new IO arrives and we find no currently active paths, + * each failed path will be attempted one time in the hopes + * that it may have become active from the time between when + * it was marked failed and now. only when all paths have + * been tried and found non-active, is the IO marked with + * an error and returned. + * + * this function works in concert with s390_end_io_callback + * function and the s390iod(aemon), to redrive failed IO + * requests. + * + */ +static void +s390_load_balance(struct s390_io **piot, struct evms_logical_node *disk) +{ + struct evms_list_node **dskoln; + struct disk_object *dsko, *selected_dsko = NULL; + int dskidx, path = 0; + struct s390_io *iot; + + /* allocate and initialize an IO tracking structure + * if one was not passed in. 
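+	 * When called from the end_io path to pick an alternate route, the
+	 * existing tracking structure is passed back in (with a NULL disk),
+	 * so the paths_tried mask is preserved across retries.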
+ */ + if (!*piot) { + struct evms_list_node **devoln; + /* allocate IO Track struct */ + *piot = evms_cs_allocate_from_pool(s390_io_track_pool, + EVMS_BLOCKABLE); + memset(*piot, 0, sizeof (*iot)); + /* find the device object */ + devoln = lookup_device_object(disk); + (*piot)->devo = (*devoln)->item; + } + iot = *piot; + + /* find next disk object based on current load */ + + /* check for failed paths that have timed-out */ + dskidx = 1; + dskoln = &iot->devo->disk_object_list; + while (*dskoln) { + dsko = (struct disk_object *) (*dskoln)->item; + do { + /* skip paths tried earlier */ + if (iot->paths_tried & dskidx) { + continue; + } + /* skip active disks */ + if (dsko->flags == S390_DISK_OK) { + continue; + } + /* skip disks that haven't timed-out yet */ + if (atomic_read(&dsko->skipped_ios) + < S390_FAILED_SKIP_COUNT) { + continue; + } + selected_dsko = dsko; + path = dskidx; + break; + } while (0); + dskoln = &(*dskoln)->next; + dskidx <<= 1; + } + + /* if we have no timed-out paths, then check for the + * path with lowest pending io count. if that path + * happens to be a failed path and there is active + * paths, increment the skipped io count, mark this + * path as having been selected, then go back and run + * the loop again, looking for the next best choice. + * continue this process until the best active has + * been selected, or we end up with the best failed + * path. + */ + if (!selected_dsko) { + int paths_selected, have_actives; + paths_selected = 0; + s390_repeat_active_search: + path = 0; + have_actives = FALSE; + dskidx = 1; + dskoln = &iot->devo->disk_object_list; + while (*dskoln) { + dsko = (struct disk_object *) (*dskoln)->item; + do { + /* skip paths tried earlier */ + if (iot->paths_tried & dskidx) { + continue; + } + /* skip previously selected disks */ + if (paths_selected & dskidx) { + continue; + } + /* remember if we have active disks */ + if (dsko->flags == S390_DISK_OK) { + have_actives = TRUE; + } + /* look for disk with smallest + * pending IO count. + */ + if (selected_dsko) { + if (atomic_read(&dsko->pending_ios) + >= + (atomic_read + (&selected_dsko->pending_ios))) { + continue; + } + } + selected_dsko = dsko; + path = dskidx; + } while (0); + dskoln = &(*dskoln)->next; + dskidx <<= 1; + } + /* if we have unselected active paths + * and the currently selected path is + * failed, increment its skipped io count, + * and then go back to find an active path. + * + * this loop is structured this way so that + * we can accurately determine and track when + * a path has been skipped. + */ + if (have_actives && selected_dsko) { + if (selected_dsko->flags & S390_DISK_FAILED) { + atomic_inc(&selected_dsko->skipped_ios); + paths_selected |= path; + selected_dsko = NULL; + goto s390_repeat_active_search; + } + } + } + + /* if we have a selected path, perform the necessary + * bookkeeping on it. + */ + if (selected_dsko) { + atomic_set(&selected_dsko->skipped_ios, 0); + atomic_inc(&selected_dsko->pending_ios); + atomic_inc(&selected_dsko->total_ios); + iot->paths_tried |= path; + } + /* store the selected path (disk object) in the + * IO tracking structure, for examination by the + * caller. 
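+	 * If every path has already been tried and failed, iot->dsko is left
+	 * NULL and the caller fails the request (init_io returns -EIO; the
+	 * read and write paths report that no active paths remain).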
+ */
+ iot->dsko = selected_dsko;
+}
+
+static void
+s390_end_io_callback(void *private, struct buffer_head *bh, int uptodate, int *done)
+{
+ struct s390_io *iot;
+ ulong flags;
+
+ iot = private;
+
+ /* update the disk object's status */
+// spin_lock_irqsave(iot->devo->device_object_lock, flags);
+ atomic_dec(&iot->dsko->pending_ios);
+ iot->dsko->flags = !uptodate;
+// spin_unlock_irqrestore(iot->devo->device_object_lock, flags);
+
+ if (!uptodate) {
+ atomic_inc(&iot->dsko->failed_ios);
+ /* encountered error */
+
+ /* is this a multipath device? */
+ if (iot->devo->total_paths > 1) {
+ /* yes, it's a multipath device */
+
+ /* determine alternate path */
+ s390_load_balance(&iot, NULL);
+ if (iot->dsko) {
+ /* queue up redrive request */
+ spin_lock_irqsave(&s390_redrive_list_lock, flags);
+ iot->next = s390_redrive_list;
+ s390_redrive_list = iot;
+ spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
+ /* wake up redrive daemon */
+ evms_cs_wakeup_thread(s390_io_redrive_thread);
+
+ /* prevent the end_io from reaching the caller of EVMS */
+ *done = TRUE;
+ }
+ }
+ }
+ if (*done == FALSE) {
+ evms_cs_deallocate_to_pool(s390_io_track_pool, iot);
+ }
+}
+
+/****************************************************
+* Function: s390iod
+*
+* This is a kernel thread that redrives I/O requests which failed on one
+* path of a multipath device. Failed requests are queued on s390_redrive_list
+* by s390_end_io_callback and are reissued here on the alternate path chosen
+* by s390_load_balance.
+*
+*****************************************************/
+static void
+s390iod(void *data)
+{
+ struct s390_io *iot;
+ unsigned long flags;
+ int rc;
+
+ while (1) {
+ spin_lock_irqsave(&s390_redrive_list_lock, flags);
+ if (s390_redrive_list == NULL) {
+ spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
+ break;
+ }
+ iot = s390_redrive_list;
+ s390_redrive_list = iot->next;
+ iot->next = NULL;
+ spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
+
+ /* register for callback */
+ rc = evms_cs_register_for_end_io_notification(iot, iot->bh, s390_end_io_callback);
+ if (rc) {
+ LOG_ERROR("error(%d): unable to register for end io callback!\n", rc);
+ } else {
+ /* redrive IO */
+ if (!iot->rw_flag) {
+ R_IO(iot->dsko->disk, iot->bh);
+ } else {
+ W_IO(iot->dsko->disk, iot->bh);
+ }
+ }
+ }
+}
+
+/*
+ * Function: s390_process_segment
+ */
+static int
+s390_process_segment(struct evms_logical_node **discover_list,
+ struct evms_logical_node *node,
+ unsigned char *label,
+ u64 start_sect,
+ u64 nr_sects, unsigned char type, int part_num)
+{
+ struct local_instance_data *InstData = NULL;
+ struct evms_logical_node *segment;
+ int rc = 0;
+
+ segment = find_segment_on_disk(node, start_sect, nr_sects);
+ if (segment) {
+ LOG_DETAILS("exporting segment '%s'.\n", segment->name);
+ } else {
+ InstData = kmalloc(sizeof (*InstData), GFP_KERNEL);
+ if (InstData) {
+ memset(InstData, 0, sizeof (*InstData));
+ InstData->source_disk = node;
+ InstData->start_sect = start_sect;
+ InstData->nr_sects = nr_sects;
+ InstData->type = type;
+ rc = evms_cs_allocate_logical_node(&segment);
+ } else {
+ rc = -ENOMEM;
+ }
+
+ if (!rc) {
+ segment->plugin = &plugin_header;
+ segment->system_id = (unsigned int) type;
+ segment->total_vsectors = nr_sects;
+ segment->block_size = node->block_size;
+ segment->hardsector_size = node->hardsector_size;
+ segment->private = InstData;
+ segment->flags = node->flags;
+ strcpy(segment->name, node->name);
+ sprintf(segment->name + strlen(segment->name), "%d", part_num);
+ LOG_DETAILS("creating segment '%s'.\n", segment->name);
+ rc = add_segment_to_disk(node, label, segment);
+ if (rc) {
+ LOG_ERROR
+ ("%s: error(%d)
adding segment '%s'!\n", + __FUNCTION__, rc, segment->name); + rc = 0; + } else { + MOD_INC_USE_COUNT; + } + } + if (rc) { + if (InstData) + kfree(InstData); + if (segment) + evms_cs_deallocate_logical_node(segment); + } + } + if (!rc) { + evms_cs_add_logical_node_to_list(discover_list, segment); + exported_nodes++; + } + return rc; +} + +typedef enum { + ibm_partition_lnx1 = 0, + ibm_partition_vol1 = 1, + ibm_partition_cms1 = 2, + ibm_partition_none = 3 +} ibm_partition_t; + +static char *part_names[] = { + [ibm_partition_lnx1] = "LNX1", + [ibm_partition_vol1] = "VOL1", + [ibm_partition_cms1] = "CMS1", + [ibm_partition_none] = "(nonl)" +}; + +static ibm_partition_t +get_partition_type(char *type) +{ + int i; + for (i = 0; i < 3; i++) { + if (!strncmp(type, part_names[i], 4)) + break; + } + return i; +} + +/* + * compute the block number from a + * cyl-cyl-head-head structure + */ +static inline int +cchh2blk(cchh_t * ptr, struct hd_geometry *geo) +{ + return ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors; +} + +/* + * compute the block number from a + * cyl-cyl-head-head-block structure + */ +static inline int +cchhb2blk(cchhb_t * ptr, struct hd_geometry *geo) +{ + int block = 0; + + block = ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors; + if (ptr->b) { + block += ptr->b - 1; + } + + return block; +} + +static void +print_mem(void *buffer, int length) +{ + int i, done; + unsigned char *bufptr; + + bufptr = (unsigned char *) buffer; + i = done = 0; + while (!done) { + if ((i % 16) == 0) + printk(KERN_INFO "\n0x%p->", buffer + i); + printk(KERN_INFO "%02x ", bufptr[i]); + if (++i >= length) + done++; + } + printk(KERN_INFO "\n"); +} + +static int +s390_probe_multipath(struct evms_logical_node *disk, + unsigned char *label, + u64 label_lba, int label_offset, unsigned char *org_buf) +{ + int rc = FALSE; + struct evms_list_node **devoln; + unsigned char *sector_buf = NULL; + + LOG_ENTRY_EXIT("%s: Entry\n", __FUNCTION__); + /* check if this disk is already known. + * if it is already in our device list + * then we don't need to check for + * multipath associations. + */ + devoln = lookup_device_object(disk); + /* is this disk in our list? */ + if (*devoln) { + struct device_object *devo; + struct disk_object *dsko; + /* yes, disk already known */ + + /* we need to determine if this + * is our first path to this + * device. + */ + devo = (struct device_object *) (*devoln)->item; + /* if this is the first path to this + * device, return FALSE so the main + * routine will process its segments. + * if this is not the first path, + * return TRUE so the main routine + * will not process its segments. + */ + dsko = (struct disk_object *) devo->disk_object_list->item; + if (dsko->disk != disk) { + rc = TRUE; + } + /* only print multipath log msgs if its + * active on this device. 
+ */
+ if (devo->total_paths > 1) {
+ LOG_DEBUG("skipping probe of known multipath device '%s'.\n", disk->name);
+ }
+ LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc);
+ return (rc);
+ }
+
+ /* search device object list for a matching label */
+ devoln = NULL;
+ while (*(devoln = lookup_label(label, devoln))) {
+ struct device_object *devo;
+ struct disk_object *dsko;
+ unsigned char org_label[6];
+#define S390_TEST_LABEL "~!@#$"
+
+ /* yes, found matching label */
+ if (!sector_buf) {
+ /* allocate buffer for incoming label sector */
+ sector_buf = kmalloc(disk->hardsector_size, GFP_KERNEL);
+ if (!sector_buf) {
+ rc = -ENOMEM;
+ break;
+ }
+ }
+
+ /* save original label */
+ memcpy(org_label, org_buf + label_offset, 6);
+ /* alter label to test pattern */
+ strcpy(org_buf + label_offset, S390_TEST_LABEL);
+ /* write test pattern to this disk */
+ LOG_DEBUG("writing test label to '%s'.\n", disk->name);
+ rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
+ if (rc) {
+ LOG_ERROR("error(%d) writing sector("PFU64") to '%s'.\n",
+ rc, label_lba, disk->name);
+ break;
+ }
+
+ /* read label from device object with matching label */
+ devo = (struct device_object *) (*devoln)->item;
+ dsko = (struct disk_object *) devo->disk_object_list->item;
+ LOG_DEBUG("reading label from '%s'.\n", dsko->disk->name);
+ rc = INIT_IO(dsko->disk, READ, label_lba, 1, sector_buf);
+ if (rc) {
+ LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
+ rc, label_lba, dsko->disk->name);
+ }
+
+ /* restore original label */
+ memcpy(org_buf + label_offset, org_label, 6);
+ LOG_DEBUG("restoring original label to '%s'.\n", disk->name);
+ rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
+ if (rc) {
+ LOG_ERROR("error(%d) writing sector("PFU64") to '%s'.\n",
+ rc, label_lba, disk->name);
+ break;
+ }
+
+ LOG_DEBUG("checking label: %s(%6s), reference(%6s).\n",
+ dsko->disk->name, sector_buf + label_offset, S390_TEST_LABEL);
+ if (!strcmp(sector_buf + label_offset, S390_TEST_LABEL)) {
+ LOG_DETAILS("assigning '%s' as first path to device.\n",
+ dsko->disk->name);
+ LOG_DETAILS("assigning '%s' as next path to device.\n",
+ disk->name);
+ /* store this disk in the disk object's
+ * disk list.
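+ * That is, a new disk_object for this path is added to the device_object
+ * that owns the matching label, and total_paths is bumped so load
+ * balancing will consider it.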
+ */ + /* create a disk object */ + dsko = NULL; + if (!rc) { + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL); + if (!dsko) { + rc = -ENOMEM; + } + } + if (!rc) { + memset(dsko, 0, sizeof (*dsko)); + /* add disk to disk object */ + dsko->disk = disk; + /* add disk object to disk object list + * in device object */ + rc = evms_cs_add_item_to_list(&devo-> + disk_object_list, + dsko); + } + if (!rc) { + devo->total_paths++; + } else { + if (dsko) { + kfree(dsko); + } + break; + } + + /* indicate we found a multipath device */ + rc = TRUE; + break; + } + } + if (sector_buf) { + kfree(sector_buf); + } + + LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc); + return (rc); +} + +static int +s390_probe_for_segments(struct evms_logical_node **discover_list, + struct evms_logical_node *disk) +{ + char type[5] = { 0, }, name[7] = { + 0,}; + int rc, vsects_per_hardsect = 0; + unsigned int blk; + u64 io_start, label_lba = 3; + dasd_information_t *info = NULL; + struct hd_geometry *geo = NULL; + unchar *data = NULL; + + /* allocate space for DASD ioctl packet + */ + info = kmalloc(sizeof (dasd_information_t), GFP_KERNEL); + if (info) { + memset(info, 0, sizeof (dasd_information_t)); + LOG_DEBUG("probing '%s' for 390 DASD info...\n", disk->name); + /* issue DASD info ioctl + */ + rc = evms_cs_kernel_ioctl(disk, BIODASDINFO, + (unsigned long) info); + if (rc) { + LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc); + LOG_DEBUG("assuming '%s' is not a valid 390 device!\n", + disk->name); + } + } else { + rc = -ENOMEM; + } + + if (!rc) { + /* if we successfully completed the previous + * get DASD info ioctl, we will assume that + * the device is a valid 390 disk. + * + * remove it from the discover list. + */ + rc = evms_cs_remove_logical_node_from_list(discover_list, disk); + if (rc) { + LOG_ERROR + ("error(%d) removing disk(%s) from discover list.\n", + rc, disk->name); + } + } + if (!rc) { + /* allocate space for the geometry packet + */ + geo = kmalloc(sizeof (struct hd_geometry), GFP_KERNEL); + if (!geo) { + rc = -ENOMEM; + } + } + if (!rc) { + memset(geo, 0, sizeof (struct hd_geometry)); + /* issue the Get GEO ioctl + */ + rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO, + (unsigned long) geo); + if (rc) { + LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc); + } + } + if (!rc) { + /* retrieve the vsects_per_hardsect (hardsector size) + */ + vsects_per_hardsect = disk->hardsector_size; + vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT; + data = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL); + if (!data) { + rc = -ENOMEM; + } + } + if (!rc) { + /* go read the 1st block on the disk + */ + label_lba = info->label_block * vsects_per_hardsect; + io_start = label_lba; + rc = INIT_IO(disk, READ, io_start, 1, data); + if (rc) { + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n", + rc, io_start, disk->name); + } else { +// print_mem(data, EVMS_VSECTOR_SIZE); + } + } + if (!rc) { + int offset, size, psize, counter = 0, label_offset; + int vstart = 0, vend = 0; + int vtoc_record_count, vtoc_index; + format1_label_t f1; + format4_label_t *f4; + volume_label_t vlabel; + ibm_partition_t partition_type; + + /* determine the format type + */ + + strncpy(type, data, 4); + if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) { + label_offset = 8; + } else { + label_offset = 4; + } + strncpy(name, data + label_offset, 6); + memcpy(&vlabel, data, sizeof (volume_label_t)); + + EBCASC(type, 4); + EBCASC(name, 6); + partition_type = get_partition_type(type); + LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n", + 
type, part_names[partition_type], name); + + rc = s390_probe_multipath(disk, name, label_lba, label_offset, + data); + if (!rc) { + switch (partition_type) { + case ibm_partition_cms1: + if (*((long *) data + 13) != 0) { + /* disk is reserved minidisk */ + long *label = (long *) data; + vsects_per_hardsect = + label[3] >> EVMS_VSECTOR_SIZE_SHIFT; + offset = label[13]; + size = + (label[7] - + 1) * vsects_per_hardsect; + LOG_DEBUG("(MDSK)"); + } else { + offset = info->label_block + 1; + size = disk->total_vsectors; + } + offset *= vsects_per_hardsect; + /* adjust for 0 thru label block offset + */ + size -= offset; + rc = s390_process_segment(discover_list, + disk, + name, + offset, size, 0, 1); + break; + case ibm_partition_lnx1: + case ibm_partition_none: + offset = info->label_block + 1; + offset *= vsects_per_hardsect; + size = disk->total_vsectors; + /* adjust for 0 thru label block offset + */ + size -= offset; + rc = s390_process_segment(discover_list, + disk, + name, + offset, size, 0, 1); + break; + case ibm_partition_vol1: + /* set max dscb record count == single track till we see the vtoc descriptor */ + vtoc_record_count = geo->sectors; + /* set current index into vtoc */ + vtoc_index = 0; + /* get block number and read then first dscb */ + blk = cchhb2blk(&vlabel.vtoc, geo); + io_start = blk * vsects_per_hardsect; + rc = INIT_IO(disk, READ, io_start, 1, data); + if (rc) { + LOG_ERROR + ("error(%d) reading sector("PFU64") from '%s'.\n", + rc, io_start, disk->name); + break; + } else { + // print_mem(data, EVMS_VSECTOR_SIZE); + } + memcpy(&f1, data, sizeof (format1_label_t)); + + // read vtoc records ... terminate when : + // (1) we hit first NULL record + // (2) we get an error processing a vtoc record + // (3) we run out of vtoc records to process + while (f1.DS1FMTID != 0x00 && rc == 0 + && vtoc_index < vtoc_record_count) { + if (f1.DS1FMTID == 0xf4) { // vtoc descriptor + f4 = (format4_label_t *) data; + vstart = + cchh2blk(&f4->DS4VTOCE. + llimit, geo); + vend = + cchh2blk(&f4->DS4VTOCE. 
+ ulimit, geo); + vtoc_record_count = + (vend - vstart) + + geo->sectors; + } else if (f1.DS1FMTID == 0xf1) { // dataset descriptor + + offset = + cchh2blk(&f1.DS1EXT1.llimit, + geo); + psize = + cchh2blk(&f1.DS1EXT1.ulimit, + geo) - offset + + geo->sectors; + + counter++; + rc = s390_process_segment + (discover_list, disk, name, + offset * + vsects_per_hardsect, + psize * + vsects_per_hardsect, 0, + counter); + } + if (!rc) { // get next dscb + ++vtoc_index; + ++blk; + io_start = + blk * vsects_per_hardsect; + rc = INIT_IO(disk, READ, + io_start, 1, data); + if (rc) { + LOG_ERROR + ("error(%d) reading sector("PFU64") from '%s'.\n", + rc, io_start, + disk->name); + break; + } else { + // print_mem(data, EVMS_VSECTOR_SIZE); + } + memcpy(&f1, data, + sizeof + (format1_label_t)); + } + } + break; + default: + rc = s390_process_segment(discover_list, + disk, name, + 0, 0, 0, 1); + break; + } + } + } + if (info) { + kfree(info); + } + if (geo) { + kfree(geo); + } + if (data) + kfree(data); + + return (rc); +} + +/* + * Function: s390_partition_discover + * + */ +static int +s390_partition_discover(struct evms_logical_node **discover_list) +{ + int rc = 0; + struct evms_logical_node *node, *next_node; + + MOD_INC_USE_COUNT; + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__); + + /* initialize global variable */ + exported_nodes = 0; + + /* examine each node on the discover list */ + next_node = *discover_list; + while (next_node) { + node = next_node; + next_node = node->next; + if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER) + /* only process disk nodes + */ + continue; + if (node->iflags & EVMS_TOP_SEGMENT) + continue; + s390_probe_for_segments(discover_list, node); + } + + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n", + __FUNCTION__, exported_nodes, rc); + if (exported_nodes) + rc = exported_nodes; + MOD_DEC_USE_COUNT; + return (rc); +} + +/* + * Function: s390_partition_delete + * + */ +static int +s390_partition_delete(struct evms_logical_node *segment) +{ + int rc = 0; + struct local_instance_data *LID; + + LOG_DETAILS("deleting segment '%s'.\n", segment->name); + + if (!segment) { + rc = -ENODEV; + } else { + struct evms_list_node *empty_disk_object_list = NULL; + LID = segment->private; + if (LID) { + /* remove the segment from the + * disk's segment list + */ + rc = remove_segment_from_disk(LID->source_disk, + segment, + &empty_disk_object_list); + /* free the local instance data */ + kfree(LID); + } + /* free the segment node */ + evms_cs_deallocate_logical_node(segment); + MOD_DEC_USE_COUNT; + /* if the last segment on the disk was + * deleted, delete the disk node(s) too + */ + while (empty_disk_object_list) { + struct disk_object *dsko; + dsko = + (struct disk_object *) empty_disk_object_list->item; + rc = evms_cs_remove_item_from_list + (&empty_disk_object_list, dsko); + if (!rc) { + rc = DELETE(dsko->disk); + if (rc) { + LOG_ERROR + ("error(%d): attempting to delete '%s'.\n", + rc, dsko->disk->name); + rc = 0; + } + } + kfree(dsko); + } + } + return (rc); +} + +/* + * function: s390_partition_io_error + * + * this function was primarily created because the function + * buffer_IO_error is inline and kgdb doesn't allow breakpoints + * to be set on inline functions. Since this was an error path + * and not mainline, I decided to add a trace statement to help + * report on the failing condition. 
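+ * rc == 1 means the request extends beyond the segment boundary; rc == 2
+ * means no active path was available to drive the request.  In either case
+ * the buffer head is completed with an error.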
+ * + */ +static void +s390_partition_io_error(int rc, + struct evms_logical_node *node, + int io_flag, struct buffer_head *bh) +{ + switch (rc) { + case 1: + LOG_SERIOUS + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n", + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, + node->name, bh->b_rsector); + break; + case 2: + LOG_ERROR + ("%s error(no active paths) on '%s' to drive the I/O.\n", + (io_flag) ? "WRITE" : "READ", node->name); + break; + } + + bh->b_end_io(bh, 0); +} + +/* + * Function: s390_partition_read + * + */ +static void +s390_partition_read(struct evms_logical_node *partition, struct buffer_head *bh) +{ + struct local_instance_data *LID = partition->private; + struct s390_io *iot = NULL; + int rc = 1; + + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= + partition->total_vsectors) { + bh->b_rsector += LID->start_sect; + + s390_load_balance(&iot, LID->source_disk); + if (iot->dsko) { + iot->segment = partition; + iot->bh = bh; + iot->rw_flag = READ; + /* register the callback */ + evms_cs_register_for_end_io_notification(iot, bh, + s390_end_io_callback); + /* drive the IO */ + R_IO(iot->dsko->disk, bh); + return; + } else { + rc = 2; + } + } + s390_partition_io_error(rc, partition, READ, bh); +} + +/* + * Function: s390_partition_write + * + */ +static void +s390_partition_write(struct evms_logical_node *partition, + struct buffer_head *bh) +{ + struct local_instance_data *LID = partition->private; + struct s390_io *iot = NULL; + int rc = 1; + + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= + partition->total_vsectors) { + bh->b_rsector += LID->start_sect; + + s390_load_balance(&iot, LID->source_disk); + if (iot->dsko) { + iot->segment = partition; + iot->bh = bh; + iot->rw_flag = WRITE; + /* register the callback */ + evms_cs_register_for_end_io_notification(iot, bh, + s390_end_io_callback); + /* drive the IO */ + W_IO(iot->dsko->disk, bh); + return; + } else { + rc = 2; + } + } + s390_partition_io_error(rc, partition, WRITE, bh); +} + +/* + * Function: s390_partition_init_io + * + */ +static int +s390_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */ + u64 sect_nr, /* disk LBA */ + u64 num_sects, /* # of sectors */ + void *buf_addr) +{ /* buffer address */ + int rc; + struct local_instance_data *LID = partition->private; + struct s390_io *iot = NULL; + + if ((sect_nr + num_sects) <= partition->total_vsectors) { + do { + s390_load_balance(&iot, LID->source_disk); + if (!iot->dsko) { + rc = -EIO; + break; + } + rc = INIT_IO(iot->dsko->disk, io_flag, + sect_nr + LID->start_sect, num_sects, + buf_addr); + /* do disk object IO bookkeeping */ + atomic_dec(&iot->dsko->pending_ios); + if (rc == -EIO) { + atomic_inc(&iot->dsko->failed_ios); + iot->dsko->flags = S390_DISK_FAILED; + } else { + iot->dsko->flags = S390_DISK_OK; + } + } while (rc == -EIO); + evms_cs_deallocate_to_pool(s390_io_track_pool, iot); + } else { + LOG_SERIOUS + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n", + (io_flag) ? 
"WRITE" : "READ", partition->name, + (LID->nr_sects - 1), sect_nr, num_sects); + rc = -EINVAL; + } + + return (rc); +} + +/* + * Function: s390_partition_ioctl + * + */ +static int +s390_partition_ioctl(struct evms_logical_node *partition, + struct inode *inode, + struct file *file, unsigned int cmd, unsigned long arg) +{ + struct local_instance_data *LID; + struct hd_geometry hd_geo; + int rc; + + rc = 0; + LID = partition->private; + if (!inode) + return -EINVAL; + switch (cmd) { + case HDIO_GETGEO: + { + rc = IOCTL(LID->source_disk, inode, file, cmd, arg); + if (rc) + break; + if (copy_from_user + (&hd_geo, (void *) arg, + sizeof (struct hd_geometry))) + rc = -EFAULT; + if (rc) + break; + hd_geo.start = LID->start_sect; + if (copy_to_user + ((void *) arg, &hd_geo, + sizeof (struct hd_geometry))) + rc = -EFAULT; + } + break; + case EVMS_GET_BMAP: + { + struct evms_get_bmap_pkt *bmap = + (struct evms_get_bmap_pkt *) arg; + bmap->rsector += LID->start_sect; + /* intentionally fall thru to + * default ioctl down to device + * manager. + */ + } + default: + rc = IOCTL(LID->source_disk, inode, file, cmd, arg); + } + return rc; +} + +/* + * Function: s390_part_init + * + */ +static int __init +s390_part_init(void) +{ + const char *name = "evms_s390iod"; + + /* create s390 IODaemon thread */ + s390_io_redrive_thread = evms_cs_register_thread(s390iod, NULL, name); + /* create pool of IO tracking structures */ + s390_io_track_pool = + evms_cs_create_pool(sizeof (struct s390_io), "EVMS_s390_IO", NULL, + NULL); + + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */ +} + +static void __exit +s390_part_exit(void) +{ + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(s390_part_init); +module_exit(s390_part_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif diff -Naur linux-2002-09-30/drivers/evms/snapshot.c evms-2002-09-30/drivers/evms/snapshot.c --- linux-2002-09-30/drivers/evms/snapshot.c Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/drivers/evms/snapshot.c Wed Sep 25 16:53:00 2002 @@ -0,0 +1,2796 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/drivers/evms/snapshot.c + * + * EVMS SnapShot Feature. + * + * This feature provides the ability to Snapshot ANY existing EVMS volume + * (including compatibility) to a new EVMS volume that is created when the + * SnapShot is enabled. This feature will appear in the call stack for both + * the original and the snapshot volume. + */ + +#define LOG_PREFIX "snapshot: " + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct proc_dir_entry * snap_proc = NULL; +static unsigned int snapshot_count = 0; /* Number of active snapshots and originals. */ + +/* Memory pools. 
*/ +static kmem_cache_t * snap_page_slab = NULL; +static mempool_t * snap_page_pool = NULL; +static kmem_cache_t * snap_buffer_slab = NULL; +static mempool_t * snap_buffer_pool = NULL; +static kmem_cache_t * snap_async_org_io_slab = NULL; +static mempool_t * snap_async_org_io_pool = NULL; +static kmem_cache_t * snap_async_snap_io_slab = NULL; +static mempool_t * snap_async_snap_io_pool = NULL; +static kmem_cache_t * snap_hash_entry_slab = NULL; +static mempool_t * snap_hash_entry_pool = NULL; + +#ifdef SNAPSHOT_DEBUG +static struct async_org_io * debug_async_org_io_list = NULL; +static spinlock_t debug_async_org_io_list_lock = SPIN_LOCK_UNLOCKED; +#endif + +/* API prototypes */ +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list); +static int snap_delete_volume(struct evms_logical_node * node); +static void snap_read(struct evms_logical_node * node, + struct buffer_head * bh); +static void snap_write(struct evms_logical_node * node, + struct buffer_head * bh); +static int snap_init_io(struct evms_logical_node * node, int rw, + u64 sect_nr, u64 num_sects, void * buf_addr); +static int snap_ioctl(struct evms_logical_node * node, + struct inode * inode, struct file * file, + unsigned int cmd, unsigned long arg); + +/* Other functions that require prototypes. */ +static int add_snapshot(struct evms_logical_node * node, + struct snapshot_metadata * metadata, + struct evms_logical_node ** evms_node_list); +static int snap_proc_read(char * page, char ** start, off_t off, + int count, int * eof, void * data); +static void snapshot_do_rollback(void * volume); +static void snap_async_io_thread(void * volume); +void snap_read_chunk_cb(struct buffer_head * bh, int uptodate); +void snap_write_chunk_cb(struct buffer_head * bh, int uptodate); +void snap_cow_table_cb(struct buffer_head * bh, int uptodate); + +/* Snapshot plugin's function table and header. */ +static struct evms_plugin_fops function_table = { + .discover = snap_discover_volumes, + .delete = snap_delete_volume, + .read = snap_read, + .write = snap_write, + .init_io = snap_init_io, + .ioctl = snap_ioctl +}; + +static struct evms_plugin_header plugin_header = { + .id = SetPluginID(IBM_OEM_ID, + EVMS_ASSOCIATIVE_FEATURE, + EVMS_SNAPSHOT_FEATURE_ID), + .version = { + .major = EVMS_SNAPSHOT_VERSION_MAJOR, + .minor = EVMS_SNAPSHOT_VERSION_MINOR, + .patchlevel = EVMS_SNAPSHOT_VERSION_PATCHLEVEL + }, + .required_services_version = { + .major = EVMS_COMMON_SERVICES_MAJOR, + .minor = EVMS_COMMON_SERVICES_MINOR, + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL + }, + .fops = &function_table +}; + +/** + * convert_metadata - Perform endian conversion on a metadata sector. 
+ * @metadata: snapshot metadata sector + **/ +static void convert_metadata(struct snapshot_metadata * metadata) +{ + metadata->signature = le32_to_cpup(&metadata->signature); + metadata->CRC = le32_to_cpup(&metadata->CRC); + metadata->version.major = le32_to_cpup(&metadata->version.major); + metadata->version.minor = le32_to_cpup(&metadata->version.minor); + metadata->version.patchlevel = le32_to_cpup(&metadata->version.patchlevel); + metadata->flags = le32_to_cpup(&metadata->flags); + metadata->original_size = le64_to_cpup(&metadata->original_size); + metadata->lba_of_COW_table = le64_to_cpup(&metadata->lba_of_COW_table); + metadata->lba_of_first_chunk = le64_to_cpup(&metadata->lba_of_first_chunk); + metadata->chunk_size = le32_to_cpup(&metadata->chunk_size); + metadata->total_chunks = le32_to_cpup(&metadata->total_chunks); +} + +static void *slab_pool_alloc(int gfp_mask, void * data) +{ + return kmem_cache_alloc(data, gfp_mask); +} + +static void slab_pool_free(void * ptr, void * data) +{ + kmem_cache_free(data, ptr); +} + +/** + * allocate_snapshot_hash_entry + * @volume: Snapshot volume to get a new entry for. + * @org_chunk: Number of original chunk. + * @snap_chunk: Number of remap chunk. + * @chunk_state: see SNAP_CHUNK_* + * + * Get a snapshot_hash_entry from the pool and initialize. Accessing the + * free_hash_list is safe, since we only call this function while holding + * the snap_semaphore. + */ +static struct snapshot_hash_entry * +allocate_snapshot_hash_entry(struct snapshot_volume * volume, + u64 org_chunk, + u64 snap_chunk, + u32 chunk_state) +{ + struct snapshot_hash_entry * hash_entry; + + hash_entry = volume->free_hash_list; + if (hash_entry) { + volume->free_hash_list = hash_entry->next; + hash_entry->org_chunk = org_chunk; + hash_entry->snap_chunk = snap_chunk; + hash_entry->chunk_state = chunk_state; + hash_entry->snap_io = NULL; + hash_entry->next = NULL; + hash_entry->prev = NULL; + spin_lock_init(&hash_entry->chunk_state_lock); + } else { + /* Should never happen, since hash entries are max + * allocated at discovery time. + */ + BUG(); + } + + return hash_entry; +} + +/** + * insert_snapshot_hash_entry + * + * Insert a new entry into a snapshot hash chain, immediately following the + * specified entry. This function should not be used to add an entry into an + * empty list, or as the first entry in an existing list. For that case, use + * insert_snapshot_map_entry_at_head(). + */ +static int insert_snapshot_hash_entry(struct snapshot_hash_entry * entry, + struct snapshot_hash_entry * base) +{ + entry->next = base->next; + entry->prev = base; + base->next = entry; + if ( entry->next ) { + entry->next->prev = entry; + } + return 0; +} + +/** + * insert_snapshot_hash_entry_at_head + * + * Insert a new entry into a snapshot chain as the first entry in the chain. + */ +static int insert_snapshot_hash_entry_at_head(struct snapshot_hash_entry * entry, + struct snapshot_hash_entry ** head) +{ + entry->next = *head; + entry->prev = NULL; + *head = entry; + if ( entry->next ) { + entry->next->prev = entry; + } + return 0; +} + +/** + * set_snapshot_flags + * @snap_node: + * @set_flag: Flags to turn "on" in the metadata sector. + * @unset_flag: Flags to turn "off" in the metadata sector. + * + * Set the flags field in the metadata and write the metadata sector to + * the snapshot volume. The node passed in to this function should be the + * "lower" of the snapshot nodes, meaning the one consumed by the snapshot + * plugin, not the one exported from the plugin. 
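+ * The metadata lives in the third-to-last sector of that node; it is read,
+ * modified, and written back in place.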
+ * + * Appropriate values for the two flag parameters are: + * EVMS_SNAPSHOT_DISABLED + * EVMS_SNAPSHOT_FULL + * EVMS_SNAPSHOT_ROLLBACK + * EVMS_SNAPSHOT_ROLLBACK_COMP + */ +static int set_snapshot_flags(struct evms_logical_node * snap_node, + u32 set_flag, + u32 unset_flag) +{ + unsigned char data[EVMS_VSECTOR_SIZE] = {0}; + struct snapshot_metadata * metadata = (struct snapshot_metadata*)data; + + /* Read the metadata sector */ + if ( INIT_IO(snap_node, 0, snap_node->total_vsectors-3, 1, data) ) { + return -EIO; + } + + /* Set the appropriate flags. Do endian conversion on the fly. */ + metadata->flags |= cpu_to_le32p(&set_flag); + metadata->flags &= ~(cpu_to_le32p(&unset_flag)); + metadata->CRC = 0; + metadata->CRC = cpu_to_le32(evms_cs_calculate_crc(EVMS_INITIAL_CRC, + metadata, + sizeof(struct snapshot_metadata))); + + /* Write the metadata sector back to the volume. */ + if ( INIT_IO(snap_node, 1, snap_node->total_vsectors-3, 1, data) ) { + return -EIO; + } + + return 0; +} + +/** + * disable_snapshot + */ +static void disable_snapshot(struct snapshot_volume * snap_volume, + int update_metadata) +{ + LOG_ERROR("Disabling snapshot volume '%s'.\n", + snap_volume->exported_node->name); + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED; + if ( update_metadata ) { + set_snapshot_flags(snap_volume->logical_node, + EVMS_SNAPSHOT_DISABLED, 0); + } else { + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED_PENDING; + evms_cs_wakeup_thread(snap_volume->snapshot_org->async_io_thread); + } +} + +/** + * snap_discover_volumes + * + * Inspect the global node list, looking for volumes with a valid + * snapshot metadata sector. + */ +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list) +{ + struct evms_logical_node * node, * next_node; + struct snapshot_metadata * metadata = NULL; + int org_crc, final_crc, rc = 0; + + MOD_INC_USE_COUNT; + + /* A buffer for reading the metadata. */ + metadata = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL); + if (!metadata) { + MOD_DEC_USE_COUNT; + return -ENOMEM; + } + + /* Check every node on the discovery list. */ + for ( node = *evms_node_list; node && !rc; node = next_node ) { + next_node = node->next; + + /* This node must not be one we put back on the list already, + * and must have a feature header with snapshot's ID. + */ + if ( node->plugin->id == plugin_header.id || + ! node->feature_header || + node->feature_header->feature_id != plugin_header.id ) { + continue; + } + + /* Read third-to-last sector for the snapshot metadata. */ + rc = INIT_IO(node, 0, node->total_vsectors-3, 1, metadata); + if (rc) { + LOG_ERROR("IO error reading sector "PFU64" on '%s'.\n", + node->total_vsectors-3, node->name); + rc = -EVMS_FEATURE_FATAL_ERROR; + evms_cs_remove_logical_node_from_list(evms_node_list, + node); + DELETE(node); + continue; + } + + /* Check for a valid snapshot signature. */ + if ( le32_to_cpup(&metadata->signature) != + EVMS_SNAPSHOT_SIGNATURE ) { + continue; + } + evms_cs_remove_logical_node_from_list(evms_node_list, node); + + /* Check for a valid CRC. */ + org_crc = le32_to_cpup(&metadata->CRC); + metadata->CRC = 0; + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, metadata, + sizeof(struct snapshot_metadata)); + if ( final_crc != org_crc ) { + LOG_ERROR("CRC error in feature data on '%s'.\n", + node->name); + rc = -EVMS_FEATURE_FATAL_ERROR; + DELETE(node); + continue; + } + + /* Check for correct metadata version. 
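+		 * Only the major version number is compared; an on-disk major
+		 * level newer than the plugin's is treated as unsupported.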
*/ + convert_metadata(metadata); + if ( metadata->version.major > plugin_header.version.major ) { + LOG_ERROR("ERROR: unsuppoprted metadata version on '%s'.\n", + node->name); + rc = -EVMS_FEATURE_FATAL_ERROR; + DELETE(node); + continue; + } + + rc = add_snapshot(node, metadata, evms_node_list); + } + + kfree(metadata); + MOD_DEC_USE_COUNT; + return rc; +} + +/** + * check_quiesce + * + * Make sure an original volume and all of its snapshots are quiesced. + */ +static int check_quiesce(struct snapshot_volume * org_volume) +{ + struct snapshot_volume * next_vol; + + for ( next_vol = org_volume; + next_vol; + next_vol = next_vol->snapshot_next ) { + if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) { + LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n", + next_vol->logical_node->name); + return -EBUSY; + } + } + return 0; +} + +/** + * remove_snapshot_from_chain + * + * Remove the specified snapshot volume from its original's chain of snapshots. + */ +static int remove_snapshot_from_chain(struct snapshot_volume * snap_volume) +{ + struct snapshot_volume ** p_volume; + + if ( snap_volume->snapshot_org ) { + down_write(&snap_volume->snapshot_org->snap_semaphore); + for ( p_volume = &snap_volume->snapshot_org->snapshot_next; + *p_volume; + p_volume = &(*p_volume)->snapshot_next ) { + if ( *p_volume == snap_volume ) { + *p_volume = (*p_volume)->snapshot_next; + break; + } + } + up_write(&snap_volume->snapshot_org->snap_semaphore); + } + snap_volume->snapshot_org = NULL; + snap_volume->snapshot_next = NULL; + return 0; +} + +/** + * delete_snapshot_hash_chain + * + * Delete all items in a single chain in the hash table. + */ +static int delete_snapshot_hash_chain(struct snapshot_hash_entry * head) +{ + struct snapshot_hash_entry * next; + + while (head) { + next = head->next; + mempool_free(head, snap_hash_entry_pool); + head = next; + } + return 0; +} + +/** + * snapshot_delete_pools + * + * Delete all memory pools after all snapshots have been deleted. + * Also shutdown the daemon thread. + */ +static void snapshot_delete_pools(void) +{ + /* The pool of data pages. */ + if (snap_page_slab) { + if (snap_page_pool) { + mempool_destroy(snap_page_pool); + snap_page_pool = NULL; + } + kmem_cache_destroy(snap_page_slab); + snap_page_slab = NULL; + } + + /* The pool of snap_io_buffer's. */ + if (snap_buffer_slab) { + if (snap_buffer_pool) { + mempool_destroy(snap_buffer_pool); + snap_buffer_pool = NULL; + } + kmem_cache_destroy(snap_buffer_slab); + snap_buffer_slab = NULL; + } + + /* The pool of async_org_io's. */ + if (snap_async_org_io_slab) { + if (snap_async_org_io_pool) { + mempool_destroy(snap_async_org_io_pool); + snap_async_org_io_pool = NULL; + } + kmem_cache_destroy(snap_async_org_io_slab); + snap_async_org_io_slab = NULL; + } + + /* The pool of async_snap_io's. */ + if (snap_async_snap_io_slab) { + if (snap_async_snap_io_pool) { + mempool_destroy(snap_async_snap_io_pool); + snap_async_snap_io_pool = NULL; + } + kmem_cache_destroy(snap_async_snap_io_slab); + snap_async_snap_io_slab = NULL; + } + + /* The pool of hash table entries. */ + if (snap_hash_entry_slab) { + if (snap_hash_entry_pool) { + mempool_destroy(snap_hash_entry_pool); + snap_hash_entry_pool = NULL; + } + kmem_cache_destroy(snap_hash_entry_slab); + snap_hash_entry_slab = NULL; + } +} + +/** + * snapshot_create_pools + * + * Allocate all of the memory pools when the first snapshot is created. + * Also start up the daemon thread for processing async I/O's. 
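+ * Creation is skipped for any slab or pool that already exists, and a
+ * partial failure tears all of the pools back down before returning.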
+ */ +static int snapshot_create_pools(void) +{ + /* Pool of data pages. */ + if (!snap_page_slab) { + snap_page_slab = kmem_cache_create("snap_page_slab", + PAGE_SIZE, 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (snap_page_slab) { + snap_page_pool = mempool_create(1, slab_pool_alloc, + slab_pool_free, + snap_page_slab); + } + } + + /* Pool of snap_io_buffer's. */ + if (!snap_buffer_slab) { + snap_buffer_slab = kmem_cache_create("snap_bh_slab", + sizeof(struct snap_io_buffer), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (snap_buffer_slab) { + snap_buffer_pool = mempool_create(1, slab_pool_alloc, + slab_pool_free, + snap_buffer_slab); + } + } + + /* Pool of async_org_io's. */ + if (!snap_async_org_io_slab) { + snap_async_org_io_slab = kmem_cache_create("async_org_io_slab", + sizeof(struct async_org_io), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (snap_async_org_io_slab) { + snap_async_org_io_pool = mempool_create(1, slab_pool_alloc, + slab_pool_free, + snap_async_org_io_slab); + } + } + + /* Pool of async_snap_io's. */ + if (!snap_async_snap_io_slab) { + snap_async_snap_io_slab = kmem_cache_create("async_snap_io_slab", + sizeof(struct async_snap_io), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (snap_async_snap_io_slab) { + snap_async_snap_io_pool = mempool_create(1, slab_pool_alloc, + slab_pool_free, + snap_async_snap_io_slab); + } + } + + /* Pool of hash table entries. */ + if (!snap_hash_entry_slab) { + snap_hash_entry_slab = kmem_cache_create("snap_hash_slab", + sizeof(struct snapshot_hash_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (snap_hash_entry_slab) { + snap_hash_entry_pool = mempool_create(1, slab_pool_alloc, + slab_pool_free, + snap_hash_entry_slab); + } + } + + if ( ! snap_page_slab || ! snap_page_pool || + ! snap_buffer_slab || ! snap_buffer_pool || + ! snap_async_org_io_slab || ! snap_async_org_io_pool || + ! snap_async_snap_io_slab || ! snap_async_snap_io_pool || + ! snap_hash_entry_slab || ! snap_hash_entry_pool ) { + LOG_CRITICAL("No memory available to create snapshot pools.\n"); + snapshot_delete_pools(); + return -ENOMEM; + } + + return 0; +} + +/** + * snap_delete_volume + * + * Delete the in-memory representation of a volume. The specified node + * can actually be either a snapshot or an original. Deleting a snapshot + * causes it to be removed from its original's chain of snapshots. + * + * For async snapshots, we will need to flush the COW table and mark the + * snapshot clean in the metadata. + */ +static int snap_delete_volume(struct evms_logical_node * node) +{ + struct snapshot_volume * volume = node->private; + struct snapshot_volume * org_volume = volume->snapshot_org; + struct snapshot_volume * next_vol; + int i, rc = 0; + + /* Don't delete a snapshot that's rolling back. */ + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK && + ! (volume->flags & EVMS_SNAPSHOT_DISABLED) ) { + LOG_ERROR("Can't delete '%s' during snapshot rollback.", + node->name); + return -EBUSY; + } + + /* Delete the instance data. */ + if (volume) { + if ( volume->flags & EVMS_SNAPSHOT ) { + /* This node is a snapshot. Check that this snapshot and + * its original have been quiesced. For async snapshots, + * make sure there are no outstanding remaps in + * progress. Then remove it from the original's chain of + * snapshots. + */ + if ( ! (volume->flags & EVMS_SNAPSHOT_QUIESCED) ) { + LOG_ERROR("Can't delete snapshot, snapshot volume '%s' not quiesced.\n", + volume->logical_node->name); + return -EBUSY; + } + if ( org_volume && + ! 
(org_volume->flags & EVMS_SNAPSHOT_QUIESCED) ) { + LOG_ERROR("Can't delete snapshot, original volume '%s' not quiesced.\n", + org_volume->logical_node->name); + return -EBUSY; + } + + remove_snapshot_from_chain(volume); + + /* If we just deleted the only/last snapshot for this + * original, the original will not be modified. It is + * the engine's responsibility to delete the original + * and rediscover in order to clear it of its snapshot + * information. Even if that doesn't happen, the state + * of the kernel will still be safe. I/O's coming into + * this plugin for the original will just be passed + * down without any other action or modification. + */ + + /* Unregister the proc-fs entry for this node. */ + if (snap_proc) { + remove_proc_entry(node->volume_info->volume_name, + snap_proc); + } + } else { + /* This is an original. It's the engine's responsibility + * to delete all snapshots before deleting an original. + * Otherwise, a snapshot could be left pointing to an + * original that no longer exists. Thus, we just need to + * make sure there are no snapshots in the chain. + */ + rc = check_quiesce(volume); + if (rc) { + return rc; + } + + /* Shut down the async I/O thread. */ + if (volume->async_io_thread) { + evms_cs_unregister_thread(volume->async_io_thread); + } + + /* Loop through all snapshots left on this original, + * and NULL out their org pointer, in case they don't + * get deleted. + */ + for ( next_vol = volume->snapshot_next; next_vol; + next_vol = next_vol->snapshot_next ) { + next_vol->snapshot_org = NULL; + } + } + + /* Free up all memory used by the instance data, including + * the underlying node, the hash table, and the data buffer. + */ + if (volume->logical_node) { + rc = DELETE(volume->logical_node); + if (rc) { + return rc; + } + } + if (volume->snapshot_map) { + /* Delete all of the hash chains, + * then the actual table. + */ + for ( i = 0; i < volume->hash_table_size; i++ ) { + delete_snapshot_hash_chain(volume->snapshot_map[i]); + } + delete_snapshot_hash_chain(volume->free_hash_list); + vfree(volume->snapshot_map); + } + if (volume->chunk_data_buffer) { + kfree(volume->chunk_data_buffer); + } + if (volume->rollback_thread) { + evms_cs_unregister_thread(volume->rollback_thread); + } + + kfree(volume); + } + + evms_cs_deallocate_logical_node(node); + snapshot_count--; + + /* If there are no more snapshot objects, free up the memory pools. */ + if ( snapshot_count == 0 ) { + snapshot_delete_pools(); + } + + MOD_DEC_USE_COUNT; + + return 0; +} + +/** + * search_snapshot_hash_chain + * + * Search the hash chain that is anchored at the specified head pointer. If the + * chunk number is found, a pointer to that entry in the chain is set, and a 1 + * is returned. If the chunk is not found, a pointer to the previous entry is + * set and 0 is returned. If the return pointer is NULL, this means either the + * list is empty, or the specified sector should become the first list item. + */ +static int search_snapshot_hash_chain(u64 chunk, + struct snapshot_hash_entry * head, + struct snapshot_hash_entry ** result) +{ + struct snapshot_hash_entry * curr = head; + struct snapshot_hash_entry * prev = head; + while ( curr && curr->org_chunk < chunk ) { + prev = curr; + curr = curr->next; + } + if (!curr) { + /* Either an empty chain or went off the end of the chain. 
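+	 * In that case 'prev' marks the entry after which a new remap for
+	 * this chunk would be inserted (NULL when the chain is empty).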
*/ + *result = prev; + return 0; + } else if ( curr->org_chunk != chunk ) { + *result = curr->prev; + return 0; + } else { + *result = curr; + return 1; + } +} + +/** + * snapshot_remap_chunk + * + * Perform a sector remap on a snapshot volume. This should be called from the + * I/O read path, It first determines the base sector of the chunk containing + * the specified sector, and saves the remainder. Then it performs a search + * through the snapshot map for the specified volume. If a match is found, the + * sector number is changed to the new value. If no match is found, the value + * is left the same, meaning the read should proceed down the original volume. + */ +static int snapshot_remap_chunk(struct snapshot_volume * snap_volume, + struct buffer_head * bh) +{ + struct snapshot_hash_entry * result; + u64 chunk, sector = bh->b_rsector; + unsigned long remainder, hash_value; + unsigned long flags, queued = FALSE; + + remainder = sector & (u64)(snap_volume->chunk_size - 1); + chunk = sector >> snap_volume->chunk_shift; + hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size; + + if ( search_snapshot_hash_chain(chunk, + snap_volume->snapshot_map[hash_value], + &result) ) { + bh->b_rsector = (result->snap_chunk << snap_volume->chunk_shift) + + remainder; + if ( result->chunk_state != SNAP_CHUNK_COPIED ) { + /* If this chunk is in the middle of being copied, + * place this request on the pending list. + */ + spin_lock_irqsave(&result->chunk_state_lock, flags); + if ( result->chunk_state != SNAP_CHUNK_COPIED ) { + bh->b_reqnext = result->snap_io->pending_reads; + result->snap_io->pending_reads = bh; + if (!result->snap_io->dev) { + result->snap_io->dev = bh->b_rdev; + } + evms_cs_volume_request_in_progress(result->snap_io->dev, + +1, NULL); + queued = TRUE; + } + spin_unlock_irqrestore(&result->chunk_state_lock, flags); + } + + if (queued) { + return -1; + } else { + return 1; + } + } + return 0; +} + +/** + * snap_read + */ +static void snap_read(struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct snapshot_volume * volume = node->private; + u64 alignment; + int rc; + + /* Size check. */ + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + node->total_vsectors ) { + bh->b_end_io(bh, 0); + return; + } + + /* Can't read if rollback is in progress. */ + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) { + LOG_ERROR("Cannot read from snapshot '%s' during rollback.\n", + volume->logical_node->name); + bh->b_end_io(bh, 0); + return; + } + + /* On a read to the original, we can just pass it through completely + * untouched. Only reads to the snapshot can be remapped. + */ + if ( volume->flags & EVMS_SNAPSHOT_ORG ) { + R_IO(volume->logical_node, bh); + return; + } + + /* Lock the snapshot before processing the request. */ + down_read(&volume->snap_semaphore); + + /* Make sure the snapshot is not full/disabled, and that + * the original is present. + */ + if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) || + (! volume->snapshot_org) ) { + bh->b_end_io(bh, 0); + up_read(&volume->snap_semaphore); + return; + } + + /* Check for unaligned I/O. This is mostly to prevent XFS from + * sending a request that spans a chunk. 
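+	 * (i.e. reject any request whose byte offset is not a multiple of
+	 * its own size).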
+ */ + alignment = bh->b_rsector; + alignment <<= EVMS_VSECTOR_SIZE_SHIFT; + if ( unlikely(alignment & (bh->b_size - 1)) ) { + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n", + bh->b_rsector, bh->b_size, node->name); + bh->b_end_io(bh, 0); + up_read(&volume->snap_semaphore); + return; + } + + /* Check if this sector has been remapped. */ + rc = snapshot_remap_chunk(volume, bh); + if ( rc > 0 ) { + /* Sector was remapped. Send IO to the snapshot. */ + up_read(&volume->snap_semaphore); + R_IO(volume->logical_node, bh); + } else if ( rc < 0 ) { + /* Sector was remapped but queued to be driven later. */ + up_read(&volume->snap_semaphore); + } else { + /* Has not been remapped. Send IO to the original. */ + R_IO(volume->snapshot_org->logical_node, bh); + up_read(&volume->snap_semaphore); + } +} + + +/********** Asynchronous Snapshot I/O Code **********/ + + +/** + * snap_deallocate_buffer + */ +static void snap_deallocate_buffer(struct snap_io_buffer * buf, + int deallocate_page) +{ + if (buf) { + if (deallocate_page) { + mempool_free(buf->bh->b_data, snap_page_pool); + } + mempool_free(buf, snap_buffer_pool); + } +} + +/** + * snap_allocate_buffer + * + * Allocate a snap_io_buffer and a data page from the respective memory + * pools. Initialize as appropriate. + */ +static struct snap_io_buffer * snap_allocate_buffer(int allocate_page) +{ + struct snap_io_buffer * buf; + struct buffer_head * bh; + + /* Grab a snap_io_buffer from the pool. */ + buf = mempool_alloc(snap_buffer_pool, GFP_NOIO); + if (!buf) { + return NULL; + } + memset(buf, 0, sizeof(struct snap_io_buffer)); + bh = buf->bh = &buf->_bh; + + /* Grab a data page from the pool. */ + if (allocate_page) { + bh->b_data = mempool_alloc(snap_page_pool, GFP_NOIO); + if (!bh->b_data) { + snap_deallocate_buffer(buf, FALSE); + return NULL; + } + bh->b_page = virt_to_page(bh->b_data); + } + + /* Initialize the rest of the buffer. */ + bh->b_size = PAGE_SIZE; + bh->b_list = BUF_LOCKED; + bh->b_count = (atomic_t)ATOMIC_INIT(1); + bh->b_this_page = (struct buffer_head *)1; + bh->b_private = buf; + set_bit(BH_Dirty, &bh->b_state); + set_bit(BH_Lock, &bh->b_state); + set_bit(BH_Req, &bh->b_state); + set_bit(BH_Mapped, &bh->b_state); + set_bit(BH_Uptodate, &bh->b_state); + init_waitqueue_head(&bh->b_wait); + INIT_LIST_HEAD(&buf->chunk_write_list); + + return buf; +} + +/** + * snap_deallocate_buffer_list + * + * Free each buffer in the specified list. + */ +static void snap_deallocate_buffer_list(struct snap_io_buffer * buf_list, + int deallocate_pages) +{ + struct snap_io_buffer * buf, * buf_next; + + for ( buf = buf_list; buf; buf = buf_next ) { + buf_next = buf->buffer_next; + snap_deallocate_buffer(buf, deallocate_pages); + } +} + +/** + * snap_allocate_buffer_list + * + * Allocate a list of snap_io_buffer's which will be used to copy a chunk + * from the original to the snapshot. + */ +static struct snap_io_buffer * +snap_allocate_buffer_list(unsigned int count, + u64 starting_lba, + void (*callback)(struct buffer_head *, int), + int allocate_pages) +{ + struct snap_io_buffer * buf, * head = NULL; + struct snap_io_buffer ** tail = &head; + unsigned int i; + + for ( i = 0; i < count; i++ ) { + /* Get a buffer from the pool. */ + buf = snap_allocate_buffer(allocate_pages); + if (!buf) { + snap_deallocate_buffer_list(head, allocate_pages); + return NULL; + } + + /* Set the callback function and the sector value. 
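+		 * Buffer i covers the i-th PAGE_SIZE piece of the chunk.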
*/ + buf->bh->b_end_io = callback; + buf->bh->b_rsector = starting_lba + i * + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT); + + /* Add this buffer to the list to return. */ + *tail = buf; + tail = &buf->buffer_next; + } + + return head; +} + +/** + * deallocate_async_snap_io + */ +static void deallocate_async_snap_io(struct async_snap_io * async_snap_io) +{ + DEBUG_CHECK_SNAP_IO(async_snap_io); + + snap_deallocate_buffer(async_snap_io->cow_table_buffer, TRUE); + snap_deallocate_buffer_list(async_snap_io->copy_buffers, FALSE); + mempool_free(async_snap_io, snap_async_snap_io_pool); +} + +/** + * allocate_async_snap_io + * @snap_volume: The snapshot volume this chunk belongs to. + * @hash_entry: The entry in the hash table representing this chunk. + * @async_org_io: The parent async I/O structure. + * @snap_chunk_lba: The starting LBA on the snapshot for this chunk. + * @buffer_count: The number of buffers needed to copy this chunk. + * + * Allocate an async_snap_io structure from the pool and initialize. + * Create a list of buffer heads to use for the copy. + */ +static struct async_snap_io * +allocate_async_snap_io(struct snapshot_volume * snap_volume, + struct snapshot_hash_entry * hash_entry, + struct async_org_io * async_org_io, + u64 snap_chunk_lba, + unsigned int buffer_count) +{ + struct async_snap_io * async_snap_io; + + async_snap_io = mempool_alloc(snap_async_snap_io_pool, GFP_NOIO); + if (async_snap_io) { + memset(async_snap_io, 0, sizeof(struct async_snap_io)); + async_snap_io->snap_volume = snap_volume; + async_snap_io->hash_table_entry = hash_entry; + async_snap_io->org_io = async_org_io; + INIT_LIST_HEAD(&async_snap_io->snap_pending_io_list); + INIT_LIST_HEAD(&async_snap_io->cow_write_list); + async_snap_io->write_count = (atomic_t)ATOMIC_INIT(buffer_count); + + async_snap_io->cow_table_buffer = snap_allocate_buffer(TRUE); + if (async_snap_io->cow_table_buffer) { + /* The buffer for the COW table needs to be adjusted. */ + struct snap_io_buffer * buf = async_snap_io->cow_table_buffer; + buf->bh->b_size = EVMS_VSECTOR_SIZE; + buf->bh->b_end_io = snap_cow_table_cb; + buf->buffer_private = async_snap_io; + + async_snap_io->copy_buffers = + snap_allocate_buffer_list(buffer_count, + snap_chunk_lba, + snap_write_chunk_cb, + FALSE); + if (!async_snap_io->copy_buffers) { + deallocate_async_snap_io(async_snap_io); + async_snap_io = NULL; + } + } else { + deallocate_async_snap_io(async_snap_io); + async_snap_io = NULL; + } + } + return async_snap_io; +} + +/** + * deallocate_async_org_io + */ +static void deallocate_async_org_io(struct async_org_io * async_org_io) +{ + DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io); + + snap_deallocate_buffer_list(async_org_io->copy_buffers, TRUE); + mempool_free(async_org_io, snap_async_org_io_pool); +} + +/** + * allocate_async_org_io + * + * Allocate an async_org_io structure from the pool and initialize. + * Create a list of buffer heads to use for the copy. 
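+ * These buffers own their data pages; the snapshot-side buffers built for
+ * each snapshot later reuse the same pages when writing the chunk out.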
+ */ +static struct async_org_io * +allocate_async_org_io(struct snapshot_volume * org_volume, + u64 org_chunk_lba, + unsigned int buffer_count) +{ + struct async_org_io * async_org_io; + + async_org_io = mempool_alloc(snap_async_org_io_pool, GFP_NOIO); + if (async_org_io) { + DEBUG_ADD_ORG_IO_TO_LIST(async_org_io); + + memset(async_org_io, 0, sizeof(struct async_org_io)); + async_org_io->org_volume = org_volume; + spin_lock_init(&async_org_io->pending_writes_lock); + INIT_LIST_HEAD(&async_org_io->org_pending_io_list); + async_org_io->copy_count = (atomic_t)ATOMIC_INIT(0); + async_org_io->ref_count = (atomic_t)ATOMIC_INIT(1); + + async_org_io->copy_buffers = + snap_allocate_buffer_list(buffer_count, + org_chunk_lba, + snap_read_chunk_cb, + TRUE); + if (!async_org_io->copy_buffers) { + deallocate_async_org_io(async_org_io); + async_org_io = NULL; + } + } + return async_org_io; +} + +/** + * deallocate_async_io + * + * This function deletes the entire async I/O structure, including the + * async_org_io, all async_snap_io's, and all buffer heads and pages. + */ +static void deallocate_async_io(struct async_org_io * async_org_io) +{ + struct async_snap_io * async_snap_io, * next_snap_io; + + for ( async_snap_io = async_org_io->snap_io_list; + async_snap_io; + async_snap_io = next_snap_io ) { + next_snap_io = async_snap_io->snap_io_list_next; + deallocate_async_snap_io(async_snap_io); + } + deallocate_async_org_io(async_org_io); +} + +/** + * process_org_pending_io_list + * + * Grab the first item from the org_pending_io_list and send all + * waiting write requests to the original. + */ +static void process_org_pending_io_list(struct snapshot_volume * org_volume, + int * done) +{ + struct async_org_io * async_org_io; + struct buffer_head * bh; + unsigned long flags; + + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags); + if ( list_empty(&org_volume->org_pending_io_list) ) { + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock, + flags); + *done = TRUE; + } else { + async_org_io = ORG_PENDING_IO_ENTRY(org_volume->org_pending_io_list.next); + list_del(&async_org_io->org_pending_io_list); + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock, + flags); + + for ( bh = async_org_io->pending_writes; bh; + bh = async_org_io->pending_writes ) { + async_org_io->pending_writes = bh->b_reqnext; + bh->b_reqnext = NULL; + W_IO(async_org_io->org_volume->logical_node, bh); + evms_cs_volume_request_in_progress(async_org_io->dev, + -1, NULL); + } + + if ( atomic_dec_and_test(&async_org_io->ref_count) ) { + deallocate_async_io(async_org_io); + } + *done = FALSE; + } +} + + +/** + * process_snap_pending_io_list + * + * Grab the first item from the snap_pending_io_list and send + * all waiting read and write requests to the snapshot. 
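+ * Each request released here also drops the in-progress count that was
+ * taken for it when it was queued.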
+ */ +static void process_snap_pending_io_list(struct snapshot_volume * org_volume, + int * done) +{ + struct async_snap_io * async_snap_io; + struct buffer_head * bh; + unsigned long flags; + + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags); + if ( list_empty(&org_volume->snap_pending_io_list) ) { + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock, + flags); + } else { + async_snap_io = SNAP_PENDING_IO_ENTRY(org_volume->snap_pending_io_list.next); + list_del(&async_snap_io->snap_pending_io_list); + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock, + flags); + + /* Reads */ + for ( bh = async_snap_io->pending_reads; bh; + bh = async_snap_io->pending_reads ) { + async_snap_io->pending_reads = bh->b_reqnext; + bh->b_reqnext = NULL; + R_IO(async_snap_io->snap_volume->logical_node, bh); + evms_cs_volume_request_in_progress(async_snap_io->dev, + -1, NULL); + } + /* Writes */ + for ( bh = async_snap_io->pending_writes; bh; + bh = async_snap_io->pending_writes ) { + async_snap_io->pending_writes = bh->b_reqnext; + bh->b_reqnext = NULL; + W_IO(async_snap_io->snap_volume->logical_node, bh); + evms_cs_volume_request_in_progress(async_snap_io->dev, + -1, NULL); + } + + if ( atomic_dec_and_test(&async_snap_io->org_io->ref_count) ) { + deallocate_async_io(async_snap_io->org_io); + } + *done = FALSE; + } +} + +/** + * process_chunk_write_list + * + * Grab the first item from the chunk_write_list and send down + * writes to each snapshot of this original. + */ +static void process_chunk_write_list(struct snapshot_volume * org_volume, + int * done) +{ + struct snap_io_buffer * buf, * snap_buf; + struct async_snap_io * async_snap_io; + unsigned long flags; + + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags); + if ( list_empty(&org_volume->chunk_write_list) ) { + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock, + flags); + } else { + buf = CHUNK_WRITE_ENTRY(org_volume->chunk_write_list.next); + list_del(&buf->chunk_write_list); + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock, + flags); + + for ( snap_buf = buf->copy_next; snap_buf; + snap_buf = snap_buf->copy_next ) { + async_snap_io = snap_buf->buffer_private; + W_IO(async_snap_io->snap_volume->logical_node, + snap_buf->bh); + } + *done = FALSE; + } +} + +/** + * write_cow_table + * + * On S/390 machines, the hardsector size is usually 4k, and the driver won't + * accept I/O requests that are less than 4k in size. Thus, the COW table + * cannot be written as a single sector. We must first read in the entire + * 4k hardsector, overlay the 512 byte COW table, and then write out the entire + * 4k again. + * + * On machines with hardsector size of 512, the COW table write will be + * processed just as it was before. + * + * If an error occurs in this function, we will send down the buffer as as + * read instead of a write. This will ensure that the callback function still + * runs and cleans up the async_io structures and releases all pending I/Os. + */ +static inline void write_cow_table(struct snapshot_volume * snap, + struct buffer_head * bh) +{ + if ( snap->logical_node->hardsector_size > bh->b_size ) { + u64 offset; + u8 * buffer; + unsigned short b_size = bh->b_size; + int rc; + + offset = bh->b_rsector & ((snap->logical_node->hardsector_size >> + EVMS_VSECTOR_SIZE_SHIFT) - 1); + bh->b_rsector -= offset; + bh->b_size = snap->logical_node->hardsector_size; + + /* Need a buffer to temporarily hold the COW table sector. 
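+		 * Only b_size bytes of COW data are saved here; the rest of
+		 * the hardsector is read back from disk before the merge.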
*/ + buffer = kmalloc(b_size, GFP_NOIO); + if (!buffer) { + disable_snapshot(snap, TRUE); + R_IO(snap->logical_node, bh); + return; + } + memcpy(buffer, bh->b_data, b_size); + + /* Read in the entire hardsector from disk. */ + rc = INIT_IO(snap->logical_node, READ, bh->b_rsector, + snap->logical_node->hardsector_size >> + EVMS_VSECTOR_SIZE_SHIFT, + bh->b_data); + if (rc) { + disable_snapshot(snap, TRUE); + R_IO(snap->logical_node, bh); + return; + } + + /* Copy the COW table back into the buffer. */ + memcpy(bh->b_data + (offset << EVMS_VSECTOR_SIZE_SHIFT), + buffer, b_size); + } + + W_IO(snap->logical_node, bh); +} + +/** + * process_cow_table_write_lists + */ +static void process_cow_table_write_lists(struct snapshot_volume * org_volume, + int * done) +{ + struct snapshot_volume * snap_volume; + struct async_snap_io * async_snap_io, * async_snap_io2; + struct list_head * lh; + unsigned long flags; + + /* Check the chunk_write_list for each snapshot on this original. */ + down_read(&org_volume->snap_semaphore); + for ( snap_volume = org_volume->snapshot_next; + snap_volume; + snap_volume = snap_volume->snapshot_next ) { + + /* While we are here, see if the DISABLED bit needs to + * be written to disk. + */ + if ( snap_volume->flags & EVMS_SNAPSHOT_DISABLED && + snap_volume->flags & EVMS_SNAPSHOT_DISABLED_PENDING ) { + disable_snapshot(snap_volume, TRUE); + snap_volume->flags &= ~EVMS_SNAPSHOT_DISABLED_PENDING; + } + + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, + flags); + + if ( list_empty(&snap_volume->cow_table_write_list) ) { + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, + flags); + continue; + } + + /* Check for an in-flight COW-table-write. */ + async_snap_io = COW_WRITE_ENTRY(snap_volume->cow_table_write_list.next); + if ( atomic_read(&async_snap_io->write_count) != 0 ) { + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, + flags); + continue; + } + + /* See if there are any COW-table-writes that can be skipped. */ + list_for_each(lh, &snap_volume->cow_table_write_list) { + /* No need to check the first list element, since + * we've already examined it. + */ + if ( lh->prev != &snap_volume->cow_table_write_list ) { + async_snap_io = COW_WRITE_ENTRY(lh); + async_snap_io2 = COW_WRITE_ENTRY(lh->prev); + if ( atomic_read(&async_snap_io->write_count) != 0 ) { + async_snap_io = async_snap_io2; + break; + } + if ( async_snap_io->cow_table_buffer->bh->b_rsector != + async_snap_io2->cow_table_buffer->bh->b_rsector ) { + async_snap_io = async_snap_io2; + break; + } + } + } + + /* We have the buffer to send down. Now mark all + * previous COW-table buffers as in-flight. + */ + list_for_each(lh, &snap_volume->cow_table_write_list) { + async_snap_io2 = COW_WRITE_ENTRY(lh); + atomic_dec(&async_snap_io2->write_count); + if ( async_snap_io2 == async_snap_io ) { + break; + } + else { + DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume); + } + } + + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags); + + /* Write the COW table. */ + DEBUG_INC_COW_TABLE_WRITES(snap_volume); + write_cow_table(snap_volume, async_snap_io->cow_table_buffer->bh); + + *done = FALSE; + } + up_read(&org_volume->snap_semaphore); +} + +/** + * snap_async_io_thread + * + * This is the async I/O thread function. It processes requests from four + * lists, which are embedded in the original volume structure passed to the + * thread. 
+ * + * The first list, org_pending_io_list, contains async_org_io's, each of which + * contain a list of write requests to the original volume that are waiting on + * the completion of a chunk copy. + * + * The second list, snap_pending_io_list, contains async_snap_io's, each of + * which contain a list of read requests and a list of write requests to the + * snapshot volume that are waiting on the completion of a chunk copy. + * + * The third list, chunk_write_list, contains buffers that were used to read + * part of a chunk from the original volume. Those buffers are linked to other + * buffers which are used to write the same part of that chunk to the snapshot. + * + * The fourth list is actually the list of snapshots for this original. Each + * snapshot then has a list of COW-table buffers that have to be written. The + * processing of this list is optimized to eliminate unnecessary, overlapping + * writes of the COW table. + * + * The loop will continue as long as there is an item on at least one of + * the four lists. When they are all empty, the loop exits and the thread + * goes back to sleep. + */ +static void snap_async_io_thread(void * volume) +{ + struct snapshot_volume * org_volume = volume; + int done = FALSE; + + while (!done) { + process_org_pending_io_list(org_volume, &done); + + process_snap_pending_io_list(org_volume, &done); + + process_chunk_write_list(org_volume, &done); + + process_cow_table_write_lists(org_volume, &done); + } + + run_task_queue(&tq_disk); +} + +/** + * schedule_org_pending_io + * + * Place the async_org_io on the thread's processing list. + */ +static void schedule_org_pending_io(struct async_org_io * async_org_io) +{ + struct snapshot_volume * org_volume = async_org_io->org_volume; + unsigned long flags; + + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags); + list_add_tail(&async_org_io->org_pending_io_list, + &org_volume->org_pending_io_list); + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock, flags); + evms_cs_wakeup_thread(org_volume->async_io_thread); +} + +/** + * schedule_snap_pending_io + * + * Place the async_snap_io on the thread's processing list. + */ +static void schedule_snap_pending_io(struct async_snap_io * async_snap_io) +{ + struct snapshot_volume * org_volume = async_snap_io->org_io->org_volume; + unsigned long flags; + + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags); + list_add_tail(&async_snap_io->snap_pending_io_list, + &org_volume->snap_pending_io_list); + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock, flags); + evms_cs_wakeup_thread(org_volume->async_io_thread); +} + +/** + * schedule_chunk_write + * + * Place the buffer on the chunk_write_list for the thread to process. This + * list uses the chunk_write_list field in the snap_io_buffer. + */ +static void schedule_chunk_write(struct snap_io_buffer * buf) +{ + struct async_org_io * org_io = buf->buffer_private; + struct snapshot_volume * org_volume = org_io->org_volume; + unsigned long flags; + + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags); + list_add_tail(&buf->chunk_write_list, &org_volume->chunk_write_list); + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock, flags); + evms_cs_wakeup_thread(org_volume->async_io_thread); +} + +/** + * schedule_cow_table_write + * + * Place the async_snap_io on the thread's processing list. 
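+ * Unlike the other schedule_* helpers, no thread wakeup is done here; the
+ * thread is woken from snap_write_chunk_cb as the chunk writes complete.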
+ */ +static void schedule_cow_table_write(struct async_snap_io * async_snap_io) +{ + struct snapshot_volume * snap_volume = async_snap_io->snap_volume; + unsigned long flags; + + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags); + list_add_tail(&async_snap_io->cow_write_list, + &snap_volume->cow_table_write_list); + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags); +} + +/** + * snap_read_chunk_cb + * + * This is the callback function for reading chunks from the original. + * When each read completes, we have to decrement the read_count in the + * async_org_io. If this count reaches zero, we can decrement the + * chunk_lock's in all of the hash entries, and send the original write + * request down. Finally, send this buffer head over to the thread to + * send the writes down to the snapshots. + */ +void snap_read_chunk_cb(struct buffer_head * bh, + int uptodate) +{ + struct snap_io_buffer * buf = bh->b_private; + + if (!uptodate) { + /* Error reading the chunk. Disable all snapshots on this org. */ + struct async_org_io * async_org_io = buf->buffer_private; + struct snapshot_volume * snap_volume; + LOG_ERROR("Error reading chunk from original '%s'.\n", + async_org_io->org_volume->exported_node->name); + for ( snap_volume = async_org_io->org_volume->snapshot_next; + snap_volume; + snap_volume = snap_volume->snapshot_next ) { + disable_snapshot(snap_volume, FALSE); + } + } + + schedule_chunk_write(buf); +} + +/** + * snap_write_chunk_cb + * + * This is the callback function for writing chunks to the snapshot. When + * each write completes, decrement the write_count in the async_snap_io. + * If this count reaches zero, decrement the chunk_lock in the hash entry, + * and decrement the remap_count in the async_org_io. If the remap_count + * reaches zero, then everybody is done, and we can free up the entire + * async_io structure. + */ +void snap_write_chunk_cb(struct buffer_head * bh, + int uptodate) +{ + struct snap_io_buffer * buf = bh->b_private; + struct async_snap_io * async_snap_io = buf->buffer_private; + + if (!uptodate) { + /* Error writing chunk. Disable this snapshot. */ + LOG_ERROR("Error writing chunk to snapshot '%s'.\n", + async_snap_io->snap_volume->exported_node->name); + disable_snapshot(async_snap_io->snap_volume, FALSE); + } + + atomic_dec(&async_snap_io->write_count); + evms_cs_wakeup_thread(async_snap_io->org_io->org_volume->async_io_thread); +} + +/** + * snap_cow_table_cb + * + * This is the callback function for writing out the COW table. + */ +void snap_cow_table_cb(struct buffer_head * bh, + int uptodate) +{ + struct snap_io_buffer * buf = bh->b_private; + struct async_snap_io * async_snap_io = buf->buffer_private; + struct async_snap_io * async_snap_io2; + struct async_org_io * async_org_io; + struct snapshot_volume * snap_volume = async_snap_io->snap_volume; + struct list_head * lh, * tmp; + unsigned long flags, flags2; + + if (!uptodate) { + /* Error writing the COW table sector. Disable the snapshot. */ + struct snapshot_volume * snap_volume = buf->buffer_private; + LOG_ERROR("Error writing COW table to snapshot '%s'.\n", + snap_volume->exported_node->name); + disable_snapshot(snap_volume, FALSE); + } + + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags); + + list_for_each_safe(lh, tmp, &snap_volume->cow_table_write_list) { + list_del(lh); + async_snap_io2 = COW_WRITE_ENTRY(lh); + async_org_io = async_snap_io2->org_io; + + /* Mark the chunk as copied in the hash table. 
*/ + spin_lock_irqsave(&async_snap_io2->hash_table_entry->chunk_state_lock, + flags2); + async_snap_io2->hash_table_entry->chunk_state = SNAP_CHUNK_COPIED; + async_snap_io2->hash_table_entry->snap_io = NULL; + spin_unlock_irqrestore(&async_snap_io2->hash_table_entry->chunk_state_lock, + flags2); + + /* Release any pending I/Os waiting on this chunk. */ + schedule_snap_pending_io(async_snap_io2); + if ( atomic_dec_and_test(&async_org_io->copy_count) ) { + schedule_org_pending_io(async_org_io); + } + + if ( async_snap_io2 == async_snap_io ) { + break; + } + } + + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags); +} + +/** + * snap_queue_original_request + * + * An existing remap was found for the chunk for this write request. + * If the chunk has been fully copied, then the request can go through + * normally. If the chunk is still being processed, this request must + * be queued up to be driven after the chunk has been copied. + */ +static void snap_queue_original_request(struct snapshot_volume * snap_volume, + struct buffer_head * org_bh, + struct snapshot_hash_entry * target_entry, + u64 remainder, + int * queued_org_bh, + int write_to_snapshot) +{ + struct async_org_io * org_io; + unsigned long flags, flags2; + + if (write_to_snapshot) { + org_bh->b_rsector = (target_entry->snap_chunk << + snap_volume->chunk_shift) + + remainder; + } + + if ( ! *queued_org_bh && + target_entry->chunk_state != SNAP_CHUNK_COPIED ) { + spin_lock_irqsave(&target_entry->chunk_state_lock, flags); + if (write_to_snapshot) { + /* A write to the snapshot. */ + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) { + org_bh->b_reqnext = + target_entry->snap_io->pending_writes; + target_entry->snap_io->pending_writes = org_bh; + if (!target_entry->snap_io->dev) { + target_entry->snap_io->dev = + org_bh->b_rdev; + } + evms_cs_volume_request_in_progress(target_entry->snap_io->dev, + +1, NULL); + *queued_org_bh = TRUE; + } + } else { + /* A write to the original. */ + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) { + org_io = target_entry->snap_io->org_io; + spin_lock_irqsave(&org_io->pending_writes_lock, + flags2); + org_bh->b_reqnext = org_io->pending_writes; + org_io->pending_writes = org_bh; + if (!org_io->dev) { + org_io->dev = org_bh->b_rdev; + } + spin_unlock_irqrestore(&org_io->pending_writes_lock, + flags2); + evms_cs_volume_request_in_progress(org_io->dev, + +1, NULL); + *queued_org_bh = TRUE; + } + } + spin_unlock_irqrestore(&target_entry->chunk_state_lock, flags); + } +} + +/** + * snapshot_copy_1 + * + * Check this snapshot node to see if the given sector/chunk has been + * remapped yet. If it hasn't, create a new hash table entry, update the + * in-memory COW table, write the COW table to disk if it is full, and + * then start the process of copying the chunk from the original to the + * snapshot. + */ +static int snapshot_copy_1(struct snapshot_volume * snap_volume, + struct buffer_head * org_bh, + struct async_org_io ** async_org_io, + int * queued_org_bh, + int write_to_snapshot) +{ + struct snapshot_volume * org_volume = snap_volume->snapshot_org; + struct snapshot_hash_entry * target_entry, * new_map_entry; + struct snap_io_buffer * cow_buf, *buf1, *buf2; + struct async_snap_io * async_snap_io; + u64 org_sector = org_bh->b_rsector; + u64 org_chunk_lba, snap_chunk_lba; + u64 alignment; + u64 chunk, remainder; + unsigned long hash_value, buffer_count, sectors_in_chunk; + + /* Grab the read-lock when checking for an existing remap. 
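+	 * It is dropped and re-taken as a write-lock further down only if no
+	 * existing remap is found.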
*/ + down_read(&snap_volume->snap_semaphore); + + /* Make sure the snapshot has not been disabled. */ + if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) || + ! org_volume ) { + up_read(&snap_volume->snap_semaphore); + return -ENOSPC; + } + + /* Check for unaligned I/O. This is mostly to prevent XFS from + * sending a request that spans a chunk. + */ + alignment = org_sector << EVMS_VSECTOR_SIZE_SHIFT; + if ( unlikely(alignment & (org_bh->b_size - 1)) ) { + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n", + org_bh->b_rsector, org_bh->b_size, + snap_volume->logical_node->name); + if (!write_to_snapshot) { + disable_snapshot(snap_volume, TRUE); + } + up_read(&snap_volume->snap_semaphore); + return -EINVAL; + } + + /* Search the hash table to see if this sector has already been + * remapped on this snapshot. + */ + chunk = org_sector >> snap_volume->chunk_shift; + remainder = org_sector & (u64)(snap_volume->chunk_size - 1); + hash_value = (unsigned long)chunk % snap_volume->hash_table_size; + + if ( search_snapshot_hash_chain(chunk, + snap_volume->snapshot_map[hash_value], + &target_entry) ) { + /* Chunk is already remapped. If the remap is still in progress, + * queue up this request to be handled later. If the remap is + * complete, we can just keep going. + */ + up_read(&snap_volume->snap_semaphore); + snap_queue_original_request(snap_volume, org_bh, + target_entry, remainder, + queued_org_bh, write_to_snapshot); + return 0; + } + + /* Convert to a write-lock and check again for a remap. + * (Same search and check as just before). + */ + up_read(&snap_volume->snap_semaphore); + down_write(&snap_volume->snap_semaphore); + if ( search_snapshot_hash_chain(chunk, + snap_volume->snapshot_map[hash_value], + &target_entry) ) { + /* Chunk is already remapped. If the remap is still in progress, + * queue up this request to be handled later. If the remap is + * complete, we can just keep going. + */ + up_write(&snap_volume->snap_semaphore); + snap_queue_original_request(snap_volume, org_bh, + target_entry, remainder, + queued_org_bh, write_to_snapshot); + return 0; + } + + /* Is there enough room left on this snapshot to remap this chunk? */ + if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) { + /* Once the snapshot becomes full, further writes to the + * original can't be remapped, and thus this snapshot + * will become "corrupted". + */ + snap_volume->flags |= EVMS_SNAPSHOT_FULL; + set_snapshot_flags(snap_volume->logical_node, + EVMS_SNAPSHOT_FULL, EVMS_SNAPSHOT_DISABLED); + up_write(&snap_volume->snap_semaphore); + return -ENOSPC; + } + + /* Create and initialize a new hash table entry for the new remap. + * The value SNAP_CHUNK_COPYING indicates that this chunk still has to + * be read from the original and written to the snapshot. + */ + new_map_entry = allocate_snapshot_hash_entry(snap_volume, + chunk, + snap_volume->next_free_chunk, + SNAP_CHUNK_COPYING); + if (!new_map_entry) { + /* Can't get memory for map entry. Disable this snapshot. */ + LOG_ERROR("Memory error allocating hash table entry for snapshot '%s'.\n", + snap_volume->exported_node->name); + disable_snapshot(snap_volume, TRUE); + up_write(&snap_volume->snap_semaphore); + return -ENOMEM; + } + + /* Add the entry to the hash table. 
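+	 * target_entry from the failed search above is the predecessor; a
+	 * NULL predecessor means the new entry becomes the head of the chain.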
*/ + if (target_entry) { + insert_snapshot_hash_entry(new_map_entry, target_entry); + } else { + insert_snapshot_hash_entry_at_head(new_map_entry, + &(snap_volume->snapshot_map[hash_value])); + } + + /* Calculate the number of buffers that will be needed to copy this + * chunk, and the starting LBAs for both the org and the snap. + */ + org_chunk_lba = chunk * org_volume->chunk_size; + snap_chunk_lba = snap_volume->next_free_chunk * org_volume->chunk_size; + snap_volume->next_free_chunk++; + sectors_in_chunk = min(((u64)org_volume->chunk_size), + org_volume->logical_node->total_vsectors - + org_chunk_lba); + buffer_count = (sectors_in_chunk + + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1) / + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT); + + /* Create the parent async_org_io structure if it hasn't been done yet. */ + if (!*async_org_io) { + *async_org_io = allocate_async_org_io(org_volume, + org_chunk_lba, + buffer_count); + if (!*async_org_io) { + // BUGBUG: Disable the snapshot? + BUG(); + } + + /* If we are only reading a partial chunk from the original, + * may need to readjust the size in the last buffer. + */ + if ( (sectors_in_chunk < org_volume->chunk_size) && + (sectors_in_chunk & + ((PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1)) ) { + for ( buf1 = (*async_org_io)->copy_buffers; + buf1->buffer_next; + buf1 = buf1->buffer_next ) { + ; + } + buf1->bh->b_size = (sectors_in_chunk << + EVMS_VSECTOR_SIZE_SHIFT) & + (PAGE_SIZE - 1); + } + } + + /* Create an async_snap_io structure for this snapshot and attach to + * the org io structure. + */ + async_snap_io = allocate_async_snap_io(snap_volume, new_map_entry, + *async_org_io, snap_chunk_lba, + buffer_count); + if (!async_snap_io) { + // BUGBUG: Disable the snapshot? + BUG(); + } + + /* Fill in the next entry in the COW table. Copy the COW table to the + * buffer to be written out later. + */ + snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64p(&chunk); + snap_volume->next_cow_entry++; + cow_buf = async_snap_io->cow_table_buffer; + cow_buf->bh->b_rdev = org_bh->b_rdev; + cow_buf->bh->b_rsector = snap_volume->current_cow_sector; + memcpy(cow_buf->bh->b_data, snap_volume->cow_table, EVMS_VSECTOR_SIZE); + + /* If the COW table is full, reinitialize for the next sector. */ + if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(u64)) ) { + snap_volume->next_cow_entry = 0; + snap_volume->current_cow_sector++; + memset(snap_volume->cow_table, 0xff, EVMS_VSECTOR_SIZE); + } + + /* Attach the original buffer head, if it hasn't been queued + * already on a different copy. + */ + if (!*queued_org_bh) { + org_bh->b_reqnext = NULL; + if (write_to_snapshot) { + /* Write to the snapshot. Attach to the async_snap_io. */ + org_bh->b_rsector = (new_map_entry->snap_chunk << + snap_volume->chunk_shift) + + remainder; + async_snap_io->pending_writes = org_bh; + async_snap_io->dev = org_bh->b_rdev; + } else { + /* Write to the original. Attatch to the async_org_io. */ + (*async_org_io)->pending_writes = org_bh; + (*async_org_io)->dev = org_bh->b_rdev; + } + evms_cs_volume_request_in_progress(org_bh->b_rdev, +1, NULL); + *queued_org_bh = TRUE; + } + + /* Point the hash table entry at this async_snap_io. Then add this + * async_snap_io to the list in the async_org_io, as well as to the + * list in the snapshot volume. 
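/*
 * Illustrative sketch, not part of the patch: a simplified userspace model
 * of the in-memory COW table handling above.  One 512-byte sector holds 64
 * u64 entries, unused slots are 0xffffffffffffffff, and when the sector
 * fills the code moves on to the next on-disk COW sector and resets the
 * in-memory copy.  Endian conversion (cpu_to_le64) and the actual disk
 * write are omitted here.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define VSECTOR_SIZE	512
#define COW_ENTRIES	(VSECTOR_SIZE / sizeof(uint64_t))	/* 64 entries */

struct cow_state {
	uint64_t cow_table[COW_ENTRIES];
	unsigned int next_cow_entry;
	uint64_t current_cow_sector;
};

/* Record that org_chunk has been remapped; roll over when the sector is full. */
static void cow_add_entry(struct cow_state *s, uint64_t org_chunk)
{
	s->cow_table[s->next_cow_entry++] = org_chunk;	/* kernel stores cpu_to_le64 */

	if (s->next_cow_entry >= COW_ENTRIES) {
		/* Sector is full: advance and mark the new one all-unused (0xff). */
		s->next_cow_entry = 0;
		s->current_cow_sector++;
		memset(s->cow_table, 0xff, sizeof(s->cow_table));
	}
}

int main(void)
{
	struct cow_state s;
	uint64_t chunk;

	memset(s.cow_table, 0xff, sizeof(s.cow_table));
	s.next_cow_entry = 0;
	s.current_cow_sector = 100;	/* hypothetical lba_of_COW_table */

	for (chunk = 0; chunk < 70; chunk++)
		cow_add_entry(&s, chunk);

	/* Chunks 0..63 filled sector 100; chunks 64..69 start sector 101. */
	printf("cow sector now %llu, entries used %u\n",
	       (unsigned long long)s.current_cow_sector, s.next_cow_entry);
	return 0;
}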
+ */ + new_map_entry->snap_io = async_snap_io; + + async_snap_io->snap_io_list_next = (*async_org_io)->snap_io_list; + (*async_org_io)->snap_io_list = async_snap_io; + atomic_inc(&(*async_org_io)->copy_count); + atomic_inc(&(*async_org_io)->ref_count); + + schedule_cow_table_write(async_snap_io); + + /* Parallel walk through the copy_buffer's in the org and the snap, + * updating all necessary pointers and lists. + */ + for ( buf1 = (*async_org_io)->copy_buffers, + buf2 = async_snap_io->copy_buffers; + buf1 && buf2; + buf1 = buf1->buffer_next, buf2 = buf2->buffer_next ) { + buf2->copy_next = buf1->copy_next; + buf2->buffer_private = async_snap_io; + buf2->bh->b_rdev = org_bh->b_rdev; + buf2->bh->b_data = buf1->bh->b_data; + buf2->bh->b_page = buf1->bh->b_page; + + buf1->bh->b_rdev = org_bh->b_rdev; + buf1->copy_next = buf2; + buf1->buffer_private = *async_org_io; + } + + /* We're done modifying snapshot volume info, so we can release the + * lock. We can't start any reads until all snapshots for this original + * have been checked. Return and start the reads later. + */ + up_write(&snap_volume->snap_semaphore); + + return 0; +} + +/** + * snapshot_copy_data + */ +static void snapshot_copy_data(struct snapshot_volume * org_volume, + struct buffer_head * org_bh) +{ + struct snapshot_volume * snap_volume, * next_volume; + struct async_org_io * async_org_io = NULL; + struct snap_io_buffer * buf; + int queued_org_bh = FALSE; + + /* Check each snapshot on this original + * to see which one's need a remap. + */ + for ( snap_volume = org_volume->snapshot_next; + snap_volume; snap_volume = next_volume ) { + next_volume = snap_volume->snapshot_next; + snapshot_copy_1(snap_volume, org_bh, &async_org_io, + &queued_org_bh, FALSE); + } + + if (async_org_io) { + /* One or more snapshots need a remap. The async_io structures + * have been built. Now we just need to run through them and + * start all of the reads. + */ + for ( buf = async_org_io->copy_buffers; + buf; buf = buf->buffer_next ) { + R_IO(org_volume->logical_node, buf->bh); + } + } else if (!queued_org_bh) { + /* None of the snapshots needed a remap, and we didn't have to + * queue this request to be processed later due to a copy in + * progress. The write can be sent down normally. + */ + W_IO(org_volume->logical_node, org_bh); + } +} + +/** + * writeable_snapshot_copy_data + */ +static void writeable_snapshot_copy_data(struct snapshot_volume * snap_volume, + struct buffer_head * org_bh) +{ + struct snapshot_volume * org_volume = snap_volume->snapshot_org; + struct async_org_io * async_org_io = NULL; + struct snap_io_buffer * buf; + int rc, queued_org_bh = FALSE; + + rc = snapshot_copy_1(snap_volume, org_bh, &async_org_io, + &queued_org_bh, TRUE); + if ( rc < 0 ) { + org_bh->b_end_io(org_bh, 0); + return; + } + + if (async_org_io) { + /* Need to remap this chunk to the snapshot. The async_io + * structures have been built. Just need to run through them + * and start all of the reads. + */ + for ( buf = async_org_io->copy_buffers; buf; + buf = buf->buffer_next ) { + R_IO(org_volume->logical_node, buf->bh); + } + } else if (!queued_org_bh) { + /* No remap. The write can be sent down immediately. */ + W_IO(snap_volume->logical_node, org_bh); + } +} + +/** + * snap_write + */ +static void snap_write(struct evms_logical_node * node, + struct buffer_head * bh) +{ + struct snapshot_volume * volume = node->private; + + /* Size check. 
*/ + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) > + node->total_vsectors) { + bh->b_end_io(bh, 0); + return; + } + + /* Can't write if rollback is in progress. */ + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) { + LOG_ERROR("Cannot write to snapshot '%s' during rollback.\n", + volume->logical_node->name); + bh->b_end_io(bh, 0); + return; + } + + if ( volume->flags & EVMS_SNAPSHOT ) { + /* Snapshot. */ + if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE ) { + writeable_snapshot_copy_data(volume, bh); + } else { + bh->b_end_io(bh, 0); + } + } else { + /* Original. */ + snapshot_copy_data(volume, bh); + } +} + +/** + * snap_ioctl + */ +static int snap_ioctl(struct evms_logical_node * logical_node, + struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + struct snapshot_volume * volume = logical_node->private; + struct evms_quiesce_vol_pkt * quiesce; + struct evms_plugin_ioctl_pkt pkt, * user_pkt; + int percent_full, rc = 0; + + switch (cmd) { + case EVMS_QUIESCE_VOLUME: + quiesce = (struct evms_quiesce_vol_pkt*)arg; + if (quiesce->command) { + /* Quiesce */ + volume->flags |= EVMS_SNAPSHOT_QUIESCED; + } else { + /* Un-quiesce */ + volume->flags &= ~EVMS_SNAPSHOT_QUIESCED; + } + break; + + case EVMS_GET_BMAP: + if ( volume->flags & EVMS_SNAPSHOT_ORG ) { + rc = IOCTL(volume->logical_node, inode, file, cmd, arg); + } else { + rc = -EINVAL; + } + break; + + case EVMS_PLUGIN_IOCTL: + user_pkt = (struct evms_plugin_ioctl_pkt *)arg; + + /* Copy user's parameters to kernel space. */ + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) { + rc = -EFAULT; + break; + } + + if ( pkt.feature_id != logical_node->plugin->id ) { + /* This ioctl is not targetted at snapshotting, so + * broadcast the command to all children. + */ + rc = IOCTL(logical_node, inode, file, cmd, arg); + break; + } + + switch (pkt.feature_command) { + case SNAPSHOT_QUERY_PERCENT_FULL: + if ( volume->flags & EVMS_SNAPSHOT_FULL ) { + percent_full = -1; + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) { + percent_full = -2; + } else { + percent_full = (volume->next_free_chunk * 100) / + volume->num_chunks; + } + rc = copy_to_user(pkt.feature_ioctl_data, + &percent_full, + sizeof(percent_full)); + break; + + case SNAPSHOT_START_ROLLBACK: + if ( volume->flags & EVMS_SNAPSHOT_FULL ) { + rc = -ENOSPC; + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) { + rc = -EIO; + } else if ( ! (volume->flags & EVMS_SNAPSHOT) ) { + rc = -EINVAL; + } else { + set_snapshot_flags(volume->logical_node, + EVMS_SNAPSHOT_ROLLBACK, 0); + } + break; + + case SNAPSHOT_CHECK_STATE: + rc = copy_to_user(pkt.feature_ioctl_data, + &volume->flags, + sizeof(volume->flags)); + break; + + default: + rc = -EINVAL; + } + break; + + case EVMS_CHECK_MEDIA_CHANGE: + case EVMS_REVALIDATE_DISK: + case EVMS_GET_DISK_LIST: + case EVMS_CHECK_DEVICE_STATUS: + /* Broadcast these to all children. */ + if ( ! (volume->flags & EVMS_SNAPSHOT_ORG) ) { + volume = volume->snapshot_org; + } + while (volume) { + rc = IOCTL(volume->logical_node, inode, file, cmd, arg); + volume = volume->snapshot_next; + } + break; + + case EVMS_OPEN_VOLUME: + /* Disallow opens on rollback in progress. + * Otherwise fall through. 
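/*
 * Illustrative sketch, not part of the patch: the SNAPSHOT_QUERY_PERCENT_FULL
 * case above reports -1 for a full snapshot, -2 for a disabled one, and
 * whole-percent usage otherwise.  Pulled out as a standalone helper (the
 * flag values below are stand-ins, not the real EVMS_SNAPSHOT_* bits):
 */
#include <stdio.h>

#define SNAP_FULL	(1 << 0)	/* stands in for EVMS_SNAPSHOT_FULL */
#define SNAP_DISABLED	(1 << 1)	/* stands in for EVMS_SNAPSHOT_DISABLED */

static int snapshot_percent_full(unsigned int flags,
				 unsigned int next_free_chunk,
				 unsigned int num_chunks)
{
	if (flags & SNAP_FULL)
		return -1;
	if (flags & SNAP_DISABLED)
		return -2;
	return (int)((next_free_chunk * 100) / num_chunks);
}

int main(void)
{
	printf("%d%%\n", snapshot_percent_full(0, 250, 1000));		/* 25% */
	printf("%d\n", snapshot_percent_full(SNAP_FULL, 1000, 1000));	/* -1 */
	return 0;
}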
+ */ + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK) { + LOG_ERROR("Cannot open snapshot volume '%s' during rollback\n", + volume->logical_node->name); + rc = -EBUSY; + break; + } + + default: + rc = IOCTL(volume->logical_node, inode, file, cmd, arg); + + } + return rc; +} + +/** + * snap_init_io + */ +static int snap_init_io(struct evms_logical_node * node, + int rw, + u64 sect_nr, + u64 num_sects, + void * buf_addr) +{ + struct snapshot_volume * volume = node->private; + + /* No init io access to snapshot, and no writes allowed to original + * since they would not be snapshotted. + */ + if ( rw || (volume->flags & EVMS_SNAPSHOT) ) { + return -EINVAL; + } + return INIT_IO(volume->logical_node, rw, + sect_nr, num_sects, buf_addr); +} + +/** + * add_cow_entry_to_snapshot_map + * + * This function takes a cow table entry (from the on-disk data), and + * converts it into an appropriate entry for the snapshot map, and + * inserts it into the appropriate map for the specified volume. + */ +static int add_cow_entry_to_snapshot_map(u64 org_chunk, + u64 snap_chunk, + struct snapshot_volume * volume) +{ + struct snapshot_hash_entry * new_entry, * target_entry; + unsigned long hash_value; + + new_entry = allocate_snapshot_hash_entry(volume, org_chunk, + snap_chunk, SNAP_CHUNK_COPIED); + if (!new_entry) { + return -ENOMEM; + } + + hash_value = (long)org_chunk % volume->hash_table_size; + if ( search_snapshot_hash_chain(org_chunk, + volume->snapshot_map[hash_value], + &target_entry) ) { + /* A duplicate mapping was found. This should never happen. */ + } else { + if (target_entry) { + insert_snapshot_hash_entry(new_entry, target_entry); + } else { + insert_snapshot_hash_entry_at_head(new_entry, + &(volume->snapshot_map[hash_value])); + } + } + return 0; +} + +/** + * build_snapshot_maps + * + * Construct the initial hash table state based on + * existing COW tables on the disk. + */ +static int build_snapshot_maps(struct snapshot_volume * volume) +{ + int rc = 0; + int done = FALSE; + while (!done) { + /* Read in one sector's worth of COW tables. */ + if ( INIT_IO(volume->logical_node, 0, + volume->current_cow_sector, 1, + volume->cow_table) ) { + return -EIO; + } + + /* Translate every valid COW table entry into + * a snapshot map entry. + */ + for ( volume->next_cow_entry = 0; + volume->next_cow_entry < (EVMS_VSECTOR_SIZE/sizeof(u64)) && + volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff; + volume->next_cow_entry++, volume->next_free_chunk++ ) { + rc = add_cow_entry_to_snapshot_map(le64_to_cpup(&volume->cow_table[volume->next_cow_entry]), + volume->next_free_chunk, + volume); + if (rc) { + return(rc); + } + } + + /* Move on to the next sector if necessary. */ + if ( volume->next_cow_entry == + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) { + volume->current_cow_sector++; + } else { + done = TRUE; + } + } + return 0; +} + +/** + * initialize_snapshot_node + */ +static int initialize_snapshot_node(struct evms_logical_node * snap_node, + struct evms_logical_node * new_snap_node, + struct evms_logical_node * org_node, + struct snapshot_metadata * metadata) +{ + struct snapshot_volume * snap_volume; + struct snapshot_hash_entry * new_entry; + int i, rc = 0; + + /* Instance data for the snapshot. 
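/*
 * Illustrative sketch, not part of the patch: a condensed userspace model of
 * build_snapshot_maps() above.  COW sectors are scanned one at a time; every
 * entry up to the 0xffffffffffffffff terminator becomes a remap whose
 * snapshot chunk is simply the running count of remaps found so far, and the
 * scan advances to the next sector only when the current one was completely
 * full.  Disk I/O and the hash table are replaced by an array and a printf.
 */
#include <stdio.h>
#include <stdint.h>

#define ENTRIES_PER_SECTOR	64
#define UNUSED_ENTRY		0xffffffffffffffffULL

static void rebuild_map(uint64_t cow_sectors[][ENTRIES_PER_SECTOR],
			unsigned int num_sectors)
{
	uint64_t snap_chunk = 0;
	unsigned int sector, i;

	for (sector = 0; sector < num_sectors; sector++) {
		for (i = 0; i < ENTRIES_PER_SECTOR &&
			    cow_sectors[sector][i] != UNUSED_ENTRY;
		     i++, snap_chunk++) {
			/* kernel: add_cow_entry_to_snapshot_map(org, snap, volume) */
			printf("org chunk %llu -> snap chunk %llu\n",
			       (unsigned long long)cow_sectors[sector][i],
			       (unsigned long long)snap_chunk);
		}
		if (i < ENTRIES_PER_SECTOR)
			break;	/* partially used sector: discovery stops here */
	}
}

int main(void)
{
	static uint64_t sectors[1][ENTRIES_PER_SECTOR];
	unsigned int i;

	for (i = 0; i < ENTRIES_PER_SECTOR; i++)
		sectors[0][i] = UNUSED_ENTRY;
	sectors[0][0] = 17;	/* hypothetical remapped original chunks */
	sectors[0][1] = 42;

	rebuild_map(sectors, 1);
	return 0;
}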
*/ + snap_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL); + if (!snap_volume) { + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0); + snap_delete_volume(new_snap_node); + DELETE(snap_node); + return -ENOMEM; + } + memset(snap_volume, 0, sizeof(struct snapshot_volume)); + + /* Initialize the snapshot node. */ + new_snap_node->total_vsectors = org_node->total_vsectors; + new_snap_node->plugin = &plugin_header; + new_snap_node->private = snap_volume; + new_snap_node->flags = snap_node->flags | + (org_node->flags & (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL)) | + ((metadata->flags & EVMS_SNAPSHOT_WRITEABLE) ? 0 : EVMS_VOLUME_READ_ONLY); + new_snap_node->hardsector_size = snap_node->hardsector_size; + new_snap_node->block_size = snap_node->block_size; + new_snap_node->system_id = EVMS_SNAPSHOT_SIGNATURE; + new_snap_node->volume_info = snap_node->volume_info; + /* Get the new node's name from the consumed node's feature header. */ + strcpy(new_snap_node->name, snap_node->feature_header->object_name); + + /* Initialize the private data. */ + snap_volume->logical_node = snap_node; + snap_volume->exported_node = new_snap_node; + init_rwsem(&snap_volume->snap_semaphore); + snap_volume->chunk_size = metadata->chunk_size; + snap_volume->chunk_shift = evms_cs_log2((u64)metadata->chunk_size); + snap_volume->num_chunks = metadata->total_chunks; + snap_volume->current_cow_sector = metadata->lba_of_COW_table; + snap_volume->hash_table_size = metadata->total_chunks / MAX_HASH_CHAIN_ENTRIES + 1; + snap_volume->flags = EVMS_SNAPSHOT | + (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) | + (metadata->flags & EVMS_SNAPSHOT_ASYNC); + INIT_LIST_HEAD(&snap_volume->cow_table_write_list); + spin_lock_init(&snap_volume->cow_table_write_list_lock); + +#ifdef SNAPSHOT_DEBUG + snap_volume->cow_table_writes = (atomic_t)ATOMIC_INIT(0); + snap_volume->cow_table_overlaps = (atomic_t)ATOMIC_INIT(0); +#endif + + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) { + + /* Buffer for reading rollback data. */ + snap_volume->chunk_data_buffer = kmalloc(SNAPSHOT_CHUNK_BUFFER_SIZE << + EVMS_VSECTOR_SIZE_SHIFT, + GFP_KERNEL); + if (!snap_volume->chunk_data_buffer) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -ENOMEM; + } + + /* Create the rollback thread. */ + snap_volume->rollback_thread = + evms_cs_register_thread(snapshot_do_rollback, + snap_volume, + "evms_snapshot_rollback"); + if (!snap_volume->rollback_thread){ + LOG_SERIOUS("Could not start rollback thread for snapshot '%s'.\n", + snap_node->name); + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -ENOMEM; + } + } else { + /* Snapshot hash table. */ + snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size * + sizeof(struct snapshot_hash_entry*)); + if (!snap_volume->snapshot_map) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -ENOMEM; + } + memset(snap_volume->snapshot_map, 0, + snap_volume->hash_table_size * + sizeof(struct snapshot_hash_entry*)); + + /* Pre-allocate all of the hash entries we will need and + * store them in the free list in the volume. 
+ */ + for ( i = 0; i < snap_volume->num_chunks; i++ ) { + new_entry = mempool_alloc(snap_hash_entry_pool, + GFP_KERNEL); + if (!new_entry) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -ENOMEM; + } + new_entry->next = snap_volume->free_hash_list; + snap_volume->free_hash_list = new_entry; + } + + rc = build_snapshot_maps(snap_volume); + if (rc) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return rc; + } + } + + return 0; +} + +/** + * initialize_original_node + */ +static int initialize_original_node(struct evms_logical_node * snap_node, + struct evms_logical_node * new_snap_node, + struct evms_logical_node * org_node, + struct evms_logical_node * new_org_node) +{ + struct snapshot_volume * snap_volume = new_snap_node->private; + struct snapshot_volume * org_volume; + + /* Instance data for the original. */ + org_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL); + if (!org_volume) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + snap_delete_volume(new_org_node); + return -ENOMEM; + } + memset(org_volume, 0, sizeof(struct snapshot_volume)); + + /* Initialize the new node. */ + new_org_node->total_vsectors = org_node->total_vsectors; + new_org_node->plugin = &plugin_header; + new_org_node->private = org_volume; + new_org_node->flags = org_node->flags | + (snap_node->flags & + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL)); + new_org_node->hardsector_size = org_node->hardsector_size; + new_org_node->block_size = org_node->block_size; + new_org_node->system_id = EVMS_ORIGINAL_SIGNATURE; + new_org_node->volume_info = org_node->volume_info; + /* Must reuse the original node's name. */ + strcpy(new_org_node->name, org_node->name); + + /* Initialize the private data. */ + org_volume->logical_node = org_node; + org_volume->exported_node = new_org_node; + init_rwsem(&org_volume->snap_semaphore); + org_volume->chunk_size = snap_volume->chunk_size; + org_volume->chunk_shift = snap_volume->chunk_shift; + org_volume->flags = EVMS_SNAPSHOT_ORG | + (snap_volume->flags & EVMS_SNAPSHOT_ASYNC); + INIT_LIST_HEAD(&org_volume->chunk_write_list); + spin_lock_init(&org_volume->chunk_write_list_lock); + INIT_LIST_HEAD(&org_volume->org_pending_io_list); + spin_lock_init(&org_volume->org_pending_io_list_lock); + INIT_LIST_HEAD(&org_volume->snap_pending_io_list); + spin_lock_init(&org_volume->snap_pending_io_list_lock); + + /* Start the async I/O thread for this original. */ + org_volume->async_io_thread = + evms_cs_register_thread(snap_async_io_thread, org_volume, + "evms_async_snapshot"); + if (!org_volume->async_io_thread) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + snap_delete_volume(new_org_node); + return -ENOMEM; + } + + return 0; +} + +/** + * add_snapshot + * + * Initializes a snapshot instance and exports an evms_logical_node to + * the global list. + */ +static int add_snapshot(struct evms_logical_node * snap_node, + struct snapshot_metadata * metadata, + struct evms_logical_node ** evms_node_list) +{ + struct evms_logical_node * new_snap_node; + struct evms_logical_node * new_org_node; + struct evms_logical_node * org_node; + struct snapshot_volume * snap_volume; + struct snapshot_volume * org_volume; + struct snapshot_volume * tmp_volume; + int rc = 0; + + /* Make sure the snapshot is not full or disabled. 
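/*
 * Illustrative sketch, not part of the patch: the loop just above builds a
 * singly linked free list of pre-allocated hash entries so that the remap
 * path never has to allocate during I/O.  A minimal standalone model of that
 * push/pop pattern (malloc stands in for mempool_alloc; no locking):
 */
#include <stdio.h>
#include <stdlib.h>

struct hash_entry {
	unsigned long org_chunk;
	unsigned long snap_chunk;
	struct hash_entry *next;
};

struct free_list {
	struct hash_entry *head;
};

/* Pre-allocate 'count' entries up front, like the num_chunks loop above. */
static int prealloc_entries(struct free_list *fl, unsigned long count)
{
	while (count--) {
		struct hash_entry *e = malloc(sizeof(*e));

		if (!e)
			return -1;	/* the kernel path disables the snapshot */
		e->next = fl->head;
		fl->head = e;
	}
	return 0;
}

/* Taking an entry later is just a pop -- no allocation at remap time. */
static struct hash_entry *get_entry(struct free_list *fl)
{
	struct hash_entry *e = fl->head;

	if (e)
		fl->head = e->next;
	return e;
}

int main(void)
{
	struct free_list fl = { NULL };

	if (prealloc_entries(&fl, 1024) == 0)
		printf("got entry %p\n", (void *)get_entry(&fl));
	return 0;
}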
*/ + if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) { + LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n", + snap_node->name); + LOG_WARNING(" Deleting from further use.\n"); + DELETE(snap_node); + return -ENOSPC; + } + + /* Inspect the global list until a node is found with the name of + * this snapshot's original. There can only be one original for + * each snapshot. + */ + for ( org_node = *evms_node_list; + org_node && strncmp(EVMS_GET_NODE_NAME(org_node), + metadata->original_volume, + EVMS_VOLUME_NAME_SIZE); + org_node = org_node->next ) { + ; + } + if (!org_node) { + /* No original was found. Disable and delete the snapshot. */ + LOG_ERROR("Error: No original found for snapshot %s, looking for %s\n", + snap_node->name, metadata->original_volume); + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0); + DELETE(snap_node); + return -ENODEV; + } + + LOG_DEBUG("Adding snapshot for '%s'\n", org_node->name); + + /* We found the original on the list. Verify the size to be sure the + * name didn't change for compatibility. For non-512-byte hardsector + * sizes, round down org node to a hardsector multiple to be the same + * as what was stored in the metadata. + */ + if ( (org_node->total_vsectors & + (~((org_node->hardsector_size/EVMS_VSECTOR_SIZE)-1))) != + metadata->original_size ) { + /* The snapshot no longer points at a valid original. + * Disable and delete the snapshot. + */ + LOG_ERROR("Error: Original volume size does not match for snapshot '%s'!\n", + snap_node->name); + LOG_ERROR(" volume=%s: org_size="PFU64", current size="PFU64"\n", + org_node->name, metadata->original_size, + org_node->total_vsectors); + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0); + DELETE(snap_node); + return -ENODEV; + } + + /* New EVMS node for the snapshot. */ + if ( evms_cs_allocate_logical_node(&new_snap_node) ) { + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0); + DELETE(snap_node); + return -ENOMEM; + } + + MOD_INC_USE_COUNT; + snapshot_count++; + + snapshot_create_pools(); + + rc = initialize_snapshot_node(snap_node, new_snap_node, + org_node, metadata); + if (rc) { + return rc; + } + snap_volume = new_snap_node->private; + + /* Check to see if the node we found is one we put back on the list due + * to another snapshot of the original, if so then don't allocate a new + * node and volume info, just get the old one. + */ + if ( org_node->plugin->id != plugin_header.id ) { + + /* New EVMS node for the original. */ + if ( evms_cs_allocate_logical_node(&new_org_node) ) { + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -ENOMEM; + } + + MOD_INC_USE_COUNT; + snapshot_count++; + + rc = initialize_original_node(snap_node, new_snap_node, + org_node, new_org_node); + if (rc) { + return rc; + } + org_volume = new_org_node->private; + + /* Remove the original volume from the global list, then + * add the new version of the original to the global list. + */ + evms_cs_remove_logical_node_from_list(evms_node_list, org_node); + evms_cs_add_logical_node_to_list(evms_node_list, new_org_node); + } else { + /* There is already at least one snapshot for this original. */ + new_org_node = org_node; + org_volume = new_org_node->private; + org_node = org_volume->logical_node; + + /* Make sure this snapshot matches the current + * chunk size if we have async snapshots. 
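/*
 * Illustrative sketch, not part of the patch: the size comparison above
 * rounds the original's sector count down to a whole multiple of its
 * hardsector size (expressed in 512-byte vsectors) before comparing it with
 * the size recorded in the snapshot metadata.  Standalone C with
 * hypothetical numbers:
 */
#include <stdio.h>
#include <stdint.h>

#define VSECTOR_SIZE 512

static uint64_t round_down_to_hardsector(uint64_t total_vsectors,
					 unsigned int hardsector_size)
{
	uint64_t vsectors_per_hardsector = hardsector_size / VSECTOR_SIZE;

	/* Works because vsectors_per_hardsector is a power of two. */
	return total_vsectors & ~(vsectors_per_hardsector - 1);
}

int main(void)
{
	/* 4KB hardsectors: 1000005 vsectors rounds down to 1000000. */
	printf("%llu\n",
	       (unsigned long long)round_down_to_hardsector(1000005, 4096));
	return 0;
}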
+ */ + if ( snap_volume->chunk_size != org_volume->chunk_size ) { + LOG_ERROR("Cannot add snapshot '%s' with chunk size %u to original '%s' with chunk size %u.\n", + new_snap_node->name, snap_volume->chunk_size, + new_org_node->name, org_volume->chunk_size); + disable_snapshot(snap_volume, TRUE); + snap_delete_volume(new_snap_node); + return -EINVAL; + } + + /* If the new snapshot is Removable or Partial, propogate + * the flags to the original and all other snapshots. + */ + for ( tmp_volume = org_volume; + tmp_volume; + tmp_volume = tmp_volume->snapshot_next) { + tmp_volume->exported_node->flags |= + (snap_node->flags & + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL)); + } + } + + /* Create a proc-fs entry for this snapshot. */ + if (snap_proc) { + create_proc_read_entry(snap_node->feature_header->volume_name, + S_IFREG, snap_proc, + snap_proc_read, new_snap_node); + } + + /* Insert the new snapshot at the start of the original's chain. */ + down_write(&org_volume->snap_semaphore); + snap_volume->snapshot_next = org_volume->snapshot_next; + org_volume->snapshot_next = snap_volume; + snap_volume->snapshot_org = org_volume; + up_write(&org_volume->snap_semaphore); + + /* Place the new snapshot on the global list. */ + evms_cs_add_logical_node_to_list(evms_node_list, new_snap_node); + + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) { + org_volume->flags |= EVMS_SNAPSHOT_ROLLBACK; + snap_volume->flags |= EVMS_SNAPSHOT_ROLLBACK; + evms_cs_wakeup_thread(snap_volume->rollback_thread); + } + + return 0; +} + +/** + * do_rollback + */ +void snapshot_do_rollback(void * volume) +{ + struct snapshot_volume * snap_volume = volume; + struct snapshot_volume * org_volume = snap_volume->snapshot_org; + u32 io_size = snap_volume->chunk_size; + u32 sectors = io_size; + int done = FALSE; + int i, iterations = 1; + + evms_cs_invalidate_volume(org_volume->exported_node); + evms_cs_invalidate_volume(snap_volume->exported_node); + + /* Safety to start at chunk 0. */ + snap_volume->next_free_chunk = 0; + while (!done) { + + if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) { + iterations = snap_volume->chunk_size / + org_volume->chunk_size; + sectors = io_size = org_volume->chunk_size; + } + + /* Read in one sector's worth of COW tables. */ + if ( INIT_IO(snap_volume->logical_node, 0, + snap_volume->current_cow_sector, 1, + snap_volume->cow_table) ) { + LOG_ERROR("Error reading COW table from snapshot during rollback, aborting rollback\n"); + return; + } + + /* Translate every valid COW table entry into + * a snapshot map entry. + */ + for ( snap_volume->next_cow_entry = 0; + snap_volume->next_cow_entry < + (EVMS_VSECTOR_SIZE/sizeof(u64)) && + snap_volume->cow_table[snap_volume->next_cow_entry] != + 0xffffffffffffffff; + snap_volume->next_cow_entry++, + snap_volume->next_free_chunk++ ) { + for ( i = 0; i < iterations; i++ ) { + + /* Don't go off the end of the original. */ + if ( io_size > + org_volume->logical_node->total_vsectors - + (snap_volume->cow_table[snap_volume->next_cow_entry] * + snap_volume->chunk_size + i * io_size) ) { + sectors = org_volume->logical_node->total_vsectors - + (snap_volume->cow_table[snap_volume->next_cow_entry] * + snap_volume->chunk_size + i * io_size); + } + + /* Read the chunk from the snapshot volume. */ + if ( INIT_IO(snap_volume->logical_node, READ, + (snap_volume->next_free_chunk * + snap_volume->chunk_size + + i*io_size), + sectors, + snap_volume->chunk_data_buffer) ) { + LOG_ERROR("Error reading chunk %u from snapshot '%s'. 
Continuing.\n", + snap_volume->next_free_chunk, + snap_volume->logical_node->name); + } + + /* Write the chunk to the original volume. */ + if ( INIT_IO(org_volume->logical_node, WRITE, + snap_volume->cow_table[snap_volume->next_cow_entry] * + snap_volume->chunk_size + i*io_size, + sectors, + snap_volume->chunk_data_buffer) ) { + LOG_ERROR("Error writing chunk %u to original '%s' during rollback. Continuing.\n", + snap_volume->next_free_chunk, + org_volume->logical_node->name); + } + + if ( sectors < io_size ) { + break; + } + } + } + + /* Move on to the next COW table sector if necessary. */ + if ( snap_volume->next_cow_entry == + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) { + snap_volume->current_cow_sector++; + } else { + done = TRUE; + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED | + EVMS_SNAPSHOT_ROLLBACK_COMP; + snap_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK; + org_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK; + set_snapshot_flags(snap_volume->logical_node, + EVMS_SNAPSHOT_DISABLED | + EVMS_SNAPSHOT_ROLLBACK_COMP, + EVMS_SNAPSHOT_ROLLBACK); + LOG_DEFAULT("Rollback complete from snapshot %s\n", + snap_volume->exported_node->name); + } + } +} + +/** + * snap_proc_read + * + * Callback function for the proc-fs entry for each snapshot node. + * Print out pertinent information about this snapshot. The "data" + * parameter is a pointer to an EVMS logical node. + */ +static int snap_proc_read(char * page, char ** start, off_t off, + int count, int * eof, void * data) +{ + struct evms_logical_node * snap_node = data; + struct snapshot_volume * snap_volume = snap_node->private; + int sz = 0; + + PROCPRINT("Snapshot of : %s\n", (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : (u8 *)"Unknown"); + PROCPRINT("Size (KB) : %u\n", (snap_volume->num_chunks * snap_volume->chunk_size)/2); + PROCPRINT("Chunk Size (KB): %u\n", (snap_volume->chunk_size)/2); + PROCPRINT("Writeable : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "Yes" : "No"); + PROCPRINT("Usage : %u%%\n", (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks); + PROCPRINT("Status : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active"); +#ifdef SNAPSHOT_DEBUG + PROCPRINT("Next free chunk: %u\n", snap_volume->next_free_chunk); + PROCPRINT("COW Writes : %u\n", atomic_read(&snap_volume->cow_table_writes)); + PROCPRINT("COW Overlaps : %u\n", atomic_read(&snap_volume->cow_table_overlaps)); +#endif + +out: + *start = page + off; + sz -= off; + if (sz < 0) + sz = 0; + return sz > count ? count : sz; +} + +/** + * snapshot_init + */ +int __init snapshot_init(void) +{ + struct proc_dir_entry * pde; + + /* Register a directory in proc-fs. */ + pde = evms_cs_get_evms_proc_dir(); + if (pde) { + snap_proc = create_proc_entry("snapshot", S_IFDIR, pde); + } + + /* Register with EVMS. */ + return evms_cs_register_plugin(&plugin_header); +} + +/** + * snapshot_exit + */ +void __exit snapshot_exit(void) +{ + struct proc_dir_entry * pde; + + /* Unregister the directory in proc-fs. 
*/ + pde = evms_cs_get_evms_proc_dir(); + if (pde) { + remove_proc_entry("snapshot", pde); + } + + evms_cs_unregister_plugin(&plugin_header); +} + +module_init(snapshot_init); +module_exit(snapshot_exit); +#ifdef MODULE_LICENSE +MODULE_LICENSE("GPL"); +#endif + diff -Naur linux-2002-09-30/include/linux/evms/evms.h evms-2002-09-30/include/linux/evms/evms.h --- linux-2002-09-30/include/linux/evms/evms.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms.h Thu Sep 26 11:55:45 2002 @@ -0,0 +1,575 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms/evms.h + * + * EVMS kernel header file + * + */ + +#ifndef __EVMS_INCLUDED__ +#define __EVMS_INCLUDED__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * version info + **/ +#define EVMS_MAJOR_VERSION 1 +#define EVMS_MINOR_VERSION 2 +#define EVMS_PATCHLEVEL_VERSION 0 + +/** + * general defines section + **/ +#define FALSE 0 +#define TRUE 1 + +#define MAX_EVMS_VOLUMES 256 +#define EVMS_VOLUME_NAME_SIZE 127 +#define IBM_OEM_ID 8112 +#define EVMS_INITIAL_CRC 0xFFFFFFFF +#define EVMS_MAGIC_CRC 0x31415926 +#define EVMS_VSECTOR_SIZE 512 +#define EVMS_VSECTOR_SIZE_SHIFT 9 + +#define DEV_PATH "/dev" +#define EVMS_DIR_NAME "evms" +#define EVMS_DEV_NAME "block_device" +#define EVMS_DEV_NODE_PATH DEV_PATH "/" EVMS_DIR_NAME "/" +#define EVMS_DEVICE_NAME DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME + +/** + * kernel logging levels defines + **/ +#define EVMS_INFO_CRITICAL 0 +#define EVMS_INFO_SERIOUS 1 +#define EVMS_INFO_ERROR 2 +#define EVMS_INFO_WARNING 3 +#define EVMS_INFO_DEFAULT 5 +#define EVMS_INFO_DETAILS 6 +#define EVMS_INFO_DEBUG 7 +#define EVMS_INFO_EXTRA 8 +#define EVMS_INFO_ENTRY_EXIT 9 +#define EVMS_INFO_EVERYTHING 10 + +/** + * kernel logging level variable + **/ +extern int evms_info_level; + +/** + * kernel logging macros + **/ +#define evmsLOG(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; } +#define evmsLOG2(info_level,statement) { if (evms_info_level >= info_level) statement; } + +/** + * LOG MACROS to make evms log messages + * look much cleaner in the source. + **/ +#define EVMS_LOG_PREFIX "evms: " +#define LOG_CRITICAL(msg, args...) evmsLOG(EVMS_INFO_CRITICAL, (KERN_CRIT EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_SERIOUS(msg, args...) evmsLOG(EVMS_INFO_SERIOUS, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_ERROR(msg, args...) evmsLOG(EVMS_INFO_ERROR, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_WARNING(msg, args...) evmsLOG(EVMS_INFO_WARNING, (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_DEFAULT(msg, args...) 
evmsLOG(EVMS_INFO_DEFAULT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_DETAILS(msg, args...) evmsLOG(EVMS_INFO_DETAILS, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_DEBUG(msg, args...) evmsLOG(EVMS_INFO_DEBUG, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_EXTRA(msg, args...) evmsLOG(EVMS_INFO_EXTRA, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_ENTRY_EXIT(msg, args...) evmsLOG(EVMS_INFO_ENTRY_EXIT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) +#define LOG_EVERYTHING(msg, args...) evmsLOG(EVMS_INFO_EVERYTHING, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args)) + +/** + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines. + * Use these in place of %Ld, %Lu, and %Lx. + **/ +#if BITS_PER_LONG > 32 +#define PFD64 "%ld" +#define PFU64 "%lu" +#define PFX64 "%lx" +#else +#define PFD64 "%Ld" +#define PFU64 "%Lu" +#define PFX64 "%Lx" +#endif + +/** + * helpful PROCFS macro + **/ +#ifdef CONFIG_PROC_FS +#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args));\ + if (sz < off)\ + off -= sz, sz = 0;\ + else if (sz >= off + count)\ + goto out +#endif + +/** + * PluginID convenience macros + * + * An EVMS PluginID is a 32-bit number with the following bit positions: + * Top 16 bits: OEM identifier. See IBM_OEM_ID. + * Next 4 bits: Plugin type identifier. See evms_plugin_code. + * Lowest 12 bits: Individual plugin identifier within a given plugin type. + **/ +#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id) +#define GetPluginOEM(pluginid) (pluginid >> 16) +#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf) +#define GetPluginID(pluginid) (pluginid & 0xfff) + +/** + * enum evms_plugin_type - evms plugin types + **/ +enum evms_plugin_code { + EVMS_NO_PLUGIN = 0, + EVMS_DEVICE_MANAGER, + EVMS_SEGMENT_MANAGER, + EVMS_REGION_MANAGER, + EVMS_FEATURE, + EVMS_ASSOCIATIVE_FEATURE, + EVMS_FILESYSTEM_INTERFACE_MODULE, + EVMS_CLUSTER_MANAGER_INTERFACE_MODULE, + EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE +}; + +/** + * struct evms_version - + * @major: changes when incompatible difference are introduced + * @minor: changes when additions are made + * @patchlevel: reflects bug level fixes within a particular major/minor pair + * + * generic versioning info used by EVMS + **/ +struct evms_version { + u32 major; + u32 minor; + u32 patchlevel; +}; + +/** + * struct evms_plugin_header - kernel plugin header record + * @id: plugin id + * @version: plugin version + * @required_services_version: required common services version + * @fops: table of function operations + * + * kernel plugin header record + **/ +struct evms_plugin_header { + u32 id; + struct evms_version version; + struct evms_version required_services_version; + struct evms_plugin_fops *fops; +}; + +/** + * struct evms_feature_header - EVMS generic on-disk header for features + * @signature: unique magic number + * @crc: structure's crc + * @version: feature header version + * @engine_version: created by this evms engine version + * @flags: feature characteristics, bit definitions below. 
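/*
 * Illustrative sketch, not part of the patch: a quick standalone check of
 * the PluginID packing macros defined above -- 16 bits of OEM id, 4 bits of
 * plugin type, 12 bits of per-type id.  IBM_OEM_ID (8112) and the
 * EVMS_FEATURE enum value (4) come from this header; the per-type id of 4
 * is just an example.  The macros are copied locally so the check compiles
 * on its own.
 */
#include <stdio.h>
#include <stdint.h>

#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
#define GetPluginOEM(pluginid) (pluginid >> 16)
#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
#define GetPluginID(pluginid) (pluginid & 0xfff)

int main(void)
{
	uint32_t id = SetPluginID(8112 /* IBM_OEM_ID */, 4 /* EVMS_FEATURE */, 4);

	printf("packed=0x%x oem=%u type=%u id=%u\n",
	       id, GetPluginOEM(id), GetPluginType(id), GetPluginID(id));
	/* prints: packed=0x1fb04004 oem=8112 type=4 id=4 */
	return 0;
}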
+ * @feature_id: indicates which feature this header is describing + * @sequence_number: describes most recent copy of redundant metadata + * @alignment_padding: used when objects are moved between different sized devices + * @feature_data1_start_lsn: object relative start of 1st copy feature data + * @feature_data1_size: size of 1st copy of feature data + * @feature_data2_start_lsn: object relative start of 2nd copy feature data + * @feature_data2_size: size of 2nd copy of feature data + * @volume_serial_number: unique/persistent volume identifier + * @volume_system_id: unique/persistent minor number + * @object_depth: depth of object in volume tree + * @object_name: object's name + * @volume_name: volume name object is a part of + * @pad: padding to make structure be 512 byte aligned + * + * generic on-disk header used to describe any EVMS feature + * NOTE: 2nd copy of feature data is optional, if used set start_lsn to 0. + **/ +struct evms_feature_header { + u32 signature; + u32 crc; + struct evms_version version; + struct evms_version engine_version; + u32 flags; + u32 feature_id; + u64 sequence_number; + u64 alignment_padding; + u64 feature_data1_start_lsn; + u64 feature_data1_size; + u64 feature_data2_start_lsn; + u64 feature_data2_size; + u64 volume_serial_number; + u32 volume_system_id; + u32 object_depth; + u8 object_name[EVMS_VOLUME_NAME_SIZE + 1]; + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1]; + u8 pad[152]; +}; + +/** + * field evms_feature_header.signature majic number + **/ +#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 /* FEAT */ +/** + * field evms_feature_header.flags defines + **/ +#define EVMS_FEATURE_ACTIVE (1<<0) +#define EVMS_FEATURE_VOLUME_COMPLETE (1<<1) +#define EVMS_VOLUME_DATA_OBJECT (1<<16) +#define EVMS_VOLUME_DATA_STOP (1<<17) +/** + * struct evms_feature_header version info + **/ +#define EVMS_FEATURE_HEADER_MAJOR 3 +#define EVMS_FEATURE_HEADER_MINOR 0 +#define EVMS_FEATURE_HEADER_PATCHLEVEL 0 + +/** + * EVMS specific error codes + **/ +#define EVMS_FEATURE_FATAL_ERROR 257 +#define EVMS_VOLUME_FATAL_ERROR 258 +#define EVMS_FEATURE_INCOMPLETE_ERROR 259 + +/** + * struct evms_volume_info - exported volume info + * @volume_sn: unique volume identifier + * @volume_minor: persistent device minor assigned to this volume + * @volume_name: persistent name assigned to this volume + * + * a collection of volume specific info + **/ +struct evms_volume_info { + u64 volume_sn; + u32 volume_minor; + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1]; +}; + +/** + * struct evms_logical_node - generic kernel storage object + * @total_vsectors: 0 size of this object in 512 byte units + * @plugin: 8 plugin that created/owns/manages this storage object + * @private: 12 location for owner to store private info + * @flags: 16 storage object characteristics (set/used by plugins) + * bit definitions located in evms_common.h + * @iflags: 20 internal flags (used exclusively by the framework, not for plugins to use/set) + * bit definitions below. 
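/*
 * Illustrative sketch, not part of the patch: the feature-header signature
 * defined above, 0x54414546, is the ASCII string "FEAT" when the u32 is laid
 * out little-endian ('F' = 0x46 in the low byte).  A standalone check,
 * assuming a little-endian host:
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 /* FEAT */

int main(void)
{
	uint32_t sig = EVMS_FEATURE_HEADER_SIGNATURE;
	unsigned char bytes[5] = { 0 };

	memcpy(bytes, &sig, sizeof(sig));	/* byte order as stored on disk */
	printf("%s\n", (char *)bytes);		/* prints FEAT on little-endian hosts */
	return 0;
}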
+ * @hardsector_size: 24 assumed physical sector size of underlying device + * @block_size: 28 default block size for this object + * @system_id: 32 system indicator (set by the segment manager) + * @volume_info: 36 persistent volume info, used only by EVMS volumes + * @feature_header: 40 generic on-disk metadata describing any EVMS feature + * @next: 44 linked list field + * @name: 48 storage object name + * 176 + * + * generic kernel storage object + */ +struct evms_logical_node { + u64 total_vsectors; + struct evms_plugin_header *plugin; + void *private; + u32 flags; + u32 iflags; + int hardsector_size; + int block_size; + u32 system_id; + struct evms_volume_info *volume_info; + struct evms_feature_header *feature_header; + struct evms_logical_node *next; + u8 name[EVMS_VOLUME_NAME_SIZE + 1]; +}; + +/** + * fields evms_logical_node.flags & evms_logical_volume.flags defines + **/ +#define EVMS_FLAGS_WIDTH 32 +#define EVMS_VOLUME_FLAG (1<<0) +#define EVMS_VOLUME_PARTIAL_FLAG (1<<1) +#define EVMS_VOLUME_PARTIAL (1<<1) +#define EVMS_VOLUME_SET_READ_ONLY (1<<2) +#define EVMS_VOLUME_READ_ONLY (1<<2) +/** + * these bits define volume status + **/ +#define EVMS_MEDIA_CHANGED (1<<20) +#define EVMS_DEVICE_UNPLUGGED (1<<21) +/** + * these bits used for removable status + **/ +#define EVMS_DEVICE_MEDIA_PRESENT (1<<24) +#define EVMS_DEVICE_PRESENT (1<<25) +#define EVMS_DEVICE_LOCKABLE (1<<26) +#define EVMS_DEVICE_REMOVABLE (1<<27) + +/** + * fields evms_logical_node.iflags defines + **/ +#define EVMS_FEATURE_BOTTOM (1<<0) +#define EVMS_TOP_SEGMENT (1<<1) + +/** + * macro to obtain a node's name from either EVMS or compatibility volumes + **/ +#define EVMS_GET_NODE_NAME(node) \ + ((node->flags & EVMS_VOLUME_FLAG) ? \ + node->volume_info->volume_name : \ + node->name) + +/** + * macro used to transform to/from userland device handles and device storage object nodes + **/ +#define EVMS_HANDLE_KEY 0x0123456789ABCDEF +#define DEV_HANDLE_TO_NODE(handle) ((struct evms_logical_node *)(unsigned long)((handle) ^ EVMS_HANDLE_KEY)) +#define NODE_TO_DEV_HANDLE(node) (((u64)(unsigned long)(node)) ^ EVMS_HANDLE_KEY) + +/** + * struct evms_logical_volume - logical volume info + * @name: logical volume name + * @node: logical volume storage object + * @flags: characteristics of logical volume + * @quiesced: quiesce state info + * @vfs_quiesced: vfs quiesce state info + * @requests_in_progress: count of in-flight I/Os + * @wait_queue: used when volume is quiesced + * @devfs_handle: handle for devfs + * @request_queue: unique request queue + * @request_lock: unique request queue lock + * + * contains all the fields needed to manage to a logical volume + **/ +struct evms_logical_volume { + u8 *name; + struct evms_logical_node *node; + int flags; + int quiesced; + int vfs_quiesced; + atomic_t opens; + atomic_t requests_in_progress; + wait_queue_head_t wait_queue; + devfs_handle_t devfs_handle; +#ifdef CONFIG_SMP + request_queue_t request_queue; + spinlock_t request_lock; +#endif +}; + +/** + * field evms_logical_volume.flags defines + **/ +/** + * queued flags bits + **/ +#define EVMS_REQUESTED_DELETE (1<<5) +#define EVMS_REQUESTED_QUIESCE (1<<6) +#define EVMS_REQUESTED_VFS_QUIESCE (1<<7) +/** + * this bit indicates corruption + **/ +#define EVMS_VOLUME_CORRUPT (1<<8) +/** + * these bits define the source of the corruption + **/ +#define EVMS_VOLUME_SOFT_DELETED (1<<9) +#define EVMS_DEVICE_UNAVAILABLE (1<<10) + +/* + * The following function table is used for all plugins. 
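/*
 * Illustrative sketch, not part of the patch: the handle macros above hide
 * kernel pointers from userland by XORing them with EVMS_HANDLE_KEY, and
 * applying the same XOR again recovers the pointer.  A standalone round-trip
 * check (the struct below is a stand-in, not the real evms_logical_node):
 */
#include <stdio.h>
#include <stdint.h>

#define EVMS_HANDLE_KEY 0x0123456789ABCDEFULL

struct fake_node { int dummy; };

static uint64_t node_to_handle(struct fake_node *node)
{
	return ((uint64_t)(unsigned long)node) ^ EVMS_HANDLE_KEY;
}

static struct fake_node *handle_to_node(uint64_t handle)
{
	return (struct fake_node *)(unsigned long)(handle ^ EVMS_HANDLE_KEY);
}

int main(void)
{
	struct fake_node n;
	uint64_t handle = node_to_handle(&n);

	printf("round trip ok: %d\n", handle_to_node(handle) == &n);
	return 0;
}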
+ */ +/** + * struct evms_plugin_fops - evms plugin's table of function operations + * @discover: volume discovery entry point + * @end_discover: final discovery entry point + * @delete: delete volume entry point + * @read: asynchronous read entry point + * @write: asynchronous write entry point + * @init_io: synchronous io entry point + * @ioctl: generic ioctl entry point + * @direct_ioctl: non-generic ioctl entry point + * + * evms plugin's table of function operations + **/ +struct evms_plugin_fops { + int (*discover) (struct evms_logical_node **); + int (*end_discover) (struct evms_logical_node **); + int (*delete) (struct evms_logical_node *); + void (*read) (struct evms_logical_node *, struct buffer_head *); + void (*write) (struct evms_logical_node *, struct buffer_head *); + int (*init_io) (struct evms_logical_node *, int, u64, + u64, void *); + int (*ioctl) (struct evms_logical_node *, struct inode *, + struct file *, u32, unsigned long); + int (*direct_ioctl) (struct inode *, struct file *, + u32, unsigned long); +}; + +/** + * convenience macros to use plugin's fops entry points + **/ +#define DISCOVER(node, list) ((node)->plugin->fops->discover(list)) +#define END_DISCOVER(node, list) ((node)->plugin->fops->end_discover(list)) +#define DELETE(node) ((node)->plugin->fops->delete(node)) +#define R_IO(node, bh) ((node)->plugin->fops->read(node, bh)) +#define W_IO(node, bh) ((node)->plugin->fops->write(node, bh)) +#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->fops->init_io(node, rw_flag, start_sec, num_secs, buf_addr)) +#define IOCTL(node, inode, file, cmd, arg) ((node)->plugin->fops->ioctl(node, inode, file, cmd, arg)) +#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg) ((reg_record)->plugin->fops->direct_ioctl(inode, file, cmd, arg)) + +/** + * struct evms_list_node - generic non-imbedded list node object + * @item: ptr to object in list + * @next: ptr to next item in list + * + * light weight generic non-imbedded list object definition + **/ +struct evms_list_node { + void *item; + struct evms_list_node *next; +}; + +/** + * struct evms_pool_mgmt - anchor block for private pool management + * @cachep: kmem_cache_t variable + * @member_size: size of each element in the pool + * @head: + * @waiters: count of waiters + * @wait_queue: list of waiters + * @name: name of the pool (must be less than 20 chars) + * + * anchor block for private pool management + **/ +struct evms_pool_mgmt { + kmem_cache_t *cachep; + int member_size; + void *head; + atomic_t waiters; + wait_queue_head_t wait_queue; + u8 *name; +}; + +/* + * Notes: + * All of the following kernel thread functions belong to EVMS base. + * These functions were copied from md_core.c + */ +#define EVMS_THREAD_WAKEUP 0 +/** + * struct evms_thread + * @run: + * @data: + * @wqueue: thread wait queue + * @flags: thread attributes + * @event: event completion + * @tsk: task info + * @name: thread name + * + * data structure for creating/managing a kernel thread + **/ +struct evms_thread { + void (*run) (void *data); + void *data; + wait_queue_head_t wqueue; + unsigned long flags; + struct completion *event; + struct task_struct *tsk; + const u8 *name; +}; + +/** + * EVMS (common services) exported functions prototypes + * + * since these function names are global, evms_cs_ has been prepended + * to each function name, to ensure they do not collide with any + * other global functions in the kernel. 
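/*
 * Illustrative sketch, not part of the patch: the convenience macros above
 * (R_IO, W_IO, INIT_IO, IOCTL, ...) simply dispatch through the owning
 * plugin's function table.  A stripped-down standalone model of that
 * dispatch, using simplified stand-in types rather than the real
 * evms_plugin_fops / evms_logical_node:
 */
#include <stdio.h>

struct node;	/* forward declaration for the fops signature */

struct fops {
	void (*write)(struct node *node, const char *request);
};

struct plugin {
	struct fops *fops;
};

struct node {
	struct plugin *plugin;
	const char *name;
};

/* Same shape as the W_IO macro above: dispatch through plugin->fops. */
#define W_IO(node, req) ((node)->plugin->fops->write(node, req))

/* A trivial plugin implementation of the write entry point. */
static void demo_write(struct node *node, const char *request)
{
	printf("%s handling write: %s\n", node->name, request);
}

static struct fops demo_fops = { .write = demo_write };
static struct plugin demo_plugin = { .fops = &demo_fops };

int main(void)
{
	struct node n = { .plugin = &demo_plugin, .name = "demo_object" };

	W_IO(&n, "bh");		/* expands to n.plugin->fops->write(&n, "bh") */
	return 0;
}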
+ **/ +#define EVMS_COMMON_SERVICES_MAJOR 0 +#define EVMS_COMMON_SERVICES_MINOR 6 +#define EVMS_COMMON_SERVICES_PATCHLEVEL 0 + +void evms_cs_get_version(int *, int *); +int evms_cs_check_version(struct evms_version *, struct evms_version *); +int evms_cs_register_plugin(struct evms_plugin_header *); +int evms_cs_unregister_plugin(struct evms_plugin_header *); +#ifdef EVMS_MEM_DEBUG +int evms_cs_verify_memory_integrity(int); +#endif +int evms_cs_allocate_logical_node(struct evms_logical_node **); +void evms_cs_deallocate_volume_info(struct evms_logical_node *); +void evms_cs_deallocate_logical_node(struct evms_logical_node *); +int evms_cs_add_logical_node_to_list(struct evms_logical_node **, + struct evms_logical_node *); +int evms_cs_remove_logical_node_from_list(struct evms_logical_node **, + struct evms_logical_node *); +int evms_cs_kernel_ioctl(struct evms_logical_node *, u32, + unsigned long); +inline unsigned long evms_cs_size_in_vsectors(long long); +inline int evms_cs_log2(long long); +u32 evms_cs_calculate_crc(u32, void *, u32); +int evms_cs_register_for_end_io_notification(void *, + struct buffer_head *, + void *callback_function); +struct evms_pool_mgmt *evms_cs_create_pool(int, + u8 *, + void (*ctor) (void *, kmem_cache_t *, + unsigned long), + void (*dtor) (void *, kmem_cache_t *, + unsigned long)); +#define EVMS_BLOCKABLE TRUE +void *evms_cs_allocate_from_pool(struct evms_pool_mgmt *, int); +void evms_cs_deallocate_to_pool(struct evms_pool_mgmt *, void *); +void evms_cs_destroy_pool(struct evms_pool_mgmt *); +struct evms_list_node **evms_cs_lookup_item_in_list(struct evms_list_node **, + void *); +int evms_cs_add_item_to_list(struct evms_list_node **, void *); +int evms_cs_remove_item_from_list(struct evms_list_node **, void *); +int evms_cs_register_device(struct evms_logical_node *); +int evms_cs_unregister_device(struct evms_logical_node *); +int evms_cs_find_next_device(struct evms_logical_node *, + struct evms_logical_node **); +void evms_cs_signal_event(int); +struct evms_thread *evms_cs_register_thread(void (*run) (void *), + void *data, const u8 *name); +void evms_cs_unregister_thread(struct evms_thread *thread); +void evms_cs_wakeup_thread(struct evms_thread *thread); +void evms_cs_interrupt_thread(struct evms_thread *thread); +struct proc_dir_entry *evms_cs_get_evms_proc_dir(void); +int evms_cs_volume_request_in_progress(kdev_t, int, int *); +void evms_cs_invalidate_volume(struct evms_logical_node *topmost_node); + +/* EVMS exported global variables */ +extern struct evms_pool_mgmt *evms_bh_pool; +extern u8 *evms_primary_string; +extern u8 *evms_secondary_string; + +/* Have to include this at the end, since it depends + * on structures and definitions in this file. + */ +#include + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_aix.h evms-2002-09-30/include/linux/evms/evms_aix.h --- linux-2002-09-30/include/linux/evms/evms_aix.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_aix.h Mon Sep 23 15:11:41 2002 @@ -0,0 +1,428 @@ +/* +* The following structures are nested within the structures used by the +* system management routines. These structures and sizes were pulled from the AIX +* src tree. 
+*/ +#define LVM_MAXLPS 65535 /* max number of logical partitions allowed */ +#define LVM_NAMESIZ 64 /* maximum size for the logical volume name */ +#define LVM_NUMCOPIES 3 /* max number of copies allowed of a logical partition */ +#define LVM_MAXVGS 255 +#define LVM_MAXPVS 32 +#define LVM_MAXLVS 256 +#define AIX_MIN_BLOCK_SIZE 4096 +#define VGSA_BT_PV 127 +#define NBPI 32 +#define TRUE 1 +#define OFFSET_CONSTANT 144 +#define SLEEP_TIME 0 +#define MAXLVS_OFFSET 16 +#define PHYS_VOL_OFFSET 34 +#define AIX_PVHPP_LENGTH PHYS_VOL_OFFSET +#define MAX_SECTORS_NAMELIST 32 +#define AIX_DEFAULT_MIRRORING 1 +#define AIX_FIRST_MIRROR 2 +#define AIX_MAX_MIRRORS 3 // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies + +#define EVMS_AIX_FEATURE_ID 3 + +#define EVMS_AIX_RESYNC_MIRRORS 1 + +#define PSN_LVM_REC 7 +#define PSN_VGSA_REC 128 +#define PSN_NAMELIST_REC 2065 +#define PSN_VGT_TRAILER 135 +#define PSN_LVE_REC 1 +#define PSN_PPH_OFFSET 17 +#define PSN_PVH_INCREMENT 17 +#define AIX_MIN_PVH_SIZE 271 // used to find the PV header info for Pv's other than 0 +#define AIX_SECTOR_SIZE 512 +#define MAX_PPENT_SECTOR 16 +#define NAME_LEN 128 /* don't change!!! */ +#define UUID_LEN 32 /* don't change!!! */ +#define MAX_SECTORS_LV_ENTRIES 16 +#define AIX_MIN_MIRROR_POOL 10 +#define AIX_MIRROR_POOL_CHANGE 10 + +#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1) +#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1) +#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1) +#define LV_BMAP _IOWR ( 0xfe, 0x30, 1) + +#define LV_ACTIVE 0x01 /* lv_status */ +#define LV_SPINDOWN 0x02 /* " */ +#define LV_ERROR 0x99 /* " */ + +#define VG_ACTIVE 0x01 /* vg_status */ + +#define AIX_LV_READ 0x00 /* lv_access */ +#define AIX_LV_WRITE 0x01 /* " */ +#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass +#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map +#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem + +/* vg flags */ +#define AIX_VG_DIRTY 0x01 // group has had a new PV added during this discovery +#define AIX_VG_INCOMPLETE 0x20 // volume group is incomplete + +#define AIX_LVM_LVUNDEF 0 /* the logical volume is not defined to a */ +/* volume group */ +#define AIX_LVM_LVDEFINED 1 /* the logical volume is defined to a */ +/* volume group */ +#define AIX_LVM_LVSTALE 2 /* the logical volume has stale logical */ +/* partitions */ +#define AIX_LVM_LVMIRBKP 4 /* the logical volume is an online mirror backup */ +/* We are skipping '3' since it is used by CMDLVM_LVSTALE */ +/* as an addition of LVM_LVDEFINE + LVM_LVSTALE, and is */ +/* defined in src/bos/usr/sbin/lvm/include/ls.h */ + + + +#define LOG_PREFIX "--AIXlvm: " + +// Entries in the list of physical volumes (PV) +// in a volume group (VG) + +struct unique_id { + u32 word1; + u32 word2; + u32 word3; + u32 word4; +}; + +struct partition_list_entry { + struct evms_logical_node * logical_node; + u32 pv_number; + u32 block_size; // bytes + u32 hard_sect_size; // bytes + struct partition_list_entry * next; + +}; + +// Table for mapping logical extents (LE) to physical extents (PE) +struct pe_table_entry { + struct partition_list_entry * owning_pv; + u64 pe_sector_offset; + char pp_state; +}; + +// Logical volumes (LV) in a volume group (VG) +struct aix_logical_volume { + u32 lv_number; + u64 lv_size; // Sectors + u32 lv_access; // Flags: LV_READ, LV_WRITE, LN_NEW + u32 lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN +// u32 lv_minor; // Device minor number + u32 mirror_copies; // Do we have mirroring and how many ? 
+// u32 mirror_number; // mirror number - which copy is this ? +// u32 mirror_iterations; // Which mirror should we be writing to ? + u32 stripes; + u32 stripe_size; // Sectors + u32 stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size + u32 pe_size; // Sectors + u32 pe_size_shift; // Number of bits to shift right instead of dividing by pe_size + u32 num_le; // Number of entries in the le_to_pe_map +// u32 new_volume; // Flag to indicate if this volume needs to be exported + struct aix_volume_group * group; // Pointer back to parent volume group + unsigned char name[EVMS_VOLUME_NAME_SIZE+1]; // Dev-tree volume name (eg: /dev/group0/vol0) + struct pe_table_entry * le_to_pe_map; // Mapping of logical to physical extents + struct pe_table_entry * le_to_pe_map_mir1; // Mapping of logical to physical extents for mirror 1 + struct pe_table_entry * le_to_pe_map_mir2; // Mapping of logical to physical extents for mirror 2 + struct evms_logical_node * volume_node; // Pointer to the parent EVMS node representing this volume + +}; + +// Volume groups (VG) +struct aix_volume_group { + struct unique_id vg_id; // volume group number */ + struct partition_list_entry * partition_list; // List of partitions/segments/PVs that make up this VG + struct aix_logical_volume ** volume_list; // Array of volumes found in this VG. + struct aix_volume_group * next; // Pointer to the next VG + struct vg_header * AIXvgh; // Pointer to valid data area on disk for the VG + s32 vgda_psn; // Which VGDA we should use +// u32 numpvs; // Number of PVs found on this VG. + u32 numlvs; // Number of LVs found on this VG. + u32 hard_sect_size; // The largest hard_sect_size and block_size + u32 block_size; // values of all partitions in this group. + u32 flags; // +// u32 lv_max; // maximum logical volumes */ + u32 pe_size; // physical extent size in sectors */ + u32 partition_count; // actual partitions found for this group + u32 CleanVGInfo; // Do we have a clean VG Info to work with ? + u32 vgda_len; // length of the volume group descriptor area */ +}; + +struct aix_resync_struct { + u64 master_offset; + u64 slave1_offset; + u64 slave2_offset; + struct partition_list_entry * master_part; // + struct partition_list_entry * slave1_part; // + struct partition_list_entry * slave2_part; // + struct aix_logical_volume * resync_vol; + struct aix_logical_volume * next_resync_vol; +}; + +struct aix_mirror_bh { + atomic_t remaining; + s32 iteration; // 'have we finished' count, used from IRQ handlers + u32 le; // In case we have to flag this pp as stale later. + s32 cmd; + u64 mir_sector1; + u64 mir_sector2; + struct buffer_head *master_bh; + struct buffer_head bh_req; + struct aix_mirror_bh *mirror_bh_list; + struct evms_logical_node *node; // map to evms node (READ only) + struct evms_logical_node *mir_node1; // + struct evms_logical_node *mir_node2; // + struct aix_mirror_bh *next_r1; // next for retry or in free list + char sync_flag; // Flag for resyncing of mirrored PPs +}; + +struct aix_volume_resync_ioctl { + char object_name[EVMS_VOLUME_NAME_SIZE+1]; // Input - Name of bbr object from feature header + s32 force; +}; + +struct timestruc { + int tv_sec; + int tv_nsec; + +}; + +struct aix_ipl_rec_area { + u32 IPL_record_id; /* This physical volume contains a */ + /* valid IPL record if and only if */ + /* this field contains IPLRECID */ + +#define IPLRECID 0xc9c2d4c1 /* Value is EBCIDIC 'IBMA' */ + + char reserved1[20]; + u32 formatted_cap; /* Formatted capacity. 
The number of */ + /* sectors available after formatting*/ + /* The presence or absence of bad */ + /* blocks does not alter this value. */ + + char last_head; /* THIS IS DISKETTE INFORMATION */ + /* The number of heads minus 1. Heads*/ + /* are number from 0 to last_head. */ + + char last_sector; /* THIS IS DISKETTE INFORMATION */ + /* The number of sectors per track. */ + /* Sectors are numbered from 1 to */ + /* last_sector. */ + + char reserved2[6]; + + u32 boot_code_length; /* Boot code length in sectors. A 0 */ + /* value implies no boot code present*/ + + u32 boot_code_offset; /* Boot code offset. Must be 0 if no */ + /* boot code present, else contains */ + /* byte offset from start of boot */ + /* code to first instruction. */ + + u32 boot_lv_start; /* Contains the PSN of the start of */ + /* the BLV. */ + + u32 boot_prg_start; /* Boot code start. Must be 0 if no */ + /* boot code present, else contains */ + /* the PSN of the start of boot code.*/ + + u32 boot_lv_length; /* BLV length in sectors. */ + + u32 boot_load_add; /* 512 byte boundary load address for*/ + /* boot code. */ + + char boot_frag; /* Boot code fragmentation flag. Must*/ + /* be 0 if no fragmentation allowed, */ + /* else must be 0x01. */ + + char boot_emulation; /* ROS network emulation flag */ + /* 0x0 => not an emul support image */ + /* 0x1 => ROS network emulation code */ + /* 0x2 => AIX code supporting ROS emul*/ + + char reserved3[2]; + + u16 basecn_length; /* Number of sectors for base */ + /* customization. Normal mode. */ + + u16 basecs_length; /* Number of sectors for base */ + /* customization. Service mode. */ + + u32 basecn_start; /* Starting PSN value for base */ + /* customization. Normal mode. */ + + u32 basecs_start; /* Starting PSN value for base */ + /* customization. Service mode. */ + + char reserved4[24]; + + u32 ser_code_length; /* Service code length in sectors. */ + /* A 0 value implies no service code */ + /* present. */ + + u32 ser_code_offset; /* Service code offset. Must be 0 if */ + /* no service code is present, else */ + /* contains byte offset from start of*/ + /* service code to first instruction.*/ + + u32 ser_lv_start; /* Contains the PSN of the start of */ + /* the SLV. */ + + u32 ser_prg_start; /* Service code start. Must be 0 if */ + /* service code is not present, else */ + /* contains the PSN of the start of */ + /* service code. */ + + u32 ser_lv_length; /* SLV length in sectors. */ + + u32 ser_load_add; /* 512 byte boundary load address for*/ + /* service code. */ + + char ser_frag; /* Service code fragmentation flag. */ + /* Must be 0 if no fragmentation */ + /* allowed, else must be 0x01. */ + + char ser_emulation; /* ROS network emulation flag */ + /* 0x0 => not an emul support image */ + /* 0x1 => ROS network emulation code */ + /* 0x2 => AIX code supporting ROS emul*/ + + char reserved5[2]; + + struct unique_id pv_id; /* The unique identifier for this */ + /* physical volume. 
*/ + char dummy[512 - 128 - sizeof(struct unique_id)]; +}; + + +struct AIXlvm_rec +/* structure which describes the physical volume LVM record */ { + u32 lvm_id; /* LVM id field which identifies whether the PV is a member of a volume group */ + +#define AIX_LVM_LVMID 0x5F4C564D /* LVM id field of ASCII "_LVM" */ + + struct unique_id vg_id; /* the id of the volume group to which this physical volume belongs */ + u32 lvmarea_len; /* the length of the LVM reserved area */ + u32 vgda_len; /* length of the volume group descriptor area */ + s32 vgda_psn [2]; /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */ + s32 reloc_psn; /* the physical sector number of the beginning of a pool of blocks */ + /* (located at the end of the PV) which are reserved for the relocation of bad blocks */ + u32 reloc_len; /* the length in number of sectors of the pool of bad block relocation blocks */ + s16 pv_num; /* the physical volume number within the volume group of this physical volume */ + s16 pp_size; /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */ + u32 vgsa_len; /* length of the volume group status area */ + s32 vgsa_psn [2]; /* the physical sector numbers of the beginning of the volume group status area copies on this disk */ + s16 version; /* the version number of this volume group descriptor and status area */ + +#define LVM_VERSION_1 1 /* first version - AIX 3.0 */ +#define LVM_STRIPE_ENHANCE 2 /* version with striped lv's - AIX 4.1 */ +#define LVM_1024_PPSIZE 3 /* ppsizes of 512 and 1024 */ +#define LVM_GT_1016 4 /* version with support for > 1016 pps/pv */ +#define LVM_MAX_VERSION LVM_GT_1016 /* max version # */ + + char res1 [450]; /* reserved area */ + +}; + + + +/* II.Volume Group Descriptor Area */ + +struct vgsa_area { + struct timestruc b_tmstamp; /* Beginning timestamp */ + u32 pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI]; /* Bit per PV */ + unsigned char stalepp [LVM_MAXPVS] [VGSA_BT_PV]; + s16 factor; + char resv[10]; /* Padding */ + struct timestruc e_tmstamp; /* Ending timestamp */ + +} ; + +struct vg_header { + struct timestruc vg_timestamp; /* time of last update */ + struct unique_id vg_id; /* unique id for volume group */ + s16 numlvs; /* number of lvs in vg */ + s16 maxlvs; /* max number of lvs allowed in vg */ + s16 pp_size; /* size of pps in the vg */ + s16 numpvs; /* number of pvs in the vg */ + s16 total_vgdas; /* number of copies of vg */ + /* descriptor area on disk */ + s16 vgda_size; /* size of volume group descriptor */ + s16 bigvg; + s16 quorum; + s16 auto_varyon; + s32 checksum; + s32 bigda_size; +}; + +struct lv_entries { + s16 lvname; /* name of LV */ + s16 res1; /* reserved area */ + s32 maxsize; /* maximum number of partitions allowed */ + char lv_state; /* state of logical volume */ + char mirror; /* none,single, or double */ + s16 mirror_policy; /* type of writing used to write */ + s32 num_lps; /* number of logical partitions on the lv */ + /* base 1 */ + char permissions; /* read write or read only */ + char bb_relocation; /* specifies if bad block */ + /* relocation is desired */ + char write_verify; /* verify all writes to the LV */ + char mirwrt_consist; /* mirror write consistency flag */ + u16 stripe_exp; /* stripe size in exponent value */ + u16 striping_width; /* stripe width */ + u16 lv_avoid; + u16 child_minor_num; + char res4[4]; /* reserved area on disk */ +}; + + +struct pv_header { + struct unique_id pv_id; /* unique identifier 
of PV */ + u16 pp_count; /* number of physical partitions */ + /* on PV */ + char pv_state; /* state of physical volume */ + char res1; /* reserved area on disk */ + s32 psn_part1; /* physical sector number of 1st pp */ + s16 pvnum_vgdas;/* number of vg descriptor areas */ + /* on the physical volume */ + s16 pv_num; /* PV number */ + u32 res2; /* reserved area on disk */ + +}; + +struct pp_entries { + s16 lv_index; /* index to lv pp is on */ + s16 res_1; /* reserved area on disk */ + u32 lp_num; /* log. part. number */ + char copy; /* the copy of the logical partition */ + /* that this pp is allocated for */ + char pp_state; /* current state of pp */ + char fst_alt_vol; /* pv where partition allocation for*/ + /* first mirror begins */ + char snd_alt_vol; /* pv where partition allocation for*/ + /* second mirror begins */ + s16 fst_alt_part; /* partition to begin first mirror */ + s16 snd_alt_part; /*partition to begin second mirror */ + u64 res_3; /* reserved area on disk */ + u64 res_4; /* reserved area on disk */ +}; + +struct namelist { + char name[LVM_MAXLVS][LVM_NAMESIZ]; +}; + +struct vg_trailer { + struct timestruc timestamp; /* time of last update */ + s16 concurrency; + /* MS Nibble = concurrent capable */ + /* LS Nibble = concurrent auto-varyon */ + s16 res_2; + s32 res_3; /* reserved area on disk */ + u64 res_4; /* reserved area on disk */ + u64 res_5; /* reserved area on disk */ +}; + diff -Naur linux-2002-09-30/include/linux/evms/evms_bbr_k.h evms-2002-09-30/include/linux/evms/evms_bbr_k.h --- linux-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Sep 25 15:04:22 2002 @@ -0,0 +1,226 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* linux/include/linux/evms/evms_bbr_k.h + * + * Kernel header file for Bad Block Relocation (BBR) Feature + * + * BBR feature is designed to remap I/O write failures to another safe location + * on disk. Note that most disk drives have BBR built into them, this means + * that our software BBR will be only activated when all hardware BBR + * replacement sectors have been used. + */ + +#ifndef __EVMS_BBR_K__ +#define __EVMS_BBR_K__ + +#define EVMS_BBR_VERSION_MAJOR 1 +#define EVMS_BBR_VERSION_MINOR 1 +#define EVMS_BBR_VERSION_PATCHLEVEL 1 + +#define EVMS_BBR_COMMON_SERVICES_MAJOR 0 +#define EVMS_BBR_COMMON_SERVICES_MINOR 6 +#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL 0 + +#define EVMS_BBR_FEATURE_ID 6 +#define EVMS_BBR_SIGNATURE 0x42627246 /* BbrF */ +#define EVMS_BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */ + +#define EVMS_BBR_ENTRIES_PER_SECT 31 +#define BBR_POOL_NAME_LENGTH 20 +#define BBR_STOP_REMAP (1<<0) +#define BBR_BH_USE_EVMS_CALLBACK (1<<0) + +/* BBR direct ioctl commands. 
+ * + * BBR_GET_INFO_CMD: Return total number of sectors that are currently + * remapped for the specified BBR object. + * BBR_STOP_REMAP_CMD: Stop remapping. Do not remap any new sectors or even + * honor any existing remaps for the specified BBR object + * until the next rediscover command is received. + * BBR_SECTOR_IO_CMD: Process an I/O from the engine directly through the + * specified BBR object in the kernel. + */ +#define BBR_GET_INFO_CMD 1 +#define BBR_STOP_REMAP_CMD 2 +#define BBR_SECTOR_IO_CMD 3 + +/** + * struct evms_bbr_table_entry + * @bad_sect: LBA of bad location. + * @replacement_sect: LBA of new location. + * + * Structure to describe one BBR remap. + */ +struct evms_bbr_table_entry { + u64 bad_sect; + u64 replacement_sect; +}; + +/** + * struct evms_bbr_table + * @signature: Signature on each BBR table sector. + * @crc: CRC for this table sector. + * @sequence_number: Used to resolve conflicts when primary and secondary + * tables do not match. + * @in_use_cnt: Number of in-use table entries. + * @entries: Actual table of remaps. + * + * Structure to describe each sector of the metadata table. Each sector in this + * table can describe 31 remapped sectors. + */ +struct evms_bbr_table { + u32 signature; + u32 crc; + u32 sequence_number; + u32 in_use_cnt; + struct evms_bbr_table_entry entries[EVMS_BBR_ENTRIES_PER_SECT]; +}; + +/** + * struct evms_bbr_metadata + * @signature: 0 EVMS_BBR_SIGNATURE + * @crc: 4 + * @block_size: 8 Block size in bytes. + * @flags: 12 Global flags used by BBR. + * @sequence_number: 16 + * @start_sect_bbr_table: 24 LBA of start of BBR table. + * @nr_sects_bbr_table: 32 Number of sectors in the BBR table. + * @start_replacement_sect: 40 LBA of start of replacement sectors. + * @nr_replacement_blks: 48 Number of replacement sectors. + * @pads: 56 + * + * On-disk metadata identifying an object as a BBR object. + */ +struct evms_bbr_metadata { + u32 signature; + u32 crc; + u32 block_size; + u32 flags; + u64 sequence_number; + u64 start_sect_bbr_table; + u64 nr_sects_bbr_table; + u64 start_replacement_sect; + u64 nr_replacement_blks; + u8 pads[456]; +}; + +/** + * struct evms_notify_bbr + * @object_name: Input - Name of BBR object from feature header. + * @count: Output - Number of remapped sectors. + * @start_sect: Input - Start sector for sector_io. + * @nr_sect: Input - Number of sectors for sector_io. + * @buffer: Input/Output - Pointer to data buffer for sector_io. + * @rw: Input - READ or WRITE for sector_io. + */ +struct evms_notify_bbr { + u8 object_name[EVMS_VOLUME_NAME_SIZE+1]; + u64 count; + u64 start_sect; + u64 nr_sect; + u8 * buffer; + s32 rw; +}; + +/** + * struct bbr_runtime_remap + * + * Node in the binary tree used to keep track of remaps. + */ +struct bbr_runtime_remap { + struct evms_bbr_table_entry remap; + struct bbr_runtime_remap * left; + struct bbr_runtime_remap * right; +}; + +/** + * struct bbr_private + * @next: List of all bbr_private structures. + * @node: Output node. + * @source: Consumed node. + * @bbr_table: Copy of metadata table. + * @lba_table1: LBA of primary BBR table. + * @lba_table2: LBA of secondary BBR table. + * @nr_sects_bbr_table: Size of each BBR table. + * @nr_replacement_blks: Number of replacement sectors. + * @start_replacement_sect: LBA of start of replacement sectors. + * @blksize_in_sects: Size of each sector. + * @in_use_replacement_blks: Current number of remaps. + * @remap_root: Binary tree containing all remaps. + * @bbr_id_lock: Lock for the binary tree. 
+ * @flags: BBR_STOP_REMAP + */ +struct bbr_private { + struct bbr_private * next; + struct evms_logical_node * node; + struct evms_logical_node * source; + struct evms_bbr_table * bbr_table; + u64 lba_table1; + u64 lba_table2; + u64 nr_sects_bbr_table; + u64 nr_replacement_blks; + u64 start_replacement_sect; + u32 blksize_in_sects; + atomic_t in_use_replacement_blks; + struct bbr_runtime_remap * remap_root; + spinlock_t bbr_id_lock; + u32 flag; +}; + +/** + * struct bbr_io_buffer + * @bbr_io_list: Thread's list of bbr_io_buf's. + * @bbr_id: Object for this request. + * @bh: Original buffer_head. + * @org_end_io: Saved callback address from original buffer_head. + * @org_private: Saved private data address from original buffer_head. + * @org_rsector: Saved sector value from original buffer_head. + * @org_dev: Saved b_rdev field from original buffer_head. + * @complete: Completion structure used by init_io. + * @rw: READ or WRITE. + * @rc: Return code from bbr_io_handler. + * + * Structure used to track each write request. + */ +struct bbr_io_buffer { + struct list_head bbr_io_list; + struct bbr_private * bbr_id; + struct buffer_head * bh; + void (* org_end_io)(struct buffer_head *bh, int uptodate); + void * org_private; + u64 org_rsector; + struct completion * complete; + kdev_t org_dev; + s32 rw; + s32 rc; +}; + +#ifdef EVMS_BBR_DEBUG +static void print_meta_data(struct evms_bbr_metadata * md); +static void print_bbr_table_sector(struct evms_bbr_table * bbr_table); +static void print_remap_list(struct bbr_private * bbr_id); +#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md) +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table) +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id) print_remap_list(bbr_id) +#else +#define BBR_DEBUG_PRINT_META_DATA(md) +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id) +#endif + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_drivelink.h evms-2002-09-30/include/linux/evms/evms_drivelink.h --- linux-2002-09-30/include/linux/evms/evms_drivelink.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_drivelink.h Fri Aug 16 16:43:11 2002 @@ -0,0 +1,125 @@ +/* -*- linux-c -*- */ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms_drvlink.h + * + * EVMS DriveLink Feature kernel header file + * + */ + +#ifndef __EVMS_DRIVELINK_INCLUDED__ +#define __EVMS_DRIVELINK_INCLUDED__ + +#define EVMS_DRIVELINK_FEATURE_ID 1 +#define EVMS_DRIVELINK_SIGNATURE 0x4C767244 //DrvL +#define EVMS_DRIVELINK_MAX_ENTRIES 60 + +/* + * feature data version defines + */ +#define DRIVELINK_METADATA_MAJOR 2 +#define DRIVELINK_METADATA_MINOR 0 +#define DRIVELINK_METADATA_PATCHLEVEL 0 + +static struct evms_version metadata_ver = { + .major = DRIVELINK_METADATA_MAJOR, + .minor = DRIVELINK_METADATA_MINOR, + .patchlevel = DRIVELINK_METADATA_PATCHLEVEL +}; + +/** + * struct evms_dl_ordering_table_entry - ordering table entry structure definition + * @child_sn: child serial number + * @child_size: in sectors + * + * ordering table entry struction definition + **/ +struct evms_dl_ordering_table_entry { + u64 child_serial_number; + u64 child_vsize; +}; + +/** + * struct evms_drivelink_metadata - on-disk metadata definition + * @signature: drivelink metadata magic number + * @crc: crc of entire structure + * @version: drivelink metadata version + * @flags: + * @sequence_number: used to determine most recent redundant data + * @child_sn: child object serial number + * @parent_sn: parent object serial number + * @child_count: count of child objects of parent + * @pad: used for alignment of following table + * @ordering_table: table of child ordering entries + * + * drivelink on-disk metadata definition + **/ +struct evms_drivelink_metadata { + u32 signature; + u32 crc; + struct evms_version version; + u32 flags; + u64 sequence_number; + u64 child_serial_number; + u64 parent_serial_number; + u64 child_count; + u64 pad; + struct evms_dl_ordering_table_entry + ordering_table[EVMS_DRIVELINK_MAX_ENTRIES]; +}; + +#ifdef __KERNEL__ +/** + * struct runtime_entry - in-memory metadata entry description + * @block_size: largest block size of all children + * @voffset: relative offset of child object within parent object (in 512 byte units) + * @vsize: child object size (in 512 byte units) + * @child_node: child storage object + * @child_metadata: child's on-disk metadata + * + * drivelink's in-memory metadata entry description + **/ +struct runtime_entry { + u64 block_size; + u64 voffset; + u64 vsize; + struct evms_logical_node *child_node; + struct evms_drivelink_metadata *child_metadata; +}; + +/** + * struct runtime_data - in-memory metadata description + * @block_size: largest block size of all children + * @voffset: relative offset of child object within parent object (in 512 byte units) + * @vsize: child object size (in 512 byte units) + * @child_node: child storage object + * @child_metadata: child's on-disk metadata + * + * drivelink's in-memory metadata description + **/ +struct runtime_data { + u64 block_size; + u64 parent_sn; + u64 child_count; + struct runtime_entry *child_table; +}; +#endif + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_ecr.h evms-2002-09-30/include/linux/evms/evms_ecr.h --- linux-2002-09-30/include/linux/evms/evms_ecr.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_ecr.h Fri Aug 16 16:19:56 2002 @@ -0,0 +1,107 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or 
modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +/* + * linux/include/linux/evms_ecr.h + * + * EVMS Cluster enablement kernel header file + * + */ + +#ifndef __EVMS_ECR__ + +#define __EVMS_ECR__ + +#define ECR_SUCCESS 0 +#define ECR_FAIL -1 + +/* + * Beginning of group messaging API + */ +typedef int ecr_group_t; +typedef int ecr_nodeid_t; +typedef void ecr_cred_t; +typedef void ecr_instance_t; +typedef void ecr_message_t; + +typedef enum ecr_type_s { + ECR_GROUP_START, /* 0th entry is reserved */ + ECR_P2P, /* Point to Point message type */ + ECR_BROADCAST, /* Broadcast message type */ + ECR_ATOMIC_EXECUTE, /* Atomic execute type */ + ECR_GROUP_LAST /* Just a last enum type, not a message type */ +} ecr_type_t; + +typedef struct ecr_table_s { + void (*join) (ecr_nodeid_t, uint, ecr_nodeid_t *, ecr_instance_t *); + int (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *); + void (*leave) (ecr_nodeid_t, ecr_instance_t *); + void (*recover)(ecr_nodeid_t, ecr_instance_t *); + void (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t, + void *, size_t, ecr_instance_t *); + void (*vol_leave)(ecr_nodeid_t, ecr_instance_t *); +} ecr_table_t; + + +#define ECR_GROUPNAME_MAX_SIZE NAME_SIZE /* maximum size of a group name */ + +ecr_group_t ecr_group_join(char *, ecr_table_t *, ecr_cred_t *, size_t, + ecr_instance_t *); +void ecr_group_leave(ecr_group_t); +int ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t, + ecr_instance_t *, + void callback(int, ecr_instance_t *)); +int ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t, + int *); +int ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *, + void callback(u_char, ecr_instance_t *)); +int ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *); +int ecr_group_atomic_execute(ecr_group_t, void *, size_t, + ecr_instance_t *, + void callback(ecr_instance_t *)); +int ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t); +void ecr_group_success_response(ecr_message_t *); +void ecr_group_failure_response(ecr_message_t *, int); + + + +/* + * Beginning of distributed lock API + */ + +typedef int ecr_lock_t; +typedef enum ecr_lock_mode_s { + ECR_LOCK_START, /* 0th entry is reserved */ + ECR_LOCK_CONCURRENT, /* concurrent access */ + ECR_LOCK_EXCLUSIVE, /* exclusive access */ + ECR_LOCK_LAST /* Just a last enum type, not a lock type */ +} ecr_lock_mode_t; + +typedef u_char ecr_mode_t; + + +#define ECR_LOCKNAME_MAX_SIZE NAME_SIZE /* maximum size of a lock name */ +#define ECR_BLOCK 1 /* waitflag set */ + +ecr_lock_t ecr_lock_create(char * /* lock name */); +int ecr_lock(ecr_lock_t, u64, u64, ecr_lock_mode_t, + u_char /*waitflag*/); +int ecr_unlock(ecr_lock_t, u64, u64); + +#endif /* __EVMS_ECR__ */ diff -Naur linux-2002-09-30/include/linux/evms/evms_ioctl.h evms-2002-09-30/include/linux/evms/evms_ioctl.h --- linux-2002-09-30/include/linux/evms/evms_ioctl.h Wed Dec 31 18:00:00 1969 +++ 
evms-2002-09-30/include/linux/evms/evms_ioctl.h Thu Sep 26 11:55:45 2002 @@ -0,0 +1,516 @@ +/* -*- linux-c -*- */ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms.h + * + * EVMS public kernel header file + * + */ + +#ifndef __EVMS_IOCTL_INCLUDED__ +#define __EVMS_IOCTL_INCLUDED__ + +#include + +/* IOCTL interface version definitions */ +#define EVMS_IOCTL_INTERFACE_MAJOR 11 +#define EVMS_IOCTL_INTERFACE_MINOR 3 +#define EVMS_IOCTL_INTERFACE_PATCHLEVEL 0 + +/* IOCTL definitions */ +enum evms_ioctl_cmds { + /* version commands */ + EVMS_GET_IOCTL_VERSION_NUMBER = 0, + EVMS_GET_VERSION_NUMBER, +#ifdef __KERNEL__ + /* EVMS internal commands */ + EVMS_GET_DISK_LIST_NUMBER = 0x40, + EVMS_CHECK_MEDIA_CHANGE_NUMBER, + EVMS_REVALIDATE_DISK_NUMBER, + EVMS_OPEN_VOLUME_NUMBER, + EVMS_CLOSE_VOLUME_NUMBER, + EVMS_QUIESCE_VOLUME_NUMBER, + EVMS_CHECK_DEVICE_STATUS_NUMBER, + EVMS_UPDATE_DEVICE_INFO_NUMBER, +#endif + /* configuration commands */ + EVMS_GET_INFO_LEVEL_NUMBER = 0x80, + EVMS_SET_INFO_LEVEL_NUMBER, + EVMS_REDISCOVER_VOLUMES_NUMBER, + EVMS_DELETE_VOLUME_NUMBER, + EVMS_PLUGIN_IOCTL_NUMBER, + EVMS_PROCESS_NOTIFY_EVENT_NUMBER, + /* query info commands */ + EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0, + EVMS_GET_LOGICAL_DISK_INFO_NUMBER, + EVMS_SECTOR_IO_NUMBER, + EVMS_GET_MINOR_NUMBER, + EVMS_GET_VOLUME_DATA_NUMBER, + EVMS_GET_PLUGIN_NUMBER, + EVMS_COMPUTE_CSUM_NUMBER, + EVMS_GET_BMAP_NUMBER, + EVMS_CHECK_MOUNT_STATUS_NUMBER, + EVMS_CHECK_OPEN_STATUS_NUMBER, + /* commands for non-EVMS apps */ + EVMS_GET_VOL_STRIPE_INFO_NUMBER = 0xF0, +}; + +/* version commands */ +#define EVMS_GET_IOCTL_VERSION_STRING "EVMS_GET_IOCTL_VERSION" +#define EVMS_GET_IOCTL_VERSION _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, struct evms_version) + +#define EVMS_GET_VERSION_STRING "EVMS_GET_VERSION" +#define EVMS_GET_VERSION _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, struct evms_version) + +#ifdef __KERNEL__ + +/* EVMS internal commands */ +#define EVMS_GET_DISK_LIST_STRING "EVMS_GET_DISK_LIST" +#define EVMS_GET_DISK_LIST _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, struct evms_list_node **) + +#define EVMS_CHECK_MEDIA_CHANGE_STRING "EVMS_CHECK_MEDIA_CHANGE" +#define EVMS_CHECK_MEDIA_CHANGE _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER) + +#define EVMS_REVALIDATE_DISK_STRING "EVMS_REVALIDATE_DISK" +#define EVMS_REVALIDATE_DISK _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER) + +#define EVMS_OPEN_VOLUME_STRING "EVMS_OPEN_VOLUME" +#define EVMS_OPEN_VOLUME _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER) + +#define EVMS_CLOSE_VOLUME_STRING "EVMS_CLOSE_VOLUME" +#define EVMS_CLOSE_VOLUME _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER) + +/** + * struct evms_quiesce_vol_pkt - ioctl packet definition + * @command: 0 = unquiesce, 1 = quiesce + * @minor: minor device number of target 
volume + * @do_vfs: 0 = do nothing, 1 = also perform equivalent VFS operation + * @status: returned operation status + * + * ioctl packet definition for EVMS_QUIESCE_VOLUME + **/ +struct evms_quiesce_vol_pkt { + s32 command; + s32 minor; + s32 do_vfs; + s32 status; +}; +/** + * defines for evms_quiesce_vol_pkt.command field + **/ +#define EVMS_UNQUIESCE 0 +#define EVMS_QUIESCE 1 +/** + * defines for evms_quiesce_vol_pkt.do_vfs field + * located below struct evms_delete_vol_pkt definition + **/ + +#define EVMS_QUIESCE_VOLUME_STRING "EVMS_QUIESCE_VOLUME" +#define EVMS_QUIESCE_VOLUME _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, struct evms_quiesce_vol_pkt) + +#define EVMS_CHECK_DEVICE_STATUS_STRING "EVMS_CHECK_DEVICE_STATUS" +#define EVMS_CHECK_DEVICE_STATUS _IOR(EVMS_MAJOR, EVMS_CHECK_DEVICE_STATUS_NUMBER, int) + +#define EVMS_UPDATE_DEVICE_INFO_STRING "EVMS_UPDATE_DEVICE_INFO" +#define EVMS_UPDATE_DEVICE_INFO _IO(EVMS_MAJOR, EVMS_UPDATE_DEVICE_INFO_NUMBER) + +#endif + +/* configuration commands */ +#define EVMS_GET_INFO_LEVEL_STRING "EVMS_GET_INFO_LEVEL" +#define EVMS_GET_INFO_LEVEL _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int) + +#define EVMS_SET_INFO_LEVEL_STRING "EVMS_SET_INFO_LEVEL" +#define EVMS_SET_INFO_LEVEL _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int) + +/** + * struct evms_rediscover_pkt - rediscover volume ioctl packet definition + * @status: return operation status + * @drive_count: count of drives being probed, 0xffffffff for all disks + * @drive_array: array of drive handles to be probed + * + * ioctl packet definition for EVMS_REDISCOVER_VOLUMES ioctl + **/ +struct evms_rediscover_pkt { + s32 status; + u32 drive_count; + u64 *drive_array; +}; +/** + * defines for evms_delete_vol_pkt.command field + **/ +#define EVMS_SOFT_DELETE 0 +#define EVMS_HARD_DELETE 1 +/** + * defines evms_rediscover_pkt.drive_count field + **/ +#define REDISCOVER_ALL_DEVICES 0xFFFFFFFF + +#define EVMS_REDISCOVER_VOLUMES_STRING "EVMS_REDISCOVER_VOLUMES" +#define EVMS_REDISCOVER_VOLUMES _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, struct evms_rediscover_pkt) + +/* field: command: defines */ + +/** + * struct evms_delete_vol_pkt - delete volume ioctl packet definition + * @command: 0 = soft delete, 1 = hard delete + * @minor: minor device num of target volume + * @do_vfs: 0 = do nothing, 1 = perform VFS operation(s) + * @associative_minor: optional minor device num of associative volume, 0 when unused + * @author returned operation status + * + * ioctl packet definition for EVMS_DELETE_VOLUME ioctl + **/ +struct evms_delete_vol_pkt { + s32 command; + s32 minor; + s32 do_vfs; + s32 associative_minor; + s32 status; +}; +/** + * field evms_delete_vol_pkt defines + * @EVMS_VFS_DO_NOTHING: + * @EVMS_VFS_DO: + * + * NOTE: these defines are also used with evms_quiesce_vol_pkt. 
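+ *
+ * As an illustrative sketch only (not taken from the EVMS engine
+ * sources), a user-space caller could request a soft delete roughly
+ * like this, where evms_fd and target_minor are assumed to be
+ * supplied by the caller:
+ *
+ *	struct evms_delete_vol_pkt pkt = {
+ *		.command = EVMS_SOFT_DELETE,
+ *		.minor = target_minor,
+ *		.do_vfs = EVMS_VFS_DO_NOTHING,
+ *		.associative_minor = 0,
+ *	};
+ *
+ *	ioctl(evms_fd, EVMS_DELETE_VOLUME, &pkt);
+ *
+ * On return, pkt.status carries the operation result.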
+ **/ +#define EVMS_VFS_DO_NOTHING 0 +#define EVMS_VFS_DO 1 + +#define EVMS_DELETE_VOLUME_STRING "EVMS_DELETE_VOLUME" +#define EVMS_DELETE_VOLUME _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, struct evms_delete_vol_pkt) + +/** + * struct evms_plugin_ioctl_pkt - generic plugin ioctl packet definition + * @feature_id: plugin ID of feature to receive this ioctl + * @feature_command: feature specific ioctl command + * @status: 0 = completed, 0 != error + * @feature_ioctl_data: ptr to feature specific ioctl struct + * + * ioctl packet definition for EVMS_PLUGIN_IOCTL ioctl + **/ +struct evms_plugin_ioctl_pkt { + ulong feature_id; + s32 feature_command; + s32 status; + void *feature_ioctl_data; +}; + +#define EVMS_PLUGIN_IOCTL_STRING "EVMS_PLUGIN_IOCTL" +#define EVMS_PLUGIN_IOCTL _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, struct evms_plugin_ioctl_pkt) + +/** + * struct evms_event - evms event structure + * @pid: PID to act on + * @eventid: event id to respond to + * @signo: signal # to send when event occurs + * + * contains process event notification info + **/ +struct evms_event { + s32 pid; + s32 eventid; + s32 signo; +}; +/** + * field evms_event_pkt.eventid defines + **/ +#define EVMS_EVENT_END_OF_DISCOVERY 0 + +/** + * struct evms_notify_pkt - evms event notification ioctl packet definition + * @command: 0 = unregister, 1 = register + * @eventry: event structure + * @status: returned operation status + * + * ioctl packet definition for EVMS_PROCESS_NOTIFY_EVENT ioctl + **/ +struct evms_notify_pkt { + s32 command; + struct evms_event eventry; + s32 status; +}; +/** + * field evms_notify_pkt.command defines + **/ +#define EVMS_EVENT_UNREGISTER 0 +#define EVMS_EVENT_REGISTER 1 + +#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT" +#define EVMS_PROCESS_NOTIFY_EVENT _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, struct evms_notify_pkt) + +/* query info commands */ + +/** + * struct evms_user_disk_pkt - get disk handle ioctl packet definition + * @command: 0 = first disk, 1 = next disk + * @status: 0 = no more disks, 1 = valid disk info + * @disk_handle: only valid when status == 1 + * + * ioctl packet definition for EVMS_GET_LOGICAL_DISK ioctl + **/ +struct evms_user_disk_pkt { + s32 command; + s32 status; + u64 disk_handle; +}; +/** + * field evms_user_disk_pkt.command defines + **/ +#define EVMS_FIRST_DISK 0 +#define EVMS_NEXT_DISK 1 +/** + * field evms_user_disk_pkt.status defines + **/ +#define EVMS_DISK_INVALID 0 +#define EVMS_DISK_VALID 1 + +#define EVMS_GET_LOGICAL_DISK_STRING "EVMS_GET_LOGICAL_DISK" +#define EVMS_GET_LOGICAL_DISK _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, struct evms_user_disk_pkt) + +/** + * evms_user_disk_info_pkt - disk info packet definition + * @status: return operation status + * @flags: device characteristics + * @disk_handle: kernel handle to specified device + * @disk_dev: kernel device info, used by MD plugin + * @geometry: reported device geometry + * @block_size: reported block size + * @hardsect_size: reported physical sector size + * @total_vsectors: size of device in 512 byte units + * @disk_name: legacy name for the device + * + * ioctl packet definition for EVMS_GET_LOGICAL_DISK_INFO ioctl + **/ +struct evms_user_disk_info_pkt { + u32 status; + u32 flags; + u64 disk_handle; + u32 disk_dev; + u32 geo_sectors; + u32 geo_heads; + u64 geo_cylinders; + u32 block_size; + u32 hardsect_size; + u64 total_sectors; + u8 disk_name[EVMS_VOLUME_NAME_SIZE + 1]; +}; +/** + * field evms_user_disk_info_pkt.flags define in evms.h + **/ + +#define 
EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO" +#define EVMS_GET_LOGICAL_DISK_INFO _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, struct evms_user_disk_info_pkt) + +/** + * struct evms_sector_io_pkt - sector io ioctl packet definition + * @disk_handle: disk handle of target device + * @io_flag: 0 = read, 1 = write + * @starting_sector: disk relative starting sector + * @sector_count: count of sectors + * @buffer_address: user buffer address + * @status: return operation status + * + * ioctl packet definition for EVMS_SECTOR_IO ioctl + **/ +struct evms_sector_io_pkt { + u64 disk_handle; + s32 io_flag; + u64 starting_sector; + u64 sector_count; + u8 *buffer_address; + s32 status; +}; +/** + * field evms_sector_io_pkt.io_flag defines + **/ +#define EVMS_SECTOR_IO_READ 0 +#define EVMS_SECTOR_IO_WRITE 1 + +#define EVMS_SECTOR_IO_STRING "EVMS_SECTOR_IO" +#define EVMS_SECTOR_IO _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, struct evms_sector_io_pkt) + +/** + * struct evms_user_minor_pkt - get a list of device minors, one at a time + * @command: 0 = first volume, 1 = next volume + * @status: returned operation status + * @minor: returned minor number, only valid when status == 1 + * + * ioctl packet definition for EVMS_GET_MINOR ioctl + **/ +struct evms_user_minor_pkt { + s32 command; + s32 status; + s32 minor; +}; +/** + * field evms_user_minor_pkt.command defines + **/ +#define EVMS_FIRST_VOLUME 0 +#define EVMS_NEXT_VOLUME 1 +/** + * field evms_user_minor_pkt.status defines + **/ +#define EVMS_VOLUME_INVALID 0 +#define EVMS_VOLUME_VALID 1 + +#define EVMS_GET_MINOR_STRING "EVMS_GET_MINOR" +#define EVMS_GET_MINOR _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, struct evms_user_minor_pkt) + +/** + * struct evms_volume_data_pkt - volume data packet definition + * @minor: minor device number of target volume + * @flags: returned volume characteristics + * @volume_name: returned volume name + * @status: returned operation status + * + * ioctl packet definition for EVMS_GET_VOLUME_DATA ioctl + **/ +struct evms_volume_data_pkt { + s32 minor; + s32 flags; + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1]; + s32 status; +}; +/** + * field evms_volume_data_pkt.flags defines found in evms_common.h + **/ + +#define EVMS_GET_VOLUME_DATA_STRING "EVMS_GET_VOLUME_DATA" +#define EVMS_GET_VOLUME_DATA _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, struct evms_volume_data_pkt) + +/** + * struct evms_kernel_plugin_pkt - get kernel plugin ioctl packet definition + * @command: 0 = first plugin, 1 = next plugin + * @id: returned plugin id + * @version: returned plugin version info + * @status: returned operation status + * + * ioctl packet definition for EVMS_GET_PLUGIN ioctl + **/ +struct evms_kernel_plugin_pkt { + s32 command; + u32 id; + struct evms_version version; + s32 status; +}; +/** + * field evms_kernel_plugin_pkt.command defines + **/ +#define EVMS_FIRST_PLUGIN 0 +#define EVMS_NEXT_PLUGIN 1 +/** + * field evms_kernel_plugin_pkt.status defines + **/ +#define EVMS_PLUGIN_INVALID 0 +#define EVMS_PLUGIN_VALID 1 + +#define EVMS_GET_PLUGIN_STRING "EVMS_GET_PLUGIN" +#define EVMS_GET_PLUGIN _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, struct evms_kernel_plugin_pkt) + +/** + * struct evms_compute_csum_pkt - compute checksum ioctl packet definition + * @buffer_address: + * @buffer_size: + * @insum: + * @outsum: + * @status: + * + * ioctl packet definition for EVMS_COMPUTE_CSUM ioctl + **/ +struct evms_compute_csum_pkt { + u8 *buffer_address; + s32 buffer_size; + u32 insum; + u32 outsum; + s32 status; +}; + +#define 
EVMS_COMPUTE_CSUM_STRING "EVMS_COMPUTE_CSUM" +#define EVMS_COMPUTE_CSUM _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, struct evms_compute_csum_pkt) + +/** + * struct evms_get_bmap_pkt - get bmap data ioctl packet definition + * @rsector: input, volume relative rsector value + * output, disk relative rsector value + * @dev output, physical device + * @status: output, operation status + * + * ioctl packet definition for EVMS_GET_BMAP ioctl + **/ +struct evms_get_bmap_pkt { + u64 rsector; + u32 dev; + s32 status; +}; + +#define EVMS_GET_BMAP_STRING "EVMS_GET_BMAP" +#define EVMS_GET_BMAP _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, struct evms_get_bmap_pkt) + +/** + * struct evms_mount_status_pkt - ioctl packet definition + * @minor: input, minor of volume to check + * @mounted: output, TRUE if mounted, FALSE if not + * @status: output, operation completion status + * + * ioctl packet definition for EVMS_CHECK_MOUNT_STATUS ioctl. + **/ +struct evms_mount_status_pkt { + u32 minor; + u32 mounted; + s32 status; +}; + +#define EVMS_CHECK_MOUNT_STATUS_STRING "EVMS_CHECK_MOUNT_STATUS" +#define EVMS_CHECK_MOUNT_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_MOUNT_STATUS_NUMBER, struct evms_mount_status_pkt) + +/** + * struct evms_open_status_pkt - ioctl packet definition + * @minor: input, minor of volume to check + * @opens: output, 0 (FALSE) if not, count (TRUE) of opens + * @status: output, operation completion status + * + * ioctl packet definition for EVMS_CHECK_OPEN_STATUS ioctl. + **/ +struct evms_open_status_pkt { + u32 minor; + u32 opens; + s32 status; +}; + +#define EVMS_CHECK_OPEN_STATUS_STRING "EVMS_CHECK_OPEN_STATUS" +#define EVMS_CHECK_OPEN_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_OPEN_STATUS_NUMBER, struct evms_open_status_pkt) + +/** + * struct evms_vol_stripe_info_pkt - ioctl packet definition + * @size: the stripe unit specified in 512 byte block units + * @width: the number of stripe members or RAID data disks + * + * ioctl packet definition for EVMS_GET_VOL_STRIPE_INFO ioctl. 
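+ *
+ * As a sketch of intended use (not code from this file), a mkfs-style
+ * tool could derive its allocation unit from this packet; vol_fd is
+ * assumed to be an open descriptor on the EVMS volume:
+ *
+ *	struct evms_vol_stripe_info_pkt info;
+ *
+ *	if (ioctl(vol_fd, EVMS_GET_VOL_STRIPE_INFO, &info) == 0) {
+ *		unsigned long stripe_unit_bytes = info.size * 512;
+ *		unsigned long full_stripe_bytes = stripe_unit_bytes * info.width;
+ *	}
+ *
+ * so that on-disk structures can be aligned to a full stripe.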
+ **/ +struct evms_vol_stripe_info_pkt { + u32 size; + u32 width; +}; + +#define EVMS_GET_VOL_STRIPE_INFO_STRING "EVMS_GET_VOL_STRIPE_INFO" +#define EVMS_GET_VOL_STRIPE_INFO _IOR(EVMS_MAJOR, EVMS_GET_VOL_STRIPE_INFO_NUMBER, struct evms_vol_stripe_info_pkt) +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_linear.h evms-2002-09-30/include/linux/evms/evms_linear.h --- linux-2002-09-30/include/linux/evms/evms_linear.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_linear.h Tue Aug 6 01:03:24 2002 @@ -0,0 +1,33 @@ +#ifndef __EVMS_LINEAR_H +#define __EVMS_LINEAR_H + +#include + +struct dev_info { + struct evms_logical_node *node; + kdev_t dev; + unsigned long size; + unsigned long offset; +}; + +typedef struct dev_info dev_info_t; + +struct linear_hash +{ + dev_info_t *dev0, *dev1; +}; + +struct linear_private_data +{ + struct linear_hash *hash_table; + dev_info_t disks[MD_SB_DISKS]; + dev_info_t *smallest; + int nr_zones; +}; + + +typedef struct linear_private_data linear_conf_t; + +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private) + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_lvm.h evms-2002-09-30/include/linux/evms/evms_lvm.h --- linux-2002-09-30/include/linux/evms/evms_lvm.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_lvm.h Mon Aug 26 10:01:08 2002 @@ -0,0 +1,479 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms_lvm.h + * + * EVMS LVM VGE kernel header file + */ + +#ifndef __EVMS_LVM_H__ +#define __EVMS_LVM_H__ + +#define EVMS_LVM_VERSION_MAJOR 1 +#define EVMS_LVM_VERSION_MINOR 1 +#define EVMS_LVM_VERSION_PATCH 1 + +/* The following definitions and data structures are copied from lvm.h and + * liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format + * changed in beta8, lvm.h changed significantly enough that this module would + * no longer compile. Instead of requiring evms users to install the latest lvm + * release, the required definitions and data structures will now be included + * in this header file. + */ + +#define MAX_VG 99 +#define MAX_LV 256 +#define MAX_PV 256 +#define NAME_LEN 128 +#define UUID_LEN 32 +#define LVM_VGDA_ALIGN 4096UL +#define LVM_PV_DISK_BASE 0L +#define LVM_PV_DISK_SIZE 1024L +#define LVM_VG_DISK_BASE round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, \ + LVM_VGDA_ALIGN) +#define LVM_VG_DISK_SIZE (8*512L) + +/* + * Status flags + */ +/* lv->lv_status */ +#define LV_ACTIVE 0x01 +/* lv->lv_access */ +#define LV_READ 0x01 +#define LV_WRITE 0x02 +#define LV_SNAPSHOT 0x04 +#define LV_SNAPSHOT_ORG 0x08 + +/** + * struct lv_COW_table_disk_v1 + * @pv_org_number: + * @pv_org_rsector: + * @pv_snap_number: + * @pv_snap_rsector: + * + * Copy-On-Write tables in disk format (version 1). 
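+ *
+ * Each entry records that the chunk starting at sector @pv_org_rsector
+ * on PV number @pv_org_number has been copied to sector
+ * @pv_snap_rsector on PV number @pv_snap_number of the snapshot. A
+ * remap lookup over an in-memory copy of one table sector might look
+ * like this (sketch only; cow, n, pv_num and chunk_start are
+ * illustrative names):
+ *
+ *	for (i = 0; i < n; i++)
+ *		if (cow[i].pv_org_number == pv_num &&
+ *		    cow[i].pv_org_rsector == chunk_start)
+ *			return cow[i].pv_snap_rsector;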
+ **/ +struct lv_COW_table_disk { + u64 pv_org_number; + u64 pv_org_rsector; + u64 pv_snap_number; + u64 pv_snap_rsector; +}; + +/** + * struct pe_disk + * @lv_num: + * @le_num: + * + * Disk stored PE map entry definition. + **/ +struct pe_disk { + u16 lv_num; + u16 le_num; +}; + +/** + * struct lvm_disk_data + * @base: + * @size: + * + * Disk stored PV, VG, LV and PE size and offset information. + */ +struct lvm_disk_data { + u32 base; + u32 size; +}; + +/** + * struct pv_disk + * @id: + * @version: + * @pv_on_disk: + * @vg_on_disk: + * @pv_uuidlist_on_disk: + * @lv_on_disk: + * @pe_on_disk: + * @pv_uuid: + * @vg_name: + * @system_id: used by vgexport/vgimport + * @pv_major: + * @pv_number: + * @pv_status: + * @pv_allocatable: + * @pv_size: + * @lv_cur + * @pe_size: + * @pe_total: + * @pe_allocated: + * @pe_start: in sectors (new in version 2) + * + * Physical volume on disk metadata definition (version 2). + */ +struct pv_disk { + u8 id[2]; + u16 version; + struct lvm_disk_data pv_on_disk; + struct lvm_disk_data vg_on_disk; + struct lvm_disk_data pv_uuidlist_on_disk; + struct lvm_disk_data lv_on_disk; + struct lvm_disk_data pe_on_disk; + u8 pv_uuid[NAME_LEN]; + u8 vg_name[NAME_LEN]; + u8 system_id[NAME_LEN]; + u32 pv_major; + u32 pv_number; + u32 pv_status; + u32 pv_allocatable; + u32 pv_size; + u32 lv_cur; + u32 pe_size; + u32 pe_total; + u32 pe_allocated; + u32 pe_start; +}; + +/** + * struct lv_disk + * @lv_name: + * @vg_name: + * @lv_access: + * @lv_status: + * @lv_open: + * @lv_dev: + * @lv_number: + * @lv_mirror_copies: + * @lv_recovery: + * @lv_schedule: + * @lv_size: + * @lv_snapshot_minor: minor number of original + * @lv_chunk_size: chuck size for snapshots + * @lv_dummy: + * @lv_allocated_le: + * @lv_stripes: + * @lv_stripesize: + * @lv_badblock: + * @lv_allocated: + * @lv_io_timeout: + * @lv_read_ahead: + * + * Logical volume metadata definition (version 3). + */ +struct lv_disk { + u8 lv_name[NAME_LEN]; + u8 vg_name[NAME_LEN]; + u32 lv_access; + u32 lv_status; + u32 lv_open; + u32 lv_dev; + u32 lv_number; + u32 lv_mirror_copies; + u32 lv_recovery; + u32 lv_schedule; + u32 lv_size; + u32 lv_snapshot_minor; + u16 lv_chunk_size; + u16 dummy; + u32 lv_allocated_le; + u32 lv_stripes; + u32 lv_stripesize; + u32 lv_badblock; + u32 lv_allocation; + u32 lv_io_timeout; + u32 lv_read_ahead; +}; + +/** + * struct vg_disk + * @vg_uuid: Volume group UUID + * @vg_name_dummy: Remainder of version 1 VG name + * @vg_number: Volume group number + * @vg_access: Read/Write + * @vg_status: Active or not + * @lv_max: Maximum logical volumes + * @lv_cur: Current logical volumes + * @lv_open: Open logical volumes + * @pv_max: Maximum physical volumes + * @pv_cur: Current physical volumes + * @pv_act: Active physical volumes + * @dummy: + * @vgda: Volume group descriptor arrays + * @pe_size: Physical extent size in sectors + * @pe_total: Total of physical extents + * @pe_allocated: Allocated physical extents + * @pvg_total: Physical volume groups + * + * Volume group metadata definition (version 2). 
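+ *
+ * For orientation only (a sketch, not code from the LVM sources), the
+ * raw and free data capacity of the group in sectors follow directly
+ * from the extent counters, assuming vg points at a vg_disk read from
+ * the VGDA:
+ *
+ *	u64 vg_sectors   = (u64) vg->pe_size * vg->pe_total;
+ *	u64 free_sectors = (u64) vg->pe_size * (vg->pe_total - vg->pe_allocated);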
+ */ +struct vg_disk { + u8 vg_uuid[UUID_LEN]; + u8 vg_name_dummy[NAME_LEN - UUID_LEN]; + u32 vg_number; + u32 vg_access; + u32 vg_status; + u32 lv_max; + u32 lv_cur; + u32 lv_open; + u32 pv_max; + u32 pv_cur; + u32 pv_act; + u32 dummy; + u32 vgda; + u32 pe_size; + u32 pe_total; + u32 pe_allocated; + u32 pvg_total; +}; + +/* Useful inlines */ +static inline ulong round_up(ulong n, ulong size) +{ + size--; + return (n + size) & ~size; +} + +static inline ulong div_up(ulong n, ulong size) +{ + return round_up(n, size) / size; +} + +/* End of lvm.h imported data structures. */ + +#define DEV_DIRECTORY "/dev/" +#define LVM_DEV_DIRECTORY "lvm/" +#define LVM_PROC_NAME "lvm" +#define LVM_PROC_VG_NAME "VGs" +#define LVM_PROC_LV_NAME "LVs" +#define LVM_PROC_PV_NAME "PVs" +#define LVM_PROC_GLOBAL_NAME "global" +#define IO_BUFFER_SECTORS 8 + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9) +#define max(a,b) (((a)>(b))?(a):(b)) +#endif + +/* Structure for doing PV remove ioctls. */ + +#define EVMS_LVM_PV_REMOVE_IOCTL 0x01 +#define EVMS_LVM_SNAPSHOT_STAT_IOCTL 0x02 + +/** + * struct lvm_pv_remove_ioctl + * @vg_uuid: Volume group UUID + * @pv_number: Physical volume number + * @next: Link to next packet (engine-use only) + * + * PV remove ioctl packet definition. + */ +struct lvm_pv_remove_ioctl { + u8 vg_uuid[UUID_LEN]; + u32 pv_number; + struct lvm_pv_remove_ioctl * next; +}; + +/** + * struct lvm_snapshot_stat_ioctl + * @vg_uuid: Volume group UUID + * @lv_number: Logical volume number + * @next_free_chuck: + * @lv_status: + * + * Snapshot statistics ioctl packet definition. + **/ +struct lvm_snapshot_stat_ioctl { + u8 vg_uuid[UUID_LEN]; + u32 lv_number; + u64 next_free_chunk; + u32 lv_status; +}; + +/** + * struct lvm_physical_volume + * @logical_node: Storage object + * @pv: Copy of on-disk PV struct + * @pe_map: + * @pv_number: + * @next: Pointer to next entry + * + * Entries in the list of physical volumes (PV) in a volume group (VG). + */ +struct lvm_physical_volume { + struct evms_logical_node * logical_node; + struct pv_disk * pv; + struct pe_disk * pe_map; + u32 pv_number; + struct lvm_physical_volume * next; +}; + +/** + * struct le_table_entry + * @owning_pv: + * @pe_sector_offset: + * + * Table entry definition for mapping logical + * extents (LE) to physical extents (PE). + */ +struct le_table_entry { + struct lvm_physical_volume * owning_pv; + u64 pe_sector_offset; +}; + +/** + * struct snapshot_map_entry + * @org_sector: + * @snap_sector: + * @snap_pv: + * @next: + * @prev: + * + * Snapshot remapping entry structure definition. + */ +struct snapshot_map_entry { + u64 org_sector; + u64 snap_sector; + struct lvm_physical_volume * snap_pv; + struct snapshot_map_entry * next; + struct snapshot_map_entry * prev; +}; + +#define MAX_HASH_CHAIN_ENTRIES 10 +#define CHUNK_DATA_BUFFER_SIZE 128 + +/** + * struct lvm_logical_volume + * @lv_number: + * @lv_size: In sectors + * @lv_access: Flags: LV_READ, LV_WRITE, LV_SNAPSHOT, + * LV_SNAPSHOT_ROG, EVMS_LV* + * @lv_status: Flags: LV_ACTIVE, LV_SPINDOWN + * @lv_minor: Device minor number + * @stripes: + * @stripe_size: In sectors + * @stripe_size_shift: # of bits to shift right instead of dividing by stripe_size + * @pe_size: In sectors + * @pe_size_shift: Number of bits to shift right instead of dividing by pe_size + * @num_le: Number of entries in the le-to-pe map + * @group: Pointer back to parent volume group + * @name: Dev-tree volume name (eg. 
/dev/group0/vol0) + * @le_map: Mapping of logical to physical extents + * @volume_node: Pointer to parent EVMS object representing this volume + * @chunk_size: In sectors + * @num_chunks: lv_size / chunk_size + * @snap_org_minor: Minor number of snapshot original + * @next_cow_entry: Index into current COW table + * @current_cow_sector: Logical sector of current COW table + * @next_free_chunk: Starting logical sector of next free chunk + * @hash_table_size: Number of pointers in each hash table + * @cow_table: Pointer to one sector's worth of COW tables. + * @chunk_data_buffer: Buffer reading data when doing copy-on-write + * @snap_semaphore: For locking during snapshot IO operations + * @snapshot_map: Pointer to remapping hash tables + * @snapshot_next: Linked list of volumes being snapshotted + * @snapshot_org: Pointer to volume being snapshotted + * + * In-memory representation of an LVM LV. + */ +struct lvm_logical_volume { + u32 lv_number; + u64 lv_size; + u32 lv_access; + u32 lv_status; + u32 lv_minor; + u32 stripes; + u32 stripe_size; + u32 stripe_size_shift; + u32 pe_size; + u32 pe_size_shift; + u32 num_le; + struct lvm_volume_group * group; + u8 name[NAME_LEN]; + struct le_table_entry * le_map; + struct evms_logical_node * volume_node; + u32 chunk_size; + u32 num_chunks; + u32 snap_org_minor; + u32 next_cow_entry; + u64 current_cow_sector; + u64 next_free_chunk; + u32 hash_table_size; + struct lv_COW_table_disk * cow_table; + u8 * chunk_data_buffer; + struct semaphore snap_semaphore; + struct snapshot_map_entry *** snapshot_map; + struct lvm_logical_volume * snapshot_next; + struct lvm_logical_volume * snapshot_org; +}; + +/* lv_access: + * EVMS_LV_NEW: Volume was created during the current discovery pass. + * EVMS_LV_INCOMPLETE: Volume has an incomplete LE map. + * EVMS_LV_INVALID: Volume has a memory-corruption problem. + * EVMS_LV_QUIESCED: Volume is in quiesced state. + * EVMS_LV_EXPORTED: Volume has been exported during this EVMS discovery pass. + */ +#define EVMS_LV_NEW 0x10 +#define EVMS_LV_INCOMPLETE 0x20 +#define EVMS_LV_INVALID 0x40 +#define EVMS_LV_QUIESCED 0x80 +#define EVMS_LV_EXPORTED 0x100 + +/** + * struct lvm_volume_group + * @vg: Copy of on-disk VG metadata + * @pv_list: List of PVs that make up this group + * @volume_list: Array of volumes + * @lv_array: Array of LV metadata + * @uuid_list: List of PV UUIDs + * @vg_uuid: UUID from the VG metadata + * @vg_name: Name from the PV metadata + * @pv_count: # of PVs found in this group + * @volume_count: # of LVs found in this group + * @hard_sect_size: Largest hardsector size of all PVs in this group + * @block_size: Largest block size of all PVs in this group + * @flags: EVMS_VG* + * @next_group: Linked list + * + * In-memory representation of an LVM VG. + */ +struct lvm_volume_group { + struct vg_disk * vg; + struct lvm_physical_volume * pv_list; + struct lvm_logical_volume * volume_list[MAX_LV + 1]; + struct lv_disk * lv_array; + u8 * uuid_list; + u8 vg_uuid[UUID_LEN]; + u8 vg_name[NAME_LEN]; + u32 pv_count; + u32 volume_count; + s32 hard_sect_size; + s32 block_size; + u32 flags; + struct lvm_volume_group * next_group; +}; + +/* flags + * EVMS_VG_DIRTY: Group is new or has had a PV added + * during this discovery. + * EVMS_VG_PARTIAL_PVS: Group contains at least one partial PV. + * EVMS_VG_REMOVABLE_PVS: Group contains at least one removable PV. 
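+ *
+ * (Side note on the *_shift fields of struct lvm_logical_volume above:
+ * they let the remap path replace 64-bit divisions with shifts. For a
+ * non-striped volume the logical-to-physical translation is,
+ * schematically and only as a sketch:
+ *
+ *	le = rsector >> lv->pe_size_shift;
+ *	offset = rsector & (lv->pe_size - 1);
+ *	pv = lv->le_map[le].owning_pv;
+ *	new_rsector = lv->le_map[le].pe_sector_offset + offset;
+ *
+ * where rsector is the volume-relative sector of the request.)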
+ */ +#define EVMS_VG_DIRTY (1 << 0) +#define EVMS_VG_PARTIAL_PVS (1 << 1) +#define EVMS_VG_REMOVABLE_PVS (1 << 2) + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_md.h evms-2002-09-30/include/linux/evms/evms_md.h --- linux-2002-09-30/include/linux/evms/evms_md.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_md.h Fri Aug 16 11:10:59 2002 @@ -0,0 +1,120 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * linux/include/linux/evms/evms_md.h + * + * EVMS Linux MD Region Manager Public Header File + * + * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified + * by Cuong (Mike) Tran , January 2002. + * + */ + +#ifndef __EVMS_MD_INCLUDED +#define __EVMS_MD_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +/* + * 'md_p.h' holds the 'physical' layout of RAID devices + * 'md_u.h' holds the user <=> kernel API + * + * 'md_k.h' holds kernel internal definitions + */ + +#include +#include +#include + +/* + * Different major versions are not compatible. + * Different minor versions are only downward compatible. + * Different patchlevel versions are downward and upward compatible. 
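+ *
+ * One way to read that rule in code (a sketch only, not a helper that
+ * exists in this file): for a driver at (maj, min) examining metadata
+ * stamped (sb_maj, sb_min),
+ *
+ *	int ok = (sb_maj == maj) && (sb_min <= min);
+ *
+ * i.e. majors must match exactly, the metadata minor may be older but
+ * not newer than the driver, and the patchlevel never matters.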
+ */ +#define EVMS_MD_MAJOR_VERSION 1 +#define EVMS_MD_MINOR_VERSION 1 +#define EVMS_MD_PATCHLEVEL_VERSION 1 + +#define MD_MAJOR_VERSION 0 +#define MD_MINOR_VERSION 90 +#define MD_PATCHLEVEL_VERSION 0 + +#define EVMS_MD_COMMON_SERVICES_MAJOR 0 +#define EVMS_MD_COMMON_SERVICES_MINOR 5 +#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL 0 + + +extern int evms_md_size[MAX_MD_DEVS]; + +extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data); +extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev); +extern char * evms_md_partition_name (struct evms_logical_node *node); +extern int evms_register_md_personality (int p_num, mdk_personality_t *p); +extern int evms_unregister_md_personality (int p_num); + +extern int evms_md_update_sb (mddev_t *mddev); +extern int evms_md_check_ordering (mddev_t *mddev); +extern void evms_md_print_devices (void); + +extern int evms_md_sync_io( + struct evms_logical_node *node, /* evms node for the MD array */ + int rw, /* READ / WRITE */ + u64 sector, /* starting sector */ + u64 total_nr_sects, /* total number of sectors */ + void *data ); /* pointer to buffer */ + +extern int evms_md_partial_sync_io( + struct evms_logical_node *node, /* evms node for the MD array */ + int rw, /* READ / WRITE */ + u64 sector, /* starting sector */ + u32 *nsects, /* on input: the total number of sectors for the request */ + /* on output, number of sectors completed */ + void *data); /* pointer to buffer */ + + +extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare); +extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok); +extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors); +extern void evms_md_recover_arrays (void); +extern int evms_md_error (mddev_t *mddev, struct evms_logical_node *node); +extern int evms_md_error_dev(mddev_t *mddev, kdev_t dev); + +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); } + + +#endif + diff -Naur linux-2002-09-30/include/linux/evms/evms_md_k.h evms-2002-09-30/include/linux/evms/evms_md_k.h --- linux-2002-09-30/include/linux/evms/evms_md_k.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_md_k.h Tue Aug 6 01:03:24 2002 @@ -0,0 +1,483 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms/evms_md_k.h + * + * EVMS Linux MD Region Manager Public Header File + * + * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified + * by Cuong (Mike) Tran , January 2002. 
+ * + */ + +#ifndef __EVMS_MD_K_INC__ +#define __EVMS_MD_K_INC__ + +#define EVMS_MD_SECTS_PER_PAGE (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) +#define EVMS_MD_SECTS_PER_PAGE_MASK (~(EVMS_MD_SECTS_PER_PAGE-1)) + +#define MD_RESERVED 0UL +#define LINEAR 1UL +#define RAID0 2UL +#define RAID1 3UL +#define RAID5 4UL +#define TRANSLUCENT 5UL +#define HSM 6UL +#define MULTIPATH 7UL +#define MAX_PERSONALITY 8UL + +static inline int pers_to_level (int pers) +{ + switch (pers) { + case MULTIPATH: return -4; + case HSM: return -3; + case TRANSLUCENT: return -2; + case LINEAR: return -1; + case RAID0: return 0; + case RAID1: return 1; + case RAID5: return 5; + } + BUG(); + return MD_RESERVED; +} + +static inline int level_to_pers (int level) +{ + switch (level) { + case -3: return HSM; + case -2: return TRANSLUCENT; + case -1: return LINEAR; + case 0: return RAID0; + case 1: return RAID1; + case 4: + case 5: return RAID5; + } + return MD_RESERVED; +} + +typedef struct mddev_s mddev_t; +typedef struct mdk_rdev_s mdk_rdev_t; + +#if (MINORBITS != 8) +#error MD doesnt handle bigger kdev yet +#endif + +#define MAX_MD_DEVS (1<state & (1 << MD_DISK_FAULTY); +} + +static inline int disk_active(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_ACTIVE); +} + +static inline int disk_sync(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_SYNC); +} + +static inline int disk_spare(mdp_disk_t * d) +{ + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d); +} + +static inline int disk_removed(mdp_disk_t * d) +{ + return d->state & (1 << MD_DISK_REMOVED); +} + +static inline void mark_disk_faulty(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_FAULTY); +} + +static inline void mark_disk_active(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_ACTIVE); + d->state &= ~(1 << MD_DISK_PENDING_ACTIVE); +} + +static inline void mark_disk_sync(mdp_disk_t * d) +{ + d->state |= (1 << MD_DISK_SYNC); +} + +static inline void mark_disk_spare(mdp_disk_t * d) +{ + d->state = 0; +} + +static inline void mark_disk_removed(mdp_disk_t * d) +{ + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED); +} + +static inline void mark_disk_inactive(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_ACTIVE); +} + +static inline void mark_disk_nonsync(mdp_disk_t * d) +{ + d->state &= ~(1 << MD_DISK_SYNC); +} + +/* + * MD's 'extended' device + */ +struct mdk_rdev_s +{ + struct md_list_head same_set; /* RAID devices within the same set */ + struct md_list_head all; /* all RAID devices */ + struct md_list_head pending; /* undetected RAID devices */ + struct evms_logical_node *node; /* EVMS device node */ + kdev_t dev; /* Device number */ + kdev_t old_dev; /* "" when it was last imported */ + unsigned long size; /* Device size (in blocks) */ + mddev_t *mddev; /* RAID array if running */ + unsigned long last_events; /* IO event timestamp */ + + struct block_device *bdev; /* block device handle */ + + mdp_super_t *sb; + unsigned long sb_offset; /* in blocks */ + + int virtual_spare; /* "virtual" spare added via IOCTL */ + int alias_device; /* device alias to the same disk */ + int faulty; /* if faulty do not issue IO requests */ + int desc_nr; /* descriptor index in the superblock */ +}; + + +/* + * disk operations in a working array: + */ +#define DISKOP_SPARE_INACTIVE 0 +#define DISKOP_SPARE_WRITE 1 +#define DISKOP_SPARE_ACTIVE 2 +#define DISKOP_HOT_SPARE_ACTIVE 3 +#define DISKOP_HOT_REMOVE_SPARE 4 +#define DISKOP_HOT_REMOVE_DISK 5 +#define DISKOP_HOT_ADD_DISK 6 +#define DISKOP_HOT_DEACTIVATE_DISK 7 + +typedef struct mdk_personality_s 
mdk_personality_t; + +struct mddev_s +{ + void *private; + mdk_personality_t *pers; + struct evms_logical_node *node; + unsigned long flag; + int nr_raid_disks; + int __minor; + int chunk_size; + mdp_super_t *sb; + int nb_dev; + struct md_list_head disks; + int sb_dirty; + int ro; + unsigned long curr_resync; /* blocks scheduled */ + unsigned long resync_mark; /* a recent timestamp */ + unsigned long resync_mark_cnt;/* blocks written at resync_mark */ + char *name; + int recovery_running; + struct semaphore reconfig_sem; + struct semaphore recovery_sem; + struct semaphore resync_sem; + atomic_t active; + + atomic_t recovery_active; /* blocks scheduled, but not written */ + md_wait_queue_head_t recovery_wait; + + struct md_list_head all_mddevs; + struct md_list_head incomplete_mddevs; + struct md_list_head running_mddevs; +}; + +struct mdk_personality_s +{ + char *name; + int (*sync_io) (mddev_t *mddev, int rw, u64 LSN, u64 nr_sects, void *data); + void (*read)(struct evms_logical_node *node, struct buffer_head *bh); + void (*write)(struct evms_logical_node *node, struct buffer_head *bh); + int (*run)(mddev_t *mddev); + int (*stop)(mddev_t *mddev); + int (*status)(char *page, mddev_t *mddev); + int (*error_handler)(mddev_t *mddev, struct evms_logical_node *node); + +/* + * Some personalities (RAID-1, RAID-5) can have disks hot-added and + * hot-removed. Hot removal is different from failure. (failure marks + * a disk inactive, but the disk is still part of the array) The interface + * to such operations is the 'pers->diskop()' function, can be NULL. + * + * the diskop function can change the pointer pointing to the incoming + * descriptor, but must do so very carefully. (currently only + * SPARE_ACTIVE expects such a change) + */ + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state); + + int (*stop_resync)(mddev_t *mddev); + int (*restart_resync)(mddev_t *mddev); + int (*sync_request)(mddev_t *mddev, unsigned long block_nr); + int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); + int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg); +}; + +/** + * EVMS MD instance data structure definition + **/ +struct evms_md { + mddev_t *mddev; + struct evms_plugin_header instance_plugin_hdr; +}; + +#define EVMS_MD_NODE_TO_MDDEV(node) ((struct evms_md *)(node->private))->mddev + +static inline int evms_md_check_boundary(struct evms_logical_node *node, struct buffer_head *bh) +{ + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) > node->total_vsectors) { + bh->b_end_io(bh, 0); + return -EIO; + } + return 0; +} + +/** + * This structure is used for synchronous I/O + * @rc : error code + * @io_count: number of I/Os + + @wait: wait queue + **/ +struct evms_md_sync_cb { + int rc; + atomic_t io_count; + wait_queue_head_t wait; +}; + + +/** + * This structure is required for activating a spare device + * @next: next spare + * @mddev: target md device + * @spare: spare to activate + **/ +struct evms_md_activate_spare { + struct evms_md_activate_spare *next; + mddev_t *mddev; + mdp_disk_t *spare; +}; + +static inline int incomplete_mddev(mddev_t * mddev) +{ + return (mddev->incomplete_mddevs.next != &mddev->incomplete_mddevs); +} + +/* + * Currently we index md_array directly, based on the minor + * number. This will have to change to dynamic allocation + * once we start supporting partitioning of md devices. 
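/*
 * Editorial illustration -- not part of the EVMS patch.  Shows how the
 * mdp_disk_t state helpers defined above compose when a spare descriptor is
 * promoted.  The function itself is hypothetical; the real activation path
 * lives in the MD personalities and the evms_md core, not in this header.
 */
static int example_promote_spare(mdp_disk_t *d)
{
        /* A spare is a descriptor that is neither active, nor in sync,
         * nor faulty. */
        if (!disk_spare(d))
                return -EINVAL;

        /* mark_disk_active() sets MD_DISK_ACTIVE and clears
         * MD_DISK_PENDING_ACTIVE; mark_disk_sync() then flags the disk as
         * holding valid data. */
        mark_disk_active(d);
        mark_disk_sync(d);
        return 0;
}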
+ */ +static inline int mdidx (mddev_t * mddev) +{ + return mddev->__minor; +} + +static inline kdev_t mddev_to_kdev(mddev_t * mddev) +{ + return MKDEV(MD_MAJOR, mdidx(mddev)); +} + +extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev); +extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr); +extern mdp_disk_t *get_spare(mddev_t *mddev); + +/* + * iterates through some rdev ringlist. It's safe to remove the + * current 'rdev'. Dont touch 'tmp' though. + */ +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \ + \ + for (tmp = head.next; \ + rdev = md_list_entry(tmp, mdk_rdev_t, field), \ + tmp = tmp->next, tmp->prev != &head \ + ; ) +/* + * iterates through the 'same array disks' ringlist + */ +#define ITERATE_RDEV(mddev,rdev,tmp) \ + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp) + +/* + * Same as above, but assumes that the device has rdev->desc_nr numbered + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order. + */ +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \ + for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++) + + +/* + * Iterates through all 'RAID managed disks' + */ +#define ITERATE_RDEV_ALL(rdev,tmp) \ + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp) + +/* + * Iterates through 'pending RAID disks' + */ +#define ITERATE_RDEV_PENDING(rdev,tmp) \ + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp) + +/* + * iterates through all used mddevs in the system. + */ +#define ITERATE_MDDEV(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + ; ) + +/* + * iterates through all incomplete mddevs in the system. + */ +#define ITERATE_INCOMPLETE_MDDEV(mddev,tmp) \ + \ + for (tmp = incomplete_mddevs.next; \ + mddev = list_entry(tmp, mddev_t, incomplete_mddevs), \ + tmp = tmp->next, tmp->prev != &incomplete_mddevs\ + ; ) +/* + * iterates through all running mddevs in the system. 
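/*
 * Editorial illustration -- not part of the EVMS patch.  A sketch of the
 * intended use of the ITERATE_RDEV() macro above; the counting helper is
 * hypothetical and assumes the caller already serializes against
 * reconfiguration (see lock_mddev() further below).
 */
static int example_count_faulty(mddev_t *mddev)
{
        mdk_rdev_t *rdev;
        struct md_list_head *tmp;
        int faulty = 0;

        /* Per the macro's comment, it is safe to remove the current 'rdev'
         * inside the loop, but 'tmp' must not be touched. */
        ITERATE_RDEV(mddev, rdev, tmp) {
                if (rdev->faulty)
                        faulty++;
        }
        return faulty;
}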
+ */ +#define ITERATE_RUNNING_MDDEV(mddev,tmp) \ + \ + for (tmp = running_mddevs.next; \ + mddev = list_entry(tmp, mddev_t, running_mddevs), \ + tmp = tmp->next, tmp->prev != &running_mddevs \ + ; ) + +static inline int lock_mddev (mddev_t * mddev) +{ + return down_interruptible(&mddev->reconfig_sem); +} + +static inline void unlock_mddev (mddev_t * mddev) +{ + up(&mddev->reconfig_sem); +} + +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \ + x = y; y = __tmp; } while (0) + +#define MAX_DISKNAME_LEN 64 + +typedef struct dev_name_s { + struct md_list_head list; + kdev_t dev; + char namebuf [MAX_DISKNAME_LEN]; + char *name; +} dev_name_t; + + +#define __wait_event_lock_irq(wq, condition, lock) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + spin_unlock_irq(&lock); \ + run_task_queue(&tq_disk); \ + schedule(); \ + spin_lock_irq(&lock); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_lock_irq(wq, condition, lock) \ +do { \ + if (condition) \ + break; \ + __wait_event_lock_irq(wq, condition, lock); \ +} while (0) + + +#define __wait_disk_event(wq, condition) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + if (condition) \ + break; \ + run_task_queue(&tq_disk); \ + schedule(); \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_disk_event(wq, condition) \ +do { \ + if (condition) \ + break; \ + __wait_disk_event(wq, condition); \ +} while (0) + +#endif + diff -Naur linux-2002-09-30/include/linux/evms/evms_md_p.h evms-2002-09-30/include/linux/evms/evms_md_p.h --- linux-2002-09-30/include/linux/evms/evms_md_p.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_md_p.h Tue Mar 26 18:58:57 2002 @@ -0,0 +1,197 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms/evms_md_p.h + * + * EVMS Linux MD Region Manager Public Header File + * + * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified + * by Cuong (Mike) Tran , March 2002. + * + */ + +#ifndef __EVMS_MD_P_INC__ +#define __EVMS_MD_P_INC__ + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. 
+ * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 511 12 32-words descriptors of the disks in the raid set. + * 512 - 911 Reserved. + * 912 - 1023 Disk specific descriptor. + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y. + */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk has kind of been removed, but not really or it would not be here */ +#define MD_DISK_NEW 4 /* disk has just been added to the raid set */ +#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... 
*/ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ +#ifdef __KERNEL__ +#ifdef __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ +#endif +#else +#if __BYTE_ORDER == __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ +#endif +#endif + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +}mdp_super_t; + +static inline __u64 md_event(mdp_super_t *sb) { + __u64 ev = sb->events_hi; + return (ev<<32)| sb->events_lo; +} + +#endif + diff -Naur linux-2002-09-30/include/linux/evms/evms_md_u.h evms-2002-09-30/include/linux/evms/evms_md_u.h --- linux-2002-09-30/include/linux/evms/evms_md_u.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_md_u.h Fri Aug 16 16:19:56 2002 @@ -0,0 +1,69 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
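/*
 * Editorial illustration -- not part of the EVMS patch.  Two small sketches
 * around the persistent superblock defined above.  example_sb_offset()
 * spells out the MD_NEW_SIZE_SECTORS() arithmetic: the device size is
 * rounded down to a 64KB boundary (MD_RESERVED_SECTORS = 128 sectors) and
 * the last reserved window holds the 4KB superblock.  example_freshest_sb()
 * shows why md_event() folds events_hi/events_lo into one 64-bit counter:
 * the copy with the larger update count reflects the most recent
 * configuration change.  Both helper names are hypothetical.
 */
static unsigned long example_sb_offset(unsigned long dev_size_in_sectors)
{
        /* e.g. a 1000000-sector member: (1000000 & ~127) - 128 = 999808,
         * which is both the apparent size of the member and the sector at
         * which its superblock is written. */
        return MD_NEW_SIZE_SECTORS(dev_size_in_sectors);
}

static mdp_super_t *example_freshest_sb(mdp_super_t *a, mdp_super_t *b)
{
        return (md_event(a) >= md_event(b)) ? a : b;
}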
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * linux/include/linux/evms/evms_md_h.c + * + * EVMS MD Region Manager, User <-> Kernel common file + * + */ + +#ifndef _EVMS_MD_U_INC_ +#define _EVMS_MD_U_INC_ + +#define EVMS_MD_ID 4 +#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID) + +#define EVMS_MD_PERS_IOCTL_CMD 1 /* personality specific ioctl command */ +#define EVMS_MD_ADD 2 +#define EVMS_MD_REMOVE 3 +#define EVMS_MD_ACTIVATE 4 +#define EVMS_MD_DEACTIVATE 5 +#define EVMS_MD_GET_ARRAY_INFO 6 + +/** + * structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE + **/ +struct evms_md_kdev { + u32 major; + u32 minor; +}; + +/** + * structure definition to use with MD_GET_ARRAY_INFO + **/ +#define EVMS_MD_ARRAY_DEGRADED (1<<0) +#define EVMS_MD_ARRAY_SYNCING (1<<1) +struct evms_md_array_info { + u32 state; + mdp_super_t *sb; +}; + +/** + * EVMS MD user/kernel communication + * @mddev_idx: md minor + * @cmd: command for personality + * @arg: specific command structure + **/ +struct evms_md_ioctl { + u32 mddev_idx; + u32 cmd; + void *arg; +}; + +#endif + diff -Naur linux-2002-09-30/include/linux/evms/evms_os2.h evms-2002-09-30/include/linux/evms/evms_os2.h --- linux-2002-09-30/include/linux/evms/evms_os2.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_os2.h Thu Aug 8 17:40:37 2002 @@ -0,0 +1,386 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Module: linux/include/linux/evms_os2.h + */ + +/* + * Change History: + * + */ + +/* + * Description: This module defines the disk structures used by the OS/2 + * Logical Volume Manager, including that of the Master + * Boot Record (MBR) and Extended Boot Records (EBR). + * + * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the + * last sector of each track containing a valid MBR or EBR. Since + * partitions must be track aligned, any track containing an MBR or + * EBR will be almost all empty sectors. We will grab the last + * of these empty sectors for our DLT_Tables. + * + */ + +#ifndef OS2LVM_INCLUDED__ +#define OS2LVM_INCLUDED__ + +/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */ +#define EBR_BOOT_INDICATOR 0 +#define EBR_FORMAT_INDICATOR 5 + +/* The following define is used as the default Format_Indicator for new non-primary partitions. */ +#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR 0x6 + +/* The following define is used as the default Format_Indicator for a new non-active primary partitions. 
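/*
 * Editorial illustration -- not part of the EVMS patch.  Shows how a
 * user-space management tool might package an EVMS_MD_ADD request with the
 * structures above before handing it to the EVMS MD region manager.  The
 * actual ioctl plumbing (device node, command number, copy_from_user on the
 * kernel side) is deliberately left out and the helper is hypothetical.
 */
static void example_build_md_add(struct evms_md_ioctl *req,
                                 struct evms_md_kdev *dev,
                                 u32 md_minor, u32 disk_major, u32 disk_minor)
{
        dev->major = disk_major;
        dev->minor = disk_minor;

        req->mddev_idx = md_minor;      /* which MD array, by minor number */
        req->cmd = EVMS_MD_ADD;         /* region-manager level command */
        req->arg = dev;                 /* command-specific payload */
}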
*/ +#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR 0x16 + +/* The following define is used as the default Format_Indicator for a new active primary partition. */ +#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR 0x06 + +/* The following define is used to hold the value of the Boot_Indicator for active partitions. */ +#define ACTIVE_PARTITION 0x80 + +/* Define the size of a Partition Name. Partition Names are user defined names given to a partition. */ +#define PARTITION_NAME_SIZE 20 + +/* Define the size of a volume name. Volume Names are user defined names given to a volume. */ +#define VOLUME_NAME_SIZE 20 + +/* Define the size of a disk name. Disk Names are user defined names given to physical disk drives in the system. */ +#define DISK_NAME_SIZE 20 + +/* The name of the filesystem in use on a partition. This name may be up to 12 ( + NULL terminator) characters long. */ +#define FILESYSTEM_NAME_SIZE 20 + +/* The comment field is reserved but is not currently used. This is for future expansion and use. */ +#define COMMENT_SIZE 81 + +/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */ +#define BOOT_MANAGER_SIZE 2048 + +#define OS2_BYTES_PER_SECTOR 512 +#define OS2_SECTOR_SHIFT 9 + +/*-------------------------------------------------- + * Type definitions + --------------------------------------------------*/ + +/* The following definitions define the drive letter assignment table used by LVM. + For each partition table on the disk, there will be a drive letter assignment table in the last sector + of the track containing the partition table. */ + +/* NOTE: DLA stands for Drive Letter Assignment. */ + +#define DLA_TABLE_SIGNATURE1 0x424D5202L +#define DLA_TABLE_SIGNATURE2 0x44464D50L + +struct dla_entry { /* DE */ + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */ + u32 partition_serial; /* The serial number of this partition. */ + u32 Partition_Size; /* The size of the partition, in sectors. */ + u32 Partition_Start; /* The starting sector of the partition. */ + unsigned char On_Boot_Manager_Menu; /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */ + unsigned char Installable; /* Set to TRUE if this volume is the one to install the operating system on. */ + char Drive_Letter; /* The drive letter assigned to the partition. */ + unsigned char Reserved; + char Volume_Name[VOLUME_NAME_SIZE]; /* The name assigned to the volume by the user. */ + char Partition_Name[PARTITION_NAME_SIZE]; /* The name assigned to the partition. */ +}; + +struct dla_table_sector { /* DTS */ + u32 DLA_Signature1; /* The magic signature (part 1) of a Drive Letter Assignment Table. */ + u32 DLA_Signature2; /* The magic signature (part 2) of a Drive Letter Assignment Table. */ + u32 DLA_CRC; /* The 32 bit CRC for this sector. Calculated assuming that this field and all unused space in the sector is 0. */ + u32 Disk_Serial_Number; /* The serial number assigned to this disk. */ + u32 Boot_Disk_Serial_Number; /* The serial number of the disk used to boot the system. This is for conflict resolution when multiple volumes + want the same drive letter. Since LVM.EXE will not let this situation happen, the only way to get this situation + is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one + machine to another. If the drive has been moved, then it should have a different Boot_Disk_Serial_Number. 
Thus, + we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question. + If we find that all of the claimaints have the same Boot_Disk_Serial_Number, then we must assign drive letters on + a first come, first serve basis. */ + u32 Install_Flags; /* Used by the Install program. */ + u32 Cylinders; + u32 Heads_Per_Cylinder; + u32 Sectors_Per_Track; + char Disk_Name[DISK_NAME_SIZE]; /* The name assigned to the disk containing this sector. */ + unsigned char Reboot; /* For use by Install. Used to keep track of reboots initiated by install. */ + unsigned char Reserved[3]; /* Alignment. */ + struct dla_entry DLA_Array[4]; /* These are the four entries which correspond to the entries in the partition table. */ +}; + +/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */ + +#define OS2LVM_PRIMARY_SIGNATURE 0x4A435332L +#define OS2LVM_SECONDARY_SIGNATURE 0x4252444BL + +#define CURRENT_OS2LVM_MAJOR_VERSION_NUMBER 2 /* Define as appropriate. */ +#define CURRENT_OS2LVM_MINOR_VERSION_NUMBER 0 /* Define as appropriate. */ + +/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */ +#define OS2LVM_MAX_FEATURES_PER_VOLUME 10 /* The maximum number of LVM features that can be applied to a volume. */ +#define OS2LVM_NULL_FEATURE 0 /* No feature. Used in all unused entries of the feature array in the LVM Signature sector. */ + +/* The following structure is used to hold the location of the feature specific data for LVM features. */ +typedef struct _LVM_Feature_Data { /* LFD */ + u32 Feature_ID; /* The ID of the feature. */ + u32 Location_Of_Primary_Feature_Data; /* The u32 of the starting sector of the private data for this feature. */ + u32 Location_Of_Secondary_Feature_Data; /* The u32 of the starting sector of the backup copy of the private data for this feature. */ + u32 Feature_Data_Size; /* The number of sectors used by this feature for its private data. */ + u16 Feature_Major_Version_Number; /* The integer portion of the version number of this feature. */ + u16 Feature_Minor_Version_Number; /* The decimal portion of the version number of this feature. */ + unsigned char Feature_Active; /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */ + unsigned char Reserved[3]; /* Alignment. */ +} LVM_Feature_Data; + +/* The following structure defines the LVM Signature Sector. This is the last sector of every partition which is part of an LVM volume. It gives vital + information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are + active on the volume that this partition is a part of. */ +typedef struct _LVM_Signature_Sector { /* LSS */ + u32 LVM_Signature1; /* The first part of the magic LVM signature. */ + u32 LVM_Signature2; /* The second part of the magic LVM signature. */ + u32 Signature_Sector_CRC; /* 32 bit CRC for this sector. Calculated using 0 for this field. */ + u32 partition_serial; /* The LVM assigned serial number for this partition. */ + u32 Partition_Start; /* u32 of the first sector of this partition. */ + u32 Partition_End; /* u32 of the last sector of this partition. */ + u32 Partition_Sector_Count; /* The number of sectors in this partition. */ + u32 LVM_Reserved_Sector_Count; /* The number of sectors reserved for use by LVM. 
*/ + u32 Partition_Size_To_Report_To_User; /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */ + u32 Boot_Disk_Serial_Number; /* The serial number of the boot disk for the system. If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */ + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */ + u32 Fake_EBR_Location; /* The location, on disk, of a Fake EBR, if one has been allocated. */ + u16 LVM_Major_Version_Number; /* Major version number of the LVM that created this partition. */ + u16 LVM_Minor_Version_Number; /* Minor version number of the LVM that created this partition. */ + char Partition_Name[PARTITION_NAME_SIZE]; /* User defined partition name. */ + char Volume_Name[VOLUME_NAME_SIZE]; /* The name of the volume that this partition belongs to. */ + LVM_Feature_Data LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array. This indicates which LVM features, if any, are active on this volume + and what order they should be applied in. */ + char Drive_Letter; /* The drive letter assigned to the volume that this partition is part of. */ + unsigned char Fake_EBR_Allocated; /* If TRUE, then a fake EBR has been allocated. */ + char Comment[COMMENT_SIZE]; /* User comment. */ + char Disk_Name[DISK_NAME_SIZE]; /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */ + u32 Sequence_Number; /* This indicates the order that partitions within a volume are used. This number is 1 based. A 0 here indicates that the volume was made by LVM Ver. 1. */ + u32 Next_Aggregate_Number; /* Used during volume creation and expansion when creating unique names for aggregates. */ + /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */ +} LVM_Signature_Sector; + +/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */ +typedef struct _Partition_Record { /* PR */ + unsigned char Boot_Indicator; /* 80h = active partition. */ + unsigned char Starting_Head; + unsigned char Starting_Sector; /* Bits 0-5 are the sector. Bits 6 and 7 are the high order bits of the starting cylinder. */ + unsigned char Starting_Cylinder; /* The cylinder number is a 10 bit value. The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */ + unsigned char Format_Indicator; /* An indicator of the format/operation system on this partition. */ + unsigned char Ending_Head; + unsigned char Ending_Sector; + unsigned char Ending_Cylinder; + u32 Sector_Offset; /* The number of sectors on the disk which are prior to the start of this partition. */ + u32 Sector_Count; /* The number of sectors in this partition. */ +} Partition_Record; + +typedef struct _Master_Boot_Record { /* MBR */ + unsigned char Reserved[446]; + Partition_Record Partition_Table[4]; + u16 Signature; /* AA55h in this field indicates that this is a valid partition table/MBR. */ +} Master_Boot_Record; + +typedef Master_Boot_Record Extended_Boot_Record; + +/* The following definition covers the Boot Manager Alias Table in the EBR. + + The Alias Table in the EBR has 2 entries in it, although only the first one is actually used. 
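/*
 * Editorial illustration -- not part of the EVMS patch.  A sketch of the
 * sanity checks a discovery path could apply to a candidate LVM signature
 * sector (read from the last sector of a partition, as described above).
 * The helper is hypothetical and omits the CRC verification that a real
 * plugin must also perform on Signature_Sector_CRC.
 */
static int example_valid_lvm_signature(LVM_Signature_Sector *ss,
                                       u32 part_start, u32 part_sectors)
{
        if (ss->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE ||
            ss->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE)
                return 0;

        /* The recorded geometry must match the partition that the sector
         * was read from. */
        if (ss->Partition_Start != part_start ||
            ss->Partition_Sector_Count != part_sectors ||
            ss->Partition_End != part_start + part_sectors - 1)
                return 0;

        return 1;
}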
*/ +#define ALIAS_NAME_SIZE 8 +typedef struct _AliasTableEntry { /* ATE */ + unsigned char On_Boot_Manager_Menu; + char Name[ALIAS_NAME_SIZE]; +} AliasTableEntry; + +#define ALIAS_TABLE_OFFSET 0x18A + +/* XLATOFF */ +/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and + which have since been migrated to the new LVM format. This text is put into the Name field of an AliasTableEntry so + that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display + something for those partitions/volumes which are on the Boot Manager Menu. + + NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length! */ +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT "--> LVM " +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2 "--> LVM*" + +/* XLATON */ + +/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */ +#define MBR_EBR_SIGNATURE 0xAA55 + +/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */ +#define EBR_INDICATOR 0x5 +#define WINDOZE_EBR_INDICATOR 0xF +#define UNUSED_INDICATOR 0x0 +#define IFS_INDICATOR 0x7 +#define FAT12_INDICATOR 0x1 +#define FAT16_SMALL_PARTITION_INDICATOR 0x4 +#define FAT16_LARGE_PARTITION_INDICATOR 0x6 +#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG 0x10 +#define LVM_PARTITION_INDICATOR 0x35 +#define BOOT_MANAGER_INDICATOR 0x0A + +/* The following is the signature used in the Boot Sector for Boot Manager. */ +#define OS2LVM_BOOT_MANAGER_SIGNATURE "APJ&WN" + +/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */ +#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK 63 + +/*-------------------------------------------------- + * Declares for Drive Linking feature: + *--------------------------------------------------*/ + +/* The following defines uniquely identify Drive Linking. */ +#define DRIVE_LINKING_FEATURE_ID 100 +#define DRIVE_LINKING_MAJOR_VERSION 1 +#define DRIVE_LINKING_MINOR_VERSION 0 + +/* The following definitions are used for the disk structures supporting drive linking. */ + +#define LINK_TABLE_MASTER_SIGNATURE 0x434E4157L +#define LINK_TABLE_SIGNATURE 0X4D4D5652L + +#define MAXIMUM_LINKS 246 + +#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4 + +#define LINKS_IN_FIRST_SECTOR 60 + +#define LINKS_IN_NEXT_SECTOR 62 + +struct drive_link { + u32 drive_serial; + u32 partition_serial; +}; + +struct link_table_first_sector { + u32 Link_Table_Signature; /* Use the LINK_TABLE_MASTER_SIGNATURE here. */ + u32 Link_Table_CRC; + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */ + u32 Links_In_Use; + struct drive_link Link_Table[LINKS_IN_FIRST_SECTOR]; +}; + +struct link_table_sector { + u32 Link_Table_Signature; /* Use LINK_TABLE_SIGNATURE here. */ + u32 Link_Table_CRC; + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */ + struct drive_link Link_Table[LINKS_IN_NEXT_SECTOR]; +}; + +/*-------------------------------------------------- + * Declares for Bad Block Relocation feature: + *--------------------------------------------------*/ + +/* The following definition is the numeric ID for Bad Block Relocation. 
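/*
 * Editorial illustration -- not part of the EVMS patch.  The arithmetic
 * behind the drive-linking limits above: the first on-disk sector holds
 * LINKS_IN_FIRST_SECTOR (60) entries and each additional sector holds
 * LINKS_IN_NEXT_SECTOR (62), so the four reserved sectors
 * (DRIVE_LINKING_RESERVED_SECTOR_COUNT) cover 60 + 3*62 = 246 links, which
 * is exactly MAXIMUM_LINKS.  The helper is hypothetical.
 */
static u32 example_link_table_sectors(u32 links_in_use)
{
        u32 sectors = 1;        /* the link_table_first_sector */

        if (links_in_use > LINKS_IN_FIRST_SECTOR)
                sectors += (links_in_use - LINKS_IN_FIRST_SECTOR +
                            LINKS_IN_NEXT_SECTOR - 1) / LINKS_IN_NEXT_SECTOR;

        /* Never exceeds DRIVE_LINKING_RESERVED_SECTOR_COUNT as long as
         * links_in_use <= MAXIMUM_LINKS. */
        return sectors;
}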
*/ +#define BBR_FEATURE_ID 101 + +#define BBR_FEATURE_MAJOR_VERSION 0x0001 +#define BBR_FEATURE_MINOR_VERSION 0x0000 + +/* The following definitions are used for the disk structures supporting bad block relocation. */ + +/* NOTE: BBR stands for Bad Block Relocation. */ + +#define BBR_TABLE_MASTER_SIGNATURE 0x00726D62 +#define BBR_TABLE_SIGNATURE 0x01726276 + +struct bbr_table_entry { + u32 BadSector; + u32 ReplacementSector; +}; + +typedef struct _LVM_BBR_Table_First_Sector { + u32 Signature; /* Signature for the first sector of the BBR Table. Use BBR_TABLE_MASTER_SIGNATURE here. */ + u32 CRC; /* CRC for this sector. */ + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */ + u32 Table_Size; /* The number of BBR_Table_Entries in the BBR Table. */ + u32 Table_Entries_In_Use; /* The number of BBR Table entries which are in use. */ + u32 Sectors_Per_Table; /* The number of LVM_BBR_Table_Sectors used to hold the BBR Table. */ + u32 First_Replacement_Sector; /* The location of the first replacement sector. */ + u32 Last_Replacement_Sector; /* The location of the last replacement sector. */ + u32 Replacement_Sector_Count; /* The number of replacement sectors. */ + u32 Flags; /* Flags global to the Bad Block Relocation Feature. */ +} LVM_BBR_Table_First_Sector; + +/* Flags for LVM_BBR_Table_First_Sector */ +#define BBR_Flag_Write_Verify 0x00000001 /* Indicate convert Write I/O to Write/Verify */ + +#define BBR_TABLE_ENTRIES_PER_SECTOR 62 + +typedef struct _LVM_BBR_Table_Sector { + u32 Signature; /* Signature for a sector of the BBR_Table which is not the first sector of the BBR Table. Use BBR_TABLE_SIGNATURE here. */ + u32 CRC; /* CRC for this sector of the BBR Table. */ + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */ + struct bbr_table_entry BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR]; + u32 reserved1; /* for block alignment */ +} LVM_BBR_Table_Sector; + +// +// Combined structure to hold entire BBR feature data as it exists on disk. +typedef struct _LVM_BBR_Feature { + LVM_BBR_Table_First_Sector control; + char reserved1[OS2_BYTES_PER_SECTOR - + sizeof (LVM_BBR_Table_First_Sector)]; + LVM_BBR_Table_Sector remap[1]; +} +LVM_BBR_Feature; + +/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for + Bad Block Relocation. Otherwise, 1 replacement sector per MB of disk space is allocated. */ +#define BBR_FLOOR 62 +#define BBR_LIMIT 4096 + +// In-memory Meta Data for Bad Block Relocation +// In-memory Meta Data for Drive Linking +struct os2_dl_entry { + u64 start_sector; + u64 sector_count; + u64 dl_lsn1; /* LSN of first on-disk copy of drive linking data. */ + u64 dl_lsn2; /* LSN of the second on-disk copy of drive linking data. */ + char *link_data; + u32 partition_serial; + u64 bbr_lsn1; /* LSN of the first on-disk copy of the BBR data. */ + u64 bbr_lsn2; /* LSN of the second on-disk copy of the BBR data. */ + u32 bbr_feature_size; /* # of sectors of BBR data. 
*/ + u32 bbr_is_active; + struct semaphore bbr_table_lock; /* Used to serialize writers */ + unsigned int guard1; /* Lamport's Theorem for mutual exclusion */ + char *bbr_data; + unsigned int guard2; /* Lamport's Theorem for mutual exclusion */ + struct evms_logical_node *link_partition; + struct os2_dl_entry *next; +}; + +// In-memory Meta Data for each OS/2 LVM Volume: +typedef struct os2_volume_runtime_entry_s { + int complete; + u32 Export_Needed; + u64 size_in_sectors; + u32 Volume_Serial_Number; + u32 drive_link_count; + struct os2_dl_entry *drive_link; + struct evms_logical_node *next_os2lvm_node; +} os2_volume_runtime_entry_t; + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_raid0.h evms-2002-09-30/include/linux/evms/evms_raid0.h --- linux-2002-09-30/include/linux/evms/evms_raid0.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_raid0.h Tue Aug 6 01:03:24 2002 @@ -0,0 +1,33 @@ +#ifndef _EVMS_RAID0_INCL_ +#define _EVMS_RAID0_INCL_ + +#include + +struct strip_zone +{ + unsigned long zone_offset; /* Zone offset (in sectors) in md_dev */ + unsigned long dev_offset; /* Zone offset (in sectors) in real dev */ + unsigned long size_in_sects; /* Zone size in sectors */ + int nb_dev; /* # of devices attached to the zone */ + struct evms_logical_node *node[MD_SB_DISKS]; /* EVMS nodes attached to the zone */ +}; + +struct raid0_hash +{ + struct strip_zone *zone0, *zone1; +}; + +struct raid0_private_data +{ + struct raid0_hash *hash_table; /* Dynamically allocated */ + struct strip_zone *strip_zone; /* This one too */ + int nr_strip_zones; + struct strip_zone *smallest; + int nr_zones; +}; + +typedef struct raid0_private_data raid0_conf_t; + +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private) + +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_raid1.h evms-2002-09-30/include/linux/evms/evms_raid1.h --- linux-2002-09-30/include/linux/evms/evms_raid1.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_raid1.h Tue Aug 6 01:03:24 2002 @@ -0,0 +1,103 @@ +#ifndef _EVMS_RAID1_H +#define _EVMS_RAID1_H + +#include + +struct mirror_info { + int number; + int raid_disk; + struct evms_logical_node *node; + kdev_t dev; + int sect_limit; + int head_position; + + /* + * State bits: + */ + int operational; + int write_only; + int spare; + + int used_slot; +}; + +struct raid1_private_data { + mddev_t *mddev; + struct mirror_info mirrors[MD_SB_DISKS]; + int nr_disks; + int raid_disks; + int working_disks; + int last_used; + unsigned long next_sect; + int sect_count; + struct evms_thread *thread, *resync_thread; + int resync_mirrors; + struct mirror_info *spare; + md_spinlock_t device_lock; + + /* buffer pool */ + /* buffer_heads that we have pre-allocated have b_pprev -> &freebh + * and are linked into a stack using b_next + * raid1_bh that are pre-allocated have R1BH_PreAlloc set. 
+ * All these variable are protected by device_lock + */ + struct buffer_head *freebh; + int freebh_cnt; /* how many are on the list */ + int freebh_blocked; + struct raid1_bh *freer1; + int freer1_blocked; + int freer1_cnt; + struct raid1_bh *freebuf; /* each bh_req has a page allocated */ + md_wait_queue_head_t wait_buffer; + + /* for use when syncing mirrors: */ + unsigned long start_active, start_ready, + start_pending, start_future; + int cnt_done, cnt_active, cnt_ready, + cnt_pending, cnt_future; + int phase; + int window; + md_wait_queue_head_t wait_done; + md_wait_queue_head_t wait_ready; + md_spinlock_t segment_lock; +}; + +typedef struct raid1_private_data raid1_conf_t; + +/* + * this is the only point in the RAID code where we violate + * C type safety. mddev->private is an 'opaque' pointer. + */ +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private) + +/* + * this is our 'private' 'collective' RAID1 buffer head. + * it contains information about what kind of IO operations were started + * for this RAID1 operation, and about their status: + */ + +struct raid1_bh { + atomic_t remaining; /* 'have we finished' count, + * used from IRQ handlers + */ + int cmd; + unsigned long state; + mddev_t *mddev; + struct buffer_head *master_bh; + struct buffer_head *mirror_bh_list; + struct buffer_head bh_req; + struct evms_logical_node *node; /* map to evms node (READ only) */ + struct raid1_bh *next_r1; /* next for retry or in free list */ +}; + +typedef struct raid1_sync_cb_s { + int rc; + atomic_t io_count; + md_wait_queue_head_t wait; +} raid1_sync_cb_t; + +/* bits for raid1_bh.state */ +#define R1BH_Uptodate 1 +#define R1BH_SyncPhase 2 +#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */ +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_raid5.h evms-2002-09-30/include/linux/evms/evms_raid5.h --- linux-2002-09-30/include/linux/evms/evms_raid5.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_raid5.h Tue Aug 6 01:03:23 2002 @@ -0,0 +1,251 @@ +#ifndef _RAID5_H +#define _RAID5_H + +#include +#include + +/* + * + * Each stripe contains one buffer per disc. Each buffer can be in + * one of a number of states determined by bh_state. Changes between + * these states happen *almost* exclusively under a per-stripe + * spinlock. Some very specific changes can happen in b_end_io, and + * these are not protected by the spin lock. + * + * The bh_state bits that are used to represent these states are: + * BH_Uptodate, BH_Lock + * + * State Empty == !Uptodate, !Lock + * We have no data, and there is no active request + * State Want == !Uptodate, Lock + * A read request is being submitted for this block + * State Dirty == Uptodate, Lock + * Some new data is in this buffer, and it is being written out + * State Clean == Uptodate, !Lock + * We have valid data which is the same as on disc + * + * The possible state transitions are: + * + * Empty -> Want - on read or write to get old data for parity calc + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE) + * Empty -> Clean - on compute_block when computing a block for failed drive + * Want -> Empty - on failed read + * Want -> Clean - on successful completion of read request + * Dirty -> Clean - on successful completion of write request + * Dirty -> Clean - on failed write + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW) + * + * The Want->Empty, Want->Clean, Dirty->Clean, transitions + * all happen in b_end_io at interrupt time. 
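/*
 * Editorial illustration -- not part of the EVMS patch.  A sketch of the
 * completion pattern implied by the 'remaining' counter in struct raid1_bh
 * above: each mirrored write decrements it from its end_io handler, and the
 * master buffer_head is completed only when the last copy finishes.  The
 * handler is hypothetical and simplified (the real code also handles
 * retries and buffer recycling), and it assumes the mirror bh's b_private
 * was set to point back at the collective raid1_bh.
 */
static void example_raid1_write_end_io(struct buffer_head *bh, int uptodate)
{
        struct raid1_bh *r1_bh = (struct raid1_bh *) bh->b_private;

        if (uptodate)
                set_bit(R1BH_Uptodate, &r1_bh->state);

        /* Safe from interrupt context: 'remaining' is an atomic_t. */
        if (atomic_dec_and_test(&r1_bh->remaining))
                /* Last outstanding copy: complete the original request.
                 * Treating "any copy made it" as success is a
                 * simplification of the real retry logic. */
                r1_bh->master_bh->b_end_io(r1_bh->master_bh,
                                           test_bit(R1BH_Uptodate,
                                                    &r1_bh->state));
}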
+ * Each sets the Uptodate bit before releasing the Lock bit. + * This leaves one multi-stage transition: + * Want->Dirty->Clean + * This is safe because thinking that a Clean buffer is actually dirty + * will at worst delay some action, and the stripe will be scheduled + * for attention after the transition is complete. + * + * There is one possibility that is not covered by these states. That + * is if one drive has failed and there is a spare being rebuilt. We + * can't distinguish between a clean block that has been generated + * from parity calculations, and a clean block that has been + * successfully written to the spare ( or to parity when resyncing). + * To distingush these states we have a stripe bit STRIPE_INSYNC that + * is set whenever a write is scheduled to the spare, or to the parity + * disc if there is no spare. A sync request clears this bit, and + * when we find it set with no buffers locked, we know the sync is + * complete. + * + * Buffers for the md device that arrive via make_request are attached + * to the appropriate stripe in one of two lists linked on b_reqnext. + * One list (bh_read) for read requests, one (bh_write) for write. + * There should never be more than one buffer on the two lists + * together, but we are not guaranteed of that so we allow for more. + * + * If a buffer is on the read list when the associated cache buffer is + * Uptodate, the data is copied into the read buffer and it's b_end_io + * routine is called. This may happen in the end_request routine only + * if the buffer has just successfully been read. end_request should + * remove the buffers from the list and then set the Uptodate bit on + * the buffer. Other threads may do this only if they first check + * that the Uptodate bit is set. Once they have checked that they may + * take buffers off the read queue. + * + * When a buffer on the write list is committed for write is it copied + * into the cache buffer, which is then marked dirty, and moved onto a + * third list, the written list (bh_written). Once both the parity + * block and the cached buffer are successfully written, any buffer on + * a written list can be returned with b_end_io. + * + * The write list and read list both act as fifos. The read list is + * protected by the device_lock. The write and written lists are + * protected by the stripe lock. The device_lock, which can be + * claimed while the stipe lock is held, is only for list + * manipulations and will only be held for a very short time. It can + * be claimed from interrupts. + * + * + * Stripes in the stripe cache can be on one of two lists (or on + * neither). The "inactive_list" contains stripes which are not + * currently being used for any request. They can freely be reused + * for another stripe. The "handle_list" contains stripes that need + * to be handled in some way. Both of these are fifo queues. Each + * stripe is also (potentially) linked to a hash bucket in the hash + * table so that it can be found by sector number. Stripes that are + * not hashed must be on the inactive_list, and will normally be at + * the front. All stripes start life this way. + * + * The inactive_list, handle_list and hash bucket lists are all protected by the + * device_lock. + * - stripes on the inactive_list never have their stripe_lock held. + * - stripes have a reference counter. If count==0, they are on a list. + * - If a stripe might need handling, STRIPE_HANDLE is set. 
+ * - When refcount reaches zero, then if STRIPE_HANDLE it is put on + * handle_list else inactive_list + * + * This, combined with the fact that STRIPE_HANDLE is only ever + * cleared while a stripe has a non-zero count means that if the + * refcount is 0 and STRIPE_HANDLE is set, then it is on the + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then + * the stripe is on inactive_list. + * + * The possible transitions are: + * activate an unhashed/inactive stripe (get_active_stripe()) + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev + * activate a hashed, possibly active stripe (get_active_stripe()) + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev + * attach a request to an active stripe (add_stripe_bh()) + * lockdev attach-buffer unlockdev + * handle a stripe (handle_stripe()) + * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io + * release an active stripe (release_stripe()) + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev + * + * The refcount counts each thread that have activated the stripe, + * plus raid5d if it is handling it, plus one for each active request + * on a cached buffer. + */ +struct stripe_head { + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */ + struct list_head lru; /* inactive_list or handle_list */ + struct raid5_private_data *raid_conf; + struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */ + struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */ + struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */ + struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */ + struct page *bh_page[MD_SB_DISKS]; /* saved bh_cache[n]->b_page when reading around the cache */ + struct evms_logical_node *node[MD_SB_DISKS]; /* the target device node */ + unsigned long sector; /* sector of this row */ + int size; /* buffers size */ + int pd_idx; /* parity disk index */ + unsigned long state; /* state flags */ + atomic_t count; /* nr of active thread/requests */ + spinlock_t lock; + int sync_redone; +}; + + +/* + * Write method + */ +#define RECONSTRUCT_WRITE 1 +#define READ_MODIFY_WRITE 2 +/* not a write method, but a compute_parity mode */ +#define CHECK_PARITY 3 + +/* + * Stripe state + */ +#define STRIPE_ERROR 1 +#define STRIPE_HANDLE 2 +#define STRIPE_SYNCING 3 +#define STRIPE_INSYNC 4 +#define STRIPE_PREREAD_ACTIVE 5 +#define STRIPE_DELAYED 6 + +/* + * Plugging: + * + * To improve write throughput, we need to delay the handling of some + * stripes until there has been a chance that several write requests + * for the one stripe have all been collected. + * In particular, any write request that would require pre-reading + * is put on a "delayed" queue until there are no stripes currently + * in a pre-read phase. Further, if the "delayed" queue is empty when + * a stripe is put on it then we "plug" the queue and do not process it + * until an unplg call is made. (the tq_disk list is run). + * + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add + * it to the count of prereading stripes. 
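/*
 * Editorial illustration -- not part of the EVMS patch.  A simplified sketch
 * of the refcount rule spelled out above: when a stripe's count reaches
 * zero it goes to the handle_list if STRIPE_HANDLE is set, otherwise back
 * to the inactive_list.  The helper is hypothetical; the real release path
 * runs under device_lock, wakes raid5d or wait_for_stripe, and interacts
 * with the plugging logic described in the surrounding comment.
 */
static void example_release_stripe(struct stripe_head *sh,
                                   struct list_head *handle_list,
                                   struct list_head *inactive_list)
{
        if (atomic_dec_and_test(&sh->count)) {
                if (test_bit(STRIPE_HANDLE, &sh->state))
                        list_add_tail(&sh->lru, handle_list);
                else
                        list_add_tail(&sh->lru, inactive_list);
        }
}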
+ * When write is initiated, or the stripe refcnt == 0 (just in case) we + * clear the PREREAD_ACTIVE flag and decrement the count + * Whenever the delayed queue is empty and the device is not plugged, we + * move any strips from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE. + * In stripe_handle, if we find pre-reading is necessary, we do it if + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue. + * HANDLE gets cleared if stripe_handle leave nothing locked. + */ + + +struct disk_info { + kdev_t dev; + struct evms_logical_node *node; + int operational; + int number; + int raid_disk; + int write_only; + int spare; + int used_slot; +}; + +struct raid5_private_data { + struct stripe_head **stripe_hashtbl; + mddev_t *mddev; + struct evms_thread *thread, *resync_thread; + struct disk_info disks[MD_SB_DISKS]; + struct disk_info *spare; + int buffer_size; + int chunk_size, level, algorithm; + int raid_disks, working_disks, failed_disks; + int resync_parity; + int max_nr_stripes; + + struct list_head handle_list; /* stripes needing handling */ + struct list_head delayed_list; /* stripes that have plugged requests */ + atomic_t preread_active_stripes; /* stripes with scheduled io */ + /* + * Free stripes pool + */ + atomic_t active_stripes; + struct list_head inactive_list; + md_wait_queue_head_t wait_for_stripe; + int inactive_blocked; /* release of inactive stripes blocked, + * waiting for 25% to be free + */ + md_spinlock_t device_lock; + + int plugged; + struct tq_struct plug_tq; +}; + +typedef struct raid5_private_data raid5_conf_t; + +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private) + +/* + * Our supported algorithms + */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + + +#define EVMS_MD_RAID5_INIT_IO 1 + +struct r5_sync_io { + int rw; + u64 lsn; + u64 nr_sects; + void *data; +}; +#endif diff -Naur linux-2002-09-30/include/linux/evms/evms_snapshot.h evms-2002-09-30/include/linux/evms/evms_snapshot.h --- linux-2002-09-30/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969 +++ evms-2002-09-30/include/linux/evms/evms_snapshot.h Wed Sep 25 15:05:19 2002 @@ -0,0 +1,361 @@ +/* -*- linux-c -*- */ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * linux/include/linux/evms_snapshot.h + * + * EVMS Snapshot Feature kernel header file + */ + +#ifndef __EVMS_SNAPSHOT_INCLUDED__ +#define __EVMS_SNAPSHOT_INCLUDED__ + +#define EVMS_SNAPSHOT_VERSION_MAJOR 2 +#define EVMS_SNAPSHOT_VERSION_MINOR 1 +#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL 1 + +#define EVMS_SNAPSHOT_FEATURE_ID 104 + +#define EVMS_SNAPSHOT_SIGNATURE 0x536e4170 /* SnAp */ +#define EVMS_ORIGINAL_SIGNATURE 0x4f724967 /* OrIg */ +#define MAX_HASH_CHAIN_ENTRIES 10 + +/* Status flags */ +#define EVMS_SNAPSHOT 0x001 +#define EVMS_SNAPSHOT_ORG 0x002 +#define EVMS_SNAPSHOT_DISABLED 0x004 +#define EVMS_SNAPSHOT_FULL 0x008 +#define EVMS_SNAPSHOT_QUIESCED 0x010 +#define EVMS_SNAPSHOT_WRITEABLE 0x020 +#define EVMS_SNAPSHOT_ASYNC 0x040 +#define EVMS_SNAPSHOT_ROLLBACK 0x080 +#define EVMS_SNAPSHOT_ROLLBACK_COMP 0x100 +#define EVMS_SNAPSHOT_DISABLED_PENDING 0x200 + +/* Private ioctl commands */ +#define SNAPSHOT_QUERY_PERCENT_FULL 1 +#define SNAPSHOT_START_ROLLBACK 2 +#define SNAPSHOT_CHECK_STATE 3 + +/* Chunk states - for async mode */ +#define SNAP_CHUNK_COPYING 1 /* Chunk is being copied from org to snap. */ +#define SNAP_CHUNK_COPIED 0 /* Chunk has been copied from org to snap. */ + +#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128 /* sectors == 64k */ +#define SNAPSHOT_MIN_CHUNK_SIZE 16 /* 8kB */ +#define SNAPSHOT_MAX_CHUNK_SIZE 2048 /* 1MB */ +#define SNAPSHOT_CHUNK_BUFFER_SIZE 128 /* copy buffer */ + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9) +#define min(a,b) (((a)<(b))?(a):(b)) +#endif + +/** + * struct snapshot_metadata + * + * @signature: 0 : EVMS_SNAPSHOT_SIGNATURE + * @CRC: 4 : + * @version: 8 : Major, minor, patchlevel + * @flags: 20 : EVMS_SNAPSHOT_* + * @original_volume: 24 : Name of volume being snapshotted. + * @original_size: 152: In sectors. + * @lba_of_COW_table: 160: + * @lba_of_first_chunk: 168: + * @chunk_size: 176: In sectors + * @total_chunks: 180: + * + * On-disk metadata sector for EVMS Snapshot feature. + */ +struct snapshot_metadata { + u32 signature; + u32 CRC; + struct evms_version version; + u32 flags; + u8 original_volume[128]; + u64 original_size; + u64 lba_of_COW_table; + u64 lba_of_first_chunk; + u32 chunk_size; + u32 total_chunks; +}; + +/** + * struct snapshot_hash_entry + * + * @org_chunk: Chunk number, not LBA. + * @snap_chunk: Chunk_number, not LBA. + * @chunk_state: SNAP_CHUNK_* + * @chunk_state_lock: Protects access to chunk_state + * @snap_io: In async mode, the control-block for copying this chunk. + * @next + * @prev + * + * Entries in the snapshot remapping hash-table. + */ +struct snapshot_hash_entry { + u64 org_chunk; + u64 snap_chunk; + u32 chunk_state; + spinlock_t chunk_state_lock; + struct async_snap_io * snap_io; + struct snapshot_hash_entry * next; + struct snapshot_hash_entry * prev; +}; + +/** + * struct snapshot_volume + * + * @logical_node: Node below us. + * @exported_node: Node above us. + * @snapshot_org: The volume being snapshotted. + * @snapshot_next: List of volumes snapshotting this original. + * @snap_semaphore: On snapshots: protects access to the snapshot + * volume structure. + * On originals: protects the list of snapshots. + * @snapshot_map: Hash table of remapped chunks. + * @free_hash_list: List of pre-allocated hash entries. + * @chunk_size: In sectors. + * @chunk_shift: Shift value for chunk_size. 
+ * @num_chunks: In this volume. + * @next_cow_entry: Index into current COW table sector. + * @current_cow_sector: Logical sector of current COW table. + * @next_free_chunk: Index of next free chunk (not LBA!). + * @hash_table_size: Size of the hash table for the remap. + * @flags: Status flags. EVMS_SNAPSHOT_* + * @cow_table: One sector's worth of COW tables. + * @async_io_thread: Thread for async copy-on-writes. Only on originals. + * @chunk_write_list: Lists and locks attached to the original. + * @chunk_write_list_lock: + * @org_pending_io_list: + * @org_pending_io_list_lock: + * @snap_pending_io_list: + * @snap_pending_io_list_lock: + * @cow_table_write_list: List and lock attached to the snapshot. + * @cow_table_write_list_lock: + * @rollback_thread: Thread for rollbacks. Only on snapshots. + * @chunk_data_buffer: Buffer for copying data during rollbacks. + * + * Private data for one snapshot volume or one original volume. + */ +struct snapshot_volume { + struct evms_logical_node * logical_node; + struct evms_logical_node * exported_node; + struct snapshot_volume * snapshot_org; + struct snapshot_volume * snapshot_next; + struct rw_semaphore snap_semaphore; + struct snapshot_hash_entry ** snapshot_map; + struct snapshot_hash_entry * free_hash_list; + u32 chunk_size; + u32 chunk_shift; + u32 num_chunks; + u32 next_cow_entry; + u64 current_cow_sector; + u32 next_free_chunk; + u32 hash_table_size; + u32 flags; + u64 cow_table[64]; + struct evms_thread * async_io_thread; + struct list_head chunk_write_list; + spinlock_t chunk_write_list_lock; + struct list_head org_pending_io_list; + spinlock_t org_pending_io_list_lock; + struct list_head snap_pending_io_list; + spinlock_t snap_pending_io_list_lock; + struct list_head cow_table_write_list; + spinlock_t cow_table_write_list_lock; +#ifdef SNAPSHOT_DEBUG + atomic_t cow_table_writes; + atomic_t cow_table_overlaps; +#endif + struct evms_thread * rollback_thread; + u8 * chunk_data_buffer; +}; + +/** + * struct snap_io_buffer + * + * @bh: A pointer to the embedded buffer_head at the end. + * @buffer_private: Private data associated with this buffer. + * @buffer_next: List of snap_io_buffer's for one async_[org|snap]_io. + * @copy_next: List of buffers that will write the data that this + * buffer just read. + * @chunk_write_list: List for the thread to use to drive writes to the + * snapshot as part of a copy. + * @_bh: An embedded buffer_head. The b_private field will + * always point back at the snap_io_buffer. + * + * A wrapper around a buffer_head, to allow for the buffer to exist on the + * variety of lists used by snapshotting. + */ +struct snap_io_buffer { + struct buffer_head * bh; + void * buffer_private; + struct snap_io_buffer * buffer_next; + struct snap_io_buffer * copy_next; + struct list_head chunk_write_list; + struct buffer_head _bh; +}; + +#define CHUNK_WRITE_ENTRY(lh) list_entry((lh), \ + struct snap_io_buffer, \ + chunk_write_list) + +/** + * struct async_snap_io + * + * @snap_volume: Snapshot volume that this chunk belongs to. + * @hash_table_entry: Hash table entry that this chunk belongs to. + * @org_io: Parent async I/O structure that contains list + * of read buffers. + * @pending_reads: List of pending read requests to the snapshot. + * @pending_writes: List of pending write requests to the snapshot. + * @copy_buffers: List of buffers to use to write this chunk to the + * snapshot. + * @cow_table_buffer: Buffer for writing the cow table to disk. 
+
+/**
+ * struct async_snap_io
+ *
+ * @snap_volume:      Snapshot volume that this chunk belongs to.
+ * @hash_table_entry: Hash table entry that this chunk belongs to.
+ * @org_io:           Parent async I/O structure that contains the list
+ *                    of read buffers.
+ * @pending_reads:    List of pending read requests to the snapshot.
+ * @pending_writes:   List of pending write requests to the snapshot.
+ * @copy_buffers:     List of buffers to use to write this chunk to the
+ *                    snapshot.
+ * @cow_table_buffer: Buffer for writing the cow table to disk.
+ * @snap_io_list_next:    List of async_snap_io's for the parent async_org_io.
+ * @snap_pending_io_list: List of async_snap_io's to be processed by the thread.
+ *                        For each of these, the thread will process the
+ *                        contents of the pending_[reads|writes] lists.
+ * @cow_write_list:   List of cow table writes to be processed by the thread.
+ *                    For each of these, the thread will process the
+ *                    cow_table_buffer.
+ * @write_count:      Number of buffers remaining to write for this chunk
+ *                    (equal to the length of the copy_buffers list).
+ * @dev:              Copy of the b_rdev field for this volume. Needed in
+ *                    order to tell EVMS about pending I/Os.
+ *
+ * Control structure that handles writing a single chunk to the snapshot during
+ * a copy-on-write.
+ */
+struct async_snap_io {
+        struct snapshot_volume * snap_volume;
+        struct snapshot_hash_entry * hash_table_entry;
+        struct async_org_io * org_io;
+        struct buffer_head * pending_reads;
+        struct buffer_head * pending_writes;
+        struct snap_io_buffer * copy_buffers;
+        struct snap_io_buffer * cow_table_buffer;
+        struct async_snap_io * snap_io_list_next;
+        struct list_head snap_pending_io_list;
+        struct list_head cow_write_list;
+        atomic_t write_count;
+        kdev_t dev;
+};
+
+#define SNAP_PENDING_IO_ENTRY(lh) list_entry((lh), \
+                                             struct async_snap_io, \
+                                             snap_pending_io_list)
+#define COW_WRITE_ENTRY(lh) list_entry((lh), \
+                                       struct async_snap_io, \
+                                       cow_write_list)
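/*
 * Illustrative sketch, not taken from the EVMS source: completion accounting
 * for the copy side of a copy-on-write.  Each copy buffer that finishes
 * writing its piece of the chunk to the snapshot drops write_count; when the
 * count reaches zero the chunk can be marked SNAP_CHUNK_COPIED and the
 * held-off requests on the pending_[reads|writes] lists can be released.
 * The function name and the hand-off step are assumptions for illustration.
 */
static void example_copy_write_done(struct snap_io_buffer * buf)
{
        struct async_snap_io * snap_io = buf->buffer_private;
        unsigned long flags;

        if (atomic_dec_and_test(&snap_io->write_count)) {
                spin_lock_irqsave(&snap_io->hash_table_entry->chunk_state_lock,
                                  flags);
                snap_io->hash_table_entry->chunk_state = SNAP_CHUNK_COPIED;
                spin_unlock_irqrestore(&snap_io->hash_table_entry->chunk_state_lock,
                                       flags);

                /* At this point the pending reads/writes queued against this
                 * chunk would be handed back to the snapshot thread via the
                 * snap_pending_io_list. */
        }
}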
+/**
+ * struct async_org_io
+ *
+ * @org_volume:          Original volume that this chunk belongs to.
+ * @pending_writes:      List of pending write requests to the original.
+ * @pending_writes_lock: Protects the pending_writes list.
+ * @copy_buffers:        List of buffers to use to read this chunk from the
+ *                       original.
+ * @snap_io_list:        List of async_snap_io's that will write this chunk to
+ *                       the snapshots.
+ * @org_pending_io_list: List of async_org_io's to be processed by the thread.
+ *                       For each of these, the thread will process the
+ *                       contents of the pending_writes list.
+ * @copy_count:          Number of snapshots remaining to write this chunk.
+ * @ref_count:           = copy_count + 1. Needed to determine when the entire
+ *                       async I/O structure can be deallocated.
+ * @dev:                 Copy of the b_rdev field for this volume. Needed in
+ *                       order to tell EVMS about pending I/Os.
+ */
+struct async_org_io {
+        struct snapshot_volume * org_volume;
+        struct buffer_head * pending_writes;
+        spinlock_t pending_writes_lock;
+        struct snap_io_buffer * copy_buffers;
+        struct async_snap_io * snap_io_list;
+        struct list_head org_pending_io_list;
+        atomic_t copy_count;
+        atomic_t ref_count;
+#ifdef SNAPSHOT_DEBUG
+        struct async_org_io * debug_next_org_io;
+#endif
+        kdev_t dev;
+};
+
+#define ORG_PENDING_IO_ENTRY(lh) list_entry((lh), \
+                                            struct async_org_io, \
+                                            org_pending_io_list)
+
+/* Debugging code */
+#ifdef SNAPSHOT_DEBUG
+
+#define DEBUG_CHECK_SNAP_IO(async_snap_io) \
+        do { \
+                if ( (async_snap_io)->pending_reads || \
+                     (async_snap_io)->pending_writes ) { \
+                        BUG(); \
+                } \
+        } while (0);
+
+#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io) \
+        do { \
+                struct async_org_io ** p_org_io; \
+                unsigned long flags; \
+                if ((async_org_io)->pending_writes) { \
+                        BUG(); \
+                } \
+                spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
+                for ( p_org_io = &debug_async_org_io_list; *p_org_io; \
+                      p_org_io = &(*p_org_io)->debug_next_org_io ) { \
+                        if ( *p_org_io == (async_org_io) ) { \
+                                *p_org_io = (async_org_io)->debug_next_org_io; \
+                                break; \
+                        } \
+                } \
+                (async_org_io)->debug_next_org_io = NULL; \
+                spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
+        } while (0);
+
+#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io) \
+        do { \
+                unsigned long flags; \
+                spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
+                (async_org_io)->debug_next_org_io = debug_async_org_io_list; \
+                debug_async_org_io_list = (async_org_io); \
+                spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
+        } while (0);
+
+#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume) \
+        atomic_inc(&(snap_volume)->cow_table_overlaps)
+
+#define DEBUG_INC_COW_TABLE_WRITES(snap_volume) \
+        atomic_inc(&(snap_volume)->cow_table_writes)
+
+#else /* SNAPSHOT_DEBUG */
+
+#define DEBUG_CHECK_SNAP_IO(async_snap_io)
+#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io)
+#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io)
+#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume)
+#define DEBUG_INC_COW_TABLE_WRITES(snap_volume)
+
+#endif /* SNAPSHOT_DEBUG */
+
+#endif /* __EVMS_SNAPSHOT_INCLUDED__ */
+
diff -Naur linux-2002-09-30/include/linux/evms/evms_xor.h evms-2002-09-30/include/linux/evms/evms_xor.h
--- linux-2002-09-30/include/linux/evms/evms_xor.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-09-30/include/linux/evms/evms_xor.h	Mon Feb 4 09:58:43 2002
@@ -0,0 +1,23 @@
+#ifndef _XOR_H
+#define _XOR_H
+
+#include
+
+#define MAX_XOR_BLOCKS 5
+
+extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
+
+struct xor_block_template {
+        struct xor_block_template *next;
+        const char *name;
+        int speed;
+        void (*do_2)(unsigned long, unsigned long *, unsigned long *);
+        void (*do_3)(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *);
+        void (*do_4)(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *, unsigned long *);
+        void (*do_5)(unsigned long, unsigned long *, unsigned long *,
+                     unsigned long *, unsigned long *, unsigned long *);
+};
+
+#endif
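/*
 * Illustrative sketch, not taken from the EVMS source: the shape of an
 * xor_block_template instance that evms_md_xor_block() can dispatch to.
 * The first argument of the do_N routines is assumed to be the region
 * length in bytes, following the mainline RAID xor template convention;
 * the real EVMS MD code supplies calibrated, arch-optimized templates.
 */
static void example_xor_2(unsigned long bytes, unsigned long * p1,
                          unsigned long * p2)
{
        long words = bytes / sizeof(unsigned long);

        while (words--)
                *p1++ ^= *p2++;
}

static void example_xor_3(unsigned long bytes, unsigned long * p1,
                          unsigned long * p2, unsigned long * p3)
{
        long words = bytes / sizeof(unsigned long);

        while (words--)
                *p1++ ^= *p2++ ^ *p3++;
}

static struct xor_block_template example_xor_template = {
        next:  NULL,
        name:  "example_generic",
        speed: 0,               /* measured by calibration in the real code */
        do_2:  example_xor_2,
        do_3:  example_xor_3,
        /* do_4/do_5 omitted from this sketch */
};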
diff -Naur linux-2002-09-30/include/linux/evms/ldev_mgr.h evms-2002-09-30/include/linux/evms/ldev_mgr.h
--- linux-2002-09-30/include/linux/evms/ldev_mgr.h	Wed Dec 31 18:00:00 1969
+++ evms-2002-09-30/include/linux/evms/ldev_mgr.h	Wed Aug 28 14:30:51 2002
@@ -0,0 +1,46 @@
+
+/* -*- linux-c -*- */
+/*
+ *
+ * Copyright (c) International Business Machines Corp., 2000
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+/* linux/include/linux/evms/ldev_mgr.h
+ *
+ * EVMS - Local Device (Hard Drive) Manager
+ *
+ */
+
+/* plugin feature ID */
+#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
+
+/* plugin ioctl feature command defines */
+#define LDEV_MGR_BROADCAST_IOCTL_CMD 1
+
+/**
+ * struct ldev_plugin_ioctl - ldev mgr direct ioctl packet definition
+ * @disk_handle: handle identifying target disk
+ * @cmd:         ioctl cmd
+ * @arg:         ioctl argument
+ *
+ * local device manager direct ioctl packet definition
+ **/
+struct ldev_plugin_ioctl {
+        u64 disk_handle;
+        u32 cmd;
+        ulong arg;
+};
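/*
 * Illustrative sketch, not taken from the EVMS source: filling in the
 * direct-ioctl packet defined above.  The delivery helper named here is
 * hypothetical; in EVMS the packet travels to the plugin identified by
 * EVMS_LOCAL_DEVICE_MANAGER_ID through the generic plugin-ioctl path, with
 * LDEV_MGR_BROADCAST_IOCTL_CMD as the feature command when the ioctl should
 * be broadcast to every disk, and disk_handle coming from an earlier query.
 */
static int example_send_ldev_ioctl(u32 plugin_id,
                                   struct ldev_plugin_ioctl * pkt); /* hypothetical */

static int example_flush_one_disk(u64 disk_handle)
{
        struct ldev_plugin_ioctl pkt;

        pkt.disk_handle = disk_handle;  /* target disk, from a prior query */
        pkt.cmd = BLKFLSBUF;            /* standard block ioctl to forward */
        pkt.arg = 0;

        return example_send_ldev_ioctl(EVMS_LOCAL_DEVICE_MANAGER_ID, &pkt);
}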