1 diff -Naur linux-2002-09-30/drivers/evms/AIXlvm_vge.c evms-2002-09-30/drivers/evms/AIXlvm_vge.c
2 --- linux-2002-09-30/drivers/evms/AIXlvm_vge.c Wed Dec 31 18:00:00 1969
3 +++ evms-2002-09-30/drivers/evms/AIXlvm_vge.c Fri Sep 27 14:55:45 2002
10 + * Copyright (c) International Business Machines Corp., 2000
12 + * This program is free software; you can redistribute it and/or modify
13 + * it under the terms of the GNU General Public License as published by
14 + * the Free Software Foundation; either version 2 of the License, or
15 + * (at your option) any later version.
17 + * This program is distributed in the hope that it will be useful,
18 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
20 + * the GNU General Public License for more details.
22 + * You should have received a copy of the GNU General Public License
23 + * along with this program; if not, write to the Free Software
24 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 + * linux/drivers/evms/AIXlvm_vge.c
31 + * EVMS AIX LVM Volume Group Emulator
37 +#define EVMS_AIX_DEBUG 1
39 +#define AIX_COMMON_SERVICES_MAJOR 0 // Required common services levels for the AIX kernel plugin
40 +#define AIX_COMMON_SERVICES_MINOR 5 // These must be incremented if new function is added to common
41 +#define AIX_COMMON_SERVICES_PATCHLEVEL 0 // services and the AIX kernel plugin uses the new function.
42 +#define AIX_INCREMENT_REQUEST 1
43 +#define AIX_DECREMENT_REQUEST -1
44 +#define AIX_RESYNC_BLOCKSIZE 512
45 +#define AIX_SYNC_INCOMPLETE 0x01
46 +#define AIX_SYNC_COMPLETE 0x00
48 +#define AIX_SLAVE_1 1
49 +#define AIX_SLAVE_2 2
51 +#include <linux/module.h>
52 +#include <linux/kernel.h>
53 +#include <linux/config.h>
55 +#include <linux/genhd.h>
56 +#include <linux/string.h>
57 +#include <linux/blk.h>
58 +#include <linux/init.h>
59 +#include <linux/slab.h>
61 +#include <linux/evms/evms.h>
62 +#include <linux/evms/evms_aix.h>
63 +#include <asm/system.h>
64 +#include <asm/uaccess.h>
66 +#include <linux/sched.h>
67 +#include <linux/smp_lock.h>
68 +#include <linux/locks.h>
69 +#include <linux/delay.h>
70 +#include <linux/reboot.h>
71 +#include <linux/completion.h>
72 +#include <linux/vmalloc.h>
74 +#ifdef EVMS_AIX_DEBUG
75 +static int AIX_volume_group_dump(void);
78 +static struct aix_volume_group *AIXVolumeGroupList = NULL;
79 +static struct evms_thread *AIX_mirror_read_retry_thread;
80 +static struct evms_thread *AIX_mirror_resync_thread;
81 +static struct evms_pool_mgmt *AIX_BH_list_pool = NULL;
82 +static struct aix_mirror_bh *AIX_retry_list = NULL;
83 +static struct aix_mirror_bh **AIX_retry_tail = NULL;
84 +static spinlock_t AIX_retry_list_lock = SPIN_LOCK_UNLOCKED;
85 +static spinlock_t AIX_resync_list_lock = SPIN_LOCK_UNLOCKED;
86 +static spinlock_t AIX_resync_pp_lock = SPIN_LOCK_UNLOCKED;
87 +static int AIXResyncInProgress = FALSE;
88 +static struct aix_resync_struct *AIX_resync_list = NULL;
90 +// Plugin API prototypes
92 +static void AIXiod(void *data);
93 +static void AIXresync(void *data);
94 +static int discover_aix(struct evms_logical_node **evms_logical_disk_head);
95 +static int discover_volume_groups(struct evms_logical_node **);
96 +static int discover_logical_volumes(void);
97 +static int end_discover_aix(struct evms_logical_node **evms_logical_disk_head);
98 +static void read_aix(struct evms_logical_node *node, struct buffer_head *bh);
99 +static void write_aix(struct evms_logical_node *node, struct buffer_head *bh);
100 +static int ioctl_aix(struct evms_logical_node *logical_node,
101 + struct inode *inode,
102 + struct file *file, unsigned int cmd, unsigned long arg);
104 +static int aix_direct_ioctl(struct inode *inode,
106 + unsigned int cmd, unsigned long args);
108 +static int AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
109 + u64 size, // size (in sectors) of request to remap
110 + u64 * new_sector, // remapped sector
111 + u64 * new_size, // new size (in sectors)
112 + struct partition_list_entry **partition, // new node for which new_sector is relative
113 + u32 * le, u32 * offset_in_le);
115 +static int validate_build_volume_group_disk_info(struct evms_logical_node
117 + struct AIXlvm_rec *AIXlvm);
119 +static int add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
120 + struct aix_volume_group *new_group,
122 +static int add_PV_to_volume_group(struct aix_volume_group *group,
123 + struct evms_logical_node *evms_partition,
125 +static struct aix_volume_group *AIX_create_volume_group(struct evms_logical_node
130 +static int AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
131 + struct evms_logical_node *logical_node,
132 + struct AIXlvm_rec *AIXlvm);
134 +static int AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node);
136 +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs);
138 +static int AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force);
140 +static int AIX_copy_on_read(struct aix_logical_volume *volume,
141 + struct partition_list_entry *master_part,
142 + struct partition_list_entry *slave1_part,
143 + struct partition_list_entry *slave2_part,
146 + u64 slave2_offset, u32 pe_size, int le);
148 +static int export_volumes(struct evms_logical_node **evms_logical_disk_head);
149 +static int lvm_cleanup(void);
150 +static int AIX_copy_header_info(struct vg_header *AIXvgh,
151 + struct vg_header *AIXvgh2);
152 +static int build_pe_maps(struct aix_volume_group *volume_group);
154 +static struct aix_logical_volume *new_logical_volume(struct lv_entries
156 + struct aix_volume_group
157 + *group, char *lv_name,
160 +static int check_log_volume_and_pe_maps(struct aix_volume_group *group);
161 +static int check_volume_groups(void);
162 +static int init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
163 + u64 sect_nr, /* disk LBA */
164 + u64 num_sects, /* # of sectors */
165 + void *buf_addr); /* buffer address */
167 +static int delete_logical_volume(struct aix_logical_volume *volume);
168 +static int delete_aix_node(struct evms_logical_node *logical_node);
169 +static int deallocate_volume_group(struct aix_volume_group *group);
171 +static void AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate);
173 +static void AIX_handle_write_mirror_drives(struct buffer_head *bh,
176 +static void aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep,
177 + unsigned long flags);
179 +static void AIX_schedule_resync(struct aix_logical_volume *resync_volume,
181 +static struct aix_logical_volume *AIX_get_volume_data(char *object_name);
183 +static void AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate);
185 +static int AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh,
186 + int index, int offset);
188 +static struct aix_mirror_bh *AIX_alloc_rbh(struct evms_logical_node *node,
189 + struct buffer_head *bh,
191 + u32 le, u64 org_sector, int cmd);
193 +static struct aix_mirror_bh *AIX_alloc_wbh(struct evms_logical_node *node,
194 + struct evms_logical_node *node2,
195 + struct evms_logical_node *node3,
196 + struct buffer_head *bh,
199 + u64 new_sector2, u64 new_sector3);
201 +static struct aix_mirror_bh *AIX_alloc_sbh(struct aix_logical_volume *volume,
202 + struct partition_list_entry
204 + struct partition_list_entry
206 + struct partition_list_entry
207 + *slave2_part, u64 master_offset,
208 + u64 slave1_offset, u64 slave2_offset,
211 +static void AIX_free_headers(struct vg_header *AIXvgh,
212 + struct vg_header *AIXvgh2,
213 + struct vg_trailer *AIXvgt,
214 + struct vg_trailer *AIXvgt2);
216 +static int remove_group_from_list(struct aix_volume_group *group);
218 +//****************************************************************************************************
220 +/* END of PROTOTYPES */
222 +#define GET_PHYSICAL_PART_SIZE(v1) (1 << v1)
224 +#define COMPARE_TIMESTAMPS(t1, t2) ( (t1).tv_sec == (t2).tv_sec && \
225 + (t1).tv_nsec == (t2).tv_nsec )
227 +#define COMPARE_UNIQUE_IDS(id1, id2) ( (id1).word1 == (id2).word1 && \
228 + (id1).word2 == (id2).word2 && \
229 + (id1).word3 == (id2).word3 && \
230 + (id1).word4 == (id2).word4 )
232 +#define SECTOR_IN_RANGE(s1, s2) ((s2 > s1) && (s2 < s1 + AIX_RESYNC_BLOCKSIZE))
234 +#define AIX_PV_STATE_VALID 0 // Both VGDAs are valid and match.
235 +#define AIX_PV_STATE_FIRST_VGDA 1 // Only the first VGDA is valid.
236 +#define AIX_PV_STATE_SECOND_VGDA 2 // Only the second VGDA is valid.
237 +#define AIX_PV_STATE_EITHER_VGDA -1 // Both VGDAs are valid, but do not match each other.
238 +#define AIX_PV_STATE_INVALID -2 // We're in an invalid state but there's more PVs in this group
240 +#ifndef EVMS_AIX_DEBUG
241 +#define AIX_VOLUME_GROUP_DUMP()
243 +#define AIX_VOLUME_GROUP_DUMP() LOG_DEBUG("Called line:%d \n",__LINE__); \
244 + AIX_volume_group_dump()
247 +// Global LVM data structures
249 +static struct evms_plugin_fops AIXlvm_fops = {
250 + .discover = discover_aix,
251 + .end_discover = end_discover_aix,
252 + .delete = delete_aix_node,
254 + .write = write_aix,
255 + .init_io = init_io_aix,
256 + .ioctl = ioctl_aix,
257 + .direct_ioctl = aix_direct_ioctl
260 +static struct evms_plugin_header plugin_header = {
261 + .id = SetPluginID(IBM_OEM_ID,
262 + EVMS_REGION_MANAGER,
263 + EVMS_AIX_FEATURE_ID),
268 + .required_services_version = {
269 + .major = AIX_COMMON_SERVICES_MAJOR,
270 + .minor = AIX_COMMON_SERVICES_MINOR,
272 + AIX_COMMON_SERVICES_PATCHLEVEL},
273 + .fops = &AIXlvm_fops
277 + * Function: remap sector
278 + * Common function to remap volume lba to partition lba in appropriate PE
281 +AIX_remap_sector(struct evms_logical_node *node, u64 org_sector, // logical sector to remap
282 + u64 size, // size (in sectors) of request to remap
283 + u64 * new_sector, // remapped sector
284 + u64 * new_size, // new size (in sectors)
285 + struct partition_list_entry **partition, // new node for which new_sector is relative
286 + u32 * le, u32 * offset_in_le)
288 + struct aix_logical_volume *volume;
290 + u32 sectors_per_stripe;
291 + u32 partition_to_use;
293 + u32 stripe_in_column;
295 + u32 org_sector32; // Until striping is 64-bit enabled.
297 + volume = (struct aix_logical_volume *) node->private;
300 + LOG_DEBUG("-- %s volume:%p lv:%d size:" PFU64 " Name:%s\n",
301 + __FUNCTION__, volume, volume->lv_number, size, volume->name);
302 + LOG_DEBUG(" node %p node_name [%s] org_sector:" PFU64 "\n", node,
303 + node->name, org_sector);
304 + LOG_DEBUG(" mirror_copies:%d volume->lv_size:" PFU64 "\n",
305 + volume->mirror_copies, volume->lv_size);
308 + org_sector32 = org_sector;
310 + *(new_size) = size;
312 + // Check if volume is striped. Reset the size if the request
313 + // crosses a stripe boundary.
314 + if (volume->stripes > 1) {
316 + LOG_DEBUG(" *** STRIPED ***\n");
317 + LOG_DEBUG(" ------- volume->stripe_size:%d org_sector:%d volume_stripes:%d\n",
318 + volume->stripe_size, org_sector32, volume->stripes);
321 + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
322 + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
325 + LOG_DEBUG("OLD - le:%d -- offset_in_le:%d \n", *(le),
329 + sectors_per_stripe = volume->stripe_size / AIX_SECTOR_SIZE;
331 + (org_sector32 / sectors_per_stripe) % volume->stripes;
333 + ((((org_sector32 / volume->stripe_size) / volume->stripes) *
334 + volume->stripe_size) +
335 + (org_sector32 % sectors_per_stripe));
337 + ((org_sector32 / sectors_per_stripe) / volume->stripes) *
338 + sectors_per_stripe;
341 + LOG_DEBUG("offset_in_le:%d org_sector:" PFU64
342 + " pe_shift:%d stripe_shift:%d\n", *(offset_in_le),
343 + org_sector, volume->pe_size_shift,
344 + volume->stripe_size_shift);
346 + LOG_DEBUG(" org_sector:%d sectors_per_stripe:%d partition_to_use:%d stripe_in_column:%d column:%d\n",
347 + org_sector32, sectors_per_stripe, partition_to_use,
348 + stripe_in_column, column);
349 + LOG_DEBUG(" offset_in_le + size:" PFU64
350 + " volume->pe_size:%d volume->lv_size:" PFU64 "\n",
351 + (*(offset_in_le) + size), volume->pe_size,
355 + if (*(offset_in_le) + size > volume->pe_size) {
356 + *new_size = volume->pe_size - *(offset_in_le);
357 + LOG_DEBUG(" new_size " PFU64 "\n", *new_size);
361 + // Non-striped volume. Just find LE and offset. Reset the size
362 + // if the request crosses an LE boundary.
365 + LOG_DEBUG(" *** NON-STRIPED ***\n");
368 + *(le) = org_sector >> volume->pe_size_shift; // 64-bit safe
369 + *(offset_in_le) = org_sector & (volume->pe_size - 1); // 64-bit safe
374 + LOG_DEBUG(" offset_in_le:%d org_sector:" PFU64 " shift:%d\n",
375 + *(offset_in_le), org_sector, volume->pe_size_shift);
377 + if (*(le) >= volume->num_le) {
378 + LOG_DEBUG(" le Memory Overwrite !! le:%d vs volume->num_le:%d\n",
379 + *(le), volume->num_le);
384 + *(new_sector) = volume->le_to_pe_map[*(le)].pe_sector_offset + *(offset_in_le);
385 + *(partition) = volume->le_to_pe_map[*(le)].owning_pv;
388 + LOG_DEBUG(" new_sector:" PFU64 "\n", *(new_sector));
389 + LOG_DEBUG(" Owning Part %p\n", *(partition));
390 + LOG_DEBUG(" End %s\n", __FUNCTION__);
397 + * Function: read_aix
400 +read_aix(struct evms_logical_node *node, struct buffer_head *bh)
402 + struct partition_list_entry *partition;
406 + struct aix_logical_volume *volume;
407 + struct aix_mirror_bh *tmp_bh;
408 + u32 le, offset_in_le, count;
411 + volume = (struct aix_logical_volume *) node->private;
413 +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
414 +// volume->mirror_iterations);
418 + LOG_DEBUG(" node->total_vsectors:" PFU64 "\n", node->total_vsectors);
419 + LOG_DEBUG(" rsector:%lu rsize:%u node_flags:%u\n", bh->b_rsector,
420 + bh->b_size, node->flags);
423 + // Check if I/O goes past end of logical volume.
424 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
425 + node->total_vsectors) {
426 + LOG_CRITICAL(" read_aix ERROR %d\n", __LINE__);
427 + buffer_IO_error(bh);
431 + // Logical-to-physical remapping.
432 + if (AIX_remap_sector
433 + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
434 + &new_sector, &new_size, &partition, &le, &offset_in_le)
435 + || (!partition || !new_sector)) {
436 + LOG_CRITICAL(" read_aix bh: ERROR %d\n", __LINE__);
437 + buffer_IO_error(bh);
441 + org_sector = bh->b_rsector;
442 + bh->b_rsector = new_sector;
443 + //bh->b_size = new_size;
446 + LOG_DEBUG(" read_aix Mirror_Copies:%d\n", volume->mirror_copies);
449 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
452 + AIX_alloc_rbh(node, bh, 1, le, new_sector, AIX_LV_READ);
455 + buffer_IO_error(bh);
459 + if (volume->le_to_pe_map_mir1) {
460 + tmp_bh->mir_node1 =
461 + volume->le_to_pe_map_mir1[le].owning_pv->
463 + tmp_bh->mir_sector1 =
464 + volume->le_to_pe_map_mir1[le].pe_sector_offset +
468 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
469 + tmp_bh->mir_node2 =
470 + volume->le_to_pe_map_mir2[le].owning_pv->
472 + tmp_bh->mir_sector2 =
473 + volume->le_to_pe_map_mir2[le].pe_sector_offset +
477 + if (evms_cs_volume_request_in_progress
478 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
479 + buffer_IO_error(bh);
483 + if (AIXResyncInProgress) {
484 + if (SECTOR_IN_RANGE
485 + (tmp_bh->bh_req.b_rsector,
486 + AIX_resync_list->master_offset)) {
487 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
491 + R_IO(partition->logical_node, &tmp_bh->bh_req);
493 + if (AIXResyncInProgress) {
494 + if (SECTOR_IN_RANGE
495 + (tmp_bh->bh_req.b_rsector,
496 + AIX_resync_list->master_offset)) {
497 + spin_unlock_irqrestore(&AIX_resync_list_lock,
504 + R_IO(partition->logical_node, bh);
508 + LOG_DEBUG(" ***** %s ***** returning\n", __FUNCTION__);
514 + * Function: write_aix
517 +write_aix(struct evms_logical_node *node, struct buffer_head *bh)
519 + struct partition_list_entry *partition;
520 + u64 new_sector, new_sector2 = 0, new_sector3 = 0;
523 + struct aix_logical_volume *volume;
524 + struct aix_mirror_bh *tmp_bh;
525 + struct evms_logical_node *node2 = NULL, *node3 = NULL;
526 + u32 le, offset_in_le, count;
529 + volume = (struct aix_logical_volume *) node->private;
532 +// LOG_DEBUG(" ***** %s ***** bh:%p volume->iter:%d\n", __FUNCTION__, bh,
533 +// volume->mirror_iterations);
534 + LOG_DEBUG(" write_aix rsector:%lu rsize:%u\n", bh->b_rsector,
536 + LOG_DEBUG(" write_aix total_sectors:" PFU64 "\n", node->total_vsectors);
539 + if (volume->lv_access & EVMS_LV_INCOMPLETE) { //No writes allowed on incomplete volumes
540 + LOG_CRITICAL(" write_aix incomplete volume ERROR %d\n",
542 + buffer_IO_error(bh);
546 + // Check if I/O goes past end of logical volume.
547 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
548 + node->total_vsectors) {
549 + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
550 + buffer_IO_error(bh);
553 + // Logical-to-Physical remapping
554 + if (AIX_remap_sector
555 + (node, bh->b_rsector, (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT),
556 + &new_sector, &new_size, &partition, &le, &offset_in_le)
557 + || (!new_sector || !partition)) {
558 + LOG_CRITICAL(" write_aix ERROR %d\n", __LINE__);
559 + buffer_IO_error(bh);
563 + org_sector = bh->b_rsector;
564 + bh->b_rsector = new_sector;
565 + //bh->b_size = new_size;
568 + LOG_DEBUG(" write_aix Mirror_Copies:%d\n", volume->mirror_copies);
571 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
573 + if (volume->le_to_pe_map_mir1) {
575 + volume->le_to_pe_map_mir1[le].pe_sector_offset +
578 + volume->le_to_pe_map_mir1[le].owning_pv->
582 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
585 + volume->le_to_pe_map_mir2[le].pe_sector_offset +
588 + volume->le_to_pe_map_mir2[le].owning_pv->
593 + AIX_alloc_wbh(partition->logical_node, node2, node3, bh, le,
594 + volume->mirror_copies, new_sector2,
598 + buffer_IO_error(bh);
601 + tmp_bh->node = node;
603 + tmp_bh = tmp_bh->mirror_bh_list;
605 + if (evms_cs_volume_request_in_progress
606 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
607 + buffer_IO_error(bh);
608 + // free memory here
612 + if (AIXResyncInProgress) {
613 + if (SECTOR_IN_RANGE
614 + (tmp_bh->bh_req.b_rsector,
615 + AIX_resync_list->master_offset)) {
616 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
620 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
622 + if (AIXResyncInProgress) {
623 + if (SECTOR_IN_RANGE
624 + (tmp_bh->bh_req.b_rsector,
625 + AIX_resync_list->master_offset)) {
626 + spin_unlock_irqrestore(&AIX_resync_list_lock,
631 + tmp_bh = tmp_bh->next_r1;
634 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
635 + tmp_bh = tmp_bh->next_r1;
639 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
644 + W_IO(partition->logical_node, bh);
648 + LOG_DEBUG(" ***** %s returning *****\n", __FUNCTION__);
654 + * Function: ioctl_aix
658 +ioctl_aix(struct evms_logical_node *logical_node,
659 + struct inode *inode,
660 + struct file *file, unsigned int cmd, unsigned long arg)
662 + struct aix_logical_volume *volume =
663 + (struct aix_logical_volume *) (logical_node->private);
666 + LOG_EXTRA(" Ioctl %u\n", cmd);
672 + // Fixed geometry for all LVM volumes
673 + unsigned char heads = 64;
674 + unsigned char sectors = 32;
676 + struct hd_geometry *hd = (struct hd_geometry *) arg;
678 + cylinders = logical_node->total_vsectors;
679 + cylinders = (cylinders / heads) / sectors;
686 + ((char *) (&hd->heads), &heads, sizeof (heads)) != 0
687 + || copy_to_user((char *) (&hd->sectors), &sectors,
688 + sizeof (sectors)) != 0
689 + || copy_to_user((short *) (&hd->cylinders),
690 + &cylinders, sizeof (cylinders)) != 0
691 + || copy_to_user((long *) (&hd->start), &start,
692 + sizeof (start)) != 0) {
698 + case EVMS_QUIESCE_VOLUME:
701 + case EVMS_GET_DISK_LIST:
702 + case EVMS_CHECK_MEDIA_CHANGE:
703 + case EVMS_REVALIDATE_DISK:
704 + case EVMS_OPEN_VOLUME:
705 + case EVMS_CLOSE_VOLUME:
706 + case EVMS_CHECK_DEVICE_STATUS:
708 + // These five ioctl all need to be broadcast to all PVs.
709 + struct aix_volume_group *group = volume->group;
710 + struct partition_list_entry *partition;
711 + for (partition = group->partition_list; partition;
712 + partition = partition->next) {
714 + IOCTL(partition->logical_node, inode, file,
721 + // Currently the VGE does not send any ioctl's down to the
722 + // partitions. Which partition would they go to?
729 +/* Function: aix_direct_ioctl
731 + * This function provides a method for user-space to communicate directly
732 + * with a plugin in the kernel.
735 +aix_direct_ioctl(struct inode *inode,
736 + struct file *file, unsigned int cmd, unsigned long args)
738 + struct aix_logical_volume *volume = NULL;
739 + struct evms_plugin_ioctl_pkt argument;
743 + LOG_DEBUG(" Function:%s cmd:%d \n", __FUNCTION__, cmd);
745 + // Copy user's parameters to kernel space
747 + (&argument, (struct evms_plugin_ioctl *) args, sizeof (argument))) {
751 + // Make sure this is supposed to be our ioctl.
752 + if (argument.feature_id != plugin_header.id) {
757 + argument.feature_command = 1;
759 + switch (argument.feature_command) {
761 + case EVMS_AIX_RESYNC_MIRRORS:
763 + struct aix_volume_resync_ioctl aix_lv_resync;
767 + (struct aix_volume_resync_ioctl *) argument.
768 + feature_ioctl_data, sizeof (aix_lv_resync))) {
773 + volume = AIX_get_volume_data(aix_lv_resync.object_name);
776 + AIX_schedule_resync(volume, FALSE);
779 + (" Function:%s object_name:%s -- no match found\n",
780 + __FUNCTION__, aix_lv_resync.object_name);
792 + argument.status = rc;
793 + copy_to_user((struct evms_plugin_ioctl *) args, &argument,
794 + sizeof (argument));
799 +/* Function: AIX_get_volume_data
801 + * Search the global volume group list for a logical volume whose
802 + * name matches the given object name; returns NULL if not found.
804 +static struct aix_logical_volume *
805 +AIX_get_volume_data(char *object_name)
808 + struct aix_volume_group *VG_ptr;
809 + struct aix_logical_volume *volume = NULL;
812 + LOG_DEBUG(" Function:%s object_name:%s \n", __FUNCTION__, object_name);
814 + if (!object_name || !strlen(object_name)) {
818 + for (VG_ptr = AIXVolumeGroupList; VG_ptr; VG_ptr = VG_ptr->next) {
819 + for (i = 0; VG_ptr->volume_list[i]; i++) {
820 + if (!strcmp(VG_ptr->volume_list[i]->name, object_name)) {
822 + (" Function:%s FOUND!! volume_name:%s \n",
824 + VG_ptr->volume_list[i]->name);
825 + volume = VG_ptr->volume_list[i];
832 + LOG_DEBUG(" Function:%s object_name:%s NOT FOUND !! volume:%p \n",
833 + __FUNCTION__, object_name, volume);
840 + * Function: init_io_aix
844 +init_io_aix(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
845 + u64 sect_nr, /* disk LBA */
846 + u64 num_sects, /* # of sectors */
848 +{ /* buffer address */
849 + struct partition_list_entry *partition;
850 + u64 new_sector = 0;
855 + LOG_DEBUG(" ************ init_io_aix() num_sects:" PFU64
856 + " node:%p sect_nr:" PFU64 "\n", num_sects, node, sect_nr);
858 + // Init IO needs to deal with the possibility that a request can come
859 + // in that spans PEs or stripes. This is possible because there is no
860 + // limit on num_sects. To fix this, we loop through AIX_remap_sector and
861 + // INIT_IO until num_sects reaches zero.
863 + while (num_sects > 0) {
865 + if (AIX_remap_sector(node, sect_nr, num_sects, &new_sector, &new_size,
866 + &partition, &le, &offset) || (!new_sector || !partition)) {
867 + LOG_CRITICAL("--- Error returned from AIX_remap_sector %d\n",
872 + LOG_DEBUG(" init_io_aix() line:%d logical_node:%p io_flag:%d new_sector:"
873 + PFU64 " new_size:" PFU64 "\n", __LINE__,
874 + partition->logical_node, io_flag, new_sector, new_size);
876 + rc = INIT_IO(partition->logical_node, io_flag, new_sector,
877 + new_size, buf_addr);
878 + num_sects -= new_size;
879 + sect_nr += new_size;
880 + buf_addr = (void *) (((unsigned long) buf_addr) +
881 + (unsigned long) (new_size << EVMS_VSECTOR_SIZE_SHIFT));
888 + * Function: AIXlvm_vge_init
892 +AIXlvm_vge_init(void)
895 + LOG_DEBUG(" %s --------\n", __FUNCTION__);
898 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
901 +module_init(AIXlvm_vge_init);
903 +/********** Required Plugin Functions **********/
906 + * Function: discover_aix
908 + * This is the entry point into the LVM discovery process.
911 +discover_aix(struct evms_logical_node **evms_logical_disk_head)
913 + int rc = 0, count = 0;
916 + LOG_DEBUG("[%s] discover_volume_groups\n", __FUNCTION__);
918 + rc = discover_volume_groups(evms_logical_disk_head);
921 + LOG_ERROR("[%s] discover_volume_groups rc=%d\n", __FUNCTION__,rc);
924 + if (AIXVolumeGroupList && !rc) {
926 + LOG_DEBUG("[%s] discover_logical_volumes\n", __FUNCTION__);
928 + rc = discover_logical_volumes();
931 + LOG_ERROR("[%s] discover_logical_volumes rc=%d\n",
935 + LOG_DEBUG("[%s] export_volumes\n", __FUNCTION__);
937 + count = export_volumes(evms_logical_disk_head);
939 + LOG_DEBUG("[%s] export_volumes count=%d\n", __FUNCTION__,
948 +discover_volume_groups(struct evms_logical_node **evms_logical_disk_head)
950 + struct evms_logical_node *logical_node;
951 + struct evms_logical_node *next_node;
952 + struct aix_ipl_rec_area *AIXpv;
953 + struct AIXlvm_rec *AIXlvm; // Temp holder for the LVM on disk rec
955 + LOG_DEBUG(" Begin %s\n", __FUNCTION__);
957 + AIXpv = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
962 + // We'll create at least one volume entry, if we don't find any AIX volumes we'll clean it up later
964 + AIXlvm = kmalloc(sizeof (struct AIXlvm_rec), GFP_KERNEL);
970 + for (logical_node = *evms_logical_disk_head; logical_node;
971 + logical_node = next_node) {
973 + // Grab the next list item in case we remove this partition from the global list.
974 + next_node = logical_node->next;
976 + // Read the first sector and see if it has a valid AIX PV signature.
978 + if (INIT_IO(logical_node, 0, 0, 1, AIXpv)) {
979 + // On an I/O error, continue on to the next
980 + // partition. The group that this partition
981 + // belongs to will be incomplete, but we still
982 + // need to discover any other groups.
984 + LOG_ERROR(" Error reading PV [%p]\n", logical_node);
988 + if (AIXpv->IPL_record_id == IPLRECID) {
990 + // This partition is definitely a PV,
991 + // but is it part of a valid VG?
992 + LOG_DEBUG(" DVG removing node from list logical_node %p\n",
995 + if (INIT_IO(logical_node, 0, PSN_LVM_REC, 1, AIXlvm)) {
996 + LOG_ERROR(" Error reading PV [%p]\n",logical_node);
1000 + if (AIXlvm->lvm_id == AIX_LVM_LVMID) {
1002 + if (validate_build_volume_group_disk_info(
1003 + logical_node, AIXlvm)) {
1004 + // Again, continue on and we'll
1005 + // clean up later.
1009 + evms_cs_remove_logical_node_from_list(
1010 + evms_logical_disk_head, logical_node);
1013 + LOG_DEBUG(" Found an AIX PV with no parent LVM (LVM ID: %d)\n",
1018 + LOG_DEBUG(" Found a PV not belonging to AIX [%p]\n",
1023 + AIX_VOLUME_GROUP_DUMP();
1025 + if (check_volume_groups()) {
1036 + * Function: validate_build_volume_group_disk_info
1038 + * Creates and validates the volume groups found on the disk structures.
1042 +validate_build_volume_group_disk_info(struct evms_logical_node *logical_node,
1043 + struct AIXlvm_rec *AIXlvm)
1046 + struct aix_volume_group *AIXVGLptr = AIXVolumeGroupList;
1048 + LOG_DEBUG(" VBVGDI pv_num:%d\n", AIXlvm->pv_num);
1050 + while (AIXVGLptr) {
1051 + if (COMPARE_UNIQUE_IDS(AIXlvm->vg_id, AIXVGLptr->vg_id)) {
1054 + AIXVGLptr = AIXVGLptr->next; // There is more than one so walk the list
1058 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,__LINE__);
1059 + AIXVGLptr = AIX_create_volume_group(logical_node, AIXlvm);
1061 + AIXVGLptr->next = AIXVolumeGroupList;
1062 + AIXVolumeGroupList = AIXVGLptr;
1065 + LOG_DEBUG(" VBVGDI Rediscover AIXVGLptr:%p line:%d\n",
1066 + AIXVGLptr, __LINE__);
1068 + if (AIX_update_volume_group(AIXVGLptr, logical_node, AIXlvm)) {
1070 + (" VBVGDI ERROR on Rediscover AIXVGLptr:%p line:%d\n",
1071 + AIXVGLptr, __LINE__);
1077 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1079 + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1080 + LOG_CRITICAL("Unable to allocate volume group data struct Volume Group Corruption !!\n");
1084 + LOG_DEBUG(" VBVGDI AIXVolumeGroupList:%p line:%d\n",
1085 + AIXVolumeGroupList, __LINE__);
1086 + LOG_DEBUG(" VBVGDI AIXVGLptr:%p line:%d\n", AIXVGLptr,
1088 + LOG_DEBUG(" VBVGDI flags:%d\n", AIXVGLptr->flags);
1090 + if (add_PV_to_volume_group(AIXVGLptr, logical_node, AIXlvm->pv_num)) {
1099 + * Function: add_VG_data_to_VG_list
1101 + * Allocate space for a new LVM volume group and all of its sub-fields.
1102 + * Initialize the appropriate fields.
1106 +add_VG_data_to_VG_list(struct evms_logical_node *logical_node,
1107 + struct aix_volume_group *new_group, short int pvNum)
1111 +// struct pv_header *AIXpvh;
1113 + // The array of pointer to the logical volumes.
1114 + // Leave this allocation at the max permitted, the lv numbering may not be sequential so you may have gaps
1115 + // in the array allocation i.e. 1,2,3,4,5,6,7,8,11,15,21,33 etc. even though you only have 12 LVs.
1117 + LOG_DEBUG(" AVGDVGL Entering pvNum:%d vgda_PSN:%d\n", pvNum,
1118 + new_group->vgda_psn);
1120 +// pvh_pos = AIX_PVH_DATA_PSN(new_group->vgda_psn, pvNum);
1122 +/* AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1127 + memset(AIXpvh, 0, AIX_SECTOR_SIZE);
1129 + LOG_DEBUG(" AVGDVGL pvh_pos:%d\n", pvh_pos);
1131 + if (INIT_IO(logical_node, 0, pvh_pos, 1, AIXpvh)) {
1135 + LOG_DEBUG(" AVGDVGL AIXpvh->pv_num:%d\n", pvNum);
1137 + if (!new_group->volume_list) {
1138 + new_group->volume_list =
1139 + kmalloc(LVM_MAXLVS * sizeof (struct aix_logical_volume *),
1141 + if (!new_group->volume_list) {
1145 + memset(new_group->volume_list, 0,
1146 + (LVM_MAXLVS * sizeof (struct aix_logical_volume *)));
1149 + new_group->vg_id.word1 = new_group->AIXvgh->vg_id.word1;
1150 + new_group->vg_id.word2 = new_group->AIXvgh->vg_id.word2;
1151 + new_group->vg_id.word3 = new_group->AIXvgh->vg_id.word3;
1152 + new_group->vg_id.word4 = new_group->AIXvgh->vg_id.word4;
1153 +// new_group->numpvs = new_group->AIXvgh->numpvs;
1154 +// new_group->numlvs = new_group->AIXvgh->numlvs;
1155 +// new_group->lv_max = new_group->AIXvgh->maxlvs;
1156 + new_group->pe_size = GET_PHYSICAL_PART_SIZE(new_group->AIXvgh->pp_size) /
1159 +// new_group->block_size = 0;
1160 +// new_group->hard_sect_size = 0;
1161 + new_group->flags |= AIX_VG_DIRTY;
1165 + LOG_DEBUG(" AVGDVGL Vol Group ID %x\n", new_group->vg_id.word2);
1171 + * Function: add_PV_to_volume_group
1173 + * Create a new partition_list_entry for the specified volume group.
1174 + * Initialize the new partition with the evms node and lvm pv information,
1175 + * and add the new partition to the group's list.
1179 +add_PV_to_volume_group(struct aix_volume_group *group,
1180 + struct evms_logical_node *evms_partition, int pvNum)
1182 + struct partition_list_entry *new_partition;
1184 + LOG_DEBUG(" APVVG Entering pvNum:%d\n", pvNum);
1186 + group->flags |= AIX_VG_DIRTY;
1188 + for (new_partition = group->partition_list; new_partition != NULL;
1189 + new_partition = new_partition->next) {
1190 + if (new_partition->logical_node == evms_partition) {
1196 + kmalloc(sizeof (struct partition_list_entry), GFP_KERNEL);
1197 + if (!new_partition) {
1201 + memset(new_partition, 0, sizeof (struct partition_list_entry));
1203 + // Add this partition to this group's list.
1204 + new_partition->logical_node = evms_partition;
1205 + new_partition->pv_number = pvNum;
1207 + if (evms_partition->hardsector_size > group->hard_sect_size) {
1208 + group->hard_sect_size = evms_partition->hardsector_size;
1210 + if (evms_partition->block_size > group->block_size) {
1211 + group->block_size = evms_partition->block_size;
1214 + // Add this partition to the beginning of its group's list.
1215 + new_partition->next = group->partition_list;
1216 + group->partition_list = new_partition;
1217 + group->partition_count++;
1219 + LOG_DEBUG(" APVVG partition_count:%d pv_num:%d\n",
1220 + group->partition_count, pvNum);
1225 +/****************************************************
1229 +*****************************************************/
1230 +static struct aix_volume_group *
1231 +AIX_create_volume_group(struct evms_logical_node *logical_node,
1232 + struct AIXlvm_rec *AIXlvm)
1234 + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1235 + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1236 + struct aix_volume_group *AIXVGLptr;
1238 + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1243 + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1245 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1249 + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1251 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1255 + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1257 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1261 + memset(AIXvgh, 0, AIX_SECTOR_SIZE);
1262 + memset(AIXvgh2, 0, AIX_SECTOR_SIZE);
1263 + memset(AIXvgt, 0, AIX_SECTOR_SIZE);
1264 + memset(AIXvgt2, 0, AIX_SECTOR_SIZE);
1266 + // First time thru we want to read this in, we may only have one PV in this group, all others
1267 + // may be corrupt, etc. If the info is clean we shouldn't get here.
1269 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1270 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1274 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1275 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1279 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1281 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1285 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1287 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1291 + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1292 + LOG_DEBUG("CVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1293 + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1294 + LOG_DEBUG("CVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1295 + LOG_DEBUG("CVG Allocating AIXVGLptr:size:%d \n",(int) sizeof (struct aix_volume_group));
1297 + AIXVGLptr = kmalloc(sizeof (struct aix_volume_group), GFP_KERNEL);
1299 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1302 + memset(AIXVGLptr, 0, sizeof (struct aix_volume_group));
1304 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1305 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1307 + LOG_DEBUG("CVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1309 + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1310 + if (!AIXVGLptr->AIXvgh) {
1312 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1315 + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1317 + LOG_DEBUG("CVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",
1318 + AIXVGLptr->CleanVGInfo);
1320 + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1321 + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1322 + if (COMPARE_TIMESTAMPS
1323 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1324 + if (COMPARE_TIMESTAMPS
1325 + (AIXvgh->vg_timestamp,
1326 + AIXvgh2->vg_timestamp)) {
1327 + // All timestamps match. Yea!
1328 + AIXVGLptr->CleanVGInfo =
1329 + AIX_PV_STATE_VALID;
1331 + // Both VGDAs are good, but timestamps are
1332 + // different. Can't tell yet which one is
1334 + AIXVGLptr->CleanVGInfo =
1335 + AIX_PV_STATE_EITHER_VGDA;
1338 + // First VGDA is good, second is bad.
1339 + AIXVGLptr->CleanVGInfo =
1340 + AIX_PV_STATE_FIRST_VGDA;
1343 + if (COMPARE_TIMESTAMPS
1344 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1345 + // First VGDA is bad, second is good.
1346 + AIXVGLptr->CleanVGInfo =
1347 + AIX_PV_STATE_SECOND_VGDA;
1348 + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1349 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1351 + // This should never happen.
1352 + LOG_DEBUG("All four VG timestamps for %d are different. What happened?!?\n",
1353 + AIXVGLptr->vg_id.word2);
1354 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1359 + LOG_DEBUG("CVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1360 + AIXVGLptr->CleanVGInfo);
1362 + switch (AIXVGLptr->CleanVGInfo) {
1363 + case AIX_PV_STATE_VALID:
1364 + case AIX_PV_STATE_FIRST_VGDA:
1366 + LOG_DEBUG("CVG SWITCH VALID %d size:%d\n",
1367 + AIXVGLptr->CleanVGInfo,
1368 + (int) sizeof (struct vg_header));
1370 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1372 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1373 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1376 + case AIX_PV_STATE_SECOND_VGDA:
1377 + LOG_DEBUG("CVG SWITCH SECOND VGDA %d size:%d\n",
1378 + AIXVGLptr->CleanVGInfo,
1379 + (int) sizeof (struct vg_header));
1381 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1383 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1384 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1387 + case AIX_PV_STATE_EITHER_VGDA:
1388 + LOG_DEBUG("CVG SWITCH EITHER VGDA %d size:%d\n",
1389 + AIXVGLptr->CleanVGInfo,(int) sizeof (struct vg_header));
1390 + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1392 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1394 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1395 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1397 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1398 + // Not sure where this PV belongs. It thinks it is
1399 + // supposed to be in two different containers. We will
1400 + // probably need to put this on a separate, temporary
1401 + // list, and determine later which container is missing
1407 + LOG_ERROR("Invalid PV state (%d) for %d\n",
1408 + AIXVGLptr->CleanVGInfo,
1409 + AIXVGLptr->vg_id.word2);
1410 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1416 + // Currently AIX Big VGDA is not supported - cleanup and return NULL so this VG doesn't get added
1418 + if (AIXVGLptr->AIXvgh->bigvg != 0) {
1419 + LOG_SERIOUS("Error creating Volume Group AIX Big VGDA is not currently supported\n");
1420 + if (AIXVGLptr->AIXvgh) {
1421 + kfree(AIXVGLptr->AIXvgh);
1422 + AIXVGLptr->AIXvgh = NULL;
1430 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1434 + add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1436 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1438 + LOG_DEBUG("CVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1443 +/****************************************************
1447 +*****************************************************/
1449 +AIX_update_volume_group(struct aix_volume_group *AIXVGLptr,
1450 + struct evms_logical_node *logical_node,
1451 + struct AIXlvm_rec *AIXlvm)
1453 + struct vg_header *AIXvgh = NULL, *AIXvgh2 = NULL;
1454 + struct vg_trailer *AIXvgt = NULL, *AIXvgt2 = NULL;
1456 + AIXvgh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1461 + AIXvgh2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1463 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1467 + AIXvgt = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1469 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1473 + AIXvgt2 = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
1475 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1479 + // First time thru we want to read this in, we may only have one PV in this group, all others
1480 + // may be corrupt, etc. If the info is clean we shouldn't get here.
1482 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[0], 1, AIXvgh)) {
1483 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1487 + if (INIT_IO(logical_node, 0, AIXlvm->vgda_psn[1], 1, AIXvgh2)) {
1488 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1492 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1), 1,
1494 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1498 + if (INIT_IO(logical_node, 0, (AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1), 1,
1500 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1504 + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 0, AIXlvm->vgda_psn[0]);
1505 + LOG_DEBUG("UVG AIXvgh->vgda_psn[%d]:%d\n", 1, AIXlvm->vgda_psn[1]);
1506 + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 0,(AIXlvm->vgda_psn[0] + AIXlvm->vgda_len - 1));
1507 + LOG_DEBUG("UVG AIXvgt psn[%d]:%d\n", 1,(AIXlvm->vgda_psn[1] + AIXlvm->vgda_len - 1));
1509 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1510 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1512 + LOG_DEBUG("UVG AIXVGLptr:%p line %d\n", AIXVGLptr, __LINE__);
1514 + AIXVGLptr->AIXvgh = kmalloc(sizeof (struct vg_header), GFP_KERNEL);
1515 + if (!AIXVGLptr->AIXvgh) {
1516 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1519 + memset(AIXVGLptr->AIXvgh, 0, sizeof (struct vg_header));
1521 + LOG_DEBUG("UVG COMP TS AIXVGLptr->CleanVGInfo:%d \n",AIXVGLptr->CleanVGInfo);
1523 + if (AIXVGLptr->CleanVGInfo == AIX_PV_STATE_INVALID) {
1524 + if (COMPARE_TIMESTAMPS(AIXvgh->vg_timestamp, AIXvgt->timestamp)) {
1525 + if (COMPARE_TIMESTAMPS
1526 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1527 + if (COMPARE_TIMESTAMPS
1528 + (AIXvgh->vg_timestamp,
1529 + AIXvgh2->vg_timestamp)) {
1530 + // All timestamps match. Yea!
1531 + AIXVGLptr->CleanVGInfo =
1532 + AIX_PV_STATE_VALID;
1534 + // Both VGDAs are good, but timestamps are
1535 + // different. Can't tell yet which one is
1537 + AIXVGLptr->CleanVGInfo =
1538 + AIX_PV_STATE_EITHER_VGDA;
1541 + // First VGDA is good, second is bad.
1542 + AIXVGLptr->CleanVGInfo =
1543 + AIX_PV_STATE_FIRST_VGDA;
1546 + if (COMPARE_TIMESTAMPS
1547 + (AIXvgh2->vg_timestamp, AIXvgt2->timestamp)) {
1548 + // First VGDA is bad, second is good.
1549 + AIXVGLptr->CleanVGInfo =
1550 + AIX_PV_STATE_SECOND_VGDA;
1551 + } else if (AIXvgh->numpvs == 1) { // We only have 1 PV in this group, mismatch or not this will have to do
1552 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_VALID;
1554 + // This should never happen.
1556 + ("All four VG timestamps for %d are different. What happened?!?\n",
1557 + AIXVGLptr->vg_id.word2);
1558 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1563 + LOG_DEBUG("UVG SWITCH TS AIXVGLptr->CleanVGInfo:%d \n",
1564 + AIXVGLptr->CleanVGInfo);
1566 + switch (AIXVGLptr->CleanVGInfo) {
1567 + case AIX_PV_STATE_VALID:
1568 + case AIX_PV_STATE_FIRST_VGDA:
1570 + LOG_DEBUG("UVG SWITCH VALID %d size:%d\n",
1571 + AIXVGLptr->CleanVGInfo,
1572 + (int) sizeof (struct vg_header));
1574 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1576 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1577 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1580 + case AIX_PV_STATE_SECOND_VGDA:
1581 + LOG_DEBUG("UVG SWITCH SECOND VGDA %d size:%d\n",
1582 + AIXVGLptr->CleanVGInfo,
1583 + (int) sizeof (struct vg_header));
1585 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh2); // Get the info. we need
1587 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[1];
1588 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1591 + case AIX_PV_STATE_EITHER_VGDA:
1592 + LOG_DEBUG("UVG SWITCH EITHER VGDA %d size:%d\n",
1593 + AIXVGLptr->CleanVGInfo,
1594 + (int) sizeof (struct vg_header));
1595 + if (COMPARE_UNIQUE_IDS(AIXvgh->vg_id, AIXvgh2->vg_id)) {
1597 + AIX_copy_header_info(AIXVGLptr->AIXvgh, AIXvgh); // Get the info. we need
1599 + AIXVGLptr->vgda_psn = AIXlvm->vgda_psn[0];
1600 + AIXVGLptr->vgda_len = AIXlvm->vgda_len;
1602 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1603 + // Not sure where this PV belongs. It thinks it is
1604 + // supposed to be in two different containers. We will
1605 + // probably need to put this on a separate, temporary
1606 + // list, and determine later which container is missing
1612 + LOG_ERROR("UVG Invalid PV state (%d) for %d\n",
1613 + AIXVGLptr->CleanVGInfo,
1614 + AIXVGLptr->vg_id.word2);
1615 + AIXVGLptr->CleanVGInfo = AIX_PV_STATE_INVALID;
1621 +// add_VG_data_to_VG_list(logical_node, AIXVGLptr, AIXlvm->pv_num);
1622 + AIXVGLptr->flags |= AIX_VG_DIRTY;
1624 + AIX_free_headers(AIXvgh, AIXvgh2, AIXvgt, AIXvgt2);
1626 + LOG_DEBUG("UVG Exiting CleanVGInfo:%d\n", AIXVGLptr->CleanVGInfo);
1631 +/****************************************************
1632 +* Function: check_volume_groups
1634 +* We just want to make sure the volume groups have found
1635 +* all their drives.
1637 +* If not, we'll continue and build what we can
1638 +*****************************************************/
1640 +check_volume_groups(void)
1642 + struct aix_volume_group *group;
1643 + struct aix_volume_group *next_group;
1644 +// struct partition_list_entry *partitions;
1647 + LOG_DEBUG("CHVG Checking volume groups:\n");
1650 + for (group = AIXVolumeGroupList; group; group = next_group) {
1651 + next_group = group->next;
1653 + if (group->flags & AIX_VG_DIRTY){
1654 + if (group->AIXvgh->numlvs == 0) {
1655 + remove_group_from_list(group);
1656 + deallocate_volume_group(group);
1658 + if (group->partition_count != group->AIXvgh->numpvs) {
1659 + group->flags |= AIX_VG_INCOMPLETE;
1660 + LOG_ERROR("CHVG Found incomplete VG !! flags:%x\n",
1662 + LOG_ERROR("CHVG Found %d PVs should have %d PVs\n",
1663 + group->partition_count, group->AIXvgh->numpvs);
1669 + LOG_DEBUG("CHVG Finished Checking volume groups:\n");
1674 +/************************************************************************
1675 + * Function: discover_logical_volumes
1677 + * After all PVs have been claimed and added to the appropriate VG list,
1678 + * the volumes for each VG must be constructed.
1683 +discover_logical_volumes(void)
1686 + struct aix_volume_group *AIXVGLPtr;
1687 + struct aix_logical_volume *new_LV;
1688 + struct partition_list_entry *partition;
1689 + struct evms_logical_node *node;
1690 + struct lv_entries *AIXlvent, *AIXlventHead;
1691 + int j, lv_found, all_lvs_found, rc;
1692 + struct namelist *AIXnamelist;
1696 + kmalloc(MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE, GFP_KERNEL);
1697 + if (!AIXlventHead) {
1701 + memset(AIXlventHead, 0, (MAX_SECTORS_LV_ENTRIES * AIX_SECTOR_SIZE));
1704 + kmalloc(MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE, GFP_KERNEL);
1705 + if (!NameBuffer) {
1706 + kfree(AIXlventHead);
1710 + memset(NameBuffer, 0, (MAX_SECTORS_NAMELIST * AIX_SECTOR_SIZE));
1712 + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr;
1713 + AIXVGLPtr = AIXVGLPtr->next ) {
1715 + partition = AIXVGLPtr->partition_list;
1717 + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
1721 + if (partition == NULL) {
1725 + node = partition->logical_node;
1727 + if (node == NULL) {
1731 + LOG_DEBUG("DLV INIT_IO AIXNameList position:%d\n",
1732 + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1733 + MAX_SECTORS_NAMELIST));
1734 + LOG_DEBUG("AIXVGLPTR:%p partition:%p node:%p \n", AIXVGLPtr,
1737 + if (INIT_IO(node, 0,
1738 + ((AIXVGLPtr->vgda_psn + AIXVGLPtr->vgda_len) - 1 -
1739 + MAX_SECTORS_NAMELIST), MAX_SECTORS_NAMELIST,
1744 + LOG_DEBUG("DLV INIT_IO AIXNameList\n");
1746 + if (INIT_IO(node, 0, AIXVGLPtr->vgda_psn + PSN_LVE_REC,
1747 + MAX_SECTORS_LV_ENTRIES, AIXlventHead)) {
1750 + AIXlvent = AIXlventHead;
1751 + AIXnamelist = (struct namelist *) NameBuffer;
1753 + LOG_DEBUG("DLV INIT_IO AIXlvent\n");
1754 + // Search through the LV structs for valid LV entries
1755 + // We're just going to search until all valid LVs are found
1756 + // The max. allowable LVs is 256 and we don't want to
1757 + // search for 255 if only 8 are defined 1-8 however, there
1758 + // could be gaps in the LV numbering. i.e 1,2,3,4,5,6,7,8, 27,43, etc.
1760 + for (j = 0, lv_found = 0, all_lvs_found = FALSE;
1761 + !all_lvs_found && j < LVM_MAXLVS; j++, AIXlvent++) {
1763 + LOG_DEBUG(" ** DVIG:lv_size:%d lvname:[%s] j:%d lv_number:%d ** \n",
1764 + AIXlvent->num_lps, AIXnamelist->name[j], j,
1765 + AIXlvent->lvname);
1766 + LOG_DEBUG(" DVIG:stripe_exp:%u stripesize:%u lv_status:%d\n",
1767 + AIXlvent->striping_width,
1768 + GET_PHYSICAL_PART_SIZE(AIXlvent->stripe_exp),
1769 + AIXlvent->lv_state);
1770 + LOG_DEBUG(" DVIG Group:%x.Access:%x\n",
1771 + (unsigned int) AIXVGLPtr->vg_id.word2,
1772 + AIXlvent->permissions);
1773 + LOG_DEBUG(" DVIG mirror:%d mirror_policy:%d mirwrt:%d \n",
1774 + AIXlvent->mirror, AIXlvent->mirror_policy,
1775 + AIXlvent->mirwrt_consist);
1777 + // This is the same check we used in "diskedit" and "readdisk"
1778 + if (AIXlvent->lv_state == 0 ||
1779 + AIXlvent->permissions > 0x10) {
1784 + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
1785 + all_lvs_found = TRUE;
1788 + LOG_DEBUG(" DVIG lv_found:%d all_lvs_found:%d \n",
1789 + lv_found, all_lvs_found);
1791 + // Create a new logical volume and place it in the appropriate
1792 + // spot in this VG's volume list. For re-discovery, make sure
1793 + // this volume does not already exist.
1794 + if (!AIXVGLPtr->volume_list[AIXlvent->lvname]) {
1796 + new_logical_volume(AIXlvent,
1800 + GET_PHYSICAL_PART_SIZE
1806 + LOG_DEBUG(" DVIG Adding new logical volume %d to group:%x \n",
1807 + new_LV->lv_number,AIXVGLPtr->vg_id.word2);
1809 + AIXVGLPtr->volume_list[new_LV->lv_number] = new_LV;
1811 + LOG_DEBUG("DVIG Updating Vol Exists\n");
1815 + // Build the le_to_pe_map for each volume that was discovered above.
1816 + // This has to be done after all volumes in the group are discovered
1817 + if ((rc = build_pe_maps(AIXVGLPtr))) {
1821 + check_log_volume_and_pe_maps(AIXVGLPtr);
1824 + kfree(NameBuffer);
1825 + kfree(AIXlventHead);
1831 + * Function: new_logical_volume
1833 + * Allocate space for a new LVM logical volume, including space for the
1836 +static struct aix_logical_volume *
1837 +new_logical_volume(struct lv_entries *AIXlvent,
1838 + struct aix_volume_group *volume_group,
1839 + char *lv_name, u32 stripesize)
1842 + struct aix_logical_volume *new_volume;
1843 + const char *name = "evms_AIXiod";
1844 + const char *resync_name = "evms_AIXresync";
1846 + LOG_DEBUG(" NLV: lv_number:%d lv_allocated_le:%d lv_size:%d\n",
1847 + AIXlvent->lvname, AIXlvent->num_lps,
1848 + AIXlvent->num_lps * volume_group->pe_size);
1850 + // Allocate space for the new logical volume.
1851 + new_volume = kmalloc(sizeof (struct aix_logical_volume), GFP_KERNEL);
1852 + if (!new_volume) {
1855 + memset(new_volume, 0, sizeof (struct aix_logical_volume));
1857 + // Allocate space for the LE to PE mapping table
1858 + // We add 1 for the allocated le to ease mapping later on, all AIX le are 1 based
1859 + new_volume->le_to_pe_map =
1860 + kmalloc((AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry),
1862 + if (!new_volume->le_to_pe_map) {
1863 + delete_logical_volume(new_volume);
1867 + memset(new_volume->le_to_pe_map, 0,
1868 + (AIXlvent->num_lps + 1) * sizeof (struct pe_table_entry));
1870 + if (AIXlvent->mirror > AIX_DEFAULT_MIRRORING) {
1871 + new_volume->le_to_pe_map_mir1 =
1872 + kmalloc((AIXlvent->num_lps +
1873 + 1) * sizeof (struct pe_table_entry), GFP_KERNEL);
1874 + if (!new_volume->le_to_pe_map_mir1) {
1875 + delete_logical_volume(new_volume);
1878 + memset(new_volume->le_to_pe_map_mir1, 0,
1879 + (AIXlvent->num_lps +
1880 + 1) * sizeof (struct pe_table_entry));
1883 + if (AIXlvent->mirror == AIX_MAX_MIRRORS) {
1884 + new_volume->le_to_pe_map_mir2 =
1885 + kmalloc((AIXlvent->num_lps + 1)
1886 + * sizeof (struct pe_table_entry), GFP_KERNEL);
1887 + if (!new_volume->le_to_pe_map_mir2) {
1888 + delete_logical_volume(new_volume);
1891 + memset(new_volume->le_to_pe_map_mir2, 0,
1892 + (AIXlvent->num_lps +1)
1893 + * sizeof (struct pe_table_entry));
1896 + // Initialize the rest of the new volume.
1897 + new_volume->lv_number = AIXlvent->lvname;
1898 + new_volume->lv_size = AIXlvent->num_lps * (volume_group->pe_size);
1899 + new_volume->lv_access = AIXlvent->permissions | EVMS_LV_NEW; // All volumes start new.
1900 + new_volume->lv_status = AIXlvent->lv_state;
1901 + //new_volume->lv_minor = MINOR(1);
1902 + new_volume->mirror_copies = AIXlvent->mirror;
1903 +// new_volume->mirror_iterations = AIX_DEFAULT_MIRRORING;
1904 + new_volume->stripes = AIXlvent->striping_width;
1905 + new_volume->stripe_size = stripesize;
1906 + new_volume->stripe_size_shift = evms_cs_log2(stripesize);
1907 + new_volume->pe_size = volume_group->pe_size;
1908 + new_volume->pe_size_shift = evms_cs_log2(volume_group->pe_size);
1909 + new_volume->num_le = AIXlvent->num_lps;
1910 +// new_volume->new_volume = TRUE;
1911 + new_volume->group = volume_group;
1913 + volume_group->numlvs++;
1915 + sprintf(new_volume->name, "aix/%s", lv_name);
1917 + if (!AIX_BH_list_pool
1918 + && new_volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
1920 + // We only need the ReSync thread if we have at least one mirrored LV.
1921 + // You can't ReSync a non-mirrored drive
1923 + AIX_BH_list_pool =
1924 + evms_cs_create_pool(sizeof (struct aix_mirror_bh),
1925 + "EVMS_AIX_BH", aix_notify_cache_ctor,
1927 + if (!AIX_BH_list_pool) {
1930 + AIX_mirror_read_retry_thread =
1931 + evms_cs_register_thread(AIXiod, NULL, name);
1933 + AIX_mirror_resync_thread =
1934 + evms_cs_register_thread(AIXresync, NULL,
1939 + LOG_DEBUG("NLV lv_number:%d name:%s lv_size " PFU64 " \n",
1940 + new_volume->lv_number, new_volume->name, new_volume->lv_size);
1941 + LOG_DEBUG("NLV stripe_size:%d stripe_size_shift:%d\n",
1942 + new_volume->stripe_size, new_volume->stripe_size_shift);
1944 + return new_volume;
1948 + * Function: aix_notify_cache_ctor
1949 + * this function initializes the b_wait field in the buffer heads
1950 + * in our private buffer head pool.
1953 +aix_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
1955 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
1956 + SLAB_CTOR_CONSTRUCTOR) {
1957 + struct aix_mirror_bh *rbh = (struct aix_mirror_bh *) foo;
1958 + memset(rbh, 0, sizeof (struct aix_mirror_bh));
1959 + rbh->remaining = (atomic_t) ATOMIC_INIT(0);
1960 + init_waitqueue_head(&rbh->bh_req.b_wait);
1965 + * Function: build_pe_maps
1967 + * After all logical volumes have been discovered, the mappings from
1968 + * logical extents to physical extents must be constructed. Each PV
1969 + * contains a map on-disk of its PEs. Each PE map entry contains the
1970 + * logical volume number and the logical extent number on that volume.
1971 + * Our internal map is the reverse of this map for each volume, listing
1972 + * the PV node and sector offset for every logical extent on the volume.
1975 + build_pe_maps(struct aix_volume_group *volume_group)
1977 + struct partition_list_entry *partition;
1978 + struct partition_list_entry *mirror_partition;
1979 + struct pp_entries *AIXppent, *AIXppent_buff;
1980 + struct pv_header *AIXpvh;
1983 + u32 j, pp_count, pvh_pos;
1985 + u32 pvh_posn[LVM_MAXPVS];
1987 +#ifdef EVMS_DEBUG_MIRRORS
1988 + u32 lv_found, all_lvs_found;
1992 + LOG_DEBUG(" *** BPEM ***\n");
1993 + // For every partition in this VG
1995 + AIXppent_buff = kmalloc(AIX_SECTOR_SIZE * PHYS_VOL_OFFSET, GFP_KERNEL);
1996 + if (!AIXppent_buff) {
2000 + memset(AIXppent_buff, 0, AIX_SECTOR_SIZE * PHYS_VOL_OFFSET);
2001 + memset(pvh_posn, 0, LVM_MAXPVS);
2003 + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
2005 + kfree(AIXppent_buff);
2009 + memset(AIXpvh, 0, AIX_SECTOR_SIZE);
2011 + LOG_DEBUG(" BPEM AIXppent_buff:%d \n",
2012 + (AIX_SECTOR_SIZE * PHYS_VOL_OFFSET));
2014 + // This next section is to calculate the sector spacing between PV info for the VG
2015 + // AIX doesn't always space the info the same. It could be 17 or 34 sectors apart
2016 + // depending on the PE size selected.
2018 + rc = AIX_pvh_data_posn(volume_group->vgda_psn, pvh_posn, volume_group->partition_list, volume_group->AIXvgh->numpvs);
2021 + kfree(AIXppent_buff);
2026 + for (partition = volume_group->partition_list; partition;
2027 + partition = partition->next) {
2029 + LOG_DEBUG(" BPEM partition:%p next:%p\n", partition,
2032 + pvh_pos = pvh_posn[partition->pv_number];
2034 + LOG_DEBUG(" BPEM pvh_pos:%d pv_number:%d\n", pvh_pos, partition->pv_number);
2036 + if (INIT_IO(partition->logical_node, 0, pvh_pos, 1, AIXpvh)) {
2037 + kfree(AIXppent_buff);
2041 + // For every entry in the PE map, calculate the PE's sector offset
2042 + // and update the correct LV's PE map. LV number of 0 marks an unused PE.
2043 + // For re-discovery, only compute entries for new volumes.
2045 + if (INIT_IO(partition->logical_node, 0, pvh_pos, AIX_PVHPP_LENGTH,
2047 + kfree(AIXppent_buff);
2052 + AIXppent = AIXppent_buff;
2055 + pp_count = AIXpvh->pp_count;
2057 + LOG_DEBUG("BPEM AIXpvh data: pp_count:%d psn_part1:%d pv_id1:%d pv_id2:%d pv_id3:%d pv_id4:%d pv_num:%d pv_state:%d vgdas:%d res1:%d res2:%d\n", AIXpvh->pp_count,
2058 + AIXpvh->psn_part1,
2059 + AIXpvh->pv_id.word1,
2060 + AIXpvh->pv_id.word2,
2061 + AIXpvh->pv_id.word3,
2062 + AIXpvh->pv_id.word4,
2064 + AIXpvh->pv_state, AIXpvh->pvnum_vgdas, AIXpvh->res1, AIXpvh->res2);
2066 + LOG_DEBUG(" PE Map: volgrp:%x AIXpvh->pv_num:%d partition:%p next:%p lv_index:%d pp_count:%d\n",
2067 + volume_group->vg_id.word2, AIXpvh->pv_num, partition,
2068 + partition->next, AIXppent->lv_index, pp_count);
2070 + for (j = 0; j < pp_count; j++,AIXppent++) {
2071 + if (!AIXppent->lv_index || AIXppent->pp_state == AIX_LVM_LVUNDEF) {
2075 + LOG_EXTRA(" -- pv:%x pp:%d st:%d nm:%s lv:%d lp:%d cp:%d fst v:%d fst p:%d snd v:%d snd p:%d \n",
2076 + volume_group->vg_id.word2, j + 1,
2077 + AIXppent->pp_state,
2078 + volume_group->volume_list[AIXppent->lv_index -1]->name,
2079 + AIXppent->lv_index, AIXppent->lp_num,
2080 + AIXppent->copy, AIXppent->fst_alt_vol,
2081 + AIXppent->fst_alt_part,
2082 + AIXppent->snd_alt_vol,
2083 + AIXppent->snd_alt_part);
2085 + le_number = AIXppent->lp_num - 1; // AIX lp's start @ 1, we want a 0 index
2086 + offset = ((j * (volume_group->pe_size)) + AIXpvh->psn_part1);
2088 + LOG_DEBUG(" PE Map: le_number:%d partition:%p lv_index:%d lv_name:%s\n",
2089 + le_number, partition, AIXppent->lv_index,
2090 + volume_group->volume_list[AIXppent->lv_index -1]->name);
2092 + if (!volume_group->volume_list[AIXppent->lv_index - 1]) {
2093 + LOG_SERIOUS("Failed attempt to access volume without memory allocation lv:%d\n",
2094 + AIXppent->lv_index - 1);
2098 + if (volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map
2099 + && le_number <= volume_group->volume_list[AIXppent->lv_index - 1]->num_le) {
2101 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].owning_pv = partition;
2102 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pe_sector_offset = offset;
2103 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map[le_number].pp_state = AIXppent->pp_state;
2106 + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies >
2107 + AIX_DEFAULT_MIRRORING) {
2109 + LOG_EXTRA(" PE Map: Mirror found lv:%d -- \n",
2110 + AIXppent->lv_index);
2112 + for (mirror_partition = volume_group->partition_list,
2113 + MirrorFound = FALSE;
2114 + mirror_partition && !MirrorFound;
2115 + mirror_partition = mirror_partition->next) {
2117 + if (mirror_partition->pv_number == AIXppent->fst_alt_vol) {
2119 + offset = (((AIXppent->fst_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2121 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].owning_pv = mirror_partition;
2122 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pe_sector_offset = offset;
2123 + volume_group->volume_list[AIXppent->lv_index -1]->le_to_pe_map_mir1[le_number].pp_state = AIXppent->pp_state;
2125 + LOG_EXTRA(" PE Map: mirror_partition:%p \n",
2126 + mirror_partition);
2127 + LOG_EXTRA(" PE Map: mirror_sector_offet:%d\n",
2128 + AIXppent->fst_alt_part);
2130 + MirrorFound = TRUE;
2134 + if (volume_group->volume_list[AIXppent->lv_index -1]->mirror_copies == AIX_MAX_MIRRORS) {
2136 + for (mirror_partition = volume_group->partition_list,
2137 + MirrorFound = FALSE;
2138 + mirror_partition && !MirrorFound;
2139 + mirror_partition = mirror_partition->next) {
2141 + if (mirror_partition->pv_number == AIXppent->snd_alt_vol) {
2143 + offset = (((AIXppent->snd_alt_part - 1) * (volume_group->pe_size)) + AIXpvh->psn_part1);
2145 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].owning_pv = mirror_partition;
2146 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pe_sector_offset = offset;
2147 + volume_group->volume_list[AIXppent->lv_index-1]->le_to_pe_map_mir2[le_number].pp_state = AIXppent->pp_state;
2149 + LOG_EXTRA(" PE Map: mirror_partition2:%p \n",
2150 + mirror_partition);
2151 + LOG_EXTRA(" PE Map: mirror_sector_offet2:%d\n",
2152 + AIXppent->snd_alt_part);
2154 + MirrorFound = TRUE;
2159 + } // End of if mirroring is enabled
2163 +// LOG_EXTRA(" PE Map: PE maps:%d Mirror count:%d -- \n", lvs, mirs);
2165 +#ifdef EVMS_DEBUG_MIRRORS
2166 + for (mirs = 0, lv_found = 0, all_lvs_found = FALSE;
2167 + !all_lvs_found && mirs < LVM_MAXLVS; mirs++) {
2169 + if (volume_group->volume_list[mirs] != NULL) {
2170 + if (volume_group->volume_list[mirs]->lv_status ==
2175 + LOG_DEBUG(" PE Map: owning part lv %d -- %p\n",
2177 + volume_group->volume_list[mirs]->
2178 + le_to_pe_map[0].owning_pv);
2179 + if (volume_group->volume_list[mirs]->
2180 + mirror_copies > AIX_DEFAULT_MIRRORING) {
2181 + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2183 + volume_group->volume_list[mirs]->
2184 + le_to_pe_map_mir1[0].owning_pv);
2186 + if (volume_group->volume_list[mirs]->
2187 + mirror_copies == AIX_MAX_MIRRORS) {
2188 + LOG_DEBUG(" PE Map: mirror_partition lv %d -- %p \n",
2190 + volume_group->volume_list[mirs]->
2191 + le_to_pe_map_mir2[0].owning_pv);
2194 + if (lv_found == volume_group->AIXvgh->numlvs) {
2195 + all_lvs_found = TRUE;
2196 + LOG_DEBUG(" PE Map: all_lvs_found\n");
2203 + kfree(AIXppent_buff);
2209 + * Function: check_log_volume_and_pe_maps
2211 + * Make sure all volumes in this group have valid LE-to-PE maps.
2212 + * Any volume that doesn't is deleted. This is safe for re-discovery
2213 + * because only new volumes could have corrupted PE maps.
2216 +check_log_volume_and_pe_maps(struct aix_volume_group *group)
2218 + struct aix_logical_volume *volume;
2219 + int i, j, lv_found, all_lvs_found;
2221 + LOG_DEBUG(" check_pe_map.\n");
2223 + for (i = 0, all_lvs_found = FALSE, lv_found = 0;
2224 + !all_lvs_found && i < LVM_MAXLVS; i++) {
2225 + if (!group->volume_list[i]) {
2226 + LOG_DEBUG(" CPEM No Volume %d found \n", i);
2230 + volume = group->volume_list[i];
2231 + if (!volume->le_to_pe_map) {
2232 + LOG_DEBUG(" CPEM Volume %s has no PE map.\n",
2234 + delete_logical_volume(volume);
2238 + LOG_DEBUG(" CPEM volume %s num_le: %d \n", volume->name,
2243 + if (lv_found == group->AIXvgh->numlvs) {
2244 + all_lvs_found = TRUE;
2247 + for (j = 0; j < volume->num_le; j++) {
2248 + if (!volume->le_to_pe_map[j].owning_pv ||
2249 + !volume->le_to_pe_map[j].pe_sector_offset) {
2250 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE map (LE %d) \n",
2252 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2255 + if (volume->mirror_copies > AIX_DEFAULT_MIRRORING) {
2256 + if (!volume->le_to_pe_map_mir1[j].owning_pv ||
2257 + !volume->le_to_pe_map_mir1[j].
2258 + pe_sector_offset) {
2259 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 1 (LE %d) \n",
2261 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2264 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
2265 + if (!volume->le_to_pe_map_mir2[j].
2267 + || !volume->le_to_pe_map_mir2[j].
2268 + pe_sector_offset) {
2269 + LOG_SERIOUS(" CPEM Volume (%s) incomplete PE mirror map 2 (LE %d) \n",
2271 + volume->lv_access |= EVMS_LV_INCOMPLETE;
2278 + LOG_EXTRA(" Leaving check_pe_map.\n");
2283 + * Function: export_volumes
2285 + * The last thing this VGE must do is take each constructed volume and
2286 + * place it back on the evms logical partition list.
2289 +export_volumes(struct evms_logical_node **evms_partition_list)
2291 + struct aix_volume_group *AIXVGLPtr;
2292 + struct evms_logical_node *new_node;
2293 + struct aix_logical_volume *volume;
2294 + int j, lv_found, all_lvs_found;
2297 + for (AIXVGLPtr = AIXVolumeGroupList; AIXVGLPtr; AIXVGLPtr = AIXVGLPtr->next) {
2299 + if (!(AIXVGLPtr->flags & AIX_VG_DIRTY)) {
2300 + LOG_DEBUG(" EV Existing group(%d), not dirty, skipping\n",
2301 + AIXVGLPtr->vg_id.word2);
2304 + LOG_DEBUG(" Exporting all new volumes numpvs:%d numlvs:%d \n",
2305 + AIXVGLPtr->AIXvgh->numpvs, AIXVGLPtr->numlvs);
2307 + // Export every valid volume in the group. For re-discovery,
2308 + // make sure we are only exporting "new" volumes.
2310 + for (j = 0, all_lvs_found = FALSE, lv_found = 0;
2311 + !all_lvs_found && j < LVM_MAXLVS; j++) {
2312 + if (AIXVGLPtr->volume_list[j] != NULL) {
2313 + if (AIXVGLPtr->volume_list[j]->lv_access & EVMS_LV_NEW) {
2315 + LOG_DEBUG(" EV Checking LV:[%d] volume:%p\n",
2316 + j,AIXVGLPtr->volume_list[j]);
2318 + volume = AIXVGLPtr->volume_list[j];
2321 + if (lv_found == AIXVGLPtr->AIXvgh->numlvs) {
2322 + all_lvs_found = TRUE;
2324 + // For new volumes, create a new EVMS node and
2325 + // initialize the appropriate fields.
2326 + if (evms_cs_allocate_logical_node(&new_node)) {
2327 + LOG_DEBUG(" Export Vol Error allocating node !!\n");
2330 + LOG_DEBUG(" EV Node allocated OK\n");
2333 +// volume->new_volume = 0;
2334 + volume->volume_node = new_node;
2335 + volume->lv_access &= (~EVMS_LV_NEW);
2336 + new_node->hardsector_size = AIXVGLPtr->hard_sect_size;
2337 + new_node->block_size = AIXVGLPtr->block_size;
2338 + new_node->plugin = &plugin_header;
2339 + new_node->private = volume;
2340 + new_node->total_vsectors = volume->lv_size;
2342 + LOG_DEBUG(" EV volume->name:[%s]\n",
2345 + strncpy(new_node->name,volume->name,
2346 + EVMS_VOLUME_NAME_SIZE + 1);
2348 + // Is the volume read-only?
2349 + if (!(volume->lv_access & AIX_LV_WRITE)
2350 + || volume->lv_access & EVMS_LV_INCOMPLETE)
2352 + new_node->flags |= EVMS_VOLUME_SET_READ_ONLY;
2353 + LOG_DEBUG(" EV Read Only volume->lv_access:%d\n",
2354 + volume->lv_access);
2357 + evms_cs_add_logical_node_to_list(evms_partition_list,
2361 + LOG_DEBUG(" Exporting LVM volume %p new_node:%p ESD->volume_name[%s]\n",
2362 + volume, new_node,new_node->name);
2364 + evms_cs_add_logical_node_to_list(evms_partition_list,
2365 + AIXVGLPtr->volume_list[j]->volume_node);
2367 + LOG_DEBUG(" ELV vol_list[%d]%p\n", j,
2368 + AIXVGLPtr->volume_list[j]);
2371 + LOG_DEBUG(" EV Checking LV:[%d] == NULL\n",j);
2373 + } // end checking all lvs
2375 + AIXVGLPtr->flags &= ~AIX_VG_DIRTY;
2382 + * Function: delete_logical_volume
2384 + * This function deletes the in-memory representation of a single LVM
2385 + * logical volume, including its PE map and any snapshot data. It does
2386 + * not alter the parent volume group, except to remove this volume from
2387 + * its volume list.
2390 +delete_logical_volume(struct aix_logical_volume *volume)
2392 + struct aix_volume_group *group = volume->group;
2394 + LOG_DEBUG(" Deleting volume %s\n", volume->name);
2396 + // Now free up all the memory. This includes the LE-to-PE map, any
2397 + // mirror PEs, etc.
2398 + if (volume->le_to_pe_map) {
2399 + kfree(volume->le_to_pe_map);
2400 + volume->le_to_pe_map = NULL;
2403 + if (volume->le_to_pe_map_mir1) {
2404 + kfree(volume->le_to_pe_map_mir1);
2405 + volume->le_to_pe_map_mir1 = NULL;
2408 + if (volume->le_to_pe_map_mir2) {
2409 + kfree(volume->le_to_pe_map_mir2);
2410 + volume->le_to_pe_map_mir2 = NULL;
2412 + // Remove this volume from the volume-group's list.
2413 + if (group && group->volume_list[volume->lv_number] == volume) {
2414 + group->volume_list[volume->lv_number] = NULL;
2423 +/* Function: remove_group_from_list
2425 + * Remove an LVM volume group from the global LVM list.
2428 +remove_group_from_list(struct aix_volume_group *group)
2430 + struct aix_volume_group **p_group;
2432 + for (p_group = &AIXVolumeGroupList; *p_group;
2433 + p_group = &(*p_group)->next) {
2434 + if (*p_group == group) {
2435 + *p_group = (*p_group)->next;
2436 + group->next = NULL;
2444 + * Function: delete_aix_node
2446 + * This function deletes the in-memory representation of an LVM
2447 + * logical volume. Right now it makes a lot of assumptions about
2448 + * the data in the group not being corrupted. It would be possible
2449 + * to put in a lot of consistency checks before deleting everything
2450 + * to indicate if problems have occurred during the lifetime of the
2451 + * volume and its volume group.
2454 +delete_aix_node(struct evms_logical_node *logical_node)
2456 + struct aix_logical_volume *volume =
2457 + (struct aix_logical_volume *) (logical_node->private);
2458 + struct aix_volume_group *group = volume->group;
2460 + if (delete_logical_volume(volume)) {
2463 + // If we just removed the last volume from this group, the entire group
2464 + // can also be deleted.
2465 + if (group && group->numlvs == 0) {
2466 + remove_group_from_list(group);
2467 + deallocate_volume_group(group);
2469 + // Free the logical node.
2470 + evms_cs_deallocate_logical_node(logical_node);
2475 +/* Function: deallocate_volume_group
2477 + * This function deletes the entire in-memory representation of an LVM
2478 + * volume group, including all partitions and logical volumes. If this
2479 + * group is on the VGE's volume group list, it is removed.
2482 +deallocate_volume_group(struct aix_volume_group *group)
2484 + struct partition_list_entry *partition;
2485 + struct partition_list_entry *next_part;
2488 + LOG_DEBUG(" Deleting volume group %x\n", group->vg_id.word2);
2490 + // Delete all partitions from the group's list.
2491 + for (partition = group->partition_list; partition;
2492 + partition = next_part) {
2494 + next_part = partition->next;
2496 + if (partition->logical_node) {
2497 + // Send a delete command down to the partition manager.
2498 + LOG_DEBUG(" Deleting PV %d from group %x\n",
2499 + partition->pv_number, group->vg_id.word2);
2500 + DELETE(partition->logical_node);
2505 + // Delete all logical volumes, and the array of pointers.
2506 + for (i = 0; i < LVM_MAXLVS; i++) {
2507 + if (group->volume_list[i]) {
2508 + delete_logical_volume(group->volume_list[i]);
2517 +/* Function: end_discover_aix
2519 + * The discovery process at the region-manager level is now iterative,
2520 + * much like the EVMS feature level. To accomplish this correctly, and
2521 + * also to accomplish partial volume discovery, a second discover
2522 + * entry point is needed, so EVMS can tell the region managers that
2523 + * discovery is over, and to finish up any discovery that is not yet
2524 + * complete. When this function is called, it should be assumed that
2525 + * the node list has had nothing new added to it since the last call
2526 + * of the regular discover function. Therefore, when this function is
2527 + * called, we do not need to try to discover any additional volume
2528 + * groups. We will, however, look for logical volumes once more. This
2529 + * gives us the ability to export (read-only) volumes that have
2530 + * partially corrupted LE maps due to missing PVs in their VG.
2533 +end_discover_aix(struct evms_logical_node **evms_logical_disk_head)
2538 + MOD_INC_USE_COUNT;
2539 + LOG_DEBUG("Final Discovery:\n");
2541 + rc = discover_logical_volumes();
2544 + rc = export_volumes(evms_logical_disk_head);
2549 + MOD_DEC_USE_COUNT;
2553 +/****************************************************
2554 +* Function: AIX_alloc_wbh
2556 +* Alloc any buffer heads from the pool and return a linked list
2559 +*****************************************************/
2560 +static struct aix_mirror_bh *
2561 +AIX_alloc_wbh(struct evms_logical_node *node,
2562 + struct evms_logical_node *node2,
2563 + struct evms_logical_node *node3,
2564 + struct buffer_head *bh,
2565 + u32 mirror_copies, u32 le, u64 new_sector2, u64 new_sector3)
2567 + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
2570 + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2573 + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2578 + head_bh->master_bh = bh;
2579 + head_bh->mirror_bh_list = NULL;
2580 + head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2582 + for (i = AIX_DEFAULT_MIRRORING; i <= mirror_copies; i++) {
2585 + evms_cs_allocate_from_pool(AIX_BH_list_pool,
2588 + LOG_SERIOUS("Unable to allocate memory for mirror pool line:%d\n",
2593 + tmp_bh->next_r1 = head_bh->mirror_bh_list;
2594 + head_bh->mirror_bh_list = tmp_bh;
2595 + atomic_inc(&head_bh->remaining);
2597 + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2598 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2599 + init_waitqueue_head(&tmp_bh->bh_req.b_wait);
2600 + //tmp_bh->bh_req.b_size = bh->b_size;
2604 + case AIX_DEFAULT_MIRRORING:
2605 + tmp_bh->node = node;
2606 + tmp_bh->bh_req.b_rsector = bh->b_rsector;
2609 + case AIX_FIRST_MIRROR:
2610 + tmp_bh->node = node2;
2611 + tmp_bh->bh_req.b_rsector = new_sector2;
2614 + case AIX_MAX_MIRRORS:
2615 + tmp_bh->node = node3;
2616 + tmp_bh->bh_req.b_rsector = new_sector3;
2620 + tmp_bh->bh_req.b_end_io = AIX_handle_write_mirror_drives; //setup callback routine
2621 + tmp_bh->bh_req.b_private = (void *) head_bh;
2629 +/****************************************************
2630 +* Function: AIX_handle_write_mirror_drives
2632 +* Handles a write from a set of mirrored AIX LVs
2636 +*****************************************************/
2638 +AIX_handle_write_mirror_drives(struct buffer_head *bh, int uptodate)
2640 + struct aix_logical_volume *volume;
2641 + struct evms_logical_node *node;
2642 + struct aix_mirror_bh *tmp_bh = NULL, *tmp_bh2 = NULL;
2643 + kdev_t tmp_b_rdev;
2644 + u32 count, le = 0;
2646 + tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2647 + tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2648 + node = tmp_bh->node;
2649 + volume = (struct aix_logical_volume *) node->private;
2651 + LOG_DEBUG("AHWMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2652 + node, bh->b_state, uptodate, volume->mirror_copies);
2657 + switch (tmp_bh->iteration) {
2658 + case AIX_DEFAULT_MIRRORING:
2659 + volume->le_to_pe_map[le].pp_state += AIX_LVM_LVSTALE;
2662 + case AIX_FIRST_MIRROR:
2663 + volume->le_to_pe_map_mir1[le].pp_state +=
2667 + case AIX_MAX_MIRRORS:
2668 + volume->le_to_pe_map_mir2[le].pp_state +=
2673 + AIX_evms_cs_notify_lv_io_error(node);
2676 + if (atomic_dec_and_test(&tmp_bh->remaining)) {
2677 + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2678 + tmp_bh2 = tmp_bh->mirror_bh_list;
2679 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2682 + tmp_bh = tmp_bh2->next_r1;
2683 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh2);
2687 + evms_cs_volume_request_in_progress(tmp_b_rdev,
2688 + AIX_DECREMENT_REQUEST,
2695 +/****************************************************
2696 +* Function: AIX_alloc_rbh
2698 +* Alloc any buffer heads from the pool and return a linked list
2701 +*****************************************************/
2702 +static struct aix_mirror_bh *
2703 +AIX_alloc_rbh(struct evms_logical_node *node,
2704 + struct buffer_head *bh,
2705 + u32 mirror_copies, u32 le, u64 org_sector, int cmd)
2707 + struct aix_mirror_bh *tmp_bh = NULL;
2709 + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
2713 + ("Unable to allocate memory for mirror pool line:%d\n",
2718 + memcpy(&tmp_bh->bh_req, bh, sizeof (struct buffer_head));
2719 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
2720 + tmp_bh->node = node;
2721 + tmp_bh->master_bh = bh;
2722 + tmp_bh->iteration = AIX_FIRST_MIRROR;
2723 + //tmp_bh->eio.rsector = eio->rsector;
2724 + //tmp_bh->eio.rsize = eio->rsize;
2726 + //tmp_bh->eio.bh = &tmp_bh->bh_req;
2728 + if (cmd == AIX_LV_READ) {
2729 + tmp_bh->bh_req.b_end_io = AIX_handle_read_mirror_drives; //setup callback routine
2731 + tmp_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions; //setup callback routine
2734 + tmp_bh->bh_req.b_private = (void *) tmp_bh;
2736 + tmp_bh->cmd = cmd;
2737 + tmp_bh->next_r1 = NULL;
2738 + tmp_bh->node = node;
2744 +/****************************************************
2745 +* Function: AIX_reschedule_retry
2747 +* reschedule a read of one of our mirror copies
2750 +*****************************************************/
2752 +AIX_reschedule_retry(struct aix_mirror_bh *aix_bh)
2754 + unsigned long flags;
2756 + spin_lock_irqsave(&AIX_retry_list_lock, flags);
2757 + if (AIX_retry_list == NULL)
2758 + AIX_retry_tail = &AIX_retry_list;
2759 + *AIX_retry_tail = aix_bh;
2760 + AIX_retry_tail = &aix_bh->next_r1;
2761 + aix_bh->next_r1 = NULL;
2762 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2763 + evms_cs_wakeup_thread(AIX_mirror_read_retry_thread);
2766 +/****************************************************
2767 +* Function: AIX_handle_read_mirror_drives
2769 +* Handles a read from a set of mirrored AIX LVs
2773 +*****************************************************/
2775 +AIX_handle_read_mirror_drives(struct buffer_head *bh, int uptodate)
2777 + struct aix_logical_volume *volume;
2778 + struct evms_logical_node *node;
2779 + struct aix_mirror_bh *tmp_bh;
2780 + kdev_t tmp_b_rdev;
2781 + u32 count, le = 0;
2783 + tmp_bh = (struct aix_mirror_bh *) bh->b_private;
2784 + tmp_b_rdev = tmp_bh->master_bh->b_rdev;
2785 + volume = (struct aix_logical_volume *) tmp_bh->node->private;
2786 + node = tmp_bh->node;
2789 + LOG_DEBUG("AHRMD node:%p bh_flags:%lu uptodate:%d mirror_copies:%d \n",
2790 + node, bh->b_state, uptodate, volume->mirror_copies);
2792 + switch (tmp_bh->iteration) {
2793 + case AIX_DEFAULT_MIRRORING:
2794 + count = volume->le_to_pe_map[le].pp_state;
2797 + case AIX_FIRST_MIRROR:
2798 + count = volume->le_to_pe_map[le].pp_state;
2801 + case AIX_MAX_MIRRORS:
2802 + count = volume->le_to_pe_map[le].pp_state;
2806 + if (count == (AIX_LVM_LVSTALE + AIX_LVM_LVDEFINED)) {
2811 + if (!uptodate && tmp_bh->iteration < volume->mirror_copies) {
2812 + AIX_evms_cs_notify_lv_io_error(node);
2813 + AIX_reschedule_retry(tmp_bh);
2815 + tmp_bh->master_bh->b_end_io(tmp_bh->master_bh, uptodate);
2816 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
2817 + evms_cs_volume_request_in_progress(tmp_b_rdev,
2818 + AIX_DECREMENT_REQUEST,
2826 +/****************************************************
2827 +* This is a temporary function until a common EVMS
2828 +* notification function can be created.
2830 +*****************************************************/
2832 +AIX_evms_cs_notify_lv_io_error(struct evms_logical_node *node)
2834 + struct aix_logical_volume *volume;
2836 + volume = (struct aix_logical_volume *) node->private;
2838 + LOG_CRITICAL("Notify_ERROR !! node:%p volume->lv_status:%d volume->name:[%s]\n",
2839 + node, volume->lv_status, volume->name);
2844 +/* Function: lvm_cleanup
2846 + * This function runs through the entire lvm data structure, removing
2847 + * all items that are not needed at runtime. Currently, this is just the
2848 + * vg_disk_t structure and the pv_disk_t structure for each PV. Also, any
2849 + * groups that don't contain any volumes are deleted. All of the other
2850 + * volume_group, logical_volume and evms_logical_node structures will be
2851 + * kept around at run-time.
2856 + struct aix_volume_group *group;
2858 + group = AIXVolumeGroupList;
2862 + if (group->AIXvgh) {
2863 + kfree(group->AIXvgh);
2864 + group->AIXvgh = NULL;
2867 + group = group->next;
2873 +/****************************************************
2874 +* Function: AIX_copy_header_info
2876 +* Copy the disk header info into the volume struct
2877 +* so we can use it later.
2881 +*****************************************************/
2883 +AIX_copy_header_info(struct vg_header *AIXvgh, struct vg_header *AIXvgh2)
2886 + LOG_DEBUG("CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2890 + AIXvgh->vg_timestamp.tv_sec = AIXvgh2->vg_timestamp.tv_sec;
2891 + AIXvgh->vg_timestamp.tv_nsec = AIXvgh2->vg_timestamp.tv_nsec;
2892 + AIXvgh->vg_id.word1 = AIXvgh2->vg_id.word1;
2893 + AIXvgh->vg_id.word2 = AIXvgh2->vg_id.word2;
2894 + AIXvgh->vg_id.word3 = AIXvgh2->vg_id.word3;
2895 + AIXvgh->vg_id.word4 = AIXvgh2->vg_id.word4;
2896 + AIXvgh->numlvs = AIXvgh2->numlvs;
2897 + AIXvgh->maxlvs = AIXvgh2->maxlvs;
2898 + AIXvgh->pp_size = AIXvgh2->pp_size;
2899 + AIXvgh->numpvs = AIXvgh2->numpvs;
2900 + AIXvgh->total_vgdas = AIXvgh2->total_vgdas;
2901 + AIXvgh->vgda_size = AIXvgh2->vgda_size;
2902 + AIXvgh->bigvg = AIXvgh2->bigvg;
2903 + AIXvgh->quorum = AIXvgh2->quorum;
2904 + AIXvgh->auto_varyon = AIXvgh2->auto_varyon;
2905 + AIXvgh->checksum = AIXvgh2->checksum;
2906 + AIXvgh->bigda_size = AIXvgh2->bigda_size;
2912 + LOG_DEBUG("Returning CHI AIXvgh:%p AIXvgh2:%p\n", AIXvgh, AIXvgh2);
2917 +/****************************************************
2918 +* Function: AIX_free_headers
2924 +*****************************************************/
2926 +AIX_free_headers(struct vg_header *AIXvgh, struct vg_header *AIXvgh2,
2927 + struct vg_trailer *AIXvgt, struct vg_trailer *AIXvgt2)
2952 +/****************************************************
2955 +* This is a kernel thread that handles read of mirrors
2956 +* This shouldn't ever run on a non-mirrored LV read
2959 +*****************************************************/
2963 + struct aix_mirror_bh *r1_bh;
2964 + struct evms_logical_node *node;
2965 + unsigned long flags;
2969 + spin_lock_irqsave(&AIX_retry_list_lock, flags);
2970 + if (AIX_retry_list == NULL) {
2971 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2974 + r1_bh = AIX_retry_list;
2975 + AIX_retry_list = r1_bh->next_r1;
2976 + spin_unlock_irqrestore(&AIX_retry_list_lock, flags);
2977 + r1_bh->next_r1 = NULL; // for mark
2979 + switch (r1_bh->cmd) {
2982 + r1_bh->iteration++;
2983 + LOG_DEBUG("Report from thread AIXiod READ\n");
2985 + if (r1_bh->iteration == AIX_FIRST_MIRROR) {
2986 + node = r1_bh->mir_node1;
2987 + r1_bh->bh_req.b_rsector = r1_bh->mir_sector1;
2989 + node = r1_bh->mir_node2;
2990 + r1_bh->bh_req.b_rsector = r1_bh->mir_sector2;
2993 + R_IO(node, &r1_bh->bh_req);
2998 + LOG_DEBUG("AIXiod unknown cmd passed to thread:%d\n",
3007 +/****************************************************
3008 +* Function: AIX_schedule_resync
3010 +* schedule a resync of one of our lv mirror copies
3013 +*****************************************************/
3015 +AIX_schedule_resync(struct aix_logical_volume *resync_volume, int force)
3017 + unsigned long flags;
3019 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__,
3020 + resync_volume->name);
3022 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3024 + if (!AIX_resync_list) {
3026 + kmalloc(sizeof (struct aix_resync_struct), GFP_ATOMIC);
3027 + if (!AIX_resync_list) {
3030 + memset(AIX_resync_list, 0, sizeof (struct aix_resync_struct));
3033 + AIX_resync_list->resync_vol = resync_volume;
3034 + AIX_resync_list->next_resync_vol = NULL;
3036 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3037 + evms_cs_wakeup_thread(AIX_mirror_resync_thread);
3040 +/****************************************************
3041 +* Function: AIXresync
3043 +* This is a kernel thread that handles resync of mirrors
3044 +* This shouldn't ever run on a non-mirrored LV
3047 +*****************************************************/
3049 +AIXresync(void *data)
3052 + struct aix_logical_volume *volume = NULL;
3053 + int force = FALSE; // Currently we don't force a resync of non-stale pe's
3055 + if (AIX_resync_list == NULL) {
3056 + LOG_ERROR("No Volumes on list to resync\n");
3060 + volume = AIX_resync_list->resync_vol;
3061 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3064 + LOG_ERROR("Invalid volume passed to sync\n");
3068 + if (AIXResyncInProgress) {
3069 + LOG_ERROR("Unable to resync multiple LVs concurrently %s\n",
3074 + if (volume->mirror_copies == AIX_DEFAULT_MIRRORING) {
3075 + LOG_ERROR("Unable to resync non-mirrored LV %s \n",
3080 + AIXResyncInProgress = TRUE;
3082 + AIX_resync_lv_mirrors(volume, force);
3087 +/****************************************************
3088 +* Function: AIX_resync_lv_mirrors
3094 +*****************************************************/
3096 +AIX_resync_lv_mirrors(struct aix_logical_volume *volume, int force)
3100 + char pp_stale = FALSE;
3102 + struct partition_list_entry *master_part = NULL;
3103 + struct partition_list_entry *slave1_part = NULL;
3104 + struct partition_list_entry *slave2_part = NULL;
3106 + u64 master_offset = 0;
3107 + u64 slave1_offset = 0;
3108 + u64 slave2_offset = 0;
3110 + LOG_DEBUG("Function %s volume: %s \n", __FUNCTION__, volume->name);
3112 + for (i = 0; i < volume->num_le; i++, pp_stale = FALSE) {
3114 + // We need to see which mirror has a valid non-stale copy.
3115 + // The first non-stale copy will be our master and we'll
3116 + // copy to the slave(s).
3118 + if ((volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3122 + if (volume->le_to_pe_map_mir1 != NULL) {
3123 + if ((volume->le_to_pe_map_mir1[i].
3124 + pp_state & AIX_LVM_LVSTALE)) {
3129 + if (volume->le_to_pe_map_mir2 != NULL) {
3130 + if ((volume->le_to_pe_map_mir2[i].
3131 + pp_state & AIX_LVM_LVSTALE)) {
3136 + LOG_DEBUG("Function %s pp_stale:%d force:%d \n", __FUNCTION__,
3139 + if (pp_stale || force) {
3140 + if (!(volume->le_to_pe_map[i].pp_state & AIX_LVM_LVSTALE)) {
3142 + master_part = volume->le_to_pe_map[i].owning_pv;
3143 + master_offset = volume->le_to_pe_map[i].pe_sector_offset;
3145 + if (volume->le_to_pe_map_mir1 != NULL) {
3146 + slave1_part = volume->le_to_pe_map_mir1[i].owning_pv;
3147 + slave1_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3150 + if (volume->le_to_pe_map_mir2 != NULL) {
3151 + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3152 + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3155 + if (!(volume->le_to_pe_map_mir1[i].pp_state & AIX_LVM_LVSTALE)) {
3156 + master_part = volume->le_to_pe_map_mir1[i].owning_pv;
3157 + master_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3159 + if (volume->le_to_pe_map != NULL) {
3160 + slave1_part = volume->le_to_pe_map[i].owning_pv;
3161 + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3164 + if (volume->le_to_pe_map_mir2 != NULL) {
3165 + slave2_part = volume->le_to_pe_map_mir2[i].owning_pv;
3166 + slave2_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3169 + if (!(volume->le_to_pe_map_mir2[i].pp_state & AIX_LVM_LVSTALE)) {
3170 + master_part = volume->le_to_pe_map_mir2[i].owning_pv;
3171 + master_offset = volume->le_to_pe_map_mir2[i].pe_sector_offset;
3173 + if (volume->le_to_pe_map != NULL) {
3174 + slave1_part = volume->le_to_pe_map[i].owning_pv;
3175 + slave1_offset = volume->le_to_pe_map[i].pe_sector_offset;
3178 + if (volume->le_to_pe_map_mir1 != NULL) {
3179 + slave2_part = volume->le_to_pe_map_mir1[i].owning_pv;
3180 + slave2_offset = volume->le_to_pe_map_mir1[i].pe_sector_offset;
3184 + if (AIX_copy_on_read(volume, master_part, slave1_part, slave2_part,
3185 + master_offset, slave1_offset, slave2_offset,
3186 + volume->pe_size, i)) {
3188 + LOG_CRITICAL("ReSync of logical Volume %s FAILED !!\n",
3190 + AIX_evms_cs_notify_lv_io_error(volume->
3202 +/****************************************************
3203 +* Function: AIX_copy_on_read
3209 +*****************************************************/
3211 +AIX_copy_on_read(struct aix_logical_volume *volume,
3212 + struct partition_list_entry *master_part,
3213 + struct partition_list_entry *slave1_part,
3214 + struct partition_list_entry *slave2_part,
3215 + u64 master_offset,
3216 + u64 slave1_offset, u64 slave2_offset, u32 pe_size, int le)
3218 + unsigned long flags;
3219 + struct aix_mirror_bh *tmp_bh = NULL;
3221 + // Check for valid partitions; we need at least 2 good partitions, so slave2 doesn't have to be valid
3223 + if (!master_part || !slave1_part) {
3224 + LOG_ERROR("Invalid partitions for resync master part:%p slave1_part:%p slave2_part:%p\n",
3225 + master_part, slave1_part, slave2_part);
3229 + LOG_DEBUG("Function %s volume:%s master_part:%d, slave1_part:%d, slave2_part:%d master_offset:"
3230 + PFU64 ", slave1_offset:" PFU64 " slave2_offset:" PFU64 ", \n",
3231 + __FUNCTION__, volume->name, master_part->pv_number,
3232 + slave1_part->pv_number, slave2_part->pv_number, master_offset,
3233 + slave1_offset, slave2_offset);
3235 + LOG_DEBUG("pe_size:%d le:%d\n", pe_size, le);
3238 + AIX_alloc_sbh(volume, master_part, slave1_part, slave2_part,
3239 + master_offset, slave1_offset, slave2_offset, pe_size);
3242 + buffer_IO_error(&tmp_bh->bh_req);
3246 +/* if (evms_cs_volume_request_in_progress
3247 + (tmp_bh->bh_req.b_rdev, AIX_INCREMENT_REQUEST, &count)) {
3248 + buffer_IO_error(&tmp_bh->bh_req);
3252 + spin_lock_irqsave(&AIX_resync_pp_lock, flags);
3254 + LOG_DEBUG("Function:%s kicking off read node:%p\n", __FUNCTION__,
3255 + master_part->logical_node);
3257 + R_IO(master_part->logical_node, &tmp_bh->bh_req);
3259 + spin_unlock_irqrestore(&AIX_resync_pp_lock, flags);
3264 +/****************************************************
3265 +* Function: AIX_alloc_sbh
3267 +* Alloc any buffer heads from the pool and return a linked list
3270 +*****************************************************/
3271 +static struct aix_mirror_bh *
3272 +AIX_alloc_sbh(struct aix_logical_volume *volume,
3273 + struct partition_list_entry *master_part,
3274 + struct partition_list_entry *slave1_part,
3275 + struct partition_list_entry *slave2_part,
3276 + u64 master_offset,
3277 + u64 slave1_offset, u64 slave2_offset, u32 pe_size)
3279 + struct aix_mirror_bh *tmp_bh = NULL, *head_bh = NULL;
3280 + unsigned long flags;
3282 + LOG_DEBUG("Function:%s Enter\n", __FUNCTION__);
3284 + head_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3287 + ("Unable to allocate memory for mirror pool line:%d\n",
3291 + // Update buffer so we block on a read/write on the normal IO path
3292 + // if we're trying to sync the same sector on the disk
3293 + // We don't want to block if it's different sectors
3295 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3297 + AIX_resync_list->master_part = master_part;
3298 + AIX_resync_list->slave1_part = slave1_part;
3299 + AIX_resync_list->slave2_part = slave2_part;
3300 + AIX_resync_list->master_offset = master_offset;
3301 + AIX_resync_list->slave1_offset = slave1_offset;
3302 + AIX_resync_list->slave2_offset = slave2_offset;
3304 + head_bh->bh_req.b_data = kmalloc(AIX_RESYNC_BLOCKSIZE + 1, GFP_NOIO);
3305 + if (!head_bh->bh_req.b_data) {
3306 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, head_bh);
3308 + ("Unable to allocate memory for mirror pool line:%d\n",
3313 + memset(head_bh->bh_req.b_data, 0, AIX_RESYNC_BLOCKSIZE + 1);
3315 + head_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3316 + head_bh->bh_req.b_rsector = master_offset;
3317 + head_bh->bh_req.b_size = AIX_RESYNC_BLOCKSIZE;
3318 + head_bh->sync_flag = AIX_SYNC_INCOMPLETE;
3319 + head_bh->bh_req.b_end_io = AIX_sync_mirrored_partitions;
3320 + head_bh->bh_req.b_page = virt_to_page(head_bh->bh_req.b_data);
3321 + head_bh->bh_req.b_state = 0;
3322 + set_bit(BH_Dirty, &head_bh->bh_req.b_state);
3323 + set_bit(BH_Lock, &head_bh->bh_req.b_state);
3324 + set_bit(BH_Req, &head_bh->bh_req.b_state);
3325 + set_bit(BH_Mapped, &head_bh->bh_req.b_state);
3326 + head_bh->master_bh = NULL;
3327 + head_bh->mirror_bh_list = NULL;
3329 + tmp_bh = evms_cs_allocate_from_pool(AIX_BH_list_pool, EVMS_BLOCKABLE);
3332 + ("Unable to allocate memory for mirror pool line:%d\n",
3337 + head_bh->next_r1 = tmp_bh;
3338 + memcpy(&tmp_bh->bh_req, head_bh, sizeof (struct buffer_head));
3339 + tmp_bh->remaining = (atomic_t) ATOMIC_INIT(0);
3340 + tmp_bh->bh_req.b_end_io = NULL;
3342 + if (volume->mirror_copies == AIX_MAX_MIRRORS) {
3344 + evms_cs_allocate_from_pool(AIX_BH_list_pool,
3346 + if (!tmp_bh->next_r1) {
3348 + ("Unable to allocate memory for mirror pool line:%d\n",
3353 + memcpy(&tmp_bh->next_r1->bh_req, head_bh,
3354 + sizeof (struct buffer_head));
3355 + tmp_bh->next_r1->bh_req.b_end_io = NULL;
3356 + tmp_bh->next_r1->remaining = (atomic_t) ATOMIC_INIT(0);
3359 + init_waitqueue_head(&head_bh->bh_req.b_wait);
3361 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3363 + LOG_DEBUG("Function:%s Exit head_bh:%p\n", __FUNCTION__, head_bh);
3368 +/****************************************************
3369 +* Function: AIX_sync_mirrored_partitions
3375 +*****************************************************/
3377 +AIX_sync_mirrored_partitions(struct buffer_head *bh, int uptodate)
3379 + struct aix_logical_volume *volume = NULL;
3380 + struct aix_mirror_bh *tmp_bh, *head_bh;
3382 + head_bh = tmp_bh = (struct aix_mirror_bh *) bh->b_private;
3383 + volume = (struct aix_logical_volume *) tmp_bh->node->private;
3385 + LOG_DEBUG("Function:%s Enter uptodate:%d\n", __FUNCTION__, uptodate);
3389 + AIX_evms_cs_notify_lv_io_error(tmp_bh->node);
3392 + tmp_bh = head_bh->next_r1;
3394 + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3395 + __LINE__, tmp_bh);
3398 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
3399 + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_1,
3400 + AIX_RESYNC_BLOCKSIZE);
3403 + tmp_bh = tmp_bh->next_r1;
3404 + LOG_DEBUG("Function:%s line:%d write to mirror:%p\n", __FUNCTION__,
3405 + __LINE__, tmp_bh);
3408 + W_IO(tmp_bh->node, &tmp_bh->bh_req);
3409 + AIX_get_set_mirror_offset(tmp_bh, AIX_SLAVE_2,
3410 + AIX_RESYNC_BLOCKSIZE);
3413 + LOG_DEBUG("Function:%s line:%d read from master:%p\n", __FUNCTION__,
3414 + __LINE__, head_bh);
3416 + if (head_bh && head_bh->sync_flag) {
3417 + AIX_get_set_mirror_offset(head_bh, AIX_MASTER,
3418 + AIX_RESYNC_BLOCKSIZE);
3419 + if (head_bh->sync_flag == AIX_SYNC_INCOMPLETE) {
3420 + R_IO(head_bh->node, &head_bh->bh_req);
3424 + LOG_DEBUG("Function:%s line:%d head_bh->sync_flag:%d\n", __FUNCTION__,
3425 + __LINE__, head_bh->sync_flag);
3427 + if (!head_bh->sync_flag) {
3429 + head_bh = head_bh->next_r1;
3431 + while (tmp_bh != NULL) {
3432 + evms_cs_deallocate_to_pool(AIX_BH_list_pool, tmp_bh);
3436 + AIXResyncInProgress = FALSE;
3437 +/* evms_cs_volume_request_in_progress(tmp_bh->bh_req.b_rdev,
3438 + AIX_DECREMENT_REQUEST,
3441 + if (AIX_resync_list) {
3442 + kfree(AIX_resync_list);
3449 +/****************************************************
3450 +* Function: AIX_get_set_mirror_offset
3456 +*****************************************************/
3458 +AIX_get_set_mirror_offset(struct aix_mirror_bh *tmp_bh, int index, int offset)
3466 + LOG_DEBUG("Function:%s Enter offset:%d\n", __FUNCTION__, offset);
3468 + tmp_bh->bh_req.b_rsector += tmp_bh->bh_req.b_rsector + offset;
3470 + if (tmp_bh->bh_req.b_rsector > tmp_bh->node->total_vsectors) {
3471 + tmp_bh->sync_flag = AIX_SYNC_COMPLETE;
3474 + // Update buffer so we block on a read/write on the normal IO path
3475 + // if we're trying to sync the same sector on the disk
3476 + // We don't want to block if it's different sectors
3478 + spin_lock_irqsave(&AIX_resync_list_lock, flags);
3480 + if (AIX_resync_list->master_part->logical_node == tmp_bh->node) {
3481 + AIX_resync_list->master_offset += offset;
3484 + if (AIX_resync_list->slave1_part->logical_node == tmp_bh->node) {
3485 + AIX_resync_list->slave1_offset += offset;
3488 + if (AIX_resync_list->slave2_part->logical_node == tmp_bh->node) {
3489 + AIX_resync_list->slave2_offset += offset;
3492 + spin_unlock_irqrestore(&AIX_resync_list_lock, flags);
3498 +static int AIX_pvh_data_posn(u32 vgda_psn, u32 * pvh_posn, struct partition_list_entry *partition, u32 numpvs)
3500 + struct partition_list_entry * pv;
3501 + struct pv_header * AIXpvh;
3506 + LOG_DEBUG("APDP - vgda_psn:%d numpvs:%d \n", vgda_psn, numpvs);
3508 + AIXpvh = kmalloc(AIX_SECTOR_SIZE, GFP_KERNEL);
3513 + memset(AIXpvh, 0 , sizeof(struct pv_header));
3515 + // Adjust this because when AIX VGs/Volumes are created on Intel platforms, the
3516 + // pp_count could be anything since we don't give up the entire physical drive.
3517 + // This is for calculation purposes only.
3522 + for (i = 1; i <= numpvs; i++) {
3523 + for (pv = partition; pv->pv_number != i; pv = pv->next );
3525 + LOG_DEBUG("APDP line:%d pp_count:%d \n", __LINE__, AIXpvh->pp_count);
3527 + num_pps = AIXpvh->pp_count;
3528 + num_pps++; // Account for the pv_header on the front
3530 + while ((num_pps * sizeof(struct pp_entries)) % AIX_SECTOR_SIZE) {
3531 + LOG_EXTRA("num_pps:%d \n", num_pps);
3535 + tmp = (num_pps * sizeof(struct pp_entries)) / AIX_SECTOR_SIZE;
3537 + LOG_DEBUG("APDP tmp:%d num_pps:%d \n", tmp,num_pps);
3539 + posn = ((vgda_psn + PSN_PPH_OFFSET) + ((pv->pv_number -1) * tmp));
3541 + pvh_posn[pv->pv_number] = posn;
3543 + if (INIT_IO(pv->logical_node, 0, posn, 1, AIXpvh)) {
3556 +/****************************************************
3557 +* Function: AIX_volume_group_dump
3559 +* This is for debug purposes and will walk the volume group list
3560 +* and LV's within the volume groups
3562 + * It can be called at any time; however, the output to the display is large
3564 +*****************************************************/
3565 +#ifdef EVMS_AIX_DEBUG
3567 +AIX_volume_group_dump(void)
3569 + struct aix_volume_group *AIXVGLDebugPtr;
3570 + struct partition_list_entry *DebugPartitionList;
3571 + struct aix_logical_volume *DebugLVList;
3574 + AIXVGLDebugPtr = AIXVolumeGroupList;
3576 + if (!AIXVGLDebugPtr) {
3577 + LOG_DEBUG("***********************************************\n");
3578 + LOG_DEBUG("ERROR Nothing built in the list to check !!! \n");
3579 + LOG_DEBUG("***********************************************\n");
3583 + LOG_DEBUG("*********************************************** \n");
3584 + LOG_DEBUG("Begin Volume Group Dump \n");
3585 + LOG_DEBUG("*********************************************** \n");
3587 + while (AIXVGLDebugPtr) {
3589 + LOG_DEBUG("vg_number %x\n", AIXVGLDebugPtr->vg_id.word2);
3590 + LOG_DEBUG("numpsrtitions %d\n", AIXVGLDebugPtr->partition_count);
3591 + LOG_DEBUG("numlvs %d\n", AIXVGLDebugPtr->numlvs);
3592 + LOG_DEBUG("hard_sect_size %d\n", AIXVGLDebugPtr->hard_sect_size);
3593 + LOG_DEBUG("block_size %d\n", AIXVGLDebugPtr->block_size);
3594 + LOG_DEBUG("flags %d\n", AIXVGLDebugPtr->flags);
3595 +// LOG_DEBUG("lv_max %d\n", AIXVGLDebugPtr->lv_max);
3596 + LOG_DEBUG("pe_size %d\n", AIXVGLDebugPtr->pe_size);
3597 + LOG_DEBUG("CleanVGInfo %d\n", AIXVGLDebugPtr->CleanVGInfo);
3599 + DebugPartitionList = AIXVGLDebugPtr->partition_list;
3601 + LOG_DEBUG("********* Begin Volume Partition Dump ********* \n");
3603 + if (!DebugPartitionList) {
3604 + LOG_DEBUG("No partitions to check !! \n");
3607 + while (DebugPartitionList) {
3608 + LOG_DEBUG("logical_node %p\n",
3609 + DebugPartitionList->logical_node);
3610 + LOG_DEBUG("pv_number %d\n",
3611 + DebugPartitionList->pv_number);
3612 + LOG_DEBUG("block_size %d\n",
3613 + DebugPartitionList->block_size);
3614 + LOG_DEBUG("hard_sect_size %d\n",
3615 + DebugPartitionList->hard_sect_size);
3616 + LOG_DEBUG("-------------------------------------------------------------\n");
3617 + DebugPartitionList = DebugPartitionList->next;
3620 + LOG_DEBUG("********* End Volume Partition Dump **********\n");
3622 + LOG_DEBUG("********** Begin Logical Volume Partition Dump **********\n");
3624 + DebugLVList = AIXVGLDebugPtr->volume_list[0];
3626 + if (!DebugLVList) {
3627 + LOG_DEBUG("No logical volumes to check !! \n");
3630 + for (i = 0; i < LVM_MAXLVS && DebugLVList; i++) {
3632 + DebugLVList = AIXVGLDebugPtr->volume_list[i];
3634 + if (DebugLVList) {
3635 + LOG_DEBUG("volume_list # %d \n", i);
3636 + LOG_DEBUG("lv_number %d \n",
3637 + DebugLVList->lv_number);
3638 + LOG_DEBUG("LV name %s \n",
3639 + DebugLVList->name);
3640 + LOG_DEBUG("lv_size " PFU64 " \n",
3641 + DebugLVList->lv_size);
3642 + LOG_DEBUG("lv_access %d \n",
3643 + DebugLVList->lv_access);
3644 + LOG_DEBUG("lv_status %d \n",
3645 + DebugLVList->lv_status);
3646 +// LOG_DEBUG("lv_minor %d \n",
3647 +// DebugLVList->lv_minor);
3648 + LOG_DEBUG("mirror_copies %d \n",
3649 + DebugLVList->mirror_copies);
3650 +// LOG_DEBUG("mirror_number %d \n",
3651 +// DebugLVList->mirror_number);
3652 + LOG_DEBUG("stripes %d \n",
3653 + DebugLVList->stripes);
3654 + LOG_DEBUG("stripe_size %d \n",
3655 + DebugLVList->stripe_size);
3656 + LOG_DEBUG("stripe_size_shift%d \n",
3657 + DebugLVList->stripe_size_shift);
3658 + LOG_DEBUG("pe_size %d \n",
3659 + DebugLVList->pe_size);
3660 + LOG_DEBUG("pe_size_shift %d \n",
3661 + DebugLVList->pe_size_shift);
3662 + LOG_DEBUG("num_le %d \n",
3663 + DebugLVList->num_le);
3664 +// LOG_DEBUG("new_volume %d \n",
3665 +// DebugLVList->new_volume);
3666 + LOG_DEBUG("group %p \n",
3667 + DebugLVList->group);
3672 + AIXVGLDebugPtr = AIXVGLDebugPtr->next;
3674 + LOG_DEBUG("********** End Logical Volume Partition Dump **********\n");
3678 + LOG_DEBUG("***********************************************\n");
3679 + LOG_DEBUG("End Volume Group Dump \n");
3680 + LOG_DEBUG("***********************************************\n");
3686 diff -Naur linux-2002-09-30/drivers/evms/Config.in evms-2002-09-30/drivers/evms/Config.in
3687 --- linux-2002-09-30/drivers/evms/Config.in Wed Dec 31 18:00:00 1969
3688 +++ evms-2002-09-30/drivers/evms/Config.in Mon Sep 16 15:55:24 2002
3691 +# Copyright (c) International Business Machines Corp., 2000
3693 +# This program is free software; you can redistribute it and/or modify
3694 +# it under the terms of the GNU General Public License as published by
3695 +# the Free Software Foundation; either version 2 of the License, or
3696 +# (at your option) any later version.
3698 +# This program is distributed in the hope that it will be useful,
3699 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
3700 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3701 +# the GNU General Public License for more details.
3703 +# You should have received a copy of the GNU General Public License
3704 +# along with this program; if not, write to the Free Software
3705 +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3708 +# EVMS driver configuration
3711 +mainmenu_option next_comment
3712 +comment 'Enterprise Volume Management System'
3714 +tristate 'EVMS Kernel Runtime' CONFIG_EVMS
3715 +dep_tristate ' EVMS Local Device Manager' CONFIG_EVMS_LOCAL_DEV_MGR $CONFIG_EVMS
3716 +dep_tristate ' EVMS DOS Segment Manager' CONFIG_EVMS_DOS_SEGMENT_MGR $CONFIG_EVMS
3717 +dep_tristate ' EVMS GPT Segment Manager' CONFIG_EVMS_GPT_SEGMENT_MGR $CONFIG_EVMS
3718 +if [ "$CONFIG_ARCH_S390" = "y" ]; then
3719 +dep_tristate ' EVMS S/390 Segment Manager' CONFIG_EVMS_S390_SEGMENT_MGR $CONFIG_EVMS
3721 +dep_tristate ' EVMS SnapShot Feature' CONFIG_EVMS_SNAPSHOT $CONFIG_EVMS
3722 +dep_tristate ' EVMS DriveLink Feature' CONFIG_EVMS_DRIVELINK $CONFIG_EVMS
3723 +dep_tristate ' EVMS Bad Block Relocation (BBR) Feature' CONFIG_EVMS_BBR $CONFIG_EVMS
3724 +dep_tristate ' EVMS Linux LVM Package' CONFIG_EVMS_LVM $CONFIG_EVMS
3725 +dep_tristate ' EVMS Linux MD Package' CONFIG_EVMS_MD $CONFIG_EVMS
3726 +dep_tristate ' EVMS MD Linear (append) mode' CONFIG_EVMS_MD_LINEAR $CONFIG_EVMS_MD
3727 +dep_tristate ' EVMS MD RAID-0 (stripe) mode' CONFIG_EVMS_MD_RAID0 $CONFIG_EVMS_MD
3728 +dep_tristate ' EVMS MD RAID-1 (mirroring) mode' CONFIG_EVMS_MD_RAID1 $CONFIG_EVMS_MD
3729 +dep_tristate ' EVMS MD RAID-4/RAID-5 mode' CONFIG_EVMS_MD_RAID5 $CONFIG_EVMS_MD
3730 +dep_tristate ' EVMS AIX LVM Package' CONFIG_EVMS_AIX $CONFIG_EVMS
3731 +dep_tristate ' EVMS OS/2 LVM Package' CONFIG_EVMS_OS2 $CONFIG_EVMS
3732 +#dep_tristate ' EVMS Clustering Package' CONFIG_EVMS_ECR $CONFIG_EVMS
3734 +if [ "$CONFIG_EVMS" != "n" ]; then
3735 + choice ' EVMS Debug Level' \
3736 + "Critical CONFIG_EVMS_INFO_CRITICAL \
3737 + Serious CONFIG_EVMS_INFO_SERIOUS \
3738 + Error CONFIG_EVMS_INFO_ERROR \
3739 + Warning CONFIG_EVMS_INFO_WARNING \
3740 + Default CONFIG_EVMS_INFO_DEFAULT \
3741 + Details CONFIG_EVMS_INFO_DETAILS \
3742 + Debug CONFIG_EVMS_INFO_DEBUG \
3743 + Extra CONFIG_EVMS_INFO_EXTRA \
3744 + Entry_Exit CONFIG_EVMS_INFO_ENTRY_EXIT \
3745 + Everything CONFIG_EVMS_INFO_EVERYTHING" Default
3750 diff -Naur linux-2002-09-30/drivers/evms/Makefile evms-2002-09-30/drivers/evms/Makefile
3751 --- linux-2002-09-30/drivers/evms/Makefile Wed Dec 31 18:00:00 1969
3752 +++ evms-2002-09-30/drivers/evms/Makefile Mon Sep 16 15:55:24 2002
3755 +# Makefile for the kernel EVMS driver and modules.
3757 +# 08 March 2001, Mark Peloquin <peloquin@us.ibm.com>
3760 +O_TARGET := evmsdrvr.o
3762 +export-objs := evms.o evms_passthru.o ldev_mgr.o dos_part.o lvm_vge.o \
3763 + snapshot.o evms_drivelink.o evms_bbr.o AIXlvm_vge.o \
3764 + os2lvm_vge.o evms_ecr.o md_core.o md_linear.o md_raid0.o \
3765 + md_raid1.o md_raid5.o md_xor.o s390_part.o gpt_part.o
3767 +# Link order is important! Plugins must come first, then the EVMS core.
3769 +obj-$(CONFIG_EVMS_LOCAL_DEV_MGR) += ldev_mgr.o
3770 +obj-$(CONFIG_EVMS_DOS_SEGMENT_MGR) += dos_part.o
3771 +obj-$(CONFIG_EVMS_GPT_SEGMENT_MGR) += gpt_part.o
3772 +obj-$(CONFIG_EVMS_S390_SEGMENT_MGR) += s390_part.o
3773 +obj-$(CONFIG_EVMS_MD) += md_core.o
3774 +obj-$(CONFIG_EVMS_MD_LINEAR) += md_linear.o
3775 +obj-$(CONFIG_EVMS_MD_RAID0) += md_raid0.o
3776 +obj-$(CONFIG_EVMS_MD_RAID1) += md_raid1.o
3777 +obj-$(CONFIG_EVMS_MD_RAID5) += md_raid5.o md_xor.o
3778 +obj-$(CONFIG_EVMS_LVM) += lvm_vge.o
3779 +obj-$(CONFIG_EVMS_AIX) += AIXlvm_vge.o
3780 +obj-$(CONFIG_EVMS_OS2) += os2lvm_vge.o
3781 +obj-$(CONFIG_EVMS_DRIVELINK) += evms_drivelink.o
3782 +obj-$(CONFIG_EVMS_BBR) += evms_bbr.o
3783 +obj-$(CONFIG_EVMS_SNAPSHOT) += snapshot.o
3784 +obj-$(CONFIG_EVMS_ECR) += evms_ecr.o
3785 +obj-$(CONFIG_EVMS) += evms_passthru.o evms.o
3787 +EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEFAULT
3788 +ifeq ($(CONFIG_EVMS_INFO_CRITICAL),y)
3789 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_CRITICAL
3791 +ifeq ($(CONFIG_EVMS_INFO_SERIOUS),y)
3792 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_SERIOUS
3794 +ifeq ($(CONFIG_EVMS_INFO_ERROR),y)
3795 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ERROR
3797 +ifeq ($(CONFIG_EVMS_INFO_WARNING),y)
3798 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_WARNING
3800 +ifeq ($(CONFIG_EVMS_INFO_DETAILS),y)
3801 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DETAILS
3803 +ifeq ($(CONFIG_EVMS_INFO_DEBUG),y)
3804 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_DEBUG
3806 +ifeq ($(CONFIG_EVMS_INFO_EXTRA),y)
3807 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EXTRA
3809 +ifeq ($(CONFIG_EVMS_INFO_ENTRY_EXIT),y)
3810 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_ENTRY_EXIT
3812 +ifeq ($(CONFIG_EVMS_INFO_EVERYTHING),y)
3813 + EXTRA_CFLAGS=-DEVMS_INFO_LEVEL=EVMS_INFO_EVERYTHING
3816 +include $(TOPDIR)/Rules.make
3818 diff -Naur linux-2002-09-30/drivers/evms/dos_part.c evms-2002-09-30/drivers/evms/dos_part.c
3819 --- linux-2002-09-30/drivers/evms/dos_part.c Wed Dec 31 18:00:00 1969
3820 +++ evms-2002-09-30/drivers/evms/dos_part.c Fri Sep 13 16:09:55 2002
3822 +/* -*- linux-c -*- */
3826 + * Copyright (c) International Business Machines Corp., 2000
3828 + * This program is free software; you can redistribute it and/or modify
3829 + * it under the terms of the GNU General Public License as published by
3830 + * the Free Software Foundation; either version 2 of the License, or
3831 + * (at your option) any later version.
3833 + * This program is distributed in the hope that it will be useful,
3834 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
3835 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
3836 + * the GNU General Public License for more details.
3838 + * You should have received a copy of the GNU General Public License
3839 + * along with this program; if not, write to the Free Software
3840 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
3845 + * linux/drivers/evms/dos_part.c
3847 + * EVMS DOS partition manager
3849 + * Partial code extracted from
3851 + * linux/fs/partitions/msdos.c
3855 +#include <linux/config.h>
3856 +#include <linux/module.h>
3857 +#include <linux/kernel.h>
3858 +#include <linux/config.h>
3859 +#include <linux/fs.h>
3860 +#include <linux/genhd.h>
3861 +#include <linux/string.h>
3862 +#include <linux/blk.h>
3863 +#include <linux/init.h>
3864 +#include <linux/iobuf.h> /* for kiobuf stuffs */
3866 +#ifdef CONFIG_BLK_DEV_IDE
3867 +#include <linux/ide.h> /* IDE xlate */
3868 +#endif /* CONFIG_BLK_DEV_IDE */
3870 +#include <linux/evms/evms.h>
3871 +#include <linux/evms/evms_os2.h>
3873 +#include <asm/system.h>
3874 +#include <asm/uaccess.h>
3876 +/* prefix used in logging messages */
3877 +#define LOG_PREFIX "dos_part: "
3879 +/* #include "msdos.h" */
3880 +#define MSDOS_LABEL_MAGIC 0xAA55
3881 +#define GPT_ENTIRE_DISK_INDICATOR 0xEE
3882 +#define GPT_ESP_INDICATOR 0xEF
3885 + * struct mbr_ebr - Skeletal MBR/EBR structure useful for our purposes
3886 + * @unused1: skip IPL record code
3887 + * @partitions: partition table
3888 + * @signature: DOS magic
3890 + * skeletal access to partition table in MBR/EBR
3893 + u8 unused1[0x1be];
3894 + struct partition partitions[4];
3899 + * struct dos_private - Private data structure for this plugin
3900 + * @source_object: object this IO will get remapped to
3901 + * @start_sect: source object relative starting address in 512 byte units
3902 + * @nr_sect: partition size in 512 bytes units
3903 + * @type: partition type or filesystem format indicator
3905 + * private copy of the just the fields we require to remap IO requests
3906 + * to the underlying object.
3908 +struct dos_private {
3909 + struct evms_logical_node *source_disk;
3912 + unsigned char type;
3916 + * struct extended_part - Structure used to track progress traversing an EBR chain
3917 + * @extended: partition table in the extended boot record
3918 + * @start_sect: address of the extended boot record in 512 byte units
3919 + * @next_ebr_start: address of next ebr in the chain
3920 + * @done: progress flag
3922 + * struct used to track extended boot record chain traversals.
3924 +struct extended_part {
3925 + struct partition *extended;
3927 + u64 next_ebr_start;
3931 +/* Global variables */
3932 +static int cur_comp_part_num; /* used to track non-primary
3933 + * partition numbers
3935 +static int exported_nodes; /* total # of exported segments
3936 + * produced during this discovery.
3939 +/* External references */
3940 +#if CONFIG_BLK_DEV_MD && CONFIG_AUTODETECT_RAID
3941 +extern void md_autodetect_dev(kdev_t dev);
3945 +static int mbr_ebr_partition_discover(struct evms_logical_node **);
3946 +static int mbr_ebr_partition_delete(struct evms_logical_node *);
3947 +static void mbr_ebr_partition_read(struct evms_logical_node *,
3948 + struct buffer_head *);
3949 +static void mbr_ebr_partition_write(struct evms_logical_node *,
3950 + struct buffer_head *);
3951 +static int mbr_ebr_partition_ioctl(struct evms_logical_node *, struct inode *,
3952 + struct file *, unsigned int, unsigned long);
3953 +static int mbr_ebr_partition_init_io(struct evms_logical_node *,
3954 + int, u64, u64, void *);
3956 +static struct evms_plugin_fops fops = {
3957 + .discover = mbr_ebr_partition_discover,
3958 + .delete = mbr_ebr_partition_delete,
3959 + .read = mbr_ebr_partition_read,
3960 + .write = mbr_ebr_partition_write,
3961 + .init_io = mbr_ebr_partition_init_io,
3962 + .ioctl = mbr_ebr_partition_ioctl
3965 +#define EVMS_MSDOS_PARTITION_MANAGER_ID 1
3967 +static struct evms_plugin_header plugin_header = {
3968 + .id = SetPluginID(IBM_OEM_ID,
3969 + EVMS_SEGMENT_MANAGER,
3970 + EVMS_MSDOS_PARTITION_MANAGER_ID),
3976 + .required_services_version = {
3985 + * Many architectures don't like unaligned accesses, which is
3986 + * frequently the case with the nr_sects and start_sect partition
3989 +#include <asm/unaligned.h>
3991 +#define SYS_IND(p) (get_unaligned(&p->sys_ind))
3992 +#define NR_SECTS(p) (u64)({ __typeof__(p->nr_sects) __a = \
3993 + get_unaligned(&p->nr_sects); \
3994 + le32_to_cpu(__a); \
3997 +#define START_SECT(p) (u64)({ __typeof__(p->start_sect) __a = \
3998 + get_unaligned(&p->start_sect); \
3999 + le32_to_cpu(__a); \
4002 +/******************************************/
4003 +/* List Support - Variables, & Functions */
4004 +/******************************************/
4008 +struct segment_list_node {
4009 + struct evms_logical_node *segment;
4010 + struct segment_list_node *next;
4013 +struct disk_list_node {
4014 + struct evms_logical_node *disk;
4015 + struct segment_list_node *segment_list;
4016 + struct disk_list_node *next;
4021 +static struct disk_list_node *my_disk_list;
4025 +static struct disk_list_node **
4026 +lookup_disk(struct evms_logical_node *disk)
4028 + struct disk_list_node **ldln;
4030 + ldln = &my_disk_list;
4032 + if ((*ldln)->disk == disk)
4034 + ldln = &(*ldln)->next;
4039 +static struct segment_list_node **
4040 +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
4042 + struct segment_list_node **lsln;
4044 + lsln = &disk->segment_list;
4046 + if ((*lsln)->segment == segment)
4048 + lsln = &(*lsln)->next;
4053 +static struct evms_logical_node *
4054 +find_segment_on_disk(struct evms_logical_node *disk,
4055 + u64 start_sect, u64 nr_sects)
4057 + struct evms_logical_node *rc = NULL;
4058 + struct disk_list_node **ldln;
4059 + struct segment_list_node **lsln;
4060 + struct dos_private *dos_prv;
4062 + ldln = lookup_disk(disk);
4064 + /* disk found in list */
4065 + /* attempt to find segment */
4067 + lsln = &(*ldln)->segment_list;
4069 + dos_prv = (*lsln)->segment->private;
4070 + if (dos_prv->start_sect == start_sect)
4071 + if (dos_prv->nr_sects == nr_sects)
4073 + lsln = &(*lsln)->next;
4076 + rc = (*lsln)->segment;
4081 +/* function description: add_segment_to_disk
4083 + * this function attempts to add a segment to the segment
4084 + * list of a disk. if the specified disk is not found, it
4085 + * will be added to the global disk list. this function will
4086 + * return a pointer to the matching segment in the disk's
4087 + * segment list. the caller must compare the returned pointer
4088 + * to the specified segment to see if the
4089 + * specified segment was already present in the disk's segment
4090 + * list. if the return pointer matches the specified segment,
4091 + * then the specified segment was added to the list. if the
4092 + * return segment pointer does not match the specified
4093 + * segment pointer, then the specified segment pointer was
4094 + * a duplicate and can be thrown away.
4097 +add_segment_to_disk(struct evms_logical_node *disk,
4098 + struct evms_logical_node *segment)
4101 + struct disk_list_node **ldln, *new_disk;
4102 + struct segment_list_node **lsln, *new_segment;
4104 + ldln = lookup_disk(disk);
4105 + if (*ldln == NULL) {
4106 + /* disk not in list, add disk */
4107 + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
4109 + memset(new_disk, 0, sizeof (*new_disk));
4110 + new_disk->disk = disk;
4117 + /* attempt to add segment */
4118 + lsln = lookup_segment(*ldln, segment);
4119 + if (*lsln == NULL) {
4120 + /* segment not in list, add segment */
4122 + kmalloc(sizeof (*new_segment), GFP_KERNEL);
4123 + if (new_segment) {
4124 + memset(new_segment, 0, sizeof (*new_segment));
4125 + new_segment->segment = segment;
4126 + *lsln = new_segment;
4137 +remove_segment_from_disk(struct evms_logical_node *disk,
4138 + struct evms_logical_node *segment,
4139 + struct evms_logical_node **empty_disk)
4142 + struct disk_list_node **ldln, *tmp_disk_node;
4143 + struct segment_list_node **lsln, *tmp_segment_node;
4145 + *empty_disk = NULL;
4146 + ldln = lookup_disk(disk);
4147 + if (*ldln == NULL) {
4150 + /* disk found in list */
4151 + /* attempt to find segment */
4152 + lsln = lookup_segment(*ldln, segment);
4153 + if (*lsln == NULL) {
4156 + tmp_segment_node = *lsln;
4157 + /* remove segment from list */
4158 + *lsln = (*lsln)->next;
4159 + /* free the segment list node */
4160 + kfree(tmp_segment_node);
4162 + if ((*ldln)->segment_list == NULL) {
4163 + tmp_disk_node = *ldln;
4164 + *empty_disk = tmp_disk_node->disk;
4165 + /* remove disk from list */
4166 + *ldln = (*ldln)->next;
4167 + /* free the disk list node */
4168 + kfree(tmp_disk_node);
4176 +is_extended_partition(struct partition *p)
4178 + return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
4179 + SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
4180 + SYS_IND(p) == LINUX_EXTENDED_PARTITION);
4184 +part_start(struct partition *part, u64 ext_start, u64 ebr_start)
4186 + u64 pstart = START_SECT(part);
4187 + pstart += (is_extended_partition(part)) ? ext_start : ebr_start;
4192 +validate_mbr_ebr(struct evms_logical_node *node,
4193 + struct mbr_ebr *mbr_ebr, u64 ext_start,
4196 + int valid_mbr_ebr, i, j, mbr_flag;
4197 + struct partition *pi, *pj;
4198 + u64 pi_start, pi_end, pj_start, pj_end;
4200 + /* assume an MBR */
4203 + /* assume its valid */
4204 + valid_mbr_ebr = TRUE;
4206 + /* check for valid signature */
4207 + if (mbr_ebr->signature != cpu_to_le16(MSDOS_LABEL_MAGIC)) {
4208 + LOG_DEBUG("%s: invalid signature on '%s'!\n",
4209 + __FUNCTION__, node->name);
4210 + valid_mbr_ebr = FALSE;
4213 + /* check for an AIX IPL signature */
4214 +#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA' */
4215 + if (*(unsigned int *) mbr_ebr == IPLRECID) {
4216 + LOG_DEBUG("%s: found an AIX IPL signature on '%s'\n",
4217 + __FUNCTION__, node->name);
4218 + valid_mbr_ebr = FALSE;
4221 + /* check for boot sector fields */
4223 +#if 0 //Remove checking of the first byte
4225 + /* attempt to make some initial assumptions about
4226 + * what type of data structure this could be. we
4227 + * start by checking the 1st byte. we can tell a
4228 + * few things based on what is or isn't there.
4230 + if (valid_mbr_ebr == TRUE)
4231 + switch (*(u_char *) mbr_ebr) {
4232 + /* check for JMP as 1st instruction
4233 + * if found, assume (for now), that
4234 + * this is a boot sector.
4236 + /* Removed the JMP opcode check because it's not enough to determine
4237 + * that this sector does not have a valid MBR.
4238 + * Note: To avoid going thru validation process of partition table,
4239 + * it's necessary to have a better boot sector check
4240 + * (eg. JMP opcode && other conditions) */
4243 + LOG_DEBUG("%s: boot sector detected!\n", __FUNCTION__);
4244 + valid_mbr_ebr = FALSE;
4246 + /* let this fall thru to pick up the
4247 + * mbr_flag == FALSE.
4250 + /* the MBR should contain boot strap
4251 + * code, so we don't expect the 1st
4252 + * byte to be a 0x0. If the 1st byte
4253 + * IS 0x0, its assumed (for now) to
4260 +#endif //Remove checking of the first byte
4262 + if (valid_mbr_ebr == TRUE) {
4263 + /* dump the partition table entries in debug mode */
4265 + ("%s: disk relative starts: ext_part("PFU64"), ebr("PFU64").\n",
4266 + __FUNCTION__, ext_start, ebr_start);
4267 + for (i = 0; i < 4; i++) {
4268 + pi = &mbr_ebr->partitions[i];
4270 + ("%s: Partition: index(%d), start("PFU64"), size("PFU64"), sys(0x%x).\n",
4271 + __FUNCTION__, i, START_SECT(pi), NR_SECTS(pi),
4275 + /* check for PMBR (Protected Master Boot Record)
4276 + * and skip this node if found
4278 + for (i = 0; i < 4; i++) {
4279 + pi = &mbr_ebr->partitions[i];
4281 + if (SYS_IND(pi) == 0xEE) {
4282 + valid_mbr_ebr = FALSE;
4284 + ("%s: detected PMBR on '%s', skipping.\n",
4285 + __FUNCTION__, node->name);
4290 + /* check if this segment is marked as non-dividable
4291 + * and skip if found
4293 + if (node->iflags & EVMS_TOP_SEGMENT) {
4294 + valid_mbr_ebr = FALSE;
4298 + if (valid_mbr_ebr == TRUE) {
4299 + /* check for mbr/ebr partition table validity */
4300 + for (i = 0; i < 4; i++) {
4301 + pi = &mbr_ebr->partitions[i];
4302 + if (NR_SECTS(pi)) {
4303 + /* check for partition extending past end of node */
4304 + pi_start = part_start(pi, ext_start, ebr_start);
4305 + pi_end = pi_start + NR_SECTS(pi) - 1;
4306 + if (pi_end >= node->total_vsectors) {
4308 + ("%s: partition(%d) ends("PFU64") beyond the end of the disk(%s,"PFU64")!\n",
4309 + __FUNCTION__, i, pi_end,
4310 + node->name, node->total_vsectors);
4311 + valid_mbr_ebr = FALSE;
4313 + if (valid_mbr_ebr == FALSE)
4316 + /* check for partition overlap */
4317 + for (j = i + 1; j < 4; j++) {
4318 + pj = &mbr_ebr->partitions[j];
4319 + if (NR_SECTS(pj)) {
4321 + part_start(pj, ext_start,
4324 + pj_start + NR_SECTS(pj) - 1;
4325 + if (pi_start == pj_start) {
4326 + valid_mbr_ebr = FALSE;
4327 + } else if (pi_start < pj_start) {
4328 + if (pi_end >= pj_start)
4331 + } else if (pi_start <= pj_end)
4332 + valid_mbr_ebr = FALSE;
4334 + if (valid_mbr_ebr == FALSE) {
4336 + ("%s: overlapping partitions(%d,%d) detected on '%s'!\n",
4337 + __FUNCTION__, i, j,
4343 + if (valid_mbr_ebr == FALSE)
4348 + if (valid_mbr_ebr == TRUE) {
4349 + LOG_DEBUG("%s: valid %cBR detected on '%s'!\n", __FUNCTION__,
4350 + (mbr_flag == TRUE) ? 'M' : 'E', node->name);
4352 + LOG_DEBUG("%s: no valid MBR/EBR detected on '%s'!\n",
4353 + __FUNCTION__, node->name);
4355 + return (valid_mbr_ebr);
4359 + * Function: add_segment
4362 +mbr_ebr_process_segment(struct evms_logical_node **discover_list,
4363 + struct evms_logical_node *node,
4366 + unsigned char type, int part_num, char *partition_name)
4368 + struct dos_private *dos_prv = NULL;
4369 + struct evms_logical_node *segment;
4372 + segment = find_segment_on_disk(node, start_sect, nr_sects);
4374 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
4376 + dos_prv = kmalloc(sizeof (*dos_prv), GFP_KERNEL);
4378 + memset(dos_prv, 0, sizeof (*dos_prv));
4379 + dos_prv->source_disk = node;
4380 + dos_prv->start_sect = start_sect;
4381 + dos_prv->nr_sects = nr_sects;
4382 + dos_prv->type = type;
4383 + rc = evms_cs_allocate_logical_node(&segment);
4388 + segment->plugin = &plugin_header;
4389 + segment->system_id = (unsigned int) type;
4390 + segment->total_vsectors = nr_sects;
4391 + segment->block_size = node->block_size;
4392 + segment->hardsector_size = node->hardsector_size;
4393 + segment->private = dos_prv;
4394 + segment->flags = node->flags;
4395 + if (partition_name)
4396 + strcpy(segment->name, partition_name);
4398 + strcpy(segment->name, node->name);
4399 + if (GetPluginType(node->plugin->id) ==
4400 + EVMS_SEGMENT_MANAGER) {
4401 + strcat(segment->name, ".");
4403 + sprintf(segment->name + strlen(segment->name),
4406 + /* watch for super floppy format gpt system partition
4407 + * and don't let it be subdivided
4409 + if (segment->system_id == GPT_ESP_INDICATOR) {
4410 + node->iflags |= EVMS_TOP_SEGMENT;
4412 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
4413 + rc = add_segment_to_disk(node, segment);
4416 + ("%s: error(%d) adding segment '%s'!\n",
4417 + __FUNCTION__, rc, segment->name);
4420 + MOD_INC_USE_COUNT;
4427 + evms_cs_deallocate_logical_node(segment);
4431 + evms_cs_add_logical_node_to_list(discover_list, segment);
4438 +print_partition_info(char *leading_comment, struct partition *p)
4441 + ("%s: boot_ind(0x%02x), sys_ind(0x%02x), startCHS(%u,%u,%u), endCHS(%u,%u,%u), startLBA("PFU64"), sizeLBA("PFU64")\n",
4442 + leading_comment, p->boot_ind, p->sys_ind, p->cyl, p->head,
4443 + p->sector, p->end_cyl, p->end_head, p->end_sector, START_SECT(p),
4447 +#ifdef CONFIG_BSD_DISKLABEL
4448 +#define BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET 1
4450 +print_bsd_partition_info(char *leading_comment, struct bsd_partition *p)
4453 + ("%s: p_size(%u), p_offset(%u), p_fsize(%u), p_fstype(0x%02X), p_frag(0x%02X), p_cpg(%u)\n",
4454 + leading_comment, p->p_size, p->p_offset, p->p_fsize, p->p_fstype,
4455 + p->p_frag, p->p_cpg);
4459 + * bsd_disklabel_partition
4462 + * - 0 for 0 partition
4463 + * - (positive) number for number of BSD partitions found
4464 + * - (negative) error code
4467 +bsd_disklabel_partition(struct evms_logical_node **discover_list,
4468 + struct evms_logical_node *node, struct partition *bsd)
4470 + struct bsd_disklabel *l;
4471 + struct bsd_partition *p;
4472 + int max_partitions;
4477 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4479 + rc = INIT_IO(node,
4482 + BSD_DISKLABEL_PART_TABLE_SECTOR_OFFSET, 1, data);
4487 + l = (struct bsd_disklabel *) data;
4488 + if (l->d_magic == BSD_DISKMAGIC) {
4492 + OPENBSD_PARTITION) ? OPENBSD_MAXPARTITIONS :
4493 + BSD_MAXPARTITIONS);
4494 + if (l->d_npartitions < max_partitions)
4495 + max_partitions = l->d_npartitions;
4496 + for (p = l->d_partitions;
4497 + p - l->d_partitions < max_partitions; p++) {
4498 + if (p->p_fstype != BSD_FS_UNUSED) {
4499 + evmsLOG2(EVMS_INFO_EXTRA,
4500 + (print_bsd_partition_info
4501 + (__FUNCTION__, p)));
4502 + rc = mbr_ebr_process_segment
4503 + (discover_list, node,
4504 + (u64) p->p_offset,
4505 + (u64) p->p_size, p->p_fstype,
4506 + cur_comp_part_num++, NULL);
4518 + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4523 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4524 +#define UNIXWARE_PART_TABLE_SECTOR_OFFSET 29
4527 + * unixware_partition
4530 + * - 0 for 0 partition
4531 + * - (positive) number for number of UNIXWARE partitions found
4532 + * - (negative) error code
4535 +unixware_partition(struct evms_logical_node **discover_list,
4536 + struct evms_logical_node *node,
4537 + struct partition *unixware_part)
4539 + struct unixware_disklabel *l;
4540 + struct unixware_slice *p;
4541 + char *data = NULL;
4545 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4547 + rc = INIT_IO(node,
4549 + START_SECT(unixware_part) +
4550 + UNIXWARE_PART_TABLE_SECTOR_OFFSET, 1, data);
4554 + l = (struct unixware_disklabel *) data;
4555 + if (le32_to_cpu(l->d_magic) == UNIXWARE_DISKMAGIC &&
4556 + le32_to_cpu(l->vtoc.v_magic) == UNIXWARE_DISKMAGIC2) {
4557 + p = &l->vtoc.v_slice[1]; /* The 0th slice is the same as whole disk. */
4558 + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
4559 + if (p->s_label != UNIXWARE_FS_UNUSED) {
4560 + rc = mbr_ebr_process_segment
4561 + (discover_list, node, START_SECT(p),
4562 + NR_SECTS(p), UNIXWARE_PARTITION,
4563 + cur_comp_part_num++, NULL);
4576 + LOG_DETAILS("%s: exported (%d) partitions\n", __FUNCTION__, rc);
4581 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4582 +#define SOLARIS_X86_PART_TABLE_SECTOR_OFFSET 1
4584 + * solaris_x86_partition
4587 + * - 0 for 0 partition
4588 + * - (positive) number for number of solaris partitions found
4589 + * - (negative) error code
4592 +solaris_x86_partition(struct evms_logical_node **discover_list,
4593 + struct evms_logical_node *node,
4594 + struct partition *solaris_x86, int probe_only)
4595 +{ /* if TRUE, do not add segments */
4596 + long offset = START_SECT(solaris_x86);
4597 + struct solaris_x86_vtoc *v;
4598 + struct solaris_x86_slice *s;
4600 + char *data = NULL;
4604 + data = kmalloc(node->hardsector_size, GFP_KERNEL);
4606 + rc = INIT_IO(node,
4608 + START_SECT(solaris_x86) +
4609 + SOLARIS_X86_PART_TABLE_SECTOR_OFFSET, 1, data);
4614 + v = (struct solaris_x86_vtoc *) data;
4616 + if (v->v_sanity == SOLARIS_X86_VTOC_SANE) {
4617 + if (v->v_version != 1) {
4619 + ("%s: cannot handle version %d vtoc>\n",
4620 + __FUNCTION__, v->v_version);
4622 + for (i = 0; i < v->v_nparts; i++) {
4623 + s = &v->v_slice[i];
4625 + ("s[%d] s_tag(%u), s_flag(%u), s_start(%u), s_size(%u), last_sector(%u)\n",
4626 + i, s->s_tag, s->s_flag, s->s_start,
4628 + s->s_start + s->s_size - 1);
4630 + if ((s->s_size == 0)
4631 + || (s->s_tag == 0x05))
4633 + if (!probe_only) {
4634 + rc = mbr_ebr_process_segment
4635 + (discover_list, node,
4636 + (u64) (s->s_start +
4639 + SOLARIS_X86_PARTITION,
4640 + cur_comp_part_num++, NULL);
4653 + LOG_DETAILS("%s: %s (%d) partitions\n",
4654 + __FUNCTION__, probe_only ? " " : "exported", rc);
4660 + * os2lvm_partition() looks for DLAT at last sector of the track containing MBR/EBR
4662 + * Returns: 1 - os2 DLAT was found
4667 +os2lvm_partition(u64 MBR_EBR_sect,
4668 + struct evms_logical_node *node, struct dla_table_sector *dlat)
4670 + struct hd_geometry geometry;
4674 + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO, (unsigned long) &geometry);
4676 + LOG_SERIOUS("%s: ioctl failed(%u) on '%s'\n",
4677 + __FUNCTION__, rc, node->name);
4679 + if (!INIT_IO(node, 0, MBR_EBR_sect + geometry.sectors - 1, 1, dlat))
4681 + if ((dlat->DLA_Signature1 == cpu_to_le32(DLA_TABLE_SIGNATURE1))
4682 + && (dlat->DLA_Signature2 ==
4683 + cpu_to_le32(DLA_TABLE_SIGNATURE2))) {
4684 + crc_hold = le32_to_cpu(dlat->DLA_CRC);
4685 + dlat->DLA_CRC = 0;
4686 + if (evms_cs_calculate_crc
4687 + (EVMS_INITIAL_CRC, (void *) dlat,
4688 + node->hardsector_size) == crc_hold)
4696 +mbr_ebr_process_logical_drive(struct evms_logical_node **discover_list,
4697 + struct evms_logical_node *node,
4698 + struct extended_part *ext_info,
4700 + struct partition *p,
4701 + int os2lvm, struct dla_table_sector *dlat)
4704 + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4706 + LOG_EXTRA("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4707 + __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4709 + if (NR_SECTS(p)) {
4710 + if (is_extended_partition(p)) {
4711 + ext_info->next_ebr_start =
4712 + (u64) (START_SECT(p) +
4713 + START_SECT(ext_info->extended));
4714 + ext_info->done = FALSE; /* not done yet */
4716 + partition_name = NULL;
4717 + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4718 + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4719 + (ext_info->start_sect + START_SECT(p))
4720 + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4722 + && dlat->DLA_Array[i].Drive_Letter != '\0') {
4723 + sprintf(tmp_buf, "os2/%c",
4724 + dlat->DLA_Array[i].Drive_Letter);
4725 + partition_name = tmp_buf;
4727 + evmsLOG2(EVMS_INFO_EXTRA,
4728 + (print_partition_info(__FUNCTION__, p)));
4730 + rc = mbr_ebr_process_segment(discover_list,
4732 + ext_info->start_sect +
4733 + START_SECT(p), NR_SECTS(p),
4735 + cur_comp_part_num++,
4743 +mbr_ebr_process_ebr(struct evms_logical_node **discover_list,
4744 + struct evms_logical_node *node,
4745 + struct extended_part *ext_info, struct mbr_ebr *ebr)
4747 + int rc = 0, i, os2lvm;
4748 + struct partition *p;
4749 + struct dla_table_sector *dlat = NULL;
4751 + /* allocate space for the OS2 DLAT info */
4752 + dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4754 + /* read the dlat for this mbr */
4755 + os2lvm = os2lvm_partition(ext_info->start_sect, node, dlat);
4757 + /* walk thru the partition table in the mbr
4758 + * processing each partition record.
4760 + for (i = 0; i < 4; i++) {
4761 + p = &ebr->partitions[i];
4762 + rc = mbr_ebr_process_logical_drive(discover_list,
4765 + i, p, os2lvm, dlat);
4771 + /* free the space used for OS2 DLAT info */
4779 +mbr_ebr_probe_for_ebr(struct evms_logical_node **discover_list,
4780 + struct evms_logical_node *node,
4781 + struct extended_part *ext_info)
4784 + u_char *sector_buffer = NULL;
4785 + struct mbr_ebr *ebr = NULL;
4787 + /* allocate a sector size buffer */
4788 + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
4789 + if (sector_buffer)
4790 + /* read the location of the mbr sector */
4791 + rc = INIT_IO(node, 0, ext_info->start_sect, 1, sector_buffer);
4796 + ebr = (struct mbr_ebr *) sector_buffer;
4797 + if (validate_mbr_ebr(node, ebr,
4798 + START_SECT(ext_info->extended),
4799 + ext_info->start_sect) == TRUE)
4800 + rc = mbr_ebr_process_ebr(discover_list,
4801 + node, ext_info, ebr);
4804 + if (sector_buffer)
4805 + kfree(sector_buffer);
4811 +mbr_ebr_process_extended_partition(struct evms_logical_node **discover_list,
4812 + struct evms_logical_node *node,
4813 + struct partition *p)
4816 + struct extended_part ext_info;
4818 + memset(&ext_info, 0, sizeof (ext_info));
4819 + ext_info.done = FALSE;
4820 + ext_info.extended = p;
4821 + ext_info.next_ebr_start = START_SECT(p);
4822 + while (ext_info.done == FALSE) {
4823 + ext_info.done = TRUE; /* assume done, unless we find another EBR */
4824 + ext_info.start_sect = ext_info.next_ebr_start;
4825 + rc = mbr_ebr_probe_for_ebr(discover_list, node, &ext_info);
4831 + * is_non_dos_extended
4833 + * This function returns TRUE if the partition entry represents a non-DOS
4834 + * extended partition such as UnixWare, Solaris x86 and BSD
4837 +is_non_dos_extended(struct evms_logical_node **discover_list,
4838 + struct evms_logical_node *node, struct partition *p)
4840 + if (NR_SECTS(p)) {
4841 +#ifdef CONFIG_BSD_DISKLABEL
4842 + if (SYS_IND(p) == BSD_PARTITION ||
4843 + SYS_IND(p) == NETBSD_PARTITION ||
4844 + SYS_IND(p) == OPENBSD_PARTITION)
4848 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4849 + if (SYS_IND(p) == UNIXWARE_PARTITION)
4853 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4854 + if ((SYS_IND(p) == SOLARIS_X86_PARTITION) &&
4855 + (solaris_x86_partition(discover_list, node, p, TRUE) > 0))
4863 + * mbr_ebr_process_other_primary_partition
4864 + * This function processes other (non-DOS) primary partitions such as
4865 + * UnixWare, Solaris x86 and BSD
4868 +mbr_ebr_process_other_primary_partition(struct evms_logical_node
4870 + struct evms_logical_node *node,
4871 + struct partition *p)
4873 + if (NR_SECTS(p)) {
4874 +#ifdef CONFIG_BSD_DISKLABEL
4875 + if (SYS_IND(p) == BSD_PARTITION ||
4876 + SYS_IND(p) == NETBSD_PARTITION ||
4877 + SYS_IND(p) == OPENBSD_PARTITION)
4878 + return bsd_disklabel_partition(discover_list, node, p);
4881 +#ifdef CONFIG_UNIXWARE_DISKLABEL
4882 + if (SYS_IND(p) == UNIXWARE_PARTITION)
4883 + return unixware_partition(discover_list, node, p);
4886 +#ifdef CONFIG_SOLARIS_X86_PARTITION
4887 + if (SYS_IND(p) == SOLARIS_X86_PARTITION)
4888 + return solaris_x86_partition(discover_list, node, p,
4896 +mbr_ebr_process_dos_primary_partition(struct evms_logical_node **discover_list,
4897 + struct evms_logical_node *node,
4899 + struct partition *p,
4900 + int os2lvm, struct dla_table_sector *dlat)
4903 + char tmp_buf[EVMS_VOLUME_NAME_SIZE], *partition_name;
4905 + LOG_EVERYTHING("%s: PartitionTableIndex(%i), Start("PFU64"), Size("PFU64")\n",
4906 + __FUNCTION__, i, START_SECT(p), NR_SECTS(p));
4908 + if (NR_SECTS(p)) {
4910 + if (is_extended_partition(p))
4911 + rc = mbr_ebr_process_extended_partition(discover_list,
4915 + partition_name = NULL;
4916 + if (os2lvm && p->sys_ind != LVM_PARTITION_INDICATOR &&
4917 + le32_to_cpu(dlat->DLA_Array[i].Partition_Start) ==
4919 + && le32_to_cpu(dlat->DLA_Array[i].Partition_Size) ==
4921 + && dlat->DLA_Array[i].Drive_Letter != '\0') {
4922 + sprintf(tmp_buf, "os2/%c",
4923 + dlat->DLA_Array[i].Drive_Letter);
4924 + partition_name = tmp_buf;
4926 + evmsLOG2(EVMS_INFO_EXTRA,
4927 + (print_partition_info(__FUNCTION__, p)));
4929 + rc = mbr_ebr_process_segment(discover_list,
4934 + i + 1, partition_name);
4941 +mbr_ebr_process_mbr(struct evms_logical_node **discover_list,
4942 + struct evms_logical_node *node, struct mbr_ebr *mbr)
4944 + int rc = 0, i, os2lvm;
4945 + struct partition *p;
4946 + struct dla_table_sector *dlat = NULL;
4948 + cur_comp_part_num = 5; /* set this value for each disk */
4950 + /* allocate space for the OS2 DLAT info */
4951 + dlat = kmalloc(node->hardsector_size, GFP_KERNEL);
4953 + /* read the dlat for this mbr */
4954 + os2lvm = os2lvm_partition(0, node, dlat);
4956 + /* Pass 1: walk thru the partition table in the mbr
4957 + * processing each partition record.
4959 + for (i = 0; i < 4; i++) {
4960 + p = &mbr->partitions[i];
4961 + if (is_non_dos_extended(discover_list, node, p)) {
4963 + (" Found and skip a non-dos extended partition.\n");
4967 + mbr_ebr_process_dos_primary_partition(discover_list,
4973 + /* Pass 2: walk thru the partition table in the mbr
4974 + * processing each partition record for non-DOS extended partitions
4976 + for (i = 0; i < 4; i++) {
4977 + p = &mbr->partitions[i];
4978 + mbr_ebr_process_other_primary_partition(discover_list,
4986 + /* free the space used for OS2 DLAT info */
4994 +mbr_ebr_probe_for_mbr(struct evms_logical_node **discover_list,
4995 + struct evms_logical_node *node)
4998 + u_char *sector_buffer = NULL;
4999 + struct mbr_ebr *mbr = NULL;
5001 + LOG_DEBUG("%s: probing (%s).\n", __FUNCTION__, node->name);
5003 + /* allocate a sector size buffer */
5004 + sector_buffer = kmalloc(node->hardsector_size, GFP_KERNEL);
5005 + if (sector_buffer)
5006 + /* read the location of the mbr sector */
5007 + rc = INIT_IO(node, 0, 0, 1, sector_buffer);
5011 + LOG_ERROR("%s: read error(%d) on '%s'.\n",
5012 + __FUNCTION__, rc, node->name);
5014 + mbr = (struct mbr_ebr *) sector_buffer;
5015 + if (validate_mbr_ebr(node, mbr, 0, 0) == TRUE) {
5016 + /* since it looks like this disk has a
5017 + * valid MBR, remove the disk node from
5018 + * the discover list. it may already be
5019 + * on the global list, or it will be
5020 + * added to it. in the case of an mbr
5021 + * with no partitions, it is simply
5022 + * removed and forgotten. when one or
5023 + * more partitions are created, the
5024 + * disk will be examined and handled
5025 + * properly during the following
5026 + * rediscover operation.
5028 + evms_cs_remove_logical_node_from_list(discover_list,
5031 + rc = mbr_ebr_process_mbr(discover_list, node, mbr);
5035 + if (sector_buffer)
5036 + kfree(sector_buffer);
5042 + * Function: mbr_ebr_partition_discover
5046 +mbr_ebr_partition_discover(struct evms_logical_node **discover_list)
5049 + struct evms_logical_node *node, *next_node;
5051 + MOD_INC_USE_COUNT;
5052 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
5054 + /* initialize global variable */
5055 + exported_nodes = 0;
5057 + /* examine each node on the discover list */
5058 + next_node = *discover_list;
5059 + while (next_node) {
5061 + next_node = node->next;
5062 + if (node->plugin->id == plugin_header.id)
5063 + /* don't recurse into our own objects
5066 + mbr_ebr_probe_for_mbr(discover_list, node);
5069 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
5070 + __FUNCTION__, exported_nodes, rc);
5071 + if (exported_nodes)
5072 + rc = exported_nodes;
5073 + MOD_DEC_USE_COUNT;
5078 + * Function: mbr_ebr_partition_delete
5082 +mbr_ebr_partition_delete(struct evms_logical_node *segment)
5085 + struct dos_private *dos_prv;
5086 + struct evms_logical_node *empty_disk = NULL;
5088 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
5093 + dos_prv = segment->private;
5095 + /* remove the segment from the
5096 + * disk's segment list
5098 + rc = remove_segment_from_disk(dos_prv->source_disk,
5099 + segment, &empty_disk);
5100 + /* free the local instance data */
5103 + /* free the segment node */
5104 + evms_cs_deallocate_logical_node(segment);
5105 + MOD_DEC_USE_COUNT;
5106 + /* if the last segment on the disk was
5107 + * deleted, delete the disk node too
5110 + DELETE(empty_disk);
5116 + * function: mbr_ebr_partition_io_error
5118 + * this function was primarily created because the function
5119 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
5120 + * to be set on inline functions. Since this was an error path
5121 + * and not mainline, I decided to add a trace statement to help
5122 + * report on the failing condition.
5126 +mbr_ebr_partition_io_error(struct evms_logical_node *node,
5127 + int io_flag, struct buffer_head *bh)
5130 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector("PFU64").\n",
5131 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
5132 + (u64) bh->b_rsector);
5134 + bh->b_end_io(bh, 0);
5138 + * Function: mbr_ebr_partition_read
5142 +mbr_ebr_partition_read(struct evms_logical_node *partition,
5143 + struct buffer_head *bh)
5145 + struct dos_private *dos_prv = partition->private;
5147 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5148 + partition->total_vsectors) {
5149 + bh->b_rsector += dos_prv->start_sect;
5150 + R_IO(dos_prv->source_disk, bh);
5152 + mbr_ebr_partition_io_error(partition, READ, bh);
5156 + * Function: mbr_ebr_partition_write
5160 +mbr_ebr_partition_write(struct evms_logical_node *partition,
5161 + struct buffer_head *bh)
5163 + struct dos_private *dos_prv = partition->private;
5165 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
5166 + partition->total_vsectors) {
5167 + bh->b_rsector += dos_prv->start_sect;
5168 + W_IO(dos_prv->source_disk, bh);
5170 + mbr_ebr_partition_io_error(partition, WRITE, bh);
5174 + * Function: mbr_ebr_partition_init_io
5178 +mbr_ebr_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
5179 + u64 sect_nr, /* disk LBA */
5180 + u64 num_sects, /* # of sectors */
5182 +{ /* buffer address */
5184 + struct dos_private *dos_prv = partition->private;
5186 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
5187 + rc = INIT_IO(dos_prv->source_disk, io_flag,
5188 + sect_nr + dos_prv->start_sect, num_sects,
5192 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
5193 + (io_flag) ? "WRITE" : "READ", partition->name,
5194 + (dos_prv->nr_sects - 1), sect_nr, num_sects);
5202 + * Function: mbr_ebr_partition_ioctl
5206 +mbr_ebr_partition_ioctl(struct evms_logical_node *partition,
5207 + struct inode *inode,
5208 + struct file *file, unsigned int cmd, unsigned long arg)
5210 + struct dos_private *dos_prv;
5211 + struct hd_geometry hd_geo;
5215 + dos_prv = partition->private;
5221 + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5224 + if (copy_from_user
5225 + (&hd_geo, (void *) arg,
5226 + sizeof (struct hd_geometry)))
5230 + hd_geo.start = dos_prv->start_sect;
5232 + ((void *) arg, &hd_geo,
5233 + sizeof (struct hd_geometry)))
5237 + case EVMS_GET_BMAP:
5239 + struct evms_get_bmap_pkt *bmap =
5240 + (struct evms_get_bmap_pkt *) arg;
5241 + bmap->rsector += dos_prv->start_sect;
5242 + /* intentionally fall thru to
5243 + * default ioctl down to device
5248 + rc = IOCTL(dos_prv->source_disk, inode, file, cmd, arg);
5254 + * Function: dos_part_init
5258 +dos_part_init(void)
5260 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
5264 +dos_part_exit(void)
5266 + evms_cs_unregister_plugin(&plugin_header);
5269 +module_init(dos_part_init);
5270 +module_exit(dos_part_exit);
5271 +#ifdef MODULE_LICENSE
5272 +MODULE_LICENSE("GPL");
5274 diff -Naur linux-2002-09-30/drivers/evms/evms.c evms-2002-09-30/drivers/evms/evms.c
5275 --- linux-2002-09-30/drivers/evms/evms.c Wed Dec 31 18:00:00 1969
5276 +++ evms-2002-09-30/drivers/evms/evms.c Thu Sep 26 11:55:45 2002
5278 +/* -*- linux-c -*- */
5282 + * Copyright (c) International Business Machines Corp., 2000
5284 + * This program is free software; you can redistribute it and/or modify
5285 + * it under the terms of the GNU General Public License as published by
5286 + * the Free Software Foundation; either version 2 of the License, or
5287 + * (at your option) any later version.
5289 + * This program is distributed in the hope that it will be useful,
5290 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
5291 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
5292 + * the GNU General Public License for more details.
5294 + * You should have received a copy of the GNU General Public License
5295 + * along with this program; if not, write to the Free Software
5296 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
5302 + * linux/drivers/evms/evms.c
5304 + * EVMS Base and Common Services
5308 +#define DEVICE_NR(device) MINOR(device) /* evms has no partition bits */
5309 +#define DEVICE_NAME "evms" /* name for messaging */
5310 +#define DEVICE_NO_RANDOM /* no entropy to contribute */
5311 +#define DEVICE_OFF(d) /* do nothing */
5313 +//#define LOCAL_DEBUG 1
5315 +#include <linux/config.h>
5316 +#include <linux/module.h>
5317 +#include <linux/errno.h>
5318 +#include <linux/kernel.h>
5319 +#include <linux/init.h>
5320 +#include <linux/fs.h>
5321 +#include <linux/slab.h>
5322 +#include <asm/uaccess.h>
5323 +#include <linux/blk.h> /* must be included by all block drivers */
5324 +#include <linux/blkdev.h>
5325 +#include <linux/blkpg.h>
5326 +#include <linux/iobuf.h>
5327 +#include <linux/genhd.h>
5328 +#include <linux/sched.h>
5329 +#include <linux/completion.h>
5330 +#include <linux/version.h>
5331 +#include <linux/swap.h>
5332 +#include <net/checksum.h>
5333 +#include <linux/sysctl.h>
5334 +#include <linux/smp_lock.h>
5335 +#include <linux/reboot.h>
5336 +#include <linux/compiler.h>
5337 +#include <linux/evms/evms.h>
5339 +//#define VFS_PATCH_PRESENT
5341 +/* prefix used in logging messages */
5344 +struct evms_registered_plugin {
5345 + struct evms_plugin_header *plugin;
5346 + struct evms_registered_plugin *next;
5348 +static struct evms_registered_plugin *registered_plugin_head = NULL;
5350 +static struct evms_list_node *evms_global_device_list = NULL;
5351 +static struct evms_list_node *evms_global_feature_node_list = NULL;
5352 +static struct evms_list_node *evms_global_notify_list = NULL;
5354 +int evms_info_level = EVMS_INFO_LEVEL;
5355 +struct proc_dir_entry *evms_proc_dir = NULL;
5356 +EXPORT_SYMBOL(evms_info_level);
5357 +static struct evms_logical_volume *evms_logical_volumes;
5358 +static int evms_volumes = 0;
5359 +/* a few variables to aid in detecting memory leaks.
5360 + * these variables are always in use, regardless of
5361 + * the state of EVMS_MEM_DEBUG.
5363 +static atomic_t evms_allocs = (atomic_t) ATOMIC_INIT(0);
5364 +static atomic_t evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
5366 +u8 *evms_primary_string = "primary";
5367 +EXPORT_SYMBOL(evms_primary_string);
5368 +u8 *evms_secondary_string = "secondary";
5369 +EXPORT_SYMBOL(evms_secondary_string);
5371 +static struct evms_version evms_svc_version = {
5372 + .major = EVMS_COMMON_SERVICES_MAJOR,
5373 + .minor = EVMS_COMMON_SERVICES_MINOR,
5374 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
5377 +/* Handles for "private" EVMS object pools */
5378 +static struct evms_pool_mgmt *evms_io_notify_pool;
5380 +/* Handles for "public" EVMS object pools */
5381 +struct evms_pool_mgmt *evms_bh_pool;
5382 +EXPORT_SYMBOL(evms_bh_pool);
5384 +/* Handle for the devfs directory entry */
5385 +devfs_handle_t evms_dir_devfs_handle;
5386 +devfs_handle_t evms_blk_devfs_handle;
5388 +/**********************************************************/
5389 +/* SYSCTL - EVMS folder */
5390 +/**********************************************************/
5392 +#ifdef CONFIG_PROC_FS
5393 +static struct ctl_table_header *evms_table_header;
5394 +static int evms_info_level_min = EVMS_INFO_CRITICAL;
5395 +static int evms_info_level_max = EVMS_INFO_EVERYTHING;
5397 +static ctl_table evms_table[] = {
5398 + {DEV_EVMS_INFO_LEVEL, "evms_info_level",
5399 + &evms_info_level, sizeof (int), 0644, NULL,
5400 + &proc_dointvec_minmax, &sysctl_intvec,
5401 + NULL, &evms_info_level_min, &evms_info_level_max},
5405 +static ctl_table evms_dir_table[] = {
5406 + {DEV_EVMS, "evms", NULL, 0, 0555, evms_table},
5410 +static ctl_table dev_dir_table[] = {
5411 + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
5416 +/**********************************************************/
5417 +/* START -- arch ioctl32 support */
5418 +/**********************************************************/
5419 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
5420 +#include <linux/evms/evms_bbr_k.h>
5421 +#include <linux/raid/md.h>
5423 +extern asmlinkage long
5424 +sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
5427 +register_ioctl32_conversion(unsigned int cmd, void *handler);
5430 +unregister_ioctl32_conversion(unsigned int cmd);
5432 +#define uvirt_to_kernel(__x) ((unsigned long)(__x))
5433 +typedef unsigned int __uvirt_addr;
5435 +struct evms_sector_io32 {
5438 + u64 starting_sector;
5440 + __uvirt_addr buffer_address;
5444 +struct evms_rediscover32 {
5447 + __uvirt_addr drive_array;
5450 +struct evms_compute_csum32 {
5451 + __uvirt_addr buffer_address;
5458 +struct evms_plugin_ioctl32 {
5460 + s32 feature_command;
5462 + __uvirt_addr feature_ioctl_data;
5465 +struct evms_notify_bbr32 {
5466 + char object_name[EVMS_VOLUME_NAME_SIZE+1];
5470 + __uvirt_addr buffer;
5474 +#define EVMS_MD_ID 4
5475 +#define EVMS_MD_PERS_IOCTL_CMD 1
5476 +#define EVMS_MD_ADD 2
5477 +#define EVMS_MD_REMOVE 3
5478 +#define EVMS_MD_ACTIVATE 4
5479 +#define EVMS_MD_DEACTIVATE 5
5480 +#define EVMS_MD_GET_ARRAY_INFO 6
5481 +#define EVMS_MD_RAID5_INIT_IO 1
5483 +struct evms_md_ioctl {
5489 +struct evms_md_ioctl32 {
5495 +struct evms_md_array_info {
5496 + unsigned long state;
5500 +struct evms_md_array_info32 {
5505 +struct raid5_ioctl_init_io {
5512 +struct raid5_ioctl_init_io32 {
5516 + __uvirt_addr data;
5519 +#define EVMS_MD_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5520 + (EVMS_REGION_MANAGER << 12) | EVMS_MD_ID)
5521 +#define EVMS_BBR_PLUGIN_ID ((IBM_OEM_ID << 16) | \
5522 + (EVMS_FEATURE << 12) | EVMS_BBR_FEATURE_ID)
5524 +#define EVMS_SECTOR_IO_32 _IOWR(EVMS_MAJOR, \
5525 + EVMS_SECTOR_IO_NUMBER, \
5526 + struct evms_sector_io32)
5527 +#define EVMS_REDISCOVER_VOLUMES_32 _IOWR(EVMS_MAJOR, \
5528 + EVMS_REDISCOVER_VOLUMES_NUMBER, \
5529 + struct evms_rediscover32)
5530 +#define EVMS_COMPUTE_CSUM_32 _IOWR(EVMS_MAJOR, \
5531 + EVMS_COMPUTE_CSUM_NUMBER, \
5532 + struct evms_compute_csum32)
5533 +#define EVMS_PLUGIN_IOCTL_32 _IOR(EVMS_MAJOR, \
5534 + EVMS_PLUGIN_IOCTL_NUMBER, \
5535 + struct evms_plugin_ioctl32)
5537 +static int evms_sector_io(unsigned int fd,
5539 + unsigned long arg)
5541 + mm_segment_t old_fs = get_fs();
5542 + struct evms_sector_io32 parms32;
5543 + struct evms_sector_io_pkt parms;
5544 + unsigned int kcmd;
5548 + if (copy_from_user(&parms32, (struct evms_sector_io32 *)arg,
5549 + sizeof(struct evms_sector_io32)))
5552 + parms.disk_handle = parms32.disk_handle;
5553 + parms.io_flag = parms32.io_flag;
5554 + parms.starting_sector = parms32.starting_sector;
5555 + parms.sector_count = parms32.sector_count;
5556 + parms.buffer_address = (u8 *)uvirt_to_kernel(parms32.buffer_address);
5559 + kcmd = EVMS_SECTOR_IO;
5562 + set_fs(KERNEL_DS);
5563 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5566 + parms32.status = parms.status;
5568 + if (copy_to_user((struct evms_sector_io32 *)arg, &parms32,
5569 + sizeof(struct evms_sector_io32)))
5575 +static int evms_rediscover(unsigned int fd,
5577 + unsigned long arg)
5579 + mm_segment_t old_fs = get_fs();
5580 + struct evms_rediscover32 parms32;
5581 + struct evms_rediscover_pkt parms;
5582 + unsigned int kcmd;
5586 + if (copy_from_user(&parms32, (struct evms_rediscover32 *)arg,
5587 + sizeof(struct evms_rediscover32)))
5590 + parms.drive_count = parms32.drive_count;
5591 + parms.drive_array = (void *)uvirt_to_kernel(parms32.drive_array);
5594 + kcmd = EVMS_REDISCOVER_VOLUMES;
5597 + set_fs(KERNEL_DS);
5598 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5601 + parms32.status = parms.status;
5603 + if (copy_to_user((struct evms_rediscover32 *)arg, &parms32,
5604 + sizeof(struct evms_rediscover32)))
5610 +static int evms_compute_csum(unsigned int fd,
5612 + unsigned long arg)
5614 + mm_segment_t old_fs = get_fs();
5615 + struct evms_compute_csum32 parms32;
5616 + struct evms_compute_csum_pkt parms;
5617 + unsigned int kcmd;
5621 + if (copy_from_user(&parms32, (struct evms_compute_csum32 *)arg,
5622 + sizeof(struct evms_compute_csum32)))
5625 + parms.insum = parms32.insum;
5626 + parms.outsum = parms32.outsum;
5627 + parms.buffer_size = parms32.buffer_size;
5628 + parms.buffer_address = (void *)uvirt_to_kernel(parms32.buffer_address);
5631 + kcmd = EVMS_COMPUTE_CSUM;
5634 + set_fs(KERNEL_DS);
5635 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5638 + parms32.status = parms.status;
5639 + parms32.outsum = parms.outsum;
5641 + if (copy_to_user((struct evms_compute_csum32 *)arg, &parms32,
5642 + sizeof(struct evms_compute_csum32)))
5648 +static int evms_bbr_plugin_ioctl(unsigned int fd,
5650 + unsigned long arg)
5652 + mm_segment_t old_fs = get_fs();
5653 + struct evms_notify_bbr32 bbr_parms32;
5654 + struct evms_notify_bbr bbr_parms;
5655 + struct evms_plugin_ioctl_pkt *parms =
5656 + (struct evms_plugin_ioctl_pkt *)arg;
5657 + void *old_ptr = NULL;
5660 + if (copy_from_user(&bbr_parms32,
5661 + (struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5662 + sizeof(struct evms_notify_bbr32)))
5665 + memcpy(&bbr_parms, &bbr_parms32, sizeof(struct evms_notify_bbr32));
5666 + bbr_parms.buffer = (void *)uvirt_to_kernel(bbr_parms32.buffer);
5667 + bbr_parms.rw = bbr_parms32.rw;
5668 + old_ptr = parms->feature_ioctl_data;
5669 + parms->feature_ioctl_data = &bbr_parms;
5671 + set_fs(KERNEL_DS);
5672 + rc = sys_ioctl(fd, cmd, arg);
5675 + parms->feature_ioctl_data = old_ptr;
5678 + bbr_parms32.nr_sect = bbr_parms.nr_sect;
5679 + rc = copy_to_user((struct evms_notify_bbr32 *)parms->feature_ioctl_data,
5681 + sizeof(struct evms_notify_bbr32));
5687 +static int evms_md_plugin_ioctl(unsigned int fd,
5689 + unsigned long arg)
5691 + mm_segment_t old_fs = get_fs();
5692 + void *old_ptr = NULL;
5693 + void *old_md_ptr = NULL;
5694 + struct evms_md_ioctl32 md_parms32;
5695 + struct evms_md_ioctl md_parms;
5696 + struct evms_md_array_info32 md_array_parms32;
5697 + struct evms_md_array_info md_array_parms;
5698 + struct raid5_ioctl_init_io32 r5_init_io_parms32;
5699 + struct raid5_ioctl_init_io r5_init_io_parms;
5700 + struct evms_plugin_ioctl_pkt *parms =
5701 + (struct evms_plugin_ioctl_pkt *)arg;
5704 + if (copy_from_user(&md_parms32,
5705 + (struct evms_md_ioctl*)parms->feature_ioctl_data,
5706 + sizeof(struct evms_md_ioctl32)))
5709 + md_parms.mddev_idx = md_parms32.mddev_idx;
5710 + md_parms.cmd = md_parms32.cmd;
5711 + md_parms.arg = (void *)uvirt_to_kernel(md_parms32.arg);
5712 + old_ptr = parms->feature_ioctl_data;
5713 + parms->feature_ioctl_data = &md_parms;
5715 + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5716 + if (copy_from_user(&md_array_parms32,
5717 + (struct evms_md_array_info32*)md_parms.arg,
5718 + sizeof(struct evms_md_array_info32)))
5721 + md_array_parms.state = md_array_parms32.state;
5722 + md_array_parms.sb =
5723 + (void *)uvirt_to_kernel(md_array_parms32.sb);
5724 + old_md_ptr = (void *)md_parms.arg;
5725 + md_parms.arg = &md_array_parms;
5726 + } else if (parms->feature_command == EVMS_MD_PERS_IOCTL_CMD) {
5727 + if (md_parms.cmd == EVMS_MD_RAID5_INIT_IO) {
5728 + if (copy_from_user(&r5_init_io_parms32,
5729 + (struct raid5_ioctl_init_io32*)md_parms.arg,
5730 + sizeof(struct raid5_ioctl_init_io32)))
5733 + r5_init_io_parms.rw = r5_init_io_parms32.rw;
5734 + r5_init_io_parms.lsn = r5_init_io_parms32.lsn;
5735 + r5_init_io_parms.nr_sects = r5_init_io_parms32.nr_sects;
5736 + r5_init_io_parms.data =
5737 + (void *)uvirt_to_kernel(r5_init_io_parms32.data);
5738 + old_md_ptr = (void *)md_parms.arg;
5739 + md_parms.arg = &r5_init_io_parms;
5743 + set_fs(KERNEL_DS);
5744 + rc = sys_ioctl(fd, cmd, arg);
5747 + parms->feature_ioctl_data = old_ptr;
5748 + md_parms.arg = old_md_ptr;
5751 + if (parms->feature_command == EVMS_MD_GET_ARRAY_INFO) {
5752 + md_array_parms32.state = md_array_parms.state;
5753 + rc = copy_to_user((struct evms_md_array_info32 *)md_parms.arg,
5754 + &md_array_parms32,
5755 + sizeof(struct evms_md_array_info32));
5758 + md_parms32.mddev_idx = md_parms.mddev_idx;
5759 + rc = copy_to_user((struct evms_md_ioctl*)parms->feature_ioctl_data,
5761 + sizeof(struct evms_md_ioctl32));
5768 +static int evms_plugin_ioctl(unsigned int fd,
5770 + unsigned long arg)
5772 + mm_segment_t old_fs = get_fs();
5773 + struct evms_plugin_ioctl32 parms32;
5774 + struct evms_plugin_ioctl_pkt parms;
5775 + unsigned int kcmd;
5779 + if (copy_from_user(&parms32, (struct evms_plugin_ioctl32 *)arg,
5780 + sizeof(struct evms_plugin_ioctl32)))
5783 + parms.feature_id = parms32.feature_id;
5784 + parms.feature_command = parms32.feature_command;
5785 + parms.status = parms32.status;
5786 + parms.feature_ioctl_data =
5787 + (void *)uvirt_to_kernel(parms32.feature_ioctl_data);
5789 + kcmd = EVMS_PLUGIN_IOCTL;
5792 + switch (parms.feature_id) {
5793 + case EVMS_MD_PLUGIN_ID:
5794 + rc = evms_md_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5796 + case EVMS_BBR_PLUGIN_ID:
5797 + rc = evms_bbr_plugin_ioctl(fd, kcmd, (unsigned long)karg);
5800 + set_fs(KERNEL_DS);
5801 + rc = sys_ioctl(fd, kcmd, (unsigned long)karg);
5806 + parms32.status = parms.status;
5807 + rc = copy_to_user((struct evms_plugin_ioctl32 *)arg, &parms32,
5808 + sizeof(struct evms_plugin_ioctl32));
5815 +/**********************************************************/
5816 +/* START -- exported functions/Common Services */
5817 +/**********************************************************/
5820 + * Function: evms_cs_get_version
5821 + * Description: This function returns the current EVMS version
5824 +evms_cs_get_version(int *major, int *minor)
5826 + *major = EVMS_MAJOR_VERSION;
5827 + *minor = EVMS_MINOR_VERSION;
5830 +EXPORT_SYMBOL(evms_cs_get_version);
5833 +evms_cs_check_version(struct evms_version *required,
5834 + struct evms_version *actual)
5836 + if (required->major != actual->major)
5838 + else if (required->minor > actual->minor)
5840 + else if (required->minor == actual->minor)
5841 + if (required->patchlevel > actual->patchlevel)
5846 +EXPORT_SYMBOL(evms_cs_check_version);
5849 +evms_cs_allocate_logical_node(struct evms_logical_node **pp)
5851 + *pp = kmalloc(sizeof (struct evms_logical_node), GFP_KERNEL);
5853 + memset(*pp, 0, sizeof (struct evms_logical_node));
5854 + atomic_inc(&evms_logical_nodes);
5860 +EXPORT_SYMBOL(evms_cs_allocate_logical_node);
5863 +evms_cs_deallocate_volume_info(struct evms_logical_node *p)
5865 + if (p->iflags & EVMS_FEATURE_BOTTOM) {
5866 + evms_cs_remove_item_from_list(&evms_global_feature_node_list,
5868 + kfree(p->volume_info);
5869 + p->volume_info = NULL;
5870 + p->iflags &= ~EVMS_FEATURE_BOTTOM;
5874 +EXPORT_SYMBOL(evms_cs_deallocate_volume_info);
5877 +evms_cs_deallocate_logical_node(struct evms_logical_node *p)
5881 + ("Deallocating object whose NEXT ptr is not null!!\n");
5883 + evms_cs_deallocate_volume_info(p);
5884 + if (p->feature_header) {
5885 + kfree(p->feature_header);
5886 + p->feature_header = NULL;
5889 + atomic_dec(&evms_logical_nodes);
5892 +EXPORT_SYMBOL(evms_cs_deallocate_logical_node);
5895 + * Function: evms_cs_register_plugin
5896 + * Description: This function is exported so that all plugins can register with EVMS
5899 +evms_cs_register_plugin(struct evms_plugin_header *plugin)
5902 + struct evms_registered_plugin *reg_record, **pp;
5903 + struct evms_version *ver;
5905 + ver = &plugin->required_services_version;
5908 + ("registering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5909 + GetPluginOEM(plugin->id), GetPluginType(plugin->id),
5910 + GetPluginID(plugin->id), plugin->version.major,
5911 + plugin->version.minor, plugin->version.patchlevel, ver->major,
5912 + ver->minor, ver->patchlevel);
5914 + /* check common services requirements */
5915 + rc = evms_cs_check_version(ver, &evms_svc_version);
5918 + ("plugin failed to load: common services (vers:%d,%d,%d) incompatibility!\n",
5919 + EVMS_COMMON_SERVICES_MAJOR, EVMS_COMMON_SERVICES_MINOR,
5920 + EVMS_COMMON_SERVICES_PATCHLEVEL);
5923 + /* ensure a plugin with this feature id is
5924 + * not already loaded.
5926 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
5927 + if ((*pp)->plugin->id == plugin->id) {
5930 + ("error(%d) attempting to load another plugin with id(%x).\n",
5936 + /* ensure the plugin has provided functions for
5937 + * the mandatory entry points.
5939 + if (!plugin->fops->discover) {
5941 + } else if (!plugin->fops->init_io) {
5943 + } else if (!plugin->fops->ioctl) {
5945 + } else if (!plugin->fops->read) {
5947 + } else if (!plugin->fops->write) {
5949 + } else if (!plugin->fops->delete) {
5954 + /* allocate a new plugin registration record */
5956 + kmalloc(sizeof (struct evms_registered_plugin), GFP_KERNEL);
5957 + if (!reg_record) {
5962 + memset(reg_record, 0, sizeof (struct evms_registered_plugin));
5963 + /* store ptr to plugin header in new registration record */
5964 + reg_record->plugin = plugin;
5966 + /* terminate the record */
5967 + reg_record->next = NULL;
5969 + /* find end of the plugin registration list */
5970 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) ;
5971 + /* add registration record to list */
5974 + /* increment the usage count */
5975 + MOD_INC_USE_COUNT;
5981 +EXPORT_SYMBOL(evms_cs_register_plugin);
5984 + * Function: evms_cs_unregister_plugin
5985 + * Description: This function is exported so that all plugins can
5986 + * unregister with EVMS
5989 +evms_cs_unregister_plugin(struct evms_plugin_header *plugin)
5991 + int rc = 0, found = FALSE;
5992 + struct evms_registered_plugin **pp;
5993 + struct evms_version *ver;
5995 + ver = &plugin->required_services_version;
5998 + ("unregistering plugin (plugin.id=%d.%d.%d, plugin.ver=%d.%d.%d, req.svc.ver=%d.%d.%d)\n",
5999 + GetPluginOEM(plugin->id), GetPluginType(plugin->id),
6000 + GetPluginID(plugin->id), plugin->version.major,
6001 + plugin->version.minor, plugin->version.patchlevel, ver->major,
6002 + ver->minor, ver->patchlevel);
6003 + /* ensure a plugin with this feature id is
6004 + * currently loaded.
6006 + for (pp = &registered_plugin_head; *pp; pp = &(*pp)->next) {
6007 + if ((*pp)->plugin->id == plugin->id) {
6015 + ("error(%d) attempt to unload a non-loaded plugin with id(%x).\n",
6018 + /* actually unload the plugin now */
6020 + struct evms_registered_plugin *tmp = *pp;
6022 + /* remove the plugin record from our
6023 + * internal plugin list
6025 + *pp = (*pp)->next;
6026 + /* deallocate the plugin registration record
6030 + /* decrement the usage count */
6031 + MOD_DEC_USE_COUNT;
6036 +EXPORT_SYMBOL(evms_cs_unregister_plugin);
6038 +/* function: evms_cs_add_logical_node_to_list
6040 + * This functions adds a new logical node to the end of a
6043 + * NOTE: This function is only expected to be called at
6044 + * discovery time, which is singled threaded by nature,
6045 + * and therefore doesn't need to be made SMP safe.
6048 +evms_cs_add_logical_node_to_list(struct evms_logical_node **list_head,
6049 + struct evms_logical_node *node)
6052 + struct evms_logical_node **pp = NULL;
6054 + /* check to make sure node is not already on a list */
6058 + /* check to make sure node being added is not already in the list */
6059 + for (pp = list_head; *pp; pp = &(*pp)->next)
6060 + if (*pp == node) {
6065 + /* add node to the end of the list */
6072 +EXPORT_SYMBOL(evms_cs_add_logical_node_to_list);
6074 +/* function: evms_cs_remove_logical_node_from_list
6076 + * This functions removes a new logical node from a node list.
6078 + * NOTE: This function is only expected to be called at
6079 + * discovery time, which is singled threaded by nature,
6080 + * and therefore doesn't need to be made SMP safe.
6083 +evms_cs_remove_logical_node_from_list(struct evms_logical_node **list_head,
6084 + struct evms_logical_node *node)
6086 + /* remove this node from the head of the list */
6087 + int rc = 1; /* assume failure until target node is found */
6088 + struct evms_logical_node **pp;
6089 + for (pp = list_head; *pp; pp = &(*pp)->next)
6090 + if (*pp == node) {
6091 + *pp = (*pp)->next;
6092 + node->next = NULL;
6099 +EXPORT_SYMBOL(evms_cs_remove_logical_node_from_list);
6102 +evms_cs_kernel_ioctl(struct evms_logical_node *node, unsigned int cmd,
6103 + unsigned long arg)
6106 + struct inode tmp_inode;
6112 + rc = IOCTL(node, &tmp_inode, NULL, cmd, arg);
6120 +EXPORT_SYMBOL(evms_cs_kernel_ioctl);
6123 + * function: evms_cs_size_in_vsectors
6125 + * In EVMS a V(irtual)Sector is 512 bytes in size.
6126 + * This function computes the number of VSECTORs an specified
6127 + * item size would require.
6129 + * NOTE: This function has been coded to work with 64 bit values.
6132 +evms_cs_size_in_vsectors(long long item_size)
6134 + long long sectors;
6136 + sectors = item_size >> EVMS_VSECTOR_SIZE_SHIFT;
6137 + if (item_size & (EVMS_VSECTOR_SIZE - 1))
6143 +EXPORT_SYMBOL(evms_cs_size_in_vsectors);
6146 + * function: evms_cs_log2
6148 + * this function computes the power of the 2 of specified
6149 + * value. If the value is 0, a -1 is returned. If the value
6150 + * is NOT a power of 2, a -2 is return. Otherwise the power
6151 + * of 2 is returned.
6154 +evms_cs_log2(long long value)
6162 + while (!(tmp & 1)) {
6173 +EXPORT_SYMBOL(evms_cs_log2);
6178 + * build_crc_table()
6182 + * Description: The functions in this module provide a means of calculating
6183 + * the 32 bit CRC for a block of data. build_crc_table must
6184 + * be called to initialize this module. calculate_crc must
6185 + * NOT be used until after build_crc_table has been called.
6186 + * Once build_crc_table has been called, calculate_crc can
6187 + * be used to calculate the crc of the data residing in a
6188 + * user specified buffer.
6192 +#define CRC_POLYNOMIAL 0xEDB88320L
6194 +static u32 crc_table[256];
6195 +static u32 crc_table_built = FALSE;
6197 +/*********************************************************************/
6199 +/* Function Name: build_crc_table */
6201 +/* Descriptive Name: This module implements the crc function using */
6202 +/* a table driven method. The required table */
6203 +/* must be setup before the calculate_crc */
6204 +/* function can be used. This table only needs */
6205 +/* to be set up once. This function sets up the */
6206 +/* crc table needed by calculate_crc. */
6212 +/* Error Handling: N/A */
6214 +/* Side Effects: The internal crc table is initialized. */
6218 +/*********************************************************************/
6220 +build_crc_table(void)
6224 + for (i = 0; i <= 255; i++) {
6226 + for (j = 8; j > 0; j--) {
6228 + crc = (crc >> 1) ^ CRC_POLYNOMIAL;
6232 + crc_table[i] = crc;
6234 + crc_table_built = TRUE;
6237 +/*********************************************************************/
6239 +/* Function Name: calculate_crc */
6241 +/* Descriptive Name: This function calculates the crc value for */
6242 +/* the data in the buffer specified by Buffer. */
6244 +/* Input: u32 crc : This is the starting crc. If you are */
6245 +/* starting a new crc calculation, then */
6246 +/* this should be set to 0xFFFFFFFF. If */
6247 +/* you are continuing a crc calculation */
6248 +/* (i.e. all of the data did not fit in */
6249 +/* the buffer so you could not calculate */
6250 +/* the crc in a single operation), then */
6251 +/* this is the crc output by the last */
6252 +/* calculate_crc call. */
6254 +/* Output: The crc for the data in the buffer, based upon the value*/
6255 +/* of the input parameter crc. */
6257 +/* Error Handling: None. */
6259 +/* Side Effects: None. */
6263 +/*********************************************************************/
6265 +evms_cs_calculate_crc(u32 crc, void *buffer, u32 buffersize)
6267 + unsigned char *current_byte;
6268 + u32 temp1, temp2, i;
6270 + current_byte = (unsigned char *) buffer;
6271 + /* Make sure the crc table is available */
6272 + if (crc_table_built == FALSE)
6273 + build_crc_table();
6274 + /* Process each byte in the buffer. */
6275 + for (i = 0; i < buffersize; i++) {
6276 + temp1 = (crc >> 8) & 0x00FFFFFF;
6278 + crc_table[(crc ^ (u32) *
6279 + current_byte) & (u32) 0xff];
6281 + crc = temp1 ^ temp2;
6286 +EXPORT_SYMBOL(evms_cs_calculate_crc);
6288 +#define EVMS_ORIGINAL_CALLBACK_FLAG 1<<0
6289 +typedef struct io_notify_s {
6290 + unsigned int flags;
6292 + struct buffer_head *bh;
6296 + void (*callback_function) (struct evms_logical_node * node,
6297 + struct buffer_head * bh,
6298 + int uptodate, int *redrive);
6299 + struct io_notify_s *next;
6302 +struct evms_pool_mgmt *
6303 +evms_cs_create_pool(int objsize,
6305 + void (*ctor) (void *, kmem_cache_t *, unsigned long),
6306 + void (*dtor) (void *, kmem_cache_t *, unsigned long))
6308 + struct evms_pool_mgmt *pool;
6310 + /* create the pool management structure */
6311 + pool = kmalloc(sizeof (struct evms_pool_mgmt), GFP_KERNEL);
6313 + LOG_CRITICAL("Cannot create %s fpool mgmt structure",
6317 + /* initialize various field in pool mgmt structure */
6318 + memset(pool, 0, sizeof (struct evms_pool_mgmt));
6319 + pool->member_size = objsize;
6320 + pool->name = pool_name;
6321 + pool->waiters = (atomic_t) ATOMIC_INIT(0);
6322 + init_waitqueue_head(&pool->wait_queue);
6323 + /* go create the pool */
6324 + pool->cachep = kmem_cache_create(pool->name,
6325 + pool->member_size,
6326 + 0, SLAB_HWCACHE_ALIGN, ctor, dtor);
6327 + if (!pool->cachep)
6328 + panic("Cannot create %s SLAB cache", pool->name);
6332 +EXPORT_SYMBOL(evms_cs_create_pool);
6335 +evms_cs_allocate_from_pool(struct evms_pool_mgmt *pool, int blockable)
6340 + objp = kmem_cache_alloc(pool->cachep, SLAB_NOIO);
6341 + if (objp || !blockable) {
6344 + /* block and wait for an object to
6345 + * be returned to the pool
6347 + atomic_inc(&pool->waiters);
6348 + wait_event(pool->wait_queue,
6349 + (!atomic_read(&pool->waiters)));
6355 +EXPORT_SYMBOL(evms_cs_allocate_from_pool);
6358 +evms_cs_deallocate_to_pool(struct evms_pool_mgmt *pool, void *objp)
6360 + kmem_cache_free(pool->cachep, objp);
6361 + atomic_set(&pool->waiters, 0);
6362 + if (waitqueue_active(&pool->wait_queue)) {
6363 + wake_up(&pool->wait_queue);
6367 +EXPORT_SYMBOL(evms_cs_deallocate_to_pool);
6370 +evms_cs_destroy_pool(struct evms_pool_mgmt *pool)
6372 + kmem_cache_destroy(pool->cachep);
6376 +EXPORT_SYMBOL(evms_cs_destroy_pool);
6379 + * function: evms_end_io
6381 + * This is a support function for
6382 + * evms_cs_register_for_end_io_notification.
6383 + * This function is called during I/O completion on any buffer
6384 + * head that was registered by a plugin. Control is passed here
6385 + * and this routine will, thru the use of the I/O notify entry
6386 + * stored in the b_private field of the buffer head, restore
6387 + * the b_rsector value the buffer head had at the time of
6388 + * registration and pass control to the registered callback
6389 + * address, with pointers to the buffer head and an optional
6390 + * plugin private data. Upon completion of the callback,
6391 + * control is returned back here. The io notify list entry
6392 + * is deleted. This process repeats until this routine
6393 + * detects that all registered plugins have been called back
6394 + * and the buffer head's original end_io function has been
6395 + * called. At this point the DONE flag is set, and we terminate
6396 + * callback loop and exit.
6398 + * Plugins may desire to break or interrupt the callback
6399 + * sequence or chain. This may be useful to redrive I/O or
6400 + * to wait for other buffer heads to complete before
6401 + * allowing the original buffer head callback to occur.
6402 + * To interrupt the callback "chain", a registered
6403 + * plugin's callback must return with the DONE flag set.
6405 + * NOTE: If a plugin set the DONE flag, and wishes to redrive
6406 + * a buffer head, the plugin MUST reregister the buffer head
6407 + * to receive another callback on this buffer head. Also, the
6408 + * plugin MUST ensure that the original buffer head end_io
6409 + * function get called at some point, either by reregistering
6410 + * this buffer head and receiving another callback, or by
6411 + * means of buffer head aggregation triggered by the callbacks
6412 + * of other buffer heads.
6416 +evms_end_io(struct buffer_head *bh, int uptodate)
6418 + io_notify_t *entry;
6423 + /* retrieve the io_notify_entry ptr from
6424 + * the b_private field in the buffer head.
6426 + entry = (io_notify_t *) bh->b_private;
6428 + /* restore the b_private value to
6429 + * the previous b_private value (which
6430 + * should be a previous io_notify_entry
6431 + * or the original b_private pointer).
6433 + bh->b_private = entry->b_private;
6435 + /* check for original callback for this bh */
6436 + if (entry->flags & EVMS_ORIGINAL_CALLBACK_FLAG) {
6437 + /* this is the original for bh */
6439 + /* turn off flag marking this as the original */
6440 + entry->flags &= ~EVMS_ORIGINAL_CALLBACK_FLAG;
6442 + /* decrement volume's requests_in_progress var */
6443 + atomic_dec(&evms_logical_volumes[MINOR(bh->b_rdev)].
6444 + requests_in_progress);
6446 + /* restore b_end_io to original value */
6447 + bh->b_end_io = (void *) entry->callback_function;
6448 + if (bh->b_end_io) {
6449 + /* invoke original callback function
6452 + bh->b_end_io(bh, uptodate);
6456 + /* this is a plugin callback */
6458 + /* restore the rsector value to the
6459 + * value at the time of callback
6462 + bh->b_rsector = entry->rsector;
6463 + bh->b_rdev = entry->rdev;
6464 + /* invoke plugin callback function */
6465 + entry->callback_function(entry->private, bh, uptodate,
6468 + /* free the io notify entry */
6469 + evms_cs_deallocate_to_pool(evms_io_notify_pool, entry);
6474 + * function: evms_cs_register_for_end_io_notification
6476 + * This function is an evms common service.
6477 + * This routine allows a (plugin) function to register to
6478 + * participate in the io completion notification process.
6479 + * This is useful for plugins which alter data after it
6480 + * has been read from the disk (i.e. encryption or
6483 + * This routine also records the rsector value at the time
6484 + * of registration, so that it can be restored to that value
6485 + * prior to the callback to a plugin, thus allowing that
6486 + * plugin to work with the value it had seen during the
6487 + * initiating I/O request.
6489 + * This routine also records a private data pointer at the
6490 + * time of registration, and is returned to the plugin
6491 + * at callback time. This private data pointer was designed
6492 + * to contain context/callback/buffer_head specific data, and
6493 + * frees the plugin from having to store and find associated
6494 + * data at the time of the callback. This field is not used
6495 + * by this function and is optional (NULL if unused). It is
6496 + * recorded and returned as a convenience for the plugins.
6498 + * DANGER!!! - WILL ROBINSON - DANGER!!!
6499 + * This routine uses the b_private field in the
6500 + * buffer_head structure. If any lower level driver uses this
6501 + * field and do NOT restore it, the I/O callback will fail!!
6503 + * Any plugins writers requiring a field for private storage
6504 + * should instead use the private field parameter in this
6505 + * function to store their private data.
6510 +evms_cs_register_for_end_io_notification(void *private,
6511 + struct buffer_head *bh,
6512 + void *callback_function)
6515 + io_notify_t *new_entry;
6519 + /* allocate a notify entry */
6521 + evms_cs_allocate_from_pool(evms_io_notify_pool,
6528 + /* initialize notify entry */
6529 + new_entry->private = private;
6530 + new_entry->bh = bh;
6531 + new_entry->rsector = bh->b_rsector;
6532 + new_entry->rdev = bh->b_rdev;
6533 + new_entry->b_private = bh->b_private;
6534 + new_entry->flags = 0;
6536 + /* is this the first callback for this bh? */
6537 + if (bh->b_end_io != evms_end_io) {
6538 + /* yes, first callback */
6539 + new_entry->flags |= EVMS_ORIGINAL_CALLBACK_FLAG;
6540 + new_entry->callback_function = (void *) bh->b_end_io;
6542 + /* increment volume's requests_in_progress var */
6543 + atomic_inc(&evms_logical_volumes[MINOR(bh->b_rdev)].
6544 + requests_in_progress);
6546 + /* set b_end_io so we get control */
6547 + bh->b_end_io = evms_end_io;
6549 + /* no, not first callback */
6550 + new_entry->callback_function = callback_function;
6553 + /* set b_private to aid in quick lookup */
6554 + bh->b_private = new_entry;
6559 +EXPORT_SYMBOL(evms_cs_register_for_end_io_notification);
6561 +/* function description: evms_cs_lookup_item_in_list
6563 + * this function searches for the specified item in the
6564 + * specified node list. it returns the address of the
6565 + * evms_list_node containing the specified item.
6567 +struct evms_list_node **
6568 +evms_cs_lookup_item_in_list(struct evms_list_node **node_list, void *item)
6570 + struct evms_list_node **list_node;
6572 + list_node = node_list;
6573 + while (*list_node) {
6574 + if ((*list_node)->item == item)
6576 + list_node = &(*list_node)->next;
6578 + return (list_node);
6581 +EXPORT_SYMBOL(evms_cs_lookup_item_in_list);
6583 +/* function description: evms_add_item_to_list
6585 + * this function adds an item to the list. the
6586 + * node for the new item is added to the end
6587 + * of the list. the list is traversed to find the end.
6588 + * while the traversal occurs, the list is checked
6589 + * for the presence of the specified item. if already
6590 + * present in the list, and error code is returned.
6592 +/* function description: evms_cs_add_item_to_list
6594 + * this function adds an item to an item list.
6596 + * RC == 0 is returned for:
6597 + * a successful add of a new item
6599 + * RC == 1 is returned when:
6600 + * the item is already on the list
6602 + * RC < 0 is returned for an error attempting to add the item.
6605 +evms_cs_add_item_to_list(struct evms_list_node **list, void *item)
6608 + struct evms_list_node **list_node, *new_node;
6610 + list_node = evms_cs_lookup_item_in_list(list, item);
6611 + if (*list_node == NULL) {
6612 + new_node = kmalloc(sizeof (struct evms_list_node), GFP_NOIO);
6614 + memset(new_node, 0, sizeof (struct evms_list_node));
6615 + new_node->item = item;
6616 + *list_node = new_node;
6623 + ("warning: attempt to add duplicate item(%p) to list(%p).\n",
6629 +EXPORT_SYMBOL(evms_cs_add_item_to_list);
6631 +/* function description: evms_remove_item_from_list
6633 + * this function removes a specified item from the
6634 + * specified list. if the specified item is not
6635 + * found in the list, and error is returned.
6638 +evms_cs_remove_item_from_list(struct evms_list_node **list, void *item)
6641 + struct evms_list_node **list_node;
6643 + /* check to see if item is in the list */
6644 + list_node = evms_cs_lookup_item_in_list(list, item);
6646 + /* was the node found in the list? */
6648 + /* yes, it was found */
6649 + struct evms_list_node *tmp_node;
6651 + /* save ptr to node being removed */
6652 + tmp_node = *list_node;
6653 + /* remove it from the global list */
6654 + *list_node = tmp_node->next;
6655 + /* delete removed node */
6658 + /* no, it was not found */
6661 + ("error(%d): attempt to remove nonexistant node(%p) from list(%p).\n",
6667 +EXPORT_SYMBOL(evms_cs_remove_item_from_list);
6669 +/* function description: evms_cs_register_device
6671 + * this function adds a device to the EVMS global device list.
6673 + * RC == 0 is returned for:
6674 + * a successful add of a new device
6676 + * RC == 1 is returned when:
6677 + * the device is already on the list
6679 + * RC < 0 is returned for an error attempting to add the device.
6682 +evms_cs_register_device(struct evms_logical_node *device)
6684 + return (evms_cs_add_item_to_list(&evms_global_device_list, device));
6687 +EXPORT_SYMBOL(evms_cs_register_device);
6689 +/* function description: evms_cs_unregister_device
6691 + * this function removes a device from the EVMS global device list.
6693 + * RC == 0 is returned for:
6694 + * a successful removal of the specified device
6696 + * RC < 0 is returned for an error attempting to add the device.
6697 + * -ENODATA is returned if specified device is not found.
6700 +evms_cs_unregister_device(struct evms_logical_node *device)
6702 + return (evms_cs_remove_item_from_list(&evms_global_device_list,
6706 +EXPORT_SYMBOL(evms_cs_unregister_device);
6708 +static struct evms_list_node *find_first_next_list_node = NULL;
6710 +evms_cs_find_next_device(struct evms_logical_node *in_device,
6711 + struct evms_logical_node **out_device)
6714 + struct evms_list_node **list_node;
6716 + if (in_device == NULL)
6717 + find_first_next_list_node = evms_global_device_list;
6720 + evms_cs_lookup_item_in_list(&evms_global_device_list,
6722 + find_first_next_list_node = *list_node;
6723 + if (find_first_next_list_node == NULL)
6726 + find_first_next_list_node =
6727 + find_first_next_list_node->next;
6730 + if (find_first_next_list_node == NULL)
6731 + *out_device = NULL;
6733 + *out_device = (struct evms_logical_node *)
6734 + find_first_next_list_node->item;
6739 +EXPORT_SYMBOL(evms_cs_find_next_device);
6742 +evms_cs_signal_event(int eventid)
6745 + struct evms_list_node **list_node;
6747 + /* signal PID(s) of specified event */
6748 + list_node = &evms_global_notify_list;
6749 + while (*list_node) {
6750 + struct evms_event *event;
6752 + event = (*list_node)->item;
6753 + if (event->eventid == eventid) {
6754 + struct task_struct *tsk;
6756 + tsk = find_task_by_pid(event->pid);
6758 + struct siginfo siginfo;
6760 + siginfo.si_signo = event->signo;
6761 + siginfo.si_errno = 0;
6762 + siginfo.si_code = 0;
6763 + rc = send_sig_info(event->signo, &siginfo, tsk);
6766 + * unregister this stale
6767 + * notification record
6771 + list_node = &(*list_node)->next;
6775 +EXPORT_SYMBOL(evms_cs_signal_event);
6778 +evms_flush_signals(void)
6780 +	spin_lock(&current->sigmask_lock);
6781 + flush_signals(current);
6782 +	spin_unlock(&current->sigmask_lock);
6786 +evms_init_signals(void)
6788 + current->exit_signal = SIGCHLD;
6789 +	siginitsetinv(&current->blocked, sigmask(SIGKILL));
6793 +evms_thread(void *arg)
6795 + struct evms_thread *thread = arg;
6804 + sprintf(current->comm, thread->name);
6805 + evms_init_signals();
6806 + evms_flush_signals();
6807 + thread->tsk = current;
6809 + current->policy = SCHED_OTHER;
6810 +#ifdef O1_SCHEDULER
6811 + set_user_nice(current, -20);
6813 + current->nice = -20;
6817 + complete(thread->event);
6818 + while (thread->run) {
6819 + void (*run) (void *data);
6820 + DECLARE_WAITQUEUE(wait, current);
6822 + add_wait_queue(&thread->wqueue, &wait);
6823 +#ifdef O1_SCHEDULER
6824 + set_current_state(TASK_INTERRUPTIBLE);
6826 + set_task_state(current, TASK_INTERRUPTIBLE);
6828 + if (!test_bit(EVMS_THREAD_WAKEUP, &thread->flags)) {
6831 +#ifdef O1_SCHEDULER
6832 + set_current_state(TASK_RUNNING);
6834 + current->state = TASK_RUNNING;
6836 + remove_wait_queue(&thread->wqueue, &wait);
6837 + clear_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6839 + run = thread->run;
6841 + run(thread->data);
6842 + run_task_queue(&tq_disk);
6844 + if (signal_pending(current)) {
6845 + evms_flush_signals();
6848 + complete(thread->event);
6852 +struct evms_thread *
6853 +evms_cs_register_thread(void (*run) (void *), void *data, const u8 * name)
6855 + struct evms_thread *thread;
6857 + struct completion event;
6859 + thread = kmalloc(sizeof (struct evms_thread), GFP_KERNEL);
6863 + memset(thread, 0, sizeof (struct evms_thread));
6864 + init_waitqueue_head(&thread->wqueue);
6866 + init_completion(&event);
6867 + thread->event = &event;
6868 + thread->run = run;
6869 + thread->data = data;
6870 + thread->name = name;
6871 + ret = kernel_thread(evms_thread, thread, 0);
6876 + wait_for_completion(&event);
6880 +EXPORT_SYMBOL(evms_cs_register_thread);
6883 +evms_cs_unregister_thread(struct evms_thread *thread)
6885 + struct completion event;
6887 + init_completion(&event);
6889 + thread->event = &event;
6890 + thread->run = NULL;
6891 + thread->name = NULL;
6892 + evms_cs_interrupt_thread(thread);
6893 + wait_for_completion(&event);
6897 +EXPORT_SYMBOL(evms_cs_unregister_thread);
6900 +evms_cs_wakeup_thread(struct evms_thread *thread)
6902 + set_bit(EVMS_THREAD_WAKEUP, &thread->flags);
6903 + wake_up(&thread->wqueue);
6906 +EXPORT_SYMBOL(evms_cs_wakeup_thread);
6909 +evms_cs_interrupt_thread(struct evms_thread *thread)
6911 + if (!thread->tsk) {
6912 + LOG_ERROR("error: attempted to interrupt an invalid thread!\n");
6915 + send_sig(SIGKILL, thread->tsk, 1);
6918 +EXPORT_SYMBOL(evms_cs_interrupt_thread);
6920 +struct proc_dir_entry *
6921 +evms_cs_get_evms_proc_dir(void)
6923 +#ifdef CONFIG_PROC_FS
6924 + if (!evms_proc_dir) {
6925 + evms_proc_dir = create_proc_entry("evms", S_IFDIR, &proc_root);
6928 + return (evms_proc_dir);
6931 +EXPORT_SYMBOL(evms_cs_get_evms_proc_dir);
6934 +evms_cs_volume_request_in_progress(kdev_t dev,
6935 + int operation, int *current_count)
6938 + struct evms_logical_volume *volume;
6940 + volume = &evms_logical_volumes[MINOR(dev)];
6941 + if (volume->node) {
6942 + if (operation > 0) {
6943 + atomic_inc(&volume->requests_in_progress);
6944 + } else if (operation < 0) {
6945 + atomic_dec(&volume->requests_in_progress);
6947 + if (current_count) {
6949 + atomic_read(&volume->requests_in_progress);
6957 +EXPORT_SYMBOL(evms_cs_volume_request_in_progress);
6960 +evms_cs_invalidate_volume(struct evms_logical_node *node)
6963 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
6964 + if (evms_logical_volumes[i].node && node->name) {
6967 + (evms_logical_volumes[i].node->name,
6970 + ("Invalidating EVMS device %s minor %d\n",
6972 + invalidate_device(MKDEV(EVMS_MAJOR, i), 0);
6979 +EXPORT_SYMBOL(evms_cs_invalidate_volume);
6984 + return atomic_read(&evms_logical_volumes[minor].opens);
6987 +/**********************************************************/
6988 +/* END -- exported functions/Common Services */
6989 +/**********************************************************/
6991 +/**********************************************************/
6992 +/* START -- Proc FS Support functions */
6993 +/**********************************************************/
6995 +#ifdef CONFIG_PROC_FS
6997 +evms_info_read_proc(char *page,
6998 + char **start, off_t off, int count, int *eof, void *data)
7001 + char *info_level_text = NULL;
7003 + PROCPRINT("Enterprise Volume Management System: Info\n");
7004 + switch (evms_info_level) {
7005 + case EVMS_INFO_CRITICAL:
7006 + info_level_text = "critical";
7008 + case EVMS_INFO_SERIOUS:
7009 + info_level_text = "serious";
7011 + case EVMS_INFO_ERROR:
7012 + info_level_text = "error";
7014 + case EVMS_INFO_WARNING:
7015 + info_level_text = "warning";
7017 + case EVMS_INFO_DEFAULT:
7018 + info_level_text = "default";
7020 + case EVMS_INFO_DETAILS:
7021 + info_level_text = "details";
7023 + case EVMS_INFO_DEBUG:
7024 + info_level_text = "debug";
7026 + case EVMS_INFO_EXTRA:
7027 + info_level_text = "extra";
7029 + case EVMS_INFO_ENTRY_EXIT:
7030 + info_level_text = "entry exit";
7032 + case EVMS_INFO_EVERYTHING:
7033 + info_level_text = "everything";
7036 + info_level_text = "unknown";
7039 + PROCPRINT("EVMS info level: %d (%s).\n",
7040 + evms_info_level, info_level_text);
7042 + PROCPRINT("EVMS kernel version: %d.%d.%d\n",
7043 + EVMS_MAJOR_VERSION,
7044 + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
7046 + PROCPRINT("EVMS IOCTL interface version: %d.%d.%d\n",
7047 + EVMS_IOCTL_INTERFACE_MAJOR,
7048 + EVMS_IOCTL_INTERFACE_MINOR, EVMS_IOCTL_INTERFACE_PATCHLEVEL);
7050 + PROCPRINT("EVMS Common Services version: %d.%d.%d\n",
7051 + EVMS_COMMON_SERVICES_MAJOR,
7052 + EVMS_COMMON_SERVICES_MINOR, EVMS_COMMON_SERVICES_PATCHLEVEL);
7057 + *start = page + off;
7061 + return sz > count ? count : sz;
7065 +evms_plugins_read_proc(char *page,
7066 + char **start, off_t off, int count, int *eof, void *data)
7069 + struct evms_registered_plugin *rp = NULL;
7071 + PROCPRINT("Enterprise Volume Management System: Plugins\n");
7072 + /* 0 1 1 2 2 3 3 4 4 5 5 6 6 7 */
7073 + /* 1 5 0 5 0 5 0 5 0 5 0 5 0 5 0 */
7074 + PROCPRINT(" ---------Plugin---------- required services\n");
7075 + PROCPRINT(" ----id---- version version\n\n");
7076 + for (rp = registered_plugin_head; rp; rp = rp->next) {
7077 + PROCPRINT(" %x.%x.%x\t %d.%d.%d\t%d.%d.%d\n",
7078 + GetPluginOEM(rp->plugin->id),
7079 + GetPluginType(rp->plugin->id),
7080 + GetPluginID(rp->plugin->id),
7081 + rp->plugin->version.major,
7082 + rp->plugin->version.minor,
7083 + rp->plugin->version.patchlevel,
7084 + rp->plugin->required_services_version.major,
7085 + rp->plugin->required_services_version.minor,
7086 + rp->plugin->required_services_version.patchlevel);
7090 + *start = page + off;
7094 + return sz > count ? count : sz;
7098 +evms_volumes_read_proc(char *page,
7099 + char **start, off_t off, int count, int *eof, void *data)
7103 + PROCPRINT("Enterprise Volume Management System: Volumes\n");
7104 + PROCPRINT("major minor #blocks type flags name\n\n");
7105 + for (j = 1; j < MAX_EVMS_VOLUMES; j++) {
7106 + struct evms_logical_volume *volume;
7108 + volume = &evms_logical_volumes[j];
7109 + if (volume->node) {
7110 + PROCPRINT("%5d %7d %16Ld %s %s %s %s%s\n",
7112 + (long long)volume->node->total_vsectors >> 1,
7114 + flags & EVMS_VOLUME_FLAG) ? "evms " :
7117 + flags & EVMS_VOLUME_READ_ONLY) ? "ro" : "rw",
7119 + flags & EVMS_VOLUME_PARTIAL) ? "p " : " ",
7120 + EVMS_DEV_NODE_PATH, volume->name);
7124 + *start = page + off;
7128 + return sz > count ? count : sz;
7133 +/**********************************************************/
7134 +/* END -- Proc FS Support functions */
7135 +/**********************************************************/
7137 +/**********************************************************/
7138 +/* START -- FOPS functions definitions */
7139 +/**********************************************************/
7141 +/************************************************/
7142 +/* START -- IOCTL commands -- EVMS specific */
7143 +/************************************************/
7146 +evms_ioctl_cmd_get_ioctl_version(void *arg)
7149 + struct evms_version ver;
7151 + ver.major = EVMS_IOCTL_INTERFACE_MAJOR;
7152 + ver.minor = EVMS_IOCTL_INTERFACE_MINOR;
7153 + ver.patchlevel = EVMS_IOCTL_INTERFACE_PATCHLEVEL;
7155 + /* copy info to userspace */
7156 + if (copy_to_user(arg, &ver, sizeof (ver)))
7163 +evms_ioctl_cmd_get_version(void *arg)
7166 + struct evms_version ver;
7168 + ver.major = EVMS_MAJOR_VERSION;
7169 + ver.minor = EVMS_MINOR_VERSION;
7170 + ver.patchlevel = EVMS_PATCHLEVEL_VERSION;
7172 + /* copy info to userspace */
7173 + if (copy_to_user(arg, &ver, sizeof (ver)))
7180 +evms_ioctl_cmd_get_info_level(void *arg)
7184 + /* copy info to userspace */
7185 + if (copy_to_user(arg, &evms_info_level, sizeof (evms_info_level)))
7192 +evms_ioctl_cmd_set_info_level(void *arg)
7196 + /* copy info from userspace */
7197 + if (copy_from_user(&temp, arg, sizeof (temp)))
7200 + evms_info_level = temp;
7205 +/* function: evms_quiesce_volume
7207 + * this function performs the actual quiesce operation on
7208 + * a volume in kernel memory.
7210 + * when quiescing, all new I/Os to a volume are stopped,
7211 + * causing the calling thread to block. this thread then
7212 + * waits until all I/Os in progress are completed, before
7213 + * return control to the caller.
7215 + * when unquiescing, all new I/Os are allowed to proceed
7216 + * unencumbered, and all threads waiting (blocked) on this
7217 + * volume, are woken up and allowed to proceed.
7221 +evms_quiesce_volume(struct evms_logical_volume *volume,
7222 + struct inode *inode,
7223 + struct file *file, struct evms_quiesce_vol_pkt *qv)
7227 + LOG_DEBUG("%squiescing %s.\n",
7228 + ((qv->command) ? "" : "un"), volume->name);
7230 +#ifdef VFS_PATCH_PRESENT
7232 + /* VFS function call to sync and lock the filesystem */
7233 + fsync_dev_lockfs(MKDEV(EVMS_MAJOR, qv->minor));
7234 + volume->vfs_quiesced = TRUE;
7237 + volume->quiesced = qv->command;
7239 + /* Command specified was "quiesce". */
7240 + if (qv->command) {
7241 + /* After setting the volume to
7242 + * a quiesced state, there could
7243 + * be threads (on SMP systems)
7244 + * that are executing in the
7245 + * function, evms_handle_request,
7246 + * between the "wait_event" and the
7247 + * "atomic_inc" lines. We need to
7248 + * provide a "delay" sufficient
7249 + * to allow those threads to
7250 + * to reach the atomic_inc's
7251 + * before executing the while loop
7252 + * below. The "schedule" call should
7256 + /* wait for outstanding requests
7259 + while (atomic_read(&volume->requests_in_progress) > 0)
7262 + /* send this command down the stack so lower */
7263 + /* layers can know about this */
7264 + rc = IOCTL(volume->node, inode, file,
7265 + EVMS_QUIESCE_VOLUME, (unsigned long) qv);
7267 + /* Command specified was "unquiesce". */
7268 + if (!qv->command) {
7269 + /* "wakeup" any I/O requests waiting on
7272 + if (waitqueue_active(&volume->wait_queue))
7273 + wake_up(&volume->wait_queue);
7274 +#ifdef VFS_PATCH_PRESENT
7275 + if (volume->vfs_quiesced) {
7276 + /* VFS function call to unlock the filesystem */
7277 + unlockfs(MKDEV(EVMS_MAJOR, qv->minor));
7278 + volume->vfs_quiesced = FALSE;
7283 + LOG_ERROR("error(%d) %squiescing %s.\n",
7284 + rc, ((qv->command) ? "" : "un"), volume->name);
7289 +/* function: evms_delete_volume
7291 + * this function performs the actual delete operation on
7292 + * a volume to purge it from kernel memory. all structures
7293 + * and memory consumed by this volume will be free as well
7294 + * as clearing or unregistering any system services or
7295 + * global data arrays.
7297 + * NOTE: this function will return -EBUSY on attempts to
7298 + * delete mounted volumes.
7302 +evms_delete_volume(struct evms_logical_volume *volume,
7303 + struct evms_delete_vol_pkt *dv)
7307 + /* if this is a "permament" delete */
7308 + /* check to make sure volume is not mounted */
7309 + if (dv->command) {
7310 + if (is_open(dv->minor)) {
7313 + // invalidate the device since it is not coming back
7314 + // this is required incase we are re-using the minor number
7315 + invalidate_device(MKDEV(EVMS_MAJOR, dv->minor), 1);
7319 + /* invoke the delete ioctl at the top of the feature stack */
7321 + LOG_DETAILS("deleting '%s'.\n", volume->name);
7322 + rc = DELETE(volume->node);
7325 + /* the volume has been deleted, do any clean up work
7329 + devfs_unregister(volume->devfs_handle);
7330 + if (dv->command) {
7331 + /* if "permanent" delete, free the name
7332 + * and NULL the name field.
7334 + kfree(volume->name);
7335 + volume->name = NULL;
7336 + volume->flags = 0;
7338 + /* if "soft" delete, leave the name so
7339 + * we can use it to reassign the same
7340 + * minor to this volume after a
7343 + volume->flags = EVMS_VOLUME_SOFT_DELETED;
7345 + volume->node = NULL;
7346 + set_device_ro(MKDEV(EVMS_MAJOR, dv->minor), 0);
7347 + blk_size[EVMS_MAJOR][dv->minor] = 0;
7348 + blksize_size[EVMS_MAJOR][dv->minor] = 0;
7349 + hardsect_size[EVMS_MAJOR][dv->minor] = 0;
7352 + LOG_ERROR("error(%d) %s deleting %s.\n",
7353 + rc, ((dv->command) ? "hard" : "soft"), volume->name);
7358 +/* function: evms_user_delete_volume
7360 + * this function, depending on the parameters, performs
7361 + * a "soft" or a "hard" delete. for a "soft" delete, a
7362 + * quiesce & delete request is queued up, to be executed
7363 + * at the beginning of the next rediscovery. for a
7364 + * "hard" delete, the target volume is quiesced and then
7365 + * deleted. if there are any errors attempting to delete
7366 + * the target, then the target is unquiesced. if an
7367 + * associative volume is specified it is quiesced before
7368 + * the target volume is quiesced, and is unquiesced
7369 + * after the attempt to delete the target volume.
7373 +evms_user_delete_volume(struct evms_logical_volume *lvt,
7374 + struct inode *inode,
7375 + struct file *file, struct evms_delete_vol_pkt *dv)
7379 + if (!dv->command) {
7380 + /* "soft delete" requested */
7381 + lvt->flags |= (EVMS_REQUESTED_QUIESCE | EVMS_REQUESTED_DELETE);
7383 + lvt->flags |= EVMS_REQUESTED_VFS_QUIESCE;
7386 + /* "hard delete" requested */
7388 + struct evms_quiesce_vol_pkt qv;
7389 + struct evms_logical_volume *lva = NULL;
7391 + if (dv->associative_minor) {
7392 + /* associative volume specified
7396 + lva = &evms_logical_volumes[dv->associative_minor];
7397 + /* quiesce associative volume */
7398 + qv.command = EVMS_QUIESCE;
7399 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7400 + qv.minor = dv->associative_minor;
7401 + rc = evms_quiesce_volume(lva, inode, file, &qv);
7402 + qa = (rc) ? FALSE : TRUE;
7405 + /* quiesce target volume */
7406 + qv.command = EVMS_QUIESCE;
7407 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7408 + qv.minor = dv->minor;
7409 + rc = evms_quiesce_volume(lvt, inode, file, &qv);
7412 + /* delete the target volume */
7413 + rc = evms_delete_volume(lvt, dv);
7415 + /* got an error undeleting...
7417 + * unquiesce the target
7419 + qv.command = EVMS_UNQUIESCE;
7420 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7421 + qv.minor = dv->minor;
7422 + evms_quiesce_volume(lvt, inode, file, &qv);
7425 + if (dv->associative_minor) {
7426 + /* associative volume specified
7431 + /* only unquiesce associative
7432 + * if we successfully quiesced
7435 + qv.command = EVMS_UNQUIESCE;
7436 + qv.do_vfs = EVMS_VFS_DO_NOTHING;
7437 + qv.minor = dv->associative_minor;
7438 + evms_quiesce_volume(lva, inode, file, &qv);
7445 +/* function: evms_ioctl_cmd_delete_volume
7447 + * this function copy user data to/from the kernel, and
7448 + * validates user parameters. after validation, control
7449 + * is passed to worker routine evms_user_delete_volume.
7453 +evms_ioctl_cmd_delete_volume(struct inode *inode,
7454 + struct file *file, unsigned long arg)
7457 + struct evms_delete_vol_pkt tmp, *user_parms;
7458 + struct evms_logical_volume *volume = NULL;
7460 + user_parms = (struct evms_delete_vol_pkt *) arg;
7461 + /* copy user's parameters to kernel space */
7462 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7465 + /* check to make sure associative minor is in use */
7467 + if (tmp.associative_minor) {
7468 + volume = &evms_logical_volumes[tmp.associative_minor];
7469 + if (volume->node == NULL)
7473 + /* check to make sure target minor is in use */
7475 + volume = &evms_logical_volumes[tmp.minor];
7476 + if (volume->node == NULL)
7479 + rc = evms_user_delete_volume(volume, inode, file, &tmp);
7481 + /* copy the status value back to the user */
7483 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7489 +/* function: evms_full_rediscover_prep
7491 + * this function helps to prevent problems when evms is
7492 + * configured with the base built in statically and some
7493 + * plugins built as modules.
7495 + * in these cases, when the initial discovery is done,
7496 + * only the statically built modules are available for
7497 + * volume construction. as a result, some volumes that
7498 + * require the plugins built as modules (which haven't
7499 + * been loaded), to be fully reconstructed, may come up
7500 + * as compatibility volumes or partial volumes.
7502 + * when parts of evms are built as modules, the
7503 + * evms_rediscover utility is used, to perform a secondary
7504 + * rediscover, after all the plugins built as modules
7505 + * have been loaded, to construct all the volumes
7506 + * requiring these plugins.
7508 + * however since some of the volumes, requiring the plugins
7509 + * built as modules, may have been already exported as
7510 + * compatibility or partial volumes, we need to purge these
7511 + * volumes from the kernel's memory, so they can be rediscovered
7512 + * and claimed by the appropriate plugins, and reconstructed
7513 + * into the correct volumes.
7515 + * this function purges all compatibility volumes that are
7516 + * not in use(mounted) and all partial volumes, prior to
7517 + * doing the secondary rediscover, thus allowing volumes to
7518 + * be rediscovered correctly.
7520 + * NOTE: again, this is only required in cases when a
7521 + * combination of plugins are built statically and as
7526 +evms_full_rediscover_prep(struct inode *inode, struct file *file)
7530 + LOG_DETAILS("%s: started.\n", __FUNCTION__);
7531 + /* check for acceptable volumes to be deleted */
7532 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7533 + struct evms_logical_volume *volume = NULL;
7534 + struct evms_delete_vol_pkt dv;
7535 + int volume_open, doit;
7537 + volume = &evms_logical_volumes[i];
7538 + if (!volume->node)
7540 + volume_open = is_open(i);
7541 + /* only proceed on volumes that are:
7544 + * unopened compatibility volumes
7547 + if (volume->flags & EVMS_VOLUME_PARTIAL) {
7548 + /* do all partial volumes
7551 + } else if (!(volume->flags & EVMS_VOLUME_FLAG)) {
7552 + /* check all compatibility volumes
7554 + if (!volume_open && !is_swap_partition(MKDEV(EVMS_MAJOR, i))) {
7555 + /* only do unopened volumes
7560 + if (doit == FALSE) {
7563 + /* delete the volume from memory.
7564 + * do a 'soft' delete if volume
7565 + * is mounted, and 'hard' delete
7568 + * NOTE: the delete operation will
7569 + * clear the bits in the flags field.
7571 + dv.command = (volume_open) ?
7572 + EVMS_SOFT_DELETE : EVMS_HARD_DELETE;
7574 + dv.associative_minor = 0;
7576 + rc = evms_user_delete_volume(volume, inode, file, &dv);
7578 + LOG_DETAILS("%s: completed.\n", __FUNCTION__);
7582 +evms_ioctl_cmd_rediscover_volumes(struct inode *inode,
7583 + struct file *file,
7584 + unsigned int cmd, unsigned long arg)
7587 + struct evms_rediscover_pkt tmp, *user_parms;
7588 + u64 *array_ptr = NULL;
7589 + ulong array_size = 0;
7590 + struct evms_logical_volume *volume = NULL;
7592 + rc = tmp.drive_count = 0;
7593 + user_parms = (struct evms_rediscover_pkt *) arg;
7594 + /* copy user's parameters to kernel space */
7595 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7598 + if (tmp.drive_count == REDISCOVER_ALL_DEVICES) {
7599 + evms_full_rediscover_prep(inode, file);
7601 + /* quiesce all queued volumes */
7602 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7603 + struct evms_quiesce_vol_pkt qv;
7605 + volume = &evms_logical_volumes[i];
7606 + if (!volume->node) {
7609 + if (!(volume->flags & EVMS_REQUESTED_QUIESCE)) {
7612 + qv.command = EVMS_QUIESCE;
7614 + qv.do_vfs = (volume->flags & EVMS_REQUESTED_VFS_QUIESCE) ?
7615 + EVMS_VFS_DO : EVMS_VFS_DO_NOTHING, qv.status = 0;
7616 + rc = evms_quiesce_volume(volume, inode, file, &qv);
7618 + /* "soft" delete all queued volumes */
7619 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
7620 + struct evms_delete_vol_pkt dv;
7622 + volume = &evms_logical_volumes[i];
7623 + if (!volume->node) {
7626 + if (!(volume->flags & EVMS_REQUESTED_DELETE)) {
7629 + dv.command = EVMS_SOFT_DELETE;
7631 + dv.associative_minor = 0;
7633 + rc = evms_delete_volume(volume, &dv);
7636 + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES)) {
7638 + /* create space for userspace drive array */
7640 + sizeof (*tmp.drive_array) * tmp.drive_count;
7641 + array_ptr = tmp.drive_array;
7642 + tmp.drive_array = kmalloc(array_size, GFP_KERNEL);
7643 + if (!tmp.drive_array) {
7648 + /* copy rediscover drive array to kernel space */
7649 + if (copy_from_user
7650 + (tmp.drive_array, array_ptr, array_size))
7655 + static int evms_discover_volumes(struct evms_rediscover_pkt *);
7656 + /* perform the rediscovery operation */
7657 + rc = evms_discover_volumes(&tmp);
7660 + /* clean up after operation */
7661 + if (tmp.drive_count && (tmp.drive_count != REDISCOVER_ALL_DEVICES))
7662 + kfree(tmp.drive_array);
7664 + /* set return code and copy info to userspace */
7666 + if (copy_to_user(&user_parms->status, &tmp.status, sizeof (tmp.status)))
7672 +static struct evms_list_node *user_disk_ptr;
7674 +evms_ioctl_cmd_get_logical_disk(void *arg)
7677 + struct evms_user_disk_pkt tmp, *user_parms;
7679 + user_parms = (struct evms_user_disk_pkt *) arg;
7680 + /* copy user's parameters to kernel space */
7681 + if (copy_from_user
7682 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7686 + if (tmp.command == EVMS_FIRST_DISK)
7687 + user_disk_ptr = evms_global_device_list;
7688 + else /* tmp.command == EVMS_NEXT_DISK */
7689 + user_disk_ptr = user_disk_ptr->next;
7691 + if (user_disk_ptr == NULL)
7692 + tmp.status = EVMS_DISK_INVALID;
7694 + tmp.status = EVMS_DISK_VALID;
7696 + NODE_TO_DEV_HANDLE(user_disk_ptr->item);
7698 + /* copy info to userspace */
7699 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7706 +evms_ioctl_cmd_get_logical_disk_info(void *arg)
7709 + struct evms_user_disk_info_pkt tmp, *user_parms;
7710 + struct evms_list_node *p;
7711 + struct evms_logical_node *disk_node = NULL;
7713 + user_parms = (struct evms_user_disk_info_pkt *) arg;
7714 + /* copy user's parameters to kernel space */
7715 + if (copy_from_user
7716 + (&tmp.disk_handle, &user_parms->disk_handle,
7717 + sizeof (tmp.disk_handle)))
7720 + /* check handle for validity */
7723 + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7724 + for (p = evms_global_device_list; p; p = p->next)
7725 + if (p->item == disk_node) {
7727 + user_disk_ptr = p;
7732 + /* populate kernel copy of user's structure with appropriate info */
7734 + struct hd_geometry geo;
7735 + struct evms_logical_node *node =
7736 + (struct evms_logical_node *) user_disk_ptr->item;
7737 + tmp.flags = node->flags;
7738 + strcpy(tmp.disk_name, EVMS_DEV_NODE_PATH);
7739 + strcat(tmp.disk_name, node->name);
7740 + rc = evms_cs_kernel_ioctl(node, EVMS_UPDATE_DEVICE_INFO,
7743 + tmp.total_sectors = node->total_vsectors;
7744 + tmp.hardsect_size = node->hardsector_size;
7745 + tmp.block_size = node->block_size;
7746 + rc = evms_cs_kernel_ioctl(node, HDIO_GETGEO,
7747 + (unsigned long) &geo);
7750 + tmp.geo_sectors = geo.sectors;
7751 + tmp.geo_heads = geo.heads;
7752 + tmp.geo_cylinders = geo.cylinders;
7756 + /* set return code and copy info to userspace */
7758 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7765 +evms_ioctl_cmd_sector_io(void *arg)
7768 +#define MAX_IO_SIZE 128
7769 + u64 io_size, max_io_size = MAX_IO_SIZE;
7771 + struct evms_sector_io_pkt tmp, *user_parms;
7772 + struct evms_logical_node *disk_node = NULL;
7773 + struct evms_list_node *list_node;
7774 + unsigned char *io_buffer;
7780 + user_parms = (struct evms_sector_io_pkt *) arg;
7781 + /* copy user's parameters to kernel space */
7782 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7785 + /* check handle for validity */
7788 + disk_node = DEV_HANDLE_TO_NODE(tmp.disk_handle);
7789 + for (list_node = evms_global_device_list; list_node;
7790 + list_node = list_node->next)
7791 + if (list_node->item == disk_node) {
7799 + /* allocate an io buffer up to 64Kbytes in size */
7799 + if (tmp.sector_count < max_io_size)
7800 + max_io_size = tmp.sector_count;
7803 + /* allocate buffer large enough to max_io_size sectors */
7805 + kmalloc(max_io_size << EVMS_VSECTOR_SIZE_SHIFT,
7808 + max_io_size >>= 1;
7809 + if (!max_io_size) {
7817 + /* perform io with specified disk */
7819 + u64 io_sector_offset, io_remaining;
7821 + u_char *user_buffer_ptr;
7823 + io_remaining = tmp.sector_count;
7824 + io_sector_offset = 0;
7825 + user_buffer_ptr = tmp.buffer_address;
7826 + while (io_remaining) {
7827 + /* compute the io_size for this pass */
7828 + io_size = (io_remaining >= max_io_size) ?
7829 + max_io_size : io_remaining;
7831 + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
7832 + /* for writes, copy a sector from user to kernel */
7833 + if (tmp.io_flag == EVMS_SECTOR_IO_WRITE) {
7834 + /* copy sector from user data buffer */
7835 + if (copy_from_user(io_buffer,
7836 + user_buffer_ptr, io_bytes))
7842 + /* perform IO one sector at a time */
7843 + rc = INIT_IO(disk_node,
7845 + io_sector_offset + tmp.starting_sector,
7846 + io_size, io_buffer);
7851 + if (tmp.io_flag != EVMS_SECTOR_IO_WRITE) {
7852 + /* copy sector to user data buffer */
7853 + if (copy_to_user(user_buffer_ptr,
7854 + io_buffer, io_bytes))
7860 + user_buffer_ptr += io_bytes;
7861 + tmp.buffer_address += io_bytes;
7862 + io_sector_offset += io_size;
7863 + io_remaining -= io_size;
7867 + /* if the sector_buffer was allocated, free it */
7871 + /* copy the status value back to the user */
7873 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7879 +static int user_minor;
7881 +evms_ioctl_cmd_get_minor(void *arg)
7884 + struct evms_user_minor_pkt tmp, *user_parms;
7886 + user_parms = (struct evms_user_minor_pkt *) arg;
7887 + /* copy user's parameters to kernel space */
7888 + if (copy_from_user
7889 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7893 + if (tmp.command == EVMS_FIRST_VOLUME)
7895 + else /* tmp.command == EVMS_NEXT_VOLUME */
7898 + tmp.status = EVMS_VOLUME_INVALID;
7899 + for (; user_minor < MAX_EVMS_VOLUMES; user_minor++) {
7900 + struct evms_logical_volume *lv;
7902 + lv = &evms_logical_volumes[user_minor];
7903 + /* see if any corrupt volumes have been
7904 + * unmounted. If so, clean up the
7905 + * evms_logical_volumes array entry, and
7906 + * don't report the volume to the user.
7908 + if (lv->flags & EVMS_VOLUME_CORRUPT) {
7909 + if (!is_open(user_minor)) {
7910 + /* clear logical volume structure
7911 + * for this volume so it may be
7915 + ("ioctl_get_minor: found unmounted %s volume(%u,%u,%s).\n",
7917 + flags & EVMS_VOLUME_SOFT_DELETED)
7918 + ? "'soft deleted'" : ""),
7919 + EVMS_MAJOR, user_minor, lv->name);
7921 + (" releasing minor(%d) used by volume(%s)!\n",
7922 + user_minor, lv->name);
7928 + if (lv->node || (lv->flags & EVMS_VOLUME_CORRUPT)) {
7929 + tmp.status = EVMS_VOLUME_VALID;
7930 + tmp.minor = user_minor;
7935 + /* copy info to userspace */
7936 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7943 +evms_ioctl_cmd_get_volume_data(void *arg)
7946 + struct evms_volume_data_pkt tmp, *user_parms;
7947 + struct evms_logical_volume *volume = NULL;
7948 + struct evms_logical_node *node = NULL;
7950 + user_parms = (struct evms_volume_data_pkt *) arg;
7951 + /* copy user's parameters to kernel space */
7952 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
7956 + volume = &evms_logical_volumes[tmp.minor];
7957 + node = volume->node;
7962 + tmp.flags = volume->flags;
7963 + strcpy(tmp.volume_name, EVMS_DEV_NODE_PATH);
7964 + strcat(tmp.volume_name, volume->name);
7967 + /* copy return code and info to userspace */
7969 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
7974 +static struct evms_registered_plugin *ioctl_reg_record;
7976 +evms_ioctl_cmd_get_plugin(void *arg)
7979 + struct evms_kernel_plugin_pkt tmp, *user_parms;
7981 + user_parms = (struct evms_kernel_plugin_pkt *) arg;
7982 + /* copy user's parameters to kernel space */
7983 + if (copy_from_user
7984 + (&tmp.command, &user_parms->command, sizeof (tmp.command)))
7988 + /* if the command is not 0, then verify
7989 + * that ioctl_reg_record is pointing to
7990 + * current and valid plugin header.
7992 + if (tmp.command) { /* tmp.command == EVMS_NEXT_PLUGIN */
7993 + struct evms_registered_plugin *tmp_reg_record;
7994 + tmp_reg_record = registered_plugin_head;
7995 + /* search the current plugin list */
7996 + while (tmp_reg_record) {
7997 + if (tmp_reg_record == ioctl_reg_record)
7999 + tmp_reg_record = tmp_reg_record->next;
8001 + /* if the ioctl_reg_record is not in the
8002 + * current list, then start at the beginning.
8004 + if (!tmp_reg_record)
8005 + tmp.command = EVMS_FIRST_PLUGIN;
8008 + if (tmp.command == EVMS_FIRST_PLUGIN)
8009 + /* start at beginning of plugin list */
8010 + ioctl_reg_record = registered_plugin_head;
8011 + else /* tmp.command == EVMS_NEXT_PLUGIN */
8012 + /* continue from current position in list */
8013 + ioctl_reg_record = ioctl_reg_record->next;
8015 + tmp.status = EVMS_PLUGIN_INVALID;
8017 + if (ioctl_reg_record) {
8018 + tmp.id = ioctl_reg_record->plugin->id;
8019 + tmp.version = ioctl_reg_record->plugin->version;
8020 + tmp.status = EVMS_PLUGIN_VALID;
8023 + /* copy info to userspace */
8024 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8031 +evms_ioctl_cmd_plugin_ioctl(struct inode *inode,
8032 + struct file *file,
8033 + unsigned int cmd, unsigned long arg)
8035 + int rc = 0, found = FALSE;
8036 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
8037 + struct evms_registered_plugin *p;
8039 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
8040 + /* copy user's parameters to kernel space */
8041 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8045 + /* search for the specified plugin */
8046 + for (p = registered_plugin_head; p; p = p->next)
8047 + /* check for the specified feature id */
8048 + if (p->plugin->id == tmp.feature_id) {
8050 + /* check that entry point is used */
8051 + if (p->plugin->fops->direct_ioctl)
8052 + rc = DIRECT_IOCTL(p, inode, file, cmd,
8058 + /* was the specified plugin found? */
8059 + if (found == FALSE)
8062 + /* copy the status value back to the user */
8064 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8070 +#define MAX_BUFFER_SIZE 65536
8072 +evms_ioctl_cmd_kernel_partial_csum(void *arg)
8075 + u64 compute_size = MAX_BUFFER_SIZE;
8076 + struct evms_compute_csum_pkt tmp, *user_parms;
8077 + unsigned char *buffer = NULL;
8079 + user_parms = (struct evms_compute_csum_pkt *) arg;
8080 + /* copy user's parameters to kernel space */
8081 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8085 + /* allocate an io buffer up to 64Kbytes in size */
8086 + if (tmp.buffer_size < MAX_BUFFER_SIZE)
8087 + compute_size = tmp.buffer_size;
8089 + /* allocate buffer large enough to hold a single sector */
8090 + buffer = kmalloc(compute_size, GFP_KERNEL);
8095 + /* perform io with specified disk */
8097 + u64 remaining_bytes;
8098 + u_char *user_buffer_ptr;
8099 + unsigned int insum = tmp.insum;
8101 + remaining_bytes = tmp.buffer_size;
8102 + user_buffer_ptr = tmp.buffer_address;
8103 + while (remaining_bytes) {
8104 + /* compute the compute_size for this pass */
8105 + compute_size = (remaining_bytes >= MAX_BUFFER_SIZE) ?
8106 + MAX_BUFFER_SIZE : remaining_bytes;
8108 + /* copy into kernel from user data buffer */
8109 + if (copy_from_user(buffer, user_buffer_ptr,
8114 + /* compute the checksum for this pass */
8115 + tmp.outsum = csum_partial(buffer, tmp.buffer_size,
8117 + /* set up for another possible pass */
8118 + insum = tmp.outsum;
8119 + /* update loop progress variables */
8120 + user_buffer_ptr += compute_size;
8121 + tmp.buffer_address += compute_size;
8122 + remaining_bytes -= compute_size;
8126 + /* if the sector_buffer was allocated, free it */
8130 + /* copy the status value back to the user */
8132 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8138 +#undef MAX_BUFFER_SIZE
8141 +evms_ioctl_cmd_get_bmap(struct inode *inode,
8142 + struct file *file, unsigned int cmd, unsigned long arg)
8145 + struct evms_get_bmap_pkt tmp, *user_parms;
8147 + user_parms = (struct evms_get_bmap_pkt *) arg;
8148 + /* copy user's parameters to kernel space */
8149 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8152 + /* pass the ioctl down the volume stack */
8154 + struct evms_logical_volume *volume;
8156 + volume = &evms_logical_volumes[MINOR(inode->i_rdev)];
8157 + rc = IOCTL(volume->node, inode, file, cmd,
8158 + (unsigned long) &tmp);
8160 + /* copy the status value back to the user */
8162 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8169 +evms_ioctl_cmd_process_notify_event(unsigned long arg)
8171 + int rc = 0, found = FALSE;
8172 + struct evms_notify_pkt tmp, *user_parms;
8173 + struct evms_list_node **list_node = NULL;
8174 + struct evms_event *event = NULL;
8176 + user_parms = (struct evms_notify_pkt *) arg;
8177 + /* copy user's parameters to kernel space */
8178 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8181 + /* check to see if PID has already been registered
8185 + list_node = &evms_global_notify_list;
8186 + while (*list_node) {
8187 + event = (*list_node)->item;
8188 + if ((event->pid == tmp.eventry.pid) &&
8189 + (event->eventid == tmp.eventry.eventid)) {
8193 + list_node = &(*list_node)->next;
8196 + if (tmp.command) { /* tmp.command == EVMS_REGISTER_EVENT */
8197 + /* registration code */
8201 + ("error(%d) pid(%d) already register to receive signal(%d) on event(%d).\n",
8202 + rc, tmp.eventry.pid, tmp.eventry.signo,
8203 + tmp.eventry.eventid);
8205 + /* register this pid/event type */
8206 + event = kmalloc(sizeof (struct evms_event), GFP_KERNEL);
8210 + ("error(%d) allocating event structure.\n",
8213 + memset(event, 0, sizeof (struct evms_event));
8214 + event->pid = tmp.eventry.pid;
8215 + event->eventid = tmp.eventry.eventid;
8216 + event->signo = tmp.eventry.signo;
8217 + rc = evms_cs_add_item_to_list
8218 + (&evms_global_notify_list, event);
8221 + } else { /* tmp.command == EVMS_UNREGISTER_EVENT */
8222 + /* unregistration code */
8226 + ("error(%d) attempting to unregister a non-registered pid(%d) on event(%d).\n",
8227 + rc, tmp.eventry.pid, tmp.eventry.eventid);
8229 + event = (*list_node)->item;
8230 + rc = evms_cs_remove_item_from_list
8231 + (&evms_global_notify_list, event);
8237 + /* copy the status value back to the user */
8239 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8246 +evms_ioctl_cmd_check_mount_status(struct inode *inode, struct file *file,
8250 + struct evms_mount_status_pkt tmp, *user_parms;
8252 + user_parms = (struct evms_mount_status_pkt *) arg;
8253 + /* copy user's parameters to kernel space */
8254 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8259 + (is_mounted(MKDEV(EVMS_MAJOR, tmp.minor))) ? TRUE : FALSE;
8262 + /* copy the status value back to the user */
8264 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8271 +evms_ioctl_cmd_check_open_status(struct inode *inode, struct file *file,
8275 + struct evms_open_status_pkt tmp, *user_parms;
8277 + user_parms = (struct evms_open_status_pkt *) arg;
8278 + /* copy user's parameters to kernel space */
8279 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
8283 + tmp.opens = is_open(tmp.minor);
8286 + /* copy the status value back to the user */
8288 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
8294 +/************************************************/
8295 +/* END -- IOCTL commands -- EVMS specific */
8296 +/************************************************/
8298 +/************************************************/
8299 +/* START -- IOCTL commands -- Volume specific */
8300 +/************************************************/
8302 +/************************************************/
8303 +/* END -- IOCTL commands -- Volume specific */
8304 +/************************************************/
8306 +/************************************************/
8307 +/* START -- IOCTL main */
8308 +/************************************************/
8311 + * Function: evms_ioctl
8313 + * This function is the main ioctl entry point for all of evms.
8317 +evms_ioctl(struct inode *inode,
8318 + struct file *file, unsigned int cmd, unsigned long arg)
8320 + unsigned long minor = 0;
8322 + struct evms_logical_node *node = NULL;
8324 + /* check user access */
8325 + if (!capable(CAP_SYS_ADMIN))
8332 + /* get the minor */
8333 + minor = MINOR(inode->i_rdev);
8335 + ("ioctl: minor(%lu), dir(%d), size(%d), type(%d), nr(%d)\n",
8336 + minor, (cmd >> _IOC_DIRSHIFT) & _IOC_DIRMASK,
8337 + (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK,
8338 + (cmd >> _IOC_TYPESHIFT) & _IOC_TYPEMASK,
8339 + (cmd >> _IOC_NRSHIFT) & _IOC_NRMASK);
8341 + /* insure this minor points to a valid volume */
8343 + node = evms_logical_volumes[minor].node;
8349 + /* process the IOCTL commands */
8352 + /* process all EVMS specific commands */
8354 + case EVMS_GET_IOCTL_VERSION:
8355 + rc = evms_ioctl_cmd_get_ioctl_version((void *)
8358 + case EVMS_GET_VERSION:
8359 + rc = evms_ioctl_cmd_get_version((void *) arg);
8361 + case EVMS_GET_INFO_LEVEL:
8362 + rc = evms_ioctl_cmd_get_info_level((void *)
8365 + case EVMS_SET_INFO_LEVEL:
8366 + rc = evms_ioctl_cmd_set_info_level((void *)
8369 + case EVMS_REDISCOVER_VOLUMES:
8370 + rc = evms_ioctl_cmd_rediscover_volumes(inode,
8375 + case EVMS_GET_LOGICAL_DISK:
8376 + rc = evms_ioctl_cmd_get_logical_disk((void *)
8379 + case EVMS_GET_LOGICAL_DISK_INFO:
8380 + rc = evms_ioctl_cmd_get_logical_disk_info((void
8384 + case EVMS_SECTOR_IO:
8385 + rc = evms_ioctl_cmd_sector_io((void *) arg);
8387 + case EVMS_GET_MINOR:
8388 + rc = evms_ioctl_cmd_get_minor((void *) arg);
8390 + case EVMS_GET_VOLUME_DATA:
8391 + rc = evms_ioctl_cmd_get_volume_data((void *)
8394 + case EVMS_DELETE_VOLUME:
8395 + rc = evms_ioctl_cmd_delete_volume(inode, file,
8398 + case EVMS_GET_PLUGIN:
8399 + rc = evms_ioctl_cmd_get_plugin((void *) arg);
8401 + case EVMS_PLUGIN_IOCTL:
8402 + rc = evms_ioctl_cmd_plugin_ioctl(inode, file,
8405 + case EVMS_COMPUTE_CSUM:
8406 + rc = evms_ioctl_cmd_kernel_partial_csum((void *)
8409 + case EVMS_PROCESS_NOTIFY_EVENT:
8410 + rc = evms_ioctl_cmd_process_notify_event(arg);
8412 + case EVMS_CHECK_MOUNT_STATUS:
8413 + rc = evms_ioctl_cmd_check_mount_status(inode,
8417 + case EVMS_CHECK_OPEN_STATUS:
8418 + rc = evms_ioctl_cmd_check_open_status(inode,
8427 + /* process Volume specific commands */
8429 + /* pick up standard blk ioctls */
8435 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8440 + rc = blk_ioctl(inode->i_rdev, cmd, arg);
8442 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,10)
8445 + /* casting size down to 32-bits until
8446 + * kernel allows return of 64-bit size
8449 + long size = node->total_vsectors;
8451 + ((long *) arg, &size,
8456 + case BLKGETSIZE64:
8458 + u64 size_in_bytes =
8461 + EVMS_VSECTOR_SIZE_SHIFT;
8463 + ((u64 *) arg, &size_in_bytes,
8469 + case EVMS_GET_IOCTL_VERSION:
8470 + rc = evms_ioctl_cmd_get_ioctl_version((void *)
8473 + case EVMS_GET_BMAP:
8474 + rc = evms_ioctl_cmd_get_bmap(inode, file, cmd,
8477 + case EVMS_GET_VOL_STRIPE_INFO:
8479 + struct evms_vol_stripe_info_pkt info;
8483 + EVMS_VSECTOR_SIZE_SHIFT;
8486 + ((struct evms_vol_stripe_info_pkt *)
8487 + arg, &info, sizeof (info)))
8493 + rc = IOCTL(node, inode, file, cmd, arg);
8501 +/************************************************/
8502 +/* END -- IOCTL main */
8503 +/************************************************/
8505 +/************************************************/
8506 +/* START -- CHECK MEDIA CHANGE */
8507 +/************************************************/
8510 +evms_check_media_change(kdev_t dev)
8513 + struct evms_logical_volume *volume = NULL;
8515 + /* check user access */
8516 + if (!capable(CAP_SYS_ADMIN))
8520 + /* get the minor */
8521 + minor = MINOR(dev);
8522 + /* insure this minor points to a valid volume */
8523 + volume = &evms_logical_volumes[minor];
8524 + if (volume->node == NULL) {
8529 + if (volume->flags & EVMS_DEVICE_REMOVABLE) {
8530 + /* check for media change */
8531 + rc = evms_cs_kernel_ioctl(volume->node,
8532 + EVMS_CHECK_MEDIA_CHANGE,
8533 + (unsigned long) NULL);
8536 + ("error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8537 + rc, volume->name);
8544 +/************************************************/
8545 +/* END -- CHECK MEDIA CHANGE */
8546 +/************************************************/
8549 +evms_check_for_device_changes(struct inode *inode, struct file *file)
8551 + int rc = 0, something_changed = 0, i;
8552 + struct evms_rediscover_pkt kernel_rd_pckt = { 0, 0, NULL };
8553 + struct evms_list_node *disk_list = NULL, *lnode, *next_lnode;
8554 + struct evms_logical_node *disk, *new_device_list = NULL;
8555 + struct evms_logical_volume *volume = NULL;
8557 + /* check for new devices
8559 + * put all new devices on the disk list so they
8560 + * will be included in the rediscovery process.
8562 + static void evms_discover_logical_disks(struct evms_logical_node **);
8563 + evms_discover_logical_disks(&new_device_list);
8564 + if (new_device_list) {
8565 + LOG_DETAILS("%s: new devices detected.\n", __FUNCTION__);
8566 + something_changed++;
8567 + /* put these new nodes on the disk list */
8568 + while (new_device_list) {
8569 + disk = new_device_list;
8570 + rc = evms_cs_remove_logical_node_from_list
8571 + (&new_device_list, disk);
8574 + ("%s: error(%d) removing device(%s) from list.\n",
8575 + __FUNCTION__, rc, disk->name);
8577 + rc = evms_cs_add_item_to_list(&disk_list, disk);
8580 + ("%s: error(%d) adding device(%s) from list.\n",
8581 + __FUNCTION__, rc, disk->name);
8586 + /* check all devices for changed removable media
8588 + * scan the global device list and issue check
8589 + * media change on each removable media device.
8590 + * put all removable devices that indicate a
8591 + * media change on the disk list.
8593 + * also scan for devices that have been unplugged
8594 + * or contain corrupt volumes.
8596 + for (lnode = evms_global_device_list; lnode; lnode = lnode->next) {
8597 + int add_to_list = FALSE;
8598 + disk = (struct evms_logical_node *) lnode->item;
8599 + /* only really check removable media devices */
8600 + if (disk->flags & EVMS_DEVICE_REMOVABLE) {
8601 + /* check for media change */
8602 + rc = evms_cs_kernel_ioctl(disk,
8603 + EVMS_CHECK_MEDIA_CHANGE,
8604 + (unsigned long) NULL);
8607 + ("%s: error(%d) doing EVMS_CHECK_MEDIA_CHANGE ioctl on '%s'.\n",
8608 + __FUNCTION__, rc, disk->name);
8609 + } else if (rc == 1) {
8610 + add_to_list = TRUE;
8613 + /* check for device that where present
8614 + * before but are gone (unplugged
8615 + * device or unloaded driver).
8617 + rc = IOCTL(disk, inode, file,
8618 + EVMS_CHECK_DEVICE_STATUS, (ulong) NULL);
8621 + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8622 + rc, volume->name);
8624 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
8625 + add_to_list = TRUE;
8627 + if (add_to_list) {
8628 + something_changed++;
8629 + rc = evms_cs_add_item_to_list(&disk_list, disk);
8632 + /* log a statement that we detected changed media.
8635 + LOG_DETAILS("%s: media change detected.\n", __FUNCTION__);
8638 + /* check for volumes with removed removable media.
8639 + * mark the volumes that reside on changed media.
8641 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8642 + volume = &evms_logical_volumes[i];
8643 + if (!volume->node)
8645 + if (!(volume->flags & EVMS_DEVICE_REMOVABLE))
8647 + if (evms_check_media_change(MKDEV(EVMS_MAJOR, i)) <= 0)
8649 + /* remember which volumes have changed media */
8650 + volume->flags |= EVMS_MEDIA_CHANGED;
8651 + something_changed++;
8654 + /* check for removed devices */
8655 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8657 + volume = &evms_logical_volumes[i];
8658 + if (!volume->node)
8660 + /* check for device status */
8662 + rc = IOCTL(volume->node, inode, file,
8663 + EVMS_CHECK_DEVICE_STATUS, (ulong) & status);
8666 + ("error(%d) doing EVMS_CHECK_DEVICE_STATUS ioctl on '%s'.\n",
8667 + rc, volume->name);
8670 + if (!(status & EVMS_DEVICE_UNAVAILABLE)) {
8673 + /* remember which volumes have changed media */
8674 + volume->flags |= EVMS_DEVICE_UNPLUGGED;
8675 + something_changed++;
8678 + /* do we have some work to do? */
8679 + if (something_changed) {
8680 + /* check for volumes to be deleted */
8681 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8682 + struct evms_quiesce_vol_pkt qv;
8684 + volume = &evms_logical_volumes[i];
8685 + if (!volume->node)
8687 + /* only proceed on volumes with:
8689 + * hot-unplugged devices,
8690 + * & partial volumes
8692 + if (!(volume->flags &
8693 + (EVMS_MEDIA_CHANGED |
8694 + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8696 + /* gather the disk's needing to be
8697 + * rediscovered to rebuild this
8700 + * this will locate other disks that
8701 + * the volume resides on that don't
8702 + * indicate media change.
8704 + rc = evms_cs_kernel_ioctl(volume->node,
8705 + EVMS_GET_DISK_LIST,
8706 + (unsigned long) &disk_list);
8709 + ("%s: error(%d) retrieving underlying disk list for '%s', skipping ...\n",
8710 + __FUNCTION__, rc, volume->name);
8713 + /* quiesce all the changed volumes
8714 + * prior to being deleted.
8716 + qv.command = 1; // quiesce
8718 + qv.status = 0; // reset status
8720 + rc = evms_quiesce_volume(volume, inode, file, &qv);
8723 + ("%s: error(%d) attempting to quiesce '%s%s'.\n",
8724 + __FUNCTION__, rc, EVMS_DEV_NODE_PATH,
8729 + /* we need to revalidate all the changed
8730 + * media. this is accomplished by issuing
8731 + * the revalidate disk ioctl to each device
8732 + * with changed media. the device manager
8733 + * remembers which devices indicated
8734 + * media changed (set by check media
8735 + * changed ioctl issued earlier), and will
8736 + * only issue the revalidate disk ioctl to
8737 + * those disks one time.
8740 + * this needs to be done BEFORE deleting
8741 + * the volumes because deleting the
8742 + * last segment on disk will cause the
8743 + * associated disk node to freed, and we
8744 + * will not be able to issue the
8745 + * revalidate disk ioctl after that.
8747 + for (lnode = disk_list; lnode; lnode = lnode->next) {
8748 + disk = (struct evms_logical_node *) lnode->item;
8749 + /* only really do removable media devices */
8750 + if (disk->flags & EVMS_MEDIA_CHANGED) {
8751 + /* go revalidate the change media */
8752 + rc = evms_cs_kernel_ioctl(disk,
8753 + EVMS_REVALIDATE_DISK,
8754 + (unsigned long) NULL);
8757 + ("%s: error(%d) attempting to revalidate '%s%s'.\n",
8759 + EVMS_DEV_NODE_PATH, volume->name);
8764 + /* delete all the affected volumes */
8765 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
8766 + struct evms_delete_vol_pkt dv;
8768 + volume = &evms_logical_volumes[i];
8769 + if (!volume->node)
8771 + /* only proceed on volumes with:
8773 + * hot-unplugged devices,
8774 + * & partial volumes
8776 + if (!(volume->flags &
8777 + (EVMS_MEDIA_CHANGED |
8778 + EVMS_VOLUME_PARTIAL | EVMS_DEVICE_UNPLUGGED)))
8780 + /* only delete quiesced volumes */
8781 + if (!volume->quiesced)
8783 + /* delete the volume from memory.
8784 + * do a 'soft' delete if volume
8785 + * is mounted, and 'hard' delete
8788 + * NOTE: the delete operation will
8789 + * clear the bits in the flags field.
8791 + dv.command = is_open(i);
8794 + rc = evms_delete_volume(volume, &dv);
8797 + /* at this point all devices indicating
8798 + * media change that had volumes on them
8799 + * should be gone. however, we could still
8800 + * have devices indicating media change
8801 + * that had no volumes on them in the disk
8802 + * list. we need to delete these devices
8803 + * from kernel memory and the global device
8806 + for (lnode = evms_global_device_list; lnode; lnode = next_lnode) {
8807 + next_lnode = lnode->next;
8809 + disk = (struct evms_logical_node *) lnode->item;
8810 + if (disk->flags & EVMS_MEDIA_CHANGED) {
8811 + rc = DELETE(disk);
8815 + /* all the devices that indicated media
8816 + * change should be gone, both from kernel
8817 + * memory and global device list. we now
8818 + * need to remove any references to these
8819 + * devices from the disk list.
8821 + * when removable media is installed, it
8822 + * will get detected in the device manager's
8823 + * rediscovery as a new device and added to
8824 + * the discover list.
8826 + for (lnode = disk_list; lnode; lnode = next_lnode) {
8827 + struct evms_list_node *glnode;
8828 + int lnode_still_there;
8830 + next_lnode = lnode->next;
8832 + lnode_still_there = FALSE;
8833 + for (glnode = evms_global_device_list;
8834 + glnode; glnode = glnode->next) {
8835 + if (glnode->item == lnode->item) {
8836 + lnode_still_there = TRUE;
8840 + if (lnode_still_there == FALSE) {
8841 + rc = evms_cs_remove_item_from_list(&disk_list,
8845 + ("%s: error(%d) attempting to remove item(%p) from disk_list(%p).\n",
8846 + __FUNCTION__, rc, lnode->item,
8852 + /* build the in-kernel rediscover packet */
8854 + /* allocate the space for the drive_array in
8855 + * the struct evms_rediscover_pkt packet. to do this
8856 + * we need to count the number of disk nodes,
8857 + * then allocate the necessary space.
8859 + /* count the disk nodes */
8860 + for (lnode = disk_list; lnode; lnode = lnode->next)
8861 + kernel_rd_pckt.drive_count++;
8862 + /* allocate the space */
8863 + if (kernel_rd_pckt.drive_count) {
8864 + kernel_rd_pckt.drive_array =
8865 + kmalloc(kernel_rd_pckt.drive_count *
8866 + sizeof (u64), GFP_KERNEL);
8867 + if (!kernel_rd_pckt.drive_array) {
8870 + ("%s: error(%d) allocating rediscover drive array.\n",
8871 + __FUNCTION__, rc);
8874 + /* populate the drive array
8876 + * this also frees the disk_list which is useful
8877 + * if we had an error allocating the drive array.
8879 + for (i = 0, lnode = disk_list; lnode; lnode = next_lnode, i++) {
8880 + next_lnode = lnode->next;
8882 + /* remove this disk from the disk list */
8883 + disk = (struct evms_logical_node *) lnode->item;
8884 + rc = evms_cs_remove_item_from_list(&disk_list, disk);
8886 + /* add this disk to rediscover
8889 + kernel_rd_pckt.drive_array[i] =
8890 + NODE_TO_DEV_HANDLE(disk);
8893 + /* perform the rediscovery operation */
8895 + static int evms_discover_volumes(struct
8896 + evms_rediscover_pkt *);
8897 + rc = evms_discover_volumes(&kernel_rd_pckt);
8898 + if (kernel_rd_pckt.drive_count) {
8899 + kfree(kernel_rd_pckt.drive_array);
8902 + LOG_DETAILS("%s: rediscover completed.\n", __FUNCTION__);
8908 +/************************************************/
8909 +/* START -- REVALIDATE DISK */
8910 +/************************************************/
8913 +evms_revalidate_disk(kdev_t dev)
8916 + struct evms_logical_volume *volume = NULL;
8918 + /* check user access */
8919 + if (!capable(CAP_SYS_ADMIN))
8923 + /* get the minor */
8924 + minor = MINOR(dev);
8925 + /* insure this minor points to a valid volume */
8926 + volume = &evms_logical_volumes[minor];
8927 + if (volume->node == NULL) {
8932 + /* go revalidate the change media */
8933 + rc = evms_cs_kernel_ioctl(volume->node,
8934 + EVMS_REVALIDATE_DISK,
8935 + (unsigned long) NULL);
8940 +/************************************************/
8941 +/* END -- REVALIDATE DISK */
8942 +/************************************************/
8944 +/************************************************/
8945 +/* START -- OPEN */
8946 +/************************************************/
8949 +evms_open(struct inode *inode, struct file *file)
8951 + int rc = 0, minor = 0;
8952 + struct evms_logical_volume *volume = NULL;
8954 + /* check user access */
8955 + if (!capable(CAP_SYS_ADMIN))
8961 + rc = evms_check_for_device_changes(inode, file);
8963 + /* get the minor */
8964 + minor = MINOR(inode->i_rdev);
8966 + /* insure this minor points to a valid volume */
8967 + volume = &evms_logical_volumes[minor];
8968 + if (volume->node == NULL) {
8973 + /* go "open" the volume */
8974 + if (!rc && minor) {
8975 + atomic_inc(&volume->opens);
8976 + rc = IOCTL(volume->node, inode, file,
8977 + EVMS_OPEN_VOLUME, (unsigned long) NULL);
8980 + ("error(%d) doing EVMS_OPEN_VOLUME ioctl to '%s'.\n",
8981 + rc, volume->name);
8982 + atomic_dec(&volume->opens);
8988 +/************************************************/
8990 +/************************************************/
8992 +/************************************************/
8993 +/* START -- RELEASE */
8994 +/************************************************/
8997 +evms_release(struct inode *inode, struct file *file)
8999 + int rc = 0, minor = 0;
9000 + struct evms_logical_volume *volume = NULL;
9005 + /* get the minor */
9006 + minor = MINOR(inode->i_rdev);
9008 + /* insure this minor points to a valid volume */
9009 + volume = &evms_logical_volumes[minor];
9010 + if (volume->node == NULL) {
9015 + /* go "close" the volume */
9016 + if (!rc && minor) {
9017 + rc = IOCTL(volume->node, inode, file,
9018 + EVMS_CLOSE_VOLUME, (unsigned long) NULL);
9021 + ("error(%d) doing EVMS_CLOSE_VOLUME ioctl to '%s'.\n",
9022 + rc, volume->name);
9024 + atomic_dec(&volume->opens);
9030 +/************************************************/
9031 +/* END -- RELEASE */
9032 +/************************************************/
9034 +static struct block_device_operations evms_fops = {
9035 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,14)
9036 + owner:THIS_MODULE,
9039 + release:evms_release,
9041 + check_media_change:evms_check_media_change,
9042 + revalidate:evms_revalidate_disk
9045 +/**********************************************************/
9046 +/* END -- FOPS functions definitions */
9047 +/**********************************************************/
9049 +/**********************************************************/
9050 +/* START -- RUNTIME support functions */
9051 +/**********************************************************/
9054 +evms_do_request_fn(request_queue_t * q)
9056 + LOG_WARNING("This function should not be called.\n");
9060 +static request_queue_t *
9061 +evms_find_queue(kdev_t dev)
9063 + request_queue_t *rq = NULL;
9064 + struct evms_logical_volume *volume;
9066 + volume = &evms_logical_volumes[MINOR(dev)];
9068 + rq = &volume->request_queue;
9074 + * Function: evms_make_request_fn
9078 +evms_make_request_fn(request_queue_t * q, int rw, struct buffer_head *bh)
9080 + struct evms_logical_volume *volume;
9082 + volume = &evms_logical_volumes[MINOR(bh->b_rdev)];
9083 + wait_event(volume->wait_queue, (!volume->quiesced));
9084 + if (volume->node) {
9088 + atomic_inc(&volume->requests_in_progress);
9089 + R_IO(volume->node, bh);
9090 + atomic_dec(&volume->requests_in_progress);
9093 + atomic_inc(&volume->requests_in_progress);
9094 + W_IO(volume->node, bh);
9095 + atomic_dec(&volume->requests_in_progress);
9098 + buffer_IO_error(bh);
9102 + LOG_ERROR("request for unknown logical volume [minor(%d)].\n",
9103 + MINOR(bh->b_rdev));
9104 + buffer_IO_error(bh);
9109 +/**********************************************************/
9110 +/* END -- RUNTIME support functions */
9111 +/**********************************************************/
9113 +/**********************************************************/
9114 +/* START -- INIT/DISCOVERY support functions */
9115 +/**********************************************************/
9119 +display_discover_list(struct evms_logical_node *discover_list, char *text)
9121 + struct evms_logical_node *node;
9123 + LOG_DETAILS("discover list:(%s)\n", text);
9124 + for (node = discover_list; node; node = node->next) {
9125 + LOG_DETAILS("\nnode info:\n");
9126 + LOG_DETAILS("node.....................(0x%p)\n", node);
9127 + LOG_DETAILS("name.....................(%s)\n", node->name);
9128 + LOG_DETAILS("plugin id................(0x%x)\n",
9129 + node->plugin->id);
9130 + LOG_DETAILS("size.....................("PFU64")\n",
9131 + node->total_vsectors);
9132 + LOG_DETAILS("flags....................(0x%x)\n", node->flags);
9133 + LOG_DETAILS("iflags...................(0x%x)\n", node->iflags);
9134 + LOG_DETAILS("sector size..............(%d)\n",
9135 + node->hardsector_size);
9136 + LOG_DETAILS("block size...............(%d)\n",
9137 + node->block_size);
9138 + LOG_DETAILS("sys id...................(0x%x)\n",
9141 + if (node->feature_header) {
9142 + struct evms_feature_header *fh;
9144 + fh = node->feature_header;
9145 + LOG_DETAILS("\nfeature header:\n");
9146 + LOG_DETAILS("signature................(0x%x)\n",
9148 + LOG_DETAILS("crc......................(0x%x)\n",
9150 + LOG_DETAILS("feature header version...(%d.%d.%d)\n",
9151 + fh->version.major, fh->version.minor,
9152 + fh->version.patchlevel);
9153 + LOG_DETAILS("engine version...........(%d.%d.%d)\n",
9154 + fh->engine_version.major,
9155 + fh->engine_version.minor,
9156 + fh->engine_version.patchlevel);
9157 + LOG_DETAILS("flags....................(0x%x)\n",
9159 + LOG_DETAILS("feature id...............(0x%x)\n",
9161 + LOG_DETAILS("sequence#................("PFU64")\n",
9162 + fh->sequence_number);
9163 + LOG_DETAILS("alignment padding........("PFU64")\n",
9164 + fh->alignment_padding);
9165 + LOG_DETAILS("feature data1 lsn........("PFU64")\n",
9166 + fh->feature_data1_start_lsn);
9167 + LOG_DETAILS("feature data1 size.......("PFU64")\n",
9168 + fh->feature_data1_size);
9169 + LOG_DETAILS("feature data2 lsn........("PFU64")\n",
9170 + fh->feature_data2_start_lsn);
9171 + LOG_DETAILS("feature data2 size.......("PFU64")\n",
9172 + fh->feature_data2_size);
9173 + LOG_DETAILS("volume sn................("PFU64")\n",
9174 + fh->volume_serial_number);
9175 + LOG_DETAILS("volume minor#............(%d)\n",
9176 + fh->volume_system_id);
9177 + LOG_DETAILS("object depth.............(%d)\n",
9178 + fh->object_depth);
9179 + LOG_DETAILS("object name..............(%s)\n",
9181 + LOG_DETAILS("volume name..............(%s)\n",
9185 + if (node->volume_info) {
9186 + struct evms_volume_info *vi;
9188 + vi = node->volume_info;
9189 + LOG_DETAILS("\nvolume info:\n");
9190 + LOG_DETAILS("volume name..............(%s)\n",
9192 + LOG_DETAILS("volume sn................("PFU64")\n",
9194 + LOG_DETAILS("volume minor#............(%d)\n",
9195 + vi->volume_minor);
9198 + if (discover_list) {
9199 + LOG_DETAILS("\n");
9205 + * Function: evms_discover_logical_disks
9206 + * Description: Construct the logical disk list by calling all registered device managers.
9209 +evms_discover_logical_disks(struct evms_logical_node **disk_list)
9211 + struct evms_registered_plugin *p;
9212 + LOG_EXTRA("discovering logical disks...\n");
9213 + for (p = registered_plugin_head; p; p = p->next) {
9214 + if (GetPluginType(p->plugin->id) == EVMS_DEVICE_MANAGER) {
9215 + DISCOVER(p, disk_list);
9221 + * Function: evms_discover_logical_partitions
9222 + * Description: Construct the logical partition list by calling all registered partition managers.
9225 +evms_discover_logical_partitions(struct evms_logical_node **discover_list)
9229 + struct evms_registered_plugin *p;
9230 + LOG_EXTRA("discovering logical partitions...\n");
9233 + for (p = registered_plugin_head; p; p = p->next) {
9234 + if (GetPluginType(p->plugin->id) ==
9235 + EVMS_SEGMENT_MANAGER) {
9236 + rc = DISCOVER(p, discover_list);
9237 + /* RC > 0 means the plugin
9238 + * added something to the
9239 + * discover list. This also
9240 + * means we must loop thru
9241 + * these plugins another time.
9242 + * RC == 0 means nothing was
9243 + * added to the discover list
9245 + * RC < 0 means the plugin
9246 + * encountered some error and
9247 + * nothing was added to the list.
9248 + * NOTE: If a plugin has both
9249 + * added something new to the
9250 + * discover list and encountered
9251 + * an error, RC > 0 must be
9258 + } while (done == FALSE);
9260 + /* send the end of discovery signal to each
9261 + * partition manager plugin.
9263 + for (p = registered_plugin_head; p; p = p->next)
9264 + if (GetPluginType(p->plugin->id) == EVMS_SEGMENT_MANAGER)
9265 + if (p->plugin->fops->end_discover)
9266 + rc = END_DISCOVER(p, discover_list);
9270 + * Function: evms_discover_volume_groups
9271 + * Description: Find volume groups within the logical partitions list
9274 +evms_discover_volume_groups(struct evms_logical_node **discover_list)
9278 + struct evms_registered_plugin *p;
9279 + LOG_EXTRA("discovering logical volume groups...\n");
9282 + for (p = registered_plugin_head; p; p = p->next) {
9283 + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER) {
9284 + rc = DISCOVER(p, discover_list);
9285 + /* RC > 0 means the plugin
9286 + * added something to the
9287 + * discover list. This also
9288 + * means we must loop thru
9289 + * these plugins another time.
9290 + * RC == 0 means nothing was
9291 + * added to the discover list
9293 + * RC < 0 means the plugin
9294 + * encountered some error and
9295 + * nothing was added to the list.
9296 + * NOTE: If a plugin has both
9297 + * added something new to the
9298 + * discover list and encountered
9299 + * an error, RC > 0 must be
9306 + } while (done == FALSE);
9308 + /* send the end of discovery signal to each volume
9311 + for (p = registered_plugin_head; p; p = p->next)
9312 + if (GetPluginType(p->plugin->id) == EVMS_REGION_MANAGER)
9313 + if (p->plugin->fops->end_discover)
9314 + rc = END_DISCOVER(p, discover_list);
9319 + * convert all the feature header fields into cpu native format
9320 + * from the on-disk Little Endian format. From this point forward
9321 + * all plugins can deal with feature headers natively.
9324 +le_feature_header_to_cpu(struct evms_feature_header *fh)
9326 + fh->signature = le32_to_cpup(&fh->signature);
9327 + fh->crc = le32_to_cpup(&fh->crc);
9328 + fh->version.major = le32_to_cpup(&fh->version.major);
9329 + fh->version.minor = le32_to_cpup(&fh->version.minor);
9330 + fh->version.patchlevel = le32_to_cpup(&fh->version.patchlevel);
9331 + fh->engine_version.major = le32_to_cpup(&fh->engine_version.major);
9332 + fh->engine_version.minor = le32_to_cpup(&fh->engine_version.minor);
9333 + fh->engine_version.patchlevel =
9334 + le32_to_cpup(&fh->engine_version.patchlevel);
9335 + fh->flags = le32_to_cpup(&fh->flags);
9336 + fh->feature_id = le32_to_cpup(&fh->feature_id);
9337 + fh->sequence_number = le64_to_cpup(&fh->sequence_number);
9338 + fh->alignment_padding = le64_to_cpup(&fh->alignment_padding);
9339 + fh->feature_data1_start_lsn =
9340 + le64_to_cpup(&fh->feature_data1_start_lsn);
9341 + fh->feature_data1_size = le64_to_cpup(&fh->feature_data1_size);
9342 + fh->feature_data2_start_lsn =
9343 + le64_to_cpup(&fh->feature_data2_start_lsn);
9344 + fh->feature_data2_size = le64_to_cpup(&fh->feature_data2_size);
9345 + fh->volume_serial_number = le64_to_cpup(&fh->volume_serial_number);
9346 + fh->volume_system_id = le32_to_cpup(&fh->volume_system_id);
9347 + fh->object_depth = le32_to_cpup(&fh->object_depth);
9351 +edef_load_feature_header(struct evms_logical_node *node)
9353 + int i, rc = 0, rc_array[2] = { 0, 0 };
9354 + unsigned long size_in_bytes;
9355 + u64 size_in_sectors, starting_sector = 0;
9356 + struct evms_feature_header *fh = NULL, *fh1 = NULL, *fh2 = NULL;
9357 + char *location_name = NULL;
9358 + struct evms_version version = {
9359 + EVMS_FEATURE_HEADER_MAJOR,
9360 + EVMS_FEATURE_HEADER_MINOR,
9361 + EVMS_FEATURE_HEADER_PATCHLEVEL
9364 + if (!node->feature_header) {
9365 + size_in_sectors = evms_cs_size_in_vsectors(sizeof (*fh));
9366 + size_in_bytes = size_in_sectors << EVMS_VSECTOR_SIZE_SHIFT;
9367 + fh1 = kmalloc(size_in_bytes, GFP_KERNEL);
9369 + fh2 = kmalloc(size_in_bytes, GFP_KERNEL);
9378 + for (i = 0; i < 2; i++) {
9381 + node->total_vsectors - size_in_sectors;
9383 + location_name = evms_primary_string;
9385 + starting_sector--;
9387 + location_name = evms_secondary_string;
9389 + /* read header into buffer */
9390 + rc = INIT_IO(node,
9391 + 0, starting_sector, size_in_sectors, fh);
9394 + ("error(%d) probing for %s feature header(at "PFU64") on '%s'.\n",
9395 + rc, location_name, starting_sector,
9400 + /* validate header signature */
9401 + if (cpu_to_le32(fh->signature) !=
9402 + EVMS_FEATURE_HEADER_SIGNATURE) {
9407 + /* validate header CRC */
9408 + if (fh->crc != EVMS_MAGIC_CRC) {
9409 + u32 org_crc, final_crc;
9410 + org_crc = cpu_to_le32(fh->crc);
9413 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, fh,
9415 + if (final_crc != org_crc) {
9417 + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature header(at "PFU64") on '%s'.\n",
9418 + org_crc, final_crc, location_name,
9419 + starting_sector, node->name);
9426 + ("CRC disabled in %s feature header(at "PFU64") on '%s'.\n",
9427 + location_name, starting_sector,
9430 + /* convert the feature header from the
9431 + * on-disk format (Little Endian) to
9432 + * native cpu format.
9434 + le_feature_header_to_cpu(fh);
9435 + /* verify the system data version */
9436 + rc = evms_cs_check_version(&version, &fh->version);
9439 + ("error: obsolete version(%d,%d,%d) in %s feature header on '%s'.\n",
9440 + fh->version.major, fh->version.minor,
9441 + fh->version.patchlevel, location_name,
9447 + /* getting same return code for both copies? */
9448 + if (rc_array[0] == rc_array[1]) {
9450 + /* if no errors on both copies,
9451 + * check the sequence numbers.
9452 + * use the highest sequence number.
9455 + /* compare sequence numbers */
9456 + if (fh1->sequence_number ==
9457 + fh2->sequence_number) {
9461 + ("%s feature header sequence number("PFU64") mismatches %s feature header sequence number("PFU64") on '%s'!\n",
9462 + evms_primary_string,
9463 + fh1->sequence_number,
9464 + evms_secondary_string,
9465 + fh2->sequence_number, node->name);
9466 + if (fh1->sequence_number >
9467 + fh2->sequence_number) {
9470 + evms_primary_string;
9471 + /* indicate bad sequence number of secondary */
9476 + evms_secondary_string;
9477 + /* indicate bad sequence number of primary */
9482 + /* getting different return codes for each copy */
9484 + /* either primary or secondary copy is
9485 + * valid, so use the valid copy.
9487 + if ((rc_array[0] == 0) || (rc_array[1] == 0)) {
9488 + char *warn_name = NULL;
9490 + /* indicate success */
9492 + /* set variables based on which copy is valid */
9493 + if (rc_array[0] == 0) {
9494 + /* use primary (rear) copy if its good */
9496 + location_name = evms_primary_string;
9497 + warn_name = evms_secondary_string;
9499 + /* use secondary (front) copy if its good */
9501 + location_name = evms_secondary_string;
9502 + warn_name = evms_primary_string;
9504 + /* warn the user about the invalid copy */
9506 + ("warning: error(%d) probing/verifying the %s feature header on '%s'.\n",
9507 + rc_array[0] + rc_array[1], warn_name, node->name);
9509 + /* both copies had a different error,
9510 + * and one was a fatal error, so
9511 + * indicate fatal error.
9513 + if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
9517 + /* on error, set fh to NULL */
9521 + /* deallocate metadata buffers appropriately */
9527 + /* save validated feature header pointer */
9529 + node->feature_header = fh;
9530 + if (rc_array[0] != rc_array[1]) {
9532 + ("using %s feature header on '%s'.\n",
9533 + location_name, node->name);
9537 + /* if no signature found, adjust return code */
9538 + if (rc == -ENODATA) {
9540 + LOG_DEBUG("no feature header found on '%s'.\n",
9548 +edef_find_first_features(struct evms_logical_node **discover_list)
9551 + struct evms_logical_node *node, *tmp_list_head;
9553 + tmp_list_head = *discover_list;
9554 + *discover_list = NULL;
9556 + while (tmp_list_head) {
9557 + struct evms_list_node **evms_node;
9559 + node = tmp_list_head;
9560 + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
9565 + /* check for duplicate pointers
9566 + * search for the node in global list
9569 + evms_cs_lookup_item_in_list(&evms_global_feature_node_list,
9571 + /* already present? */
9573 + /* yes, already present */
9574 + rc = -ENODATA; /* dont process this node further */
9575 + LOG_DETAILS("deleting duplicate reference to '%s'.\n",
9577 + /* forget this node */
9580 + /* load the feature header if present */
9581 + rc = edef_load_feature_header(node);
9582 + /* This node have a feature header ?
9583 + * it won't be if there is no header to load
9585 + * there was a fatal error attempting to read it.
9587 + if (node->feature_header) {
9588 + /* check for object flag */
9589 + if (node->feature_header->flags &
9590 + EVMS_VOLUME_DATA_OBJECT) {
9592 + ("object detected, deleting '%s'.\n",
9596 + /* check for stop-data flag */
9597 + if (node->feature_header->flags &
9598 + EVMS_VOLUME_DATA_STOP) {
9600 + ("stop data detected, deleting '%s'.\n",
9604 + /* we have a valid feature header.
9605 + * initialize appropriate node fields
9606 + * to indicate this.
9608 + node->flags |= EVMS_VOLUME_FLAG;
9609 + node->iflags |= EVMS_FEATURE_BOTTOM;
9610 + node->volume_info =
9612 + (struct evms_volume_info),
9614 + if (node->volume_info) {
9618 + memset(node->volume_info, 0,
9621 + evms_volume_info));
9622 + node->volume_info->volume_sn =
9623 + node->feature_header->
9624 + volume_serial_number;
9625 + node->volume_info->
9627 + node->feature_header->
9629 + strcpy(node->volume_info->
9631 + node->feature_header->
9633 + /* register(add) node to
9634 + * the global list.
9636 + rc = evms_cs_add_item_to_list
9637 + (&evms_global_feature_node_list,
9645 + /* if any errors, delete the node */
9651 + /* on successful processing of this node
9652 + * place it back on the discover list.
9654 + evms_cs_add_logical_node_to_list(discover_list, node);
9659 +/* These define describe the node types that can be isolated. */
9660 +#define ISOLATE_ASSOCIATIVE_FEATURES 0
9661 +#define ISOLATE_COMPATIBILITY_VOLUMES 1
9662 +#define ISOLATE_EVMS_VOLUMES 2
9663 +#define ISOLATE_EVMS_VOLUME_SERIAL_NUMBER 3
9664 +#define ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH 4
9666 +edef_isolate_nodes_by_type(unsigned int type,
9667 + struct evms_logical_node **src_list,
9668 + struct evms_logical_node **trg_list,
9669 + u32 compare32, u64 compare64)
9671 + struct evms_logical_node *node, *next_node;
9672 + int rc = 0, found_node;
9673 + struct evms_feature_header *fh = NULL;
9675 + for (node = *src_list; node; node = next_node) {
9676 + next_node = node->next;
9678 + if (node->feature_header)
9679 + fh = node->feature_header;
9680 + found_node = FALSE;
9682 + case ISOLATE_ASSOCIATIVE_FEATURES:
9684 + if (GetPluginType(fh->feature_id) ==
9685 + EVMS_ASSOCIATIVE_FEATURE)
9686 + found_node = TRUE;
9689 + case ISOLATE_COMPATIBILITY_VOLUMES:
9690 + if (!(node->flags & EVMS_VOLUME_FLAG))
9691 + found_node = TRUE;
9693 + case ISOLATE_EVMS_VOLUMES:
9694 + if (node->flags & EVMS_VOLUME_FLAG)
9695 + found_node = TRUE;
9697 + /* EVMS volumes with same serial # */
9698 + case ISOLATE_EVMS_VOLUME_SERIAL_NUMBER:
9699 + if (node->volume_info->volume_sn == compare64)
9700 + found_node = TRUE;
9702 + case ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH:
9704 + if (fh->object_depth == compare64)
9705 + if (fh->feature_id == compare32)
9706 + found_node = TRUE;
9709 + if (found_node == TRUE) {
9710 + rc = evms_cs_remove_logical_node_from_list(src_list,
9714 + rc = evms_cs_add_logical_node_to_list(trg_list, node);
9723 +edef_apply_feature(struct evms_logical_node *node,
9724 + struct evms_logical_node **volume_node_list)
9726 + struct evms_registered_plugin *p;
9729 + for (p = registered_plugin_head; p; p = p->next) {
9730 + if (p->plugin->id == node->feature_header->feature_id) {
9731 + rc = DISCOVER(p, volume_node_list);
9739 +edef_get_feature_plugin_header(u32 id, struct evms_plugin_header **header)
9742 + struct evms_registered_plugin *p;
9744 + for (p = registered_plugin_head; p; p = p->next) {
9745 + if (p->plugin->id == id) {
9746 + *header = p->plugin;
9752 + LOG_SERIOUS("no plugin loaded for feature id(0x%x)\n", id);
9757 +typedef struct evms_volume_build_info_s {
9759 + int feature_header_count;
9760 + int feature_count;
9761 + int associative_feature_count;
9763 + struct evms_plugin_header *plugin;
9764 + struct evms_logical_node *feature_node_list;
9765 +} evms_volume_build_info_t;
9768 + * edef_evaluate_volume_node_list:
9770 + * 1) put all nodes from feature list back on volume list
9771 + * 2) loads the node's feature headers
9772 + * 3) counts the node list's entries
9773 + * 4) builds the feature node list
9774 + * 5) counts the feature headers for associative features
9775 + * 6) sets feature count to >1 if >1 features to be processed
9778 +edef_evaluate_volume_node_list(struct evms_logical_node **volume_node_list,
9779 + evms_volume_build_info_t * vbi,
9780 + int volume_complete)
9783 + struct evms_logical_node *node;
9786 + vbi->feature_count =
9787 + vbi->associative_feature_count = vbi->max_depth = 0;
9788 + vbi->plugin = NULL;
9790 + /* put all feature nodes back on the volume list */
9791 + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9792 + &vbi->feature_node_list,
9793 + volume_node_list, 0, 0);
9797 + /* load all the feature headers */
9798 + if (!volume_complete) {
9799 + for (node = *volume_node_list; node; node = node->next) {
9800 + rc = edef_load_feature_header(node);
9806 + /* find the 1st max depth object:
9807 + * record the depth
9808 + * record the plugin
9810 + for (node = *volume_node_list; node; node = node->next) {
9811 + struct evms_plugin_header *plugin;
9812 + struct evms_feature_header *fh = node->feature_header;
9814 + /* count the nodes */
9815 + vbi->node_count++;
9817 + /* no feature header found, continue to next node */
9821 + /* check the depth */
9822 + if (fh->object_depth > vbi->max_depth) {
9823 + /* record new max depth */
9824 + vbi->max_depth = fh->object_depth;
9825 + /* find the plugin header for this feature id */
9826 + rc = edef_get_feature_plugin_header(fh->feature_id,
9830 + /* check for >1 plugins */
9831 + if (vbi->plugin != plugin) {
9832 + vbi->feature_count++;
9833 + vbi->plugin = plugin;
9836 + /* check for "associative" feature indicator */
9837 + if (GetPluginType(vbi->plugin->id) == EVMS_ASSOCIATIVE_FEATURE)
9838 + vbi->associative_feature_count++;
9840 + /* build a list of max depth nodes for this feature */
9841 + if (vbi->max_depth) {
9842 + rc = edef_isolate_nodes_by_type
9843 + (ISOLATE_EVMS_NODES_BY_FEATURE_AND_DEPTH, volume_node_list,
9844 + &vbi->feature_node_list, vbi->plugin->id, vbi->max_depth);
9848 + return (-ENODATA);
9849 + if (!vbi->feature_node_list)
9850 + return (-ENODATA);
9856 +/* function: edef_check_feature_conditions
9858 + * This routine verifies the state of volume based on the features
9859 + * headers and nodes in the current discovery list. All detected
9860 + * errors are considered fatal.
9863 +edef_check_feature_conditions(evms_volume_build_info_t * vbi)
9867 + if (vbi->associative_feature_count) {
9868 + if (vbi->node_count > 1) {
9869 + rc = -EVMS_VOLUME_FATAL_ERROR;
9871 + ("associative ERROR: > 1 nodes(%d) remaining to be processed!\n",
9873 + } else if (vbi->max_depth != 1) {
9874 + rc = -EVMS_VOLUME_FATAL_ERROR;
9876 + ("associative ERROR: associative feature found at node depth("PFU64") != 1!\n",
9879 + rc = -EVMS_ASSOCIATIVE_FEATURE;
9882 + if (!vbi->max_depth) {
9883 + if (vbi->node_count > 1) {
9884 + rc = -EVMS_VOLUME_FATAL_ERROR;
9886 + ("max depth ERROR: > 1 nodes(%d) remaining to be processed!\n",
9889 + } else if (vbi->max_depth == 1) {
9890 + if (vbi->feature_count > 1) {
9891 + rc = -EVMS_VOLUME_FATAL_ERROR;
9893 + ("max depth 1 ERROR: > 1 features remaining to be processed!\n");
9900 +/* function: edef_apply_features
9902 + * This routine applies none, one, or more features to an EVMS
9903 + * volume. The system data structure is first verified and then
9904 + * features are applied and verified recursively until the
9905 + * entire volume has been constructed. Fatal errors result in
9906 + * all nodes in the volume discovery list being deleted.
9909 +edef_apply_features(struct evms_logical_node **volume_node_list)
9911 + int rc = 1, done, top_feature_applying;
9912 + evms_volume_build_info_t vbi;
9914 + vbi.feature_node_list = NULL;
9915 + rc = edef_evaluate_volume_node_list(volume_node_list, &vbi, FALSE);
9917 + /* ensure we don't go into the next loop
9918 + * without having a target plugin to
9919 + * pass control to.
9922 + if (!vbi.plugin) {
9927 + /* this loop should ONLY get used when
9928 + * there are features to process.
9930 + done = (rc) ? TRUE : FALSE;
9932 + rc = edef_check_feature_conditions(&vbi);
9935 + top_feature_applying = (vbi.max_depth == 1) ? TRUE : FALSE;
9936 + rc = vbi.plugin->fops->discover(&vbi.feature_node_list);
9938 + rc = edef_evaluate_volume_node_list(volume_node_list,
9940 + top_feature_applying);
9941 + if (top_feature_applying == TRUE) {
9942 + if (vbi.node_count > 1) {
9943 + rc = -EVMS_VOLUME_FATAL_ERROR;
9945 + ("ERROR: detected > 1 node at volume completion!\n");
9949 + if (!vbi.plugin) {
9950 + rc = -EVMS_VOLUME_FATAL_ERROR;
9952 + ("ERROR: depth("PFU64"): expected another feature!\n",
9957 + } else { /* rc != 0 */
9958 + rc = -EVMS_VOLUME_FATAL_ERROR;
9963 + /* put all feature nodes back on the volume list */
9964 + if (edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
9965 + &vbi.feature_node_list,
9966 + volume_node_list, 0, 0))
9972 +edef_delete_node(struct evms_logical_node **node_list,
9973 + struct evms_logical_node *node, int return_code,
9978 + rc = evms_cs_remove_logical_node_from_list(node_list, node);
9980 + LOG_ERROR("%s error(%d): deleting volume(%s), node(%s)\n",
9981 + log_text, return_code,
9982 + node->volume_info->volume_name, node->name);
9983 + rc = DELETE(node);
9985 + LOG_ERROR("error(%d) while deleting node(%s)\n",
9990 + ("%s error(%d): node gone, assumed deleted by plugin.\n",
9991 + log_text, return_code);
9992 + /* plugin must have cleaned up the node.
9993 + * So just reset the return code and leave.
10002 +edef_process_evms_volumes(struct evms_logical_node **discover_list,
10003 + struct evms_logical_node **associative_feature_list)
10006 + struct evms_logical_node *node, *evms_volumes_list, *volume_node_list;
10009 + /* put all EVMS volumes on their own list */
10010 + evms_volumes_list = NULL;
10011 + rc = edef_isolate_nodes_by_type(ISOLATE_EVMS_VOLUMES,
10013 + &evms_volumes_list, 0, 0);
10015 + /* apply features to each EVMS volume */
10016 + /* one volume at a time on each pass */
10017 + while (evms_volumes_list) {
10018 + node = evms_volumes_list;
10019 + /* put all nodes for one EVMS volume on separate list */
10020 + volume_node_list = NULL;
10021 + volume_sn = node->volume_info->volume_sn;
10022 + rc = edef_isolate_nodes_by_type
10023 + (ISOLATE_EVMS_VOLUME_SERIAL_NUMBER, &evms_volumes_list,
10024 + &volume_node_list, 0, volume_sn);
10027 + /* go apply all the volume features now */
10028 + rc = edef_apply_features(&volume_node_list);
10030 + case 0: /* SUCCESS */
10031 + /* remove volume just processed */
10032 + node = volume_node_list;
10033 + rc = evms_cs_remove_logical_node_from_list
10034 + (&volume_node_list, node);
10037 + /* put volume on global list */
10038 + rc = evms_cs_add_logical_node_to_list(discover_list,
10041 + case -EVMS_ASSOCIATIVE_FEATURE:
10042 + /* put all "associative" features on their own list */
10043 + rc = edef_isolate_nodes_by_type
10044 + (ISOLATE_ASSOCIATIVE_FEATURES, &volume_node_list,
10045 + associative_feature_list, 0, 0);
10047 + default: /* FATAL ERROR */
10048 + /* delete each node remaining in the list */
10049 + if (volume_node_list) {
10051 + ("encountered fatal error building volume '%s'\n",
10052 + volume_node_list->volume_info->
10055 + while (volume_node_list) {
10056 + node = volume_node_list;
10057 + edef_delete_node(&volume_node_list,
10058 + node, rc, "EVMS feature");
10070 +edef_process_associative_volumes(struct evms_logical_node
10071 + **associative_feature_list,
10072 + struct evms_logical_node **discover_list)
10075 + struct evms_logical_node *node;
10077 + while (*associative_feature_list) {
10078 + node = *associative_feature_list;
10079 + /* remove this node from associative feature list */
10080 + rc = evms_cs_remove_logical_node_from_list
10081 + (associative_feature_list, node);
10084 + /* put volume on global list */
10085 + rc = evms_cs_add_logical_node_to_list(discover_list, node);
10088 + rc = edef_load_feature_header(node);
10091 + rc = edef_apply_feature(node, discover_list);
10093 + edef_delete_node(discover_list, node, rc,
10094 + "Associative feature");
10100 +edef_check_for_incomplete_volumes(struct evms_logical_node **discover_list)
10103 + struct evms_logical_node *next_node, *node;
10105 + /* check to see if any incomplete volumes are left around */
10106 + /* if so, delete them. */
10107 + /* complete volumes should not have feature_headers */
10108 + /* hanging off them, if we find any, we know the volume */
10109 + /* is incomplete. */
10111 + for (node = *discover_list; node; node = next_node) {
10112 + next_node = node->next;
10114 + if (node->feature_header) {
10115 + edef_delete_node(discover_list, node, rc,
10116 + "Unexpected feature header");
10123 + * Function: evms_discover_evms_features
10124 + * Description: Find features for nodes on the logical partitions list
10127 +evms_discover_evms_features(struct evms_logical_node **discover_list)
10129 + struct evms_logical_node *associative_feature_list;
10132 + LOG_EXTRA("discovering evms volume features...\n");
10134 + /* initialize "associative" features list */
10135 + associative_feature_list = NULL;
10137 + /* find the bottom features */
10138 + rc = edef_find_first_features(discover_list);
10139 +#ifdef LOCAL_DEBUG
10140 + display_discover_list(*discover_list, "after 1st features hdr");
10143 + /* process EVMS volumes here */
10144 + rc = edef_process_evms_volumes(discover_list,
10145 + &associative_feature_list);
10146 +#ifdef LOCAL_DEBUG
10147 + display_discover_list(*discover_list, "after evms volumes");
10150 + /* process "associative" features here */
10151 + rc = edef_process_associative_volumes(&associative_feature_list,
10153 +#ifdef LOCAL_DEBUG
10154 + display_discover_list(*discover_list, "after associatives");
10157 + /* check for incomplete volumes */
10158 + rc = edef_check_for_incomplete_volumes(discover_list);
10164 + * function: eelv_assign_volume_minor
10166 + * This is a support function for evms_export_logical_volumes.
10167 + * This routine assigns a specific minor number to a volume. It
10168 + * also performs the remaining steps to make this volume visible
10169 + * and usable to the kernel.
10173 +eelv_assign_volume_minor(struct evms_logical_node *node, int minor)
10175 + struct evms_logical_volume *volume;
10177 + /* initialize the logical_node entry in the volume array */
10178 + volume = &evms_logical_volumes[minor];
10179 + volume->node = node;
10181 + kmalloc(strlen(EVMS_GET_NODE_NAME(node)) + 1, GFP_KERNEL);
10182 + if (!volume->name)
10184 + strcpy(volume->name, EVMS_GET_NODE_NAME(node));
10186 + /* copy flags from top level node into volume structure */
10187 + volume->flags = node->flags;
10189 + /* check for read-only volume */
10190 + if (volume->flags & EVMS_VOLUME_READ_ONLY) {
10191 + set_device_ro(MKDEV(EVMS_MAJOR, minor), 1);
10194 + /* adjust volume size based on hardsector size */
10195 + node->total_vsectors &=
10196 + ~((node->hardsector_size >> EVMS_VSECTOR_SIZE_SHIFT) - 1);
10198 + /* initialize the global device arrays */
10199 + blksize_size[EVMS_MAJOR][minor] = node->block_size;
10200 + hardsect_size[EVMS_MAJOR][minor] = node->hardsector_size;
10201 + blk_size[EVMS_MAJOR][minor] = (int) (node->total_vsectors >> 1);
10203 + /* register this volume with devfs */
10204 + volume->devfs_handle =
10205 + devfs_register(evms_dir_devfs_handle,
10207 + DEVFS_FL_DEFAULT,
10208 + EVMS_MAJOR, minor,
10209 + S_IFBLK | S_IRUGO | S_IWUGO, &evms_fops, NULL);
10213 + LOG_DEFAULT("Exporting EVMS Volume(%u,%u) from \"%s%s\".\n",
10214 + EVMS_MAJOR, minor, EVMS_DEV_NODE_PATH, volume->name);
10218 + * function: eelv_check_for_duplicity
10220 + * This is a support function for evms_export_logical_volumes.
10221 + * This routine compares the serial number in the top most node
10222 + * in the volume to the list of currently exported volumes. If
10223 + * this volume's serial number is found in the list then we know
10224 + * this volume is a duplicate and it is then deleted.
10228 +eelv_check_for_duplicity(struct evms_logical_node **discover_list)
10230 + struct evms_logical_node *next_node, *node;
10231 + struct evms_logical_volume *lv;
10234 + for (node = *discover_list; node; node = next_node) {
10235 + next_node = node->next;
10238 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10239 + lv = &evms_logical_volumes[i];
10240 + /* only check exported volumes */
10242 + char *type_ptr = NULL;
10244 + /* check for duplicate pointer */
10245 + if (node == lv->node) {
10247 + type_ptr = "pointer";
10248 + /* check for duplicate node */
10249 + } else if (!strcmp(node->name, lv->node->name)) {
10251 + type_ptr = "node";
10253 + if (is_dup == TRUE) {
10254 + evms_cs_remove_logical_node_from_list
10255 + (discover_list, node);
10257 + ("deleting duplicate %s to EVMS volume(%u,%u,%s)...\n",
10258 + type_ptr, EVMS_MAJOR, i,
10259 + EVMS_GET_NODE_NAME(node));
10260 + /* forget duplicate */
10269 + * function: eelv_reassign_soft_deleted_volume_minors
10271 + * This is a support function for evms_export_logical_volumes.
10272 + * This routine reassigns minor numbers to rediscovered "soft"
10273 + * deleted volumes.
10277 +eelv_reassign_soft_deleted_volume_minors(struct evms_logical_node
10280 + struct evms_logical_node *next_node, *node;
10281 + struct evms_logical_volume *lv;
10282 + int i, node_removed;
10284 + for (node = *discover_list; node; node = next_node) {
10285 + next_node = node->next;
10287 + node_removed = FALSE;
10288 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10289 + lv = &evms_logical_volumes[i];
10290 + /* only check soft deleted volumes:
10291 + * they have a non-NULL name.
10293 + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10294 + if (!strcmp(EVMS_GET_NODE_NAME(node), lv->name)) {
10295 + /* reassign requested minor */
10296 + evms_cs_remove_logical_node_from_list
10297 + (discover_list, node);
10298 + node_removed = TRUE;
10299 + LOG_DEFAULT("Re");
10300 + /* free the previously used name */
10303 + /* clear the EVMS_VOLUME_SOFT_DELETED flag */
10305 + eelv_assign_volume_minor(node, i);
10314 + * function: eelv_assign_evms_volume_minors
10316 + * This is a support function for evms_export_logical_volumes.
10317 + * This routine assigns minor numbers to new evms volumes. If
10318 + * the specified minor is already in use, the requested minor
10319 + * is set to 0, and will be assigned next available along with
10320 + * any remaining volumes at the end of evms_export_logical_volumes.
10324 +eelv_assign_evms_volume_minors(struct evms_logical_node **discover_list)
10326 + struct evms_logical_node *next_node, *node, *lv_node;
10327 + unsigned int requested_minor, node_removed;
10329 + for (node = *discover_list; node; node = next_node) {
10330 + next_node = node->next;
10332 + node_removed = FALSE;
10333 + /* only process evms volumes */
10334 + if (node->flags & EVMS_VOLUME_FLAG) {
10335 + requested_minor = node->volume_info->volume_minor;
10336 + /* is there a requested minor? */
10337 + if (requested_minor) {
10338 + int lv_flags = 0;
10340 + /* check range of requested minor */
10341 + if (requested_minor >= MAX_EVMS_VOLUMES)
10344 + struct evms_logical_volume *lv;
10345 + lv = &evms_logical_volumes
10346 + [requested_minor];
10347 + lv_node = lv->node;
10348 + lv_flags = lv->flags;
10351 + && (!(lv_flags & EVMS_VOLUME_SOFT_DELETED))) {
10352 + /* assign requested minor */
10353 + evms_cs_remove_logical_node_from_list
10354 + (discover_list, node);
10355 + node_removed = TRUE;
10356 + eelv_assign_volume_minor(node,
10357 + requested_minor);
10360 + ("EVMS volume(%s) requesting invalid/in-use minor(%d), assigning next available!\n",
10361 + node->volume_info->volume_name,
10362 + requested_minor);
10364 + * requested minor is already
10365 + * in use, defer assignment
10368 + node->volume_info->volume_minor = 0;
10376 + * function: eelv_assign_remaining_evms_volume_minors
10378 + * This is a support function for evms_export_logical_volumes.
10379 + * This routine assigns minor numbers to new evms volumes that
10380 + * have no/conflicting minor assignments. This function will
10381 + * search from high(255) minor values down, for the first available
10382 + * minor. Searching high to low minimizes the possibility of
10383 + * conflicting evms volumes causing "compatibility" minor
10384 + * assignments to shift from expected assignments.
10388 +eelv_assign_remaining_evms_volume_minors(struct evms_logical_node
10391 + struct evms_logical_node *next_node, *node;
10392 + int requested_minor, node_removed;
10394 + for (node = *discover_list; node; node = next_node) {
10395 + next_node = node->next;
10397 + node_removed = FALSE;
10398 + /* only process evms volumes */
10399 + /* all remaining evms volumes should now
10400 + * have a minor value of 0, meaning they
10401 + * had no minor assignment, or their minor
10402 + * assignment conflicted with an existing
10403 + * minor assignment.
10405 + if (node->flags & EVMS_VOLUME_FLAG) {
10406 + evms_cs_remove_logical_node_from_list(discover_list,
10408 + node_removed = TRUE;
10409 + /* find next available minor number */
10410 + for (requested_minor = 255;
10411 + (evms_logical_volumes[requested_minor].node ||
10412 + evms_logical_volumes[requested_minor].name) &&
10413 + requested_minor; requested_minor--) ;
10414 + /* check range of assigned minor */
10415 + if (!requested_minor) {
10417 + ("no more minor numbers available for evms volumes!!!!\n");
10420 + /* assign requested minor */
10421 + eelv_assign_volume_minor(node, requested_minor);
10427 + * function: eelv_assign_remaining_volume_minors
10429 + * This is a support function for evms_export_logical_volumes.
10430 + * This routine assigns minor numbers to all remaining unassigned
10431 + * volumes. Minor numbers are assigned on an availability
10432 + * basis. The first free minor number is used in the assignment.
10436 +eelv_assign_remaining_volume_minors(struct evms_logical_node **discover_list)
10438 + struct evms_logical_node *node;
10441 + while (*discover_list) {
10442 + node = *discover_list;
10443 + evms_cs_remove_logical_node_from_list(discover_list, node);
10445 + /* find next available minor number */
10447 + (evms_logical_volumes[minor].node ||
10448 + evms_logical_volumes[minor].name) &&
10449 + minor < MAX_EVMS_VOLUMES; minor++) ;
10451 + if (minor >= MAX_EVMS_VOLUMES) {
10453 + ("no more minor numbers available for compatibility volumes!!!!\n");
10456 + /* assign minor */
10457 + eelv_assign_volume_minor(node, minor);
10462 + * function: eelv_check_for_unreassign_soft_deleted_volume
10464 + * This is a support function for evms_export_logical_volumes.
10465 + * This routine reports any "soft deleted" volumes that were not
10466 + * found after a rediscovery.
10469 +eelv_check_for_unreassign_soft_deleted_volume(void)
10471 + struct evms_logical_volume *lv;
10474 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10475 + lv = &evms_logical_volumes[i];
10476 + /* only check soft deleted volumes:
10477 + * they have a NULL node ptr &
10478 + * they have a non-NULL name.
10480 + if (lv->flags & EVMS_VOLUME_SOFT_DELETED) {
10482 + lv->flags |= EVMS_VOLUME_CORRUPT;
10484 + ("error: rediscovery failed to find %smounted 'soft deleted' volume(%u,%u,%s)...\n",
10485 + ((lv->flags & EVMS_VOLUME_CORRUPT) ? "" : "un"),
10486 + EVMS_MAJOR, i, lv->name);
10487 + if (lv->flags & EVMS_VOLUME_CORRUPT) {
10489 + (" flagging volume(%u,%u,%s) as CORRUPT!\n",
10490 + EVMS_MAJOR, i, lv->name);
10493 + (" releasing minor(%d) used by volume(%s)!\n",
10495 + /* clear logical volume structure
10496 + * for this volume so it may be
10508 +eelv_unquiesce_volumes(void)
10512 + /* check each volume array entry */
10513 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10514 + struct evms_logical_volume *volume;
10516 + volume = &evms_logical_volumes[i];
10517 + /* is this volume "quiesced" ? */
10518 + if (volume->quiesced) {
10520 + if (volume->node) {
10521 + /* "unquiesce" it */
10522 + struct inode inode;
10523 + struct evms_quiesce_vol_pkt qv;
10525 + qv.command = qv.status = 0;
10528 + rc = evms_quiesce_volume(volume, &inode, NULL,
10531 + /* Wake up any waiters */
10533 + /* clear the flag */
10534 + volume->quiesced = 0;
10535 + /* wake up the waiters */
10536 + if (waitqueue_active(&volume->wait_queue))
10537 + wake_up(&volume->wait_queue);
10538 +#ifdef VFS_PATCH_PRESENT
10539 + /* unquiesce VFS if quiesced */
10540 + if (volume->vfs_quiesced) {
10541 + /* VFS function call to unlock the filesystem */
10542 + unlockfs(MKDEV(EVMS_MAJOR, i));
10543 + volume->vfs_quiesced = FALSE;
10552 + * Function: evms_export_logical_volumes
10554 + * This function is called from evms_discover_volumes. It
10555 + * checks for duplicate volumes, assigns minor values to evms
10556 + * volumes, and assigns minor values to the remaining volumes.
10557 + * In addition to assigning minor values to each volume this
10558 + * function also completes the final steps necessary to allow
10559 + * the volumes to be used by the operating system.
10562 +evms_export_logical_volumes(struct evms_logical_node **discover_list)
10564 + LOG_EXTRA("exporting EVMS logical volumes...\n");
10566 + eelv_check_for_duplicity(discover_list);
10568 + eelv_reassign_soft_deleted_volume_minors(discover_list);
10570 + eelv_assign_evms_volume_minors(discover_list);
10572 + eelv_assign_remaining_evms_volume_minors(discover_list);
10574 + eelv_assign_remaining_volume_minors(discover_list);
10576 + eelv_check_for_unreassign_soft_deleted_volume();
10578 + /* "unquiesce" any "quiesced" volumes */
10579 + eelv_unquiesce_volumes();
10583 +edv_populate_discover_list(struct evms_list_node *src_list,
10584 + struct evms_logical_node **trg_list,
10585 + struct evms_rediscover_pkt *discover_parms)
10587 + int rc = 0, i, move_node, use_all_disks = FALSE;
10588 + struct evms_list_node *src_node;
10589 + struct evms_logical_node *disk_node = NULL;
10591 + /* if no discover parameters are specified */
10592 + /* copy ALL the disk nodes into the */
10593 + /* discovery list. */
10594 + if ((discover_parms == NULL) ||
10595 + (discover_parms->drive_count == REDISCOVER_ALL_DEVICES))
10596 + use_all_disks = TRUE;
10598 + /* copy the disk nodes specified in the */
10599 + /* discover_parms over to a discover list */
10600 + src_node = src_list;
10601 + while (src_node) {
10602 + move_node = use_all_disks;
10603 + if (move_node == FALSE)
10604 + /* check the rediscovery array */
10605 + for (i = 0; i < discover_parms->drive_count; i++) {
10607 + DEV_HANDLE_TO_NODE(discover_parms->
10609 + if (disk_node == src_node->item) {
10610 + move_node = TRUE;
10614 + /* check to see if we want this node */
10615 + if (move_node == TRUE)
10616 + evms_cs_add_logical_node_to_list(trg_list,
10618 + evms_logical_node *)
10620 + /* advance to next struct evms_list_node */
10621 + src_node = src_node->next;
10627 +evms_discover_volumes(struct evms_rediscover_pkt *discover_parms)
10630 + struct evms_logical_node *discover_list = NULL;
10632 + evms_discover_logical_disks(&discover_list);
10633 + if (evms_global_device_list) {
10634 + /* move the appropriate disk nodes, based on */
10635 + /* on the discover parameters, onto the */
10636 + /* discover list for the partition managers */
10638 + edv_populate_discover_list(evms_global_device_list,
10639 + &discover_list, discover_parms);
10641 + if (discover_list) {
10642 +#ifdef LOCAL_DEBUG
10643 + display_discover_list(discover_list, "after dev mgrs");
10645 + evms_discover_logical_partitions(&discover_list);
10647 + if (discover_list) {
10648 +#ifdef LOCAL_DEBUG
10649 + display_discover_list(discover_list, "after seg mgrs");
10651 + evms_discover_volume_groups(&discover_list);
10653 + if (discover_list) {
10654 +#ifdef LOCAL_DEBUG
10655 + display_discover_list(discover_list, "after reg mgrs");
10657 + evms_discover_evms_features(&discover_list);
10659 + if (discover_list) {
10660 +#ifdef LOCAL_DEBUG
10661 + display_discover_list(discover_list, "after features");
10663 + evms_export_logical_volumes(&discover_list);
10664 + evms_cs_signal_event(EVMS_EVENT_END_OF_DISCOVERY);
10669 +/* function: evms_notify_reboot
10671 + * this function gets called at shutdown time and is used
10672 + * to remove any evms controlled volumes from memory, thus
10673 + * allowing any plugins needing to flush internal caches
10677 +evms_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
10680 + struct evms_logical_volume *volume;
10685 + case SYS_POWER_OFF:
10686 + LOG_DEFAULT("stopping all evms controlled volumes.\n");
10688 + /* quiesce all volumes */
10689 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10690 + struct evms_quiesce_vol_pkt qv;
10691 + struct inode inode;
10693 + volume = &evms_logical_volumes[i];
10694 + if (!volume->node)
10696 + qv.command = 1; // quiesce
10698 + qv.status = 0; // reset status
10700 + evms_quiesce_volume(volume, &inode, NULL, &qv);
10702 + /* delete all volumes
10704 + * to ensure this works under the
10705 + * most circumstances, a "soft"
10706 + * delete will be done. this will
10707 + * handle the strange case of a
10708 + * volume still being mounted.
10710 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10711 + struct evms_delete_vol_pkt dv;
10713 + volume = &evms_logical_volumes[i];
10714 + if (!volume->node)
10716 + /* only delete quiesced volumes */
10717 + if (!volume->quiesced)
10719 + /* delete the volume from memory.
10720 + * do a 'soft' delete if volume
10721 + * is mounted, and 'hard' delete
10724 + dv.command = is_open(i);
10727 + evms_delete_volume(volume, &dv);
10730 + return NOTIFY_DONE;
10733 +static struct notifier_block evms_notifier = {
10734 + .notifier_call = evms_notify_reboot,
10736 + .priority = INT_MAX, /* before any real devices */
10740 + * Function: find_root_fs_dev
10741 + * If "root=/dev/evms/???" was specified on the kernel command line, and devfs
10742 + * is not enabled, we need to determine the appropriate minor number for the
10743 + * specified volume for the root fs.
10746 +find_root_fs_dev(void)
10749 + char root_name[64] = { 0 };
10753 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,18)
10754 + strncpy(root_name, root_device_name, 63);
10756 + get_root_device_name(root_name);
10759 + if (!strncmp(root_name, EVMS_DIR_NAME "/", strlen(EVMS_DIR_NAME) + 1)) {
10760 + name = &root_name[strlen(EVMS_DIR_NAME) + 1];
10762 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10763 + if (evms_logical_volumes[i].name &&
10764 + !strncmp(name, evms_logical_volumes[i].name,
10765 + strlen(evms_logical_volumes[i].name))) {
10766 + ROOT_DEV = MKDEV(EVMS_MAJOR, i);
10775 + * Function: io_notify_cache_ctor
10776 + * this function zero-initializes the io_notify_t entries
10777 + * in our private io-notify pool.
10780 +io_notify_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10782 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10783 + SLAB_CTOR_CONSTRUCTOR) {
10784 + io_notify_t *io_notify = (io_notify_t *) foo;
10785 + memset(io_notify, 0, sizeof (*io_notify));
10790 + * Function: bh_cache_ctor
10791 + * this function initializes the b_wait field in the buffer heads
10792 + * in our private buffer head pool.
10795 +bh_cache_ctor(void *foo, kmem_cache_t * cachep, unsigned long flags)
10797 + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
10798 + SLAB_CTOR_CONSTRUCTOR) {
10799 + struct buffer_head *bh = (struct buffer_head *) foo;
10800 + memset(bh, 0, sizeof (*bh));
10801 + init_waitqueue_head(&bh->b_wait);
10806 + * Function: evms_init_module
10807 + * This function runs once at system initialization.
10810 +evms_init_module(void)
10813 + int *evms_blocksizes;
10815 + LOG_DEFAULT("EVMS v%d.%d.%d initializing .... info level(%d).\n",
10816 + EVMS_MAJOR_VERSION,
10817 + EVMS_MINOR_VERSION,
10818 + EVMS_PATCHLEVEL_VERSION, evms_info_level);
10820 + /* initialize memory management counters */
10821 + evms_allocs = (atomic_t) ATOMIC_INIT(0);
10822 + evms_logical_nodes = (atomic_t) ATOMIC_INIT(0);
10824 + /* initialize the io_notify_entry pool */
10826 + evms_io_notify_pool = evms_cs_create_pool(sizeof (io_notify_t),
10827 + "EVMS IO Notify",
10828 + io_notify_cache_ctor,
10831 + /* initialize the "public" buffer_head pool */
10833 + evms_bh_pool = evms_cs_create_pool(sizeof (struct buffer_head),
10835 + bh_cache_ctor, NULL);
10837 + /* allocate the logical volume array */
10839 + evms_logical_volumes =
10840 + kmalloc(sizeof (struct evms_logical_volume) *
10841 + MAX_EVMS_VOLUMES, GFP_KERNEL);
10842 + if (!evms_logical_volumes) {
10846 + /* initialize the logical volume array entries */
10848 + memset(evms_logical_volumes, 0,
10849 + sizeof (struct evms_logical_volume) * MAX_EVMS_VOLUMES);
10850 + for (i = 1; i < MAX_EVMS_VOLUMES; i++) {
10851 + struct evms_logical_volume *volume;
10853 + volume = &evms_logical_volumes[i];
10854 + init_waitqueue_head(&volume->wait_queue);
10855 + volume->requests_in_progress =
10856 + (atomic_t) ATOMIC_INIT(0);
10858 + blk_init_queue(&volume->request_queue,
10859 + evms_do_request_fn);
10860 + blk_queue_make_request(&volume->request_queue,
10861 + evms_make_request_fn);
10866 + /* allocate EVMS' blk_size array */
10868 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10869 + sizeof (int), GFP_KERNEL);
10870 + if (!evms_blocksizes) {
10873 + ("can't allocate memory for EVMS blk_size\n");
10875 + memset(evms_blocksizes, 0,
10876 + MAX_EVMS_VOLUMES * sizeof (int));
10877 + blk_size[EVMS_MAJOR] = evms_blocksizes;
10881 + /* allocate EVMS' blksize_size array */
10883 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10884 + sizeof (int), GFP_KERNEL);
10885 + if (!evms_blocksizes) {
10888 + ("can't allocate memory for EVMS blksize_size\n");
10890 + memset(evms_blocksizes, 0,
10891 + MAX_EVMS_VOLUMES * sizeof (int));
10892 + blksize_size[EVMS_MAJOR] = evms_blocksizes;
10896 + /* allocate EVMS' hardsect_size array */
10898 + evms_blocksizes = kmalloc(MAX_EVMS_VOLUMES *
10899 + sizeof (int), GFP_KERNEL);
10900 + if (!evms_blocksizes) {
10903 + ("can't allocate memory for EVMS hardsect_size\n");
10905 + memset(evms_blocksizes, 0,
10906 + MAX_EVMS_VOLUMES * sizeof (int));
10907 + hardsect_size[EVMS_MAJOR] = evms_blocksizes;
10911 + /* Register the block device */
10913 + rc = devfs_register_blkdev(EVMS_MAJOR, EVMS_DIR_NAME,
10917 + ("error calling devfs_register_blkdev() err=%u\n",
10923 + /* Register with devfs */
10925 + evms_dir_devfs_handle = devfs_mk_dir(NULL, EVMS_DIR_NAME, NULL);
10926 + // A NULL return cannot be fatal.
10927 + // Devfs just might not be running
10928 + if (!evms_dir_devfs_handle) {
10930 + ("NULL return from devfs_mk_dir() for \"%s\"\n",
10932 + LOG_EXTRA("Is devfs enabled?\n");
10934 + evms_blk_devfs_handle =
10935 + devfs_register(evms_dir_devfs_handle, EVMS_DEV_NAME,
10936 + DEVFS_FL_DEFAULT, EVMS_MAJOR, 0,
10937 + S_IFBLK | S_IRUGO | S_IWUGO,
10938 + &evms_fops, NULL);
10939 + if (!evms_blk_devfs_handle) {
10941 + ("NULL return from devfs_register() for \"%s\"\n",
10948 + read_ahead[EVMS_MAJOR] = 4096;
10950 + blk_dev[EVMS_MAJOR].queue = evms_find_queue;
10952 + blk_init_queue(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10953 + evms_do_request_fn);
10954 + blk_queue_make_request(BLK_DEFAULT_QUEUE(EVMS_MAJOR),
10955 + evms_make_request_fn);
10957 +#ifdef CONFIG_PROC_FS
10958 + evms_cs_get_evms_proc_dir();
10959 + if (evms_proc_dir) {
10960 + create_proc_read_entry("info", 0, evms_proc_dir,
10961 + evms_info_read_proc, NULL);
10962 + create_proc_read_entry("plugins", 0, evms_proc_dir,
10963 + evms_plugins_read_proc, NULL);
10964 + create_proc_read_entry("volumes", 0, evms_proc_dir,
10965 + evms_volumes_read_proc, NULL);
10967 + evms_table_header = register_sysctl_table(dev_dir_table, 1);
10969 + /* Register for reboot notification */
10970 + register_reboot_notifier(&evms_notifier);
10972 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
10973 + /* Register evms 32bit ioctl handlers */
10975 + register_ioctl32_conversion(EVMS_GET_INFO_LEVEL,NULL);
10976 + register_ioctl32_conversion(EVMS_SET_INFO_LEVEL,NULL);
10977 + register_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32,
10978 + evms_rediscover);
10979 + register_ioctl32_conversion(EVMS_DELETE_VOLUME,NULL);
10980 + register_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32,
10981 + evms_plugin_ioctl);
10982 + register_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT,NULL);
10983 + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK,NULL);
10984 + register_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO,NULL);
10985 + register_ioctl32_conversion(EVMS_SECTOR_IO_32, evms_sector_io);
10986 + register_ioctl32_conversion(EVMS_GET_MINOR,NULL);
10987 + register_ioctl32_conversion(EVMS_GET_VOLUME_DATA,NULL);
10988 + register_ioctl32_conversion(EVMS_GET_PLUGIN,NULL);
10989 + register_ioctl32_conversion(EVMS_COMPUTE_CSUM_32,
10990 + evms_compute_csum);
10991 + register_ioctl32_conversion(EVMS_GET_BMAP,NULL);
10992 + register_ioctl32_conversion(EVMS_GET_IOCTL_VERSION,NULL);
10993 + register_ioctl32_conversion(EVMS_GET_VERSION,NULL);
10994 + register_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO,NULL);
10995 + register_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS,NULL);
10996 + register_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO,NULL);
11006 + * Function: evms_exit_module
11007 + * This function runs once when the EVMS core module is unloaded.
11009 +static void __exit
11010 +evms_exit_module(void)
11012 + LOG_DEFAULT("EVMS v%d.%d.%d unloading ....\n",
11013 + EVMS_MAJOR_VERSION,
11014 + EVMS_MINOR_VERSION, EVMS_PATCHLEVEL_VERSION);
11016 +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64)
11017 + /* Un-Register evms 32bit ioctl handlers */
11019 + unregister_ioctl32_conversion(EVMS_GET_INFO_LEVEL);
11020 + unregister_ioctl32_conversion(EVMS_SET_INFO_LEVEL);
11021 + unregister_ioctl32_conversion(EVMS_REDISCOVER_VOLUMES_32);
11022 + unregister_ioctl32_conversion(EVMS_DELETE_VOLUME);
11023 + unregister_ioctl32_conversion(EVMS_PLUGIN_IOCTL_32);
11024 + unregister_ioctl32_conversion(EVMS_PROCESS_NOTIFY_EVENT);
11025 + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK);
11026 + unregister_ioctl32_conversion(EVMS_GET_LOGICAL_DISK_INFO);
11027 + unregister_ioctl32_conversion(EVMS_SECTOR_IO_32);
11028 + unregister_ioctl32_conversion(EVMS_GET_MINOR);
11029 + unregister_ioctl32_conversion(EVMS_GET_VOLUME_DATA);
11030 + unregister_ioctl32_conversion(EVMS_GET_PLUGIN);
11031 + unregister_ioctl32_conversion(EVMS_COMPUTE_CSUM_32);
11032 + unregister_ioctl32_conversion(EVMS_GET_BMAP);
11033 + unregister_ioctl32_conversion(EVMS_GET_IOCTL_VERSION);
11034 + unregister_ioctl32_conversion(EVMS_GET_VERSION);
11035 + unregister_ioctl32_conversion(EVMS_UPDATE_DEVICE_INFO);
11036 + unregister_ioctl32_conversion(EVMS_CHECK_MOUNT_STATUS);
11037 + unregister_ioctl32_conversion(EVMS_GET_VOL_STRIPE_INFO);
11041 + /* unregister with devfs
11043 + devfs_unregister(evms_dir_devfs_handle);
11044 + /* clean up the queue for the block device
11046 + blk_cleanup_queue(blk_get_queue(MKDEV(EVMS_MAJOR, 0)));
11047 + /* unregister block device
11049 + devfs_unregister_blkdev(EVMS_MAJOR, EVMS_DIR_NAME);
11050 + /* deallocate device arrays
11052 + kfree(blk_size[EVMS_MAJOR]);
11053 + blk_size[EVMS_MAJOR] = NULL;
11054 + kfree(blksize_size[EVMS_MAJOR]);
11055 + blksize_size[EVMS_MAJOR] = NULL;
11056 + kfree(hardsect_size[EVMS_MAJOR]);
11057 + hardsect_size[EVMS_MAJOR] = NULL;
11058 + read_ahead[EVMS_MAJOR] = 0;
11059 + /* deallocate logical volumes array
11061 + kfree(evms_logical_volumes);
11062 + /* destroy buffer head pool
11064 + evms_cs_destroy_pool(evms_bh_pool);
11065 + /* destroy io notify pool
11067 + evms_cs_destroy_pool(evms_io_notify_pool);
11068 +#ifdef CONFIG_PROC_FS
11069 + if (evms_proc_dir) {
11070 + remove_proc_entry("volumes", evms_proc_dir);
11071 + remove_proc_entry("plugins", evms_proc_dir);
11072 + remove_proc_entry("info", evms_proc_dir);
11073 + remove_proc_entry("evms", NULL);
11075 + unregister_sysctl_table(evms_table_header);
11080 + * Function: evms_init_discover
11081 + * If EVMS is statically built into the kernel, this function will be called
11082 + * to perform an initial volume discovery.
11085 +evms_init_discover(void)
11087 + /* go find volumes */
11088 + evms_discover_volumes(NULL);
11090 + /* Check if the root fs is on EVMS */
11091 + if (MAJOR(ROOT_DEV) == EVMS_MAJOR) {
11092 + find_root_fs_dev();
11099 + * a placeholder for cluster enablement
11102 +evms_cluster_init(int nodeid, int clusterid)
11108 +EXPORT_SYMBOL(evms_cluster_init);
11111 + * a placeholder for cluster enablement
11114 +evms_cluster_shutdown(void)
11120 +EXPORT_SYMBOL(evms_cluster_shutdown);
11123 +evms_boot_info_level(char *str)
11125 + int evms_boot_info_level = (int) simple_strtoul(str, NULL, 10);
11126 + if (evms_boot_info_level) {
11127 + evms_info_level = evms_boot_info_level;
11132 +__setup("evms_info_level=", evms_boot_info_level);
11133 +module_init(evms_init_module);
11134 +module_exit(evms_exit_module);
11135 +__initcall(evms_init_discover);
11136 +#ifdef MODULE_LICENSE
11137 +MODULE_LICENSE("GPL");
11140 +/**********************************************************/
11141 +/* END -- INIT/DISCOVERY support functions */
11142 +/**********************************************************/
11143 diff -Naur linux-2002-09-30/drivers/evms/evms_bbr.c evms-2002-09-30/drivers/evms/evms_bbr.c
11144 --- linux-2002-09-30/drivers/evms/evms_bbr.c Wed Dec 31 18:00:00 1969
11145 +++ evms-2002-09-30/drivers/evms/evms_bbr.c Wed Sep 25 15:04:22 2002
11147 +/* -*- linux-c -*- */
11149 + * Copyright (c) International Business Machines Corp., 2000
11151 + * This program is free software; you can redistribute it and/or modify
11152 + * it under the terms of the GNU General Public License as published by
11153 + * the Free Software Foundation; either version 2 of the License, or
11154 + * (at your option) any later version.
11156 + * This program is distributed in the hope that it will be useful,
11157 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
11158 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
11159 + * the GNU General Public License for more details.
11161 + * You should have received a copy of the GNU General Public License
11162 + * along with this program; if not, write to the Free Software
11163 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
11165 +/* linux/driver/evms/evms_bbr.c
11167 + * EVMS - Bad Block Relocation (BBR) Feature Plugin
11169 + * BBR feature is designed to remap I/O write failures to another safe location
11170 + * on disk. Note that most disk drives have BBR built into them, this means
11171 + * that our software BBR will be only activated when all hardware BBR
11172 + * replacement sectors have been used.
11175 +#define LOG_PREFIX "bbr: "
11177 +#include <linux/config.h>
11178 +#include <linux/kernel.h>
11179 +#include <linux/module.h>
11180 +#include <linux/mempool.h>
11181 +#include <asm/uaccess.h>
11183 +#include <linux/evms/evms.h>
11184 +#include <linux/evms/evms_bbr_k.h>
11186 +/* API prototypes. */
11187 +static int bbr_discover(struct evms_logical_node ** discover_list);
11188 +static int bbr_delete(struct evms_logical_node * node);
11189 +static void bbr_read(struct evms_logical_node * node, struct buffer_head * bh);
11190 +static void bbr_write(struct evms_logical_node * node, struct buffer_head * bh);
11191 +static int bbr_ioctl(struct evms_logical_node * bbr_node,
11192 + struct inode * inode,
11193 + struct file * file,
11194 + unsigned int cmd,
11195 + unsigned long arg);
11196 +static int bbr_direct_ioctl(struct inode * inode,
11197 + struct file * file,
11198 + unsigned int cmd,
11199 + unsigned long arg);
11200 +static int bbr_init_io(struct evms_logical_node * bbr_node,
11206 +/* Other function prototypes. */
11207 +static int bbr_create_pools(void);
11208 +static void bbr_destroy_pools(void);
11209 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id);
11210 +static void bbr_io_handler(void * void_data);
11211 +static void bbr_free_private(struct bbr_private * bbr_id);
11212 +static inline void bbr_list_add(struct bbr_private * bbr_id);
11214 +/* List of all BBR nodes. */
11215 +static struct bbr_private * bbr_instances = NULL;
11217 +/* Data pertaining to the I/O thread. */
11218 +static struct evms_thread * bbr_io_thread = NULL;
11219 +static spinlock_t bbr_io_list_lock = SPIN_LOCK_UNLOCKED;
11220 +static struct list_head bbr_io_list = LIST_HEAD_INIT(bbr_io_list);
11222 +/* Global pools for bbr_io_buf's and bbr_remap's. */
11223 +kmem_cache_t * bbr_io_buf_slab;
11224 +mempool_t * bbr_io_buf_pool;
11225 +kmem_cache_t * bbr_remap_slab;
11226 +mempool_t * bbr_remap_pool;
11228 +/* Plugin function table and header. */
11229 +static struct evms_plugin_fops function_table = {
11230 + .discover = bbr_discover,
11231 + .delete = bbr_delete,
11232 + .read = bbr_read,
11233 + .write = bbr_write,
11234 + .init_io = bbr_init_io,
11235 + .ioctl = bbr_ioctl,
11236 + .direct_ioctl = bbr_direct_ioctl
11239 +static struct evms_plugin_header plugin_header = {
11240 + .id = SetPluginID(IBM_OEM_ID,
11242 + EVMS_BBR_FEATURE_ID),
11244 + .major = EVMS_BBR_VERSION_MAJOR,
11245 + .minor = EVMS_BBR_VERSION_MINOR,
11246 + .patchlevel = EVMS_BBR_VERSION_PATCHLEVEL
11248 + .required_services_version = {
11249 + .major = EVMS_BBR_COMMON_SERVICES_MAJOR,
11250 + .minor = EVMS_BBR_COMMON_SERVICES_MINOR,
11251 + .patchlevel = EVMS_BBR_COMMON_SERVICES_PATCHLEVEL
11253 + .fops = &function_table
11257 + * le_meta_data_to_cpu
11259 + * Convert bbr meta data from on-disk (LE) format
11260 + * to the native cpu endian format.
11262 +void le_meta_data_to_cpu(struct evms_bbr_metadata * md)
11264 + md->signature = le32_to_cpup(&md->signature);
11265 + md->crc = le32_to_cpup(&md->crc);
11266 + md->block_size = le32_to_cpup(&md->block_size);
11267 + md->flags = le32_to_cpup(&md->flags);
11268 + md->sequence_number = le64_to_cpup(&md->sequence_number);
11269 + md->start_sect_bbr_table = le64_to_cpup(&md->start_sect_bbr_table);
11270 + md->nr_sects_bbr_table = le64_to_cpup(&md->nr_sects_bbr_table);
11271 + md->start_replacement_sect = le64_to_cpup(&md->start_replacement_sect);
11272 + md->nr_replacement_blks = le64_to_cpup(&md->nr_replacement_blks);
11276 + * le_bbr_table_sector_to_cpu
11278 + * Convert bbr meta data from on-disk (LE) format
11279 + * to the native cpu endian format.
11281 +void le_bbr_table_sector_to_cpu(struct evms_bbr_table * p)
11284 + p->signature = le32_to_cpup(&p->signature);
11285 + p->crc = le32_to_cpup(&p->crc);
11286 + p->sequence_number = le32_to_cpup(&p->sequence_number);
11287 + p->in_use_cnt = le32_to_cpup(&p->in_use_cnt);
11288 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11289 + p->entries[i].bad_sect =
11290 + le64_to_cpup(&p->entries[i].bad_sect);
11291 + p->entries[i].replacement_sect =
11292 + le64_to_cpup(&p->entries[i].replacement_sect);
11297 + * cpu_bbr_table_sector_to_le
11299 + * Convert bbr meta data from cpu endian format to on-disk (LE) format
11301 +void cpu_bbr_table_sector_to_le(struct evms_bbr_table * p,
11302 + struct evms_bbr_table * le)
11305 + le->signature = cpu_to_le32p(&p->signature);
11306 + le->crc = cpu_to_le32p(&p->crc);
11307 + le->sequence_number = cpu_to_le32p(&p->sequence_number);
11308 + le->in_use_cnt = cpu_to_le32p(&p->in_use_cnt);
11309 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11310 + le->entries[i].bad_sect =
11311 + cpu_to_le64p(&p->entries[i].bad_sect);
11312 + le->entries[i].replacement_sect =
11313 + cpu_to_le64p(&p->entries[i].replacement_sect);
11317 +#ifdef EVMS_BBR_DEBUG
11318 +static void print_meta_data(struct evms_bbr_metadata * md)
11320 + LOG_DEBUG("BBR Metadata Sector:\n"
11321 + " signature 0x%08X\n"
11323 + " block_size %u\n"
11324 + " start_sect_bbr_table "PFU64"\n"
11325 + " nr_sects_bbr_table "PFU64"\n"
11326 + " start_replacement_sect "PFU64"\n"
11327 + " nr_replacement_blks "PFU64"\n",
11328 + md->signature, md->crc, md->block_size,
11329 + md->start_sect_bbr_table, md->nr_sects_bbr_table,
11330 + md->start_replacement_sect, md->nr_replacement_blks);
11333 +static void print_bbr_table_sector(struct evms_bbr_table * p)
11336 + LOG_DEBUG("BBR Table Sector:\n"
11340 + " in_use_cnt %u\n"
11341 + " Table Entries:\n",
11342 + p->signature, p->crc, p->sequence_number, p->in_use_cnt);
11343 + for ( i = 0; i < EVMS_BBR_ENTRIES_PER_SECT; i++ ) {
11344 + LOG_DEBUG(" [%d] bad_sect: "PFU64" replacement_sect: "PFU64"\n",
11345 + i, p->entries[i].bad_sect,
11346 + p->entries[i].replacement_sect);
11350 +void print_binary_tree(struct bbr_runtime_remap * node)
11353 + LOG_DEFAULT("["PFU64","PFU64"]\n", node->remap.bad_sect,
11354 + node->remap.replacement_sect);
11355 + print_binary_tree(node->left);
11356 + print_binary_tree(node->right);
11360 +static void print_remap_list(struct bbr_private * bbr_id)
11362 + if (bbr_id->remap_root) {
11363 + LOG_DEFAULT("%s for %s\n", __FUNCTION__, bbr_id->node->name);
11364 + print_binary_tree(bbr_id->remap_root);
11370 + * validate_bbr_table_sector
11372 + * Check the specified BBR table sector for a valid signature and CRC.
11374 +static int validate_bbr_table_sector(struct evms_bbr_table * p)
11377 + int org_crc, final_crc;
11379 + if ( le32_to_cpup(&p->signature) != EVMS_BBR_TABLE_SIGNATURE ) {
11380 + LOG_ERROR("BBR table signature doesn't match!\n");
11381 + LOG_ERROR("Sector has (0x%08X) expected(0x%08X)\n",
11382 + le32_to_cpup(&p->signature),
11383 + EVMS_BBR_TABLE_SIGNATURE);
11387 + org_crc = le32_to_cpup(&p->crc);
11389 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, p,
11391 + if ( final_crc != org_crc ) {
11392 + LOG_ERROR("CRC failed!\n");
11393 + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11394 + org_crc, final_crc);
11397 + p->crc = cpu_to_le32p(&org_crc);
11399 + LOG_ERROR("BBR table sector has no CRC!\n");
11404 + BBR_DEBUG_PRINT_TABLE_SECTOR(p);
11405 + le_bbr_table_sector_to_cpu(p);
11410 + * update_invalid_bbr_table_sector
11412 + * If one copy of a BBR table sector is bad, replace it with the valid copy.
11414 +void update_invalid_bbr_table_sector(struct evms_logical_node * node,
11415 + struct evms_bbr_table * valid,
11416 + struct evms_bbr_table * invalid,
11420 + struct evms_bbr_table * tmp_bbr_table;
11422 + /* Correct the invalid bbr table sector */
11423 + memcpy(invalid, valid, sizeof(struct evms_bbr_table));
11425 + /* Allocate memory for I/O */
11426 + tmp_bbr_table = kmalloc(sizeof(struct evms_bbr_table), GFP_KERNEL);
11427 + if (tmp_bbr_table) {
11428 + memset(tmp_bbr_table, 0, sizeof(struct evms_bbr_table));
11429 + cpu_bbr_table_sector_to_le(valid, tmp_bbr_table);
11430 + LOG_WARNING("Correcting BBR table sector "PFU64"\n", lsn);
11431 + rc = INIT_IO(node, 1, lsn, 1, tmp_bbr_table);
11433 + LOG_ERROR("Could not correct BBR table sector "PFU64".\n",
11436 + kfree(tmp_bbr_table);
11441 + * validate_bbr_table
11443 + * Validate the entire range of sectors in the BBR table.
11445 +static u32 validate_bbr_table(struct evms_bbr_metadata * md,
11446 + struct evms_bbr_table * p)
11450 + nr_sects = md->nr_sects_bbr_table;
11452 + for ( i = 0; i < nr_sects; i++, p++ ) {
11453 + if ( validate_bbr_table_sector(p) )
11457 + if ( i != nr_sects ) {
11458 + LOG_SERIOUS("Stopped BBR table validation at sector %u.\n", i);
11461 + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11466 + * validate_bbr_tables
11467 + * @node: BBR node to validate.
11468 + * @MD1: Primary metadata sector.
11469 + * @MD2: Secondary metadata sector.
11470 + * @p1: Primary BBR table.
11471 + * @p2: Secondary BBR table.
11473 + * Validate both copies of the BBR table. If one of them is invalid,
11474 + * try to correct the errors using the valid copy.
11476 +static u32 validate_bbr_tables(struct evms_logical_node * node,
11477 + struct evms_bbr_metadata * MD1,
11478 + struct evms_bbr_metadata * MD2,
11479 + struct evms_bbr_table * p1,
11480 + struct evms_bbr_table * p2)
11482 + u32 i, rc1, rc2, nr_sects;
11484 + nr_sects = MD1->nr_sects_bbr_table;
11485 + if ( nr_sects != MD2->nr_sects_bbr_table ) {
11486 + nr_sects = (nr_sects < MD2->nr_sects_bbr_table) ?
11487 + nr_sects : MD2->nr_sects_bbr_table;
11488 + LOG_SERIOUS("Size of BBR tables don't match. Using %u\n",
11492 + for ( i = 0; i < nr_sects; i++, p1++, p2++ ) {
11493 + rc1 = validate_bbr_table_sector(p1);
11495 + LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11496 + MD1->start_sect_bbr_table + i);
11498 + rc2 = validate_bbr_table_sector(p2);
11500 + LOG_WARNING("Invalid BBR table sector at "PFU64".\n",
11501 + MD2->start_sect_bbr_table + i);
11504 + /* Correct BBR table errors. */
11505 + if (rc1 && rc2) {
11506 + /* Cannot fix. */
11508 + } else if (rc1) {
11509 + update_invalid_bbr_table_sector(node, p2, p1,
11510 + MD1->start_sect_bbr_table + i);
11512 + } else if (rc2) {
11513 + update_invalid_bbr_table_sector(node, p1, p2,
11514 + MD2->start_sect_bbr_table + i);
11518 + if ( p1->sequence_number != p2->sequence_number ) {
11519 + LOG_WARNING("Sequence numbers for BBR table index %u don't match.\n", i);
11520 + LOG_WARNING("MD1 sequence_nr=%u, MD2 sequence_nr_2=%u\n",
11521 + p1->sequence_number, p2->sequence_number);
11522 + if ( p1->sequence_number < p2->sequence_number ) {
11523 + update_invalid_bbr_table_sector(node, p2, p1,
11524 + MD1->start_sect_bbr_table + i);
11526 + update_invalid_bbr_table_sector(node, p1, p2,
11527 + MD2->start_sect_bbr_table + i);
11531 + if ( i != nr_sects ) {
11532 + LOG_SERIOUS("Stopped validation at sector %u\n", i);
11535 + LOG_DEBUG("Validated %u BBR table sectors.\n", nr_sects);
11540 + * validate_meta_data
11542 + * Check the specified BBR metadata sector for a valid signature and CRC.
11544 +static int validate_meta_data(struct evms_bbr_metadata * md)
11546 + int org_crc, final_crc;
11548 + BBR_DEBUG_PRINT_META_DATA(md);
11550 + if ( le32_to_cpup(&md->signature) != EVMS_BBR_SIGNATURE ) {
11551 + LOG_SERIOUS("BBR signature doesn't match!\n");
11552 + LOG_SERIOUS("Found: 0x%08X Expecting: 0x%08X\n",
11553 + le32_to_cpup(&md->signature), EVMS_BBR_SIGNATURE);
11558 + org_crc = le32_to_cpup(&md->crc);
11560 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, md,
11562 + if ( final_crc != org_crc ) {
11563 + LOG_ERROR("CRC failed!\n");
11564 + LOG_ERROR("Sector has (0x%08X) calculated(0x%08X)\n",
11565 + org_crc, final_crc);
11568 + md->crc = cpu_to_le32p(&org_crc);
11570 + LOG_WARNING("Metadata sector has no CRC!\n");
11573 + le_meta_data_to_cpu(md);
11578 + * bbr_load_meta_data
11579 + * @node: BBR node to read metadata from.
11580 + * @lsn: Sector to read metadata from.
11581 + * @md: Pointer to return metadata structure.
11582 + * @bbr_table: Pointer to return BBR table.
11584 + * Load one copy of the BBR metadata. If the metadata is valid, load the
11585 + * corresponding copy of the BBR table.
11587 +static int load_meta_data(struct evms_logical_node * node,
11589 + struct evms_bbr_metadata ** md,
11590 + struct evms_bbr_table ** bbr_table)
11595 + *bbr_table = NULL;
11598 + LOG_WARNING("No sector specified for BBR metadata on %s.\n",
11603 + /* Allocate a buffer for the metadata sector. */
11604 + *md = kmalloc(sizeof(struct evms_bbr_metadata), GFP_KERNEL);
11606 + LOG_ERROR("kmalloc error creating metadata buffer for %s.\n",
11611 + /* Read the metadata sector. */
11612 + rc = INIT_IO(node, 0, lsn, 1, *md);
11614 + LOG_ERROR("init_io error on %s.\n", node->name);
11620 + /* Validate the metadata sector. */
11621 + rc = validate_meta_data(*md);
11623 + LOG_ERROR("Error validating metadata for %s.\n", node->name);
11629 + /* Allocate a buffer for the BBR table. */
11630 + *bbr_table = kmalloc((*md)->nr_sects_bbr_table <<
11631 + EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL);
11632 + if (!*bbr_table) {
11633 + LOG_ERROR("kmalloc error creating BBR table buffer for %s.\n",
11640 + /* Read the BBR table but don't validate here. */
11641 + rc = INIT_IO(node, 0, (*md)->start_sect_bbr_table,
11642 + (*md)->nr_sects_bbr_table, *bbr_table);
11644 + LOG_ERROR("init_io error on %s.\n", node->name);
11647 + kfree(*bbr_table);
11648 + *bbr_table = NULL;
11655 + * bbr_load_feature_data
11656 + * @node: BBR node
11657 + * @ID: Return pointer to BBR private data.
11659 + * Load both copies of the BBR metadata and table. If one is invalid, try
11660 + * to correct is using the valid copy. When a valid copy is found, create
11661 + * a private data structure for the specified node.
11663 +static int load_feature_data(struct evms_logical_node * node,
11664 + struct bbr_private ** ID)
11666 + struct evms_bbr_metadata * md1 = NULL;
11667 + struct evms_bbr_metadata * md2 = NULL;
11668 + struct evms_bbr_table * table1 = NULL;
11669 + struct evms_bbr_table * table2 = NULL;
11670 + u64 lba_table1 = 0, lba_table2 = 0;
11671 + u32 nr_sects = 0;
11672 + int rc = 0, rc1, rc2;
11676 + /* Load metadata 1 */
11677 + rc1 = load_meta_data(node,
11678 + node->feature_header->feature_data1_start_lsn,
11680 + /* Load metadata 2 */
11681 + rc2 = load_meta_data(node,
11682 + node->feature_header->feature_data2_start_lsn,
11685 + if (rc1 && rc2) {
11686 + /* Both copies are bad? Cannot continue. */
11688 + } else if (rc1 || rc2) {
11689 + /* One copy is bad. Use the good copy. */
11691 + lba_table2 = md2->start_sect_bbr_table;
11699 + lba_table1 = md1->start_sect_bbr_table;
11702 + nr_sects = validate_bbr_table(md1, table1);
11703 + if ( nr_sects == 0 ) {
11707 + lba_table1 = md1->start_sect_bbr_table;
11708 + lba_table2 = md2->start_sect_bbr_table;
11709 + nr_sects = validate_bbr_tables(node, md1, md2, table1, table2);
11710 + if ( nr_sects == 0 ) {
11715 + if (!rc && nr_sects) {
11716 + *ID = kmalloc(sizeof(struct bbr_private), GFP_KERNEL);
11718 + memset(*ID, 0, sizeof(struct bbr_private));
11719 + (*ID)->source = node;
11720 + (*ID)->blksize_in_sects = md1->block_size >>
11721 + EVMS_VSECTOR_SIZE_SHIFT;
11722 + (*ID)->remap_root = NULL;
11723 + (*ID)->lba_table1 = lba_table1;
11724 + (*ID)->lba_table2 = lba_table2;
11725 + (*ID)->bbr_table = table1;
11726 + (*ID)->nr_sects_bbr_table = nr_sects;
11727 + if ( nr_sects < md1->nr_sects_bbr_table ) {
11728 + LOG_WARNING("Making BBR node read-only\n");
11729 + (*ID)->flag |= EVMS_VOLUME_READ_ONLY;
11731 + (*ID)->nr_replacement_blks = nr_sects *
11732 + EVMS_BBR_ENTRIES_PER_SECT;
11733 + (*ID)->start_replacement_sect = md1->start_replacement_sect;
11734 + (*ID)->in_use_replacement_blks = (atomic_t)ATOMIC_INIT(0);
11735 + (*ID)->bbr_id_lock = SPIN_LOCK_UNLOCKED;
11736 + if ( !bbr_remap_pool || !bbr_io_buf_pool ) {
11737 + rc = bbr_create_pools();
11740 + atomic_set(&(*ID)->in_use_replacement_blks,
11741 + bbr_table_to_remap_list(*ID));
11749 + if (!bbr_io_thread) {
11750 + const char * name = "evms_bbr_io";
11751 + bbr_io_thread = evms_cs_register_thread(bbr_io_handler,
11753 + if (!bbr_io_thread) {
11759 + /* If error, free table1. */
11765 + (*ID)->bbr_table = NULL;
11766 + bbr_free_private(*ID);
11771 + /* Will never use md1, md2 and table2 again */
11786 + * bbr_binary_tree_insert
11788 + * Insert a node into the binary tree.
11790 +void bbr_binary_tree_insert(struct bbr_runtime_remap ** root,
11791 + struct bbr_runtime_remap * newnode)
11793 + struct bbr_runtime_remap ** node = root;
11794 + while (node && *node) {
11795 + if ( newnode->remap.bad_sect > (*node)->remap.bad_sect ) {
11796 + node = &((*node)->right);
11798 + node = &((*node)->left);
11802 + newnode->left = newnode->right = NULL;
11807 + * bbr_binary_search
11809 + * Search for a node that contains bad_sect = lsn.
11811 +struct bbr_runtime_remap * bbr_binary_search(struct bbr_runtime_remap * root,
11814 + struct bbr_runtime_remap * node = root;
11816 + if (node->remap.bad_sect == lsn) {
11819 + if ( lsn > node->remap.bad_sect ) {
11820 + node = node->right;
11822 + node = node->left;
11829 + * bbr_binary_tree_destroy
11831 + * Destroy the binary tree.
11833 +void bbr_binary_tree_destroy(struct bbr_runtime_remap * root,
11834 + struct bbr_private * bbr_id)
11836 + struct bbr_runtime_remap ** link = NULL;
11837 + struct bbr_runtime_remap * node = root;
11840 + if (node->left) {
11841 + link = &(node->left);
11842 + node = node->left;
11845 + if (node->right) {
11846 + link = &(node->right);
11847 + node = node->right;
11851 + mempool_free(node, bbr_remap_pool);
11852 + if (node == root) {
11853 + /* If root is deleted, we're done. */
11857 + /* Back to root. */
11863 +static void bbr_free_remap(struct bbr_private * bbr_id)
11865 + unsigned long flags;
11866 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11867 + bbr_binary_tree_destroy(bbr_id->remap_root, bbr_id);
11868 + bbr_id->remap_root = NULL;
11869 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11873 + * bbr_insert_remap_entry
11875 + * Create a new remap entry and add it to the binary tree for this node.
11877 +static int bbr_insert_remap_entry(struct bbr_private * bbr_id,
11878 + struct evms_bbr_table_entry * new_bbr_entry)
11880 + struct bbr_runtime_remap * newnode = NULL;
11881 + unsigned long flags;
11884 + newnode = mempool_alloc(bbr_remap_pool, GFP_NOIO);
11887 + LOG_SERIOUS("Could not allocate from remap pool! (rc=%d)\n", rc);
11890 + newnode->remap.bad_sect = new_bbr_entry->bad_sect;
11891 + newnode->remap.replacement_sect = new_bbr_entry->replacement_sect;
11892 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11893 + bbr_binary_tree_insert(&bbr_id->remap_root, newnode);
11894 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11899 + * bbr_table_to_remap_list
11901 + * The on-disk bbr table is sorted by the replacement sector LBA. In order to
11902 + * improve run time performance, the in memory remap list must be sorted by
11903 + * the bad sector LBA. This function is called at discovery time to initialize
11904 + * the remap list. This function assumes that at least one copy of meta data
11907 +static u32 bbr_table_to_remap_list(struct bbr_private * bbr_id)
11909 + u32 in_use_blks = 0;
11911 + struct evms_bbr_table * p;
11914 + for ( i = 0, p = bbr_id->bbr_table;
11915 + i < bbr_id->nr_sects_bbr_table;
11917 + if (!p->in_use_cnt) {
11920 + in_use_blks += p->in_use_cnt;
11921 + for ( j = 0; j < p->in_use_cnt; j++ ) {
11922 + bbr_insert_remap_entry(bbr_id, &p->entries[j]);
11926 + return in_use_blks;
11930 + * bbr_search_remap_entry
11932 + * Search remap entry for the specified sector. If found, return a pointer to
11933 + * the table entry. Otherwise, return NULL.
11935 +static struct evms_bbr_table_entry * bbr_search_remap_entry(struct bbr_private * bbr_id,
11938 + struct bbr_runtime_remap * p;
11939 + unsigned long flags;
11941 + spin_lock_irqsave(&bbr_id->bbr_id_lock, flags);
11942 + p = bbr_binary_search(bbr_id->remap_root, lsn);
11943 + spin_unlock_irqrestore(&bbr_id->bbr_id_lock, flags);
11945 + return (&p->remap);
11954 + * If *lsn is in the remap table, return TRUE and modify *lsn,
11955 + * else, return FALSE.
11957 +static inline int bbr_remap(struct bbr_private * bbr_id,
11960 + struct evms_bbr_table_entry *e;
11962 + if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11963 + ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11964 + e = bbr_search_remap_entry(bbr_id, *lsn);
11966 + *lsn = e->replacement_sect;
11967 + LOG_EXTRA("%s replacement sector (LSN="PFU64")\n",
11968 + __FUNCTION__, *lsn);
11976 + * bbr_remap_probe
11978 + * If any of the sectors in the range [lsn, lsn+nr_sects] are in the remap
11979 + * table return TRUE, Else, return FALSE.
11981 +static inline int bbr_remap_probe(struct bbr_private * bbr_id,
11982 + u64 lsn, u64 nr_sects)
11986 + if ( atomic_read(&bbr_id->in_use_replacement_blks) &&
11987 + ! (bbr_id->flag & BBR_STOP_REMAP) ) {
11988 + for ( cnt = 0, tmp = lsn;
11990 + cnt += bbr_id->blksize_in_sects, tmp = lsn + cnt) {
11991 + if ( bbr_remap(bbr_id,&tmp) ) {
11999 +static void *bbr_slab_pool_alloc(int gfp_mask, void * data)
12001 + return kmem_cache_alloc(data, gfp_mask);
12004 +static void bbr_slab_pool_free(void *ptr, void * data)
12006 + kmem_cache_free(data, ptr);
12009 +static int bbr_create_pools(void)
12011 + /* Create a memory pool for the remap list. */
12012 + if (!bbr_remap_slab) {
12013 + bbr_remap_slab = kmem_cache_create("BBR_Remap_Slab",
12014 + sizeof(struct bbr_runtime_remap),
12015 + 0, SLAB_HWCACHE_ALIGN,
12017 + if (!bbr_remap_slab) {
12018 + panic("Unable to create BBR remap cache.");
12021 + if (!bbr_remap_pool) {
12022 + bbr_remap_pool = mempool_create(64, bbr_slab_pool_alloc,
12023 + bbr_slab_pool_free,
12025 + if (!bbr_remap_pool) {
12026 + panic("Unable to create BBR remap pool.");
12030 + /* Create a memory pool for the BBR I/O anchors. */
12031 + if (!bbr_io_buf_slab) {
12032 + bbr_io_buf_slab = kmem_cache_create("BBR_IO_Buf_Slab",
12033 + sizeof(struct bbr_io_buffer),
12034 + 0, SLAB_HWCACHE_ALIGN,
12036 + if (!bbr_io_buf_slab) {
12037 + panic("Unable to create BBR I/O buffer cache.");
12040 + if (!bbr_io_buf_pool) {
12041 + bbr_io_buf_pool = mempool_create(256, bbr_slab_pool_alloc,
12042 + bbr_slab_pool_free,
12043 + bbr_io_buf_slab);
12044 + if (!bbr_io_buf_pool) {
12045 + panic("Unable to create BBR I/O buffer pool.");
12052 +static void bbr_destroy_pools(void)
12054 + if (bbr_io_buf_pool) {
12055 + mempool_destroy(bbr_io_buf_pool);
12056 + bbr_io_buf_pool = NULL;
12058 + if (bbr_io_buf_slab) {
12059 + kmem_cache_destroy(bbr_io_buf_slab);
12060 + bbr_io_buf_slab = NULL;
12062 + if (bbr_remap_pool) {
12063 + mempool_destroy(bbr_remap_pool);
12064 + bbr_remap_pool = NULL;
12066 + if (bbr_remap_slab) {
12067 + kmem_cache_destroy(bbr_remap_slab);
12068 + bbr_remap_slab = NULL;
12075 + * Search through the discover list looking for object with BBR metadata.
12076 + * Remove them from the list and replace with a new BBR node.
12078 +static int bbr_discover(struct evms_logical_node ** discover_list)
12080 + struct evms_logical_node * node, * next_node;
12081 + struct evms_logical_node * bbr_node = NULL;
12082 + struct bbr_private * bbr_id;
12083 + int bad_blocks, rc = 0;
12085 + MOD_INC_USE_COUNT;
12087 + next_node = *discover_list;
12088 + while (next_node) {
12089 + node = next_node;
12090 + next_node = node->next;
12092 + /* The node must have a BBR feature-header. */
12093 + if ( ! node->feature_header ||
12094 + node->feature_header->feature_id != plugin_header.id ) {
12098 + rc = load_feature_data(node, &bbr_id);
12100 + /* Error loading feature data.
12101 + * This node belongs to us, but metadata is invalid,
12102 + * - remove it from the discovery list
12104 + * - clear error code then continue.
12105 + * Will consider creating a read only BBR node in
12108 + LOG_SERIOUS("Error in node (%s) with "PFU64" sectors.\n",
12109 + node->name, node->total_vsectors);
12110 + evms_cs_remove_logical_node_from_list(discover_list,
12117 + rc = evms_cs_allocate_logical_node(&bbr_node);
12119 + LOG_SERIOUS("Could not allocate logical node! rc=%d\n", rc);
12120 + bbr_free_private(bbr_id);
12124 + MOD_INC_USE_COUNT;
12125 + bbr_node->volume_info = node->volume_info;
12126 + bbr_node->flags |= node->flags;
12127 + bbr_node->plugin = &plugin_header;
12128 + strcpy(bbr_node->name,
12129 + node->feature_header->object_name);
12130 + bbr_node->hardsector_size = node->hardsector_size;
12131 + bbr_node->total_vsectors = node->total_vsectors - 2 -
12132 + node->feature_header->feature_data1_size -
12133 + node->feature_header->feature_data2_size;
12134 + bbr_node->block_size = node->block_size;
12135 + bbr_node->private = bbr_id;
12136 + bbr_id->node = bbr_node;
12138 + /* Free the feature header */
12139 + kfree(node->feature_header);
12140 + node->feature_header = NULL;
12141 + evms_cs_remove_logical_node_from_list(discover_list, node);
12143 + /* If bad blocks exist, give warning */
12144 + bad_blocks = atomic_read(&bbr_id->in_use_replacement_blks);
12145 + if (bad_blocks) {
12146 + BBR_DEBUG_PRINT_REMAP_LIST(bbr_id);
12147 + LOG_WARNING("%s has %d bad blocks.\n",
12148 + bbr_id->source->name, bad_blocks);
12149 + LOG_WARNING("There are "PFU64" total replacement blocks.\n",
12150 + bbr_id->nr_replacement_blks);
12151 + LOG_WARNING("There are "PFU64" remaining replacement blocks.\n",
12152 + bbr_id->nr_replacement_blks -
12156 + evms_cs_add_logical_node_to_list(discover_list, bbr_node);
12157 + bbr_list_add(bbr_id);
12160 + MOD_DEC_USE_COUNT;
12164 +static inline void bbr_list_add(struct bbr_private * bbr_id)
12166 + bbr_id->next = bbr_instances;
12167 + bbr_instances = bbr_id;
12170 +static void bbr_list_remove(struct bbr_private * bbr_id)
12172 + struct bbr_private ** p;
12174 + for ( p = &bbr_instances; *p; p = &(*p)->next ) {
12175 + if ( *p == bbr_id ) {
12182 +static struct bbr_private * bbr_find_private(char * object_name)
12184 + struct bbr_private * p;
12186 + for ( p = bbr_instances; p; p = p->next ) {
12187 + if ( ! strncmp(p->node->name, object_name,
12188 + EVMS_VOLUME_NAME_SIZE) ) {
12195 +static void bbr_free_private(struct bbr_private * bbr_id)
12197 + if (bbr_id->remap_root) {
12198 + bbr_free_remap(bbr_id);
12200 + if (bbr_id->bbr_table) {
12201 + kfree(bbr_id->bbr_table);
12203 + bbr_list_remove(bbr_id);
12210 + * Delete the specified BBR node and the node it is built on. If the last BBR
12211 + * node is deleted, shut down the I/O thread.
12213 +static int bbr_delete(struct evms_logical_node * bbr_node)
12215 + struct bbr_private * bbr_id;
12218 + bbr_id = bbr_node->private;
12220 + rc = DELETE(bbr_id->source);
12222 + /* Now cleanup and go away */
12223 + bbr_free_private(bbr_id);
12224 + evms_cs_deallocate_logical_node(bbr_node);
12225 + if (!bbr_instances) {
12226 + bbr_destroy_pools();
12227 + if (bbr_io_thread) {
12228 + evms_cs_unregister_thread(bbr_io_thread);
12229 + bbr_io_thread = NULL;
12232 + MOD_DEC_USE_COUNT;
12237 +static struct bbr_io_buffer * allocate_bbr_io_buf(struct bbr_private * bbr_id,
12238 + struct buffer_head * bh,
12241 + struct bbr_io_buffer * bbr_io_buf;
12243 + bbr_io_buf = mempool_alloc(bbr_io_buf_pool, GFP_NOIO);
12244 + if (bbr_io_buf) {
12245 + memset(bbr_io_buf, 0, sizeof(struct bbr_io_buffer));
12246 + INIT_LIST_HEAD(&bbr_io_buf->bbr_io_list);
12247 + bbr_io_buf->bbr_id = bbr_id;
12248 + bbr_io_buf->bh = bh;
12249 + bbr_io_buf->rw = rw;
12251 + LOG_WARNING("Could not allocate from BBR I/O buffer pool!\n");
12253 + return bbr_io_buf;
12256 +static void free_bbr_io_buf(struct bbr_io_buffer * bbr_io_buf)
12258 + mempool_free(bbr_io_buf, bbr_io_buf_pool);
12262 + * bbr_io_remap_error
12263 + * @bbr_id: Private data for the BBR node.
12264 + * @rw: READ or WRITE.
12265 + * @starting_lsn: Starting sector of request to remap.
12266 + * @count: Number of sectors in the request.
12267 + * @buffer: Data buffer for the request.
12269 + * For the requested range, try to write each sector individually. For each
12270 + * sector that fails, find the next available remap location and write the
12271 + * data to that new location. Then update the table and write both copies
12272 + * of the table to disk. Finally, update the in-memory mapping and do any
12273 + * other necessary bookkeeping.
12275 +static int bbr_io_remap_error(struct bbr_private * bbr_id,
12277 + u64 starting_lsn,
12281 + struct evms_bbr_table * bbr_table;
12282 + unsigned long table_sector_index;
12283 + unsigned long table_sector_offset;
12284 + unsigned long index;
12285 + u64 lsn, new_lsn;
12288 + if ( rw == READ ) {
12289 + /* Nothing can be done about read errors. */
12293 + /* For each sector in the request. */
12294 + for ( lsn = 0; lsn < count; lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12295 + rc = INIT_IO(bbr_id->source, rw, starting_lsn + lsn, 1, buffer);
12297 + if ( bbr_id->flag & BBR_STOP_REMAP ) {
12298 + /* Can't allow new remaps if the
12299 + * engine told us to stop.
12301 + LOG_ERROR("Object %s: Bad sector ("PFU64"), but remapping is turned off.\n",
12302 + bbr_id->node->name, starting_lsn+lsn);
12306 + /* Find the next available relocation sector. */
12307 + new_lsn = atomic_read(&bbr_id->in_use_replacement_blks);
12308 + if ( new_lsn >= bbr_id->nr_replacement_blks ) {
12309 + /* No more replacement sectors available. */
12312 + new_lsn += bbr_id->start_replacement_sect;
12314 + /* Write the data to its new location. */
12315 + LOG_WARNING("Object %s: Trying to remap bad sector ("PFU64") to sector ("PFU64")\n",
12316 + bbr_id->node->name, starting_lsn + lsn,
12318 + rc = INIT_IO(bbr_id->source, rw, new_lsn, 1, buffer);
12320 + /* This replacement sector is bad.
12321 + * Try the next one.
12323 + LOG_ERROR("Object %s: Replacement sector ("PFU64") is bad. Skipping.\n",
12324 + bbr_id->node->name, new_lsn);
12325 + atomic_inc(&bbr_id->in_use_replacement_blks);
12329 + /* Add this new entry to the on-disk table. */
12330 + table_sector_index = new_lsn -
12331 + bbr_id->start_replacement_sect;
12332 + table_sector_offset = table_sector_index /
12333 + EVMS_BBR_ENTRIES_PER_SECT;
12334 + index = table_sector_index % EVMS_BBR_ENTRIES_PER_SECT;
12336 + bbr_table = &bbr_id->bbr_table[table_sector_offset];
12337 + bbr_table->entries[index].bad_sect = starting_lsn + lsn;
12338 + bbr_table->entries[index].replacement_sect = new_lsn;
12339 + bbr_table->in_use_cnt++;
12340 + bbr_table->sequence_number++;
12341 + bbr_table->crc = 0;
12342 + bbr_table->crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
12344 + sizeof(struct evms_bbr_table));
12346 + /* Write the table to disk. */
12347 + cpu_bbr_table_sector_to_le(bbr_table, bbr_table);
12348 + if ( bbr_id->lba_table1 ) {
12349 + rc = INIT_IO(bbr_id->source, WRITE,
12350 + bbr_id->lba_table1 +
12351 + table_sector_offset,
12354 + if ( bbr_id->lba_table2 ) {
12355 + rc |= INIT_IO(bbr_id->source, WRITE,
12356 + bbr_id->lba_table2 +
12357 + table_sector_offset,
12360 + le_bbr_table_sector_to_cpu(bbr_table);
12363 + /* Error writing one of the tables to disk. */
12364 + LOG_ERROR("Object %s: Error updating BBR tables on disk.\n",
12365 + bbr_id->node->name);
12369 + /* Insert a new entry in the remapping binary-tree. */
12370 + rc = bbr_insert_remap_entry(bbr_id,
12371 + &bbr_table->entries[index]);
12373 + LOG_ERROR("Object %s: Error adding new entry to remap tree.\n",
12374 + bbr_id->node->name);
12378 + atomic_inc(&bbr_id->in_use_replacement_blks);
12386 + * bbr_io_process_request
12388 + * For each sector in this request, check if the sector has already
12389 + * been remapped. If so, process all previous sectors in the request,
12390 + * followed by the remapped sector. Then reset the starting lsn and
12391 + * count, and keep going with the rest of the request as if it were
12392 + * a whole new request. If any of the INIT_IO's return an error,
12393 + * call the remapper to relocate the bad sector(s).
12395 +static int bbr_io_process_request(struct bbr_io_buffer * bbr_io_buf)
12397 + struct bbr_private * bbr_id = bbr_io_buf->bbr_id;
12398 + u64 starting_lsn = bbr_io_buf->bh->b_rsector;
12399 + u64 count = bbr_io_buf->bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
12400 + u64 lsn, remapped_lsn;
12401 + char * buffer = bbr_io_buf->bh->b_data;
12402 + int rc = 0, rw = bbr_io_buf->rw;
12404 + /* For each sector in this request, check if this sector has already
12405 + * been remapped. If so, process all previous sectors in this request,
12406 + * followed by the remapped sector. Then reset the starting lsn and
12407 + * count and keep going with the rest of the request as if it were
12408 + * a whole new request.
12410 + for ( lsn = 0; lsn < count && !(bbr_id->flag & BBR_STOP_REMAP); lsn++ ) {
12411 + remapped_lsn = starting_lsn + lsn;
12412 + rc = bbr_remap(bbr_id, &remapped_lsn);
12414 + /* This sector is fine. */
12418 + /* Process all sectors in the request up to this one. */
12420 + rc = INIT_IO(bbr_id->source, rw,
12421 + starting_lsn, lsn, buffer);
12423 + /* If this I/O failed, then one of the sectors
12424 + * in this request needs to be relocated.
12426 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12432 + buffer += (lsn << EVMS_VSECTOR_SIZE_SHIFT);
12435 + /* Process the remapped sector. */
12436 + rc = INIT_IO(bbr_id->source, rw, remapped_lsn, 1, buffer);
12438 + /* BUGBUG - Need more processing if this caused an
12439 +			 * error. If this I/O failed, then the existing
12440 + * remap is now bad, and we need to find a new remap.
12441 + * Can't use bbr_io_remap_error(), because the existing
12442 + * map entry needs to be changed, not added again, and
12443 + * the original table entry also needs to be changed.
12448 + buffer += EVMS_VSECTOR_SIZE;
12449 + starting_lsn += (lsn + 1);
12450 + count -= (lsn + 1);
12454 + /* Check for any remaining sectors after the last split. This could
12455 + * potentially be the whole request, but that should be a rare case
12456 + * because requests should only be processed by the thread if we know
12457 + * an error occurred or they contained one or more remapped sectors.
12460 + rc = INIT_IO(bbr_id->source, rw, starting_lsn, count, buffer);
12462 + /* If this I/O failed, then one of the sectors in this
12463 + * request needs to be relocated.
12465 + rc = bbr_io_remap_error(bbr_id, rw, starting_lsn,
12479 + * This is the handler for the bbr_io_thread. It continuously loops,
12480 + * taking I/O requests off its list and processing them. If nothing
12481 + * is on the list, the thread goes back to sleep until specifically
12484 + * I/O requests should only be sent to this thread if we know that:
12485 + * a) the request contains at least one remapped sector.
12487 + * b) the request caused an error on the normal I/O path.
12488 + * This function uses synchronous I/O, so sending a request to this
12489 + * thread that doesn't need special processing will cause severe
12490 + * performance degradation.
12492 +static void bbr_io_handler(void * void_data)
12494 + struct bbr_io_buffer * bbr_io_buf;
12495 + struct buffer_head * bh;
12496 + unsigned long flags;
12500 + /* Process bbr_io_list, one entry at a time. */
12501 + spin_lock_irqsave(&bbr_io_list_lock, flags);
12502 + if (list_empty(&bbr_io_list)) {
12503 + /* No more items on the list. */
12504 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12507 + bbr_io_buf = list_entry(bbr_io_list.next,
12508 + struct bbr_io_buffer, bbr_io_list);
12509 + list_del(&bbr_io_buf->bbr_io_list);
12510 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12512 + rc = bbr_io_process_request(bbr_io_buf);
12514 + /* Clean up and complete the original I/O. */
12515 + bh = bbr_io_buf->bh;
12516 + if (bh->b_end_io) {
12517 + free_bbr_io_buf(bbr_io_buf);
12518 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12519 + bh->b_end_io(bh, rc ? 0 : 1);
12521 + /* A request that originated from bbr_init_io. */
12522 + bbr_io_buf->rc = rc;
12523 + complete(bbr_io_buf->complete);
12529 + * bbr_schedule_io
12531 + * Place the specified bbr_io_buf on the thread's processing list.
12533 +static void bbr_schedule_io(struct bbr_io_buffer * bbr_io_buf)
12535 + unsigned long flags;
12537 + spin_lock_irqsave(&bbr_io_list_lock, flags);
12538 + list_add_tail(&bbr_io_buf->bbr_io_list, &bbr_io_list);
12539 + spin_unlock_irqrestore(&bbr_io_list_lock, flags);
12540 + evms_cs_wakeup_thread(bbr_io_thread);
12546 + * If there are any remapped sectors on this object, send this request over
12547 + * to the thread for processing. Otherwise send it down the stack normally.
12549 +static void bbr_read(struct evms_logical_node * bbr_node,
12550 + struct buffer_head * bh )
12552 + struct bbr_private * bbr_id = bbr_node->private;
12553 + struct bbr_io_buffer * bbr_io_buf;
12555 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12556 + bbr_node->total_vsectors ) {
12557 + /* Request is off the end of the object. */
12558 + bh->b_end_io(bh, 0);
12562 + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12563 + bbr_id->flag & BBR_STOP_REMAP ||
12564 + ! bbr_remap_probe(bbr_id, bh->b_rsector,
12565 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12566 + /* No existing remaps, this request doesn't contain any
12567 + * remapped sectors, or the engine told us not to remap.
12569 + R_IO(bbr_id->source, bh);
12573 + /* This request has at least one remapped sector. */
12574 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, READ);
12575 + if (!bbr_io_buf) {
12576 + /* Can't get memory to track the I/O. */
12577 + bh->b_end_io(bh, 0);
12581 + evms_cs_volume_request_in_progress(bbr_io_buf->bh->b_rdev, +1, NULL);
12582 + bbr_schedule_io(bbr_io_buf);
12586 + * bbr_write_callback
12588 + * This is the callback for normal write requests. Check for an error
12589 + * during the I/O, and send to the thread for processing if necessary.
12591 +static void bbr_write_callback(struct buffer_head * bh,
12594 + struct bbr_io_buffer * bbr_io_buf = bh->b_private;
12596 + bh->b_end_io = bbr_io_buf->org_end_io;
12597 + bh->b_private = bbr_io_buf->org_private;
12598 + bh->b_rsector = bbr_io_buf->org_rsector;
12599 + bh->b_rdev = bbr_io_buf->org_dev;
12601 + if (!(bbr_io_buf->bbr_id->flag & BBR_STOP_REMAP) &&
12603 + LOG_ERROR("Object %s: Write failure on sector ("PFU64"). Scheduling for retry.\n",
12604 + bbr_io_buf->bbr_id->node->name, (u64)bbr_io_buf->bh->b_rsector);
12605 + bbr_schedule_io(bbr_io_buf);
12607 + free_bbr_io_buf(bbr_io_buf);
12608 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
12609 + bh->b_end_io(bh, uptodate);
12616 + * If there are any remapped sectors on this object, send the request over
12617 + * to the thread for processing. Otherwise, register for callback
12618 + * notification, and send the request down normally.
12620 +static void bbr_write(struct evms_logical_node * bbr_node,
12621 + struct buffer_head * bh)
12623 + struct bbr_private * bbr_id = bbr_node->private;
12624 + struct bbr_io_buffer * bbr_io_buf;
12626 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
12627 + bbr_node->total_vsectors ||
12628 + bbr_id->flag & EVMS_VOLUME_READ_ONLY ) {
12629 + /* Request is off the end of the object, or this
12630 + * is a read-only object.
12632 + bh->b_end_io(bh, 0);
12636 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, bh, WRITE);
12637 + if (!bbr_io_buf) {
12638 + /* Can't get memory to track the I/O. */
12639 + bh->b_end_io(bh, 0);
12643 + evms_cs_volume_request_in_progress(bh->b_rdev, +1, NULL);
12645 + if ( atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12646 + bbr_id->flag & BBR_STOP_REMAP ||
12647 + ! bbr_remap_probe(bbr_id, bh->b_rsector,
12648 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) ) {
12649 + /* No existing remaps, this request contains no remapped
12650 + * sectors, or the engine said to stop remapping.
12652 + bbr_io_buf->org_end_io = bh->b_end_io;
12653 + bbr_io_buf->org_private = bh->b_private;
12654 + bbr_io_buf->org_rsector = bh->b_rsector;
12655 + bbr_io_buf->org_dev = bh->b_rdev;
12656 + bh->b_end_io = bbr_write_callback;
12657 + bh->b_private = bbr_io_buf;
12658 + W_IO(bbr_id->source, bh);
12660 + /* This request contains at least one remapped sector. */
12661 + bbr_schedule_io(bbr_io_buf);
12666 + * bbr_init_io_schedule_io
12667 + * @bbr_id: Private data for the BBR node.
12668 + * @rw: READ or WRITE.
12669 + * @lsn: Starting sector for the request.
12670 + * @count: Number of sectors in the request.
12671 + * @buffer: Data buffer for the request.
12673 + * During init_io, failures must still be handled by the I/O thread. Create
12674 + * a bbr_io_buf, and schedule it to be handled by the thread. Then wait until
12675 + * the request is complete.
12677 +static int bbr_init_io_schedule_io(struct bbr_private * bbr_id,
12683 + struct bbr_io_buffer * bbr_io_buf;
12684 + struct buffer_head bh;
12685 + struct completion complete;
12688 + if ( rw != WRITE ) {
12689 + /* Nothing can be done about read failures. */
12693 + LOG_ERROR("Object %s: init_io write failure (sector "PFU64": count "PFU64"). Scheduling for retry.\n",
12694 + bbr_id->node->name, lsn, count);
12695 + bbr_io_buf = allocate_bbr_io_buf(bbr_id, &bh, rw);
12696 + if (!bbr_io_buf) {
12700 + memset(&bh, 0, sizeof(struct buffer_head));
12701 + init_waitqueue_head(&bh.b_wait);
12702 + bh.b_rsector = lsn;
12703 + bh.b_size = count << EVMS_VSECTOR_SIZE_SHIFT;
12704 + bh.b_data = buffer;
12705 + bh.b_end_io = NULL;
12707 + /* Schedule the I/O and wait for it to finish. */
12708 + bbr_io_buf->complete = &complete;
12709 + init_completion(bbr_io_buf->complete);
12710 + bbr_schedule_io(bbr_io_buf);
12711 + wait_for_completion(bbr_io_buf->complete);
12713 + rc = bbr_io_buf->rc;
12714 + free_bbr_io_buf(bbr_io_buf);
12721 + * @bbr_node: BBR node.
12722 + * @rw: READ or WRITE.
12723 + * @lsn: Starting sector for I/O request.
12724 + * @count: Number of sectors in the I/O request.
12725 + * @buffer: Data buffer for the I/O request.
12727 + * Synchronous I/O requests.
12729 +static int bbr_init_io(struct evms_logical_node * bbr_node,
12735 + struct bbr_private * bbr_id = bbr_node->private;
12739 + if ( start_lsn + count > bbr_node->total_vsectors ) {
12740 + /* Request is off the end of the object. */
12744 + if ( rw == WRITE && (bbr_id->flag & EVMS_VOLUME_READ_ONLY) ) {
12745 + /* Can't write to a read-only object. */
12749 + if ( bbr_id->flag & BBR_STOP_REMAP ||
12750 + atomic_read(&bbr_id->in_use_replacement_blks) == 0 ||
12751 + ! bbr_remap_probe(bbr_id, start_lsn, count) ) {
12752 + /* Normal case (no existing remaps). */
12753 + rc = INIT_IO(bbr_id->source, rw, start_lsn, count, buffer);
12754 + if (rc && ! (bbr_id->flag & BBR_STOP_REMAP) ) {
12755 + /* Init_io error. Send request over to
12756 + * thread for further processing.
12758 + rc = bbr_init_io_schedule_io(bbr_id, rw, start_lsn,
12762 + /* At least one sector in this request needs to be remapped.
12763 + * Test and send each one down individually.
12765 + for ( lsn = start_lsn;
12766 + lsn < start_lsn + count;
12767 + lsn++, buffer += EVMS_VSECTOR_SIZE ) {
12768 + bbr_remap(bbr_id, &lsn);
12769 + rc = INIT_IO(bbr_id->source, rw, lsn, 1, buffer);
12771 + /* Init_io error. Send request
12772 + * to thread for processing.
12774 + rc = bbr_init_io_schedule_io(bbr_id, rw,
12787 + * bbr_direct_ioctl_sector_io
12789 + * Process an I/O from the engine on an active BBR object.
12791 +static int bbr_direct_ioctl_sector_io(struct bbr_private * bbr_id,
12792 + struct evms_notify_bbr * notify)
12794 + char * buffer, * user_buffer;
12798 + buffer = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
12803 + user_buffer = (char*)notify->buffer;
12806 + lsn < notify->nr_sect;
12807 + lsn++, user_buffer += EVMS_VSECTOR_SIZE ) {
12808 + if ( notify->rw == WRITE ) {
12809 + if ( copy_from_user(buffer, user_buffer,
12810 + EVMS_VSECTOR_SIZE) ) {
12816 + rc = bbr_init_io(bbr_id->node, notify->rw,
12817 + notify->start_sect + lsn, 1, buffer);
12822 + if ( notify->rw == READ ) {
12823 + if ( copy_to_user(user_buffer, buffer,
12824 + EVMS_VSECTOR_SIZE) ) {
12836 + * bbr_direct_ioctl
12840 + * @arg: Pointer to an evms_plugin_ioctl_pkt.
12842 + * BBR-specific ioctls from the engine. Currently handles:
12843 + * BBR_STOP_REMAP_CMD
12844 + * BBR_GET_INFO_CMD
12845 + * BBR_SECTOR_IO_CMD
12847 +static int bbr_direct_ioctl(struct inode * inode,
12848 + struct file * file,
12849 + unsigned int cmd,
12850 + unsigned long arg)
12853 + struct bbr_private * bbr_id;
12854 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
12855 + struct evms_notify_bbr notify, * user_notify;
12857 + MOD_INC_USE_COUNT;
12859 + user_pkt = (struct evms_plugin_ioctl_pkt *)arg;
12860 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
12861 + MOD_DEC_USE_COUNT;
12865 + if ( pkt.feature_id != plugin_header.id ) {
12866 + MOD_DEC_USE_COUNT;
12870 + user_notify = (struct evms_notify_bbr *)pkt.feature_ioctl_data;
12871 + if ( copy_from_user(¬ify, user_notify, sizeof(notify)) ) {
12874 + bbr_id = bbr_find_private(notify.object_name);
12879 + switch(pkt.feature_command) {
12881 + case BBR_STOP_REMAP_CMD:
12882 + bbr_id->flag |= BBR_STOP_REMAP;
12883 + /* Fall through. */
12885 + case BBR_GET_INFO_CMD:
12886 + notify.count = atomic_read(&bbr_id->in_use_replacement_blks);
12887 + if ( copy_to_user(&user_notify->count,
12889 + sizeof(user_notify->count))) {
12894 + case BBR_SECTOR_IO_CMD:
12895 + rc = bbr_direct_ioctl_sector_io(bbr_id,
12906 + copy_to_user(user_pkt, &pkt, sizeof(pkt));
12907 + MOD_DEC_USE_COUNT;
12913 + * @bbr_node: BBR node.
12916 + * @cmd: ioctl command to process.
12917 + * @arg: ioctl-specific data pointer.
12919 + * IOCTL handler. Currently BBR handles plugin-specific ioctls, as well as
12920 + * EVMS_GET_BMAP. All others are passed to the child node.
12922 +static int bbr_ioctl (struct evms_logical_node * bbr_node,
12923 + struct inode * inode,
12924 + struct file * file,
12925 + unsigned int cmd,
12926 + unsigned long arg)
12928 + struct bbr_private * bbr_id = bbr_node->private;
12929 + struct evms_get_bmap_pkt * bmap;
12933 + case EVMS_PLUGIN_IOCTL:
12934 + rc = bbr_direct_ioctl(inode, file, cmd, arg);
12937 + case EVMS_GET_BMAP:
12938 + bmap = (struct evms_get_bmap_pkt *)arg;
12939 + bbr_remap(bbr_id, &bmap->rsector);
12943 + rc = IOCTL(bbr_id->source, inode, file, cmd, arg);
12948 +static int __init bbr_init(void)
12950 + return evms_cs_register_plugin(&plugin_header);
12953 +static void __exit bbr_exit(void)
12955 + evms_cs_unregister_plugin(&plugin_header);
12958 +module_init(bbr_init);
12959 +module_exit(bbr_exit);
12960 +#ifdef MODULE_LICENSE
12961 +MODULE_LICENSE("GPL");
12964 diff -Naur linux-2002-09-30/drivers/evms/evms_drivelink.c evms-2002-09-30/drivers/evms/evms_drivelink.c
12965 --- linux-2002-09-30/drivers/evms/evms_drivelink.c Wed Dec 31 18:00:00 1969
12966 +++ evms-2002-09-30/drivers/evms/evms_drivelink.c Fri Sep 13 16:09:55 2002
12968 +/* -*- linux-c -*-
12971 + * Copyright (c) International Business Machines Corp., 2000
12973 + * This program is free software; you can redistribute it and/or modify
12974 + * it under the terms of the GNU General Public License as published by
12975 + * the Free Software Foundation; either version 2 of the License, or
12976 + * (at your option) any later version.
12978 + * This program is distributed in the hope that it will be useful,
12979 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
12980 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12981 + * the GNU General Public License for more details.
12983 + * You should have received a copy of the GNU General Public License
12984 + * along with this program; if not, write to the Free Software
12985 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
12990 + * linux/drivers/evms/drvlink.c
12993 + * EVMS Drive Linking Feature.
12995 + * This feature provides the ability to link multiple storage objects
12996 + * together as a single virtual storage object.
13000 +#include <linux/module.h>
13001 +#include <linux/kernel.h>
13002 +#include <linux/config.h>
13003 +#include <linux/genhd.h>
13004 +#include <linux/blk.h>
13005 +#include <linux/evms/evms.h>
13006 +#include <linux/evms/evms_drivelink.h>
13007 +#include <asm/uaccess.h>
13009 +#define LOG_PREFIX "drivelink: "
13011 +/* prototypes for mandatory plugin interface functions */
13012 +static int drivelink_discover(struct evms_logical_node **);
13013 +static int drivelink_delete(struct evms_logical_node *);
13014 +static void drivelink_read(struct evms_logical_node *, struct buffer_head *);
13015 +static void drivelink_write(struct evms_logical_node *, struct buffer_head *);
13016 +static int drivelink_ioctl(struct evms_logical_node *,
13018 + struct file *, unsigned int, unsigned long);
13019 +static int drivelink_init_io(struct evms_logical_node *,
13020 + int, u64, u64, void *);
13022 +/* plugin function table definition */
13023 +static struct evms_plugin_fops fops = {
13024 + .discover = drivelink_discover,
13025 + .delete = drivelink_delete,
13026 + .read = drivelink_read,
13027 + .write = drivelink_write,
13028 + .init_io = drivelink_init_io,
13029 + .ioctl = drivelink_ioctl
13032 +/* plugin header definition */
13033 +static struct evms_plugin_header plugin_header = {
13034 + .id = SetPluginID(IBM_OEM_ID,
13036 + EVMS_DRIVELINK_FEATURE_ID),
13042 + .required_services_version = {
13050 +/********************************************************/
13051 +/* Required Plugin Function Table Entry Point: */
13052 +/* Discover function & Support routines */
13053 +/********************************************************/
13056 + * le_feature_data_to_cpu:
13057 + * @md: drivelink metadata
13059 + * convert feature data from on-disk (Little Endian) format
13060 + * to the native cpu endian format.
13063 +le_feature_data_to_cpu(struct evms_drivelink_metadata *md)
13067 + md->signature = le32_to_cpup(&md->signature);
13068 + md->crc = le32_to_cpup(&md->crc);
13069 + md->version.major = le32_to_cpup(&md->version.major);
13070 + md->version.minor = le32_to_cpup(&md->version.minor);
13071 + md->version.patchlevel = le32_to_cpup(&md->version.patchlevel);
13072 + md->flags = le32_to_cpup(&md->flags);
13073 + md->sequence_number = le64_to_cpup(&md->sequence_number);
13074 + md->child_serial_number = le64_to_cpup(&md->child_serial_number);
13075 + md->parent_serial_number = le64_to_cpup(&md->parent_serial_number);
13076 + md->child_count = le64_to_cpup(&md->child_count);
13077 + for (i = 0; i < EVMS_DRIVELINK_MAX_ENTRIES; i++) {
13078 + struct evms_dl_ordering_table_entry *child_entry;
13080 + child_entry = &md->ordering_table[i];
13081 + child_entry->child_serial_number =
13082 + le64_to_cpup(&child_entry->child_serial_number);
13083 + child_entry->child_vsize =
13084 + le64_to_cpup(&child_entry->child_vsize);
13089 + * load_feature_data: load a feature header from disk
13090 + * @node: storage object
13091 + * @md: ptr to drivelink metadata
13093 + * loads and verifies redundant copies of drivelink metadata. @md is modified
13094 + * and returned to the caller.
13096 + * Return value: 0 on success
13097 + * Otherwise error code
13100 +load_feature_data(struct evms_logical_node *node,
13101 + struct evms_drivelink_metadata **md)
13103 + int i, rc = 0, rc_array[2] = { 0, 0 }, size_in_bytes;
13104 + u64 real_metadata_size, feature_data_size;
13105 + u64 starting_sector;
13106 + struct evms_drivelink_metadata *cur_md, *md1, *md2 = NULL;
13107 + char *location_name;
13109 + /* verify the feature metadata size from the */
13110 + /* feature header agrees with the real size */
13111 + /* of the current metadata structure. */
13112 + real_metadata_size = evms_cs_size_in_vsectors(sizeof (**md));
13114 + /* allocate a buffer large enough to hold all */
13115 + /* sectors containing the feature's metadata */
13116 + size_in_bytes = real_metadata_size * EVMS_VSECTOR_SIZE;
13117 + md1 = kmalloc(size_in_bytes, GFP_KERNEL);
13119 + md2 = kmalloc(size_in_bytes, GFP_KERNEL);
13128 + for (i = 0; i < 2; i++) {
13130 + starting_sector =
13131 + node->feature_header->
13132 + feature_data1_start_lsn;
13133 + feature_data_size =
13134 + node->feature_header->feature_data1_size;
13136 + location_name = evms_primary_string;
13138 + starting_sector =
13139 + node->feature_header->
13140 + feature_data2_start_lsn;
13141 + feature_data_size =
13142 + node->feature_header->feature_data2_size;
13144 + location_name = evms_secondary_string;
13146 + /* check that real metadata size matches the */
13147 + /* feature data size */
13148 + if (real_metadata_size != feature_data_size) {
13150 + ("%s feature data size("PFU64" bytes) doesn't match expected size("PFU64" bytes).\n",
13152 + feature_data_size <<
13153 + EVMS_VSECTOR_SIZE_SHIFT,
13154 + real_metadata_size <<
13155 + EVMS_VSECTOR_SIZE_SHIFT);
13157 + rc_array[i] = rc;
13160 + /* load the node's feature data */
13161 + rc = INIT_IO(node,
13164 + feature_data_size, cur_md);
13167 + ("error(%d) probing for %s feature data at sector("PFU64") on '%s'.\n",
13168 + rc, location_name, starting_sector,
13170 + rc_array[i] = rc;
13173 + /* check for valid metadata signature */
13174 + if (le32_to_cpup(&cur_md->signature) !=
13175 + EVMS_DRIVELINK_SIGNATURE) {
13178 + ("error(%d) invalid signature in %s feature data on '%s'\n",
13179 + rc, location_name, node->name);
13180 + rc_array[i] = rc;
13183 + /* validate feature data CRC */
13184 + if (cur_md->crc != EVMS_MAGIC_CRC) {
13185 + int org_crc, final_crc;
13186 + org_crc = le32_to_cpup(&cur_md->crc);
13189 + evms_cs_calculate_crc(EVMS_INITIAL_CRC,
13191 + sizeof (*cur_md));
13192 + if (final_crc != org_crc) {
13194 + ("CRC mismatch error [stored(%x), computed(%x)] in %s feature data on '%s'.\n",
13195 + org_crc, final_crc, location_name,
13198 + rc_array[i] = rc;
13203 + ("CRC disabled in %s feature data on '%s'.\n",
13204 + location_name, node->name);
13206 + /* convert feature data from on-disk
13207 + * format (Little Endian) to native
13208 + * cpu endian format.
13210 + le_feature_data_to_cpu(cur_md);
13211 + /* check for valid structure version */
13212 + rc = evms_cs_check_version(&metadata_ver,
13213 + &cur_md->version);
13216 + ("error(%d) obsolete version detected: actual(%d,%d,%d), requires(%d,%d,%d) in %s feature data on '%s'\n",
13217 + rc, cur_md->version.major,
13218 + cur_md->version.minor,
13219 + cur_md->version.patchlevel,
13220 + DRIVELINK_METADATA_MAJOR,
13221 + DRIVELINK_METADATA_MINOR,
13222 + DRIVELINK_METADATA_PATCHLEVEL,
13223 + location_name, node->name);
13224 + rc_array[i] = rc;
13227 + /* getting same return code for both copies? */
13228 + if (rc_array[0] == rc_array[1]) {
13229 + rc = rc_array[0];
13230 + /* if no errors on both copies,
13231 + * check the sequence numbers.
13232 + * use the highest sequence number.
13235 + /* compare sequence numbers */
13236 + if (md1->sequence_number ==
13237 + md2->sequence_number) {
13241 + ("sequence number mismatches between front("PFU64") and rear("PFU64") feature data copies on node(%s)!\n",
13242 + md2->sequence_number,
13243 + md1->sequence_number, node->name);
13244 + if (md1->sequence_number >
13245 + md2->sequence_number)
13250 + ("using %s feature data copy!\n",
13252 + md1) ? evms_primary_string :
13253 + evms_secondary_string);
13256 + /* getting different return codes for each copy */
13257 + } else if (rc_array[0] == 0) {
13258 +		/* use 1st (rear) copy if it's good */
13261 + } else if (rc_array[1] == 0) {
13262 +		/* use 2nd (front) copy if it's good */
13265 + } else if ((rc_array[0] == -EINVAL) || (rc_array[1] == -EINVAL)) {
13266 + /* fail if either give a fatal error */
13271 + /* deallocate metadata buffers appropriately */
13272 + if (rc || (cur_md == md1))
13274 + if (rc || (cur_md == md2))
13277 + /* save validated feature header pointer */
13285 + * find_parent_node_for_child_node: finds or creates a parent node for this child node
13286 + * @child_node: input, child node
13287 + * @md: input, on-disk metadata
13288 + * @parent_node: output, parent node
13289 + * @dl_private: output, runtime metadata
13290 + * @discover_list: input/output, list of objects being discovered
13292 + * finds or creates a parent node for the specified child node. if the parent node is
13293 + * created, create and initialize the parent's private data area.
13295 + * Return value: 0 on success
13296 + * Otherwise error code.
13299 +find_parent_node_for_child_node(struct evms_logical_node *child_node,
13300 + struct evms_drivelink_metadata *md,
13301 + struct evms_logical_node **parent_node,
13302 + struct runtime_data **dl_private,
13303 + struct evms_logical_node **discover_list)
13305 + int rc = 0, parent_found = FALSE;
13306 + struct evms_logical_node *parent = NULL;
13307 + struct runtime_data *rd = NULL;
13309 + /* find the parent node for this child */
13310 + for (parent = *discover_list; parent; parent = parent->next) {
13311 + /* only parent nodes will have null feature headers */
13312 + if (!parent->feature_header) {
13313 + rd = (struct runtime_data *) parent->private;
13314 + if (rd->parent_sn == md->parent_serial_number) {
13315 + parent_found = TRUE;
13320 + /* if no parent node found, create it */
13321 + if (parent_found == FALSE) {
13322 + rc = evms_cs_allocate_logical_node(&parent);
13324 + /* transpose info from child to parent */
13325 + parent->flags |= child_node->flags;
13326 + strcpy(parent->name,
13327 + child_node->feature_header->object_name);
13328 + /* copy evms system data to parent */
13329 + parent->volume_info = child_node->volume_info;
13330 + /* initialize the plugin id field */
13331 + parent->plugin = &plugin_header;
13332 + /* allocate parent's instance data */
13333 + parent->private = kmalloc(sizeof(*rd), GFP_KERNEL);
13334 + if (!parent->private)
13338 + /* initialize some instance data fields */
13339 + rd = (struct runtime_data *) parent->private;
13340 + rd->block_size = 0;
13341 + rd->parent_sn = md->parent_serial_number;
13342 + rd->child_count = md->child_count;
13343 + /* allocate the child table */
13344 + rd->child_table = kmalloc(sizeof(struct runtime_entry) *
13345 + rd->child_count, GFP_KERNEL);
13346 + if (!rd->child_table)
13350 + memset(rd->child_table, 0,
13351 + sizeof(struct runtime_entry) * rd->child_count);
13352 + /* add the parent node to the discover list */
13353 + rc = evms_cs_add_logical_node_to_list(discover_list,
13355 + MOD_INC_USE_COUNT;
13357 + /* if any errors encountered, try to clean up */
13359 + LOG_SERIOUS("find_parent_node: rc(%d) from '%s'\n",
13360 + rc, child_node->name);
13369 + *dl_private = rd;
13370 + *parent_node = parent;
13376 + * compute_child_index: compute the index for a specific child node
13377 + * @node: the child node
13378 + * @md: the drivelink on-disk metadata
13380 + * compute and return the 0-based index value of this child node's position
13381 + * in the parent node's ordering table.
13383 + * Return value: -1 on error
13384 + * otherwise the index of the specified child.
13387 +compute_child_index(struct evms_logical_node *node,
13388 + struct evms_drivelink_metadata *md)
13390 + int i, position = -1;
13392 + for (i = 0; i < md->child_count; i++) {
13393 + if (md->ordering_table[i].child_serial_number ==
13394 + md->child_serial_number) {
13399 + if (position == -1) {
13400 + LOG_SERIOUS("%s: child not found from '%s'\n",
13401 + __FUNCTION__, node->name);
13403 + return (position);
13407 + * process_child_nodes: perform the discovery operation on each child node
13408 + * @discover_list: the list of potential child objects
13410 + * search the discovery list of drivelink child nodes. for each node found,
13411 + * perform the discovery operation on it.
13413 + * Return value: 0 on success
13414 + * otherwise error code
13417 +process_child_nodes(struct evms_logical_node **discover_list)
13419 + int rc = 0, index = -1;
13420 + struct evms_logical_node *node, *next_node, *parent;
13421 + struct evms_drivelink_metadata *md;
13422 + struct runtime_data *rd;
13423 + struct runtime_entry *child_entry = NULL;
13425 + for (node = *discover_list; node; node = next_node) {
13426 + next_node = node->next;
13427 + if ((!node->feature_header) ||
13428 + (node->feature_header->feature_id != plugin_header.id)) {
13432 + rc = evms_cs_remove_logical_node_from_list(discover_list, node);
13435 + /* we need to load the feature data to */
13436 + /* find the parent's serial number this */
13437 + /* child node belongs to. */
13439 + rc = load_feature_data(node, &md);
13441 + /* find the parent node for this child */
13443 + rc = find_parent_node_for_child_node(node, md,
13448 + /* determine position of child in drive link object */
13449 + index = compute_child_index(node, md);
13454 + /* check for multiple child index requests */
13456 + (struct runtime_entry *) &rd->child_table[index];
13457 + /* check to see if this child index is
13458 + * already in use.
13460 + if (child_entry->child_node) {
13462 + ("attempt to put '%s' in child index(%d). Already occupied by '%s'.\n",
13463 + node->name, index,
13464 + child_entry->child_node->name);
13469 + /* fill in child info in parent */
13471 + /* check the sector size for this node */
13472 + if (node->hardsector_size > parent->hardsector_size)
13473 + parent->hardsector_size = node->hardsector_size;
13474 + /* check the block size for this node */
13475 + if (node->block_size > parent->block_size)
13476 + parent->block_size = node->block_size;
13477 + /* set the child node */
13478 + child_entry->child_node = node;
13479 + /* set the metadata for this node */
13480 + child_entry->child_metadata = md;
13483 + /* on error, clean up accordingly */
13487 + LOG_SERIOUS("%s: rc(%d) from '%s'\n",
13488 + __FUNCTION__, rc, node->name);
13489 + LOG_SERIOUS("deleting child node '%s'.\n", node->name);
13490 + rc = DELETE(node);
13493 + ("error(%d) attempting to delete '%s'.\n",
13499 + /* errors are handled internal to this function */
13500 + /* by deleting the failed node. This will get */
13501 + /* picked up by finalize_parent_nodes as a */
13502 + /* missing child node */
13506 +#define TEST_CHILD_PRESENCE 0
13507 +#define TEST_CHILD_COUNT 1
13508 +#define TEST_CHILD_PARENTS_SERIAL_NUM 2
13509 +#define TEST_CHILD_POSITION 3
13510 +#define TEST_CHILD_METADATA 4
13513 + * test_parent_node: verify that a parent is complete
13514 + * @node: specified parent node
13516 + * verify that the parent node has all of its child nodes accounted for.
13518 + * Return value: 0 on success
13519 + * otherwise error code
13522 +test_parent_node(struct evms_logical_node *node)
13525 + struct runtime_data *rd;
13526 + struct runtime_entry *child_entry;
13528 + rd = (struct runtime_data *) node->private;
13529 + for (i = 0; i < rd->child_count; i++) {
13530 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13532 + /* insure each child entry is filled */
13533 + if (!child_entry->child_node) {
13535 + EVMS_VOLUME_SET_READ_ONLY | EVMS_VOLUME_PARTIAL;
13536 + LOG_ERROR("%s: missing child(%d).\n", __FUNCTION__, i);
13538 + /* insure child count is the same */
13539 + /* in each child's metadata */
13540 + if (child_entry->child_metadata->child_count != rd->child_count) {
13541 + rc = -EVMS_FEATURE_FATAL_ERROR;
13542 + LOG_ERROR("%s: child count wrong for node '%s'\n",
13543 + __FUNCTION__, node->name);
13545 + /* insure parent serial number is */
13546 + /* the same in each child's metadata */
13547 + if (child_entry->child_metadata->parent_serial_number !=
13549 + rc = -EVMS_FEATURE_FATAL_ERROR;
13551 + ("%s: incorrect [is("PFU64"), should be("PFU64")] child serial number for node '%s'\n",
13553 + child_entry->child_metadata->parent_serial_number,
13554 + rd->parent_sn, node->name);
13556 + /* insure each is in the correct entry */
13557 + if (child_entry->child_metadata->ordering_table[i].
13558 + child_serial_number !=
13559 + child_entry->child_metadata->child_serial_number) {
13560 + rc = -EVMS_FEATURE_FATAL_ERROR;
13562 + ("%s: child reports different index for node '%s'\n",
13563 + __FUNCTION__, node->name);
13565 + struct runtime_entry *other_child_entry;
13567 + /* compare the children's metadata */
13569 + /* look for another present child to
13570 + * compare against.
13572 + other_child_entry = NULL;
13573 + for (j = 0; j < rd->child_count; j++) {
13574 + /* skip comparing to ourselves */
13578 + /* is this child present? */
13579 + if (rd->child_table[j].child_node) {
13580 + /* yes, use it */
13581 + other_child_entry = &rd->child_table[j];
13585 + /* if we can't find another valid
13586 + * child node's metadata to compare
13587 + * against, just skip this test.
13589 + if (!other_child_entry) {
13593 + memcmp(other_child_entry->child_metadata->
13595 + child_entry->child_metadata->ordering_table,
13596 + sizeof (child_entry->child_metadata->
13597 + ordering_table));
13599 + rc = -EVMS_FEATURE_FATAL_ERROR;
13601 + ("%s: mismatching child metadata for nodes '%s' and '%s'\n",
13603 + rd->child_table[i - 1].child_node->name,
13604 + child_entry->child_node->name);
13607 + /* stop if fatal error encountered */
13608 + if (rc == -EVMS_FEATURE_FATAL_ERROR) {
13616 + * perform_final_adjustments: do final tweaks to parent node
13617 + * @node: parent node
13619 + * This function does the following:
13620 + * sets the vsize (in vsectors) field in each child node
13621 + * sets the voffset (in vsectors) field in each child node
13622 + * frees each child node's metadata
13623 + * sets the parent's total size field
13626 +perform_final_adjustments(struct evms_logical_node *node)
13629 + struct runtime_data *rd;
13630 + struct runtime_entry *child_entry = NULL;
13631 + struct evms_drivelink_metadata *ref_data = NULL;
13633 + rd = (struct runtime_data *) node->private;
13634 + /* find a valid copy of the ordering table.
13635 + * since all the ordering tables are the same
13636 + * we can just pick one to use for all the
13637 + * child computations.
13639 + for (i = 0; i < rd->child_count; i++) {
13640 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13641 + if (child_entry->child_node) {
13642 + ref_data = child_entry->child_metadata;
13646 + /* if we got this far, there should
13647 + * always be at least one valid child.
13651 + /* compute the parent's usable size,
13652 + * and construct the table used to
13653 + * remap parent I/Os to child I/Os */
13654 + for (i = 0; i < rd->child_count; i++) {
13655 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13656 + /* set the LBA count for this child node */
13657 + child_entry->vsize = ref_data->ordering_table[i].child_vsize;
13658 + /* set the start LBA value for this child node */
13659 + child_entry->voffset = node->total_vsectors;
13660 + /* keep a running total of size in sectors */
13661 + node->total_vsectors += child_entry->vsize;
13662 + /* free the metadata for this child node */
13663 + if (ref_data != child_entry->child_metadata) {
13664 + kfree(child_entry->child_metadata);
13666 + child_entry->child_metadata = NULL;
13667 + /* free the feature header for this child node */
13668 + if (child_entry->child_node) {
13669 + kfree(child_entry->child_node->feature_header);
13670 + child_entry->child_node->feature_header = NULL;
13673 + /* free the reference data */
13678 + * finalize_parent_nodes: verify and prepare parent nodes
13679 + * @discover_list: list of potential drivelink parent objects
13681 + * verify the completeness of each parent node. if not complete, purge the in-memory
13682 + * structs for this object and all its children. If complete, perform final tweaks
13683 + * to allow this node to be usable.
13685 + * Return value: 0 on success
13686 + * otherwise error code
13689 +finalize_parent_nodes(struct evms_logical_node **discover_list)
13692 + struct evms_logical_node *node, *next_node;
13694 + for (node = *discover_list; node; node = next_node) {
13695 + next_node = node->next;
13696 + /* only check parent nodes */
13697 + if (!node->feature_header) {
13698 + /* validate the children of this parent */
13699 + rc = test_parent_node(node);
13701 + /* compute parent size and
13702 + * child remap table.
13704 + perform_final_adjustments(node);
13706 + /* fatal error encountered.
13707 + * cleanup from this node and
13708 + * delete it from memory.
13710 + evms_cs_remove_logical_node_from_list
13711 + (discover_list, node);
13712 + rc2 = DELETE(node);
13715 + ("error(%d) attempting to delete '%s'.\n",
13716 + rc2, node->name);
13725 + * drivelink_discover: discover drivelinked storage objects
13726 + * @discover_list: the list of objects to inspect
13728 + * perform the drivelink discover process on the objects in the discovery list
13730 + * Return value: 0 on success
13731 + * otherwise error code
13734 +drivelink_discover(struct evms_logical_node **discover_list)
13738 + MOD_INC_USE_COUNT;
13739 + rc = process_child_nodes(discover_list);
13741 + rc = finalize_parent_nodes(discover_list);
13743 + MOD_DEC_USE_COUNT;
13747 +/********************************************************/
13748 +/* Required Plugin Function Table Entry Point: */
13749 +/* Delete function */
13750 +/********************************************************/
13753 + * drivelink_delete: purges a drivelink object and its children from memory
13754 + * @node: the drivelink object to delete
13756 + * purge the drivelink object, its private data, and all its children from memory.
13758 + * Return value: 0 on success
13759 + * otherwise error code
13762 +drivelink_delete(struct evms_logical_node *node)
13765 + struct runtime_data *rd;
13766 + struct runtime_entry *child_entry;
13768 + LOG_DETAILS("deleting '%s'.\n", node->name);
13770 + rd = (struct runtime_data *) node->private;
13772 + for (i = 0; i < rd->child_count; i++) {
13773 + child_entry = &rd->child_table[i];
13774 + /* delete the child node */
13775 + if (child_entry->child_node) {
13776 + rc = DELETE(child_entry->child_node);
13779 + child_entry->child_node = NULL;
13781 + /* delete the child's metadata */
13782 + if (child_entry->child_metadata) {
13783 + kfree(child_entry->child_metadata);
13784 + child_entry->child_metadata = NULL;
13788 + /* delete the child table */
13789 + if (rd->child_table) {
13790 + kfree(rd->child_table);
13791 + rd->child_table = NULL;
13793 + /* delete the instance data */
13795 + node->private = NULL;
13799 + evms_cs_deallocate_logical_node(node);
13800 + MOD_DEC_USE_COUNT;
13807 + * which_child: find the child node targeted by an IO to this drivelink object
13808 + * @parent: parent drivelink object
13809 + * @rsector: relative sector on the parent object
13810 + * @max_io_sects: largest IO size on the child, starting from rsector position
13812 + * This function finds the child node a parent rsector maps to.
13813 + * It then adjusts the rsector value to be child relative and
13814 + * optionally computes the max # of sectors that can be accessed
13815 + * from this starting point on the child.
13818 + * The child node, the child relative rsector and max io size are
13819 + * returned to the caller. On error, the returned child node will
13822 +static struct evms_logical_node *
13823 +which_child(struct evms_logical_node *parent,
13824 + u64 * rsector, u64 * max_io_sects)
13827 + struct evms_logical_node *child = NULL;
13828 + struct runtime_data *rd;
13829 + struct runtime_entry *child_entry = NULL;
13831 + rd = (struct runtime_data *) parent->private;
13832 + for (i = 0; i < rd->child_count; i++) {
13833 + child_entry = (struct runtime_entry *) &rd->child_table[i];
13835 + if (*rsector >= child_entry->vsize) {
13836 + *rsector -= child_entry->vsize;
13838 + /* get the child node */
13839 + child = child_entry->child_node;
13840 + /* compute the sector count if requested */
13841 + if (max_io_sects)
13842 + /* this is only used for INIT I/O
13843 + * to return the largest sector
13844 + * count size for this child based
13845 + * on first sector in the I/O.
13847 + *max_io_sects = child_entry->vsize - *rsector;
13855 + * drivelink_io_error: log an IO error for drivelink
13856 + * @node: drivelink object
13857 + * @bh: buffer head targeting this object
13859 + * this function was primarily created because the function
13860 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
13861 + * to be set on inline functions. Since this was an error path
13862 + * and not mainline, I decided to add a trace statement to help
13863 + * report on the failing condition.
13866 +drivelink_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
13868 + LOG_SERIOUS("%s error on '%s' remapping rsector("PFU64").\n",
13869 + (io_flag) ? "WRITE" : "READ",
13870 + node->name, (u64) bh->b_rsector);
13872 + bh->b_end_io(bh, 0);
13875 +/********************************************************/
13876 +/* Required Plugin Function Table Entry Point: */
13877 +/* Read function & Support routines */
13878 +/********************************************************/
13881 + * drivelink_read: handles IO read operations to drivelink objects
13882 + * @node: drivelink object
13883 + * @bh: buffer head targeting this object
13885 + * handles IO read operations to the drivelink objects. internally remaps the
13886 + * drivelink relative requests to the child relative requests and then routes
13887 + * it to the child for further processing.
13890 +drivelink_read(struct evms_logical_node *node, struct buffer_head *bh)
13892 + struct evms_logical_node *child;
13893 + u64 io_size, rsector;
13895 + rsector = bh->b_rsector;
13896 + child = which_child(node, &rsector, &io_size);
13897 + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13898 + bh->b_rsector = rsector;
13901 + drivelink_io_error(node, READ, bh);
13905 +/********************************************************/
13906 +/* Required Plugin Function Table Entry Point: */
13907 +/* Write function & Support routines */
13908 +/********************************************************/
13911 + * drivelink_write: handles IO write operations to drivelink objects
13912 + * @node: drivelink object
13913 + * @bh: buffer head targeting this object
13915 + * handles IO write operations to the drivelink objects. internally remaps the
13916 + * drivelink relative requests to the child relative requests and then routes
13917 + * it to the child for further processing.
13920 +drivelink_write(struct evms_logical_node *node, struct buffer_head *bh)
13922 + struct evms_logical_node *child;
13923 + u64 io_size, rsector;
13925 + rsector = bh->b_rsector;
13926 + child = which_child(node, &rsector, &io_size);
13927 + if (child && ((bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <= io_size)) {
13928 + bh->b_rsector = rsector;
13931 + drivelink_io_error(node, WRITE, bh);
13935 +/********************************************************/
13936 +/* Required Plugin Function Table Entry Point: */
13937 +/* Init I/O function */
13938 +/********************************************************/
13941 + * drivelink_init_io: performs synchronous IO to drivelink objects
13942 + * @node: drivelink object
13943 + * @io_flag: read/write flag
13944 + * @sect_nr: starting sector, object relative (512 byte units)
13945 + * @num_sects: count of sectors
13946 + * @buf_addr: buffer address to read from/write to
13948 + * This function must determine which child or children a
13949 + * specified I/O request must be passed to. Also if, when,
13950 + * and how a request must be broken up.
13952 + * Return value: 0 on success
13953 + * otherwise error code
13956 +drivelink_init_io(struct evms_logical_node *node, int io_flag,
13966 + u64 starting_sector, remaining_sectors;
13968 + struct runtime_data *rd;
13970 + if ((sect_nr + num_sects) > node->total_vsectors) {
13972 + ("attempted out of bound("PFU64") %s on '%s' at sector("PFU64"), count("PFU64").\n",
13973 + node->total_vsectors, (io_flag) ? "WRITE" : "READ",
13974 + node->name, sect_nr, num_sects);
13977 + rd = (struct runtime_data *) node->private;
13978 + /* make working copies of input parameters */
13979 + starting_sector = sect_nr;
13980 + remaining_sectors = num_sects;
13981 + io_buf = buf_addr;
13982 + /* loop until all I/O is performed */
13983 + while (remaining_sectors) {
13984 + u64 io_start, io_size;
13985 + struct evms_logical_node *child;
13987 + /* compute the child relative io_start
13988 + * and max io_size.
13990 + io_start = starting_sector;
13991 + child = which_child(node, &io_start, &io_size);
13992 + /* adjust io_size based on
13993 + * original remaining sectors
13996 + if (io_size > remaining_sectors)
13997 + io_size = remaining_sectors;
13999 + rc = INIT_IO(child,
14001 + io_start, io_size, io_buf);
14003 + /* if partial volume, return 0's
14004 + * for missing children.
14006 + if (io_flag == READ) {
14007 + memset(io_buf, 0,
14009 + EVMS_VSECTOR_SIZE_SHIFT);
14013 + /* adjust working copies */
14014 + starting_sector += io_size;
14015 + remaining_sectors -= io_size;
14016 + io_buf += io_size <<
14017 + EVMS_VSECTOR_SIZE_SHIFT;
14027 +/********************************************************/
14028 +/* Required Plugin Function Table Entry Point: */
14029 +/* IOCTL function & Support routines */
14030 +/********************************************************/
14033 + * drivelink_ioctl_cmd_plugin_ioctl: drivelink support for the 'plugin ioctl' command
14034 + * @node: drivelink object
14035 + * @inode: VFS supplied parameter
14036 + * @file: VFS supplied parameter
14037 + * @cmd: the specific ioctl command
14038 + * @arg: the specific ioctl arguments
14040 + * this function handles 'plugin ioctl' commands. currently there is no specific
14041 + * commands for this plugin. however, this plugin must broadcast some commands so
14042 + * lower layers can receive them.
14044 + * Return value: 0 on success
14045 + * otherwise error code
14048 +drivelink_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
14049 + struct inode *inode, struct file *file,
14050 + unsigned long cmd, unsigned long arg)
14053 + struct runtime_data *rd;
14054 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
14056 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
14057 + /* copy user's parameters to kernel space */
14058 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
14062 + rd = (struct runtime_data *) node->private;
14063 + /* is this cmd targeted at this feature ? */
14064 + if (tmp.feature_id == node->plugin->id) {
14065 + switch (tmp.feature_command) {
14069 + } else { /* broadcast this cmd to all children */
14070 + for (i = 0; i < rd->child_count; i++) {
14071 + struct evms_logical_node *child_node;
14073 + child_node = rd->child_table[i].child_node;
14074 + if (child_node) {
14075 + rc = IOCTL(child_node, inode, file,
14082 + /* copy info to userspace */
14083 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
14090 + * drivelink_ioctl_cmd_broadcast: broadcast ioctls to your children
14091 + * @node: drivelink object
14092 + * @inode: VFS supplied parameter
14093 + * @file: VFS supplied parameter
14094 + * @cmd: the specific ioctl command
14095 + * @arg: the specific ioctl arguments
14097 + * broadcast the specified ioctl command and arguments to all this objects
14098 + * children. OR (logical operation) the return values from all the children
14099 + * and return the OR'd value to the caller.
14101 + * Return value: 0 on success
14102 + * otherwise error code
14105 +drivelink_ioctl_cmd_broadcast(struct evms_logical_node *node,
14106 + struct inode *inode, struct file *file,
14107 + unsigned long cmd, unsigned long arg)
14110 + struct runtime_data *rd;
14112 + rd = (struct runtime_data *) node->private;
14113 + /* broadcast this cmd to all children */
14114 + for (i = 0; i < rd->child_count; i++) {
14115 + struct evms_logical_node *child_node;
14117 + child_node = rd->child_table[i].child_node;
14118 + if (child_node) {
14119 + rc |= IOCTL(child_node, inode, file, cmd, arg);
14126 + * drivelink_ioctl: main ioctl entry point and handler
14127 + * @node: drivelink object
14128 + * @inode: VFS supplied parameter
14129 + * @file: VFS supplied parameter
14130 + * @cmd: a specific ioctl command
14131 + * @arg: a specific ioctl argument
14133 + * handles specific ioctl command internally and routes other ioctls commands to
14134 + * the appropriate entry points.
14136 + * Returns: 0 on success
14137 + * otherwise error code
14140 +drivelink_ioctl(struct evms_logical_node *node,
14141 + struct inode *inode,
14142 + struct file *file, unsigned int cmd, unsigned long arg)
14145 + struct runtime_data *rd = NULL;
14146 + struct hd_geometry hdgeo;
14148 + if ((!node) || (!inode))
14152 + rd = (struct runtime_data *) node->private;
14154 + case HDIO_GETGEO:
14155 + hdgeo.heads = 255;
14156 + hdgeo.sectors = 63;
14157 + hdgeo.cylinders =
14158 + ((unsigned int) node->total_vsectors) /
14159 + hdgeo.heads / hdgeo.sectors;
14161 + if (copy_to_user((int *) arg, &hdgeo, sizeof (hdgeo)))
14164 + case EVMS_QUIESCE_VOLUME:
14165 + case EVMS_GET_DISK_LIST:
14166 + case EVMS_CHECK_MEDIA_CHANGE:
14167 + case EVMS_REVALIDATE_DISK:
14168 + case EVMS_OPEN_VOLUME:
14169 + case EVMS_CLOSE_VOLUME:
14170 + case EVMS_CHECK_DEVICE_STATUS:
14171 + rc = drivelink_ioctl_cmd_broadcast(node, inode, file,
14174 + case EVMS_PLUGIN_IOCTL:
14175 + rc = drivelink_ioctl_cmd_plugin_ioctl(node, inode, file,
14178 + case EVMS_GET_BMAP:
14180 + struct evms_get_bmap_pkt *bmap;
14181 + u64 io_start, io_size;
14182 + struct evms_logical_node *child;
14184 + bmap = (struct evms_get_bmap_pkt *) arg;
14185 + io_start = bmap->rsector;
14186 + child = which_child(node, &io_start, &io_size);
14188 + if (node->block_size !=
14189 + child->block_size) {
14190 + bmap->status = -EPERM;
14192 + bmap->rsector = io_start;
14193 + rc = IOCTL(child,
14208 +/********************************************************/
14209 +/* Required Module Entry Point: */
14210 +/* drivelink_init */
14211 +/********************************************************/
14214 + * drivelink_init: register this module for use within the EVMS framework
14216 + * Return value: 0 on success
14217 + * otherwise error code.
14220 +drivelink_init(void)
14222 + return evms_cs_register_plugin(&plugin_header);
14226 + * drivelink_exit: unregister this module from use within the EVMS framework
14228 + * Return value: 0 on success
14229 + * otherwise error code.
14232 +drivelink_exit(void)
14234 + evms_cs_unregister_plugin(&plugin_header);
14237 +module_init(drivelink_init);
14238 +module_exit(drivelink_exit);
14239 +#ifdef MODULE_LICENSE
14240 +MODULE_LICENSE("GPL");
14242 diff -Naur linux-2002-09-30/drivers/evms/evms_ecr.c evms-2002-09-30/drivers/evms/evms_ecr.c
14243 --- linux-2002-09-30/drivers/evms/evms_ecr.c Wed Dec 31 18:00:00 1969
14244 +++ evms-2002-09-30/drivers/evms/evms_ecr.c Fri Aug 16 16:19:56 2002
14246 +/* -*- linux-c -*- */
14249 + * Copyright (c) International Business Machines Corp., 2000
14251 + * This program is free software; you can redistribute it and/or modify
14252 + * it under the terms of the GNU General Public License as published by
14253 + * the Free Software Foundation; either version 2 of the License, or
14254 + * (at your option) any later version.
14256 + * This program is distributed in the hope that it will be useful,
14257 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14258 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14259 + * the GNU General Public License for more details.
14261 + * You should have received a copy of the GNU General Public License
14262 + * along with this program; if not, write to the Free Software
14263 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14266 +/* linux/driver/evms/evms_ecr.c
14268 + * EVMS - Cluster enablement (ECR) module
14273 +#include <linux/kernel.h>
14274 +#include <linux/module.h>
14275 +#include <linux/init.h>
14276 +#include <linux/types.h>
14277 +#include <linux/evms/evms.h>
14278 +#include <linux/evms/evms_ecr.h>
14280 +#define LOG_PREFIX "ecr: "
14286 +ecr_group_t ecr_group_join(char *group_name, ecr_table_t *f_table,
14287 + ecr_cred_t * cred, size_t size, ecr_instance_t *instance)
14297 + * ecr_group_leave
14299 +void ecr_group_leave(ecr_group_t group)
14310 +int ecr_group_send(ecr_group_t group, ecr_nodeid_t node, void *message,
14311 + size_t size, ecr_instance_t *instance,
14312 + void callback(int ret, ecr_instance_t *instance))
14321 + * ecr_group_send_wait
14323 +int ecr_group_send_wait(ecr_group_t group, ecr_nodeid_t node, void *message,
14324 + size_t size, int *ret)
14334 + * ecr_group_broadcast
14336 +int ecr_group_broadcast(ecr_group_t group, void *message, size_t size,
14337 + ecr_instance_t *instance,
14338 + void callback(u_char ret, ecr_instance_t *instance))
14347 + * ecr_group_broadcast_wait
14349 +int ecr_group_broadcast_wait(ecr_group_t group, void *message, size_t size,
14360 + * ecr_group_atomic_execute
14362 +int ecr_group_atomic_execute(ecr_group_t group, void *message, size_t size,
14363 + ecr_instance_t *instance,
14364 + void callback(ecr_instance_t *instance))
14373 + * ecr_group_atomic_execute_wait
14375 +int ecr_group_atomic_execute_wait(ecr_group_t group, void *message, size_t size)
14384 + * ecr_group_success_response
14386 +void ecr_group_success_response(ecr_message_t *handle)
14396 + * ecr_group_failure_response
14398 +void ecr_group_failure_response(ecr_message_t *handle, int ret)
14407 + * ecr_lock_create
14409 +ecr_lock_t ecr_lock_create(char *lockname)
14418 +int ecr_lock(ecr_lock_t lock, u64 start, u64 length,
14419 + ecr_lock_mode_t mode, u_char flag)
14430 +int ecr_unlock(ecr_lock_t lock, u64 start, u64 length)
14437 +/********************************************************/
14438 +/* Required Module Entry Point: */
14440 +/********************************************************/
14442 +static int __init ecr_init(void)
14448 +static void __exit ecr_exit(void)
14453 +module_init(ecr_init);
14454 +module_exit(ecr_exit);
14455 +#ifdef MODULE_LICENSE
14456 +MODULE_LICENSE("GPL");
14459 diff -Naur linux-2002-09-30/drivers/evms/evms_passthru.c evms-2002-09-30/drivers/evms/evms_passthru.c
14460 --- linux-2002-09-30/drivers/evms/evms_passthru.c Wed Dec 31 18:00:00 1969
14461 +++ evms-2002-09-30/drivers/evms/evms_passthru.c Fri Sep 13 16:09:55 2002
14463 +/* -*- linux-c -*- */
14468 + * Copyright (c) International Business Machines Corp., 2000
14470 + * This program is free software; you can redistribute it and/or modify
14471 + * it under the terms of the GNU General Public License as published by
14472 + * the Free Software Foundation; either version 2 of the License, or
14473 + * (at your option) any later version.
14475 + * This program is distributed in the hope that it will be useful,
14476 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14477 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14478 + * the GNU General Public License for more details.
14480 + * You should have received a copy of the GNU General Public License
14481 + * along with this program; if not, write to the Free Software
14482 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14487 + * linux/drivers/evms/evms_passthru.c
14489 + * EVMS System Data Manager
14494 +#include <linux/module.h>
14495 +#include <linux/kernel.h>
14496 +#include <linux/config.h>
14497 +#include <linux/genhd.h>
14498 +#include <linux/string.h>
14499 +#include <linux/blk.h>
14500 +#include <linux/init.h>
14501 +#include <linux/slab.h>
14502 +#include <linux/evms/evms.h>
14503 +#include <asm/system.h>
14505 +#define EVMS_PASSTHRU_ID 0
14506 +#define LOG_PREFIX "passthru: "
14508 +static int passthru_mgr_discover(struct evms_logical_node **);
14509 +static int passthru_mgr_delete(struct evms_logical_node *);
14510 +static void passthru_mgr_read(struct evms_logical_node *, struct buffer_head *);
14511 +static void passthru_mgr_write(struct evms_logical_node *, struct buffer_head *);
14512 +static int passthru_mgr_ioctl(struct evms_logical_node *,
14514 + struct file *, unsigned int, unsigned long);
14515 +static int passthru_mgr_init_io(struct evms_logical_node *,
14516 + int, u64, u64, void *);
14518 +static struct evms_plugin_fops fops = {
14519 + .discover = passthru_mgr_discover,
14520 + .delete = passthru_mgr_delete,
14521 + .read = passthru_mgr_read,
14522 + .write = passthru_mgr_write,
14523 + .init_io = passthru_mgr_init_io,
14524 + .ioctl = passthru_mgr_ioctl
14527 +static struct evms_plugin_header plugin_header = {
14528 + .id = SetPluginID(IBM_OEM_ID,
14530 + EVMS_PASSTHRU_ID),
14536 + .required_services_version = {
14544 +/*******************************/
14545 +/* discovery support functions */
14546 +/*******************************/
14549 +process_passthru_data(struct evms_logical_node **pp)
14551 + int rc, size_in_sectors;
14552 + struct evms_logical_node *node, *new_node;
14556 + size_in_sectors =
14557 + evms_cs_size_in_vsectors(sizeof (struct evms_feature_header));
14559 + /* allocate "parent" node */
14560 + rc = evms_cs_allocate_logical_node(&new_node);
14562 + /* initialize "parent" node */
14563 + new_node->private = node;
14564 + new_node->flags = node->flags;
14565 + new_node->plugin = &plugin_header;
14566 + new_node->system_id = node->system_id;
14567 + new_node->block_size = node->block_size;
14568 + new_node->hardsector_size = node->hardsector_size;
14569 + new_node->total_vsectors = node->total_vsectors;
14570 + new_node->total_vsectors -=
14571 + (size_in_sectors << 1) +
14572 + node->feature_header->alignment_padding;
14573 + new_node->volume_info = node->volume_info;
14574 + strcpy(new_node->name, node->name);
14575 + if (strlen(node->feature_header->object_name))
14576 + strcat(new_node->name,
14577 + node->feature_header->object_name);
14579 + strcat(new_node->name, "_Passthru");
14581 + /* return "parent" node to caller */
14584 + MOD_INC_USE_COUNT;
14586 + LOG_DETAILS("feature header found on '%s', created '%s'.\n",
14587 + node->name, new_node->name);
14588 + /* we're done with the passthru feature headers
14589 + * so lets delete them now.
14591 + kfree(node->feature_header);
14592 + node->feature_header = NULL;
14594 + /* on any fatal error, delete the node */
14595 + int rc2 = DELETE(node);
14598 + ("error(%d) attempting to delete node(%p,%s).\n",
14599 + rc2, node, node->name);
14605 +/********** Required Plugin Functions **********/
14608 + * Function: passthru_mgr_discover
14612 +passthru_mgr_discover(struct evms_logical_node **discover_list)
14615 + struct evms_logical_node *node, *tmp_list_head;
14617 + MOD_INC_USE_COUNT;
14618 + tmp_list_head = *discover_list;
14619 + *discover_list = NULL;
14621 + while (tmp_list_head) {
14622 + node = tmp_list_head;
14623 + rc = evms_cs_remove_logical_node_from_list(&tmp_list_head,
14626 + rc = process_passthru_data(&node);
14629 + rc = evms_cs_add_logical_node_to_list
14630 + (discover_list, node);
14632 + MOD_DEC_USE_COUNT;
14637 + * Function: passthru_mgr_delete
14641 +passthru_mgr_delete(struct evms_logical_node *node)
14644 + struct evms_logical_node *p;
14646 + LOG_DETAILS("deleting '%s'.\n", node->name);
14648 + p = node->private;
14651 + evms_cs_deallocate_logical_node(node);
14652 + MOD_DEC_USE_COUNT;
14658 + * function: passthru_io_error
14660 + * this function was primarily created because the function
14661 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
14662 + * to be set on inline functions. Since this was an error path
14663 + * and not mainline, I decided to add a trace statement to help
14664 + * report on the failing condition.
14668 +passthru_io_error(struct evms_logical_node *node, int io_flag, struct buffer_head *bh)
14671 + ("attempt to %s beyond boundary("PFU64") on (%s), rsector("PFU64").\n",
14672 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1,
14673 + node->name, (u64) bh->b_rsector);
14675 + bh->b_end_io(bh, 0);
14679 + * Function: passthru_mgr_read
14682 +passthru_mgr_read(struct evms_logical_node *node, struct buffer_head *bh)
14684 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14685 + node->total_vsectors) {
14686 + R_IO(((struct evms_logical_node *) (node->private)), bh);
14688 + passthru_io_error(node, READ, bh);
14692 + * Function: passthru_mgr_write
14696 +passthru_mgr_write(struct evms_logical_node *node, struct buffer_head *bh)
14698 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
14699 + node->total_vsectors) {
14700 + W_IO(((struct evms_logical_node *) (node->private)), bh);
14702 + passthru_io_error(node, WRITE, bh);
14706 + * Function: passthru_mgr_ioctl
14710 +passthru_mgr_ioctl(struct evms_logical_node *node,
14711 + struct inode *inode,
14712 + struct file *file, unsigned int cmd, unsigned long arg)
14716 + if ((!node) || (!inode))
14719 + rc = IOCTL(((struct evms_logical_node *) (node->private)),
14720 + inode, file, cmd, arg);
14725 +passthru_mgr_init_io(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
14726 + u64 sect_nr, /* disk LBA */
14727 + u64 num_sects, /* # of sectors */
14729 +{ /* buffer address */
14731 + if ((sect_nr + num_sects) <= node->total_vsectors) {
14732 + rc = INIT_IO(((struct evms_logical_node *) (node->
14734 + io_flag, sect_nr, num_sects, buf_addr);
14741 + * Function: passthru_init
14745 +evms_passthru_manager_init(void)
14747 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
14751 +evms_passthru_manager_exit(void)
14753 + evms_cs_unregister_plugin(&plugin_header);
14756 +module_init(evms_passthru_manager_init);
14757 +module_exit(evms_passthru_manager_exit);
14758 +#ifdef MODULE_LICENSE
14759 +MODULE_LICENSE("GPL");
14761 diff -Naur linux-2002-09-30/drivers/evms/gpt_part.c evms-2002-09-30/drivers/evms/gpt_part.c
14762 --- linux-2002-09-30/drivers/evms/gpt_part.c Wed Dec 31 18:00:00 1969
14763 +++ evms-2002-09-30/drivers/evms/gpt_part.c Fri Sep 13 16:09:55 2002
14765 +/* -*- linux-c -*- */
14769 + * Copyright (c) International Business Machines Corp., 2000
14771 + * This program is free software; you can redistribute it and/or modify
14772 + * it under the terms of the GNU General Public License as published by
14773 + * the Free Software Foundation; either version 2 of the License, or
14774 + * (at your option) any later version.
14776 + * This program is distributed in the hope that it will be useful,
14777 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
14778 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14779 + * the GNU General Public License for more details.
14781 + * You should have received a copy of the GNU General Public License
14782 + * along with this program; if not, write to the Free Software
14783 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
14788 +/* linux/driver/evms/gpt_part.c
14790 + * EVMS - EFI GPT segment manager plugin
14792 + * This plugin provides support for the GUID Partition Table format specified
14793 + * by the Extensible Firmware Interface documentation ... version 1.02
14796 +#include <linux/config.h>
14797 +#include <linux/module.h>
14798 +#include <linux/kernel.h>
14799 +#include <linux/config.h>
14800 +#include <linux/string.h>
14801 +#include <linux/blk.h>
14802 +#include <asm/uaccess.h>
14803 +#include <linux/evms/evms.h>
14805 +/* prefix used in logging messages */
14806 +#define LOG_PREFIX "gpt_part: "
14809 + * struct gpt_private - Private data structure for this plugin
14810 + * @source_object: object this IO will get remapped to
14811 + * @start_sect: source object relative starting address in 512 byte units
14812 + * @nr_sect: partition size in 512 bytes units
14813 + * @type: partition type or filesystem format indicator
14815 + * private copy of the just the fields we require to remap IO requests
14816 + * to the underlying object.
14818 +struct gpt_private {
14819 + struct evms_logical_node *source_disk;
14822 + unsigned char type;
14825 +#define GPT_DISKMAGIC 0x5452415020494645 // "EFI PART"
14826 +#define GPT_PNAME_SIZE 36 // max unicode partition name size
14829 + * struct guid - GUID structure
14830 + * @time_low: timestamp - low order 32 bits
14831 + * @time_mid: timestamp - mid 16 bits
14832 + * @time_high: timestamp - high 16 bits
14833 + * @clock_seq_high: clock - high order 8 bits
14834 + * @clock_seq_low: clock - low order 8 bits
14835 + * @node: spatial reference - unique id (ie. mac address of nic)
14843 + u8 clock_seq_high;
14844 + u8 clock_seq_low;
14849 + * struct gpt_partition - GPT partition record definition
14850 + * @type: partition type
14851 + * @part_id: partition record id
14852 + * @start: address of 1st block of partition
14853 + * @end: address of last block of partition
14854 + * @attributes: bit field reserved by EFI spec
14855 + * @name: unicode name of partition
14857 + * GPT partition record definition
14859 +struct gpt_partition {
14860 + struct guid type;
14861 + struct guid part_id;
14865 + u16 name[GPT_PNAME_SIZE];
14869 + * struct gpt_header - GPT header
14870 + * @signature: EFI compatible header signature
14871 + * @version: spec revision number
14872 + * @size: size (bytes) of gpt header
14873 + * @crc: crc of gpt header
14874 + * @reserve: reserved by spec ... must be zero
14875 + * @my_lba: lba of gpt header
14876 + * @alternate_lba: lba of 2nd copy of gpt header
14877 + * @start_useable: lba of 1st block of useable area on disk
14878 + * @end_useable: lba of last block of useable area on disk
14879 + * @disk_id: GUID - identifies this disk
14880 + * @ptable_lba: lba of partition table
14881 + * @ptable_count: number of entries in the partition table
14882 + * @ptable_entry_size: size of partition table entry
14883 + * @ptable_crc: crc of partition table
14887 +struct gpt_header {
14894 + u64 alternate_lba;
14895 + u64 start_useable;
14897 + struct guid disk_id;
14899 + u32 ptable_count;
14900 + u32 ptable_entry_size;
14904 +struct guid EFI_SYSTEM_PARTITION = {
14910 + {0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B}
14913 +struct guid BASIC_DATA_PARTITION = {
14919 + {0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7}
14922 +struct guid LEGACY_MBR_PARTITION = {
14928 + {0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F}
14931 +struct guid GPT_SWAP_PARTITION = {
14937 + {0x09, 0x33, 0xC8, 0x4B, 0x4F, 0x4F}
14940 +struct guid UNUSED_GPT_PARTITION = {
14942 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
14945 +static int exported_nodes; /* total # of exported segments
14946 + * produced during this discovery.
14950 +static int partition_discover(struct evms_logical_node **);
14951 +static int partition_delete(struct evms_logical_node *);
14952 +static void partition_read(struct evms_logical_node *, struct buffer_head *);
14953 +static void partition_write(struct evms_logical_node *, struct buffer_head *);
14954 +static int partition_ioctl(struct evms_logical_node *,
14956 + struct file *, unsigned int, unsigned long);
14957 +static int partition_init_io(struct evms_logical_node *,
14958 + int, u64, u64, void *);
14960 +static struct evms_plugin_fops fops = {
14961 + .discover = partition_discover,
14962 + .delete = partition_delete,
14963 + .read = partition_read,
14964 + .write = partition_write,
14965 + .init_io = partition_init_io,
14966 + .ioctl = partition_ioctl
14969 +#define EVMS_GPT_PARTITION_MANAGER_ID 3
14971 +static struct evms_plugin_header plugin_header = {
14972 + .id = SetPluginID(IBM_OEM_ID,
14973 + EVMS_SEGMENT_MANAGER,
14974 + EVMS_GPT_PARTITION_MANAGER_ID),
14980 + .required_services_version = {
14988 +/***************************************************/
14989 +/* List Support - Typedefs, Variables, & Functions */
14990 +/***************************************************/
14994 +struct segment_list_node {
14995 + struct evms_logical_node *segment;
14996 + struct segment_list_node *next;
14999 +struct disk_list_node {
15000 + struct evms_logical_node *disk;
15001 + struct segment_list_node *segment_list;
15002 + struct disk_list_node *next;
15007 +static struct disk_list_node *my_disk_list;
15012 + * Function: Convert a GPT header from disk format to the arch specific
15016 +disk_gpt_header_to_cpu(struct gpt_header *gh)
15018 + gh->signature = le64_to_cpu(gh->signature);
15019 + gh->version = le32_to_cpu(gh->version);
15020 + gh->size = le32_to_cpu(gh->size);
15021 + gh->crc = le32_to_cpu(gh->crc);
15022 + gh->reserve = le32_to_cpu(gh->reserve);
15023 + gh->my_lba = le64_to_cpu(gh->my_lba);
15024 + gh->alternate_lba = le64_to_cpu(gh->alternate_lba);
15025 + gh->start_useable = le64_to_cpu(gh->start_useable);
15026 + gh->end_useable = le64_to_cpu(gh->end_useable);
15027 + gh->disk_id.time_low = le32_to_cpu(gh->disk_id.time_low);
15028 + gh->disk_id.time_mid = le16_to_cpu(gh->disk_id.time_mid);
15029 + gh->disk_id.time_high = le16_to_cpu(gh->disk_id.time_high);
15030 + gh->ptable_lba = le64_to_cpu(gh->ptable_lba);
15031 + gh->ptable_count = le32_to_cpu(gh->ptable_count);
15032 + gh->ptable_entry_size = le32_to_cpu(gh->ptable_entry_size);
15033 + gh->ptable_crc = le32_to_cpu(gh->ptable_crc);
15037 +matching_guids(struct guid *g1, struct guid *g2)
15039 + if ((le32_to_cpu(g1->time_low) == g2->time_low) &&
15040 + (le16_to_cpu(g1->time_mid) == g2->time_mid) &&
15041 + (le16_to_cpu(g1->time_high) == g2->time_high) &&
15042 + (g1->clock_seq_high == g2->clock_seq_high) &&
15043 + (g1->clock_seq_low == g2->clock_seq_low)) {
15049 +isa_basic_data_gpt_partition_record(struct gpt_partition *p)
15051 + return (matching_guids(&p->type, &BASIC_DATA_PARTITION));
15054 +isa_legacy_mbr_gpt_partition_record(struct gpt_partition *p)
15056 + return (matching_guids(&p->type, &LEGACY_MBR_PARTITION));
15059 +isa_esp_gpt_partition_record(struct gpt_partition *p)
15061 + return (matching_guids(&p->type, &EFI_SYSTEM_PARTITION));
15064 +isa_gpt_swap_partition_record(struct gpt_partition *p)
15066 + return (matching_guids(&p->type, &GPT_SWAP_PARTITION));
15069 +isa_unused_gpt_partition_record(struct gpt_partition *p)
15071 + return (matching_guids(&p->type, &UNUSED_GPT_PARTITION));
15074 +static struct disk_list_node **
15075 +lookup_disk(struct evms_logical_node *disk)
15077 + struct disk_list_node **ldln;
15079 + ldln = &my_disk_list;
15081 + if ((*ldln)->disk == disk)
15083 + ldln = &(*ldln)->next;
15088 +static struct segment_list_node **
15089 +lookup_segment(struct disk_list_node *disk, struct evms_logical_node *segment)
15091 + struct segment_list_node **lsln;
15093 + lsln = &disk->segment_list;
15095 + if ((*lsln)->segment == segment)
15097 + lsln = &(*lsln)->next;
15102 +static struct evms_logical_node *
15103 +find_segment_on_disk(struct evms_logical_node *disk,
15104 + u64 start_sect, u64 nr_sects)
15106 + struct evms_logical_node *rc = NULL;
15107 + struct disk_list_node **ldln;
15108 + struct segment_list_node **lsln;
15109 + struct gpt_private *gpt_prv;
15111 + ldln = lookup_disk(disk);
15113 + /* disk found in list */
15114 + /* attempt to find segment */
15116 + lsln = &(*ldln)->segment_list;
15118 + gpt_prv = (*lsln)->segment->private;
15119 + if (gpt_prv->start_sect == start_sect)
15120 + if (gpt_prv->nr_sects == nr_sects)
15122 + lsln = &(*lsln)->next;
15125 + rc = (*lsln)->segment;
15130 +/* function description: add_segment_to_disk
15132 + * this function attempts to add a segment to the segment
15133 + * list of a disk. if the specified disk is not found, it
15134 + * will be added to the global disk list. this function will
15135 + * return a pointer to the matching segment in the disk's
15136 + * segment list. the caller must compare the returned pointer
15137 + * to the specified segment to see if the
15138 + * specified segment was already present in the disk's segment
15139 + * list. if the return pointer matches the specified segment,
15140 + * then the specified segment was added to the list. if the
15141 + * return segment pointer to does not match the specified
15142 + * segment pointer, then the specified segment pointer was
15143 + * a duplicate and can be thrown away.
15146 +add_segment_to_disk(struct evms_logical_node *disk,
15147 + struct evms_logical_node *segment)
15150 + struct disk_list_node **ldln, *new_disk;
15151 + struct segment_list_node **lsln, *new_segment;
15153 + ldln = lookup_disk(disk);
15154 + if (*ldln == NULL) {
15155 + /* disk not in list, add disk */
15156 + new_disk = kmalloc(sizeof (*new_disk), GFP_KERNEL);
15158 + memset(new_disk, 0, sizeof (*new_disk));
15159 + new_disk->disk = disk;
15160 + *ldln = new_disk;
15166 + /* attempt to add segment */
15167 + lsln = lookup_segment(*ldln, segment);
15168 + if (*lsln == NULL) {
15169 + /* segment not in list, add segment */
15171 + kmalloc(sizeof (*new_segment), GFP_KERNEL);
15172 + if (new_segment) {
15173 + memset(new_segment, 0, sizeof (*new_segment));
15174 + new_segment->segment = segment;
15175 + *lsln = new_segment;
15186 +remove_segment_from_disk(struct evms_logical_node *disk,
15187 + struct evms_logical_node *segment,
15188 + struct evms_logical_node **empty_disk)
15191 + struct disk_list_node **ldln, *tmp_disk_node;
15192 + struct segment_list_node **lsln, *tmp_segment_node;
15194 + *empty_disk = NULL;
15195 + ldln = lookup_disk(disk);
15196 + if (*ldln == NULL) {
15199 + /* disk found in list */
15200 + /* attempt to add segment */
15201 + lsln = lookup_segment(*ldln, segment);
15202 + if (*lsln == NULL) {
15205 + tmp_segment_node = *lsln;
15206 + /* remove segment from list */
15207 + *lsln = (*lsln)->next;
15208 + /* free the segment list node */
15209 + kfree(tmp_segment_node);
15211 + if ((*ldln)->segment_list == NULL) {
15212 + tmp_disk_node = *ldln;
15213 + *empty_disk = tmp_disk_node->disk;
15214 + /* remove disk from list */
15215 + *ldln = (*ldln)->next;
15216 + /* free the disk list node */
15217 + kfree(tmp_disk_node);
15225 + * Function: add_segment
15228 +process_segment(struct evms_logical_node **discover_list,
15229 + struct evms_logical_node *node,
15232 + int type, int part_num, int evms_top_segment)
15234 + struct gpt_private *gpt_prv = NULL;
15235 + struct evms_logical_node *segment;
15238 + segment = find_segment_on_disk(node, start_sect, nr_sects);
15240 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
15242 + gpt_prv = kmalloc(sizeof (*gpt_prv), GFP_KERNEL);
15244 + gpt_prv->source_disk = node;
15245 + gpt_prv->start_sect = start_sect;
15246 + gpt_prv->nr_sects = nr_sects;
15247 + gpt_prv->type = type;
15248 + rc = evms_cs_allocate_logical_node(&segment);
15253 + segment->plugin = &plugin_header;
15254 + segment->system_id = (unsigned int) type;
15255 + segment->total_vsectors = nr_sects;
15256 + segment->block_size = node->block_size;
15257 + segment->hardsector_size = node->hardsector_size;
15258 + segment->private = gpt_prv;
15259 + segment->flags = node->flags;
15260 + if (evms_top_segment)
15261 + segment->iflags |= EVMS_TOP_SEGMENT;
15262 + strcpy(segment->name, node->name);
15263 + if (GetPluginType(node->plugin->id) ==
15264 + EVMS_SEGMENT_MANAGER) {
15265 + strcat(segment->name, ".");
15267 + sprintf(segment->name + strlen(segment->name), "%d",
15269 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
15270 + rc = add_segment_to_disk(node, segment);
15273 + ("%s: error(%d) adding segment '%s'!\n",
15274 + __FUNCTION__, rc, segment->name);
15277 + MOD_INC_USE_COUNT;
15284 + evms_cs_deallocate_logical_node(segment);
15288 + evms_cs_add_logical_node_to_list(discover_list, segment);
15289 + exported_nodes++;
15295 +print_mem(void *buffer, int length)
15298 + unsigned char *bufptr;
15300 + bufptr = (unsigned char *) buffer;
15303 + if ((i % 16) == 0)
15304 + printk(KERN_INFO "\n0x%p->", buffer + i);
15305 + printk(KERN_INFO "%02x ", bufptr[i]);
15306 + if (++i >= length)
15309 + printk(KERN_INFO "\n");
15313 + * Function: get GPT Partition Table - reads partition table
15314 + * into memory and performs crc check.
15317 +static struct gpt_partition *
15318 +get_gpt_partition_table(struct evms_logical_node *node, struct gpt_header *gh)
15321 + struct gpt_partition *pt;
15322 + u32 sector_count, calculated_crc;
15325 + evms_cs_size_in_vsectors(gh->ptable_count * gh->ptable_entry_size);
15327 + pt = kmalloc(sector_count * EVMS_VSECTOR_SIZE, GFP_KERNEL);
15330 + rc = INIT_IO(node, 0, gh->ptable_lba, sector_count, pt);
15333 + calculated_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC,
15338 + ptable_entry_size);
15340 + if (~calculated_crc != gh->ptable_crc) {
15359 + * Function: Validate GPT Header - runs basic checks to
15360 + * sanity check a gpt header.
15364 +isa_valid_gpt_header(struct evms_logical_node *node, u64 lsn,
15365 + struct gpt_header *gh)
15368 + u32 calculated_crc;
15369 + u64 sector_count;
15372 + if (le64_to_cpu(gh->signature) != GPT_DISKMAGIC)
15376 + crc = le32_to_cpu(gh->crc);
15379 + ~(evms_cs_calculate_crc(EVMS_INITIAL_CRC, gh, le32_to_cpu(gh->size)));
15380 + gh->crc = cpu_to_le32(crc);
15382 + if (calculated_crc != crc)
15385 + /* spec says lba reported by header must match actual location on disk */
15386 + if (lsn != le64_to_cpu(gh->my_lba))
15389 + /* sanity check partition table info found in header */
15390 + if (gh->ptable_count == 0 || gh->ptable_entry_size == 0)
15394 + evms_cs_size_in_vsectors(le64_to_cpu(gh->ptable_count) *
15395 + le64_to_cpu(gh->ptable_entry_size));
15397 + if ((le64_to_cpu(gh->ptable_lba) + sector_count - 1) >=
15398 + node->total_vsectors - 1)
15405 + * Function: get GPT Partition Table Header
15408 +static struct gpt_header *
15409 +get_gpt_header(struct evms_logical_node *node, u64 lsn)
15412 + struct gpt_header *gh = NULL;
15414 + gh = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
15416 + rc = INIT_IO(node, 0, lsn, 1, gh);
15418 + if (isa_valid_gpt_header(node, lsn, gh)) {
15419 + disk_gpt_header_to_cpu(gh);
15435 + * Function: Get GPT Information
15439 +get_gpt_info(struct evms_logical_node *node,
15440 + struct gpt_header **gh, struct gpt_partition **ptable)
15442 + struct gpt_header *gh1 = NULL, *gh2 = NULL;
15447 + gh1 = get_gpt_header(node, 1); // offset past protective mbr
15451 + gh2 = get_gpt_header(node, gh1->alternate_lba);
15456 + ("alternate guid partition table header is invalid, using primary copy.\n");
15458 + gh2 = get_gpt_header(node, node->total_vsectors - 1);
15462 + ("primary guid partition table header is invalid, using alternate copy\n");
15464 + LOG_DETAILS("no gpt header discovered on node %s\n",
15470 + *ptable = get_gpt_partition_table(node, *gh);
15481 + * Function: Probe for GPT segments on logical node
15485 +probe_for_segments(struct evms_logical_node **discover_list,
15486 + struct evms_logical_node *node)
15489 + int nextminor = 1;
15490 + int evms_top_segment;
15493 + struct gpt_header *gh = NULL;
15494 + struct gpt_partition *ptable = NULL;
15495 + struct gpt_partition *part = NULL;
15497 + /* no need to inspect our own nodes */
15498 + if (node->plugin->id == plugin_header.id)
15501 + /* nor nodes marked as EVMS_TOP_SEGMENT */
15502 + if (node->iflags & EVMS_TOP_SEGMENT)
15505 + /* look for guid partition table & header */
15506 + if (!get_gpt_info(node, &gh, &ptable)) {
15514 + /* walk the guid partition table, producing segment storage objects */
15515 + for (i = 0, part = ptable; i < gh->ptable_count; i++, part++) {
15517 + if (!isa_unused_gpt_partition_record(part)) {
15519 + pstart = le64_to_cpu(part->start);
15520 + pend = le64_to_cpu(part->end);
15523 + ("gpt partition start="PFU64" end="PFU64"\n",
15524 + pstart, (pend - pstart + 1));
15526 + /* stop other seg mgrs from recursive discovery on a gpt system partition */
15527 + if (isa_esp_gpt_partition_record(part))
15528 + evms_top_segment = 1;
15530 + evms_top_segment = 0;
15532 + rc = process_segment(discover_list,
15535 + (pend - pstart + 1),
15536 + 0, nextminor, evms_top_segment);
15545 + /* remove node we just consumed */
15546 + evms_cs_remove_logical_node_from_list(discover_list, node);
15554 + * Function: partition_discover
15558 +partition_discover(struct evms_logical_node **discover_list)
15561 + struct evms_logical_node *node, *next_node;
15563 + MOD_INC_USE_COUNT;
15564 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
15566 + /* initialize global variable */
15567 + exported_nodes = 0;
15569 + /* examine each node on the discover list */
15570 + next_node = *discover_list;
15571 + while (next_node) {
15572 + node = next_node;
15573 + next_node = node->next;
15574 + probe_for_segments(discover_list, node);
15577 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
15578 + __FUNCTION__, exported_nodes, rc);
15579 + if (exported_nodes)
15580 + rc = exported_nodes;
15581 + MOD_DEC_USE_COUNT;
15586 + * Function: partition_delete
15590 +partition_delete(struct evms_logical_node *segment)
15593 + struct gpt_private *gpt_prv;
15594 + struct evms_logical_node *empty_disk = NULL;
15596 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
15601 + gpt_prv = segment->private;
15603 + /* remove the segment from the
15604 + * disk's segment list
15606 + rc = remove_segment_from_disk(gpt_prv->source_disk,
15607 + segment, &empty_disk);
15608 + /* free the local instance data */
15611 + /* free the segment node */
15612 + evms_cs_deallocate_logical_node(segment);
15613 + MOD_DEC_USE_COUNT;
15614 + /* if the last segment on the disk was
15615 + * deleted, delete the disk node too
15618 + DELETE(empty_disk);
15624 + * function: partition_io_error
15626 + * this function was primarily created because the function
15627 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
15628 + * to be set on inline functions. Since this was an error path
15629 + * and not mainline, I decided to add a trace statement to help
15630 + * report on the failing condition.
15634 +partition_io_error(struct evms_logical_node *node, int io_flag,
15635 + struct buffer_head *bh)
15638 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n",
15639 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1, node->name,
15642 + bh->b_end_io(bh, 0);
15646 + * Function: partition_read
15650 +partition_read(struct evms_logical_node *partition, struct buffer_head *bh)
15652 + struct gpt_private *gpt_prv = partition->private;
15654 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15655 + partition->total_vsectors) {
15656 + bh->b_rsector += gpt_prv->start_sect;
15657 + R_IO(gpt_prv->source_disk, bh);
15659 + partition_io_error(partition, READ, bh);
15663 + * Function: partition_write
15667 +partition_write(struct evms_logical_node *partition, struct buffer_head *bh)
15669 + struct gpt_private *gpt_prv = partition->private;
15671 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) <=
15672 + partition->total_vsectors) {
15673 + bh->b_rsector += gpt_prv->start_sect;
15674 + W_IO(gpt_prv->source_disk, bh);
15676 + partition_io_error(partition, WRITE, bh);
15680 + * Function: partition_init_io
15684 +partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
15685 + u64 sect_nr, /* disk LBA */
15686 + u64 num_sects, /* # of sectors */
15688 +{ /* buffer address */
15690 + struct gpt_private *gpt_prv = partition->private;
15692 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
15693 + rc = INIT_IO(gpt_prv->source_disk, io_flag,
15694 + sect_nr + gpt_prv->start_sect, num_sects,
15698 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
15699 + (io_flag) ? "WRITE" : "READ", partition->name,
15700 + (gpt_prv->nr_sects - 1), sect_nr, num_sects);
15708 + * Function: partition_ioctl
15712 +partition_ioctl(struct evms_logical_node *partition,
15713 + struct inode *inode,
15714 + struct file *file, unsigned int cmd, unsigned long arg)
15716 + struct gpt_private *gpt_prv;
15717 + struct hd_geometry hd_geo;
15721 + gpt_prv = partition->private;
15725 + case HDIO_GETGEO:
15727 + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15730 + if (copy_from_user
15731 + (&hd_geo, (void *) arg,
15732 + sizeof (struct hd_geometry)))
15736 + hd_geo.start = gpt_prv->start_sect;
15738 + ((void *) arg, &hd_geo,
15739 + sizeof (struct hd_geometry)))
15743 + case EVMS_GET_BMAP:
15745 + struct evms_get_bmap_pkt *bmap =
15746 + (struct evms_get_bmap_pkt *) arg;
15747 + bmap->rsector += gpt_prv->start_sect;
15748 + /* intentionally fall thru to
15749 + * default ioctl down to device
15754 + rc = IOCTL(gpt_prv->source_disk, inode, file, cmd, arg);
15760 + * Function: gpt_module_init
15764 +gpt_module_init(void)
15766 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
15770 + * Function: gpt module exit
15772 +static void __exit
15773 +gpt_module_exit(void)
15775 + evms_cs_unregister_plugin(&plugin_header);
15778 +module_init(gpt_module_init);
15779 +module_exit(gpt_module_exit);
15780 +#ifdef MODULE_LICENSE
15781 +MODULE_LICENSE("GPL");
15783 diff -Naur linux-2002-09-30/drivers/evms/ldev_mgr.c evms-2002-09-30/drivers/evms/ldev_mgr.c
15784 --- linux-2002-09-30/drivers/evms/ldev_mgr.c Wed Dec 31 18:00:00 1969
15785 +++ evms-2002-09-30/drivers/evms/ldev_mgr.c Fri Sep 13 16:45:06 2002
15787 +/* -*- linux-c -*- */
15790 + * Copyright (c) International Business Machines Corp., 2000
15792 + * This program is free software; you can redistribute it and/or modify
15793 + * it under the terms of the GNU General Public License as published by
15794 + * the Free Software Foundation; either version 2 of the License, or
15795 + * (at your option) any later version.
15797 + * This program is distributed in the hope that it will be useful,
15798 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
15799 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
15800 + * the GNU General Public License for more details.
15802 + * You should have received a copy of the GNU General Public License
15803 + * along with this program; if not, write to the Free Software
15804 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
15807 +/* linux/driver/evms/ldev_mgr.c
15809 + * EVMS - Local Device (Hard Drive) Manager
15811 + * This plugin walks the gendisk list and creates logical disk structures for each
15812 + * local ide or scsi device.
15816 +#include <linux/config.h>
15817 +#include <linux/module.h>
15818 +#include <linux/errno.h>
15819 +#include <linux/kernel.h>
15820 +#include <linux/fs.h>
15821 +#include <linux/slab.h>
15822 +#include <asm/uaccess.h>
15823 +#include <linux/blk.h> /* must be included by all block drivers */
15824 +#include <linux/genhd.h>
15825 +#include <linux/ide.h>
15826 +#include <linux/version.h>
15827 +#include "../scsi/scsi.h"
15828 +#include "../scsi/sd.h"
15829 +#include <linux/init.h>
15830 +#include <linux/evms/evms.h>
15831 +#include <linux/evms/ldev_mgr.h>
15833 +#define LOG_PREFIX "ldev_mgr: "
15835 +#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
15838 + * struct ldev_private - private data used by this plugin
15839 + * @major: major device number
15840 + * @minor: minor device number
15841 + * @bdev: block_device record for this device
15842 + * @gd: gendisk entry for this device
15843 + * @media_changed: media changed status field
15845 + * private data maintained for each device by this plugin
15847 +struct ldev_private {
15848 + int major, minor;
15849 + struct block_device *bdev;
15850 + struct gendisk *gd;
15851 + int media_changed;
15854 +/* prototypes for mandatory plugin interface functions */
15855 +static int discover_disks(struct evms_logical_node **);
15856 +static int ldev_mgr_delete(struct evms_logical_node *);
15857 +static void ldev_mgr_read(struct evms_logical_node *, struct buffer_head *);
15858 +static void ldev_mgr_write(struct evms_logical_node *, struct buffer_head *);
15859 +static int ldev_mgr_ioctl(struct evms_logical_node *,
15861 + struct file *, unsigned int, unsigned long);
15862 +static int ldev_init_io(struct evms_logical_node *,
15863 + int, u64, u64, void *);
15864 +static int ldev_mgr_direct_ioctl(struct inode *,
15865 + struct file *, unsigned int, unsigned long);
15867 +/* plugin function table definition */
15868 +static struct evms_plugin_fops fops = {
15869 + .discover = discover_disks,
15870 + .delete = ldev_mgr_delete,
15871 + .read = ldev_mgr_read,
15872 + .write = ldev_mgr_write,
15873 + .init_io = ldev_init_io,
15874 + .ioctl = ldev_mgr_ioctl,
15875 + .direct_ioctl = ldev_mgr_direct_ioctl
15878 +/* plugin header definition */
15879 +static struct evms_plugin_header plugin_header = {
15880 + .id = SetPluginID(IBM_OEM_ID,
15881 + EVMS_DEVICE_MANAGER,
15882 + EVMS_LOCAL_DEVICE_MANAGER_ID),
15888 + .required_services_version = {
15896 +#define TYPE_NONE 0
15897 +#define TYPE_GENERIC 1
15898 +#define TYPE_IDE 2
15899 +#define TYPE_SCSI 3
15901 +#define INDEX_ALPHA 0
15902 +#define INDEX_NUMERIC 1
15904 +/********************************************************/
15905 +/* Required Plugin Function Table Entry Point: */
15906 +/* Discover function & Support routines */
15907 +/********************************************************/
15909 +#define MAX_NAME_BASE_SIZE 10
15910 +#define MAX_NAME_MODIFIER_SIZE 4
15912 + * struct blk_device_info - block device info
15913 + * @devnode_name_base: base name (ie. hd or sd) for device
15914 + * @null1: guaranteed end-of-string NULL
15915 + * @devnode_name_modifier: name suffix (ie. ag for sdag) for device
15916 + * @null2: guaranteed end-of-string NULL
15917 + * @devnode_name_index: numeric device index (ie. 1 for hda1)
15918 + * @devnode_name_type: indicates numeric or alpha modifier
15919 + * @devnode_type: device type, IDE, SCSI, or GENERIC
15921 + * generic block device naming descriptor structure
15923 +struct blk_device_info {
15924 + char devnode_name_base[MAX_NAME_BASE_SIZE];
15926 + char devnode_name_modifier[MAX_NAME_MODIFIER_SIZE];
15928 + int devnode_name_index;
15929 + int devnode_name_type;
15933 +static struct blk_device_info *blk_dev_info = NULL;
15935 +#define BLK_DEV_INFO(a,b,c,d,e) \
15936 + strncpy(blk_dev_info[a].devnode_name_base, b, MAX_NAME_BASE_SIZE); \
15937 + blk_dev_info[a].null1 = 0; \
15938 + strncpy(blk_dev_info[a].devnode_name_modifier, c, MAX_NAME_MODIFIER_SIZE); \
15939 + blk_dev_info[a].null2 = 0; \
15940 + blk_dev_info[a].devnode_name_index = 0; \
15941 + blk_dev_info[a].device_type = d; \
15942 + blk_dev_info[a].devnode_name_type = e;
15945 +init_blk_dev_info(struct blk_device_info *blk_dev_info)
15947 + BLK_DEV_INFO(IDE0_MAJOR, "hd", "a", TYPE_IDE, INDEX_ALPHA);
15948 + BLK_DEV_INFO(IDE1_MAJOR, "hd", "c", TYPE_IDE, INDEX_ALPHA);
15949 + BLK_DEV_INFO(IDE2_MAJOR, "hd", "e", TYPE_IDE, INDEX_ALPHA);
15950 + BLK_DEV_INFO(IDE3_MAJOR, "hd", "g", TYPE_IDE, INDEX_ALPHA);
15951 + BLK_DEV_INFO(IDE4_MAJOR, "hd", "i", TYPE_IDE, INDEX_ALPHA);
15952 + BLK_DEV_INFO(IDE5_MAJOR, "hd", "k", TYPE_IDE, INDEX_ALPHA);
15953 + BLK_DEV_INFO(IDE6_MAJOR, "hd", "m", TYPE_IDE, INDEX_ALPHA);
15954 + BLK_DEV_INFO(IDE7_MAJOR, "hd", "o", TYPE_IDE, INDEX_ALPHA);
15955 + BLK_DEV_INFO(IDE8_MAJOR, "hd", "q", TYPE_IDE, INDEX_ALPHA);
15956 + BLK_DEV_INFO(IDE9_MAJOR, "hd", "s", TYPE_IDE, INDEX_ALPHA);
15958 + BLK_DEV_INFO(SCSI_DISK0_MAJOR, "sd", "a", TYPE_SCSI, INDEX_ALPHA);
15959 + BLK_DEV_INFO(SCSI_DISK1_MAJOR, "sd", "q", TYPE_SCSI, INDEX_ALPHA);
15960 + BLK_DEV_INFO(SCSI_DISK2_MAJOR, "sd", "ag", TYPE_SCSI, INDEX_ALPHA);
15961 + BLK_DEV_INFO(SCSI_DISK3_MAJOR, "sd", "aw", TYPE_SCSI, INDEX_ALPHA);
15962 + BLK_DEV_INFO(SCSI_DISK4_MAJOR, "sd", "bm", TYPE_SCSI, INDEX_ALPHA);
15963 + BLK_DEV_INFO(SCSI_DISK5_MAJOR, "sd", "cc", TYPE_SCSI, INDEX_ALPHA);
15964 + BLK_DEV_INFO(SCSI_DISK6_MAJOR, "sd", "cs", TYPE_SCSI, INDEX_ALPHA);
15965 + BLK_DEV_INFO(SCSI_DISK7_MAJOR, "sd", "di", TYPE_SCSI, INDEX_ALPHA);
15967 + BLK_DEV_INFO(XT_DISK_MAJOR, "xd", "a", TYPE_GENERIC, INDEX_ALPHA);
15969 + BLK_DEV_INFO(CYCLADES_MAJOR, "double", "0", TYPE_GENERIC,
15972 + BLK_DEV_INFO(MFM_ACORN_MAJOR, "mfm", "a", TYPE_GENERIC, INDEX_ALPHA);
15974 + BLK_DEV_INFO(ACSI_MAJOR, "ad", "a", TYPE_GENERIC, INDEX_ALPHA);
15976 + BLK_DEV_INFO(PS2ESDI_MAJOR, "ed", "a", TYPE_GENERIC, INDEX_ALPHA);
15978 + BLK_DEV_INFO(40, "ez", "a", TYPE_GENERIC, INDEX_ALPHA);
15979 + BLK_DEV_INFO(43, "nb", "0", TYPE_GENERIC, INDEX_NUMERIC);
15980 + BLK_DEV_INFO(44, "ftl", "a", TYPE_GENERIC, INDEX_ALPHA);
15981 + BLK_DEV_INFO(45, "pd", "a", TYPE_GENERIC, INDEX_ALPHA);
15982 + BLK_DEV_INFO(47, "pf", "0", TYPE_GENERIC, INDEX_NUMERIC);
15984 + BLK_DEV_INFO(DAC960_MAJOR + 0, "rd/c0d", "0", TYPE_GENERIC,
15986 + BLK_DEV_INFO(DAC960_MAJOR + 1, "rd/c1d", "0", TYPE_GENERIC,
15988 + BLK_DEV_INFO(DAC960_MAJOR + 2, "rd/c2d", "0", TYPE_GENERIC,
15990 + BLK_DEV_INFO(DAC960_MAJOR + 3, "rd/c3d", "0", TYPE_GENERIC,
15992 + BLK_DEV_INFO(DAC960_MAJOR + 4, "rd/c4d", "0", TYPE_GENERIC,
15994 + BLK_DEV_INFO(DAC960_MAJOR + 5, "rd/c5d", "0", TYPE_GENERIC,
15996 + BLK_DEV_INFO(DAC960_MAJOR + 6, "rd/c6d", "0", TYPE_GENERIC,
15998 + BLK_DEV_INFO(DAC960_MAJOR + 7, "rd/c7d", "0", TYPE_GENERIC,
16001 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR, "ida/c0d", "0", TYPE_GENERIC,
16003 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR1, "ida/c1d", "0", TYPE_GENERIC,
16005 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR2, "ida/c2d", "0", TYPE_GENERIC,
16007 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR3, "ida/c3d", "0", TYPE_GENERIC,
16009 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR4, "ida/c4d", "0", TYPE_GENERIC,
16011 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR5, "ida/c5d", "0", TYPE_GENERIC,
16013 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR6, "ida/c6d", "0", TYPE_GENERIC,
16015 + BLK_DEV_INFO(COMPAQ_SMART2_MAJOR7, "ida/c7d", "0", TYPE_GENERIC,
16018 + BLK_DEV_INFO(I2O_MAJOR + 0, "i2o/hd", "a", TYPE_GENERIC, INDEX_ALPHA);
16019 + BLK_DEV_INFO(I2O_MAJOR + 1, "i2o/hd", "q", TYPE_GENERIC, INDEX_ALPHA);
16020 + BLK_DEV_INFO(I2O_MAJOR + 2, "i2o/hd", "ag", TYPE_GENERIC, INDEX_ALPHA);
16021 + BLK_DEV_INFO(I2O_MAJOR + 3, "i2o/hd", "aw", TYPE_GENERIC, INDEX_ALPHA);
16022 + BLK_DEV_INFO(I2O_MAJOR + 4, "i2o/hd", "bm", TYPE_GENERIC, INDEX_ALPHA);
16023 + BLK_DEV_INFO(I2O_MAJOR + 5, "i2o/hd", "cc", TYPE_GENERIC, INDEX_ALPHA);
16024 + BLK_DEV_INFO(I2O_MAJOR + 6, "i2o/hd", "cs", TYPE_GENERIC, INDEX_ALPHA);
16025 + BLK_DEV_INFO(I2O_MAJOR + 7, "i2o/hd", "di", TYPE_GENERIC, INDEX_ALPHA);
16027 + BLK_DEV_INFO(92, "ppdd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16028 + BLK_DEV_INFO(93, "nftl", "a", TYPE_GENERIC, INDEX_ALPHA);
16030 + BLK_DEV_INFO(DASD_MAJOR, "dasd", "a", TYPE_GENERIC, INDEX_ALPHA);
16031 + BLK_DEV_INFO(MDISK_MAJOR, "mdisk", "a", TYPE_GENERIC, INDEX_ALPHA);
16033 + BLK_DEV_INFO(96, "msd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16034 + BLK_DEV_INFO(97, "pktcdvd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16036 + BLK_DEV_INFO(UBD_MAJOR, "ubd", "0", TYPE_GENERIC, INDEX_NUMERIC);
16038 + BLK_DEV_INFO(JSFD_MAJOR, "jsfd", "", TYPE_GENERIC, INDEX_NUMERIC);
16040 + BLK_DEV_INFO(101, "amiraid/ar", "0", TYPE_GENERIC, INDEX_NUMERIC);
16042 + BLK_DEV_INFO(104, "cciss/c0d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16043 + BLK_DEV_INFO(105, "cciss/c1d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16044 + BLK_DEV_INFO(106, "cciss/c2d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16045 + BLK_DEV_INFO(107, "cciss/c3d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16046 + BLK_DEV_INFO(108, "cciss/c4d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16047 + BLK_DEV_INFO(109, "cciss/c5d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16048 + BLK_DEV_INFO(110, "cciss/c6d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16049 + BLK_DEV_INFO(111, "cciss/c7d", "0", TYPE_GENERIC, INDEX_NUMERIC);
16051 + BLK_DEV_INFO(RAW_MAJOR, "raw", "0", TYPE_GENERIC, INDEX_NUMERIC);
16053 + BLK_DEV_INFO(VXVM_MAJOR, "vx/dsk", "0", TYPE_GENERIC, INDEX_NUMERIC);
16054 + BLK_DEV_INFO(VXDMP_MAJOR, "vx/dmp", "0", TYPE_GENERIC, INDEX_NUMERIC);
16055 + BLK_DEV_INFO(LOOP_MAJOR, "loop", "0", TYPE_GENERIC, INDEX_NUMERIC);
16059 +is_in_device_list(struct gendisk *gd, int major, int minor)
16061 + int found, done, rc;
16062 + struct evms_logical_node *device = NULL;
16063 + struct ldev_private *ldev_prv;
16065 + done = found = FALSE;
16066 + while (done == FALSE) {
16067 + rc = evms_cs_find_next_device(device, &device);
16068 + if (rc || !device)
16071 + ldev_prv = device->private;
16072 + if (ldev_prv->gd == gd)
16073 + if (ldev_prv->major == major)
16074 + if (ldev_prv->minor == minor)
16075 + done = found = TRUE;
16082 +build_devnode_name(char *name_buf, int major)
16084 + char buf[11], *modifier, *buf_ptr;
16085 + int int_mod, done;
16086 + struct blk_device_info *bdi;
16088 + bdi = &blk_dev_info[major];
16090 + /* convert the base name modifier to an integer */
16091 + modifier = bdi->devnode_name_modifier;
16093 + while (*modifier) {
16094 + if (bdi->devnode_name_type == INDEX_ALPHA) {
16096 + int_mod += *modifier - 'a';
16099 + int_mod += *modifier - '0';
16106 + /* add in device_index_value */
16107 + int_mod += bdi->devnode_name_index;
16108 + bdi->devnode_name_index++;
16110 + /* convert integer modifier back to ALPHA/NUMERIC chars */
16111 + memset(buf, 0, sizeof (buf));
16112 + /* fill the buffer from the rear to front with the
16113 + * ascii version of the modifier, leaving space for
16114 + * NULL terminator at the end.
16116 + buf_ptr = &buf[sizeof (buf) - 2];
16119 + if (bdi->devnode_name_type == INDEX_ALPHA) {
16120 + *buf_ptr = (int_mod % 26) + 'a';
16123 + *buf_ptr = (int_mod % 10) + '0';
16134 + /* find beginning of modifier in buffer */
16136 + while (!*modifier)
16139 + /* build the final device devnode name */
16140 + sprintf(name_buf, "%s%s", bdi->devnode_name_base, modifier);
16144 +ldev_mgr_lock_device(struct ldev_private *ldev_prv)
16147 + struct block_device *bdev;
16149 + bdev = bdget(MKDEV(ldev_prv->major, ldev_prv->minor));
16152 + rc = blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW);
16155 + ldev_prv->bdev = bdev;
16160 +ldev_mgr_unlock_device(struct ldev_private *ldev_prv)
16162 + struct block_device *bdev = ldev_prv->bdev;
16163 + ldev_prv->bdev = NULL;
16165 + LOG_ERROR("error: NULL bdev field detected!\n");
16168 + blkdev_put(bdev, BDEV_RAW);
16171 +#define DEVICE_KNOWN 1234
16172 +#define DEVICE_UNINITIALIZED 1235
16173 +#define DEVICE_MEDIA_NOT_PRESENT 1236
16175 +create_logical_disk(struct evms_logical_node **disk_list,
16176 + struct gendisk *gd, int device_index)
16178 + int rc = 0, major, minor;
16179 + struct evms_logical_node *new_disk = NULL;
16180 + struct ldev_private *ldev_prv = NULL;
16181 + char device_name[EVMS_VOLUME_NAME_SIZE + 1];
16183 + major = gd->major;
16184 + minor = device_index << gd->minor_shift;
16186 + /* skip uninitialized devices */
16187 + if (!blk_size[major])
16188 + rc = DEVICE_UNINITIALIZED;
16189 + else if (!blk_size[major][minor])
16190 + rc = DEVICE_UNINITIALIZED;
16192 + /* construct the devnode name for this device */
16193 + build_devnode_name(device_name, major);
16195 + /* skip devices we already know about */
16196 + if (is_in_device_list(gd, major, minor) == TRUE)
16197 + rc = DEVICE_KNOWN;
16199 + /* allocate the new node */
16201 + rc = evms_cs_allocate_logical_node(&new_disk);
16203 + /* allocate new nodes's instance data */
16205 + ldev_prv = kmalloc(sizeof(struct ldev_private), GFP_KERNEL);
16209 + /* initialize the new node */
16211 + memset(ldev_prv, 0, sizeof(struct ldev_private));
16212 + new_disk->plugin = &plugin_header;
16214 + /* initialize the instance data */
16215 + new_disk->private = ldev_prv;
16216 + ldev_prv->gd = gd;
16217 + ldev_prv->major = major;
16218 + ldev_prv->minor = minor;
16219 + rc = ldev_mgr_lock_device(ldev_prv);
16221 + LOG_ERROR("error(%d): unable to lock device(%d,%d)!\n",
16222 + rc, major, minor);
16226 + /* determine hardsector size */
16227 + new_disk->hardsector_size = 512;
16228 + if (hardsect_size[major]) {
16229 + new_disk->hardsector_size = hardsect_size[major][minor];
16231 + /* save the block size */
16232 + new_disk->block_size = 1024;
16233 + if (blksize_size[major]) {
16234 + new_disk->block_size = blksize_size[major][minor];
16236 + /* obtain the device size in sectors
16238 + * try 64bit size first, if that fails
16239 + * fall back on the 32bit size.
16241 + /* try 64bit size */
16242 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
16243 + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE64,
16244 + (ulong) & new_disk->total_vsectors);
16246 + /* convert bytes to 512 byte sectors */
16247 + new_disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
16251 + /* try 32bit size */
16252 + ulong dev_size = 0;
16253 + rc = evms_cs_kernel_ioctl(new_disk, BLKGETSIZE,
16254 + (ulong) & dev_size);
16255 + new_disk->total_vsectors = dev_size;
16257 + if (!rc && !new_disk->total_vsectors) {
16262 + /* remember removable devices */
16264 + if (gd->flags[device_index] & GENHD_FL_REMOVABLE)
16265 + new_disk->flags |= EVMS_DEVICE_REMOVABLE;
16267 + /* save the devnode name for this device */
16268 + strcpy(new_disk->name, device_name);
16270 + /* register this device with evms */
16271 + evms_cs_register_device(new_disk);
16272 + MOD_INC_USE_COUNT;
16274 + /* append this record the linked list */
16275 + evms_cs_add_logical_node_to_list(disk_list, new_disk);
16277 + ("added logical disk(%s) for physical disk(%u,%u,%s), size("PFU64") in 512 byte units\n",
16278 + new_disk->name, major, minor, new_disk->name,
16279 + new_disk->total_vsectors);
16282 + /* reset the "benign" error codes for the caller */
16284 + case DEVICE_UNINITIALIZED:
16285 + case DEVICE_KNOWN:
16286 + case DEVICE_MEDIA_NOT_PRESENT:
16292 + ("error(%d): creating logical disk for device(%d,%d).\n",
16293 + rc, major, minor);
16295 + evms_cs_deallocate_logical_node(new_disk);
16306 +create_logical_generic_disks(struct evms_logical_node **disk_list,
16307 + struct gendisk *gd)
16311 + /* This is a generic device */
16314 + LOG_DEBUG("major name = %s\n", gd->major_name);
16315 + LOG_DEBUG("number of real devices = %i\n", gd->nr_real);
16316 + for (i = 0; i < gd->nr_real; i++) {
16317 + LOG_DEBUG("device %d:\n", i);
16318 + rc = create_logical_disk(disk_list, gd, i);
16326 +create_logical_ide_disks(struct evms_logical_node **disk_list,
16327 + struct gendisk *gd)
16330 + ide_hwif_t *ide_hwif;
16331 + ide_drive_t *drive;
16333 + /* This is an IDE device */
16334 + LOG_DEBUG("found IDE major : %i - searching for disks\n", gd->major);
16336 + ide_hwif = gd->real_devices; /* IDE internal data */
16337 + for (i = 0; i < MAX_DRIVES; i++) {
16338 + drive = &(ide_hwif->drives[i]);
16339 + if (drive->present && (drive->media == ide_disk)) {
16340 + /* force the name index value on ide drives */
16341 + blk_dev_info[gd->major].devnode_name_index = i;
16342 + rc = create_logical_disk(disk_list, gd, i);
16351 +create_logical_scsi_disks(struct evms_logical_node **disk_list,
16352 + struct gendisk *gd)
16355 + Scsi_Disk *SDisks;
16356 + Scsi_Device *SDev;
16358 + /* This is an SCSI device */
16359 + LOG_DEBUG("found SCSI major : %i - searching for disks\n", gd->major);
16360 + LOG_DEBUG("scsi: major name = %s\n", gd->major_name);
16361 + LOG_DEBUG("scsi: number of real devices = %i\n", gd->nr_real);
16362 + SDisks = gd->real_devices; /* SCSI internal data */
16363 + for (i = 0; i < gd->nr_real; i++) {
16364 + SDev = SDisks[i].device;
16366 + ("scsi: Channel = %i, Id = %i, Lun = %i, Capacity = %i\n",
16367 + SDev->channel, SDev->id, SDev->lun, SDisks[i].capacity);
16368 + rc = create_logical_disk(disk_list, gd, i);
16376 +create_logical_disks(struct gendisk *gd, void *p_disk_list)
16379 + struct evms_logical_node **disk_list = p_disk_list;
16381 + /* create logical disks from all IDE & SCSI devices */
16382 + switch (blk_dev_info[gd->major].device_type) {
16384 + rc = create_logical_ide_disks(disk_list, gd);
16387 + rc = create_logical_scsi_disks(disk_list, gd);
16389 + case TYPE_GENERIC:
16390 + rc = create_logical_generic_disks(disk_list, gd);
16393 + LOG_DEBUG("unrecognized device major : %i\n", gd->major);
16401 +discover_disks(struct evms_logical_node **disk_list)
16405 + MOD_INC_USE_COUNT;
16406 + LOG_ENTRY_EXIT("%s Entry\n", __FUNCTION__);
16408 + if (blk_dev_info == NULL) {
16409 + /* allocate space for device info array */
16410 + blk_dev_info = kmalloc(sizeof (struct blk_device_info)
16411 + * (MAX_BLKDEV + 1), GFP_KERNEL);
16412 + if (blk_dev_info) {
16413 + /* initialize device info array */
16414 + memset(blk_dev_info, 0,
16415 + sizeof (struct blk_device_info) * (MAX_BLKDEV + 1));
16416 + init_blk_dev_info(blk_dev_info);
16422 + /* create logical disks from the raw devices */
16423 + rc = walk_gendisk(create_logical_disks, disk_list);
16425 + /* free blk_dev_info table and null the ptr to it */
16426 + kfree(blk_dev_info);
16427 + blk_dev_info = NULL;
16429 + LOG_ENTRY_EXIT("%s Exit\n", __FUNCTION__);
16430 + MOD_DEC_USE_COUNT;
16434 +/********************************************************/
16435 +/* Required Plugin Function Table Entry Point: */
16436 +/* Delete function */
16437 +/********************************************************/
16440 +ldev_mgr_delete(struct evms_logical_node *disk)
16442 + struct ldev_private *ldev_prv;
16444 + /* reset any evms volume related info from
16445 + * the device node, because we can't predict
16446 + * how this node will be used in the future.
16449 + /* removed the feature header if its been used
16451 + if (disk->feature_header) {
16452 + kfree(disk->feature_header);
16453 + disk->feature_header = NULL;
16455 + /* remove the volume_info structure and flag
16456 + * if this has been used directly by an evms
16459 + evms_cs_deallocate_volume_info(disk);
16460 + /* reset the flags field to the appropriate state
16462 + disk->flags &= ~EVMS_VOLUME_FLAG;
16464 + /* disk nodes only get deleted when:
16465 + * 1) there are no references to the disk node
16467 + * 2) the device is removable
16468 + * 3) the device reported a media change
16470 + * All three of these conditions must be true
16471 + * before the disk node can be deleted.
16472 + * evms_check_for_device_changes should set
16473 + * and ensure these conditions before issuing
16476 + * Newly installed removable media will be
16477 + * picked up in this modules discover code.
16479 + * OR disk nodes can will be deleted if the
16480 + * devices they represent go away, for example
16481 + * in the case of a hotunplugged device or a
16482 + * required driver having been unloaded.
16484 + if (disk->flags & (EVMS_MEDIA_CHANGED | EVMS_DEVICE_UNAVAILABLE)) {
16485 + LOG_DETAILS("deleting '%s'.\n", disk->name);
16487 + evms_cs_unregister_device(disk);
16488 + MOD_DEC_USE_COUNT;
16489 + ldev_prv = disk->private;
16490 + ldev_mgr_unlock_device(ldev_prv);
16494 + evms_cs_deallocate_logical_node(disk);
16499 +/********************************************************/
16500 +/* Required Plugin Function Table Entry Point: */
16501 +/* Read function */
16502 +/********************************************************/
16505 + * function: ldev_mgr_io_error
16507 + * this function was primarily created because the function
16508 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
16509 + * to be set on inline functions. Since this was an error path
16510 + * and not mainline, I decided to add a trace statement to help
16511 + * report on the failing condition.
16515 +ldev_mgr_io_error(struct evms_logical_node *disk, int io_flag, struct buffer_head *bh, int rc)
16517 + if (rc == -EOVERFLOW) {
16519 + ("attempt to %s beyond boundary("PFU64") on (%s), rsector(%ld).\n",
16520 + (io_flag) ? "WRITE" : "READ", disk->total_vsectors - 1,
16521 + disk->name, bh->b_rsector);
16522 + } else if (rc == -ENXIO) {
16523 + LOG_SERIOUS("attempt to access a non-existent device(%s).\n",
16526 + bh->b_end_io(bh, 0);
16529 +/********************************************************/
16530 +/* Required Plugin Function Table Entry Point: */
16531 +/* Read function */
16532 +/********************************************************/
16535 +ldev_mgr_read(struct evms_logical_node *disk, struct buffer_head *bh)
16538 + request_queue_t *q;
16539 + struct ldev_private *ldev_prv;
16541 + ldev_prv = disk->private;
16542 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16543 + disk->total_vsectors) {
16544 + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16545 + q = blk_get_queue(bh->b_rdev);
16547 + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16548 + q->make_request_fn(q, READ, bh);
16552 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16558 + ldev_mgr_io_error(disk, READ, bh, rc);
16562 +/********************************************************/
16563 +/* Required Plugin Function Table Entry Point: */
16564 +/* Write function */
16565 +/********************************************************/
16568 +ldev_mgr_write(struct evms_logical_node *disk, struct buffer_head *bh)
16571 + request_queue_t *q;
16572 + struct ldev_private *ldev_prv;
16574 + ldev_prv = disk->private;
16575 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
16576 + disk->total_vsectors) {
16577 + bh->b_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
16578 + q = blk_get_queue(bh->b_rdev);
16580 + disk->flags &= ~EVMS_DEVICE_UNAVAILABLE;
16581 + q->make_request_fn(q, WRITE, bh);
16585 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
16591 + ldev_mgr_io_error(disk, WRITE, bh, rc);
16595 +/********************************************************/
16596 +/* Required Plugin Function Table Entry Point: */
16597 +/* Init_io function & Support routines */
16598 +/********************************************************/
16601 + * function: allocate_bh
16603 + * This function obtains a buffer head from the private
16604 + * buffer head pool (pre-allocated at EVMS initial
16605 + * discovery time).
16607 + * NOTE: All access to the buffer head pool are protected
16608 + * by a private spinlock.
16611 +static inline struct buffer_head *
16614 + struct buffer_head *bh =
16615 + evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
16617 + init_waitqueue_head(&bh->b_wait);
16623 + * function: deallocate_bh
16625 + * This function returns a buffer head to the private
16626 + * buffer head pool (pre-allocated at EVMS initial
16627 + * discovery time).
16629 + * NOTE: All access to the buffer head pool are protected
16630 + * by a private spinlock.
16633 +static inline void
16634 +deallocate_bh(struct buffer_head *bh)
16636 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
16639 +/* this is the buffer head control block structure definition */
16640 +typedef struct bh_cb_s {
16642 + atomic_t blks_allocated;
16643 + wait_queue_head_t cb_wait;
16647 + * function: __wait_on_bh_cb
16649 + * This is a worker function to wait_on_bh_cb.
16650 + * This function waits for a set of private buffer heads
16651 + * associated to the specified buffer head control block
16652 + * to return from I/O completion. On completion of the
16653 + * last buffer head, the calling function is awakened
16654 + * and continues running.
16656 + * This is the worker function to the function wait_on_bh_cb.
16660 +__wait_on_bh_cb(bh_cb_t * bh_cb)
16662 + struct task_struct *tsk = current;
16663 + DECLARE_WAITQUEUE(wait, tsk);
16665 + add_wait_queue(&bh_cb->cb_wait, &wait);
16667 + run_task_queue(&tq_disk);
16668 + set_task_state(tsk, TASK_UNINTERRUPTIBLE);
16669 + if (!atomic_read(&bh_cb->blks_allocated))
16672 + } while (atomic_read(&bh_cb->blks_allocated));
16673 +#ifdef O1_SCHEDULER
16674 + set_task_state(tsk, TASK_RUNNING);
16676 + tsk->state = TASK_RUNNING;
16678 + remove_wait_queue(&bh_cb->cb_wait, &wait);
16682 + * function: wait_on_bh_cb
16684 + * This function waits for a set of private buffer heads
16685 + * associated to the specified buffer head control block
16686 + * to return from I/O completion. On completion of the
16687 + * last buffer head, the calling function is awakened
16688 + * and continues running.
16692 +wait_on_bh_cb(bh_cb_t * bh_cb)
16694 + if (atomic_read(&bh_cb->blks_allocated))
16695 + __wait_on_bh_cb(bh_cb);
16697 + /* if we ended up with no buffer heads on
16698 + * this pass, lets wait a until a few buffer
16699 + * heads have been freed and try again. This
16700 + * should provide a reasonable delay.
16706 + * function: end_bh_cb_io
16708 + * This is the I/O completion function that is called for
16709 + * each private buffer head obtained from the buffer head
16710 + * pool. Control is return thru this routine so we can track
16711 + * all outstanding requests to know when to awaken the caller,
16712 + * and to regain control after all I/Os have been performed.
16716 +end_bh_cb_io_sync(struct buffer_head *bh, int uptodate)
16718 + bh_cb_t *bh_cb = (bh_cb_t *) bh->b_private;
16720 + /* record that errors occurred */
16722 + bh_cb->rc = -EIO;
16724 + mark_buffer_uptodate(bh, uptodate);
16725 + unlock_buffer(bh);
16727 + deallocate_bh(bh);
16728 + atomic_dec(&bh_cb->blks_allocated);
16729 + if (!atomic_read(&bh_cb->blks_allocated))
16730 + if (waitqueue_active(&bh_cb->cb_wait))
16731 + wake_up(&bh_cb->cb_wait);
16735 + * function: ldev_partial_sector_init_io
16737 + * This function is a support function for ldev_init_io,
16738 + * which handles the cases of performing I/O to only a part
16739 + * of non-standard sized hardsector. This function is not
16740 + * designed to be called directly, but via ldev_init_io.
16744 +ldev_partial_sector_init_io(struct evms_logical_node *node,
16750 + void *bufptr, unsigned char **sector_buf)
16753 + struct ldev_private *ldev_prv = node->private;
16754 + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16755 + struct buffer_head *bh;
16757 + if (*sector_buf == NULL) {
16758 + /* allocate buffer for incoming sector */
16759 + *sector_buf = kmalloc(node->hardsector_size, GFP_KERNEL);
16760 + if (!*sector_buf)
16763 + /* allocate a buffer head from the pool */
16764 + while ((bh = allocate_bh()) == NULL)
16765 + /* yielding the cpu is playing it
16766 + * safe. it might be wiser to just
16767 + * spin. requires more thought.
16771 + /* set up the buffer head for this sector */
16772 + bh->b_end_io = end_bh_cb_io_sync;
16773 + bh->b_size = node->hardsector_size;
16774 + bh->b_rdev = dev;
16775 + bh->b_rsector = next_lsn - sector_lsn;
16776 + bh->b_data = *sector_buf;
16777 + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16779 + set_bit(BH_Dirty, &bh->b_state);
16780 + set_bit(BH_Lock, &bh->b_state);
16781 + set_bit(BH_Req, &bh->b_state);
16782 + set_bit(BH_Mapped, &bh->b_state);
16783 + bh->b_private = (void *) bh_cb;
16784 + atomic_inc(&bh_cb->blks_allocated);
16786 + /* drive the buffer head down */
16787 + /* to the device */
16788 + generic_make_request(READ, bh);
16790 + /* wait for all bh's I/O's to end */
16791 + wait_on_bh_cb(bh_cb);
16793 + /* copy data to/from user */
16794 + if (io_flag != WRITE)
16797 + *sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16798 + io_size << EVMS_VSECTOR_SIZE_SHIFT);
16801 + memcpy(*sector_buf + (sector_lsn << EVMS_VSECTOR_SIZE_SHIFT),
16802 + bufptr, io_size << EVMS_VSECTOR_SIZE_SHIFT);
16804 + /* allocate a buffer head from the pool */
16805 + while ((bh = allocate_bh()) == NULL)
16806 + /* yielding the cpu is playing it
16807 + * safe. it might be wiser to just
16808 + * spin. requires more thought.
16812 + /* set up the buffer head for this sector */
16813 + bh->b_end_io = end_bh_cb_io_sync;
16814 + bh->b_size = node->hardsector_size;
16815 + bh->b_rdev = dev;
16816 + bh->b_rsector = next_lsn - sector_lsn;
16817 + bh->b_data = *sector_buf;
16818 + bh->b_page = virt_to_page(*sector_buf); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16820 + set_bit(BH_Dirty, &bh->b_state);
16821 + set_bit(BH_Lock, &bh->b_state);
16822 + set_bit(BH_Req, &bh->b_state);
16823 + set_bit(BH_Mapped, &bh->b_state);
16824 + bh->b_private = (void *) bh_cb;
16825 + atomic_inc(&bh_cb->blks_allocated);
16827 + /* drive the buffer head down */
16828 + /* to the device */
16829 + generic_make_request(WRITE, bh);
16831 + /* wait for all bh's I/O's to end */
16832 + wait_on_bh_cb(bh_cb);
16838 + * function: ldev_init_io
16840 + * This function provides support for synchronous I/O
16841 + * operations to the underlying devices. These I/O
16842 + * operations are NOT buffered in any way including the
16843 + * operating system's buffer cache.
16845 + * This function can work with any hardsector size that
16846 + * is a power of 2.
16848 + * node : logical node of the target logical disk
16849 + * io_flag : 0 = read, 1 = write, 2 = read-a-head
16850 + * starting_lsn : the 0-based (disk relative) logical
16851 + * : (512 byte) sector number (lsn)
16852 + * num_lsns : the total number of lsns in this I/O
16853 + * bufptr : address of the memory to read/write the data
16857 +ldev_init_io(struct evms_logical_node *node,
16859 + u64 starting_lsn, u64 num_lsns, void *bufptr)
16861 + int rc = 0, lsns_per_hardsector, lsns_per_blocksize;
16862 + unchar *sector_buf = NULL, *cur_bufptr;
16863 + u64 next_lsn, remaining_lsns, sector_lsn;
16864 + struct ldev_private *ldev_prv = node->private;
16865 + kdev_t dev = MKDEV(ldev_prv->major, ldev_prv->minor);
16869 + ("%s Entry: Disk(%u,%u), ioflag(%u), start_lsn("PFU64"), num_lsns("PFU64"), bufptr(0x%p)\n",
16870 + __FUNCTION__, ldev_prv->major, ldev_prv->minor, io_flag,
16871 + starting_lsn, num_lsns, bufptr);
16873 + /* check for valid device */
16874 + if (!blk_size[ldev_prv->major][ldev_prv->minor]) {
16875 + node->flags |= EVMS_DEVICE_UNAVAILABLE;
16878 + /* check for 0 length request */
16879 + if (num_lsns == 0) {
16880 + LOG_ERROR("%s: error requesting 0 sectors.\n", __FUNCTION__);
16881 + return (-EINVAL);
16883 + /* check for out of bound request */
16884 + if ((starting_lsn + num_lsns) > node->total_vsectors) {
16886 + ("%s: attempted %s beyond logical disk boundary("PFU64" LSNs), requesting LSN("PFU64"), total LSNs("PFU64").\n",
16887 + __FUNCTION__, (io_flag == WRITE) ? "WRITE" : "READ",
16888 + node->total_vsectors, starting_lsn, num_lsns);
16889 + return (-EINVAL);
16891 + /* check for invalid io_flag value */
16892 + switch (io_flag) {
16893 + case READ: /* read... */
16894 + case WRITE: /* write... */
16895 + case READA: /* reada... */
16898 + return (-EINVAL);
16901 + /* compute some per device info once up-front */
16902 + lsns_per_hardsector = node->hardsector_size / EVMS_VSECTOR_SIZE;
16903 + lsns_per_blocksize = node->block_size / EVMS_VSECTOR_SIZE;
16905 + /* initialize the buffer head control block */
16906 + memset(&bh_cb, 0, sizeof (bh_cb_t));
16907 + init_waitqueue_head(&bh_cb.cb_wait);
16908 + bh_cb.blks_allocated = (atomic_t)ATOMIC_INIT(0);
16910 + /* only update the local copy of variables */
16911 + cur_bufptr = bufptr;
16912 + next_lsn = starting_lsn;
16913 + remaining_lsns = num_lsns;
16915 + /* check for a mid-sector starting offset
16917 + * if found, perform I/O on part of that
16920 + sector_lsn = next_lsn & (lsns_per_hardsector - 1);
16921 + if (sector_lsn) {
16924 + /* determine bytes in IO to this sector */
16925 + io_size = lsns_per_hardsector - sector_lsn;
16926 + if (io_size > remaining_lsns)
16927 + io_size = remaining_lsns;
16929 + /* perform the partial sector io */
16930 + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
16932 + sector_lsn, io_size,
16933 + cur_bufptr, §or_buf);
16936 + /* update progress in local variables */
16937 + cur_bufptr += io_size << EVMS_VSECTOR_SIZE_SHIFT;
16938 + next_lsn += io_size;
16939 + remaining_lsns -= io_size;
16943 + /* continue if no errors found */
16945 + /* perform I/O on all the complete sectors
16946 + * in this request.
16948 + * loop until there are no more complete sectors
16951 + while (remaining_lsns >= lsns_per_hardsector) {
16952 + /* this inner loop attempts to drive as many
16953 + * bytes (in sector size multiples) down to
16954 + * the device as possible using the available
16955 + * buffer heads in the pool.
16957 + while (remaining_lsns >= lsns_per_hardsector) {
16958 + struct buffer_head *bh;
16960 + /* allocate a buffer head from the pool */
16961 + bh = allocate_bh();
16965 + /* set up the buffer head for this I/O */
16966 + bh->b_end_io = end_bh_cb_io_sync;
16968 + (remaining_lsns >= lsns_per_blocksize) ?
16969 + node->block_size : node->hardsector_size;
16970 + bh->b_data = cur_bufptr;
16971 + bh->b_rdev = dev;
16972 + bh->b_rsector = next_lsn;
16973 + bh->b_page = virt_to_page(cur_bufptr); /* this isn't handling the case of a block with more than 1 sector, that spans pages */
16975 + set_bit(BH_Dirty, &bh->b_state);
16976 + set_bit(BH_Lock, &bh->b_state);
16977 + set_bit(BH_Req, &bh->b_state);
16978 + set_bit(BH_Mapped, &bh->b_state);
16979 + bh->b_private = (void *) &bh_cb;
16980 + atomic_inc(&bh_cb.blks_allocated);
16982 + /* drive the buffer head down */
16983 + /* to the device */
16984 + generic_make_request(io_flag, bh);
16986 + /* update progress in local variables */
16987 + cur_bufptr += bh->b_size;
16989 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16990 + remaining_lsns -=
16991 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
16993 + /* wait for all bh's I/O's to end */
16994 + wait_on_bh_cb(&bh_cb);
16998 + /* continue if no errors found */
17000 + /* check for a mid-sector ending offset
17002 + * if found, perform I/O on part of that
17005 + if (remaining_lsns)
17006 + /* perform the partial sector io */
17007 + rc = ldev_partial_sector_init_io(node, io_flag, &bh_cb,
17009 + 0, remaining_lsns,
17013 + /* free the sector buffer if it was allocated */
17015 + kfree(sector_buf);
17017 + /* coalesce return codes */
17020 + LOG_EVERYTHING("%s Exit: rc(%u)\n", __FUNCTION__, rc);
17026 +ldev_mgr_direct_ioctl(struct inode *inode,
17027 + struct file *file, unsigned int cmd, unsigned long arg)
17030 + struct ldev_private *ldev_prv;
17031 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
17032 + struct ldev_plugin_ioctl pi_data;
17033 + struct evms_logical_node *disk;
17035 + MOD_INC_USE_COUNT;
17037 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
17038 + /* copy user's parameters to kernel space */
17039 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
17043 + /* validate its meant for us */
17044 + if (tmp.feature_id != plugin_header.id) {
17050 + /* copy feature ioctl data to kernel space */
17051 + if (copy_from_user(&pi_data, tmp.feature_ioctl_data,
17052 + sizeof (pi_data))) {
17058 + /* find the disk node specified by the disk_handle */
17059 + int done = FALSE;
17062 + rc = evms_cs_find_next_device(disk,
17072 + DEV_HANDLE_TO_NODE(pi_data.disk_handle)) {
17079 + /* perform feature command */
17080 + ldev_prv = (struct ldev_private *) disk->private;
17081 + switch (tmp.feature_command) {
17083 + case LDEV_MGR_BROADCAST_IOCTL_CMD:
17084 + save_dev = inode->i_rdev;
17086 + MKDEV(ldev_prv->major, ldev_prv->minor);
17087 + rc = ldev_prv->bdev->bd_op->ioctl(inode, file,
17090 + inode->i_rdev = save_dev;
17098 + /* return status value */
17100 + copy_to_user((struct evms_plugin_ioctl_pkt *) arg, &tmp, sizeof (tmp));
17101 + MOD_DEC_USE_COUNT;
17105 +/********************************************************/
17106 +/* Required Plugin Function Table Entry Point: */
17107 +/* IOCTL function & Support routines */
17108 +/********************************************************/
17111 +ldev_mgr_ioctl(struct evms_logical_node *disk,
17112 + struct inode *inode,
17113 + struct file *file, unsigned int cmd, unsigned long arg)
17116 + struct ldev_private *ldev_prv = disk->private;
17118 + struct block_device *save_bdev;
17120 + if (!inode || !disk)
17123 + save_dev = inode->i_rdev;
17124 + inode->i_rdev = MKDEV(ldev_prv->major, ldev_prv->minor);
17125 + save_bdev = inode->i_bdev;
17126 + inode->i_bdev = ldev_prv->bdev;
17127 + /* check device availability */
17128 + if (!blk_get_queue(MKDEV(ldev_prv->major, ldev_prv->minor))) {
17129 + disk->flags |= EVMS_DEVICE_UNAVAILABLE;
17132 + case EVMS_QUIESCE_VOLUME:
17133 + case EVMS_PLUGIN_IOCTL:
17135 + case EVMS_GET_BMAP:
17137 + struct evms_get_bmap_pkt *bmap =
17138 + (struct evms_get_bmap_pkt *) arg;
17139 + bmap->dev = MKDEV(ldev_prv->major, ldev_prv->minor);
17140 + bmap->status = 0;
17143 + case EVMS_OPEN_VOLUME:
17144 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17147 + rc = ldev_prv->bdev->bd_op->open(inode, file);
17150 + case EVMS_CLOSE_VOLUME:
17151 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17154 + rc = ldev_prv->bdev->bd_op->release(inode, file);
17157 + case EVMS_CHECK_MEDIA_CHANGE:
17158 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17161 + /* once we detect that media changed
17162 + * is 'set', don't send any more ioctls
17163 + * down to the device, until the
17164 + * media change has been 'reset' by a
17165 + * revalidate disk ioctl. when already
17166 + * 'set', just return a 1 w/o actually
17167 + * performing another ioctl call to the
17170 + if (ldev_prv->media_changed == TRUE) {
17174 + rc = ldev_prv->bdev->bd_op->
17175 + check_media_change(MKDEV
17176 + (ldev_prv->major,
17177 + ldev_prv->minor));
17179 + ldev_prv->media_changed = TRUE;
17180 + disk->flags |= EVMS_MEDIA_CHANGED;
17184 + case EVMS_REVALIDATE_DISK:
17185 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17188 + /* don't actually send this ioctl down
17189 + * to the device, until we know that
17190 + * previous check media change ioctl
17193 + * when we do actually send the ioctl
17194 + * down, reset the local media_changed
17197 + if (ldev_prv->media_changed == FALSE)
17199 + rc = ldev_prv->bdev->bd_op->
17201 + (ldev_prv->major, ldev_prv->minor));
17202 + ldev_prv->media_changed = FALSE;
17205 + case EVMS_GET_DISK_LIST:
17206 + rc = evms_cs_add_item_to_list((struct evms_list_node **) arg,
17211 + case EVMS_CHECK_DEVICE_STATUS:
17213 + int *status = (int *) arg;
17214 + *status |= disk->flags;
17217 + case EVMS_UPDATE_DEVICE_INFO:
17218 + /* determine hardsector size */
17219 + disk->hardsector_size = 512;
17220 + if (hardsect_size[ldev_prv->major]) {
17221 + disk->hardsector_size = hardsect_size[ldev_prv->major][ldev_prv->minor];
17223 + /* save the block size */
17224 + disk->block_size = 1024;
17225 + if (blksize_size[ldev_prv->major]) {
17226 + disk->block_size = blksize_size[ldev_prv->major][ldev_prv->minor];
17228 + /* device size in sectors
17230 + * try 64bit size first, if that fails
17231 + * fall back on the 32bit size.
17233 + /* try 64bit size */
17234 +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,18)
17235 + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE64,
17236 + (ulong) & disk->total_vsectors);
17238 + /* convert bytes to 512 byte sectors */
17239 + disk->total_vsectors >>= EVMS_VSECTOR_SIZE_SHIFT;
17243 + /* try 32bit size */
17244 + ulong dev_size = 0;
17245 + rc = evms_cs_kernel_ioctl(disk, BLKGETSIZE,
17246 + (ulong) & dev_size);
17247 + disk->total_vsectors = dev_size;
17251 + if (disk->flags & EVMS_DEVICE_UNAVAILABLE) {
17254 + rc = ldev_prv->bdev->bd_op->ioctl(inode, file, cmd,
17259 + inode->i_bdev = save_bdev;
17260 + inode->i_rdev = save_dev;
17265 +/********************************************************/
17266 +/* Required Module Entry Point: */
17267 +/* ldev_mgr_init */
17268 +/********************************************************/
17271 +ldev_mgr_init(void)
17273 + return evms_cs_register_plugin(&plugin_header);
17276 +static void __exit
17277 +ldev_mgr_exit(void)
17279 + evms_cs_unregister_plugin(&plugin_header);
17282 +module_init(ldev_mgr_init);
17283 +module_exit(ldev_mgr_exit);
17284 +#ifdef MODULE_LICENSE
17285 +MODULE_LICENSE("GPL");
17287 diff -Naur linux-2002-09-30/drivers/evms/lvm_vge.c evms-2002-09-30/drivers/evms/lvm_vge.c
17288 --- linux-2002-09-30/drivers/evms/lvm_vge.c Wed Dec 31 18:00:00 1969
17289 +++ evms-2002-09-30/drivers/evms/lvm_vge.c Fri Sep 13 16:45:06 2002
17291 +/* -*- linux-c -*- */
17293 + * Copyright (c) International Business Machines Corp., 2000
17295 + * This program is free software; you can redistribute it and/or modify
17296 + * it under the terms of the GNU General Public License as published by
17297 + * the Free Software Foundation; either version 2 of the License, or
17298 + * (at your option) any later version.
17300 + * This program is distributed in the hope that it will be useful,
17301 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
17302 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
17303 + * the GNU General Public License for more details.
17305 + * You should have received a copy of the GNU General Public License
17306 + * along with this program; if not, write to the Free Software
17307 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17310 + * linux/drivers/evms/lvm_vge.c
17312 + * EVMS Linux LVM Region Manager
17315 +#define LOG_PREFIX "lvm: "
17317 +#include <linux/kernel.h>
17318 +#include <linux/module.h>
17319 +#include <linux/vmalloc.h>
17320 +#include <linux/version.h>
17321 +#include <asm/uaccess.h>
17323 +#include <linux/evms/evms.h>
17324 +#include <linux/evms/evms_lvm.h>
17326 +/* Plugin API prototypes. */
17327 +static int lvm_discover(struct evms_logical_node ** evms_node_list);
17328 +static int lvm_discover_end(struct evms_logical_node ** evms_node_list);
17329 +static int lvm_delete_node(struct evms_logical_node * logical_node);
17330 +static void lvm_read(struct evms_logical_node * node, struct buffer_head * bh);
17331 +static void lvm_write(struct evms_logical_node * node, struct buffer_head * bh);
17332 +static int lvm_init_io(struct evms_logical_node * node,
17336 + void * buf_addr);
17337 +static int lvm_ioctl(struct evms_logical_node * logical_node,
17338 + struct inode * inode,
17339 + struct file * file,
17340 + unsigned int cmd,
17341 + unsigned long arg);
17342 +static int lvm_direct_ioctl(struct inode * inode,
17343 + struct file * file,
17344 + unsigned int cmd,
17345 + unsigned long args);
17347 +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
17348 + u64 snap_sector);
17350 +/* LVM Plugin function table and header. */
17351 +static struct evms_plugin_fops lvm_fops = {
17352 + .discover = lvm_discover,
17353 + .end_discover = lvm_discover_end,
17354 + .delete = lvm_delete_node,
17355 + .read = lvm_read,
17356 + .write = lvm_write,
17357 + .init_io = lvm_init_io,
17358 + .ioctl = lvm_ioctl,
17359 + .direct_ioctl = lvm_direct_ioctl
17362 +static struct evms_plugin_header lvm_plugin_header = {
17363 + .id = SetPluginID(IBM_OEM_ID,
17364 + EVMS_REGION_MANAGER,
17367 + .major = EVMS_LVM_VERSION_MAJOR,
17368 + .minor = EVMS_LVM_VERSION_MINOR,
17369 + .patchlevel = EVMS_LVM_VERSION_PATCH
17371 + .required_services_version = {
17376 + .fops = &lvm_fops
17379 +static struct lvm_volume_group * lvm_group_list = NULL;
17380 +static struct proc_dir_entry * lvm_proc = NULL;
17383 +/********** Miscellaneous Functions **********/
17389 + * @org_sector: Logical sector to remap.
17390 + * @size: Size (in sectors) of request to remap.
17391 + * @new_sector: Remapped sector.
17392 + * @new_size: New size (in sectors).
17393 + * @pe_start_sector: Starting sector of PE - needed for snapshotting.
17394 + * @pv_entry: New node for which new_sector is relative.
17396 + * Common function to remap LV lba to PV lba in appropriate PE. This
17397 + * function needs to deal with requests that span PEs and/or stripes. If
17398 + * this occurs, the request will simply be chopped off at the boundary of
17399 + * the first PE/stripe. It is up to the calling function to loop
17400 + * accordingly to finish the full remapping. This function is now partially
17401 + * 64-bit enabled. The striping section contains code that currently requires
17402 + * at least one mod operation on 64-bit values.
17404 +static int remap_sector(struct evms_logical_node * node,
17407 + u64 * new_sector,
17409 + u64 * pe_start_sector,
17410 + struct lvm_physical_volume ** pv_entry)
17412 + struct lvm_logical_volume * volume = node->private;
17413 + struct le_table_entry * le_entry;
17414 + u32 le, offset_in_le;
17416 + *new_size = size;
17418 + if ( volume->stripes > 1 ) {
17419 + /* Volume is striped. Reset the size if the request crosses
17420 + * a stripe boundary. Striping in LVM is not 64-bit enabled.
17422 + u32 column, columns, sectors_per_column;
17423 + u32 sector_in_column, stripe_in_column, le_in_column;
17424 + u32 offset_in_stripe, stripe_in_le;
17425 + u32 org_sector32 = org_sector;
17427 + sectors_per_column = volume->stripes * volume->pe_size;
17428 + column = org_sector32 / sectors_per_column;
17429 + sector_in_column = org_sector32 % sectors_per_column;
17430 + stripe_in_column = sector_in_column / volume->stripe_size;
17431 + le_in_column = stripe_in_column % volume->stripes;
17432 + columns = volume->num_le / volume->stripes;
17433 + le = column + (columns * le_in_column);
17435 + offset_in_stripe = org_sector32 % volume->stripe_size;
17436 + stripe_in_le = stripe_in_column / volume->stripes;
17437 + offset_in_le = offset_in_stripe +
17438 + stripe_in_le * volume->stripe_size;
17440 + if ( offset_in_stripe + size > volume->stripe_size ) {
17441 + *new_size = volume->stripe_size - offset_in_stripe;
17444 + /* Linear volume. Just find LE and offset. Reset the size if
17445 + * the request crosses an LE boundary. This path is 64-bit safe.
17447 + le = org_sector >> volume->pe_size_shift;
17448 + offset_in_le = org_sector & (volume->pe_size - 1);
17450 + if ( offset_in_le + size > volume->pe_size ) {
17451 + *new_size = volume->pe_size - offset_in_le;
17455 + le_entry = &volume->le_map[le];
17456 + *pe_start_sector = le_entry->pe_sector_offset;
17457 + *new_sector = le_entry->pe_sector_offset + offset_in_le;
17458 + *pv_entry = le_entry->owning_pv;
17464 + * add_group_to_list
17466 + * Add a volume group to the end of the LVM global group list.
17468 +static int add_group_to_list(struct lvm_volume_group * group)
17470 + struct lvm_volume_group ** p_group;
17472 + for ( p_group = &lvm_group_list;
17473 + *p_group; p_group = &(*p_group)->next_group ) {
17477 + *p_group = group;
17478 + group->next_group = NULL;
17483 + * remove_group_from_list
17485 + * Remove an LVM volume group from the global LVM list.
17487 +static int remove_group_from_list(struct lvm_volume_group * group)
17489 + struct lvm_volume_group ** p_group;
17491 + for ( p_group = &lvm_group_list;
17492 + *p_group; p_group = &(*p_group)->next_group ) {
17493 + if ( *p_group == group ) {
17494 + *p_group = (*p_group)->next_group;
17495 + group->next_group = NULL;
17504 + * find_group_by_uuid
17506 + * Use the vg_uuid to find the desired volume group.
17508 +static int find_group_by_uuid(u8 * vg_uuid,
17509 + struct lvm_volume_group ** group)
17511 + struct lvm_volume_group * gp;
17513 + for ( gp = lvm_group_list; gp; gp = gp->next_group ) {
17514 + if ( ! memcmp(vg_uuid, gp->vg_uuid, UUID_LEN) ) {
17524 + * find_pv_by_number
17526 + * Search the PV list of the specified volume group, looking for the
17527 + * specified PV number. If found, return a pointer to that PV.
17529 +static struct lvm_physical_volume *
17530 +find_pv_by_number(u32 pv_number,
17531 + struct lvm_volume_group * group)
17533 + struct lvm_physical_volume * pv_entry;
17535 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
17536 + if ( pv_entry->pv_number == pv_number ) {
17544 + * translate_lv_name
17545 + * @lvm_lv_name: Input LVM-style name.
17546 + * @evms_node_name: Output EVMS-style name.
17548 + * In LVM, volumes have names based on their dev-node, which follow the
17549 + * pattern /dev/group_name/volume_name. In EVMS, the same volume needs
17550 + * to appear as /dev/evms/lvm/group_name/volume_name. Thus, the name from
17551 + * the lv_disk_t needs to be translated before copying to the associated
17552 + * node. evms_node_name must point to a NAME_LEN sized buffer.
17554 +static int translate_lv_name(char * lvm_lv_name, char * evms_node_name)
17558 + memset(evms_node_name, 0, NAME_LEN);
17560 + /* Make sure the string starts with /dev/, and skip over it. */
17561 + ptr = strstr(lvm_lv_name, DEV_DIRECTORY);
17562 + if ( ptr != lvm_lv_name ) {
17563 + LOG_SERIOUS("Invalid LV name: %s\n", lvm_lv_name);
17566 + ptr = &ptr[strlen(DEV_DIRECTORY)];
17568 + /* ptr now points to "group_name/volume_name".
17569 + * Use this to create the name for the EVMS node.
17571 + strcpy(evms_node_name, LVM_DEV_DIRECTORY);
17572 + strncat(evms_node_name, ptr, NAME_LEN - strlen(evms_node_name) - 1);
17578 + * check_pv_for_lv
17580 + * Run through all LE maps of all LVs in this group, and make sure the
17581 + * specified PV is not being pointed to by any LEs.
17583 +static int check_pv_for_lv(struct lvm_physical_volume * pv_entry,
17584 + struct lvm_volume_group * group)
17586 + struct lvm_logical_volume * volume;
17589 + for ( i = 1; i <= MAX_LV; i++ ) {
17590 + if ( (volume = group->volume_list[i]) ) {
17591 + for ( j = 0; j < volume->num_le; j++ ) {
17592 + if ( volume->le_map[j].owning_pv == pv_entry ) {
17602 +/********** Metadata I/O Functions **********/
17606 + * endian_convert_pv
17608 + * Endian-neutral conversion for PV structures.
17610 +static inline void endian_convert_pv(struct pv_disk * pv)
17612 + pv->version = le16_to_cpup(&pv->version);
17613 + pv->pv_on_disk.base = le32_to_cpup(&pv->pv_on_disk.base);
17614 + pv->pv_on_disk.size = le32_to_cpup(&pv->pv_on_disk.size);
17615 + pv->vg_on_disk.base = le32_to_cpup(&pv->vg_on_disk.base);
17616 + pv->vg_on_disk.size = le32_to_cpup(&pv->vg_on_disk.size);
17617 + pv->pv_uuidlist_on_disk.base =
17618 + le32_to_cpup(&pv->pv_uuidlist_on_disk.base);
17619 + pv->pv_uuidlist_on_disk.size =
17620 + le32_to_cpup(&pv->pv_uuidlist_on_disk.size);
17621 + pv->lv_on_disk.base = le32_to_cpup(&pv->lv_on_disk.base);
17622 + pv->lv_on_disk.size = le32_to_cpup(&pv->lv_on_disk.size);
17623 + pv->pe_on_disk.base = le32_to_cpup(&pv->pe_on_disk.base);
17624 + pv->pe_on_disk.size = le32_to_cpup(&pv->pe_on_disk.size);
17625 + pv->pv_major = le32_to_cpup(&pv->pv_major);
17626 + pv->pv_number = le32_to_cpup(&pv->pv_number);
17627 + pv->pv_status = le32_to_cpup(&pv->pv_status);
17628 + pv->pv_allocatable = le32_to_cpup(&pv->pv_allocatable);
17629 + pv->pv_size = le32_to_cpup(&pv->pv_size);
17630 + pv->lv_cur = le32_to_cpup(&pv->lv_cur);
17631 + pv->pe_size = le32_to_cpup(&pv->pe_size);
17632 + pv->pe_total = le32_to_cpup(&pv->pe_total);
17633 + pv->pe_allocated = le32_to_cpup(&pv->pe_allocated);
17634 + pv->pe_start = le32_to_cpup(&pv->pe_start);
17640 + * Read in the PV structure from the specified node. If it contains a
17641 + * valid PV signature, allocate a new struct pv_disk and copy the data.
17643 +static int read_pv(struct evms_logical_node * node, struct pv_disk ** pv)
17645 + struct pv_disk * pv_buffer;
17646 + int rc = -ENOMEM;
17650 + /* Buffer for reading the PV metadata. */
17651 + pv_buffer = kmalloc(LVM_PV_DISK_SIZE, GFP_NOIO);
17652 + if (!pv_buffer) {
17653 + LOG_CRITICAL("Error allocating PV metadata buffer for %s\n",
17658 + /* Read the first two sectors. */
17659 + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(LVM_PV_DISK_BASE),
17660 + evms_cs_size_in_vsectors(LVM_PV_DISK_SIZE), pv_buffer);
17662 + LOG_SERIOUS("Error reading PV metadata from %s\n", node->name);
17666 + /* Endian-neutral conversion of PV metadata. */
17667 + endian_convert_pv(pv_buffer);
17669 + /* Check for an LVM signature and make sure the sizes match.
17670 + * Versions 1 and 2 are both valid now. Thanks LVM! :)
17672 + if ( !(pv_buffer->id[0] == 'H' &&
17673 + pv_buffer->id[1] == 'M' &&
17674 + (pv_buffer->version == 1 || pv_buffer->version == 2) &&
17675 + pv_buffer->pv_size == node->total_vsectors) ) {
17676 + LOG_EXTRA("%s is not an LVM PV\n", node->name);
17681 + /* This is a valid PV. Allocate a new pv_disk. */
17682 + *pv = kmalloc(sizeof(struct pv_disk), GFP_NOIO);
17684 + LOG_CRITICAL("Error allocating new PV for %s\n", node->name);
17689 + /* Copy the metadata. */
17690 + memcpy(*pv, pv_buffer, sizeof(struct pv_disk));
17693 + kfree(pv_buffer);
17699 + * endian_convert_vg
17701 + * Endian-neutral conversion for VG structures
17703 +static inline void endian_convert_vg(struct vg_disk * vg)
17705 + vg->vg_number = le32_to_cpup(&vg->vg_number);
17706 + vg->vg_access = le32_to_cpup(&vg->vg_access);
17707 + vg->vg_status = le32_to_cpup(&vg->vg_status);
17708 + vg->lv_max = le32_to_cpup(&vg->lv_max);
17709 + vg->lv_cur = le32_to_cpup(&vg->lv_cur);
17710 + vg->lv_open = le32_to_cpup(&vg->lv_open);
17711 + vg->pv_max = le32_to_cpup(&vg->pv_max);
17712 + vg->pv_cur = le32_to_cpup(&vg->pv_cur);
17713 + vg->pv_act = le32_to_cpup(&vg->pv_act);
17714 + vg->dummy = le32_to_cpup(&vg->dummy);
17715 + vg->vgda = le32_to_cpup(&vg->vgda);
17716 + vg->pe_size = le32_to_cpup(&vg->pe_size);
17717 + vg->pe_total = le32_to_cpup(&vg->pe_total);
17718 + vg->pe_allocated = le32_to_cpup(&vg->pe_allocated);
17719 + vg->pvg_total = le32_to_cpup(&vg->pvg_total);
17725 + * Read in the VG structure from the specified node. Allocate a new
17726 + * struct vg_disk and copy the data.
17728 +static int read_vg(struct evms_logical_node * node,
17729 + struct pv_disk * pv,
17730 + struct vg_disk ** vg)
17732 + struct vg_disk * vg_buffer;
17733 + unsigned long vg_sectors;
17734 + int rc = -ENOMEM;
17736 + /* Allocate a buffer to read the VG metadata. */
17737 + vg_sectors = evms_cs_size_in_vsectors(pv->vg_on_disk.size);
17738 + vg_buffer = kmalloc(vg_sectors << EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
17739 + if (!vg_buffer) {
17740 + LOG_CRITICAL("Error allocating VG metadata buffer for %s\n",
17745 + /* Read the VG metadata. */
17746 + rc = INIT_IO(node, 0, evms_cs_size_in_vsectors(pv->vg_on_disk.base),
17747 + vg_sectors, vg_buffer);
17749 + LOG_SERIOUS("Error reading VG metadata from %s\n", node->name);
17753 + /* Endian-neutral conversion of VG metadata. */
17754 + endian_convert_vg(vg_buffer);
17756 + /* Allocate a new struct vg_disk. */
17757 + *vg = kmalloc(sizeof(struct vg_disk), GFP_NOIO);
17759 + LOG_CRITICAL("Error allocating new VG for %s\n", node->name);
17764 + /* Copy the metadata. */
17765 + memcpy(*vg, vg_buffer, sizeof(struct vg_disk));
17768 + kfree(vg_buffer);
17776 +static int read_uuid_list(struct evms_logical_node * node,
17777 + struct pv_disk * pv,
17778 + struct lvm_volume_group * group)
17780 + u64 start_sector;
17781 + unsigned long total_sectors;
17782 + unsigned char * uuid_buffer;
17783 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17784 + unsigned long uuid_list_size;
17787 + if (group->uuid_list) {
17788 + LOG_EXTRA("Already read PV UUIDs for group %s\n",
17793 + start_sector = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.base);
17794 + total_sectors = evms_cs_size_in_vsectors(pv->pv_uuidlist_on_disk.size);
17795 + uuid_list_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17798 + /* Allocate a buffer to perform the I/Os. */
17799 + uuid_buffer = kmalloc(buffer_size, GFP_NOIO);
17800 + if (!uuid_buffer) {
17801 + LOG_CRITICAL("Error allocating buffer for UUID list in group %s\n",
17807 + /* Allocate memory for the UUID array for this group. */
17808 + group->uuid_list = vmalloc(uuid_list_size);
17809 + if (!group->uuid_list) {
17810 + LOG_CRITICAL("Error allocating UUID list for group %s\n",
17815 + memset(group->uuid_list, 0, uuid_list_size);
17817 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17818 + rc = INIT_IO(node, 0, start_sector + i,
17819 + IO_BUFFER_SECTORS, uuid_buffer);
17821 + LOG_SERIOUS("Error reading PV UUID list from %s\n",
17825 + /* Copy the I/O buffer into the UUID array. */
17826 + memcpy(&(group->uuid_list[i * EVMS_VSECTOR_SIZE]),
17827 + uuid_buffer, buffer_size);
17830 + /* Clear out the unused portion at the end of the uuid_list. */
17831 + memset(&(group->uuid_list[pv->pv_uuidlist_on_disk.size]), 0,
17832 + uuid_list_size - pv->pv_uuidlist_on_disk.size);
17835 + kfree(uuid_buffer);
17840 + vfree(group->uuid_list);
17841 + group->uuid_list = NULL;
17846 + * endian_convert_lv
17848 + * Endian-neutral conversion for LV structures
17850 +static inline void endian_convert_lv(struct lv_disk * lv)
17852 + lv->lv_access = le32_to_cpup(&lv->lv_access);
17853 + lv->lv_status = le32_to_cpup(&lv->lv_status);
17854 + lv->lv_open = le32_to_cpup(&lv->lv_open);
17855 + lv->lv_dev = le32_to_cpup(&lv->lv_dev);
17856 + lv->lv_number = le32_to_cpup(&lv->lv_number);
17857 + lv->lv_mirror_copies = le32_to_cpup(&lv->lv_mirror_copies);
17858 + lv->lv_recovery = le32_to_cpup(&lv->lv_recovery);
17859 + lv->lv_schedule = le32_to_cpup(&lv->lv_schedule);
17860 + lv->lv_size = le32_to_cpup(&lv->lv_size);
17861 + lv->lv_snapshot_minor = le32_to_cpup(&lv->lv_snapshot_minor);
17862 + lv->lv_chunk_size = le16_to_cpup(&lv->lv_chunk_size);
17863 + lv->dummy = le16_to_cpup(&lv->dummy);
17864 + lv->lv_allocated_le = le32_to_cpup(&lv->lv_allocated_le);
17865 + lv->lv_stripes = le32_to_cpup(&lv->lv_stripes);
17866 + lv->lv_stripesize = le32_to_cpup(&lv->lv_stripesize);
17867 + lv->lv_badblock = le32_to_cpup(&lv->lv_badblock);
17868 + lv->lv_allocation = le32_to_cpup(&lv->lv_allocation);
17869 + lv->lv_io_timeout = le32_to_cpup(&lv->lv_io_timeout);
17870 + lv->lv_read_ahead = le32_to_cpup(&lv->lv_read_ahead);
17873 +static inline void endian_convert_lvs(struct lvm_volume_group * group)
17876 + for ( i = 0; i < group->vg->lv_max; i++ ) {
17877 + endian_convert_lv(&(group->lv_array[i]));
17884 + * Read in the LV structures for the specified group. Do the read from
17885 + * the first PV in the group. If that one fails, keep trying on the
17886 + * remaining PVs until one works. This function will allocate a buffer
17887 + * for the group to read in the structures.
17889 +static int read_lv(struct lvm_volume_group * group)
17891 + struct lvm_physical_volume * pv_entry = group->pv_list;
17892 + unsigned char * lv_buffer = NULL;
17893 + u64 start_sector;
17894 + unsigned long total_sectors, lv_array_size = 0;
17895 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
17898 + if (group->lv_array) {
17903 + LOG_ERROR("Group %s has no PVs. Cannot read LV structures.\n",
17908 + /* Allocate a buffer to do the actual I/Os. */
17909 + lv_buffer = kmalloc(buffer_size, GFP_NOIO);
17910 + if (!lv_buffer) {
17911 + LOG_CRITICAL("Error allocating buffer for LV structs for Group %s\n",
17916 + /* Read in the LV structures 4k at a time. If one PV returns errors,
17917 + * start over with the next PV in the group.
17919 + while (rc && pv_entry) {
17920 + start_sector = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.base);
17921 + total_sectors = evms_cs_size_in_vsectors(pv_entry->pv->lv_on_disk.size);
17922 + lv_array_size = round_up(total_sectors * EVMS_VSECTOR_SIZE,
17925 + /* Allocate the buffer for this group to
17926 + * hold the entire LV array.
17928 + if (group->lv_array) {
17929 + vfree(group->lv_array);
17930 + group->lv_array = NULL;
17932 + group->lv_array = vmalloc(lv_array_size);
17933 + if (!group->lv_array) {
17934 + LOG_CRITICAL("Error allocating lv_array buffer for Group %s\n",
17939 + memset(group->lv_array, 0, lv_array_size);
17941 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
17942 + rc = INIT_IO(pv_entry->logical_node, 0,
17943 + start_sector + i, IO_BUFFER_SECTORS,
17946 + LOG_SERIOUS("Error reading LV metadata from %s in Group %s\n",
17947 + pv_entry->logical_node->name,
17950 + /* Try the next PV if the current one
17951 + * caused any errors.
17953 + pv_entry = pv_entry->next;
17956 + /* Copy the I/O buffer into the lv_array. */
17957 + memcpy(&(((char *)(group->lv_array))[i * EVMS_VSECTOR_SIZE]),
17958 + lv_buffer, buffer_size);
17963 + LOG_SERIOUS("Unable to read LV metadata from any PV in Group %s\n",
17968 + /* Clear out the unused portion at the end of the lv_array. */
17969 + memset(&(((char *)(group->lv_array))[pv_entry->pv->lv_on_disk.size]),
17970 + 0, lv_array_size - pv_entry->pv->lv_on_disk.size);
17972 + /* Endian-neutral conversion of the LV metadata. */
17973 + endian_convert_lvs(group);
17976 + kfree(lv_buffer);
17980 + vfree(group->lv_array);
17981 + group->lv_array = NULL;
17986 + * endian_convert_pe_map
17988 + * Endian-neutral conversion for PE structures
17990 +static inline void endian_convert_pe_map(struct lvm_physical_volume * pv_entry)
17993 + for ( i = 0; i < pv_entry->pv->pe_total; i++ ) {
17994 + pv_entry->pe_map[i].lv_num =
17995 + le16_to_cpup(&pv_entry->pe_map[i].lv_num);
17996 + pv_entry->pe_map[i].le_num =
17997 + le16_to_cpup(&pv_entry->pe_map[i].le_num);
18004 + * Read in the PE map for the specified PV. This function will allocate a
18005 + * buffer to read in the data.
18007 +static int read_pe_map(struct lvm_physical_volume * pv_entry)
18009 + struct evms_logical_node * node = pv_entry->logical_node;
18010 + struct pv_disk * pv = pv_entry->pv;
18011 + unsigned char * pe_buffer;
18012 + u64 start_sector;
18013 + unsigned long total_sectors, pe_map_size;
18014 + unsigned long buffer_size = IO_BUFFER_SECTORS * EVMS_VSECTOR_SIZE;
18015 + int i, rc = -ENOMEM;
18017 + if (pv_entry->pe_map) {
18021 + start_sector = evms_cs_size_in_vsectors(pv->pe_on_disk.base);
18022 + total_sectors = evms_cs_size_in_vsectors(pv->pe_total *
18023 + sizeof(struct pe_disk));
18024 + pe_map_size = round_up(total_sectors * EVMS_VSECTOR_SIZE, buffer_size);
18026 + /* Allocate a buffer for performing the I/O. */
18027 + pe_buffer = kmalloc(buffer_size, GFP_NOIO);
18028 + if (!pe_buffer) {
18029 + LOG_CRITICAL("Error allocating buffer for PE maps for %s\n",
18034 + /* Allocate a buffer to hold the PE map for this PV. */
18035 + pv_entry->pe_map = vmalloc(pe_map_size);
18036 + if (!pv_entry->pe_map) {
18037 + LOG_CRITICAL("Error allocating PE map for %s\n", node->name);
18040 + memset(pv_entry->pe_map, 0, pe_map_size);
18042 + for ( i = 0; i < total_sectors; i += IO_BUFFER_SECTORS ) {
18043 + rc = INIT_IO(node, 0, start_sector + i,
18044 + IO_BUFFER_SECTORS, pe_buffer);
18046 + LOG_SERIOUS("Error reading PE maps from %s.\n",
18050 + /* Copy the data to the actual PE map. */
18051 + memcpy(&(((char *)(pv_entry->pe_map))[i * EVMS_VSECTOR_SIZE]),
18052 + pe_buffer, buffer_size);
18055 + /* Clear out the unused portion at the end of the PE map. */
18056 + memset(&(((char *)(pv_entry->pe_map))[total_sectors * EVMS_VSECTOR_SIZE]),
18057 + 0, pe_map_size - total_sectors * EVMS_VSECTOR_SIZE);
18059 + /* Endian-neutral conversion of the PE metadata. */
18060 + endian_convert_pe_map(pv_entry);
18063 + kfree(pe_buffer);
18068 + vfree(pv_entry->pe_map);
18069 + pv_entry->pe_map = NULL;
18074 +/********** Snapshot Manipulation Functions **********/
18078 + * snapshot_check_quiesce_original
18080 + * For this snapshot LV, check that both it and its original are quiesced.
18083 +snapshot_check_quiesce_original(struct lvm_logical_volume * snap_volume)
18085 + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18087 + if ( ! (snap_volume->lv_access & EVMS_LV_QUIESCED) ) {
18091 + if ( org_volume && !(org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18099 + * snapshot_check_quiesce_all
18101 + * Go through the list of all snapshots for an original volume, and make
18102 + * sure everyone is in a quiesced state.
18104 +static int snapshot_check_quiesce_all(struct lvm_logical_volume * org_volume)
18106 + struct lvm_logical_volume * snap;
18108 + if ( ! (org_volume->lv_access & EVMS_LV_QUIESCED) ) {
18112 + for ( snap = org_volume->snapshot_next;
18113 + snap; snap = snap->snapshot_next ) {
18114 + if ( ! (snap->lv_access & EVMS_LV_QUIESCED) ) {
18123 + * invalidate_snapshot_volume
18125 + * In the event a snapshot volume becomes full or corrupted, its metadata
18126 + * must be altered in order to prevent it from being used again. Write some
18127 + * invalid data into the first entry of the COW table. If this volume is
18128 + * not fully deleted by the user/engine, this invalid COW entry will be
18129 + * detected by build_snapshot_maps(), and will cause the volume to be
18130 + * deleted before being exported to EVMS during discover. This is obviously
18131 + * a hack, but it is the same hack currently used by LVM. We're just trying
18132 + * to be compatible. :)
18134 +static int invalidate_snapshot_volume(struct lvm_logical_volume * snap_volume)
18136 + struct evms_logical_node tmp_node;
18138 + tmp_node.private = snap_volume;
18139 + tmp_node.total_vsectors = snap_volume->lv_size;
18141 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18142 + LOG_WARNING("Volume %s is not a snapshot. Cannot invalidate\n",
18143 + snap_volume->name);
18147 + LOG_WARNING("Invalidating full/corrupt snapshot %s\n",
18148 + snap_volume->name);
18149 + LOG_WARNING("Run the EVMS administration tools to remove this snapshot.\n");
18151 + if (snap_volume->cow_table) {
18152 + snap_volume->cow_table[0].pv_org_rsector =
18153 + cpu_to_le64(((u64)1));
18154 + if ( lvm_init_io(&tmp_node, 4, 0, 1, snap_volume->cow_table) ) {
18155 + LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18156 + snap_volume->name);
18159 + LOG_SERIOUS("Unable to invalidate snapshot %s\n",
18160 + snap_volume->name);
18163 + snap_volume->lv_status &= ~LV_ACTIVE;
18168 + * remove_snapshot_from_chain
18170 + * Remove a snapshot volume from its original's chain of snapshots. This
18171 + * does not delete the snapshot volume. At runtime, we cannot delete
18172 + * volumes at the region-manager level, because EVMS may have this volume
18173 + * exported, and there is no way to notify EVMS of the deletion. It will
18174 + * eventually need to be deleted in the engine, which will then tell the
18175 + * EVMS kernel services to delete the volume in the kernel.
18177 +static int remove_snapshot_from_chain(struct lvm_logical_volume * snap_volume)
18179 + struct lvm_logical_volume * org_volume = snap_volume->snapshot_org;
18180 + struct lvm_logical_volume ** p_volume;
18182 + if (org_volume) {
18183 + for ( p_volume = &org_volume->snapshot_next;
18185 + p_volume = &(*p_volume)->snapshot_next ) {
18186 + if ( *p_volume == snap_volume ) {
18187 + *p_volume = snap_volume->snapshot_next;
18193 + snap_volume->snapshot_org = NULL;
18194 + snap_volume->snapshot_next = NULL;
18201 + * The snapshot hash tables are NEVER going to have 4 billion entries, so
18202 + * we can safely cast the org_sector to 32 bits and just mod it by the
18203 + * hash table size.
18205 +static u32 snapshot_hash(u64 org_sector,
18206 + struct lvm_logical_volume * snap_volume)
18208 + return (((u32)org_sector) % snap_volume->hash_table_size);
18212 + * snapshot_search_hash_chain
18214 + * Search the hash chain that is anchored at the specified head pointer.
18215 + * If the sector number is found, the result pointer is set to that entry
18216 + * in the chain, and a 1 is returned. If the sector is not found, the
18217 + * result pointer is set to the previous entry and 0 is returned. If the
18218 + * result pointer is NULL, this means either the list is empty, or the
18219 + * specified sector should become the first list item.
18221 +static int snapshot_search_hash_chain(u64 org_sector,
18222 + struct snapshot_map_entry * head,
18223 + struct snapshot_map_entry ** result)
18225 + struct snapshot_map_entry * curr = head;
18226 + struct snapshot_map_entry * prev = head;
18227 + while ( curr && curr->org_sector < org_sector ) {
18229 + curr = curr->next;
18232 + /* Either an empty chain or went off the end of the chain. */
18235 + } else if ( curr->org_sector != org_sector ) {
18236 + *result = curr->prev;
18239 + /* Found the desired sector. */
18246 + * insert_snapshot_map_entry
18248 + * Insert a new entry into a snapshot hash chain, immediately following the
18249 + * specified entry. This function should not be used to add an entry into
18250 + * an empty list, or as the first entry in an existing list. For that case,
18251 + * use insert_snapshot_map_entry_at_head().
18253 +static int insert_snapshot_map_entry(struct snapshot_map_entry * entry,
18254 + struct snapshot_map_entry * base)
18256 + entry->next = base->next;
18257 + entry->prev = base;
18258 + base->next = entry;
18259 + if (entry->next) {
18260 + entry->next->prev = entry;
18266 + * insert_snapshot_map_entry_at_head
18268 + * Insert a new entry into a snapshot chain as the first entry.
18270 +static int insert_snapshot_map_entry_at_head(struct snapshot_map_entry * entry,
18271 + struct snapshot_map_entry ** head)
18273 + entry->next = *head;
18274 + entry->prev = NULL;
18276 + if (entry->next) {
18277 + entry->next->prev = entry;
18283 + * add_cow_entry_to_snapshot_map
18285 + * Convert a cow table entry (from the on-disk data) into an appropriate
18286 + * entry for the snapshot map. Insert this new entry into the appropriate
18287 + * map for the specified volume.
18289 + * The cow_entry passed into this function must have already been
18290 + * endian-converted from disk-order to cpu-order.
18292 +static int add_cow_entry_to_snapshot_map(struct lv_COW_table_disk * cow_entry,
18293 + struct lvm_logical_volume * volume)
18295 + struct snapshot_map_entry * new_entry, * target_entry;
18296 + struct snapshot_map_entry ** hash_table, * chain_head;
18299 + if ( cow_entry->pv_org_number == 0 ) {
18303 + new_entry = allocate_snapshot_map_entry(cow_entry->pv_org_rsector,
18304 + cow_entry->pv_snap_rsector);
18305 + if (!new_entry) {
18309 + new_entry->snap_pv = find_pv_by_number(cow_entry->pv_snap_number,
18311 + if (!new_entry->snap_pv) {
18312 + kfree(new_entry);
18316 + hash_value = snapshot_hash(new_entry->org_sector, volume);
18317 + hash_table = volume->snapshot_map[cow_entry->pv_org_number];
18318 + chain_head = hash_table[hash_value];
18319 + if ( snapshot_search_hash_chain(new_entry->org_sector,
18320 + chain_head, &target_entry) ) {
18321 + /* In general, we should not find this entry in the snapshot
18322 + * map already. However, it could happen on a re-discover, but
18323 + * the build_snapshot_maps function should weed out those cases.
18324 + * In either event, we can simply ignore duplicates.
18326 + LOG_WARNING("Detected a duplicate snapshot map entry\n");
18327 + LOG_WARNING("Snap PV "PFU64":"PFU64", Org PV "PFU64":"PFU64"\n",
18328 + cow_entry->pv_snap_number,
18329 + cow_entry->pv_snap_rsector,
18330 + cow_entry->pv_org_number,
18331 + cow_entry->pv_org_rsector);
18332 + kfree(new_entry);
18334 + if (target_entry) {
18335 + insert_snapshot_map_entry(new_entry, target_entry);
18337 + insert_snapshot_map_entry_at_head(new_entry,
18338 + &hash_table[hash_value]);
18346 + * snapshot_remap_sector
18348 + * Perform a sector remap on a snapshot volume. This should be called from
18349 + * the I/O read path, after the LE-to-PE translation has already been
18350 + * performed. First, determine the base sector of the chunk containing the
18351 + * specified sector, and save the remainder. Then, perform a search through
18352 + * the snapshot map for the specified volume. If a match is found, change
18353 + * the PV and sector numbers to the new values. If no match is found, leave
18354 + * the values alone, meaning the read should proceed down the original
18358 +snapshot_remap_sector(struct lvm_logical_volume * snap_volume,
18359 + u64 pe_start_sector,
18361 + struct lvm_physical_volume ** pv_entry)
18363 + struct snapshot_map_entry ** hash_table;
18364 + struct snapshot_map_entry * chain_head, * result;
18366 + u64 chunk_sector, remainder;
18368 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
18372 + chunk_sector = ((*sector - pe_start_sector) &
18373 + ((u64)(~(snap_volume->chunk_size - 1)))) +
18375 + remainder = *sector - chunk_sector;
18376 + hash_value = snapshot_hash(chunk_sector, snap_volume);
18377 + hash_table = snap_volume->snapshot_map[(*pv_entry)->pv_number];
18378 + chain_head = hash_table[hash_value];
18380 + if ( snapshot_search_hash_chain(chunk_sector, chain_head, &result) ) {
18381 + *pv_entry = result->snap_pv;
18382 + *sector = result->snap_sector + remainder;
18387 + * snapshot_read_write_chunk
18389 + * This function takes care of reading one chunk of data from the
18390 + * original, and writing it to the snapshot. Since the original now has
18391 + * a fixed sized buffer for this data, we may have to loop to get the
18392 + * whole chunk copied.
18394 +static int snapshot_read_write_chunk(struct lvm_logical_volume * org_volume,
18395 + struct lvm_physical_volume * org_pv,
18396 + u64 chunk_sector,
18397 + struct lvm_logical_volume * snap_volume,
18398 + struct lvm_physical_volume ** snap_pv,
18399 + u64 * snap_sector)
18401 + u32 io_size = snap_volume->chunk_size;
18402 + u64 snap_pe_start_sector, size;
18403 + int i, iterations = 1;
18405 + if ( org_volume->chunk_size < snap_volume->chunk_size ) {
18406 + iterations = snap_volume->chunk_size / org_volume->chunk_size;
18407 + io_size = org_volume->chunk_size;
18410 + remap_sector(snap_volume->volume_node, snap_volume->next_free_chunk, 1,
18411 + snap_sector, &size, &snap_pe_start_sector, snap_pv);
18413 + /* Check for an incomplete volume. */
18414 + if (!*snap_sector || !*snap_pv) {
18415 + invalidate_snapshot_volume(snap_volume);
18419 + for ( i = 0; i < iterations; i++ ) {
18421 + /* Read the chunk from the original volume. This is a physical
18422 + * read, not logical. Thus, stripe boundary considerations are
18423 + * unnecessary. Also, chunks are always aligned with PEs, so PE
18424 + * boundary considerations are unnecessary.
18426 + if ( INIT_IO(org_pv->logical_node, 0,
18427 + chunk_sector + i * io_size, io_size,
18428 + org_volume->chunk_data_buffer) ) {
18432 + /* Write this chunk to the snapshot volume. This does duplicate
18433 + * the local init_io code, but we need to have the remapped
18434 + * sector later on, so this is slightly more efficient. Snapshot
18435 + * volumes cannot be striped, so there is no need to consider
18436 + * stripe-boundary conditions. And just like the read in the
18437 + * previous line, chunks are always aligned with PEs, so we
18438 + * don't have to consider PE-boundary conditions.
18440 + if ( INIT_IO((*snap_pv)->logical_node, 1,
18441 + *snap_sector + i * io_size, io_size,
18442 + org_volume->chunk_data_buffer) ) {
18443 + /* An error writing the chunk to the snapshot is the
18444 + * same situation as the snapshot being full.
18446 + invalidate_snapshot_volume(snap_volume);
18455 + * snapshot_copy_data
18457 + * On a write to a snapshotted volume, check all snapshots to see if the
18458 + * specified chunk has already been remapped. If it has not, read the
18459 + * original data from the volume, write the data to the next available
18460 + * chunk on the snapshot, update the COW table, write the COW table to
18461 + * the snapshot, and insert a new entry into the snapshot map.
18463 + * Now converted to copy data to a single snapshot. The looping is left
18464 + * up to lvm_write.
18466 +static int snapshot_copy_data(struct lvm_logical_volume * org_volume,
18467 + struct lvm_logical_volume * snap_volume,
18468 + u64 pe_start_sector,
18470 + struct lvm_physical_volume * org_pv)
18472 + struct lvm_physical_volume * snap_pv;
18473 + struct snapshot_map_entry ** hash_table, * chain_head;
18474 + struct snapshot_map_entry * target_entry, * new_map_entry;
18475 + u64 chunk_sector, snap_sector;
18479 + /* Lock out this snapshot while we are remapping. */
18480 + down(&snap_volume->snap_semaphore);
18482 + /* Make sure the snapshot has not been deactivated. */
18483 + if ( ! (snap_volume->lv_status & LV_ACTIVE) ) {
18487 + /* Search the hash table to see if this sector has already been
18488 + * remapped on this snapshot.
18490 + chunk_sector = ((org_sector - pe_start_sector) &
18491 + ((u64)(~(snap_volume->chunk_size - 1)))) +
18493 + hash_value = snapshot_hash(chunk_sector, snap_volume);
18494 + hash_table = snap_volume->snapshot_map[org_pv->pv_number];
18495 + chain_head = hash_table[hash_value];
18497 + if ( snapshot_search_hash_chain(chunk_sector,
18498 + chain_head, &target_entry) ) {
18499 + /* Chunk is already remapped. */
18503 + /* Is there room on the snapshot to remap this chunk? */
18504 + if ( snap_volume->next_free_chunk >= snap_volume->lv_size ) {
18505 + /* At this point, the snapshot is full. Any further
18506 + * writes to the original will cause the snapshot to
18507 + * become "corrupt" because they can't be remapped.
18508 + * Take this snapshot permanently offline.
18510 + goto out_invalidate;
18513 + rc = snapshot_read_write_chunk(org_volume, org_pv, chunk_sector,
18514 + snap_volume, &snap_pv, &snap_sector);
18516 + rc = (rc > 0) ? -EIO : 0;
18520 + /* Fill in the appropriate COW table entry and write that
18521 + * metadata sector back to the snapshot volume. Since we are
18522 + * only writing one sector, there are no boundary conditions.
18523 + * Must endian-convert each entry as it is added.
18525 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_number =
18526 + cpu_to_le64((u64)(org_pv->pv_number));
18527 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_org_rsector =
18528 + cpu_to_le64p(&chunk_sector);
18529 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_number =
18530 + cpu_to_le64((u64)(snap_pv->pv_number));
18531 + snap_volume->cow_table[snap_volume->next_cow_entry].pv_snap_rsector =
18532 + cpu_to_le64p(&snap_sector);
18534 + if ( lvm_init_io(snap_volume->volume_node, 4,
18535 + snap_volume->current_cow_sector,
18536 + 1, snap_volume->cow_table) ) {
18537 + /* The data was written to the snapshot, but
18538 + * writing the metadata failed.
18540 + goto out_invalidate;
18543 + snap_volume->next_cow_entry++;
18544 + if ( snap_volume->next_cow_entry >=
18545 + (EVMS_VSECTOR_SIZE / sizeof (struct lv_COW_table_disk)) ) {
18546 + snap_volume->next_cow_entry = 0;
18547 + snap_volume->current_cow_sector++;
18548 + memset(snap_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18549 + if ( lvm_init_io(snap_volume->volume_node, 4,
18550 + snap_volume->current_cow_sector,
18551 + 1, snap_volume->cow_table) ) {
18552 + /* Can't clear out the next sector of metadata. */
18553 + goto out_invalidate;
18556 + snap_volume->next_free_chunk += snap_volume->chunk_size;
18558 + /* Create a new snapshot map entry and add it in the appropriate
18559 + * place in the map.
18561 + new_map_entry = allocate_snapshot_map_entry(chunk_sector, snap_sector);
18562 + if (!new_map_entry) {
18564 + goto out_invalidate;
18566 + new_map_entry->snap_pv = snap_pv;
18567 + if (target_entry) {
18568 + insert_snapshot_map_entry(new_map_entry, target_entry);
18570 + insert_snapshot_map_entry_at_head(new_map_entry,
18571 + &(hash_table[hash_value]));
18575 + up(&snap_volume->snap_semaphore);
18579 + invalidate_snapshot_volume(snap_volume);
18584 + * get_snapshot_stats
18586 +static int get_snapshot_stats(struct lvm_snapshot_stat_ioctl * snap_stats)
18588 + struct lvm_logical_volume * volume;
18589 + struct lvm_volume_group * group;
18591 + /* Make sure the parameters are in range. */
18592 + if ( snap_stats->lv_number < 1 || snap_stats->lv_number > MAX_LV ) {
18596 + /* Make sure the specified group and volume exist, and that
18597 + * this is a snapshot volume.
18599 + find_group_by_uuid(snap_stats->vg_uuid, &group);
18601 + ! (volume = group->volume_list[snap_stats->lv_number]) ||
18602 + ! (volume->lv_access & LV_SNAPSHOT) ) {
18606 + /* Return the starting LBA of the next available chunk. */
18607 + snap_stats->next_free_chunk = volume->next_free_chunk;
18608 + snap_stats->lv_status = volume->lv_status;
18614 +/********** Memory Allocation/Deallocation Functions **********/
18618 + * deallocate_physical_volume
18620 + * Free the memory used by this physical volume. Do not delete the EVMS
18621 + * node in this function, since this could be called during an error
18622 + * path when we want to save the logical node.
18624 +static int deallocate_physical_volume(struct lvm_physical_volume * pv_entry)
18626 + if (pv_entry->pv) {
18627 + kfree(pv_entry->pv);
18628 + pv_entry->pv = NULL;
18631 + if (pv_entry->pe_map) {
18632 + vfree(pv_entry->pe_map);
18633 + pv_entry->pe_map = NULL;
18641 + * allocate_physical_volume
18643 + * Create a new struct lvm_physical_volume for the specified volume group.
18644 + * Initialize the new PV with the evms node and lvm pv information.
18646 +static struct lvm_physical_volume *
18647 +allocate_physical_volume(struct evms_logical_node * node, struct pv_disk * pv)
18649 + struct lvm_physical_volume * new_pv;
18651 + new_pv = kmalloc(sizeof(struct lvm_physical_volume), GFP_NOIO);
18653 + LOG_CRITICAL("Error allocating physical volume for %s.\n",
18659 + /* Initialize the PV. */
18660 + memset(new_pv, 0, sizeof(struct lvm_physical_volume));
18661 + new_pv->logical_node = node;
18663 + new_pv->pv_number = pv->pv_number;
18670 + * allocate_snapshot_map_entry
18672 + * Allocate memory for a new entry in the snapshot map and fill in the
18673 + * sector values. The PV pointer is not filled in here, but can easily
18674 + * be found by using the find_pv_by_number function.
18676 +static struct snapshot_map_entry * allocate_snapshot_map_entry(u64 org_sector,
18679 + struct snapshot_map_entry * new_entry;
18681 + new_entry = kmalloc(sizeof(struct snapshot_map_entry), GFP_NOIO);
18682 + if (!new_entry) {
18685 + memset(new_entry, 0, sizeof(struct snapshot_map_entry));
18686 + new_entry->org_sector = org_sector;
18687 + new_entry->snap_sector = snap_sector;
18689 + return new_entry;
18693 + * deallocate_snapshot_map
18695 + * This function will delete one hash table, which is part of the whole
18696 + * snapshot remapping structure. Each hash table is an array of pointers
18697 + * to linked lists of struct snapshot_map_entry's.
18699 +static int deallocate_snapshot_map(struct snapshot_map_entry ** table,
18702 + struct snapshot_map_entry * entry, * next;
18706 + for ( i = 0; i < table_size; i++ ) {
18707 + for ( entry = table[i]; entry; entry = next ) {
18708 + next = entry->next;
18718 + * deallocate_logical_volume
18720 + * Delete the in-memory representation of a single LVM logical volume,
18721 + * including its PE map and any snapshot data. Do not alter the parent
18722 + * volume group, except to remove this volume from its volume list.
18724 +static int deallocate_logical_volume(struct lvm_logical_volume * volume)
18726 + struct lvm_volume_group * group = volume->group;
18727 + struct lvm_logical_volume * org_volume, * snap_volume;
18730 + if ( volume->lv_access & LV_SNAPSHOT ) {
18731 + /* This volume is a snapshot. Remove it from the linked
18732 + * list of volumes that are snapshotting the original.
18733 + * First, the original volume must be quiesced.
18735 + org_volume = volume->snapshot_org;
18737 + if ( snapshot_check_quiesce_original(volume) ) {
18741 + remove_snapshot_from_chain(volume);
18743 + /* If the snapshot that was just removed was the last/only
18744 + * volume snapshotting the original, then mark the original
18745 + * as no longer being snapshotted.
18747 + if ( org_volume && !org_volume->snapshot_next ) {
18748 + org_volume->lv_access &= ~LV_SNAPSHOT_ORG;
18750 + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
18751 + /* If this volume is a snapshot original, all of its snapshots
18752 + * must also be deleted. However, those deletions need to be
18753 + * taken care of by the engine. So just check that they have
18754 + * all been quiesced before removing the original.
18756 + if ( snapshot_check_quiesce_all(volume) ) {
18760 + /* In case there are any snapshots remaining, we must clear out
18761 + * their pointers to this original to prevent errors when those
18762 + * snapshots are accessed or deleted.
18764 + for ( snap_volume = volume->snapshot_next;
18765 + snap_volume; snap_volume = snap_volume->snapshot_next ) {
18766 + snap_volume->snapshot_org = NULL;
18770 + if (volume->name) {
18771 + LOG_DEBUG("Deleting volume %s\n", volume->name);
18774 + /* Free all the memory. This includes the LE-to-PE map, any snapshot
18775 + * hash tables, the COW table, and chunk data buffer.
18777 + if (volume->le_map) {
18778 + vfree(volume->le_map);
18779 + volume->le_map = NULL;
18781 + if (volume->snapshot_map) {
18782 + for ( i = 1; i <= group->pv_count; i++ ) {
18783 + deallocate_snapshot_map(volume->snapshot_map[i],
18784 + volume->hash_table_size);
18786 + kfree(volume->snapshot_map);
18787 + volume->snapshot_map = NULL;
18789 + if (volume->cow_table) {
18790 + kfree(volume->cow_table);
18791 + volume->cow_table = NULL;
18793 + if (volume->chunk_data_buffer) {
18794 + kfree(volume->chunk_data_buffer);
18795 + volume->chunk_data_buffer = NULL;
18798 + /* Remove this volume from the group's list. */
18799 + if ( group && group->volume_list[volume->lv_number] == volume ) {
18800 + group->volume_list[volume->lv_number] = NULL;
18801 + group->volume_count--;
18809 + * allocate_logical_volume
18811 + * Allocate space for a new LVM logical volume, including space for the
18812 + * LE-to-PE map and any necessary snapshot data.
18814 +static struct lvm_logical_volume *
18815 +allocate_logical_volume(struct lv_disk * lv, struct lvm_volume_group * group)
18817 + struct lvm_logical_volume * new_volume;
18818 + u32 table_entries_per_chunk, table_chunks;
18821 + /* Allocate space for the new logical volume. */
18822 + new_volume = kmalloc(sizeof(struct lvm_logical_volume), GFP_NOIO);
18823 + if (!new_volume) {
18824 + LOG_CRITICAL("Error allocating new logical volume %s\n",
18828 + memset(new_volume, 0, sizeof(struct lvm_logical_volume));
18830 + /* Allocate space for the LE to PE mapping table. */
18831 + new_volume->le_map = vmalloc(lv->lv_allocated_le *
18832 + sizeof(struct le_table_entry));
18833 + if (!new_volume->le_map) {
18834 + LOG_CRITICAL("Error creating LE map for logical volume %s\n",
18838 + memset(new_volume->le_map, 0,
18839 + lv->lv_allocated_le * sizeof(struct le_table_entry));
18841 + /* Initialize the rest of the new volume.
18842 + * Need the +1 on lv_number to match the PE Map entries on the PV.
18844 + new_volume->lv_number = lv->lv_number + 1;
18845 + new_volume->lv_size = lv->lv_size;
18846 + new_volume->lv_access = lv->lv_access | EVMS_LV_NEW | EVMS_LV_QUIESCED;
18847 + new_volume->lv_status = lv->lv_status | LV_ACTIVE;
18848 + new_volume->lv_minor = MINOR(lv->lv_dev);
18849 + new_volume->stripes = lv->lv_stripes;
18850 + new_volume->stripe_size = lv->lv_stripesize;
18851 + new_volume->stripe_size_shift = evms_cs_log2(lv->lv_stripesize);
18852 + new_volume->pe_size = group->vg->pe_size;
18853 + new_volume->pe_size_shift = evms_cs_log2(group->vg->pe_size);
18854 + new_volume->num_le = lv->lv_allocated_le;
18855 + new_volume->group = group;
18856 + /* Different naming scheme for EVMS nodes. */
18857 + if ( translate_lv_name(lv->lv_name, new_volume->name) ) {
18861 + if ( new_volume->lv_access & LV_SNAPSHOT ) {
18862 + /* This volume is a snapshot, initialize the remaining data,
18863 + * and allocate space for the remapping structures, and one
18864 + * sector's worth of COW tables.
18866 + new_volume->chunk_size = lv->lv_chunk_size;
18867 + new_volume->num_chunks = lv->lv_size / lv->lv_chunk_size;
18868 + new_volume->snap_org_minor = lv->lv_snapshot_minor;
18869 + new_volume->next_cow_entry = 0;
18870 + new_volume->current_cow_sector = 0;
18871 + table_entries_per_chunk = (new_volume->chunk_size <<
18872 + EVMS_VSECTOR_SIZE_SHIFT) /
18873 + sizeof(struct lv_COW_table_disk);
18874 + table_chunks = (new_volume->num_chunks +
18875 + table_entries_per_chunk - 1) /
18876 + table_entries_per_chunk;
18877 + new_volume->next_free_chunk = table_chunks *
18878 + new_volume->chunk_size;
18879 + new_volume->hash_table_size = (lv->lv_size / lv->lv_chunk_size /
18880 + MAX_HASH_CHAIN_ENTRIES) + 1;
18882 + new_volume->cow_table = kmalloc(EVMS_VSECTOR_SIZE, GFP_NOIO);
18883 + if (!new_volume->cow_table) {
18884 + LOG_CRITICAL("Error allocating COW table for logical volume %s\n",
18888 + memset(new_volume->cow_table, 0, EVMS_VSECTOR_SIZE);
18890 + new_volume->snapshot_map = kmalloc((group->pv_count + 1) *
18891 + sizeof(struct snapshot_map_entry **),
18893 + if (!new_volume->snapshot_map) {
18894 + LOG_CRITICAL("Error allocating snapshot map for logical volume %s\n",
18899 + new_volume->snapshot_map[0] = NULL;
18900 + for ( i = 1; i <= group->pv_count; i++ ) {
18901 + new_volume->snapshot_map[i] =
18902 + vmalloc(new_volume->hash_table_size *
18903 + sizeof(struct snapshot_map_entry *));
18904 + if (!new_volume->snapshot_map[i]) {
18905 + LOG_CRITICAL("Error allocating snapshot sub-map for logical volume %s\n",
18909 + memset(new_volume->snapshot_map[i], 0,
18910 + new_volume->hash_table_size *
18911 + sizeof(struct snapshot_map_entry *));
18913 + init_MUTEX(&new_volume->snap_semaphore);
18914 + } else if ( new_volume->lv_access & LV_SNAPSHOT_ORG ) {
18915 + /* This volume is a snapshot original, allocate space to use for
18916 + * copying snapshot chunks. This will now be a fixed size
18917 + * instead of being based on the chunk size of the snapshots.
18919 + new_volume->chunk_size = CHUNK_DATA_BUFFER_SIZE;
18920 + new_volume->chunk_data_buffer =
18921 + kmalloc(new_volume->chunk_size <<
18922 + EVMS_VSECTOR_SIZE_SHIFT, GFP_NOIO);
18923 + if (!new_volume->chunk_data_buffer) {
18924 + LOG_SERIOUS("Error allocating snapshot chunk buffer for logical volume %s\n",
18928 + memset(new_volume->chunk_data_buffer, 0,
18929 + new_volume->chunk_size << EVMS_VSECTOR_SIZE_SHIFT);
18933 + return new_volume;
18935 + deallocate_logical_volume(new_volume);
18936 + new_volume = NULL;
18941 + * deallocate_volume_group
18943 + * Delete the entire in-memory representation of an LVM volume group,
18944 + * including all PVs and logical volumes. If this group is on LVM's
18945 + * volume group list, remove it.
18947 +static int deallocate_volume_group(struct lvm_volume_group * group)
18949 + struct lvm_physical_volume * pv_entry, * next_pv;
18952 + LOG_DEBUG("Deleting volume group %s\n", group->vg_name);
18954 + /* Remove the group from the global list. */
18955 + remove_group_from_list(group);
18957 + /* Delete the LV metadata array. */
18958 + if (group->lv_array) {
18959 + vfree(group->lv_array);
18960 + group->lv_array = NULL;
18963 + /* Delete the PV UUID list. */
18964 + if (group->uuid_list) {
18965 + vfree(group->uuid_list);
18966 + group->uuid_list = NULL;
18969 + /* Delete all logical volumes. */
18970 + for ( i = 1; i <= MAX_LV; i++ ) {
18971 + if (group->volume_list[i]) {
18972 + deallocate_logical_volume(group->volume_list[i]);
18973 + group->volume_list[i] = NULL;
18977 + /* Delete all PVs from the group's list. */
18978 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = next_pv ) {
18979 + next_pv = pv_entry->next;
18980 + if (pv_entry->logical_node) {
18981 + /* Send a delete command down to the segment manager. */
18982 + LOG_DEBUG("Deleting PV %s from group %s\n",
18983 + pv_entry->logical_node->name, group->vg_name);
18984 + DELETE(pv_entry->logical_node);
18985 + pv_entry->logical_node = NULL;
18987 + deallocate_physical_volume(pv_entry);
18990 + /* Delete the VG metadata. */
18992 + kfree(group->vg);
18993 + group->vg = NULL;
19001 + * allocate_volume_group
19003 + * Allocate space for a new LVM volume group and all of its sub-fields.
19004 + * Initialize the appropriate fields.
19005 + * vg parameter should already have an allocated/initialized struct vg_disk.
19007 +static struct lvm_volume_group * allocate_volume_group(struct vg_disk * vg,
19010 + struct lvm_volume_group * new_group;
19012 + /* The volume group itself. */
19013 + new_group = kmalloc(sizeof(struct lvm_volume_group), GFP_NOIO);
19014 + if (!new_group) {
19019 + /* Initialize the new group. */
19020 + memset(new_group, 0, sizeof(struct lvm_volume_group));
19021 + memcpy(new_group->vg_uuid, vg->vg_uuid, UUID_LEN);
19022 + strncpy(new_group->vg_name, vg_name, NAME_LEN - 1);
19023 + new_group->vg = vg;
19024 + /* Default sector and block sizes. */
19025 + new_group->hard_sect_size = 512;
19026 + new_group->block_size = 1024;
19027 + new_group->flags = EVMS_VG_DIRTY;
19029 + LOG_DETAILS("Discovered volume group %s\n", new_group->vg_name);
19032 + return new_group;
19036 + * remove_pv_from_group
19038 + * In the engine, when a PV is removed from a group (on a vgreduce), that
19039 + * same PV must be removed from that group in the kernel. Otherwise, when
19040 + * the rediscover occurs, that PV will still appear in the group, and
19041 + * will cause segfaults when we try to read metadata from it.
19043 +static int remove_pv_from_group(int pv_number, unsigned char * vg_uuid)
19045 + struct lvm_volume_group * group;
19046 + struct lvm_physical_volume * pv_entry;
19047 + struct lvm_physical_volume ** p_pv_entry;
19049 + /* Make sure the numbers are in range. */
19050 + if ( pv_number < 0 || pv_number > MAX_PV ) {
19054 + /* Make sure the group exists. */
19055 + find_group_by_uuid(vg_uuid, &group);
19060 + /* Make sure the PV is in this group. */
19061 + pv_entry = find_pv_by_number(pv_number, group);
19063 + LOG_WARNING("Did not find PV %d in group %s\n",
19064 + pv_number, group->vg_name);
19068 + /* Make sure the PV is not in use by any volumes. */
19069 + if ( check_pv_for_lv(pv_entry, group) ) {
19070 + LOG_SERIOUS("PV %d in group %s still contains LVs\n",
19071 + pv_number, group->vg_name);
19075 + /* Take this PV out of the group's list. */
19076 + for ( p_pv_entry = &group->pv_list;
19077 + *p_pv_entry; p_pv_entry = &(*p_pv_entry)->next ) {
19078 + if ( *p_pv_entry == pv_entry ) {
19079 + *p_pv_entry = (*p_pv_entry)->next;
19080 + pv_entry->next = NULL;
19085 + group->pv_count--;
19087 + /* There is no way that this PV was the last in this group, so the
19088 + * group never needs to be deleted at this point. The only way this
19089 + * group will exist in the kernel is if there are volumes exported from
19090 + * it. If this was the last PV, then those volumes must be on that PV,
19091 + * and it wouldn't be allowed to be removed from the group (above).
19094 + /* Free up the memory for this PV. Just drop the node. */
19095 + deallocate_physical_volume(pv_entry);
19097 + LOG_DEBUG("PV %d removed from group %s\n", pv_number, group->vg_name);
19102 +/********** Consistency Checking Functions **********/
19106 + * clear_le_entries_for_missing_pv
19108 + * In the event that a PV turns up missing during a rediscover, we
19109 + * need to erase any LE map entries that might point to it.
19112 +clear_le_entries_for_missing_pv(struct lvm_volume_group * group,
19113 + struct lvm_physical_volume * pv_entry)
19115 + struct lvm_logical_volume * volume;
19118 + for ( i = 1; i <= MAX_LV; i++ ) {
19119 + if (group->volume_list[i]) {
19120 + volume = group->volume_list[i];
19121 + for ( j = 0; j < volume->num_le; j++ ) {
19122 + if ( volume->le_map[j].owning_pv == pv_entry ) {
19123 + volume->le_map[j].owning_pv = NULL;
19124 + volume->le_map[j].pe_sector_offset = 0;
19132 + * check_volume_groups
19134 + * This function performs some simple consistency checks on all dirty
19135 + * volume groups. Any groups that have no PVs are deleted. If any metadata
19136 + * structures (PV or VG) are missing, they are read in from disk.
19138 +static int check_volume_groups(void)
19140 + struct lvm_volume_group * group, * next_group;
19141 + struct lvm_physical_volume * pv_entry, * next_pv;
19144 + for ( group = lvm_group_list; group; group = next_group ) {
19145 + next_group = group->next_group;
19147 + LOG_DEBUG("Checking Group %s\n", group->vg_name);
19149 + /* If a group has no PVs, it can be safely deleted,
19150 + * because we can't find any volumes on it.
19152 + if (!group->pv_count) {
19153 + LOG_WARNING("No PVs found for Group %s.\n",
19155 + if (!group->volume_count) {
19156 + deallocate_volume_group(group);
19161 + /* Make sure all metadata for the PVs is present. On a
19162 + * rediscover, it may be missing, because we delete it at the
19163 + * end of discovery. If any is missing, read it in from disk.
19164 + * This is only necessary in the kernel. It can't happen in
19167 + for ( pv_entry = group->pv_list;
19168 + pv_entry; pv_entry = next_pv ) {
19169 + next_pv = pv_entry->next;
19170 + if (!pv_entry->pv) {
19171 + LOG_DEBUG("Re-reading PV metadata for %s\n",
19172 + pv_entry->logical_node->name);
19173 + rc = read_pv(pv_entry->logical_node,
19176 + /* What happens if we can't re-read the
19177 + * PV metadata? This PV must be removed
19178 + * from the group. Need to also clear
19179 + * all LE entries in all LVs that are
19180 + * pointing to this PV before it can be
19181 + * removed from the list.
19183 + LOG_SERIOUS("PV metadata is missing or cannot be read from %s\n",
19184 + pv_entry->logical_node->name);
19185 + clear_le_entries_for_missing_pv(group,
19187 + remove_pv_from_group(pv_entry->pv_number,
19191 + pv_entry->pv_number = pv_entry->pv->pv_number;
19193 + /* Check for a "stale" PV. This case should be
19194 + * already covered, as long as the Engine is
19195 + * calling the PV_REMOVE ioctl when it does a
19196 + * vgreduce or a pvremove. If this is the last
19197 + * PV in the group, the group will be deleted.
19199 + if (!pv_entry->pv_number) {
19200 + remove_pv_from_group(0, group->vg_uuid);
19205 + if (!pv_entry->pe_map) {
19206 + LOG_DEBUG("Re-reading PE maps for %s\n",
19207 + pv_entry->logical_node->name);
19208 + rc = read_pe_map(pv_entry);
19210 + LOG_WARNING("Error reading PE maps for %s\n",
19211 + pv_entry->logical_node->name);
19212 + LOG_WARNING("Any volumes residing on %s will be incomplete!\n",
19213 + pv_entry->logical_node->name);
19218 + /* Make sure the metadata for the VG is present. If it's
19219 + * missing, read it in from the first PV in the VG.
19221 + if (!group->vg && group->pv_count) {
19222 + LOG_DEBUG("Re-reading VG metadata for Group %s\n",
19224 + pv_entry = group->pv_list;
19225 + rc = read_vg(pv_entry->logical_node,
19226 + pv_entry->pv, &group->vg);
19228 + /* What happens if we can't re-read the
19229 + * VG metadata? It's definitely bad
19230 + * news. Should we delete the VG?
19236 + /* Display a warning if the number of PVs found for the group
19237 + * doesn't match the number of PVs recorded for the VG.
19239 + if ( group->vg && group->pv_count != group->vg->pv_cur ) {
19240 + LOG_WARNING("Group %s is incomplete.\n",
19242 + LOG_WARNING(" Only %d of %d PVs found.\n",
19243 + group->pv_count, group->vg->pv_cur);
19244 + LOG_WARNING(" Volumes in this group may be incomplete.\n");
19254 + * Make sure all volumes in this group have valid LE-to-PE maps. Any
19255 + * volume that doesn't is marked as incomplete. This is safe for
19256 + * re-discovery because only new volumes could have corrupted LE maps.
19258 +static int check_le_maps(struct lvm_volume_group * group)
19260 + struct lvm_logical_volume * volume;
19263 + for ( i = 1; i <= MAX_LV; i++ ) {
19264 + volume = group->volume_list[i];
19269 + if (!volume->le_map) {
19270 + /* No point in keeping the volume around if it has
19271 + * no LE map at all.
19273 + LOG_SERIOUS("Volume %s has no LE map.\n", volume->name);
19274 + deallocate_logical_volume(volume);
19278 + /* If any entries in the LE map are missing, mark this volume
19281 + for ( j = 0, count = 0; j < volume->num_le; j++ ) {
19282 + if ( !volume->le_map[j].owning_pv ||
19283 + !volume->le_map[j].pe_sector_offset) {
19288 + LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19290 + LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19291 + count, volume->num_le);
19292 + volume->lv_access |= EVMS_LV_INCOMPLETE;
19299 + * check_snapshot_map
19301 + * For snapshot volumes, make sure the snapshot map is intact, and that
19302 + * any existing entries in the map are in the correct order and there
19303 + * are no duplicate entries.
19305 +static int check_snapshot_map(struct lvm_logical_volume * snap_volume)
19307 + struct snapshot_map_entry ** table, * curr;
19310 + if ( ! (snap_volume->lv_access & LV_SNAPSHOT) ) {
19313 + if (!snap_volume->snapshot_map) {
19314 + snap_volume->lv_access |= EVMS_LV_INVALID;
19318 + for ( i = 1; i <= snap_volume->group->pv_count; i++ ) {
19319 + if (!snap_volume->snapshot_map[i]) {
19320 + snap_volume->lv_access |= EVMS_LV_INVALID;
19323 + table = snap_volume->snapshot_map[i];
19324 + for ( j = 0; j < snap_volume->hash_table_size; j++ ) {
19325 + for ( curr = table[j]; curr; curr = curr->next ) {
19326 + if ( curr->next &&
19327 + curr->org_sector >=
19328 + curr->next->org_sector) {
19329 + snap_volume->lv_access |=
19340 + * check_logical_volumes
19342 + * Perform a consistency check on all of the logical volumes that have been
19343 + * discovered. Any volume that has any inconsistencies will be marked as
19344 + * incomplete or invalid, depending on the severity of the problem. At the
19345 + * end, all invalid volumes are deleted. If the deleted_incompletes
19346 + * parameter is set, those will also be deleted.
19348 +static int check_logical_volumes(int final_discovery)
19350 + struct lvm_volume_group * group;
19351 + struct lvm_logical_volume * volume, * snap, * next;
19354 + /* Check every valid, dirty volume group. */
19355 + for ( group = lvm_group_list; group; group = group->next_group ) {
19356 + if ( ! (group->flags & EVMS_VG_DIRTY) ) {
19359 + /* Check every valid volume in this group. */
19360 + for ( i = 1; i <= MAX_LV; i++ ) {
19361 + volume = group->volume_list[i];
19366 + LOG_DEBUG("Checking logical volume %s\n", volume->name);
19368 + if (!volume->group) {
19369 + volume->group = group;
19372 + /* All LE-map entries must have valid values. The I/O
19373 + * paths now detect missing LE entries.
19375 + if (volume->le_map) {
19376 + for ( j = 0, count = 0;
19377 + j < volume->num_le; j++ ) {
19378 + if ( !volume->le_map[j].owning_pv ||
19379 + !volume->le_map[j].pe_sector_offset ) {
19384 + LOG_SERIOUS("Volume %s has incomplete LE map.\n",
19386 + LOG_SERIOUS(" Missing %d out of %d LEs.\n",
19387 + count, volume->num_le);
19388 + volume->lv_access |= EVMS_LV_INCOMPLETE;
19390 + /* In case this volume was previously
19391 + * marked incomplete.
19393 + volume->lv_access &=
19394 + ~EVMS_LV_INCOMPLETE;
19397 + /* This should only ever happen due to
19398 + * memory corruption.
19400 + LOG_SERIOUS("Volume %s has no LE map.\n",
19402 + volume->lv_access |= EVMS_LV_INVALID;
19405 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19406 + /* For a snapshot original, check all snapshots
19407 + * in the chain, to make sure they point back to
19408 + * the original. Also, make sure there is memory
19409 + * for the chunk buffer.
19411 + for ( snap = volume->snapshot_next, count = 0;
19413 + snap = snap->snapshot_next, count++ ) {
19414 + if ( snap->snapshot_org != volume ) {
19415 + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19417 + snap->snapshot_org = NULL;
19418 + snap->lv_access |=
19423 + LOG_WARNING("No snapshots found for volume %s\n",
19425 + if (final_discovery) {
19426 + volume->lv_access &=
19427 + ~LV_SNAPSHOT_ORG;
19429 + } else if (!volume->chunk_data_buffer) {
19430 + volume->lv_access |= EVMS_LV_INVALID;
19432 + } else if ( volume->lv_access & LV_SNAPSHOT ) {
19433 + /* For a snapshot volume, make sure it points
19434 + * back to its original. Also make sure there is
19435 + * memory for the cow table, and that any
19436 + * existing snapshot entries in the snapshot map
19437 + * are correctly ordered.
19439 + /* Is there a COW table? */
19440 + if (!volume->cow_table) {
19441 + LOG_SERIOUS("Snapshot volume %s has no COW table\n",
19443 + volume->lv_access |= EVMS_LV_INVALID;
19445 + /* Is the snapshot map in order? */
19446 + if ( check_snapshot_map(volume) ) {
19447 + LOG_SERIOUS("Snapshot volume %s has snapshot map inconsistency\n",
19449 + volume->lv_access |= EVMS_LV_INVALID;
19451 + /* Is there an original volume? This is only
19452 + * a real problem during final discovery.
19454 + if (!volume->snapshot_org) {
19455 + LOG_SERIOUS("Snapshot volume %s not pointing at an original\n",
19457 + if (final_discovery) {
19458 + volume->lv_access |=
19462 + /* Is the original the correct one? */
19463 + else if ( volume->snap_org_minor !=
19464 + volume->snapshot_org->lv_minor ) {
19465 + LOG_SERIOUS("Snapshot volume %s not pointing at correct original\n",
19467 + volume->lv_access |= EVMS_LV_INVALID;
19470 + /* Delete any invalid volumes from use. Delete
19471 + * incomplete volumes as well if this is not final
19472 + * discovery. If a snapshot original is bad, delete all
19473 + * of its snapshots.
19475 + if ( volume->lv_access & EVMS_LV_INVALID ||
19476 + (!final_discovery &&
19477 + (volume->lv_access & EVMS_LV_INCOMPLETE) &&
19478 + (volume->lv_access & EVMS_LV_NEW)) ) {
19479 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
19480 + for ( snap = volume->snapshot_next;
19481 + snap; snap = next ) {
19482 + next = snap->snapshot_next;
19483 + snap->snapshot_next = NULL;
19484 + snap->snapshot_org = NULL;
19485 + invalidate_snapshot_volume(snap);
19486 + deallocate_logical_volume(snap);
19488 + volume->snapshot_next = NULL;
19489 + } else if ( volume->lv_access & LV_SNAPSHOT ) {
19490 + invalidate_snapshot_volume(volume);
19492 + deallocate_logical_volume(volume);
19501 +/********** Volume Group Discovery Functions **********/
19505 + * find_group_for_pv
19507 + * This is a discover-time function. It reads the VG metadata info for the
19508 + * specified node, and locates the appropriate group that owns that
19509 + * node. If that group does not already exist, it is created and
19512 +static int find_group_for_pv(struct evms_logical_node * node,
19513 + struct pv_disk * pv,
19514 + struct lvm_volume_group ** group)
19516 + struct vg_disk * vg;
19521 + /* Check for an unassigned PV. */
19522 + if ( pv->vg_name[0] == 0 ) {
19526 + /* Read the VG on-disk info for this PV. If this succeeds, it
19527 + * allocates a new VG metadata structure.
19529 + rc = read_vg(node, pv, &vg);
19534 + /* Use the UUID from the VG metadata to determine if this group
19535 + * has already been discovered and constructed.
19537 + find_group_by_uuid(vg->vg_uuid, group);
19540 + /* Create a new group entry and add to the global list. */
19541 + *group = allocate_volume_group(vg, pv->vg_name);
19545 + add_group_to_list(*group);
19546 + } else if (!(*group)->vg) {
19547 + /* On a rediscover, the VG metadata for an existing group might
19548 + * be missing. Fill it in if necessary. This check is also not
19549 + * necessary in the engine, since the metadata is never deleted.
19551 +/* Should we re-copy vg_name? (vg_uuid can not be allowed to change).
19552 + * Or should vg_name changes be done through direct ioctl only?
19554 + (*group)->vg = vg;
19559 + /* Read in the UUID list for this group, if it isn't present. */
19560 + rc = read_uuid_list(node, pv, *group);
19562 + LOG_WARNING("Error reading UUID list for group %s.\n",
19563 + (*group)->vg_name);
19564 + LOG_WARNING("May not be able to verify PV UUIDs for group %s\n",
19565 + (*group)->vg_name);
19568 + /* In the kernel, any time we even see a PV for a group, that group
19569 + * must be marked dirty so its volumes will be re-exported.
19571 + (*group)->flags |= EVMS_VG_DIRTY;
19577 + * check_for_duplicate_pv
19579 + * Search the list of PVs in the specified volume group. If the
19580 + * specified node already exists in the list, we can discard it.
19582 +static int check_for_duplicate_pv(struct evms_logical_node * node,
19583 + struct pv_disk * pv,
19584 + struct lvm_volume_group * group)
19586 + struct lvm_physical_volume * pv_entry;
19588 + /* For re-discovery, we need to search all existing PVs in this VG to
19589 + * make sure we didn't get a duplicate from the plugin below us. The
19590 + * plugins below us should be re-exporting the same node on
19591 + * re-discovery, instead of creating a new node to represent the same
19592 + * objects, so just check the memory location.
19594 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19595 + if ( pv_entry->logical_node == node ) {
19597 + /* We found a duplicate. Just ignore the duplicate. */
19598 + LOG_DEBUG("PV %s is already in Group %s.\n",
19599 + node->name, group->vg_name);
19601 + /* Even if the node was a duplicate, we may need to
19602 + * fill in the pv entry for this partition, since we
19603 + * always delete those at the end of discovery.
19605 + if (!pv_entry->pv) {
19606 + pv_entry->pv = pv;
19607 + pv_entry->pv_number = pv->pv_number;
19616 + /* No duplicate was found. */
19623 + * Verify that the specified PV belongs in the specified group by
19624 + * searching for the PV's UUID in the group's list.
19626 +static int verify_pv_uuid(struct lvm_physical_volume * pv_entry,
19627 + struct lvm_volume_group * group)
19631 + /* Obviously the UUID list must be present in order to search. */
19632 + if (!group->uuid_list) {
19633 + LOG_WARNING("UUID list is missing from group %s.\n",
19635 + LOG_WARNING("Cannot verify UUID for PV %s\n",
19636 + pv_entry->logical_node->name);
19640 + /* Start with the UUID entry for this PV's number. */
19641 + if ( ! memcmp(pv_entry->pv->pv_uuid,
19642 + &(group->uuid_list[(pv_entry->pv_number - 1) * NAME_LEN]),
19647 + /* If it wasn't found there, then search the entire group's list. */
19648 + for ( i = 0; i < group->vg->pv_cur; i++ ) {
19649 + if ( ! memcmp(pv_entry->pv->pv_uuid,
19650 + &(group->uuid_list[i * NAME_LEN]), UUID_LEN) ) {
19651 + /* Found the UUID. */
19652 + LOG_WARNING("Detected UUID mismatch for PV %s!\n",
19653 + pv_entry->logical_node->name);
19654 + LOG_WARNING("PV %s is recorded as being at index %d,\n",
19655 + pv_entry->logical_node->name,
19656 + pv_entry->pv_number);
19657 + LOG_WARNING(" but Group %s has it recorded at index %d.\n",
19658 + group->vg_name, i + 1);
19659 + LOG_WARNING("Run the EVMS Engine to correct the problem.\n");
19660 + LOG_WARNING("If you have any snapshot regions in group %s\n",
19662 + LOG_WARNING(" it is recommended that you delete them immediately!\n");
19667 + LOG_SERIOUS("Could not find UUID for PV %s in group %s\n",
19668 + pv_entry->logical_node->name, group->vg_name);
19673 + * add_pv_to_group
19675 + * Adds the physical volume to the appropriate volume group. The PV
19676 + * passed into this function MUST be part of a valid VG.
19678 +static int add_pv_to_group(struct lvm_physical_volume * pv_entry,
19679 + struct lvm_volume_group * group)
19683 + /* Make sure this PV's UUID is listed in the group. */
19684 + rc = verify_pv_uuid(pv_entry, group);
19686 + LOG_SERIOUS("PV %s does not belong in group %s!\n",
19687 + pv_entry->logical_node->name, group->vg_name);
19691 + /* Add this PV to the beginning of its group's list. */
19692 + pv_entry->next = group->pv_list;
19693 + group->pv_list = pv_entry;
19694 + group->pv_count++;
19696 + /* Update the group's block and hardsector sizes as appropriate. */
19697 + group->block_size = max(pv_entry->logical_node->block_size,
19698 + group->block_size);
19699 + group->hard_sect_size = max(pv_entry->logical_node->hardsector_size,
19700 + group->hard_sect_size);
19702 + /* Check for the Partial or Removable flag on the PV. */
19703 + if ( pv_entry->logical_node->flags & EVMS_VOLUME_PARTIAL ) {
19704 + group->flags |= EVMS_VG_PARTIAL_PVS;
19706 + if ( pv_entry->logical_node->flags & EVMS_DEVICE_REMOVABLE ) {
19707 + group->flags |= EVMS_VG_REMOVABLE_PVS;
19710 + LOG_DETAILS("PV %s added to Group %s\n",
19711 + pv_entry->logical_node->name, group->vg_name);
19717 + * discover_volume_groups
19719 + * Examine the list of logical nodes. Any node that contains a valid PV
19720 + * structure is consumed and added to the appropriate volume group. PVs
19721 + * which do not belong to any group are deleted. Everything else is left
19722 + * on the discovery list.
19724 +static int discover_volume_groups(struct evms_logical_node ** evms_node_list)
19726 + struct evms_logical_node * node, * next_node;
19727 + struct pv_disk * pv;
19728 + struct lvm_volume_group * group;
19729 + struct lvm_physical_volume * pv_entry;
19732 + LOG_EXTRA("Searching for PVs in the node list.\n");
19734 + /* Run through the discovery list. */
19735 + for ( node = *evms_node_list; node; node = next_node ) {
19736 + /* Save the next node. We may remove this one from the list. */
19737 + next_node = node->next;
19739 + /* Read the PV metadata. This will also create a new struct pv_disk
19740 + * if it finds the correct LVM signatures.
19742 + rc = read_pv(node, &pv);
19744 + /* This node is not an LVM PV, or an error occurred.
19745 + * Just leave the node on the discovery list.
19750 + rc = find_group_for_pv(node, pv, &group);
19752 + /* Error getting the group for this PV. */
19758 + /* This node is an unassigned PV. */
19759 + LOG_DETAILS("PV %s is unassigned.\n", node->name);
19764 + rc = check_for_duplicate_pv(node, pv, group);
19766 + /* This node is already in the group. This check is also
19767 + * only in the kernel because the engine has no notion
19768 + * of rediscover, and thus can never get a duplicate.
19770 + evms_cs_remove_logical_node_from_list(evms_node_list,
19775 + /* Allocate a PV entry for this node. */
19776 + pv_entry = allocate_physical_volume(node, pv);
19781 + /* Add this PV to the appropriate volume group. */
19782 + rc = add_pv_to_group(pv_entry, group);
19784 + deallocate_physical_volume(pv_entry);
19788 + rc = read_pe_map(pv_entry);
19790 + LOG_WARNING("Error reading PE maps for node %s\n",
19792 + LOG_WARNING("Any volumes residing on this node will be incomplete!\n");
19795 + evms_cs_remove_logical_node_from_list(evms_node_list, node);
19798 + LOG_EXTRA("Group discovery complete.\n");
19803 +/********** Logical Volume Discovery Functions **********/
19809 + * After all logical volumes have been discovered, the mappings from
19810 + * logical extents to physical extents must be constructed. Each PV
19811 + * contains a map on-disk of its PEs. Each PE map entry contains the
19812 + * logical volume number and the logical extent number on that volume.
19813 + * Our internal map is the reverse of this map for each volume, listing
19814 + * the PV node and sector offset for every logical extent on the volume.
19816 +static int build_le_maps(struct lvm_volume_group * group)
19818 + struct lvm_logical_volume ** volume_list = group->volume_list;
19819 + struct lvm_physical_volume * pv_entry;
19820 + struct evms_logical_node * node;
19821 + struct pv_disk * pv;
19822 + struct pe_disk * pe_map;
19824 + u32 lv_number, le_number, first_pe_sector;
19827 + LOG_DEBUG("Building LE maps for new volumes in group %s.\n",
19830 + /* For every PV in this VG. */
19831 + for ( pv_entry = group->pv_list; pv_entry; pv_entry = pv_entry->next ) {
19832 + node = pv_entry->logical_node;
19833 + pv = pv_entry->pv;
19834 + pe_map = pv_entry->pe_map;
19836 + /* Version 1 metadata uses pe_on_disk.base + .size to find start
19837 + * of first PE. Version 2 uses pe_start.
19839 + if (pv->version == 1) {
19840 + first_pe_sector =
19841 + evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19842 + pv->pe_on_disk.size);
19844 + first_pe_sector = pv->pe_start;
19845 + if (!first_pe_sector) {
19846 + first_pe_sector =
19847 + evms_cs_size_in_vsectors(pv->pe_on_disk.base +
19848 + pv->pe_on_disk.size);
19852 + /* For every entry in the PE map, calculate the PE's sector offset
19853 + * and update the correct LV's PE map. LV number of 0 marks an unused PE.
19854 + * For re-discovery, only compute entries for new volumes. If a PV
19855 + * is read-only, all LVs on that PV will also be read-only.
19857 + for ( i = 0; i < pv->pe_total; i++ ) {
19858 + lv_number = pe_map[i].lv_num;
19859 + if ( lv_number &&
19860 + volume_list[lv_number] &&
19861 + volume_list[lv_number]->lv_access &
19862 + (EVMS_LV_NEW | EVMS_LV_INCOMPLETE) ) {
19863 + le_number = pe_map[i].le_num;
19864 + offset = i * pv->pe_size + first_pe_sector;
19865 + volume_list[lv_number]->le_map[le_number].owning_pv =
19867 + volume_list[lv_number]->le_map[le_number].pe_sector_offset =
19869 + if ( node->flags & EVMS_VOLUME_SET_READ_ONLY ) {
19870 + volume_list[lv_number]->lv_access &=
19881 + * build_snapshot_maps
19883 + * For every volume in this group that is a snapshot, read all of the
19884 + * existing entries in the COW table, and build up the snapshot mapping
19885 + * structures accordingly.
19887 + * For reference, the COW tables attached to the snapshot volumes will
19888 + * always be in disk-order (little-endian), so that it can always be
19889 + * immediately written to disk. Therefore, endian conversions are necessary
19890 + * any time the COW table is accessed. This function will make a local
19891 + * copy of each COW table sector, and convert the local copy before
19892 + * building the snapshot maps.
19894 +static int build_snapshot_maps(struct lvm_volume_group * group)
19896 + struct lvm_logical_volume * volume;
19897 + struct evms_logical_node tmp_node;
19898 + struct lv_COW_table_disk cow_table[EVMS_VSECTOR_SIZE /
19899 + sizeof(struct lv_COW_table_disk)];
19900 + unsigned long max_entries = EVMS_VSECTOR_SIZE /
19901 + sizeof(struct lv_COW_table_disk);
19904 + /* Check every volume in the group to see if it is a snapshot. Also
19905 + * check to make sure it is a new volume in the case of re-discovery.
19907 + for ( i = 1; i <= MAX_LV; i++ ) {
19909 + /* The volume must exist, must be new, and must be a snapshot.
19911 + volume = group->volume_list[i];
19913 + !(volume->lv_access & EVMS_LV_NEW) ||
19914 + !(volume->lv_access & LV_SNAPSHOT)) {
19918 + /* Set up a temporary EVMS node. */
19919 + tmp_node.private = volume;
19921 + LOG_DEBUG("Building snapshot map for volume %s\n",
19925 + /* Read in one sector's worth of COW tables. */
19926 + if ( lvm_init_io(&tmp_node, 0,
19927 + volume->current_cow_sector,
19928 + 1, volume->cow_table) ) {
19932 + /* Endian-conversion of this COW table
19933 + * to a local table.
19935 + for ( j = 0; j < max_entries; j++ ) {
19936 + cow_table[j].pv_org_number =
19937 + le64_to_cpu(volume->cow_table[j].pv_org_number);
19938 + cow_table[j].pv_org_rsector =
19939 + le64_to_cpu(volume->cow_table[j].pv_org_rsector);
19940 + cow_table[j].pv_snap_number =
19941 + le64_to_cpu(volume->cow_table[j].pv_snap_number);
19942 + cow_table[j].pv_snap_rsector =
19943 + le64_to_cpu(volume->cow_table[j].pv_snap_rsector);
19946 + /* Translate every valid COW table entry into
19947 + * a snapshot map entry.
19949 + for ( volume->next_cow_entry = 0;
19950 + volume->next_cow_entry < max_entries &&
19951 + cow_table[volume->next_cow_entry].pv_org_number;
19952 + volume->next_cow_entry++ ) {
19953 + /* org_rsector must be a valid sector number,
19954 + * i.e. it can't be within a PVs metadata. This
19955 + * is how we detect invalidated snapshots.
19957 + if ( cow_table[volume->next_cow_entry].pv_org_rsector < 10 ||
19958 + cow_table[volume->next_cow_entry].pv_org_number > group->pv_count ||
19959 + add_cow_entry_to_snapshot_map(&(cow_table[volume->next_cow_entry]), volume) ) {
19960 + /* This volume either has an invalid COW entry,
19961 + * or had an error adding that COW entry to the
19962 + * snapshot map. This snapshot is done.
19966 + volume->next_free_chunk += volume->chunk_size;
19969 + /* Move on to the next sector if necessary. */
19970 + if ( volume->next_cow_entry == max_entries ) {
19971 + volume->current_cow_sector++;
19981 + invalidate_snapshot_volume(volume);
19982 + deallocate_logical_volume(volume);
19987 + * link_snapshot_volumes
19989 + * This function examines the list of logical volumes in this group and
19990 + * sets up the necessary pointers to link snapshots and their originals.
19991 + * A singly-linked list is created starting with the original volume. Also,
19992 + * all snapshot volumes point directly back to their original. This
19993 + * function should not be run until all volumes have been discovered.
19994 + * In the case of re-discovery, all of these links/lists get rebuilt as if
19995 + * they were not already there. Currently this should not pose a problem.
19997 +static int link_snapshot_volumes(struct lvm_volume_group * group)
19999 + struct lvm_logical_volume * org_volume, * snap_volume;
20000 + u32 org_minor, buffer_size = 0;
20003 + for ( i = 1; i <= MAX_LV; i++ ) {
20005 + /* Only process snapshot-originals. */
20006 + org_volume = group->volume_list[i];
20007 + if ( !org_volume || !(org_volume->lv_access & LV_SNAPSHOT_ORG) ) {
20011 + /* For snapshot-originals, look for all other volumes that
20012 + * claim to be snapshotting it. For each one that is found,
20013 + * insert it at the start of the original's list of snapshots.
20014 + * Need to start with a NULL snapshot_next, otherwise could
20015 + * wind up with circular lists.
20017 + org_minor = org_volume->lv_minor;
20018 + org_volume->snapshot_next = NULL;
20020 + for ( j = 1; j <= MAX_LV; j++ ) {
20021 + snap_volume = group->volume_list[j];
20022 + if ( snap_volume &&
20023 + snap_volume->lv_access & LV_SNAPSHOT &&
20024 + (snap_volume->snap_org_minor == org_minor) ) {
20025 + snap_volume->snapshot_org = org_volume;
20026 + snap_volume->snapshot_next =
20027 + org_volume->snapshot_next;
20028 + org_volume->snapshot_next = snap_volume;
20029 + if ( snap_volume->chunk_size > buffer_size ) {
20030 + buffer_size = snap_volume->chunk_size;
20032 + LOG_DEBUG("Linking snapshot (%s) to original (%s)\n",
20033 + snap_volume->name, org_volume->name);
20037 + /* If no snapshots were found for a volume that claims to be
20038 + * under snapshot, mark the group dirty. If this is final
20039 + * discovery, the original will have the snapshot flag turned
20040 + * off in check_logical_volumes().
20042 + if (!org_volume->snapshot_next) {
20043 + LOG_WARNING("No snapshots found for original (%s)\n",
20044 + org_volume->name);
20045 + group->flags |= EVMS_VG_DIRTY;
20052 + * discover_volumes_in_group
20054 +static int discover_volumes_in_group(struct lvm_volume_group * group)
20056 + struct lv_disk * lv_array = group->lv_array;
20057 + struct lvm_logical_volume * new_volume;
20060 + /* Search through the LV structs for valid LV entries. */
20061 + for ( i = 0; i < group->vg->lv_max; i++ ) {
20063 + /* Only discover valid, active volumes. */
20064 + if ( !lv_array[i].lv_name[0] ||
20065 + lv_array[i].lv_number >= MAX_LV ) {
20069 + /* Make sure this volume isn't already in the list. */
20070 + if (group->volume_list[lv_array[i].lv_number + 1]) {
20074 + /* Create a new logical volume and place it in the appropriate
20075 + * spot in this VG's volume list.
20077 + new_volume = allocate_logical_volume(&(lv_array[i]), group);
20078 + if (!new_volume) {
20079 + /* This volume will be missing, but other
20080 + * volumes in this group can still be built.
20082 + LOG_CRITICAL("Error allocating LV %s in Group %s\n",
20083 + lv_array[i].lv_name, group->vg_name);
20087 + group->volume_list[new_volume->lv_number] = new_volume;
20088 + group->volume_count++;
20089 + group->flags |= EVMS_VG_DIRTY;
20091 + LOG_DEBUG("Discovered volume %s in group %s.\n",
20092 + new_volume->name, group->vg_name);
20099 + * discover_logical_volumes
20101 + * After all PVs have been claimed and added to the appropriate VG list,
20102 + * the volumes for each VG must be constructed. For each group, read all
20103 + * the LV structs off the first PV in the list. Search this list of
20104 + * structs for valid LVs. For each valid LV, create a new volume and add
20105 + * it to the group.
20107 +static int discover_logical_volumes(int final_discovery)
20109 + struct lvm_volume_group *group;
20112 + /* Look for volumes in each valid VG entry. We even need to check ones
20113 + * that aren't dirty - We could have deleted an incomplete volume on
20114 + * the previous pass, and need to rediscover it in case this is final
20115 + * discovery and we now want to export it.
20117 + for ( group = lvm_group_list; group; group = group->next_group ) {
20119 + if ( ! group->vg ||
20120 + (! final_discovery &&
20121 + ! (group->flags & EVMS_VG_DIRTY)) ) {
20125 + LOG_DEBUG("Searching for volumes in group %s\n",
20128 + /* Read in the LV array from disk if necessary. */
20129 + rc = read_lv(group);
20131 + LOG_WARNING("Unable to read LV metadata for group %s\n",
20133 + LOG_WARNING("No regions can be discovered for group %s\n",
20138 + /* Assemble each volume in the group. */
20139 + discover_volumes_in_group(group);
20141 + /* Build the LE map for each LV discovered in this group. This
20142 + * must be done after all LVS in the group are discovered.
20144 + build_le_maps(group);
20145 + check_le_maps(group);
20147 + /* Set up all of the initial snapshot maps. Only the kernel
20148 + * keeps track of the snapshot maps.
20150 + build_snapshot_maps(group);
20152 + /* Set up the pointers to link snapshot volumes
20153 + * with their originals.
20155 + link_snapshot_volumes(group);
20164 + * The last thing the plugin must do is take each newly constructed volume
20165 + * and place it on the evms logical node list. A zero return-code from
20166 + * this function means nothing new was added to the list, and a positive
20167 + * return code means that many new items were added to the list.
20169 +static int export_volumes(struct evms_logical_node ** evms_node_list,
20170 + int final_discover)
20172 + struct lvm_volume_group * group;
20173 + struct evms_logical_node * new_node;
20174 + struct lvm_logical_volume * volume;
20175 + int i, count = 0;
20177 + LOG_EXTRA("Exporting volumes\n");
20179 + /* For every valid, dirty volume group. */
20180 + for ( group = lvm_group_list; group; group = group->next_group ) {
20181 + if ( ! (group->flags & EVMS_VG_DIRTY) ) {
20185 + /* Export every valid volume in the group. For re-discovery,
20186 + * we re-export the same logical node.
20188 + for ( i = 1; i <= MAX_LV; i++ ) {
20189 + volume = group->volume_list[i];
20194 + /* For new volumes, create a new EVMS node and
20195 + * initialize the appropriate fields.
20197 + if ( volume->lv_access & EVMS_LV_NEW ) {
20198 + if ( evms_cs_allocate_logical_node(&new_node) ) {
20201 + MOD_INC_USE_COUNT;
20203 + volume->volume_node = new_node;
20204 + volume->lv_access &= (~EVMS_LV_QUIESCED &
20206 + new_node->hardsector_size =
20207 + group->hard_sect_size;
20208 + new_node->block_size = group->block_size;
20209 + new_node->plugin = &lvm_plugin_header;
20210 + new_node->private = volume;
20211 + memcpy(new_node->name, volume->name, NAME_LEN);
20213 + /* Snapshot volumes should report the
20214 + * size of their original.
20216 + new_node->total_vsectors =
20217 + (volume->lv_access & LV_SNAPSHOT) ?
20218 + volume->snapshot_org->lv_size :
20221 + /* Is the volume read-only? */
20222 + if ( ! (volume->lv_access & LV_WRITE) ) {
20223 + new_node->flags |=
20224 + EVMS_VOLUME_READ_ONLY;
20225 + LOG_DEBUG("LVM volume %s is read-only\n",
20229 + /* Is the volume incomplete? */
20230 + if ( volume->lv_access & EVMS_LV_INCOMPLETE ) {
20231 + new_node->flags |=
20232 + (EVMS_VOLUME_READ_ONLY |
20233 + EVMS_VOLUME_PARTIAL);
20234 + LOG_DEBUG("LVM volume %s is incomplete\n",
20238 + /* Does the volume group contain any partial or
20241 + if ( group->flags & EVMS_VG_PARTIAL_PVS ) {
20242 + new_node->flags |= EVMS_VOLUME_PARTIAL;
20244 + if ( group->flags & EVMS_VG_REMOVABLE_PVS ) {
20245 + new_node->flags |=
20246 + EVMS_DEVICE_REMOVABLE;
20250 + /* Export the node, only if it hasn't been exported
20251 + * during this full EVMS discover.
20253 + if ( ! (volume->lv_access & EVMS_LV_EXPORTED) ) {
20254 + if ( ! evms_cs_add_logical_node_to_list(evms_node_list,
20255 + volume->volume_node) ) {
20256 + LOG_DETAILS("Exporting LVM volume %s\n",
20258 + volume->lv_access |= EVMS_LV_EXPORTED;
20263 + if (final_discover) {
20264 + volume->lv_access &= ~EVMS_LV_EXPORTED;
20268 + /* The group is clean now. */
20269 + group->flags &= ~EVMS_VG_DIRTY;
20278 + * This function runs through the entire lvm data structure, removing
20279 + * all items that are not needed at runtime. Currently, this is just the
20280 + * struct vg_disk structure and the struct pv_disk structure for each PV.
20281 + * Also, any groups that don't contain any volumes are deleted. All of the
20282 + * other volume_group, logical_volume and evms_logical_node structures will
20283 + * be kept around at run-time.
20285 +static int lvm_cleanup(void)
20287 + struct lvm_volume_group * group, * next_group;
20288 + struct lvm_physical_volume * pv_entry;
20290 + for ( group = lvm_group_list; group; group = next_group ) {
20291 + next_group = group->next_group;
20293 + /* Delete groups with no volumes. */
20294 + if (!group->volume_count) {
20295 + LOG_WARNING("Group %s contains no logical volumes. Deleting.\n",
20297 + remove_group_from_list(group);
20298 + deallocate_volume_group(group);
20299 + /* Need to go back to the start of the list,
20300 + * just to be safe. :)
20302 + next_group = lvm_group_list;
20306 + /* Delete data structures that aren't used at runtime. */
20308 + kfree(group->vg);
20309 + group->vg = NULL;
20312 + for ( pv_entry = group->pv_list;
20313 + pv_entry; pv_entry = pv_entry->next) {
20314 + if (pv_entry->pv) {
20315 + kfree(pv_entry->pv);
20316 + pv_entry->pv = NULL;
20318 + if (pv_entry->pe_map) {
20319 + vfree(pv_entry->pe_map);
20320 + pv_entry->pe_map = NULL;
20323 + if (group->lv_array) {
20324 + vfree(group->lv_array);
20325 + group->lv_array = NULL;
20327 + if (group->uuid_list) {
20328 + vfree(group->uuid_list);
20329 + group->uuid_list = NULL;
20338 + * Support for the BMAP ioctl used by LILO to translate filesystem blocks
20339 + * to disk blocks to map kernel images for boot time.
20341 +static int lvm_get_bmap(struct evms_logical_node * node,
20342 + struct evms_get_bmap_pkt * bmap,
20343 + struct evms_logical_node ** pv_node)
20345 + struct lvm_logical_volume * volume = node->private;
20346 + struct lvm_physical_volume * pv_entry;
20347 + u64 pe_start_sector, new_sector = 0, new_size = 0;
20350 + /* No kernel images allowed on snapshot LVs. */
20351 + if ( volume->lv_access & LV_SNAPSHOT ) {
20355 + /* Range check. */
20356 + if ( bmap->rsector >= volume->lv_size ) {
20360 + rc = remap_sector(node, bmap->rsector, 1, &new_sector,
20361 + &new_size, &pe_start_sector, &pv_entry);
20363 + if (rc || !pv_entry || !new_sector) {
20367 + bmap->rsector = new_sector;
20368 + *pv_node = pv_entry->logical_node;
20374 + * lvm_global_proc_read
20376 + * A callback function for the lvm-global proc-fs entry. This will print
20377 + * general info about all LVM VGs, PVs, and LVs.
20379 +static int lvm_global_proc_read(char * page, char ** start, off_t off,
20380 + int count, int * eof, void * data)
20382 + struct lvm_volume_group * group;
20383 + struct lvm_physical_volume * pv_entry;
20384 + struct lvm_logical_volume * volume, * snap;
20385 + int vgs = 0, lvs = 0, pvs = 0;
20388 + PROCPRINT("Enterprise Volume Management System: LVM Plugin\n");
20389 + PROCPRINT("Plugin ID: %x.%x.%x\n",
20390 + GetPluginOEM(lvm_plugin_header.id),
20391 + GetPluginType(lvm_plugin_header.id),
20392 + GetPluginID(lvm_plugin_header.id));
20393 + PROCPRINT("Plugin Version: %d.%d.%d\n",
20394 + lvm_plugin_header.version.major,
20395 + lvm_plugin_header.version.minor,
20396 + lvm_plugin_header.version.patchlevel);
20397 + PROCPRINT("Required EVMS Services Version: %d.%d.%d\n",
20398 + lvm_plugin_header.required_services_version.major,
20399 + lvm_plugin_header.required_services_version.minor,
20400 + lvm_plugin_header.required_services_version.patchlevel);
20402 + /* Count all existing items. */
20403 + for ( group = lvm_group_list; group; group = group->next_group ) {
20404 + lvs += group->volume_count;
20405 + pvs += group->pv_count;
20410 + PROCPRINT("Total: %d VGs %d PVs %d LVs\n", vgs, pvs, lvs);
20412 + /* Print out specifics about each VG. */
20413 + for ( group = lvm_group_list; group; group = group->next_group ) {
20415 + PROCPRINT("VG: %s [%d PV, %d LV]\n",
20416 + group->vg_name, group->pv_count, group->volume_count);
20417 + PROCPRINT("PVs:\n");
20418 + for ( pv_entry = group->pv_list;
20419 + pv_entry; pv_entry = pv_entry->next ) {
20420 + if (pv_entry->logical_node) {
20421 + PROCPRINT("\t%s\t%10Ld KB\n",
20422 + pv_entry->logical_node->name,
20423 + (long long)pv_entry->logical_node->total_vsectors / 2);
20426 + PROCPRINT("LVs:\n");
20427 + for ( i = 1; i <= MAX_LV; i++ ) {
20428 + if (group->volume_list[i]) {
20429 + volume = group->volume_list[i];
20430 + PROCPRINT("\t%s\t%10Ld KB / %5d LEs",
20432 + (long long)volume->lv_size / 2,
20434 + if ( volume->lv_access & LV_SNAPSHOT ) {
20435 + PROCPRINT("\tSnapshot of : ");
20436 + if (volume->snapshot_org) {
20437 + PROCPRINT("%s : ",
20438 + volume->snapshot_org->name);
20440 + PROCPRINT("(unknown) : ");
20442 + PROCPRINT("%ld%% full : ",
20443 + (long)(volume->next_free_chunk) *
20444 + 100 / (long)(volume->lv_size));
20445 + if ( volume->lv_status & LV_ACTIVE ) {
20446 + PROCPRINT("active");
20448 + PROCPRINT("disabled");
20450 + } else if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20451 + PROCPRINT("\tSnapshotted by : ");
20452 + for ( snap = volume->snapshot_next;
20454 + snap = snap->snapshot_next ) {
20455 + PROCPRINT("%s ", snap->name);
20464 + *start = page + off;
20468 + return sz > count ? count : sz;
20472 +/********** Required EVMS Plugin Functions **********/
20478 + * This is the entry point into the LVM discovery process. It is a three
20479 + * phase process. First, the list of nodes are examined for PVs, and the
20480 + * appropriate volume groups are created. Then each volume group is
20481 + * examined to find all available logical volumes. Finally, each LVM
20482 + * logical volume has a new EVMS node created for it, and added to the
20485 +static int lvm_discover(struct evms_logical_node ** evms_node_list)
20489 + MOD_INC_USE_COUNT;
20490 + LOG_EXTRA("Beginning discovery.\n");
20492 + discover_volume_groups(evms_node_list);
20494 + check_volume_groups();
20496 + discover_logical_volumes(FALSE);
20498 + check_logical_volumes(FALSE);
20500 + rc = export_volumes(evms_node_list, FALSE);
20502 + LOG_EXTRA("Discovery complete.\n");
20503 + MOD_DEC_USE_COUNT;
20508 + * lvm_discover_end
20510 + * The discovery process at the region-manager level is now iterative,
20511 + * much like the EVMS feature level. This allows the ability to stack
20512 + * LVM on top of MD, or vice-versa. To accomplish this correctly, and
20513 + * also to accomplish partial volume discovery, a second discover
20514 + * entry point is needed, so EVMS can tell the region managers that
20515 + * discovery is over, and to finish up any discovery that is not yet
20516 + * complete. When this function is called, it should be assumed that
20517 + * the node list has had nothing new added to it since the last call
20518 + * of the regular discover function. Therefore, when this function is
20519 + * called, we do not need to try to discover any additional volume
20520 + * groups. We will, however, look for logical volumes once more. This
20521 + * gives us the ability to export (read-only) volumes that have
20522 + * partially corrupted LE maps due to missing PVs in their VG.
20524 +static int lvm_discover_end(struct evms_logical_node ** evms_node_list)
20528 + MOD_INC_USE_COUNT;
20529 + LOG_EXTRA("Beginning final discovery\n");
20531 + discover_volume_groups(evms_node_list);
20533 + check_volume_groups();
20535 + discover_logical_volumes(TRUE);
20537 + check_logical_volumes(TRUE);
20539 + rc = export_volumes(evms_node_list, TRUE);
20543 + LOG_EXTRA("Final discovery complete.\n");
20544 + MOD_DEC_USE_COUNT;
20549 + * lvm_delete_node
20551 + * This function deletes the in-memory representation of an LVM logical volume.
20553 +static int lvm_delete_node(struct evms_logical_node * logical_node)
20555 + struct lvm_logical_volume * volume = logical_node->private;
20556 + struct lvm_volume_group * group = volume->group;
20558 + LOG_DEBUG("Deleting LVM node %s\n", logical_node->name);
20560 + if ( deallocate_logical_volume(volume) ) {
20564 + /* If we just removed the last volume from this group, the entire group
20565 + * must also be deleted.
20567 + if ( group && group->volume_count == 0 ) {
20568 + remove_group_from_list(group);
20569 + deallocate_volume_group(group);
20572 + /* Free the logical node. */
20573 + evms_cs_deallocate_logical_node(logical_node);
20574 + MOD_DEC_USE_COUNT;
20581 +static void lvm_read(struct evms_logical_node * node,
20582 + struct buffer_head * bh)
20584 + struct lvm_logical_volume * volume = node->private;
20585 + struct lvm_physical_volume * pv_entry;
20586 + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20587 + u64 new_sector, new_size, pe_start_sector;
20589 + /* If this volume is a snapshot, lock the volume, and do
20590 + * the LE-PE translation on its original volume.
20592 + if ( volume->lv_access & LV_SNAPSHOT ) {
20593 + down(&volume->snap_semaphore);
20594 + if (!volume->snapshot_org) {
20597 + node = volume->snapshot_org->volume_node;
20600 + /* Make sure the volume is active and readable. */
20601 + if ( !(volume->lv_access & LV_READ &&
20602 + volume->lv_status & LV_ACTIVE) ) {
20606 + /* Check if I/O goes past end of logical volume. Must use the
20607 + * node, not the volume, so snapshots will work correctly.
20609 + if ( bh->b_rsector + size > node->total_vsectors ) {
20613 + /* Logical-to-Physical remapping. Check for incomplete volumes.
20614 + * Check intermediate boundary conditions as well.
20616 + if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20617 + &new_size, &pe_start_sector, &pv_entry) ||
20618 + !pe_start_sector || !pv_entry ||
20619 + size != new_size ) {
20623 + /* For snapshot volumes, check if this sector's chunk has been
20624 + * remapped. If it has, new_sector and pv_entry will be changed
20625 + * accordingly. If not, they remain the same.
20627 + if ( volume->lv_access & LV_SNAPSHOT ) {
20628 + snapshot_remap_sector(volume, pe_start_sector,
20629 + &new_sector, &pv_entry);
20632 + bh->b_rsector = new_sector;
20633 + R_IO(pv_entry->logical_node, bh);
20636 + /* Unlock the snapshot. */
20637 + if ( volume->lv_access & LV_SNAPSHOT ) {
20638 + up(&volume->snap_semaphore);
20643 + bh->b_end_io(bh, 0);
20650 +static void lvm_write(struct evms_logical_node * node,
20651 + struct buffer_head * bh)
20653 + struct lvm_logical_volume * volume = node->private;
20654 + struct lvm_logical_volume * snap_volume;
20655 + struct lvm_physical_volume * pv_entry;
20656 + u64 size = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
20657 + u64 new_sector, new_size, pe_start_sector;
20659 + /* Make sure the volume is active and writable. */
20660 + if ( !(volume->lv_access & LV_WRITE &&
20661 + volume->lv_status & LV_ACTIVE) ) {
20665 + /* Check if I/O goes past end of logical volume. */
20666 + if ( bh->b_rsector + size > node->total_vsectors ) {
20670 + /* Logical-to-Physical remapping. Check for incomplete volumes.
20671 + * Check intermediate boundary conditions as well.
20673 + if ( remap_sector(node, bh->b_rsector, size, &new_sector,
20674 + &new_size, &pe_start_sector, &pv_entry) ||
20675 + !pe_start_sector || !pv_entry ||
20676 + size != new_size ) {
20680 + /* Copy-on-write for snapshotting. */
20681 + if ( volume->lv_access & LV_SNAPSHOT_ORG ) {
20682 + /* Originals can be snapshotted multiple times. */
20683 + for ( snap_volume = volume->snapshot_next;
20684 + snap_volume; snap_volume = snap_volume->snapshot_next ) {
20685 + if ( snapshot_copy_data(volume, snap_volume,
20686 + pe_start_sector, new_sector,
20693 + bh->b_rsector = new_sector;
20694 + W_IO(pv_entry->logical_node, bh);
20698 + bh->b_end_io(bh, 0);
20705 + * Init_io on a snapshot volume treats it like a regular volume.
20707 +static int lvm_init_io(struct evms_logical_node * node,
20713 + struct lvm_logical_volume * volume = node->private;
20714 + struct lvm_physical_volume * pv_entry;
20715 + u64 pe_start_sector, new_sector, new_size;
20718 + /* Only allow internal writes to snapshots (io_flag==4). Disallow
20719 + * writes to snapshot originals.
20721 + if ( io_flag == WRITE &&
20722 + volume->lv_access & (LV_SNAPSHOT | LV_SNAPSHOT_ORG) ) {
20726 + /* The node for a snapshot reports the size of the original. If a
20727 + * request comes in in that range, just return.
20729 + else if ( volume->lv_access & LV_SNAPSHOT &&
20730 + sect_nr >= volume->lv_size &&
20731 + sect_nr < node->total_vsectors ) {
20732 + if ( io_flag == READ ) {
20733 + memset(buf_addr, 0,
20734 + num_sects << EVMS_VSECTOR_SIZE_SHIFT);
20739 + /* Regular range check. */
20740 + else if ( sect_nr + num_sects > volume->lv_size ) {
20744 + if ( io_flag == 4 ) {
20748 + /* Init IO needs to deal with the possibility of a request that spans
20749 + * PEs or stripes. This is possible because there is no limit on
20750 + * num_sects. To handle this, we loop through remap_sector and
20751 + * INIT_IO until num_sects reaches zero.
20753 + while (num_sects) {
20754 + if ( remap_sector(node, sect_nr, num_sects, &new_sector,
20755 + &new_size, &pe_start_sector, &pv_entry) ) {
20759 + /* If the volume is incomplete, clear the buffer (on a read). */
20760 + if (!pe_start_sector || !pv_entry) {
20761 + if ( io_flag == READ ) {
20762 + memset(buf_addr, 0,
20763 + new_size << EVMS_VSECTOR_SIZE_SHIFT);
20766 + rc = INIT_IO(pv_entry->logical_node, io_flag,
20767 + new_sector, new_size, buf_addr);
20769 + num_sects -= new_size;
20770 + sect_nr += new_size;
20771 + buf_addr = (void *)(((unsigned long) buf_addr) +
20772 + (unsigned long)(new_size << EVMS_VSECTOR_SIZE_SHIFT));
20781 +static int lvm_ioctl(struct evms_logical_node * logical_node,
20782 + struct inode * inode,
20783 + struct file * file,
20784 + unsigned int cmd,
20785 + unsigned long arg)
20787 + struct lvm_logical_volume * volume = logical_node->private;
20790 + LOG_ENTRY_EXIT("Ioctl %d\n", cmd);
20794 + case HDIO_GETGEO:
20796 + /* Fixed geometry for all LVM volumes. */
20797 + unsigned char heads = 64;
20798 + unsigned char sectors = 32;
20801 + struct hd_geometry * hd = (struct hd_geometry *)arg;
20802 + cylinders = logical_node->total_vsectors;
20803 + cylinders = (cylinders / heads) / sectors;
20809 + if ( copy_to_user((char *)(&hd->heads),
20810 + &heads, sizeof(heads)) ||
20811 + copy_to_user((char *)(&hd->sectors),
20812 + &sectors, sizeof(sectors)) ||
20813 + copy_to_user((short *)(&hd->cylinders),
20814 + &cylinders, sizeof(cylinders)) ||
20815 + copy_to_user((long *)(&hd->start),
20816 + &start, sizeof(start)) ) {
20822 + case EVMS_QUIESCE_VOLUME:
20824 + struct evms_quiesce_vol_pkt * tmp =
20825 + (struct evms_quiesce_vol_pkt *)arg;
20826 + if (tmp->command) {
20827 + volume->lv_access |= EVMS_LV_QUIESCED;
20829 + volume->lv_access &= ~EVMS_LV_QUIESCED;
20834 + case EVMS_GET_BMAP:
20836 + struct evms_get_bmap_pkt * bmap =
20837 + (struct evms_get_bmap_pkt *)arg;
20838 + struct evms_logical_node * pv_node;
20840 + rc = lvm_get_bmap(logical_node, bmap, &pv_node);
20842 + rc = IOCTL(pv_node, inode, file, cmd,
20843 + (unsigned long) bmap);
20848 + case EVMS_GET_DISK_LIST:
20849 + case EVMS_CHECK_MEDIA_CHANGE:
20850 + case EVMS_REVALIDATE_DISK:
20851 + case EVMS_OPEN_VOLUME:
20852 + case EVMS_CLOSE_VOLUME:
20853 + case EVMS_CHECK_DEVICE_STATUS:
20855 + /* These six ioctls all need to
20856 + * be broadcast to all PVs.
20858 + struct lvm_volume_group * group = volume->group;
20859 + struct lvm_physical_volume * pv_entry;
20860 + for ( pv_entry = group->pv_list;
20861 + pv_entry; pv_entry = pv_entry->next ) {
20862 + rc |= IOCTL(pv_entry->logical_node, inode,
20869 + /* Currently LVM does not send any ioctl's down to the
20870 + * PVs. Which PV would they go to? What would we do with
20871 + * the return codes?
20880 + * lvm_direct_ioctl
20882 + * This function provides a method for user-space to communicate directly
20883 + * with a plugin in the kernel.
20885 +static int lvm_direct_ioctl(struct inode * inode,
20886 + struct file * file,
20887 + unsigned int cmd,
20888 + unsigned long args)
20890 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
20891 + struct lvm_pv_remove_ioctl pv_remove, * user_pv_remove;
20892 + struct lvm_snapshot_stat_ioctl snap_stats, * user_snap_stats;
20895 + MOD_INC_USE_COUNT;
20897 + user_pkt = (struct evms_plugin_ioctl_pkt *)args;
20899 + /* Copy user's parameters to kernel space. */
20900 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
20901 + MOD_DEC_USE_COUNT;
20905 + /* Make sure this is supposed to be our ioctl. */
20906 + if ( pkt.feature_id != lvm_plugin_header.id ) {
20907 + MOD_DEC_USE_COUNT;
20911 + switch (pkt.feature_command) {
20913 + case EVMS_LVM_PV_REMOVE_IOCTL:
20915 + (struct lvm_pv_remove_ioctl *)pkt.feature_ioctl_data;
20916 + if ( copy_from_user(&pv_remove, user_pv_remove,
20917 + sizeof(pv_remove)) ) {
20921 + rc = remove_pv_from_group(pv_remove.pv_number,
20922 + pv_remove.vg_uuid);
20925 + case EVMS_LVM_SNAPSHOT_STAT_IOCTL:
20926 + user_snap_stats =
20927 + (struct lvm_snapshot_stat_ioctl *)pkt.feature_ioctl_data;
20928 + if ( copy_from_user(&snap_stats, user_snap_stats,
20929 + sizeof(snap_stats)) ) {
20933 + rc = get_snapshot_stats(&snap_stats);
20934 + if ( copy_to_user(user_snap_stats, &snap_stats,
20935 + sizeof(snap_stats)) ) {
20947 + copy_to_user(user_pkt, &pkt, sizeof(pkt));
20948 + MOD_DEC_USE_COUNT;
20955 +int __init lvm_vge_init(void)
20957 + struct proc_dir_entry *pde;
20959 + lvm_group_list = NULL;
20962 + /* Register the global proc-fs entries. */
20963 + pde = evms_cs_get_evms_proc_dir();
20965 + lvm_proc = create_proc_entry(LVM_PROC_NAME, S_IFDIR, pde);
20967 + create_proc_read_entry(LVM_PROC_GLOBAL_NAME, S_IFREG,
20968 + lvm_proc, lvm_global_proc_read,
20973 + /* Register this plugin with EVMS. */
20974 + return evms_cs_register_plugin(&lvm_plugin_header);
20980 +void __exit lvm_vge_exit(void)
20982 + struct lvm_volume_group * group, * next_group;
20983 + struct proc_dir_entry * pde;
20986 + /* If LVM is called for module_exit, that means the reference
20987 + * count must be zero, which means there should be no volumes,
20988 + * and thus no volume groups. But, check anyway and delete
20989 + * any volumes and groups that are still hanging around.
20991 + if (lvm_group_list) {
20992 + LOG_SERIOUS("Called for module_exit, but group list is not empty!\n");
20995 + for ( group = lvm_group_list; group; group = next_group ) {
20996 + next_group = group->next_group;
20998 + LOG_SERIOUS("In module_exit: deleting all volumes from group %s.\n",
21001 + for ( i = 1; i <= MAX_LV; i++ ) {
21002 + if (group->volume_list[i]) {
21003 + lvm_delete_node(group->volume_list[i]->volume_node);
21008 + /* Unregister the proc-fs entries. */
21009 + pde = evms_cs_get_evms_proc_dir();
21011 + remove_proc_entry(LVM_PROC_GLOBAL_NAME, lvm_proc);
21012 + remove_proc_entry(LVM_PROC_NAME, pde);
21015 + /* Unregister this plugin from EVMS. */
21016 + evms_cs_unregister_plugin(&lvm_plugin_header);
21019 +module_init(lvm_vge_init);
21020 +module_exit(lvm_vge_exit);
21021 +#ifdef MODULE_LICENSE
21022 +MODULE_LICENSE("GPL");
21025 diff -Naur linux-2002-09-30/drivers/evms/md_core.c evms-2002-09-30/drivers/evms/md_core.c
21026 --- linux-2002-09-30/drivers/evms/md_core.c Wed Dec 31 18:00:00 1969
21027 +++ evms-2002-09-30/drivers/evms/md_core.c Sun Sep 29 23:25:48 2002
21030 + * Copyright (c) International Business Machines Corp., 2000
21032 + * This program is free software; you can redistribute it and/or modify
21033 + * it under the terms of the GNU General Public License as published by
21034 + * the Free Software Foundation; either version 2 of the License, or
21035 + * (at your option) any later version.
21037 + * This program is distributed in the hope that it will be useful,
21038 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
21039 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
21040 + * the GNU General Public License for more details.
21042 + * You should have received a copy of the GNU General Public License
21043 + * along with this program; if not, write to the Free Software
21044 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21047 + * linux/drivers/evms/md_core.c
21049 + * EVMS Linux MD Region Manager
21054 +#include <linux/module.h>
21055 +#include <linux/kmod.h>
21056 +#include <linux/kernel.h>
21057 +#include <linux/config.h>
21058 +#include <linux/genhd.h>
21059 +#include <linux/string.h>
21060 +#include <linux/blk.h>
21061 +#include <linux/init.h>
21062 +#include <linux/slab.h>
21063 +#include <linux/vmalloc.h>
21064 +#include <linux/evms/evms.h>
21065 +#include <linux/evms/evms_md.h>
21066 +#include <linux/sysctl.h>
21067 +#include <asm/system.h>
21068 +#include <asm/uaccess.h>
21070 +#define LOG_PREFIX "md core: "
21073 + * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
21074 + * is 100 KB/sec, so the extra system load does not show up that much.
21075 + * Increase it if you want to have more _guaranteed_ speed. Note that
21076 + * the RAID driver will use the maximum available bandwidth if the IO
21077 + * subsystem is idle. There is also an 'absolute maximum' reconstruction
21078 + * speed limit - in case reconstruction slows down your system despite
21079 + * idle IO detection.
21081 + * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
21084 +static MD_LIST_HEAD(all_raid_disks);
21085 +static MD_LIST_HEAD(pending_raid_disks);
21087 +static int sysctl_speed_limit_min = 100;
21088 +static int sysctl_speed_limit_max = 100000;
21091 +static mdk_personality_t *pers[MAX_PERSONALITY];
21093 +static int md_blocksizes[MAX_MD_DEVS];
21094 +static int md_hardsect_sizes[MAX_MD_DEVS];
21095 +int evms_md_size[MAX_MD_DEVS];
21096 +static struct evms_thread *evms_md_recovery_thread = NULL;
21099 + * Enables iteration over all existing md arrays
21101 +static LIST_HEAD(all_mddevs);
21102 +static LIST_HEAD(incomplete_mddevs);
21103 +static LIST_HEAD(running_mddevs);
21106 + * The mapping between kdev and mddev is not necessarily a simple
21107 + * one! Eg. HSM uses several sub-devices to implement Logical
21108 + * Volumes. All these sub-devices map to the same mddev.
21110 +struct dev_mapping evms_mddev_map[MAX_MD_DEVS];
21113 +/* Support functions for discovery */
21114 +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node);
21115 +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node);
21116 +static int evms_md_import_device (struct evms_logical_node **discover_list,
21117 + struct evms_logical_node *node);
21118 +static void evms_md_autostart_arrays(struct evms_logical_node **discover_list);
21119 +static void evms_md_run_devices (struct evms_logical_node **discover_list);
21120 +static int evms_md_run_array (struct evms_logical_node ** discover_list,
21122 +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list,
21124 +static int evms_md_create_logical_node(struct evms_logical_node **discover_list,
21125 + mddev_t *mddev, uint flags);
21126 +static int evms_md_read_disk_sb (mdk_rdev_t * rdev);
21127 +static int evms_md_analyze_sbs (mddev_t * mddev);
21128 +static mddev_t * alloc_mddev (kdev_t dev);
21129 +static void free_mddev(mddev_t * mddev);
21130 +static void evms_md_create_recovery_thread(void);
21131 +static void evms_md_destroy_recovery_thread(void);
21132 +static int do_md_run (mddev_t * mddev);
21133 +static int do_md_stop (mddev_t * mddev, int ro);
21135 +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node);
21136 +static void kick_rdev_from_array (mdk_rdev_t * rdev);
21137 +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev);
21138 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb);
21140 +/* Plugin API prototypes */
21141 +static int md_discover( struct evms_logical_node ** discover_list );
21142 +static int md_end_discover( struct evms_logical_node ** discover_list );
21143 +static int md_delete( struct evms_logical_node * node);
21144 +static void md_read( struct evms_logical_node * node,
21145 + struct buffer_head * bh);
21146 +static void md_write( struct evms_logical_node * node,
21147 + struct buffer_head * bh);
21148 +static int md_sync_io( struct evms_logical_node *node,
21153 +static int md_ioctl( struct evms_logical_node *node,
21154 + struct inode *inode,
21155 + struct file *file,
21156 + unsigned int cmd,
21157 + unsigned long arg);
21158 +static int md_ioctl_cmd_broadcast(
21159 + struct evms_logical_node *node,
21160 + struct inode *inode,
21161 + struct file *file,
21162 + unsigned long cmd,
21163 + unsigned long arg);
21165 +static int md_direct_ioctl(
21166 + struct inode *inode,
21167 + struct file *file,
21168 + unsigned int cmd,
21169 + unsigned long arg);
21171 +/* global MD data structures */
21172 +static struct evms_plugin_fops md_fops = {
21173 + .discover = md_discover,
21174 + .end_discover = md_end_discover,
21175 + .delete = md_delete,
21177 + .write = md_write,
21178 + .init_io = md_sync_io,
21179 + .ioctl = md_ioctl,
21180 + .direct_ioctl = md_direct_ioctl
21183 +static struct evms_plugin_header md_plugin_header = {
21184 + .id = SetPluginID(IBM_OEM_ID,
21185 + EVMS_REGION_MANAGER,
21188 + .major = EVMS_MD_MAJOR_VERSION,
21189 + .minor = EVMS_MD_MINOR_VERSION,
21190 + .patchlevel = EVMS_MD_PATCHLEVEL_VERSION
21192 + .required_services_version = {
21193 + .major = EVMS_MD_COMMON_SERVICES_MAJOR,
21194 + .minor = EVMS_MD_COMMON_SERVICES_MINOR,
21195 + .patchlevel = EVMS_MD_COMMON_SERVICES_PATCHLEVEL
21200 +/* global variables */
21201 +static int exported_nodes; /* total # of exported devices
21202 + * produced during this discovery.
21204 +static struct evms_logical_node **cur_discover_list = NULL;
21206 +/**********************************************************/
21207 +/* SYSCTL - EVMS/RAID folder */
21208 +/**********************************************************/
21210 +#ifdef CONFIG_PROC_FS
21211 +static struct ctl_table_header *md_table_header;
21213 +static ctl_table md_table[] = {
21214 + {DEV_EVMS_MD_SPEED_LIMIT_MIN, "speed_limit_min",
21215 + &sysctl_speed_limit_min, sizeof(int), 0644, NULL, &proc_dointvec},
21216 + {DEV_EVMS_MD_SPEED_LIMIT_MAX, "speed_limit_max",
21217 + &sysctl_speed_limit_max, sizeof(int), 0644, NULL, &proc_dointvec},
21221 +static ctl_table md_dir_table[] = {
21222 + {DEV_EVMS_MD, "md", NULL, 0, 0555, md_table},
21226 +static ctl_table evms_dir_table[] = {
21227 + {DEV_EVMS, "evms", NULL, 0, 0555, md_dir_table},
21231 +static ctl_table dev_dir_table[] = {
21232 + {CTL_DEV, "dev", NULL, 0, 0555, evms_dir_table},
21236 +/********** Required EVMS Plugin Functions **********/
21239 + * Function: md_discover
21240 + * We should only export complete MD device nodes
21242 +static int md_discover( struct evms_logical_node ** discover_list )
21244 + MOD_INC_USE_COUNT;
21245 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21247 + /* initialize global variable */
21248 + exported_nodes = 0;
21249 + cur_discover_list = discover_list;
21250 + evms_md_autostart_arrays(discover_list);
21252 + LOG_ENTRY_EXIT("%s: EXIT (exported nodes: %d)\n", __FUNCTION__,exported_nodes);
21253 + cur_discover_list = NULL;
21254 + MOD_DEC_USE_COUNT;
21255 + return(exported_nodes);
21258 +static mddev_t * evms_md_find_incomplete_array(int level)
21261 + struct list_head *tmp,*tmp2;
21262 + mdk_rdev_t *rdev;
21264 + ITERATE_INCOMPLETE_MDDEV(mddev,tmp) {
21265 + ITERATE_RDEV(mddev, rdev, tmp2) {
21266 + if (rdev->sb && rdev->sb->level == level)
21274 + * Function: md_end_discover
21276 +static int md_end_discover( struct evms_logical_node ** discover_list )
21279 + struct list_head *tmp;
21280 + mdk_rdev_t *rdev;
21282 + struct evms_logical_node *node;
21283 + int done = FALSE;
21285 + MOD_INC_USE_COUNT;
21286 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
21287 + rc = md_discover(discover_list);
21291 + if ( (mddev = evms_md_find_incomplete_array(5)) != NULL) {
21292 + evms_md_run_incomplete_array(discover_list, mddev);
21296 + if ( (mddev = evms_md_find_incomplete_array(1)) != NULL) {
21297 + evms_md_run_incomplete_array(discover_list, mddev);
21301 + if ( (mddev = evms_md_find_incomplete_array(0)) != NULL) {
21302 + evms_md_run_incomplete_array(discover_list, mddev);
21306 + if ( (mddev = evms_md_find_incomplete_array(-1)) != NULL) {
21307 + evms_md_run_incomplete_array(discover_list, mddev);
21316 + * At this point, delete all mddevs which did not start.
21318 + ITERATE_MDDEV(mddev,tmp) {
21319 + if (mddev->pers == NULL) {
21320 + LOG_WARNING("%s: deleting md%d\n", __FUNCTION__, mdidx(mddev));
21321 + free_mddev(mddev);
21327 + * At this point, delete all rdevs which do not belong to any of discovered MD arrays.
21329 + ITERATE_RDEV_ALL(rdev, tmp) {
21330 + if (!rdev->mddev) {
21331 + node = rdev->node;
21333 + if (node->plugin->id == md_plugin_header.id)
21334 + evms_md_export_rdev(rdev, FALSE);
21336 + evms_md_export_rdev(rdev, TRUE);
21341 + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
21342 + MOD_DEC_USE_COUNT;
21348 + * Function: md_delete_node
21350 +static int md_delete( struct evms_logical_node * node)
21352 + struct evms_md *evms_md;
21355 + evms_md = node->private;
21356 + mddev = evms_md->mddev;
21357 + LOG_DEFAULT("md_delete() [%s]\n", evms_md_partition_name(node));
21360 + do_md_stop(mddev,0);
21362 + if (evms_md->instance_plugin_hdr.fops)
21363 + kfree(evms_md->instance_plugin_hdr.fops);
21367 + evms_cs_deallocate_logical_node(node);
21373 + * Function: md_read
21375 +static void md_read( struct evms_logical_node * node,
21376 + struct buffer_head * bh)
21378 + struct evms_md *evms_md;
21381 + evms_md = node->private;
21382 + mddev = evms_md->mddev;
21383 + if (evms_md_check_boundary(node, bh)) return;
21384 + if (mddev && mddev->pers)
21385 + mddev->pers->read(node, bh);
21390 + * Function: md_write
21392 +static void md_write( struct evms_logical_node * node,
21393 + struct buffer_head * bh)
21395 + struct evms_md *evms_md;
21398 + evms_md = node->private;
21399 + mddev = evms_md->mddev;
21400 + if (evms_md_check_boundary(node, bh)) return;
21402 + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21403 + bh->b_end_io(bh, 0);
21406 + if (mddev && mddev->pers)
21407 + mddev->pers->write(node, bh);
21411 + * Function: md_sync_io
21413 +static int md_sync_io(
21414 + struct evms_logical_node *node,
21420 + struct evms_md *evms_md;
21424 + evms_md = node->private;
21425 + mddev = evms_md->mddev;
21427 + if (sect_nr + num_sects > node->total_vsectors) {
21428 + LOG_ERROR("%s: attempt to %s beyond MD device(%s) boundary("PFU64") with sect_nr("PFU64") and num_sects("PFU64")\n",
21430 + rw ? "WRITE" : "READ",
21432 + node->total_vsectors,
21433 + sect_nr,num_sects);
21437 + if ((mddev->ro) && (rw != READ)) {
21438 + LOG_ERROR("%s: read-only is set for [%s]\n", __FUNCTION__, node->name);
21442 + if (!rc && mddev && mddev->pers) {
21444 + * Check if the personality can handle synchronous I/O,
21445 + * otherwise use the generic function.
21447 + if (mddev->pers->sync_io)
21448 + rc = mddev->pers->sync_io(mddev, rw, sect_nr, num_sects, buf_addr);
21450 + rc = evms_md_sync_io(node, rw, sect_nr, num_sects, buf_addr);
21457 + * md_end_sync_request - End IO handler for synchronous I/O functions
21459 +static void md_end_sync_request(struct buffer_head *bh, int uptodate)
21461 + struct evms_md_sync_cb * cb = (struct evms_md_sync_cb *) bh->b_private;
21465 + /* we are done with the bh */
21466 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
21468 + if (atomic_dec_and_test(&cb->io_count)) {
21469 + if (waitqueue_active(&cb->wait))
21470 + wake_up(&cb->wait);
21475 + * md_sync_request_submit_bh - submit a page-size bh
21476 + * @node - target MD node
21477 + * @bh - pointer to the buffer head
21478 + * @sector - the sector number
21479 + * @data - pointer to buffer
21480 + * @rw - READ/WRITE
21481 + * @cb - MD synchronous I/O control block
21483 +static inline void md_sync_request_submit_bh(
21484 + struct evms_logical_node *node,
21485 + struct buffer_head *bh,
21486 + unsigned long sector,
21489 + struct evms_md_sync_cb *cb)
21492 + bh->b_this_page = (struct buffer_head *)1;
21493 + bh->b_rsector = sector;
21494 + bh->b_size = PAGE_SIZE;
21496 + set_bit(BH_Dirty, &bh->b_state);
21497 + set_bit(BH_Lock, &bh->b_state);
21498 + set_bit(BH_Req, &bh->b_state);
21499 + set_bit(BH_Mapped, &bh->b_state);
21500 + atomic_set(&bh->b_count, 1);
21501 + bh->b_data = data;
21502 + bh->b_page = virt_to_page(data);
21503 + bh->b_list = BUF_LOCKED;
21504 + bh->b_end_io = md_end_sync_request;
21505 + bh->b_private = cb;
21506 + atomic_inc(&cb->io_count);
21514 + * evms_md_allocate_bh
21516 + * Note that this function will not return unless we got a free bh
21518 +static inline struct buffer_head *evms_md_allocate_bh(void)
21520 + struct buffer_head *bh;
21522 + while ((bh = evms_cs_allocate_from_pool(evms_bh_pool, FALSE)) == NULL)
21523 + schedule(); /* just yield for a someone to deallocate a bh */
21524 + init_waitqueue_head(&bh->b_wait);
21525 + bh->b_count = (atomic_t)ATOMIC_INIT(0);
21530 + * md_partial_sync_io -
21531 + * This function handles synchronous I/O when sector is not page aligned
21532 + * @node - evms node for the MD array
21533 + * @rw - READ/WRITE
21534 + * @sector - the sector
21535 + * @nsects - on input, the total sectors for the request
21536 + * @nsects - on output, number of sectors completed
21537 + * @data - data buffer
21539 +int evms_md_partial_sync_io(
21540 + struct evms_logical_node *node,
21547 + u32 offset, size;
21548 + struct buffer_head *bh;
21549 + struct evms_md_sync_cb cb;
21552 + size = (u32)(*nsects << EVMS_VSECTOR_SIZE_SHIFT);
21554 + /* calculate byte offset */
21555 + offset = (u32)((sector & (EVMS_MD_SECTS_PER_PAGE-1)) << EVMS_VSECTOR_SIZE_SHIFT);
21556 + if (!offset && (*nsects >= EVMS_MD_SECTS_PER_PAGE)) {
21558 + return 0; /* Nothing to do */
21564 + page = kmalloc(PAGE_SIZE, GFP_KERNEL);
21566 + LOG_ERROR("%s: no memory!\n", __FUNCTION__);
21570 + bh = evms_md_allocate_bh();
21573 + memset(&cb, 0, sizeof(cb));
21574 + init_waitqueue_head(&cb.wait);
21575 + cb.io_count = (atomic_t)ATOMIC_INIT(0);
21576 + md_sync_request_submit_bh(
21578 + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21579 + page, READ, &cb);
21580 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21585 + size = (size <= (PAGE_SIZE - offset)) ? size : (PAGE_SIZE - offset);
21589 + /* copy data and return */
21590 + memcpy(data, page+offset, size);
21593 + /* copy data and then write */
21594 + memcpy(page+offset, data, size);
21596 + bh = evms_md_allocate_bh();
21598 + md_sync_request_submit_bh(
21600 + (unsigned long)(sector & EVMS_MD_SECTS_PER_PAGE_MASK),
21601 + page, WRITE, &cb);
21602 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21614 + *nsects = (u64)(size >> EVMS_VSECTOR_SIZE_SHIFT);
21621 + * evms_md_sync_io - This function handles synchronous I/O
21623 +int evms_md_sync_io(
21624 + struct evms_logical_node *node,
21627 + u64 total_nr_sects,
21631 + u64 total_nr_pages, size;
21633 + struct buffer_head *bh;
21634 + struct evms_md_sync_cb cb;
21636 + if (sector % EVMS_MD_SECTS_PER_PAGE) {
21637 + nsects = total_nr_sects;
21638 + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21640 + total_nr_sects -= nsects;
21641 + sector += nsects;
21642 + data += (nsects << EVMS_VSECTOR_SIZE_SHIFT);
21643 + if (total_nr_sects == 0)
21650 + total_nr_pages = total_nr_sects / EVMS_MD_SECTS_PER_PAGE;
21651 + size = total_nr_sects << EVMS_VSECTOR_SIZE_SHIFT;
21653 + memset(&cb, 0, sizeof(cb));
21654 + init_waitqueue_head(&cb.wait);
21655 + cb.io_count = (atomic_t)ATOMIC_INIT(0);
21657 + while (!rc && total_nr_pages) {
21659 + bh = evms_md_allocate_bh();
21661 + md_sync_request_submit_bh(node, bh,(unsigned long)sector, data, rw, &cb);
21663 + sector += EVMS_MD_SECTS_PER_PAGE;
21664 + size -= PAGE_SIZE;
21665 + total_nr_pages--;
21666 + data += PAGE_SIZE;
21669 + wait_disk_event(cb.wait, !atomic_read(&cb.io_count));
21673 + if (!rc && size) {
21674 + nsects = size >> EVMS_VSECTOR_SIZE_SHIFT;
21675 + rc = evms_md_partial_sync_io(node, rw, sector, &nsects, data);
21682 + * Function: md_ioctl
21684 +static int md_ioctl(
21685 + struct evms_logical_node * node,
21686 + struct inode * inode,
21687 + struct file * file,
21688 + unsigned int cmd,
21689 + unsigned long arg)
21691 + struct evms_md * evms_md = node->private;
21695 + if ((!inode) || (!evms_md) )
21701 + * We have a problem here : there is no easy way to give a CHS
21702 + * virtual geometry. We currently pretend that we have a 2 heads
21703 + * 4 sectors (with a BIG number of cylinders...). This drives
21704 + * dosfs just mad... ;-)
21707 + case HDIO_GETGEO:
21709 + struct hd_geometry hdgeo;
21711 + hdgeo.sectors = 4;
21712 + hdgeo.cylinders = ((unsigned int)node->total_vsectors) /
21713 + hdgeo.heads / hdgeo.sectors;
21715 + if (copy_to_user((int *)arg,
21721 + case EVMS_QUIESCE_VOLUME:
21722 + case EVMS_GET_DISK_LIST:
21723 + case EVMS_CHECK_MEDIA_CHANGE:
21724 + case EVMS_REVALIDATE_DISK:
21725 + case EVMS_OPEN_VOLUME:
21726 + case EVMS_CLOSE_VOLUME:
21727 + case EVMS_CHECK_DEVICE_STATUS:
21728 + rc = md_ioctl_cmd_broadcast(
21729 + node, inode, file, cmd, arg);
21731 + case EVMS_PLUGIN_IOCTL:
21732 + rc = md_direct_ioctl(
21733 + inode, file, cmd, arg);
21736 + mddev = evms_md->mddev;
21737 + if (mddev == NULL) {
21739 + } else if (mddev->pers->evms_ioctl == NULL) {
21742 + rc = mddev->pers->evms_ioctl(mddev, inode, file, cmd, arg);
21749 +static int md_ioctl_cmd_broadcast(
21750 + struct evms_logical_node *node,
21751 + struct inode *inode,
21752 + struct file *file,
21753 + unsigned long cmd,
21754 + unsigned long arg)
21757 + struct evms_md *evms_md;
21759 + struct list_head *tmp;
21760 + mdk_rdev_t *rdev;
21762 + evms_md = node->private;
21763 + mddev = evms_md->mddev;
21765 + /* broadcast this cmd to all children */
21766 + ITERATE_RDEV(mddev,rdev,tmp) {
21767 + if (!rdev->mddev) {
21771 + if (!rdev->virtual_spare) {
21772 + rc |= IOCTL(rdev->node, inode, file, cmd, arg);
21779 +static int evms_md_add_virtual_spare (mddev_t *mddev, kdev_t dev)
21781 + mdk_rdev_t *rdev;
21782 + mdp_disk_t *disk = NULL;
21785 + if (evms_md_find_rdev(mddev,dev))
21788 + LOG_ENTRY_EXIT("%s ENTRY\n", __FUNCTION__);
21789 + if ((rdev = kmalloc(sizeof(*rdev),GFP_KERNEL)) == NULL)
21792 + memset(rdev, 0, sizeof(*rdev));
21794 + for (i = mddev->sb->raid_disks; i < MD_SB_DISKS; i++) {
21795 + disk = mddev->sb->disks + i;
21796 + if (!disk->major && !disk->minor)
21798 + if (disk_removed(disk))
21801 + if (i == MD_SB_DISKS) {
21802 + LOG_WARNING("%s : [md%d]can not hot-add to full array!\n", __FUNCTION__, mdidx(mddev));
21807 + if (disk_removed(disk)) {
21811 + if (disk->number != i) {
21817 + disk->number = i;
21820 + disk->raid_disk = disk->number;
21821 + disk->major = MAJOR(dev);
21822 + disk->minor = MINOR(dev);
21824 + mark_disk_spare(disk);
21826 + rdev->mddev = mddev;
21828 + rdev->desc_nr = disk->number;
21829 + rdev->virtual_spare = 1;
21831 + /* bind rdev to mddev array */
21832 + list_add(&rdev->all, &all_raid_disks);
21833 + list_add(&rdev->same_set, &mddev->disks);
21834 + MD_INIT_LIST_HEAD(&rdev->pending);
21836 + mddev->sb->nr_disks++;
21837 + mddev->sb->spare_disks++;
21838 + mddev->sb->working_disks++;
21841 + mddev->sb_dirty = 1;
21843 + evms_md_update_sb(mddev);
21848 +static int evms_md_remove_disk(mddev_t *mddev, kdev_t dev)
21850 + mdk_rdev_t *rdev = NULL;
21851 + mdp_disk_t *disk;
21854 + disk = evms_md_find_disk(mddev,dev);
21858 + rdev = evms_md_find_rdev(mddev,dev);
21860 + if (rdev && !rdev->faulty) {
21862 + * The disk is active in the array,
21863 + * must ask the personality to do it
21865 + if (mddev->pers && mddev->pers->diskop) {
21866 + /* Assume spare, try to remove it first. */
21867 + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_SPARE);
21869 + rc = mddev->pers->diskop(mddev, &disk, DISKOP_HOT_REMOVE_DISK);
21875 + remove_descriptor(disk,mddev->sb);
21877 + kick_rdev_from_array(rdev);
21878 + mddev->sb_dirty = 1;
21879 + evms_md_update_sb(mddev);
21887 + * Function: md_direct_ioctl
21889 + * This function provides a method for user-space to communicate directly
21890 + * with a plugin in the kernel.
21892 +static int md_direct_ioctl(
21893 + struct inode * inode,
21894 + struct file * file,
21895 + unsigned int cmd,
21896 + unsigned long args )
21898 + struct evms_plugin_ioctl_pkt argument;
21900 + mddev_t *mddev = NULL;
21901 + struct evms_md_ioctl ioctl_arg;
21902 + struct evms_md_kdev device;
21903 + struct evms_md_array_info array_info, *usr_array_info;
21906 + MOD_INC_USE_COUNT;
21908 + // Copy user's parameters to kernel space
21909 + if ( copy_from_user(&argument, (struct evms_plugin_ioctl_pkt*)args, sizeof(argument)) ) {
21910 + MOD_DEC_USE_COUNT;
21914 + // Make sure this is supposed to be our ioctl.
21915 + if ( argument.feature_id != md_plugin_header.id ) {
21916 + MOD_DEC_USE_COUNT;
21920 + // Copy user's md ioctl parameters to kernel space
21921 + if ( copy_from_user(&ioctl_arg,
21922 + (struct evms_md_ioctl*)argument.feature_ioctl_data,
21923 + sizeof(ioctl_arg)) )
21926 + if (ioctl_arg.mddev_idx < MAX_MD_DEVS) {
21927 + md_kdev = MKDEV(MD_MAJOR, ioctl_arg.mddev_idx);
21928 + mddev = kdev_to_mddev(md_kdev);
21929 + if (mddev == NULL)
21936 + switch(argument.feature_command) {
21937 + case EVMS_MD_PERS_IOCTL_CMD:
21938 + if (mddev->pers->md_pers_ioctl == NULL) {
21939 + MOD_DEC_USE_COUNT;
21942 + rc = mddev->pers->md_pers_ioctl(mddev,
21945 + copy_to_user((struct evms_md_ioctl*)argument.feature_ioctl_data,
21947 + sizeof(ioctl_arg));
21950 + case EVMS_MD_ADD:
21951 + if ( copy_from_user(&device,
21952 + (struct evms_md_kdev *)ioctl_arg.arg,
21953 + sizeof(device)) )
21956 + rc = evms_md_add_virtual_spare(mddev,MKDEV(device.major, device.minor));
21959 + case EVMS_MD_REMOVE:
21960 + if ( copy_from_user(&device,
21961 + (struct evms_md_kdev *)ioctl_arg.arg,
21962 + sizeof(device)) )
21965 + rc = evms_md_remove_disk(mddev,MKDEV(device.major, device.minor));
21968 + case EVMS_MD_ACTIVATE:
21972 + case EVMS_MD_DEACTIVATE:
21976 + case EVMS_MD_GET_ARRAY_INFO:
21978 + usr_array_info = (struct evms_md_array_info *)ioctl_arg.arg;
21979 + if ( copy_from_user(&array_info, usr_array_info,
21980 + sizeof(array_info)) )
21983 + array_info.state = 0;
21984 + if (mddev->curr_resync)
21985 + array_info.state |= EVMS_MD_ARRAY_SYNCING;
21986 + copy_to_user(&usr_array_info->state, &array_info.state,
21987 + sizeof(usr_array_info->state));
21988 + if (copy_to_user(array_info.sb, mddev->sb,
21989 + sizeof(mdp_super_t)))
21999 + argument.status = rc;
22000 + copy_to_user((struct evms_plugin_ioctl_pkt*)args, &argument, sizeof(argument));
22001 + MOD_DEC_USE_COUNT;
22008 +void evms_md_add_mddev_mapping (mddev_t * mddev, kdev_t dev, void *data)
22010 + unsigned int minor = MINOR(dev);
22012 + if (MAJOR(dev) != MD_MAJOR) {
22016 + if (evms_mddev_map[minor].mddev != NULL) {
22020 + evms_mddev_map[minor].mddev = mddev;
22021 + evms_mddev_map[minor].data = data;
22024 +void evms_md_del_mddev_mapping (mddev_t * mddev, kdev_t dev)
22026 + unsigned int minor = MINOR(dev);
22028 + if (MAJOR(dev) != MD_MAJOR) {
22032 + if (evms_mddev_map[minor].mddev != mddev) {
22036 + evms_mddev_map[minor].mddev = NULL;
22037 + evms_mddev_map[minor].data = NULL;
22040 +static mddev_t * alloc_mddev (kdev_t dev)
22044 + if (MAJOR(dev) != MD_MAJOR) {
22048 + mddev = (mddev_t *) kmalloc(sizeof(*mddev), GFP_KERNEL);
22052 + memset(mddev, 0, sizeof(*mddev));
22054 + mddev->__minor = MINOR(dev);
22055 + init_MUTEX(&mddev->reconfig_sem);
22056 + init_MUTEX(&mddev->recovery_sem);
22057 + init_MUTEX(&mddev->resync_sem);
22058 + INIT_LIST_HEAD(&mddev->disks);
22059 + INIT_LIST_HEAD(&mddev->all_mddevs);
22060 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22061 + INIT_LIST_HEAD(&mddev->running_mddevs);
22062 + mddev->active = (atomic_t)ATOMIC_INIT(0);
22063 + mddev->recovery_active = (atomic_t)ATOMIC_INIT(0);
22066 + * The 'base' mddev is the one with data NULL.
22067 + * personalities can create additional mddevs
22070 + evms_md_add_mddev_mapping(mddev, dev, 0);
22071 + list_add(&mddev->all_mddevs, &all_mddevs);
22073 + MOD_INC_USE_COUNT;
22074 + evms_md_create_recovery_thread();
22079 +mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr)
22081 + mdk_rdev_t * rdev;
22082 + struct list_head *tmp;
22084 + ITERATE_RDEV(mddev,rdev,tmp) {
22085 + if (rdev->desc_nr == nr)
22092 +mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev)
22094 + struct list_head *tmp;
22095 + mdk_rdev_t *rdev;
22097 + ITERATE_RDEV(mddev,rdev,tmp) {
22098 + if (rdev->dev == dev)
22104 +mdk_rdev_t * evms_md_find_rdev_from_node(mddev_t * mddev, struct evms_logical_node * node)
22106 + struct list_head *tmp;
22107 + mdk_rdev_t *rdev;
22109 + ITERATE_RDEV(mddev,rdev,tmp) {
22110 + if (rdev->node == node)
22116 +static MD_LIST_HEAD(device_names);
22118 +static char * org_partition_name (kdev_t dev)
22120 + struct gendisk *hd;
22121 + static char nomem [] = "<nomem>";
22122 + dev_name_t *dname;
22123 + struct list_head *tmp = device_names.next;
22125 + while (tmp != &device_names) {
22126 + dname = list_entry(tmp, dev_name_t, list);
22127 + if (dname->dev == dev)
22128 + return dname->name;
22132 + dname = (dev_name_t *) kmalloc(sizeof(*dname), GFP_KERNEL);
22137 + * ok, add this new device name to the list
22139 + hd = get_gendisk (dev);
22140 + dname->name = NULL;
22142 + dname->name = disk_name (hd, MINOR(dev), dname->namebuf);
22143 + if (!dname->name) {
22144 + sprintf (dname->namebuf, "[dev %s]", kdevname(dev));
22145 + dname->name = dname->namebuf;
22148 + dname->dev = dev;
22149 + MD_INIT_LIST_HEAD(&dname->list);
22150 + list_add(&dname->list, &device_names);
22152 + return dname->name;
22156 +#define EVMS_MD_NULL_PARTITION_NAME "<EVMS_NODE_NO_NAME>"
22157 +char * evms_md_partition_name (struct evms_logical_node *node)
22159 + if (node && node->name)
22160 + return node->name;
22162 + return EVMS_MD_NULL_PARTITION_NAME;
22165 +static char * get_partition_name (mdk_rdev_t *rdev)
22168 + return evms_md_partition_name(rdev->node);
22170 + return org_partition_name(rdev->dev);
22174 + * Function: evms_md_calc_dev_sboffset
22175 + * return the LSN for md super block.
22177 +static u64 evms_md_calc_dev_sboffset (struct evms_logical_node *node,mddev_t *mddev, int persistent)
22181 + size = node->total_vsectors;
22182 + if (persistent) {
22183 + size = MD_NEW_SIZE_SECTORS(size);
22185 + return size; /* size in sectors */
22189 + * Function: evms_md_calc_dev_size
22190 + * return data size (in blocks) for an "extended" device.
22192 +static unsigned long evms_md_calc_dev_size (struct evms_logical_node *node,
22196 + unsigned long size;
22197 + u64 size_in_sectors;
22199 + size_in_sectors = evms_md_calc_dev_sboffset(node, mddev, persistent);
22200 + size = size_in_sectors >> 1;
22201 + if (!mddev->sb) {
22205 + if (mddev->sb->chunk_size)
22206 + size &= ~(mddev->sb->chunk_size/1024 - 1);
22210 +static unsigned int zoned_raid_size (mddev_t *mddev)
22212 + unsigned int mask;
22213 + mdk_rdev_t * rdev;
22214 + struct list_head *tmp;
22216 + if (!mddev->sb) {
22221 + * do size and offset calculations.
22223 + mask = ~(mddev->sb->chunk_size/1024 - 1);
22225 + ITERATE_RDEV(mddev,rdev,tmp) {
22226 + rdev->size &= mask;
22227 + evms_md_size[mdidx(mddev)] += rdev->size;
22233 + * We check whether all devices are numbered from 0 to nb_dev-1. The
22234 + * order is guaranteed even after device name changes.
22236 + * Some personalities (raid0, linear) use this. Personalities that
22237 + * provide data have to be able to deal with loss of individual
22238 + * disks, so they do their checking themselves.
22240 +int evms_md_check_ordering (mddev_t *mddev)
22243 + mdk_rdev_t *rdev;
22244 + struct list_head *tmp;
22247 + * First, all devices must be fully functional
22249 + ITERATE_RDEV(mddev,rdev,tmp) {
22250 + if (rdev->faulty) {
22251 + LOG_ERROR("evms_md_check_ordering() md%d's device %s faulty, aborting.\n",
22252 + mdidx(mddev), get_partition_name(rdev));
22258 + ITERATE_RDEV(mddev,rdev,tmp) {
22261 + if (c != mddev->nb_dev) {
22265 + if (mddev->nb_dev != mddev->sb->raid_disks) {
22266 + LOG_ERROR("%s: [md%d] array needs %d disks, has %d, aborting.\n",
22267 + __FUNCTION__, mdidx(mddev), mddev->sb->raid_disks, mddev->nb_dev);
22271 + * Now the numbering check
22273 + for (i = 0; i < mddev->nb_dev; i++) {
22275 + ITERATE_RDEV(mddev,rdev,tmp) {
22276 + if (rdev->desc_nr == i)
22280 + LOG_ERROR("md%d, missing disk #%d, aborting.\n",mdidx(mddev), i);
22284 + LOG_ERROR("md%d, too many disks #%d, aborting.\n",mdidx(mddev), i);
22293 +static void remove_descriptor (mdp_disk_t *disk, mdp_super_t *sb)
22295 + if (disk_active(disk)) {
22296 + sb->working_disks--;
22298 + if (disk_spare(disk)) {
22299 + sb->spare_disks--;
22300 + sb->working_disks--;
22302 + sb->failed_disks--;
22306 + disk->major = disk->minor = 0;
22307 + mark_disk_removed(disk);
22310 +#define BAD_MINOR \
22311 +"%s: invalid raid minor (%x)\n"
22314 +"disabled device %s, could not read superblock.\n"
22316 +#define BAD_CSUM \
22317 +"invalid superblock checksum on %s\n"
22320 +static int alloc_array_sb (mddev_t * mddev)
22327 + mddev->sb = (mdp_super_t *) __get_free_page (GFP_KERNEL);
22328 + if (!mddev->sb) {
22329 + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22332 + md_clear_page(mddev->sb);
22336 +static int alloc_disk_sb (mdk_rdev_t * rdev)
22341 + rdev->sb = (mdp_super_t *) __get_free_page(GFP_KERNEL);
22343 + LOG_ERROR("%s: Out of memory!\n", __FUNCTION__);
22346 + md_clear_page(rdev->sb);
22352 + * Function: free_disk_sb
22355 +static void free_disk_sb (mdk_rdev_t * rdev)
22358 + free_page((unsigned long) rdev->sb);
22360 + rdev->sb_offset = 0;
22363 + if (!rdev->virtual_spare && !rdev->faulty)
22369 + * Function: evms_md_read_disk_sb
22370 + * Read the MD superblock.
22372 +static int evms_md_read_disk_sb (mdk_rdev_t * rdev)
22375 + struct evms_logical_node *node = rdev->node;
22376 + u64 sb_offset_in_sectors;
22382 + if (node->total_vsectors <= MD_RESERVED_SECTORS) {
22383 + LOG_DETAILS("%s is too small, total_vsectors("PFU64")\n",
22384 + evms_md_partition_name(node), node->total_vsectors);
22389 + * Calculate the position of the superblock,
22390 + * it's at the end of the disk
22392 + sb_offset_in_sectors = evms_md_calc_dev_sboffset(node, rdev->mddev, 1);
22393 + rdev->sb_offset = (unsigned long)(sb_offset_in_sectors >> 1);
22394 + LOG_DEBUG("(read) %s's sb offset("PFU64") total_vsectors("PFU64")\n",
22395 + evms_md_partition_name(node), sb_offset_in_sectors, node->total_vsectors);
22398 + * Read superblock
22400 + rc = INIT_IO(node, 0, sb_offset_in_sectors, MD_SB_SECTORS, rdev->sb);
22405 +static unsigned int calc_sb_csum (mdp_super_t * sb)
22407 + unsigned int disk_csum, csum;
22409 + disk_csum = sb->sb_csum;
22411 + csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
22412 + sb->sb_csum = disk_csum;
22419 + * Check one RAID superblock for generic plausibility
22422 +static int check_disk_sb (mdk_rdev_t * rdev)
22425 + int ret = -EINVAL;
22433 + if (sb->md_magic != MD_SB_MAGIC) {
22437 + if (sb->md_minor >= MAX_MD_DEVS) {
22438 + LOG_ERROR(BAD_MINOR, get_partition_name(rdev), sb->md_minor);
22441 + if (calc_sb_csum(sb) != sb->sb_csum) {
22442 + LOG_ERROR(BAD_CSUM, get_partition_name(rdev));
22446 + switch (sb->level) {
22453 + LOG_ERROR("%s: EVMS MD does not support MD level %d\n", __FUNCTION__, sb->level);
22461 +static kdev_t dev_unit(kdev_t dev)
22463 + unsigned int mask;
22464 + struct gendisk *hd = get_gendisk(dev);
22468 + mask = ~((1 << hd->minor_shift) - 1);
22470 + return MKDEV(MAJOR(dev), MINOR(dev) & mask);
22473 +static mdk_rdev_t * match_dev_unit(mddev_t *mddev, kdev_t dev)
22475 + struct list_head *tmp;
22476 + mdk_rdev_t *rdev;
22478 + ITERATE_RDEV(mddev,rdev,tmp)
22479 + if (dev_unit(rdev->dev) == dev_unit(dev))
22485 +static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
22487 + struct list_head *tmp;
22488 + mdk_rdev_t *rdev;
22490 + ITERATE_RDEV(mddev1,rdev,tmp)
22491 + if (match_dev_unit(mddev2, rdev->dev))
22498 +static void bind_rdev_to_array (mdk_rdev_t * rdev, mddev_t * mddev)
22500 + mdk_rdev_t *same_pdev;
22502 + if (rdev->mddev) {
22507 + same_pdev = match_dev_unit(mddev, rdev->dev);
22509 + LOG_WARNING("[md%d] WARNING: %s appears to be on the same physical disk as %s. True\n"
22510 + " protection against single-disk failure might be compromised.\n",
22511 + mdidx(mddev), get_partition_name(rdev),get_partition_name(same_pdev));
22513 + list_add(&rdev->same_set, &mddev->disks);
22514 + rdev->mddev = mddev;
22516 + if (rdev->sb && disk_active(&rdev->sb->this_disk))
22517 + mddev->nr_raid_disks++;
22518 + LOG_DETAILS("bind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22521 +static void unbind_rdev_from_array (mdk_rdev_t * rdev)
22523 + if (!rdev->mddev) {
22527 + list_del(&rdev->same_set);
22528 + MD_INIT_LIST_HEAD(&rdev->same_set);
22529 + rdev->mddev->nb_dev--;
22530 + if (rdev->sb && disk_active(&rdev->sb->this_disk))
22531 + rdev->mddev->nr_raid_disks--;
22532 + LOG_DETAILS("unbind<%s,%d>\n", get_partition_name(rdev), rdev->mddev->nb_dev);
22533 + rdev->mddev = NULL;
22538 + * Function: evms_md_export_rdev
22539 + * EVMS MD version of export_rdev()
22540 + * Discard this MD "extended" device
22542 +static void evms_md_export_rdev (mdk_rdev_t * rdev, int delete_node)
22544 + LOG_DETAILS("%s: (%s)\n", __FUNCTION__ , get_partition_name(rdev));
22547 + free_disk_sb(rdev);
22548 + list_del(&rdev->all);
22549 + MD_INIT_LIST_HEAD(&rdev->all);
22550 + if (rdev->pending.next != &rdev->pending) {
22551 + LOG_WARNING("%s: (%s was pending)\n",__FUNCTION__ ,get_partition_name(rdev));
22552 + list_del(&rdev->pending);
22553 + MD_INIT_LIST_HEAD(&rdev->pending);
22555 + if (rdev->node && delete_node) {
22556 + if (cur_discover_list) {
22557 + LOG_DETAILS("%s: remove (%s) from discover list.\n", __FUNCTION__,
22558 + get_partition_name(rdev));
22559 + evms_cs_remove_logical_node_from_list(cur_discover_list, rdev->node);
22561 + LOG_DETAILS("%s: deleting node %s\n", __FUNCTION__, get_partition_name(rdev));
22562 + DELETE(rdev->node);
22563 + rdev->node = NULL;
22566 + rdev->faulty = 0;
22571 +static void kick_rdev_from_array (mdk_rdev_t * rdev)
22573 + LOG_DEFAULT("%s: (%s)\n", __FUNCTION__,get_partition_name(rdev));
22574 + unbind_rdev_from_array(rdev);
22575 + evms_md_export_rdev(rdev, TRUE);
22578 +static void export_array (mddev_t *mddev)
22580 + struct list_head *tmp;
22581 + mdk_rdev_t *rdev;
22582 + mdp_super_t *sb = mddev->sb;
22584 + LOG_DEFAULT("%s: [md%d]\n",__FUNCTION__ ,mdidx(mddev));
22586 + mddev->sb = NULL;
22587 + free_page((unsigned long) sb);
22590 + LOG_DEBUG("%s: removing all extended devices belong to md%d\n",__FUNCTION__,mdidx(mddev));
22591 + ITERATE_RDEV(mddev,rdev,tmp) {
22592 + if (!rdev->mddev) {
22596 + kick_rdev_from_array(rdev);
22598 + if (mddev->nb_dev)
22602 +static void free_mddev (mddev_t *mddev)
22604 + struct evms_logical_node *node;
22605 + struct evms_md *evms_md;
22612 + node = mddev->node;
22614 + export_array(mddev);
22615 + evms_md_size[mdidx(mddev)] = 0;
22619 + * Make sure nobody else is using this mddev
22620 + * (careful, we rely on the global kernel lock here)
22622 + while (atomic_read(&mddev->resync_sem.count) != 1)
22624 + while (atomic_read(&mddev->recovery_sem.count) != 1)
22627 + evms_md_del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev)));
22628 + list_del(&mddev->all_mddevs);
22629 + INIT_LIST_HEAD(&mddev->all_mddevs);
22630 + if (!list_empty(&mddev->running_mddevs)) {
22631 + list_del(&mddev->running_mddevs);
22632 + INIT_LIST_HEAD(&mddev->running_mddevs);
22634 + if (!list_empty(&mddev->incomplete_mddevs)) {
22635 + list_del(&mddev->incomplete_mddevs);
22636 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
22641 + evms_md = node->private;
22642 + evms_md->mddev = NULL;
22644 + MOD_DEC_USE_COUNT;
22645 + evms_md_destroy_recovery_thread();
22649 +static void print_desc(mdp_disk_t *desc)
22651 + printk(" DISK<N:%d,R:%d,S:%d>\n", desc->number,
22652 + desc->raid_disk,desc->state);
22655 +static void print_sb(mdp_super_t *sb)
22659 + printk(" SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
22660 + sb->major_version, sb->minor_version, sb->patch_version,
22661 + sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
22663 + printk(" L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", sb->level,
22664 + sb->size, sb->nr_disks, sb->raid_disks, sb->md_minor,
22665 + sb->layout, sb->chunk_size);
22666 + printk(" UT:%08x ST:%d AD:%d WD:%d FD:%d SD:%d CSUM:%08x E:%x\n",
22667 + sb->utime, sb->state, sb->active_disks, sb->working_disks,
22668 + sb->failed_disks, sb->spare_disks,
22669 + sb->sb_csum, sb->events_lo);
22671 + for (i = 0; i < MD_SB_DISKS; i++) {
22672 + mdp_disk_t *desc;
22674 + desc = sb->disks + i;
22675 + if (desc->number || desc->major || desc->minor || desc->raid_disk || (desc->state && (desc->state != 4))) {
22676 + printk(" D %2d: ", i);
22677 + print_desc(desc);
22680 + printk(" THIS: ");
22681 + print_desc(&sb->this_disk);
22685 +static void print_rdev(mdk_rdev_t *rdev)
22687 + printk("rdev %s: SZ:%08ld F:%d DN:%d ",
22688 + get_partition_name(rdev),
22689 + rdev->size, rdev->faulty, rdev->desc_nr);
22691 + printk("rdev superblock:\n");
22692 + print_sb(rdev->sb);
22694 + printk("no rdev superblock!\n");
22697 +void evms_md_print_devices (void)
22699 + struct list_head *tmp, *tmp2;
22700 + mdk_rdev_t *rdev;
22704 + printk(": **********************************\n");
22705 + printk(": * <COMPLETE RAID STATE PRINTOUT> *\n");
22706 + printk(": **********************************\n");
22707 + ITERATE_MDDEV(mddev,tmp) {
22708 + printk("md%d: ", mdidx(mddev));
22710 + ITERATE_RDEV(mddev,rdev,tmp2)
22711 + printk("<%s>", get_partition_name(rdev));
22714 + printk(" array superblock:\n");
22715 + print_sb(mddev->sb);
22717 + printk(" no array superblock.\n");
22719 + ITERATE_RDEV(mddev,rdev,tmp2)
22720 + print_rdev(rdev);
22722 + printk(": **********************************\n");
22726 +static int sb_equal ( mdp_super_t *sb1, mdp_super_t *sb2)
22729 + mdp_super_t *tmp1, *tmp2;
22731 + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
22732 + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
22734 + if (!tmp1 || !tmp2) {
22736 + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
22744 + * nr_disks is not constant
22746 + tmp1->nr_disks = 0;
22747 + tmp2->nr_disks = 0;
22749 + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
22763 +static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2)
22765 + if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) &&
22766 + (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) &&
22767 + (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) &&
22768 + (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3))
22776 + * Function: evms_md_find_rdev_all
22777 + * EVMS MD version of find_rdev_all()
22778 + * Search entire all_raid_disks for "node"
22779 + * Return the MD "extended" device if found.
22781 +static mdk_rdev_t * evms_md_find_rdev_all (struct evms_logical_node *node)
22783 + struct list_head *tmp;
22784 + mdk_rdev_t *rdev;
22786 + tmp = all_raid_disks.next;
22787 + while (tmp != &all_raid_disks) {
22788 + rdev = list_entry(tmp, mdk_rdev_t, all);
22789 + if (rdev->node == node)
22797 + * Function: evms_md_find_mddev_all
22799 +static mddev_t * evms_md_find_mddev_all (struct evms_logical_node *node)
22801 + struct list_head *tmp;
22804 + ITERATE_MDDEV(mddev,tmp) {
22805 + if (mddev->node == node)
22813 + * Function: evms_md_write_disk_sb
22814 + * EVMS MD version of write_disk_sb
22816 +static int evms_md_write_disk_sb(mdk_rdev_t * rdev)
22818 + unsigned long size;
22819 + u64 sb_offset_in_sectors;
22825 + if (rdev->faulty) {
22829 + if (rdev->sb->md_magic != MD_SB_MAGIC) {
22834 + sb_offset_in_sectors = evms_md_calc_dev_sboffset(rdev->node, rdev->mddev, 1);
22835 + if (rdev->sb_offset != (sb_offset_in_sectors >> 1)) {
22836 + LOG_WARNING("%s's sb offset has changed from blocks(%ld) to blocks(%ld), skipping\n",
22837 + get_partition_name(rdev),
22839 + (unsigned long)(sb_offset_in_sectors >> 1));
22843 + * If the disk went offline meanwhile and it's just a spare, then
22844 + * its size has changed to zero silently, and the MD code does
22845 + * not yet know that it's faulty.
22847 + size = evms_md_calc_dev_size(rdev->node, rdev->mddev, 1);
22848 + if (size != rdev->size) {
22849 + LOG_WARNING("%s's size has changed from %ld to %ld since import, skipping\n",
22850 + get_partition_name(rdev), rdev->size, size);
22854 + LOG_DETAILS("(write) %s's sb offset: "PFU64"\n",get_partition_name(rdev), sb_offset_in_sectors);
22856 + INIT_IO(rdev->node,WRITE,sb_offset_in_sectors,MD_SB_SECTORS,rdev->sb);
22862 +static int evms_md_sync_sbs(mddev_t * mddev)
22864 + mdk_rdev_t *rdev;
22865 + struct list_head *tmp;
22866 + mdp_disk_t * disk;
22868 + ITERATE_RDEV(mddev,rdev,tmp) {
22869 + if (rdev->virtual_spare || rdev->faulty)
22872 + /* copy everything from the master */
22873 + memcpy(rdev->sb, mddev->sb, sizeof(mdp_super_t));
22875 + /* this_disk is unique, copy it from the master */
22876 +// rdev->sb->this_disk = mddev->sb->disks[rdev->desc_nr];
22877 + // use the SB disk array since if update occurred on normal shutdown
22878 + // the rdevs may be out of date.
22879 + disk = evms_md_find_disk(mddev, rdev->dev);
22881 + rdev->sb->this_disk = *disk;
22884 + rdev->sb->sb_csum = calc_sb_csum(rdev->sb);
22889 +static int evms_md_update_sb_sync(mddev_t * mddev, int clean)
22891 + mdk_rdev_t *rdev;
22892 + struct list_head *tmp;
22894 + int found = FALSE;
22896 + ITERATE_RDEV(mddev,rdev,tmp) {
22898 + if (rdev->virtual_spare || rdev->faulty)
22901 + if ((rc = evms_md_read_disk_sb(rdev))) {
22902 + LOG_ERROR("%s: error reading superblock on %s!\n",
22903 + __FUNCTION__, evms_md_partition_name(rdev->node));
22907 + if ((rc = check_disk_sb(rdev))) {
22908 + LOG_ERROR("%s: %s has invalid sb!\n",
22909 + __FUNCTION__, evms_md_partition_name(rdev->node));
22913 + rdev->desc_nr = rdev->sb->this_disk.number;
22914 + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
22916 + /* copy master superblock from the first good rdev */
22919 + memcpy(mddev->sb, rdev->sb, sizeof(mdp_super_t));
22921 + mddev->sb->state |= 1 << MD_SB_CLEAN;
22923 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
22926 + if (!rc && found) {
22927 + evms_md_update_sb(mddev);
22929 + LOG_SERIOUS("%s: BUG! BUG! superblocks will not be updated!\n", __FUNCTION__);
22935 +int evms_md_update_sb(mddev_t * mddev)
22937 + int err, count = 100;
22938 + struct list_head *tmp;
22939 + mdk_rdev_t *rdev;
22943 + mddev->sb->utime = CURRENT_TIME;
22944 + if ((++mddev->sb->events_lo)==0)
22945 + ++mddev->sb->events_hi;
22947 + if ((mddev->sb->events_lo|mddev->sb->events_hi)==0) {
22949 + * oops, this 64-bit counter should never wrap.
22950 + * Either we are in around ~1 trillion A.C., assuming
22951 + * 1 reboot per second, or we have a bug:
22954 + mddev->sb->events_lo = mddev->sb->events_hi = 0xffffffff;
22956 + evms_md_sync_sbs(mddev);
22959 + * do not write anything to disk if using
22960 + * nonpersistent superblocks
22962 + if (mddev->sb->not_persistent)
22965 + LOG_DETAILS("%s: updating [md%d] superblock\n",__FUNCTION__ ,mdidx(mddev));
22968 + ITERATE_RDEV(mddev,rdev,tmp) {
22969 + if (!rdev->virtual_spare && !rdev->faulty) {
22970 + LOG_DETAILS(" %s [events: %x]",
22971 + get_partition_name(rdev),
22972 + rdev->sb->events_lo);
22973 + err += evms_md_write_disk_sb(rdev);
22975 + if (rdev->faulty)
22976 + LOG_DETAILS(" skipping faulty %s\n", get_partition_name(rdev));
22977 + if (rdev->virtual_spare)
22978 + LOG_DETAILS(" skipping virtual spare.\n");
22983 + LOG_WARNING("errors occurred during superblock update, repeating\n");
22986 + LOG_ERROR("excessive errors occurred during superblock update, exiting\n");
22992 + * Function: evms_md_import_device
22993 + * Ensure that node is not yet imported.
22994 + * Read and validate the MD super block on this device
22995 + * Add to the global MD "extended" devices list (all_raid_disks)
22998 +static int evms_md_import_device (struct evms_logical_node **discover_list,
22999 + struct evms_logical_node *node)
23002 + mdk_rdev_t *rdev;
23004 + LOG_ENTRY_EXIT("%s: discovering %s\n",__FUNCTION__,evms_md_partition_name(node));
23006 + if (evms_md_find_rdev_all(node)) {
23007 + LOG_DEBUG("%s exists\n", evms_md_partition_name(node));
23011 + rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
23013 + LOG_ERROR("could not alloc mem for %s!\n", evms_md_partition_name(node));
23016 + memset(rdev, 0, sizeof(*rdev));
23018 + if ((err = alloc_disk_sb(rdev)))
23021 + rdev->node = node; /* set this for evms_md_read_disk_sb() */
23023 + rdev->desc_nr = -1;
23024 + rdev->faulty = 0;
23026 + if (!node->total_vsectors) {
23027 + LOG_ERROR("%s has zero size!\n", evms_md_partition_name(node));
23032 + if ((err = evms_md_read_disk_sb(rdev))) {
23033 + LOG_EXTRA("could not read %s's sb, not importing!\n",evms_md_partition_name(node));
23036 + if ((err = check_disk_sb(rdev))) {
23037 + LOG_EXTRA("%s has invalid sb, not importing!\n",evms_md_partition_name(node));
23040 + rdev->desc_nr = rdev->sb->this_disk.number;
23041 + rdev->dev = MKDEV(rdev->sb->this_disk.major, rdev->sb->this_disk.minor);
23042 + LOG_DETAILS("FOUND %s desc_nr(%d)\n", get_partition_name(rdev), rdev->desc_nr);
23043 + list_add(&rdev->all, &all_raid_disks);
23044 + MD_INIT_LIST_HEAD(&rdev->pending);
23046 + if (rdev->faulty && rdev->sb)
23047 + free_disk_sb(rdev);
23053 + free_disk_sb(rdev);
23062 + * Function: evms_md_analyze_sbs
23063 + * EVMS MD version of analyze_sbs()
23065 +static int evms_md_analyze_sbs (mddev_t * mddev)
23067 + int out_of_date = 0, i;
23068 + struct list_head *tmp, *tmp2;
23069 + mdk_rdev_t *rdev, *rdev2, *freshest;
23072 + LOG_ENTRY_EXIT("Analyzing all superblocks...\n");
23074 + * Verify the RAID superblock on each real device
23076 + ITERATE_RDEV(mddev,rdev,tmp) {
23077 + if (rdev->faulty) {
23085 + if (check_disk_sb(rdev))
23090 + * The superblock constant part has to be the same
23091 + * for all disks in the array.
23095 + ITERATE_RDEV(mddev,rdev,tmp) {
23100 + if (!sb_equal(sb, rdev->sb)) {
23101 + LOG_WARNING("kick out %s\n",get_partition_name(rdev));
23102 + kick_rdev_from_array(rdev);
23108 + * OK, we have all disks and the array is ready to run. Let's
23109 + * find the freshest superblock, that one will be the superblock
23110 + * that represents the whole array.
23113 + if (alloc_array_sb(mddev))
23118 + ITERATE_RDEV(mddev,rdev,tmp) {
23121 + * if the checksum is invalid, use the superblock
23122 + * only as a last resort. (decrease its age by
23125 + if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) {
23126 + if (rdev->sb->events_lo || rdev->sb->events_hi)
23127 + if ((rdev->sb->events_lo--)==0)
23128 + rdev->sb->events_hi--;
23130 + LOG_DETAILS("%s's event counter: %x\n",get_partition_name(rdev), rdev->sb->events_lo);
23137 + * Find the newest superblock version
23139 + ev1 = md_event(rdev->sb);
23140 + ev2 = md_event(freshest->sb);
23141 + if (ev1 != ev2) {
23147 + if (out_of_date) {
23148 + LOG_WARNING("OUT OF DATE, freshest: %s\n",get_partition_name(freshest));
23150 + memcpy (sb, freshest->sb, sizeof(*sb));
23153 + * at this point we have picked the 'best' superblock
23154 + * from all available superblocks.
23155 + * now we validate this superblock and kick out possibly
23158 + ITERATE_RDEV(mddev,rdev,tmp) {
23160 + * Kick all non-fresh devices
23163 + ev1 = md_event(rdev->sb);
23164 + ev2 = md_event(sb);
23167 + LOG_WARNING("kicking non-fresh %s from array!\n",get_partition_name(rdev));
23168 + kick_rdev_from_array(rdev);
23171 + LOG_DETAILS("%s is a new spare.\n",get_partition_name(rdev));
23177 + * Remove unavailable and faulty devices ...
23179 + * note that if an array becomes completely unrunnable due to
23180 + * missing devices, we do not write the superblock back, so the
23181 + * administrator has a chance to fix things up. The removal thus
23182 + * only happens if it's nonfatal to the contents of the array.
23184 + for (i = 0; i < MD_SB_DISKS; i++) {
23186 + mdp_disk_t *desc;
23188 + desc = sb->disks + i;
23191 + * We kick faulty devices/descriptors immediately.
23193 + * Note: multipath devices are a special case. Since we
23194 + * were able to read the superblock on the path, we don't
23195 + * care if it was previously marked as faulty, it's up now
23198 + if (disk_faulty(desc) && mddev->sb->level != -4) {
23200 + ITERATE_RDEV(mddev,rdev,tmp) {
23201 + if (rdev->desc_nr != desc->number)
23203 + LOG_WARNING("[md%d] kicking faulty %s!\n",mdidx(mddev),get_partition_name(rdev));
23204 + kick_rdev_from_array(rdev);
23209 + LOG_WARNING("%s: [md%d] found former faulty device [number=%d]\n",
23210 + __FUNCTION__ ,mdidx(mddev), desc->number);
23213 + * Don't call remove_descriptor(),
23214 + * let the administrator remove it from the user-land */
23215 + /* remove_descriptor(desc, sb); */
23217 + } else if (disk_faulty(desc)) {
23219 + * multipath entry marked as faulty, unfaulty it
23223 + dev = MKDEV(desc->major, desc->minor);
23225 + rdev = evms_md_find_rdev(mddev, dev);
23227 + mark_disk_spare(desc);
23229 + LOG_WARNING("%s: [md%d] (MULTIPATH) found former faulty device [number=%d]\n",
23230 + __FUNCTION__ ,mdidx(mddev), desc->number);
23232 + * Don't call remove_descriptor(),
23233 + * let the administrator remove it from the user-land */
23234 + /* remove_descriptor(desc, sb); */
23239 + * Is this device present in the rdev ring?
23242 + ITERATE_RDEV(mddev,rdev,tmp) {
23244 + * Multi-path IO special-case: since we have no
23245 + * this_disk descriptor at auto-detect time,
23246 + * we cannot check rdev->number.
23247 + * We can check the device though.
23249 + if ((sb->level == -4) && (rdev->dev ==
23250 + MKDEV(desc->major,desc->minor))) {
23254 + if (rdev->desc_nr == desc->number) {
23262 + LOG_WARNING(" [md%d]: former device [number=%d] is unavailable!\n",
23263 + mdidx(mddev), desc->number);
23264 + remove_descriptor(desc, sb);
23268 + * Kick all rdevs that are not in the
23269 + * descriptor array:
23271 + ITERATE_RDEV(mddev,rdev,tmp) {
23272 + if (rdev->desc_nr == -1)
23273 + kick_rdev_from_array(rdev);
23277 + * Do a final reality check.
23279 + if (mddev->sb->level != -4) {
23280 + ITERATE_RDEV(mddev,rdev,tmp) {
23281 + if (rdev->desc_nr == -1) {
23286 + * is the desc_nr unique?
23288 + ITERATE_RDEV(mddev,rdev2,tmp2) {
23289 + if ((rdev2 != rdev) &&
23290 + (rdev2->desc_nr == rdev->desc_nr)) {
23298 +#define OLD_VERSION KERN_ALERT \
23299 +"md%d: unsupported raid array version %d.%d.%d\n"
23301 +#define NOT_CLEAN_IGNORE KERN_ERR \
23302 +"md%d: raid array is not clean -- starting background reconstruction\n"
23305 + * Check if we can support this RAID array
23307 + if (sb->major_version != MD_MAJOR_VERSION ||
23308 + sb->minor_version > MD_MINOR_VERSION) {
23310 + LOG_ERROR("[md%d] unsupported raid array version %d.%d.%d\n",
23312 + sb->major_version,
23313 + sb->minor_version,
23314 + sb->patch_version);
23318 + if ((sb->state != (1 << MD_SB_CLEAN)) && ((sb->level == 1) ||
23319 + (sb->level == 4) || (sb->level == 5)))
23320 + LOG_WARNING("[md%d, level=%d] raid array is not clean -- starting background reconstruction\n",
23321 + mdidx(mddev), sb->level);
23323 + LOG_ENTRY_EXIT("analysis of all superblocks is OK!\n");
23326 + LOG_WARNING("ABORT analyze_sbs()!!!\n");
23331 +static int device_size_calculation (mddev_t * mddev)
23333 + int data_disks = 0, persistent;
23334 + //unsigned int readahead;
23335 + mdp_super_t *sb = mddev->sb;
23336 + struct list_head *tmp;
23337 + mdk_rdev_t *rdev;
23340 + * Do device size calculation. Bail out if too small.
23341 + * (we have to do this after having validated chunk_size,
23342 + * because device size has to be modulo chunk_size)
23344 + persistent = !mddev->sb->not_persistent;
23345 + ITERATE_RDEV(mddev,rdev,tmp) {
23346 + if (rdev->faulty)
23348 + if (rdev->size) {
23349 + LOG_DEFAULT("%s: already calculated %s\n", __FUNCTION__, get_partition_name(rdev));
23352 + rdev->size = evms_md_calc_dev_size(rdev->node, mddev, persistent);
23353 + if (rdev->size < sb->chunk_size / 1024) {
23354 + LOG_WARNING("Dev %s smaller than chunk_size: %ldk < %dk\n",
23355 + get_partition_name(rdev), rdev->size, sb->chunk_size / 1024);
23360 + switch (sb->level) {
23371 + zoned_raid_size(mddev);
23375 + zoned_raid_size(mddev);
23376 + data_disks = sb->raid_disks;
23383 + data_disks = sb->raid_disks-1;
23386 + LOG_ERROR("[md%d] unkown level %d\n", mdidx(mddev), sb->level);
23389 + if (!evms_md_size[mdidx(mddev)])
23390 + evms_md_size[mdidx(mddev)] = sb->size * data_disks;
23398 +#define TOO_BIG_CHUNKSIZE KERN_ERR \
23399 +"too big chunk_size: %d > %d\n"
23401 +#define TOO_SMALL_CHUNKSIZE KERN_ERR \
23402 +"too small chunk_size: %d < %ld\n"
23404 +#define BAD_CHUNKSIZE KERN_ERR \
23405 +"no chunksize specified, see 'man raidtab'\n"
23407 +static int do_md_run (mddev_t * mddev)
23411 + struct list_head *tmp;
23412 + mdk_rdev_t *rdev;
23415 + if (!mddev->nb_dev) {
23424 + * Resize disks to align partitions size on a given
23427 + evms_md_size[mdidx(mddev)] = 0;
23430 + * Analyze all RAID superblock(s)
23432 + if (evms_md_analyze_sbs(mddev)) {
23437 + mddev->chunk_size = chunk_size = mddev->sb->chunk_size;
23438 + pnum = level_to_pers(mddev->sb->level);
23440 + if ((pnum != MULTIPATH) && (pnum != RAID1)) {
23441 + if (!chunk_size) {
23443 + * 'default chunksize' in the old md code used to
23444 + * be PAGE_SIZE, baaad.
23445 + * we abort here to be on the safe side. We dont
23446 + * want to continue the bad practice.
23448 + printk(BAD_CHUNKSIZE);
23451 + if (chunk_size > MAX_CHUNK_SIZE) {
23452 + printk(TOO_BIG_CHUNKSIZE, chunk_size, MAX_CHUNK_SIZE);
23456 + * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
23458 + if ( (1 << ffz(~chunk_size)) != chunk_size) {
23462 + if (chunk_size < PAGE_SIZE) {
23463 + printk(TOO_SMALL_CHUNKSIZE, chunk_size, PAGE_SIZE);
23468 + printk(KERN_INFO "RAID level %d does not need chunksize! Continuing anyway.\n", mddev->sb->level);
23470 + if (pnum >= MAX_PERSONALITY) {
23476 +#ifdef CONFIG_KMOD
23477 + char module_name[80];
23478 + sprintf (module_name, "md-personality-%d", pnum);
23479 + request_module (module_name);
23483 + printk(KERN_ERR "personality %d is not loaded!\n",
23488 + if (device_size_calculation(mddev))
23492 + * Drop all container device buffers, from now on
23493 + * the only valid external interface is through the md
23495 + * Also find largest hardsector size
23497 + md_hardsect_sizes[mdidx(mddev)] = 512;
23498 + ITERATE_RDEV(mddev,rdev,tmp) {
23499 + if (rdev->faulty)
23501 + invalidate_device(rdev->dev, 1);
23502 +/* if (get_hardsect_size(rdev->dev)
23503 + > md_hardsect_sizes[mdidx(mddev)])
23504 + md_hardsect_sizes[mdidx(mddev)] =
23505 + get_hardsect_size(rdev->dev); */
23506 + if (rdev->node->hardsector_size > md_hardsect_sizes[mdidx(mddev)]) {
23507 + md_hardsect_sizes[mdidx(mddev)] = rdev->node->hardsector_size;
23511 + md_blocksizes[mdidx(mddev)] = 1024;
23512 + if (md_blocksizes[mdidx(mddev)] < md_hardsect_sizes[mdidx(mddev)])
23513 + md_blocksizes[mdidx(mddev)] = md_hardsect_sizes[mdidx(mddev)];
23515 + mddev->pers = pers[pnum];
23517 + err = mddev->pers->run(mddev);
23519 + LOG_WARNING("%s: pers->run() failed.\n", __FUNCTION__);
23520 + mddev->pers = NULL;
23523 + mddev->sb->state &= ~(1 << MD_SB_CLEAN);
23525 + evms_md_update_sb(mddev);
23527 + if (incomplete_mddev(mddev)) {
23528 + LOG_DEFAULT("%s: [md%d] was incomplete!\n", __FUNCTION__, mdidx(mddev));
23529 + list_del(&mddev->incomplete_mddevs);
23530 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23533 + list_add(&mddev->running_mddevs, &running_mddevs);
23538 +#undef TOO_BIG_CHUNKSIZE
23539 +#undef BAD_CHUNKSIZE
23542 +#define OUT(x) do { err = (x); goto out; } while (0)
23545 +#define STILL_MOUNTED KERN_WARNING \
23546 +"md%d still mounted.\n"
23547 +#define STILL_IN_USE \
23548 +"md%d still in use.\n"
23550 +static int do_md_stop (mddev_t * mddev, int ro)
23552 + int err = 0, resync_interrupted = 0, clean = 0;
23553 + kdev_t dev = mddev_to_kdev(mddev);
23555 + if (atomic_read(&mddev->active)>1) {
23556 + printk(STILL_IN_USE, mdidx(mddev));
23560 + if (mddev->pers) {
23562 + * It is safe to call stop here, it only frees private
23563 + * data. Also, it tells us if a device is unstoppable
23564 + * (eg. resyncing is in progress)
23566 + if (mddev->pers->stop_resync)
23567 + if (mddev->pers->stop_resync(mddev))
23568 + resync_interrupted = 1;
23570 + if (mddev->recovery_running)
23571 + evms_cs_interrupt_thread(evms_md_recovery_thread);
23574 + * This synchronizes with signal delivery to the
23575 + * resync or reconstruction thread. It also nicely
23576 + * hangs the process if some reconstruction has not
23579 + down(&mddev->recovery_sem);
23580 + up(&mddev->recovery_sem);
23582 + invalidate_device(dev, 1);
23588 + mddev->node->plugin = &md_plugin_header;
23591 + set_device_ro(dev, 0);
23592 + if (mddev->pers->stop(mddev)) {
23594 + set_device_ro(dev, 1);
23602 + * mark it clean only if there was no resync
23605 + if (!mddev->recovery_running && !resync_interrupted) {
23606 + LOG_DEBUG("%s: marking sb clean...\n", __FUNCTION__);
23609 + evms_md_update_sb_sync(mddev, clean);
23612 + set_device_ro(dev, 1);
23616 + * Free resources if final stop
23619 + printk (KERN_INFO "md%d stopped.\n", mdidx(mddev));
23620 + free_mddev(mddev);
23623 + printk (KERN_INFO
23624 + "md%d switched to read-only mode.\n", mdidx(mddev));
23630 +static int evms_md_run_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23632 + mdk_rdev_t *rdev;
23633 + struct list_head *tmp;
23637 + if (mddev->disks.prev == &mddev->disks) {
23642 + LOG_DETAILS("%s: trying to run array md%d\n", __FUNCTION__,mdidx(mddev) );
23644 + ITERATE_RDEV(mddev,rdev,tmp) {
23645 + LOG_DETAILS(" <%s>\n", get_partition_name(rdev));
23648 + err = do_md_run (mddev);
23651 + * remove all nodes consumed by this md device from the discover list
23653 + ITERATE_RDEV(mddev,rdev,tmp) {
23654 + LOG_DETAILS(" removing %s from discover list.\n", get_partition_name(rdev));
23655 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23656 + flags |= rdev->node->flags;
23658 + err = evms_md_create_logical_node(discover_list,mddev,flags);
23660 + exported_nodes++;
23663 + LOG_WARNING("%s: could not start [md%d] containing: \n",__FUNCTION__,mdidx(mddev));
23664 + ITERATE_RDEV(mddev,rdev,tmp) {
23665 + LOG_WARNING(" (%s, desc_nr=%d)\n", get_partition_name(rdev), rdev->desc_nr);
23667 + LOG_WARNING("%s: will try restart [md%d] again later.\n",__FUNCTION__,mdidx(mddev));
23669 + mddev->sb_dirty = 0;
23674 +static void evms_md_run_incomplete_array (struct evms_logical_node ** discover_list, mddev_t *mddev)
23676 + mdk_rdev_t *rdev;
23678 + LOG_DEFAULT("%s [md%d]\n",
23679 + __FUNCTION__, mdidx(mddev));
23680 + if (evms_md_run_array(discover_list,mddev) == 0) {
23682 + * We succeeded running this MD device.
23683 + * Now read MD superblock on this newly created MD node.
23685 + if (mddev->node &&
23686 + (evms_md_import_device(discover_list,mddev->node) == 0)) {
23688 + * Yes, there is a superblock on this MD node.
23689 + * We probably have a MD stacking case here.
23691 + rdev = evms_md_find_rdev_all(mddev->node);
23693 + list_add(&rdev->pending, &pending_raid_disks);
23694 + evms_md_run_devices(discover_list);
23696 + LOG_WARNING("%s: imported %s but no rdev was found!\n",
23698 + evms_md_partition_name(mddev->node));
23702 + if (incomplete_mddev(mddev)) {
23703 + list_del(&mddev->incomplete_mddevs);
23704 + INIT_LIST_HEAD(&mddev->incomplete_mddevs);
23709 + * let's try to run arrays based on all disks that have arrived
23710 + * until now. (those are in the ->pending list)
23712 + * the method: pick the first pending disk, collect all disks with
23713 + * the same UUID, remove all from the pending list and put them into
23714 + * the 'same_array' list. Then order this list based on superblock
23715 + * update time (freshest comes first), kick out 'old' disks and
23716 + * compare superblocks. If everything's fine then run it.
23718 + * If "unit" is allocated, then bump its reference count
23720 +static void evms_md_run_devices (struct evms_logical_node **discover_list)
23722 + struct list_head candidates;
23723 + struct list_head *tmp;
23724 + mdk_rdev_t *rdev0, *rdev;
23729 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
23730 + while (pending_raid_disks.next != &pending_raid_disks) {
23731 + rdev0 = list_entry(pending_raid_disks.next,
23732 + mdk_rdev_t, pending);
23733 + MD_INIT_LIST_HEAD(&candidates);
23734 + ITERATE_RDEV_PENDING(rdev,tmp) {
23735 + if (uuid_equal(rdev0, rdev)) {
23736 + if (!sb_equal(rdev0->sb, rdev->sb)) {
23737 + LOG_DETAILS("%s has same UUID as %s, but superblocks differ ...\n",\
23738 + get_partition_name(rdev),get_partition_name(rdev0));
23741 + list_del(&rdev->pending);
23742 + list_add(&rdev->pending, &candidates);
23747 + * now we have a set of devices, with all of them having
23748 + * mostly sane superblocks. It's time to allocate the
23751 + md_kdev = MKDEV(MD_MAJOR, rdev0->sb->md_minor);
23752 + mddev = kdev_to_mddev(md_kdev);
23753 + if (mddev && (!incomplete_mddev(mddev))) {
23754 + LOG_DETAILS("md%d already running, cannot run %s\n",
23755 + mdidx(mddev), get_partition_name(rdev0));
23757 + ITERATE_RDEV(mddev,rdev,tmp) {
23759 + * This is EVMS re-discovery!
23760 + * Remove all nodes consumed by this md device from the discover list
23762 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23765 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23766 + if (evms_md_find_mddev_all(rdev->node))
23768 + * We have found an MD superblock on top of a running MD array.
23769 + * Delete rdev but keep the MD array.
23771 + evms_md_export_rdev(rdev, FALSE);
23773 + evms_md_export_rdev(rdev, TRUE);
23779 + mddev = alloc_mddev(md_kdev);
23780 + if (mddev == NULL) {
23781 + LOG_ERROR("cannot allocate memory for md drive.\n");
23784 + LOG_DETAILS("created md%d\n", mdidx(mddev));
23786 + LOG_DETAILS("%s: found INCOMPLETE md%d\n", __FUNCTION__, mdidx(mddev));
23789 + ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) {
23790 + bind_rdev_to_array(rdev, mddev);
23791 + list_del(&rdev->pending);
23792 + MD_INIT_LIST_HEAD(&rdev->pending);
23795 + if ((mddev->nr_raid_disks >= rdev0->sb->raid_disks) ||
23796 + (mddev->nb_dev == rdev0->sb->nr_disks)) {
23797 + evms_md_run_array(discover_list,mddev);
23799 + LOG_DETAILS("THIS md%d IS INCOMPLETE, found %d devices, need %d\n",
23800 + mdidx(mddev), mddev->nr_raid_disks, rdev0->sb->raid_disks);
23801 + list_add(&mddev->incomplete_mddevs, &incomplete_mddevs);
23802 + ITERATE_RDEV(mddev,rdev,tmp) {
23803 + evms_cs_remove_logical_node_from_list(discover_list,rdev->node);
23807 + LOG_ENTRY_EXIT("%s: EXIT\n", __FUNCTION__);
23810 +void evms_md_recover_arrays(void)
23812 + if (!evms_md_recovery_thread) {
23816 + evms_cs_wakeup_thread(evms_md_recovery_thread);
23819 +int evms_md_error_dev(
23823 + mdk_rdev_t * rdev;
23825 + rdev = evms_md_find_rdev(mddev, dev);
23827 + return evms_md_error(mddev,rdev->node);
23829 + LOG_ERROR("%s: could not find %s in md%d\n",
23830 + __FUNCTION__, org_partition_name(dev), mdidx(mddev));
23835 +int evms_md_error(
23837 + struct evms_logical_node *node)
23839 + mdk_rdev_t * rrdev;
23841 + /* check for NULL first */
23846 + LOG_ERROR("evms_md_error dev:(md%d), node:(%s), (caller: %p,%p,%p,%p).\n",
23847 + mdidx(mddev), node->name,
23848 + __builtin_return_address(0),__builtin_return_address(1),
23849 + __builtin_return_address(2),__builtin_return_address(3));
23851 + rrdev = evms_md_find_rdev_from_node(mddev, node);
23852 + if (!rrdev || rrdev->faulty)
23854 + if (!mddev->pers->error_handler
23855 + || mddev->pers->error_handler(mddev,node) <= 0) {
23856 + free_disk_sb(rrdev);
23857 + rrdev->faulty = 1;
23861 + * if recovery was running, stop it now.
23863 + if (mddev->pers->stop_resync)
23864 + mddev->pers->stop_resync(mddev);
23865 + if (mddev->recovery_running)
23866 + evms_cs_interrupt_thread(evms_md_recovery_thread);
23867 + evms_md_recover_arrays();
23872 +int evms_register_md_personality (int pnum, mdk_personality_t *p)
23874 + if (pnum >= MAX_PERSONALITY) {
23879 + if (pers[pnum]) {
23885 + LOG_DETAILS("%s personality registered as nr %d\n",p->name, pnum);
23889 +int evms_unregister_md_personality (int pnum)
23891 + if (pnum >= MAX_PERSONALITY) {
23896 + printk(KERN_INFO "%s personality unregistered\n", pers[pnum]->name);
23897 + pers[pnum] = NULL;
23901 +mdp_disk_t *evms_md_get_spare(mddev_t *mddev)
23903 + mdp_super_t *sb = mddev->sb;
23904 + mdp_disk_t *disk;
23905 + mdk_rdev_t *rdev;
23908 + for (i = 0, j = 0; j < mddev->nb_dev; i++) {
23909 + rdev = evms_md_find_rdev_nr(mddev, i);
23910 + if (rdev == NULL)
23913 + if (rdev->faulty)
23916 + if (!rdev->virtual_spare)
23920 + disk = &sb->disks[rdev->desc_nr];
23921 + if (disk_faulty(disk)) {
23925 + if (disk_active(disk))
23932 +static mdp_disk_t *evms_md_find_disk(mddev_t *mddev, kdev_t dev)
23934 + mdp_super_t *sb = mddev->sb;
23935 + mdp_disk_t *disk;
23938 + for (i=0; i < MD_SB_DISKS; i++) {
23939 + disk = &sb->disks[i];
23940 + if ((disk->major == MAJOR(dev)) && (disk->minor == MINOR(dev)))
23946 +static unsigned int sync_io[DK_MAX_MAJOR][DK_MAX_DISK];
23947 +void evms_md_sync_acct(
23949 + unsigned long nr_sectors)
23951 + unsigned int major = MAJOR(dev);
23952 + unsigned int index;
23954 + index = disk_index(dev);
23955 + if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23958 + sync_io[major][index] += nr_sectors;
23961 +static int is_mddev_idle(mddev_t *mddev)
23963 + mdk_rdev_t * rdev;
23964 + struct list_head *tmp;
23966 + unsigned long curr_events;
23969 + ITERATE_RDEV(mddev,rdev,tmp) {
23970 + int major = MAJOR(rdev->dev);
23971 + int idx = disk_index(rdev->dev);
23973 + if ((idx >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
23976 + curr_events = kstat.dk_drive_rblk[major][idx] +
23977 + kstat.dk_drive_wblk[major][idx] ;
23978 + curr_events -= sync_io[major][idx];
23979 + if ((curr_events - rdev->last_events) > 32) {
23980 + rdev->last_events = curr_events;
23987 +MD_DECLARE_WAIT_QUEUE_HEAD(evms_resync_wait);
23989 +void evms_md_done_sync(mddev_t *mddev, int blocks, int ok)
23991 + /* another "blocks" (512byte) blocks have been synced */
23992 + atomic_sub(blocks, &mddev->recovery_active);
23993 + wake_up(&mddev->recovery_wait);
23995 + // stop recovery, signal do_sync ....
23999 +#define SYNC_MARKS 10
24000 +#define SYNC_MARK_STEP (3*HZ)
24001 +int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare)
24004 + unsigned int max_sectors, currspeed,
24005 + j, window, err, serialize;
24006 + unsigned long mark[SYNC_MARKS];
24007 + unsigned long mark_cnt[SYNC_MARKS];
24009 + struct list_head *tmp;
24010 + unsigned long last_check;
24013 + err = down_interruptible(&mddev->resync_sem);
24019 + ITERATE_MDDEV(mddev2,tmp) {
24020 + if (mddev2 == mddev)
24022 + if (mddev2->curr_resync && match_mddev_units(mddev,mddev2)) {
24023 + LOG_DEFAULT("delaying resync of md%d until md%d "
24024 + "has finished resync (they share one or more physical units)\n",
24025 + mdidx(mddev), mdidx(mddev2));
24031 + interruptible_sleep_on(&evms_resync_wait);
24032 + if (md_signal_pending(current)) {
24033 + md_flush_signals();
24040 + mddev->curr_resync = 1;
24042 + max_sectors = mddev->sb->size<<1;
24044 + LOG_DEFAULT("syncing RAID array md%d\n", mdidx(mddev));
24045 + LOG_DEFAULT("minimum _guaranteed_ reconstruction speed: %d KB/sec/disc.\n",
24046 + sysctl_speed_limit_min);
24047 + LOG_DEFAULT("using maximum available idle IO bandwith "
24048 + "(but not more than %d KB/sec) for reconstruction.\n",
24049 + sysctl_speed_limit_max);
24052 + * Resync has low priority.
24054 +#ifdef O1_SCHEDULER
24055 + set_user_nice(current,19);
24057 + current->nice = 19;
24060 + is_mddev_idle(mddev); /* this also initializes IO event counters */
24061 + for (m = 0; m < SYNC_MARKS; m++) {
24062 + mark[m] = jiffies;
24066 + mddev->resync_mark = mark[last_mark];
24067 + mddev->resync_mark_cnt = mark_cnt[last_mark];
24070 + * Tune reconstruction:
24072 + window = MD_READAHEAD*(PAGE_SIZE/512);
24073 + LOG_DEFAULT("using %dk window, over a total of %d blocks.\n",
24074 + window/2,max_sectors/2);
24076 + atomic_set(&mddev->recovery_active, 0);
24077 + init_waitqueue_head(&mddev->recovery_wait);
24079 + for (j = 0; j < max_sectors;) {
24082 + sectors = mddev->pers->sync_request(mddev, j);
24084 + if (sectors < 0) {
24088 + atomic_add(sectors, &mddev->recovery_active);
24090 + mddev->curr_resync = j;
24092 + if (last_check + window > j)
24097 + run_task_queue(&tq_disk);
24100 + if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
24102 + int next = (last_mark+1) % SYNC_MARKS;
24104 + mddev->resync_mark = mark[next];
24105 + mddev->resync_mark_cnt = mark_cnt[next];
24106 + mark[next] = jiffies;
24107 + mark_cnt[next] = j - atomic_read(&mddev->recovery_active);
24108 + last_mark = next;
24112 + if (md_signal_pending(current)) {
24114 + * got a signal, exit.
24116 + mddev->curr_resync = 0;
24117 + LOG_DEFAULT("evms_md_do_sync() got signal ... exiting\n");
24118 + md_flush_signals();
24124 + * this loop exits only if either when we are slower than
24125 + * the 'hard' speed limit, or the system was IO-idle for
24127 + * the system might be non-idle CPU-wise, but we only care
24128 + * about not overloading the IO subsystem. (things like an
24129 + * e2fsck being done on the RAID array should execute fast)
24131 + if (md_need_resched(current))
24134 + currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
24136 + if (currspeed > sysctl_speed_limit_min) {
24137 +#ifdef O1_SCHEDULER
24138 + set_user_nice(current,19);
24140 + current->nice = 19;
24143 + if ((currspeed > sysctl_speed_limit_max) ||
24144 + !is_mddev_idle(mddev)) {
24145 +#ifdef O1_SCHEDULER
24146 + set_current_state(TASK_INTERRUPTIBLE);
24148 + current->state = TASK_INTERRUPTIBLE;
24150 + md_schedule_timeout(HZ/4);
24154 +#ifdef O1_SCHEDULER
24155 + set_user_nice(current,-20);
24157 + current->nice = -20;
24160 + LOG_DEFAULT("md%d: sync done.\n",mdidx(mddev));
24163 + * this also signals 'finished resyncing' to md_stop
24166 + wait_event(mddev->recovery_wait, atomic_read(&mddev->recovery_active)==0);
24167 + up(&mddev->resync_sem);
24169 + mddev->curr_resync = 0;
24170 + wake_up(&evms_resync_wait);
24177 + * This is a kernel thread which syncs a spare disk with the active array
24179 + * the amount of foolproofing might seem to be a tad excessive, but an
24180 + * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs
24181 + * of my root partition with the first 0.5 gigs of my /home partition ... so
24182 + * i'm a bit nervous ;)
24184 +void evms_md_do_recovery(void *data)
24189 + mdp_disk_t *spare;
24190 + struct list_head *tmp;
24192 + LOG_DEFAULT("recovery thread got woken up ...\n");
24194 + ITERATE_MDDEV(mddev,tmp) {
24199 + if (mddev->recovery_running)
24201 + if (sb->active_disks == sb->raid_disks)
24203 + if (!sb->spare_disks) {
24204 + LOG_ERROR(" [md%d] no spare disk to reconstruct array! "
24205 + "-- continuing in degraded mode\n", mdidx(mddev));
24213 + * now here we get the spare and resync it.
24215 + spare = evms_md_get_spare(mddev);
24220 + LOG_DEFAULT(" [md%d] resyncing spare disk %s to replace failed disk\n",
24221 + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24222 + if (!mddev->pers->diskop)
24225 + if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE))
24228 + down(&mddev->recovery_sem);
24229 + mddev->recovery_running = 1;
24230 + err = evms_md_do_sync(mddev, spare);
24231 + if (err == -EIO) {
24232 + LOG_DEFAULT("[md%d] spare disk %s failed, skipping to next spare.\n",
24233 + mdidx(mddev), org_partition_name(MKDEV(spare->major,spare->minor)));
24234 + if (!disk_faulty(spare)) {
24235 + mddev->pers->diskop(mddev,&spare,DISKOP_SPARE_INACTIVE);
24236 + mark_disk_faulty(spare);
24237 + mark_disk_nonsync(spare);
24238 + mark_disk_inactive(spare);
24239 + sb->spare_disks--;
24240 + sb->working_disks--;
24241 + sb->failed_disks++;
24244 + if (disk_faulty(spare))
24245 + mddev->pers->diskop(mddev, &spare,
24246 + DISKOP_SPARE_INACTIVE);
24247 + if (err == -EINTR || err == -ENOMEM) {
24249 + * Recovery got interrupted, or ran out of mem ...
24250 + * signal back that we have finished using the array.
24252 + mddev->pers->diskop(mddev, &spare,
24253 + DISKOP_SPARE_INACTIVE);
24254 + up(&mddev->recovery_sem);
24255 + mddev->recovery_running = 0;
24258 + mddev->recovery_running = 0;
24259 + up(&mddev->recovery_sem);
24261 + if (!disk_faulty(spare)) {
24263 + * the SPARE_ACTIVE diskop possibly changes the
24266 + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE);
24267 + mark_disk_sync(spare);
24268 + mark_disk_active(spare);
24269 + sb->active_disks++;
24270 + sb->spare_disks--;
24272 + mddev->sb_dirty = 1;
24273 + evms_md_update_sb(mddev);
24276 + LOG_DEFAULT("recovery thread finished ...\n");
24280 +static void evms_md_create_recovery_thread(void)
24282 + static char * name = "evms_mdrecoveryd";
24284 + if (!evms_md_recovery_thread) {
24285 + /* Create MD recovery thread */
24286 + evms_md_recovery_thread = evms_cs_register_thread(evms_md_do_recovery, NULL, name);
24287 + if (!evms_md_recovery_thread)
24288 + LOG_SERIOUS("%s: evms_cs_recovery_thread failed\n", __FUNCTION__);
24292 +static void evms_md_destroy_recovery_thread(void)
24294 + if (evms_md_recovery_thread && !MOD_IN_USE) {
24295 + /* Destroy MD recovery thread */
24296 + evms_cs_unregister_thread(evms_md_recovery_thread);
24297 + evms_md_recovery_thread = NULL;
24302 + * evms_md_create_logical_node
24304 +static int evms_md_create_logical_node(
24305 + struct evms_logical_node **discover_list,
24310 + struct evms_md *evms_md = NULL;
24311 + struct evms_logical_node *newnode = NULL;
24312 + struct evms_plugin_header *hdr = NULL;
24313 + struct evms_plugin_fops *fops = NULL;
24315 + rc = evms_cs_allocate_logical_node(&newnode);
24317 + evms_md = kmalloc(sizeof(*evms_md), GFP_KERNEL);
24322 + memset(evms_md,0,sizeof(*evms_md));
24323 + evms_md->mddev = mddev;
24325 + fops = kmalloc(sizeof(*fops), GFP_KERNEL);
24327 + /* copy MD plugin header
24328 + * copy function table
24329 + * replace read and write function pointers.
24331 + evms_md->instance_plugin_hdr = md_plugin_header;
24332 + memcpy(fops, &md_fops, sizeof(*fops));
24333 + fops->read = mddev->pers->read;
24334 + fops->write = mddev->pers->write;
24335 + evms_md->instance_plugin_hdr.fops = fops;
24336 + hdr = &evms_md->instance_plugin_hdr;
24338 + LOG_WARNING("%s: No memory to copy function table\n",__FUNCTION__);
24339 + rc = 0; /* clear rc and continue */
24340 + hdr = &md_plugin_header;
24345 + if (!rc && hdr) {
24346 + memset(newnode,0,sizeof(*newnode));
24347 + newnode->plugin = hdr;
24348 + newnode->total_vsectors = (u64)evms_md_size[mdidx(mddev)] * 2;
24349 + newnode->block_size = md_blocksizes[mdidx(mddev)];
24350 + newnode->hardsector_size = md_hardsect_sizes[mdidx(mddev)];
24351 + sprintf(newnode->name,"md/md%d",mdidx(mddev));
24352 + newnode->private = evms_md;
24353 + newnode->flags = flags;
24355 + rc = evms_cs_add_logical_node_to_list(discover_list, newnode);
24357 + LOG_ERROR("%s: could not add md node %s\n", __FUNCTION__, newnode->name);
24359 + LOG_DEBUG("%s: added [%s] to discover list (total_vsectors="PFU64")\n",
24360 + __FUNCTION__, newnode->name, newnode->total_vsectors);
24365 + mddev->node = newnode;
24373 + evms_cs_deallocate_logical_node(newnode);
24380 + * Function: evms_md_autostart_arrays
24381 + * Discover MD "extended" devices
24382 + * Add MD "extended" devices to pending list for further processing
24384 +static void evms_md_autostart_arrays (struct evms_logical_node **discover_list)
24386 + struct evms_logical_node *node, *next_node;
24387 + mdk_rdev_t *rdev;
24390 + LOG_ENTRY_EXIT(":autostart_arrays() ENTRY\n");
24392 + /* examine each node on the discover list */
24393 + next_node = *discover_list;
24394 + while(next_node) {
24395 + node = next_node;
24396 + next_node = node->next;
24398 + rc = evms_md_import_device(discover_list, node);
24399 + if (rc && (rc != -EEXIST)) {
24400 + LOG_EXTRA("autostart_arrrays() Not %s!\n",evms_md_partition_name(node));
24407 + rdev = evms_md_find_rdev_all(node);
24409 + LOG_ERROR("find_rdev_all() failed\n");
24412 + if (rdev->faulty) {
24418 + list_add(&rdev->pending, &pending_raid_disks);
24419 + } else if (rc == -EEXIST) {
24420 + struct evms_logical_node *md_node;
24422 + * Must be in a re-discovery process here.
24423 + * Find the EVMS MD node that this rdev is a member of
24425 + if (rdev->mddev) {
24426 + md_node = rdev->mddev->node;
24428 + rc = evms_cs_add_logical_node_to_list(discover_list,md_node);
24431 + exported_nodes++;
24432 + LOG_DETAILS("Added MD node (%s) to discover list\n",
24435 + case 1: /* already on the list */
24436 + case 2: /* already on the list */
24439 + LOG_WARNING("could not add md node (%s), rc=%d\n",
24440 + md_node->name, rc);
24443 + LOG_ERROR("This MD device [md%d] does not have an EVMS logical node.\n",
24444 + rdev->mddev->__minor);
24447 + LOG_ERROR("This device [%s] does not belong to any array!\n",
24448 + get_partition_name(rdev));
24449 + evms_md_export_rdev(rdev, TRUE);
24451 + evms_cs_remove_logical_node_from_list(discover_list,node);
24455 + evms_md_run_devices(discover_list);
24456 + LOG_DETAILS("EVMD MD:autostart_arrays() EXIT (exported_nodes=%d)\n",exported_nodes);
24459 +#ifdef CONFIG_PROC_FS
24460 +static int status_resync(char * page, off_t * offset, int count, mddev_t * mddev)
24463 + off_t off = *offset;
24464 + unsigned long max_blocks, resync, res, dt, db, rt;
24466 + resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
24467 + max_blocks = mddev->sb->size;
24470 + * Should not happen.
24472 + if (!max_blocks) {
24476 + res = (resync/1024)*1000/(max_blocks/1024 + 1);
24478 + int i, x = res/50, y = 20-x;
24480 + for (i = 0; i < x; i++)
24482 + sz += sprintf(page + sz, ">");
24483 + for (i = 0; i < y; i++)
24487 + if (!mddev->recovery_running)
24491 + PROCPRINT(" resync =%3lu.%lu%% (%lu/%lu)",
24492 + res/10, res % 10, resync, max_blocks);
24497 + PROCPRINT(" recovery =%3lu.%lu%% (%lu/%lu)",
24498 + res/10, res % 10, resync, max_blocks);
24501 + * We do not want to overflow, so the order of operands and
24502 + * the * 100 / 100 trick are important. We do a +1 to be
24503 + * safe against division by zero. We only estimate anyway.
24505 + * dt: time from mark until now
24506 + * db: blocks written from mark until now
24507 + * rt: remaining time
24509 + dt = ((jiffies - mddev->resync_mark) / HZ);
24511 + db = resync - (mddev->resync_mark_cnt/2);
24512 + rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
24514 + PROCPRINT(" finish=%lu.%lumin", rt / 60, (rt % 60)/6);
24516 + PROCPRINT(" speed=%ldK/sec", db/dt);
24523 +static int evms_md_status_read_proc(char *page, char **start, off_t off,
24524 + int count, int *eof, void *data)
24526 + int sz = 0, j, size;
24527 + struct list_head *tmp, *tmp2;
24528 + mdk_rdev_t *rdev;
24531 + PROCPRINT("Enterprise Volume Management System: MD Status\n");
24532 + PROCPRINT("Personalities : ");
24533 + for (j = 0; j < MAX_PERSONALITY; j++)
24535 + PROCPRINT("[%s] ", pers[j]->name);
24540 + ITERATE_MDDEV(mddev,tmp) {
24541 + PROCPRINT("md%d : %sactive", mdidx(mddev),
24542 + mddev->pers ? "" : "in");
24543 + if (mddev->pers) {
24545 + PROCPRINT(" (read-only)");
24546 + PROCPRINT(" %s", mddev->pers->name);
24550 + ITERATE_RDEV(mddev,rdev,tmp2) {
24551 + PROCPRINT(" %s[%d]",
24552 + rdev->node->name, rdev->desc_nr);
24553 + if (rdev->faulty) {
24554 + PROCPRINT("(F)");
24557 + size += rdev->size;
24560 + if (mddev->nb_dev) {
24562 + PROCPRINT("\n "PFU64" blocks",
24563 + mddev->node->total_vsectors >> 1);
24565 + PROCPRINT("\n %d blocks", size);
24568 + if (!mddev->pers) {
24573 + sz += mddev->pers->status (page+sz, mddev);
24575 + PROCPRINT("\n ");
24576 + if (mddev->curr_resync) {
24577 + sz += status_resync (page+sz, &off, count, mddev);
24579 + if (atomic_read(&mddev->resync_sem.count) != 1)
24580 + PROCPRINT(" resync=DELAYED");
24587 + *start = page + off;
24591 + return sz > count ? count : sz;
24595 +/* Function: md_core_init
24597 +int __init md_core_init(void)
24599 +#ifdef CONFIG_PROC_FS
24600 + struct proc_dir_entry *evms_proc_dir;
24603 +#ifdef CONFIG_PROC_FS
24604 + evms_proc_dir = evms_cs_get_evms_proc_dir();
24605 + if (evms_proc_dir) {
24606 + create_proc_read_entry("mdstat", 0, evms_proc_dir, evms_md_status_read_proc, NULL);
24608 + md_table_header = register_sysctl_table(dev_dir_table, 1);
24611 + return evms_cs_register_plugin(&md_plugin_header);
24614 +static void __exit md_core_exit(void)
24616 +#ifdef CONFIG_PROC_FS
24617 + struct proc_dir_entry *evms_proc_dir;
24619 + evms_proc_dir = evms_cs_get_evms_proc_dir();
24620 + if (evms_proc_dir) {
24621 + remove_proc_entry("mdstat", evms_proc_dir);
24623 + unregister_sysctl_table(md_table_header);
24625 + evms_cs_unregister_plugin(&md_plugin_header);
24628 +module_init(md_core_init);
24629 +module_exit(md_core_exit);
24630 +#ifdef MODULE_LICENSE
24631 +MODULE_LICENSE("GPL");
24635 + * In order to have the coexistence of this EVMS plugin and the original MD
24636 + * module, the symbols exported by this plugin are prefixed with "evms_"
24639 +MD_EXPORT_SYMBOL(evms_md_size);
24640 +MD_EXPORT_SYMBOL(evms_register_md_personality);
24641 +MD_EXPORT_SYMBOL(evms_unregister_md_personality);
24642 + /* Export the following function for use with rdev->node in evms_md_k.h */
24643 +MD_EXPORT_SYMBOL(evms_md_partition_name);
24644 + /* Export the following function for use with disks[] in md_p.h */
24645 +MD_EXPORT_SYMBOL(evms_md_error);
24646 +MD_EXPORT_SYMBOL(evms_md_error_dev);
24647 +MD_EXPORT_SYMBOL(evms_md_update_sb);
24648 +MD_EXPORT_SYMBOL(evms_md_find_rdev_nr);
24649 +MD_EXPORT_SYMBOL(evms_md_find_rdev);
24650 +MD_EXPORT_SYMBOL(evms_md_find_rdev_from_node);
24651 +MD_EXPORT_SYMBOL(evms_md_print_devices);
24652 +MD_EXPORT_SYMBOL(evms_mddev_map);
24653 +MD_EXPORT_SYMBOL(evms_md_check_ordering);
24654 +MD_EXPORT_SYMBOL(evms_md_partial_sync_io);
24655 +MD_EXPORT_SYMBOL(evms_md_sync_io);
24656 +MD_EXPORT_SYMBOL(evms_md_do_sync);
24657 +MD_EXPORT_SYMBOL(evms_md_sync_acct);
24658 +MD_EXPORT_SYMBOL(evms_md_done_sync);
24659 +MD_EXPORT_SYMBOL(evms_md_recover_arrays);
24660 +MD_EXPORT_SYMBOL(evms_md_get_spare);
24662 diff -Naur linux-2002-09-30/drivers/evms/md_linear.c evms-2002-09-30/drivers/evms/md_linear.c
24663 --- linux-2002-09-30/drivers/evms/md_linear.c Wed Dec 31 18:00:00 1969
24664 +++ evms-2002-09-30/drivers/evms/md_linear.c Thu Aug 15 13:50:12 2002
24667 + linear.c : Multiple Devices driver for Linux
24668 + Copyright (C) 1994-96 Marc ZYNGIER
24669 + <zyngier@ufr-info-p7.ibp.fr> or
24670 + <maz@gloups.fdn.fr>
24672 + Linear mode management functions.
24674 + This program is free software; you can redistribute it and/or modify
24675 + it under the terms of the GNU General Public License as published by
24676 + the Free Software Foundation; either version 2, or (at your option)
24677 + any later version.
24679 + You should have received a copy of the GNU General Public License
24680 + (for example /usr/src/linux/COPYING); if not, write to the Free
24681 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24684 +#include <linux/module.h>
24685 +#include <linux/evms/evms_md.h>
24686 +#include <linux/evms/evms_linear.h>
24687 +#include <linux/slab.h>
24690 +#define MAJOR_NR MD_MAJOR
24692 +#define MD_PERSONALITY
24694 +#define LOG_PREFIX "md linear: "
24695 +static int linear_run (mddev_t *mddev)
24697 + linear_conf_t *conf;
24698 + struct linear_hash *table;
24699 + mdk_rdev_t *rdev;
24700 + int size, i, j, nb_zone;
24701 + unsigned int curr_offset;
24703 + MOD_INC_USE_COUNT;
24705 + conf = kmalloc (sizeof (*conf), GFP_KERNEL);
24708 + mddev->private = conf;
24710 + if (evms_md_check_ordering(mddev)) {
24711 + printk("linear: disks are not ordered, aborting!\n");
24716 + * Find the smallest device.
24719 + conf->smallest = NULL;
24721 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
24722 + dev_info_t *disk = conf->disks + j;
24723 + disk->node = rdev->node;
24724 + disk->dev = rdev->dev;
24725 + disk->size = rdev->size;
24726 + disk->offset = curr_offset;
24728 + curr_offset += disk->size;
24730 + if (!conf->smallest || (disk->size < conf->smallest->size))
24731 + conf->smallest = disk;
24734 + nb_zone = conf->nr_zones = evms_md_size[mdidx(mddev)] / conf->smallest->size +
24735 + ((evms_md_size[mdidx(mddev)] % conf->smallest->size) ? 1 : 0);
24737 + conf->hash_table = kmalloc (sizeof (struct linear_hash) * nb_zone,
24739 + if (!conf->hash_table)
24743 + * Here we generate the linear hash table
24745 + table = conf->hash_table;
24748 + for (j = 0; j < mddev->nb_dev; j++) {
24749 + dev_info_t *disk = conf->disks + j;
24752 + table[-1].dev1 = disk;
24754 + size += disk->size;
24757 + table->dev0 = disk;
24758 + table->dev1 = NULL;
24759 + size -= conf->smallest->size;
24763 + if (table-conf->hash_table != nb_zone)
24765 + LOG_DETAILS("%s: nr_zones=%d, smallest=%lu\n",
24766 + __FUNCTION__, conf->nr_zones, conf->smallest->size);
24772 + MOD_DEC_USE_COUNT;
24776 +static int linear_stop (mddev_t *mddev)
24778 + linear_conf_t *conf = mddev_to_conf(mddev);
24780 + kfree(conf->hash_table);
24783 + MOD_DEC_USE_COUNT;
24789 + * Function: linear_map
24791 +static int linear_map(
24793 + struct evms_logical_node **node,
24794 + struct buffer_head *bh)
24796 + linear_conf_t *conf = mddev_to_conf(mddev);
24797 + struct linear_hash *hash;
24798 + dev_info_t *tmp_dev;
24799 + unsigned long block;
24801 + block = (bh->b_rsector >> 1);
24802 + hash = conf->hash_table + (block / conf->smallest->size);
24803 + if (block >= (hash->dev0->size + hash->dev0->offset)) {
24804 + if (!hash->dev1) {
24805 + LOG_ERROR("%s: hash->dev1==NULL for block %ld\n", __FUNCTION__, block);
24808 + tmp_dev = hash->dev1;
24810 + tmp_dev = hash->dev0;
24812 + if ( (block + (bh->b_size >> 10)) > (tmp_dev->size + tmp_dev->offset)
24813 + || block < tmp_dev->offset) {
24814 + LOG_ERROR("%s: Block %ld out of bounds on node %s size %ld offset %ld\n",
24817 + tmp_dev->node->name,
24819 + tmp_dev->offset);
24822 + bh->b_rsector -= (tmp_dev->offset << 1);
24823 + *node = tmp_dev->node;
24827 +static void linear_read(
24828 + struct evms_logical_node *md_node,
24829 + struct buffer_head *bh)
24831 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24832 + struct evms_logical_node *node;
24834 + if (evms_md_check_boundary(md_node, bh)) return;
24836 + if (!linear_map(mddev, &node, bh)) {
24839 + bh->b_end_io(bh, 0);
24843 +static void linear_write(
24844 + struct evms_logical_node *md_node,
24845 + struct buffer_head *bh)
24847 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
24848 + struct evms_logical_node *node;
24850 + if (evms_md_check_boundary(md_node, bh)) return;
24852 + if (!linear_map(mddev, &node, bh)) {
24855 + bh->b_end_io(bh, 0);
24859 +static int linear_status (char *page, mddev_t *mddev)
24866 + linear_conf_t *conf = mddev_to_conf(mddev);
24868 + sz += sprintf(page+sz, " ");
24869 + for (j = 0; j < conf->nr_zones; j++)
24871 + sz += sprintf(page+sz, "[%s",
24872 + partition_name(conf->hash_table[j].dev0->dev));
24874 + if (conf->hash_table[j].dev1)
24875 + sz += sprintf(page+sz, "/%s] ",
24876 + partition_name(conf->hash_table[j].dev1->dev));
24878 + sz += sprintf(page+sz, "] ");
24880 + sz += sprintf(page+sz, "\n");
24882 + sz += sprintf(page+sz, " %dk rounding", mddev->chunk_size/1024);
24886 +static int linear_evms_ioctl (
24888 + struct inode * inode,
24889 + struct file * file,
24890 + unsigned int cmd,
24891 + unsigned long arg)
24894 + struct evms_logical_node *node;
24897 + case EVMS_GET_BMAP:
24899 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
24900 + struct buffer_head *bh =
24901 + evms_cs_allocate_from_pool(evms_bh_pool, FALSE);
24903 + bh->b_rsector = (unsigned long)bmap->rsector;
24904 + bh->b_size = node->block_size;
24905 + rc = linear_map(mddev, &node, bh);
24907 + bmap->rsector = (u64)bh->b_rsector;
24909 + rc = IOCTL(node, inode, file, cmd, arg);
24913 + evms_cs_deallocate_to_pool(evms_bh_pool, bh);
24925 +static mdk_personality_t linear_personality = {
24926 + .name = "evms_linear",
24927 + .read = linear_read,
24928 + .write = linear_write,
24929 + .run = linear_run,
24930 + .stop = linear_stop,
24931 + .status = linear_status,
24932 + .evms_ioctl = linear_evms_ioctl
24935 +static int md__init linear_init (void)
24937 + return evms_register_md_personality (LINEAR, &linear_personality);
24940 +static void linear_exit (void)
24942 + evms_unregister_md_personality (LINEAR);
24946 +module_init(linear_init);
24947 +module_exit(linear_exit);
24948 +#ifdef MODULE_LICENSE
24949 +MODULE_LICENSE("GPL");
24951 diff -Naur linux-2002-09-30/drivers/evms/md_raid0.c evms-2002-09-30/drivers/evms/md_raid0.c
24952 --- linux-2002-09-30/drivers/evms/md_raid0.c Wed Dec 31 18:00:00 1969
24953 +++ evms-2002-09-30/drivers/evms/md_raid0.c Thu Aug 15 13:50:12 2002
24956 + raid0.c : Multiple Devices driver for Linux
24957 + Copyright (C) 1994-96 Marc ZYNGIER
24958 + <zyngier@ufr-info-p7.ibp.fr> or
24959 + <maz@gloups.fdn.fr>
24960 + Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
24963 + RAID-0 management functions.
24965 + This program is free software; you can redistribute it and/or modify
24966 + it under the terms of the GNU General Public License as published by
24967 + the Free Software Foundation; either version 2, or (at your option)
24968 + any later version.
24970 + You should have received a copy of the GNU General Public License
24971 + (for example /usr/src/linux/COPYING); if not, write to the Free
24972 + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24975 +#include <linux/module.h>
24976 +#include <linux/evms/evms_raid0.h>
24978 +#define MAJOR_NR MD_MAJOR
24980 +#define MD_PERSONALITY
24982 +#define LOG_PREFIX "md raid0: "
24984 +static int create_strip_zones (mddev_t *mddev)
24986 + int i, c, j, j1, j2;
24987 + unsigned long current_offset, curr_zone_offset, rdev_size_in_sects;
24988 + raid0_conf_t *conf = mddev_to_conf(mddev);
24989 + mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
24992 + * The number of 'same size groups'
24994 + conf->nr_strip_zones = 0;
24996 + ITERATE_RDEV_ORDERED(mddev,rdev1,j1) {
24997 + LOG_DEBUG(" looking at %s\n", evms_md_partition_name(rdev1->node));
24999 + ITERATE_RDEV_ORDERED(mddev,rdev2,j2) {
25000 + LOG_DEBUG(" comparing %s(%ld sectors) with %s(%ld sectors)\n",
25001 + evms_md_partition_name(rdev1->node), rdev1->size << 1,
25002 + evms_md_partition_name(rdev2->node), rdev2->size << 1);
25003 + if (rdev2 == rdev1) {
25004 + LOG_DEBUG(" END\n");
25007 + if (rdev2->size == rdev1->size)
25010 + * Not unique, don't count it as a new
25013 + LOG_DEBUG(" EQUAL\n");
25017 + LOG_DEBUG(" NOT EQUAL\n");
25020 + LOG_DEBUG(" ==> UNIQUE\n");
25021 + conf->nr_strip_zones++;
25022 + LOG_DEBUG(" %d zones\n",conf->nr_strip_zones);
25025 + LOG_DEBUG(" FINAL %d zones\n",conf->nr_strip_zones);
25027 + conf->strip_zone = vmalloc(sizeof(struct strip_zone)*
25028 + conf->nr_strip_zones);
25029 + if (!conf->strip_zone)
25033 + conf->smallest = NULL;
25034 + current_offset = 0;
25035 + curr_zone_offset = 0;
25037 + for (i = 0; i < conf->nr_strip_zones; i++)
25039 + struct strip_zone *zone = conf->strip_zone + i;
25041 + LOG_DEBUG(" zone %d\n", i);
25042 + zone->dev_offset = current_offset;
25046 + ITERATE_RDEV_ORDERED(mddev,rdev,j) {
25048 + LOG_DEBUG(" checking %s ...",evms_md_partition_name(rdev->node));
25049 + rdev_size_in_sects = rdev->size << 1;
25050 + if (rdev_size_in_sects > current_offset)
25052 + LOG_DEBUG(" contained as device %d\n", c);
25053 + zone->node[c] = rdev->node;
25055 + if (!smallest || (rdev_size_in_sects < (smallest->size <<1) )) {
25057 + LOG_DEBUG(" (%ld) is smallest!.\n", rdev_size_in_sects);
25060 + LOG_DEBUG(" nope.\n");
25063 + zone->nb_dev = c;
25064 + zone->size_in_sects = ((smallest->size <<1) - current_offset) * c;
25065 + LOG_DEBUG(" zone->nb_dev: %d, size: %ld\n",
25066 + zone->nb_dev,zone->size_in_sects);
25068 + if (!conf->smallest || (zone->size_in_sects < conf->smallest->size_in_sects))
25069 + conf->smallest = zone;
25071 + zone->zone_offset = curr_zone_offset;
25072 + curr_zone_offset += zone->size_in_sects;
25074 + current_offset = smallest->size << 1;
25075 + LOG_DEBUG(" current zone offset: %ld\n",current_offset);
25077 + LOG_DEBUG(" done.\n");
25081 +static int raid0_run (mddev_t *mddev)
25083 + unsigned long cur=0, i=0, size, zone0_size, nb_zone;
25084 + unsigned long mddev_size_in_sects = evms_md_size[mdidx(mddev)] << 1;
25085 + raid0_conf_t *conf;
25087 + MOD_INC_USE_COUNT;
25089 + conf = vmalloc(sizeof (raid0_conf_t));
25092 + mddev->private = (void *)conf;
25094 + if (evms_md_check_ordering(mddev)) {
25095 + LOG_ERROR("disks are not ordered, aborting!\n");
25096 + goto out_free_conf;
25099 + if (create_strip_zones (mddev))
25100 + goto out_free_conf;
25102 + LOG_DETAILS("evms_md_size is %ld sectors.\n", mddev_size_in_sects);
25103 + LOG_DETAILS("conf->smallest->size_in_sects is %ld sectors.\n", conf->smallest->size_in_sects);
25104 + nb_zone = mddev_size_in_sects / conf->smallest->size_in_sects +
25105 + (mddev_size_in_sects % conf->smallest->size_in_sects ? 1 : 0);
25106 + LOG_DETAILS("nb_zone is %ld.\n", nb_zone);
25107 + conf->nr_zones = nb_zone;
25109 + LOG_DEBUG("Allocating %ld bytes for hash.\n", nb_zone*sizeof(struct raid0_hash));
25111 + conf->hash_table = vmalloc (sizeof (struct raid0_hash)*nb_zone);
25112 + if (!conf->hash_table)
25113 + goto out_free_zone_conf;
25114 + size = conf->strip_zone[cur].size_in_sects;
25117 + while (cur < conf->nr_strip_zones) {
25118 + conf->hash_table[i].zone0 = conf->strip_zone + cur;
25121 + * If we completely fill the slot
25123 + if (size >= conf->smallest->size_in_sects) {
25124 + conf->hash_table[i++].zone1 = NULL;
25125 + size -= conf->smallest->size_in_sects;
25128 + if (++cur == conf->nr_strip_zones)
25130 + size = conf->strip_zone[cur].size_in_sects;
25134 + if (++cur == conf->nr_strip_zones) {
25136 + * Last dev, set unit1 as NULL
25138 + conf->hash_table[i].zone1=NULL;
25143 + * Here we use a 2nd dev to fill the slot
25145 + zone0_size = size;
25146 + size = conf->strip_zone[cur].size_in_sects;
25147 + conf->hash_table[i++].zone1 = conf->strip_zone + cur;
25148 + size -= (conf->smallest->size_in_sects - zone0_size);
25152 +out_free_zone_conf:
25153 + vfree(conf->strip_zone);
25154 + conf->strip_zone = NULL;
25158 + mddev->private = NULL;
25160 + MOD_DEC_USE_COUNT;
25164 +static int raid0_stop (mddev_t *mddev)
25166 + raid0_conf_t *conf = mddev_to_conf(mddev);
25168 + vfree (conf->hash_table);
25169 + conf->hash_table = NULL;
25170 + vfree (conf->strip_zone);
25171 + conf->strip_zone = NULL;
25173 + mddev->private = NULL;
25175 + MOD_DEC_USE_COUNT;
25181 + * Function: raid0_map
25183 + * Return 0 for success, else error
25187 +static inline int raid0_map(
25189 + unsigned long lsn,
25190 + unsigned long size,
25191 + struct evms_logical_node **node,
25192 + unsigned long *new_lsn,
25193 + unsigned long *new_size)
25195 + unsigned int sect_in_chunk, chunksize_bits, chunk_size_in_sects;
25196 + raid0_conf_t *conf = mddev_to_conf(mddev);
25197 + struct raid0_hash *hash;
25198 + struct strip_zone *zone;
25199 + unsigned long chunk;
25201 + chunk_size_in_sects = mddev->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT;
25202 + chunksize_bits = ffz(~chunk_size_in_sects);
25203 + hash = conf->hash_table + (lsn / conf->smallest->size_in_sects);
25205 + /* Sanity check */
25209 + if (!hash->zone0)
25212 + if (lsn >= (hash->zone0->size_in_sects + hash->zone0->zone_offset)) {
25213 + if (!hash->zone1)
25215 + zone = hash->zone1;
25217 + zone = hash->zone0;
25219 + sect_in_chunk = lsn & (chunk_size_in_sects - 1);
25220 + chunk = (lsn - zone->zone_offset) / (zone->nb_dev << chunksize_bits);
25221 + *node = zone->node[(lsn >> chunksize_bits) % zone->nb_dev];
25223 + *new_lsn = ((chunk << chunksize_bits) + zone->dev_offset) + sect_in_chunk;
25225 + *new_size = (size <= chunk_size_in_sects - sect_in_chunk) ?
25226 + size : chunk_size_in_sects - sect_in_chunk;
25231 + LOG_ERROR("%s: bug: hash==NULL for lsn %lu\n", __FUNCTION__, lsn);
25234 + LOG_ERROR("%s: bug: hash->zone0==NULL for lsn %lu\n", __FUNCTION__, lsn);
25237 + LOG_ERROR("%s: bug: hash->zone1==NULL for lsn %lu\n", __FUNCTION__, lsn);
25242 +void raid0_error(int rw, struct evms_logical_node *node, struct buffer_head *bh)
25244 + LOG_ERROR(" %s FAILED on node(%s) rsector(%lu) size(%d)\n",
25245 + (rw == READ) ? "READ" : "WRITE",
25250 + bh->b_end_io(bh, 0);
25253 +static inline void raid0_rw (
25254 + struct evms_logical_node *md_node,
25255 + struct buffer_head *bh,
25258 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25259 + struct evms_logical_node *node;
25260 + unsigned long new_lsn, size_in_sects, new_size;
25262 + if (evms_md_check_boundary(md_node, bh)) return;
25263 + size_in_sects = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
25264 + if (!raid0_map(mddev, bh->b_rsector, size_in_sects, &node, &new_lsn, &new_size)) {
25265 + if (new_size == size_in_sects) {
25267 + * This is the normal case:
25268 + * the request is entirely within the stripe boundary
25270 + bh->b_rsector = new_lsn;
25271 + if (rw == READ) {
25280 + * Need more processing here (ie. break up the request)
25282 + LOG_ERROR("This version of EVMS RAID0 does not support I/O requests that are:\n");
25283 + LOG_ERROR(" - larger than the stripe size\n");
25284 + LOG_ERROR(" - cross the stripe boundary\n");
25287 + raid0_error(rw, node, bh);
25290 +static void raid0_read(
25291 + struct evms_logical_node *md_node,
25292 + struct buffer_head *bh)
25294 + raid0_rw(md_node, bh, READ);
25297 +static void raid0_write(
25298 + struct evms_logical_node *md_node,
25299 + struct buffer_head *bh)
25301 + raid0_rw(md_node, bh, WRITE);
25304 +static int raid0_status (char *page, mddev_t *mddev)
25310 + raid0_conf_t *conf = mddev_to_conf(mddev);
25312 + sz += sprintf(page + sz, " ");
25313 + for (j = 0; j < conf->nr_zones; j++) {
25314 + sz += sprintf(page + sz, "[z%d",
25315 + conf->hash_table[j].zone0 - conf->strip_zone);
25316 + if (conf->hash_table[j].zone1)
25317 + sz += sprintf(page+sz, "/z%d] ",
25318 + conf->hash_table[j].zone1 - conf->strip_zone);
25320 + sz += sprintf(page+sz, "] ");
25323 + sz += sprintf(page + sz, "\n");
25325 + for (j = 0; j < conf->nr_strip_zones; j++) {
25326 + sz += sprintf(page + sz, " z%d=[", j);
25327 + for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
25328 + sz += sprintf (page+sz, "%s/", conf->strip_zone[j].node[k]->name);
25330 + sz += sprintf (page+sz, "] zo=%d do=%d s=%d\n",
25331 + conf->strip_zone[j].zone_offset,
25332 + conf->strip_zone[j].dev_offset,
25333 + conf->strip_zone[j].size_in_sects);
25336 + sz += sprintf(page + sz, " %dk chunks", mddev->chunk_size/1024);
25340 +static int raid0_evms_ioctl (
25342 + struct inode * inode,
25343 + struct file * file,
25344 + unsigned int cmd,
25345 + unsigned long arg)
25348 + struct evms_logical_node *node;
25351 + case EVMS_GET_BMAP:
25353 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
25354 + unsigned long new_lsn, new_size;
25355 + unsigned long size = mddev->node->block_size >> EVMS_VSECTOR_SIZE_SHIFT;
25356 + rc = raid0_map(mddev,
25357 + (unsigned long)bmap->rsector,
25364 + bmap->rsector = (u64)new_lsn;
25365 + rc = IOCTL(node, inode, file, cmd, arg);
25378 +static mdk_personality_t raid0_personality = {
25379 + .name = "evms_raid0",
25380 + .read = raid0_read,
25381 + .write = raid0_write,
25382 + .run = raid0_run,
25383 + .stop = raid0_stop,
25384 + .status = raid0_status,
25385 + .evms_ioctl = raid0_evms_ioctl
25388 +static int md__init raid0_init (void)
25390 + return evms_register_md_personality (RAID0, &raid0_personality);
25393 +static void raid0_exit (void)
25395 + evms_unregister_md_personality (RAID0);
25398 +module_init(raid0_init);
25399 +module_exit(raid0_exit);
25400 +#ifdef MODULE_LICENSE
25401 +MODULE_LICENSE("GPL");
25403 diff -Naur linux-2002-09-30/drivers/evms/md_raid1.c evms-2002-09-30/drivers/evms/md_raid1.c
25404 --- linux-2002-09-30/drivers/evms/md_raid1.c Wed Dec 31 18:00:00 1969
25405 +++ evms-2002-09-30/drivers/evms/md_raid1.c Mon Sep 30 00:02:48 2002
25408 + * md_raid1.c : Multiple Devices driver for Linux
25410 + * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
25412 + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
25414 + * RAID-1 management functions.
25416 + * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
25418 + * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
25419 + * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
25421 + * 'md_raid1.c' is an EVMS version of linux/drivers/md/raid1.c modified
25422 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
25424 + * This program is free software; you can redistribute it and/or modify
25425 + * it under the terms of the GNU General Public License as published by
25426 + * the Free Software Foundation; either version 2, or (at your option)
25427 + * any later version.
25429 + * You should have received a copy of the GNU General Public License
25430 + * (for example /usr/src/linux/COPYING); if not, write to the Free
25431 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25434 +#include <linux/module.h>
25435 +#include <linux/slab.h>
25436 +#include <linux/evms/evms_raid1.h>
25437 +#include <asm/atomic.h>
25439 +#define MAJOR_NR MD_MAJOR
25441 +#define MD_PERSONALITY
25443 +#define MAX_WORK_PER_DISK 128
25445 +#define NR_RESERVED_BUFS 32
25447 +#define LOG_PREFIX "md raid1: "
25449 + * The following can be used to debug the driver
25451 +#define RAID1_DEBUG 0
25454 +#define PRINTK(x...) LOG_DEFAULT(x)
25456 +#define __inline__
25458 +#define PRINTK(x...) do { } while (0)
25462 +static mdk_personality_t raid1_personality;
25463 +static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
25464 +struct raid1_bh *evms_raid1_retry_list = NULL, **evms_raid1_retry_tail;
25466 +static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
25468 + /* return a linked list of "cnt" struct buffer_heads.
25469 + * don't take any off the free list unless we know we can
25470 + * get all we need, otherwise we could deadlock
25472 + struct buffer_head *bh=NULL;
25475 + struct buffer_head *t;
25476 + md_spin_lock_irq(&conf->device_lock);
25477 + if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
25479 + t = conf->freebh;
25480 + conf->freebh = t->b_next;
25484 + conf->freebh_cnt--;
25487 + md_spin_unlock_irq(&conf->device_lock);
25490 + t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
25496 + PRINTK("raid1: waiting for %d bh\n", cnt);
25497 + conf->freebh_blocked = 1;
25498 + wait_disk_event(conf->wait_buffer,
25499 + !conf->freebh_blocked ||
25500 + conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
25501 + conf->freebh_blocked = 0;
25507 +static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
25509 + unsigned long flags;
25510 + spin_lock_irqsave(&conf->device_lock, flags);
25512 + struct buffer_head *t = bh;
25514 + if (t->b_pprev == NULL)
25515 + kmem_cache_free(bh_cachep, t);
25517 + t->b_next= conf->freebh;
25518 + conf->freebh = t;
25519 + conf->freebh_cnt++;
25522 + spin_unlock_irqrestore(&conf->device_lock, flags);
25523 + wake_up(&conf->wait_buffer);
25526 +static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
25528 + /* allocate cnt buffer_heads, possibly less if kmalloc fails */
25531 + while (i < cnt) {
25532 + struct buffer_head *bh;
25533 + bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
25536 + md_spin_lock_irq(&conf->device_lock);
25537 + bh->b_pprev = &conf->freebh;
25538 + bh->b_next = conf->freebh;
25539 + conf->freebh = bh;
25540 + conf->freebh_cnt++;
25541 + md_spin_unlock_irq(&conf->device_lock);
25548 +static void raid1_shrink_bh(raid1_conf_t *conf)
25550 + /* discard all buffer_heads */
25552 + md_spin_lock_irq(&conf->device_lock);
25553 + while (conf->freebh) {
25554 + struct buffer_head *bh = conf->freebh;
25555 + conf->freebh = bh->b_next;
25556 + kmem_cache_free(bh_cachep, bh);
25557 + conf->freebh_cnt--;
25559 + md_spin_unlock_irq(&conf->device_lock);
25563 +static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
25565 + struct raid1_bh *r1_bh = NULL;
25568 + md_spin_lock_irq(&conf->device_lock);
25569 + if (!conf->freer1_blocked && conf->freer1) {
25570 + r1_bh = conf->freer1;
25571 + conf->freer1 = r1_bh->next_r1;
25572 + conf->freer1_cnt--;
25573 + r1_bh->next_r1 = NULL;
25574 + r1_bh->state = (1 << R1BH_PreAlloc);
25575 + r1_bh->bh_req.b_state = 0;
25577 + md_spin_unlock_irq(&conf->device_lock);
25580 + r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
25582 + memset(r1_bh, 0, sizeof(*r1_bh));
25585 + conf->freer1_blocked = 1;
25586 + wait_disk_event(conf->wait_buffer,
25587 + !conf->freer1_blocked ||
25588 + conf->freer1_cnt > NR_RESERVED_BUFS/2
25590 + conf->freer1_blocked = 0;
25594 +static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
25596 + struct buffer_head *bh = r1_bh->mirror_bh_list;
25597 + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
25599 + r1_bh->mirror_bh_list = NULL;
25601 + if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
25602 + unsigned long flags;
25603 + spin_lock_irqsave(&conf->device_lock, flags);
25604 + r1_bh->next_r1 = conf->freer1;
25605 + conf->freer1 = r1_bh;
25606 + conf->freer1_cnt++;
25607 + spin_unlock_irqrestore(&conf->device_lock, flags);
25608 + /* don't need to wakeup wait_buffer because
25609 + * raid1_free_bh below will do that
25614 + raid1_free_bh(conf, bh);
25617 +static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
25621 + while (i < cnt) {
25622 + struct raid1_bh *r1_bh;
25623 + r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
25626 + memset(r1_bh, 0, sizeof(*r1_bh));
25627 + set_bit(R1BH_PreAlloc, &r1_bh->state);
25628 + r1_bh->mddev = conf->mddev;
25630 + raid1_free_r1bh(r1_bh);
25636 +static void raid1_shrink_r1bh(raid1_conf_t *conf)
25638 + md_spin_lock_irq(&conf->device_lock);
25639 + while (conf->freer1) {
25640 + struct raid1_bh *r1_bh = conf->freer1;
25641 + conf->freer1 = r1_bh->next_r1;
25642 + conf->freer1_cnt--;
25645 + md_spin_unlock_irq(&conf->device_lock);
25650 +static inline void raid1_free_buf(struct raid1_bh *r1_bh)
25652 + unsigned long flags;
25653 + struct buffer_head *bh = r1_bh->mirror_bh_list;
25654 + raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
25655 + r1_bh->mirror_bh_list = NULL;
25657 + spin_lock_irqsave(&conf->device_lock, flags);
25658 + r1_bh->next_r1 = conf->freebuf;
25659 + conf->freebuf = r1_bh;
25660 + spin_unlock_irqrestore(&conf->device_lock, flags);
25661 + raid1_free_bh(conf, bh);
25664 +static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
25666 + struct raid1_bh *r1_bh;
25668 + md_spin_lock_irq(&conf->device_lock);
25669 + wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
25670 + r1_bh = conf->freebuf;
25671 + conf->freebuf = r1_bh->next_r1;
25672 + r1_bh->next_r1= NULL;
25673 + md_spin_unlock_irq(&conf->device_lock);
25677 +static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
25681 + md_spin_lock_irq(&conf->device_lock);
25682 + while (i < cnt) {
25683 + struct raid1_bh *r1_bh;
25684 + struct page *page;
25686 + page = alloc_page(GFP_KERNEL);
25690 + r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
25692 + __free_page(page);
25695 + memset(r1_bh, 0, sizeof(*r1_bh));
25696 + r1_bh->bh_req.b_page = page;
25697 + r1_bh->bh_req.b_data = page_address(page);
25698 + r1_bh->next_r1 = conf->freebuf;
25699 + conf->freebuf = r1_bh;
25702 + md_spin_unlock_irq(&conf->device_lock);
25706 +static void raid1_shrink_buffers (raid1_conf_t *conf)
25708 + md_spin_lock_irq(&conf->device_lock);
25709 + while (conf->freebuf) {
25710 + struct raid1_bh *r1_bh = conf->freebuf;
25711 + conf->freebuf = r1_bh->next_r1;
25712 + __free_page(r1_bh->bh_req.b_page);
25715 + md_spin_unlock_irq(&conf->device_lock);
25720 + * EVMS raid1 version of raid1_map()
25722 +static int evms_raid1_map (mddev_t *mddev, struct evms_logical_node **node, kdev_t *rdev)
25724 + raid1_conf_t *conf = mddev_to_conf(mddev);
25728 + * Later we do read balancing on the read side
25729 + * now we use the first available disk.
25732 + for (i = 0; i < MD_SB_DISKS; i++) {
25733 + if (conf->mirrors[i].operational) {
25734 + *node = conf->mirrors[i].node;
25735 + *rdev = conf->mirrors[i].dev;
25740 + LOG_ERROR("huh, no more operational devices?\n");
25744 +static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
25746 + unsigned long flags;
25747 + mddev_t *mddev = r1_bh->mddev;
25748 + raid1_conf_t *conf = mddev_to_conf(mddev);
25750 + md_spin_lock_irqsave(&retry_list_lock, flags);
25751 + if (evms_raid1_retry_list == NULL)
25752 + evms_raid1_retry_tail = &evms_raid1_retry_list;
25753 + *evms_raid1_retry_tail = r1_bh;
25754 + evms_raid1_retry_tail = &r1_bh->next_r1;
25755 + r1_bh->next_r1 = NULL;
25756 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
25757 + evms_cs_wakeup_thread(conf->thread);
25761 +static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
25763 + unsigned long flags;
25764 + spin_lock_irqsave(&conf->segment_lock, flags);
25765 + if (sector < conf->start_active)
25766 + conf->cnt_done--;
25767 + else if (sector >= conf->start_future && conf->phase == phase)
25768 + conf->cnt_future--;
25769 + else if (!--conf->cnt_pending)
25770 + wake_up(&conf->wait_ready);
25772 + spin_unlock_irqrestore(&conf->segment_lock, flags);
25775 +static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
25777 + unsigned long flags;
25778 + spin_lock_irqsave(&conf->segment_lock, flags);
25779 + if (sector >= conf->start_ready)
25780 + --conf->cnt_ready;
25781 + else if (sector >= conf->start_active) {
25782 + if (!--conf->cnt_active) {
25783 + conf->start_active = conf->start_ready;
25784 + wake_up(&conf->wait_done);
25787 + spin_unlock_irqrestore(&conf->segment_lock, flags);
25791 + * raid1_end_bh_io() is called when we have finished servicing a mirrored
25792 + * operation and are ready to return a success/failure code to the buffer
25795 +static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
25797 + struct buffer_head *bh = r1_bh->master_bh;
25799 + io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
25800 + test_bit(R1BH_SyncPhase, &r1_bh->state));
25802 + bh->b_end_io(bh, uptodate);
25803 + raid1_free_r1bh(r1_bh);
25806 +void raid1_end_read_request (struct buffer_head *bh, int uptodate)
25808 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
25809 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL);
25811 + set_bit (R1BH_Uptodate, &r1_bh->state);
25812 + raid1_end_bh_io(r1_bh, uptodate);
25814 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
25815 + LOG_ERROR("rescheduling block %lu\n", bh->b_blocknr);
25816 + raid1_reschedule_retry(r1_bh);
25820 +void raid1_end_write_request (struct buffer_head *bh, int uptodate)
25822 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
25824 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, -1, NULL);
25826 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
25828 + set_bit (R1BH_Uptodate, &r1_bh->state);
25831 + * Let's see if all mirrored write operations have finished
25834 + if (atomic_dec_and_test(&r1_bh->remaining))
25835 + raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
25839 + * This routine returns the disk from which the requested read should
25840 + * be done. It bookkeeps the last read position for every disk
25841 + * in array and when new read requests come, the disk which last
25842 + * position is nearest to the request, is chosen.
25844 + * TODO: now if there are 2 mirrors in the same 2 devices, performance
25845 + * degrades dramatically because position is mirror, not device based.
25846 + * This should be changed to be device based. Also atomic sequential
25847 + * reads should be somehow balanced.
25850 +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
25852 + int new_disk = conf->last_used;
25853 + const int sectors = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
25854 + const unsigned long this_sector = bh->b_rsector;
25855 + int disk = new_disk;
25856 + unsigned long new_distance;
25857 + unsigned long current_distance;
25860 + * Check if it is sane at all to balance
25863 + if (conf->resync_mirrors || conf->mddev->recovery_running)
25867 + /* make sure that disk is operational */
25868 + while( !conf->mirrors[new_disk].operational) {
25869 + if (new_disk <= 0) new_disk = conf->raid_disks;
25871 + if (new_disk == disk) {
25873 + * This means no working disk was found
25874 + * Nothing much to do, lets not change anything
25875 + * and hope for the best...
25878 + new_disk = conf->last_used;
25884 + /* now disk == new_disk == starting point for search */
25887 + * Don't touch anything for sequential reads.
25890 + if (this_sector == conf->mirrors[new_disk].head_position)
25894 + * If reads have been done only on a single disk
25895 + * for a time, lets give another disk a chance.
25896 + * This is for kicking those idling disks so that
25897 + * they would find work near some hotspot.
25900 + if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
25901 + conf->sect_count = 0;
25905 + new_disk = conf->raid_disks;
25907 + if (new_disk == disk)
25909 + } while ((conf->mirrors[new_disk].write_only) ||
25910 + (!conf->mirrors[new_disk].operational));
25915 + current_distance = abs(this_sector -
25916 + conf->mirrors[disk].head_position);
25918 + /* Find the disk which is closest */
25922 + disk = conf->raid_disks;
25925 + if ((conf->mirrors[disk].write_only) ||
25926 + (!conf->mirrors[disk].operational))
25929 + new_distance = abs(this_sector -
25930 + conf->mirrors[disk].head_position);
25932 + if (new_distance < current_distance) {
25933 + conf->sect_count = 0;
25934 + current_distance = new_distance;
25937 + } while (disk != conf->last_used);
25940 + conf->mirrors[new_disk].head_position = this_sector + sectors;
25942 + conf->last_used = new_disk;
25943 + conf->sect_count += sectors;
25948 +static void raid1_read(struct evms_logical_node *md_node, struct buffer_head *bh)
25950 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25951 + raid1_conf_t *conf = mddev_to_conf(mddev);
25952 + struct mirror_info *mirror;
25953 + struct buffer_head *bh_req;
25954 + struct raid1_bh * r1_bh;
25956 + if (evms_md_check_boundary(md_node, bh)) return;
25958 + if (!buffer_locked(bh))
25961 + r1_bh = raid1_alloc_r1bh (conf);
25963 + spin_lock_irq(&conf->segment_lock);
25964 + wait_event_lock_irq(conf->wait_done,
25965 + bh->b_rsector < conf->start_active ||
25966 + bh->b_rsector >= conf->start_future,
25967 + conf->segment_lock);
25968 + if (bh->b_rsector < conf->start_active)
25969 + conf->cnt_done++;
25971 + conf->cnt_future++;
25973 + set_bit(R1BH_SyncPhase, &r1_bh->state);
25975 + spin_unlock_irq(&conf->segment_lock);
25977 + r1_bh->mddev = mddev;
25978 + r1_bh->cmd = READ;
25979 + r1_bh->master_bh = bh;
25981 + mirror = conf->mirrors + raid1_read_balance(conf, bh);
25983 + bh_req = &r1_bh->bh_req;
25984 + memcpy(bh_req, bh, sizeof(*bh));
25985 + bh_req->b_blocknr = bh->b_rsector;
25986 + bh_req->b_dev = mirror->dev;
25987 + bh_req->b_end_io = raid1_end_read_request;
25988 + bh_req->b_private = r1_bh;
25989 + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL);
25990 + R_IO(mirror->node, bh_req);
25993 +static void raid1_write(
25994 + struct evms_logical_node *md_node,
25995 + struct buffer_head *bh)
25997 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
25998 + raid1_conf_t *conf = mddev_to_conf(mddev);
25999 + struct raid1_bh * r1_bh;
26000 + struct buffer_head *bhl;
26001 + struct buffer_head *mbh;
26004 + if (evms_md_check_boundary(md_node, bh)) return;
26006 + if (!buffer_locked(bh))
26009 + r1_bh = raid1_alloc_r1bh (conf);
26011 + spin_lock_irq(&conf->segment_lock);
26012 + wait_event_lock_irq(conf->wait_done,
26013 + bh->b_rsector < conf->start_active ||
26014 + bh->b_rsector >= conf->start_future,
26015 + conf->segment_lock);
26016 + if (bh->b_rsector < conf->start_active)
26017 + conf->cnt_done++;
26019 + conf->cnt_future++;
26021 + set_bit(R1BH_SyncPhase, &r1_bh->state);
26023 + spin_unlock_irq(&conf->segment_lock);
26026 + * i think the read and write branch should be separated completely,
26027 + * since we want to do read balancing on the read side for example.
26028 + * Alternative implementations? :) --mingo
26031 + r1_bh->mddev = mddev;
26032 + r1_bh->cmd = WRITE;
26033 + r1_bh->master_bh = bh;
26035 + bhl = raid1_alloc_bh(conf, conf->raid_disks);
26037 + for (i=0, sum_bhs=0;
26038 + (sum_bhs < conf->raid_disks) && (i < MD_SB_DISKS);
26040 + if (!conf->mirrors[i].operational)
26044 + * We should use a private pool (size depending on NR_REQUEST),
26045 + * to avoid writes filling up the memory with bhs
26047 + * Such pools are much faster than kmalloc anyways (so we waste
26048 + * almost nothing by not using the master bh when writing and
26049 + * win alot of cleanness) but for now we are cool enough. --mingo
26051 + * It's safe to sleep here, buffer heads cannot be used in a shared
26052 + * manner in the write branch. Look how we lock the buffer at the
26053 + * beginning of this function to grok the difference ;)
26056 + if (mbh == NULL) {
26060 + bhl = mbh->b_next;
26061 + mbh->b_next = NULL;
26062 + mbh->b_this_page = (struct buffer_head *)1;
26065 + * prepare mirrored mbh (fields ordered for max mem throughput):
26067 + mbh->b_blocknr = bh->b_rsector;
26068 + mbh->b_rdev = bh->b_rdev;
26069 + mbh->b_dev = conf->mirrors[i].dev;
26070 + mbh->b_rsector = bh->b_rsector;
26071 + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
26072 + (1<<BH_Mapped) | (1<<BH_Lock);
26074 + atomic_set(&mbh->b_count, 1);
26075 + mbh->b_size = bh->b_size;
26076 + mbh->b_page = bh->b_page;
26077 + mbh->b_data = bh->b_data;
26078 + mbh->b_list = BUF_LOCKED;
26079 + mbh->b_end_io = raid1_end_write_request;
26080 + mbh->b_private = conf->mirrors[i].node;
26082 + mbh->b_next = r1_bh->mirror_bh_list;
26083 + r1_bh->mirror_bh_list = mbh;
26087 + if (bhl) raid1_free_bh(conf,bhl);
26089 + /* Gag - all mirrors non-operational.. */
26090 + raid1_end_bh_io(r1_bh, 0);
26093 + atomic_set(&r1_bh->remaining, sum_bhs);
26096 + * We have to be a bit careful about the semaphore above, thats
26097 + * why we start the requests separately. Since kmalloc() could
26098 + * fail, sleep and make_request() can sleep too, this is the
26099 + * safer solution. Imagine, end_request decreasing the semaphore
26100 + * before we could have set it up ... We could play tricks with
26101 + * the semaphore (presetting it and correcting at the end if
26102 + * sum_bhs is not 'n' but we have to do end_request by hand if
26103 + * all requests finish until we had a chance to set up the
26104 + * semaphore correctly ... lots of races).
26106 + bhl = r1_bh->mirror_bh_list;
26108 + struct evms_logical_node *node;
26111 + bhl = mbh->b_next;
26112 + node = (struct evms_logical_node *)mbh->b_private;
26113 + mbh->b_private = r1_bh;
26115 + evms_cs_volume_request_in_progress(mbh->b_rdev, 1, NULL);
26121 +static int raid1_status (char *page, mddev_t *mddev)
26123 + raid1_conf_t *conf = mddev_to_conf(mddev);
26126 + sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks,
26127 + conf->working_disks);
26128 + for (i = 0; i < conf->raid_disks; i++)
26129 + sz += sprintf (page+sz, "%s",
26130 + conf->mirrors[i].operational ? "U" : "_");
26131 + sz += sprintf (page+sz, "]");
26135 +#define LAST_DISK KERN_ALERT \
26136 +"EVMS raid1: only one disk left and IO error.\n"
26138 +#define NO_SPARE_DISK KERN_ALERT \
26139 +"EVMS raid1: no spare disk left, degrading mirror level by one.\n"
26141 +#define DISK_FAILED KERN_ALERT \
26142 +"EVMS raid1: Disk failure on %s, disabling device. \n" \
26143 +" Operation continuing on %d devices\n"
26145 +#define START_SYNCING KERN_ALERT \
26146 +"EVMS raid1: start syncing spare disk.\n"
26148 +#define ALREADY_SYNCING KERN_INFO \
26149 +"EVMS raid1: syncing already in progress.\n"
26151 +static void mark_disk_bad (mddev_t *mddev, int failed)
26153 + raid1_conf_t *conf = mddev_to_conf(mddev);
26154 + struct mirror_info *mirror = conf->mirrors+failed;
26155 + mdp_super_t *sb = mddev->sb;
26157 + mirror->operational = 0;
26158 + mark_disk_faulty(sb->disks+mirror->number);
26159 + mark_disk_nonsync(sb->disks+mirror->number);
26160 + mark_disk_inactive(sb->disks+mirror->number);
26161 + if (!mirror->write_only)
26162 + sb->active_disks--;
26163 + sb->working_disks--;
26164 + sb->failed_disks++;
26165 + mddev->sb_dirty = 1;
26166 + evms_cs_wakeup_thread(conf->thread);
26167 + if (!mirror->write_only)
26168 + conf->working_disks--;
26169 + LOG_SERIOUS(DISK_FAILED, evms_md_partition_name(mirror->node),conf->working_disks);
26172 +static int raid1_error (
26174 + struct evms_logical_node *node)
26176 + raid1_conf_t *conf = mddev_to_conf(mddev);
26177 + struct mirror_info * mirrors = conf->mirrors;
26178 + int disks = MD_SB_DISKS;
26181 + /* Find the drive.
26182 + * If it is not operational, then we have already marked it as dead
26183 + * else if it is the last working disks, ignore the error, let the
26184 + * next level up know.
26185 + * else mark the drive as failed
26188 + for (i = 0; i < disks; i++)
26189 + if (mirrors[i].node==node && mirrors[i].operational)
26194 + if (i < conf->raid_disks && conf->working_disks == 1) {
26195 + /* Don't fail the drive, act as though we were just a
26196 + * normal single drive
26201 + mark_disk_bad(mddev, i);
26206 +#undef NO_SPARE_DISK
26207 +#undef DISK_FAILED
26208 +#undef START_SYNCING
26211 +static void print_raid1_conf (raid1_conf_t *conf)
26214 + struct mirror_info *tmp;
26216 + LOG_DEFAULT("RAID1 conf printout:\n");
26218 + LOG_DEFAULT("(conf==NULL)\n");
26221 + LOG_DEFAULT(" --- wd:%d rd:%d nd:%d\n",
26222 + conf->working_disks,conf->raid_disks, conf->nr_disks);
26224 + for (i = 0; i < conf->nr_disks; i++) {
26225 + tmp = conf->mirrors + i;
26226 + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
26227 + i, tmp->spare,tmp->operational,
26228 + tmp->number,tmp->raid_disk,tmp->used_slot,
26229 + evms_md_partition_name(tmp->node));
26233 +static void close_sync(raid1_conf_t *conf)
26235 + mddev_t *mddev = conf->mddev;
26236 + /* If reconstruction was interrupted, we need to close the "active" and "pending"
26238 + * we know that there are no active rebuild requests, os cnt_active == cnt_ready ==0
26240 + /* this is really needed when recovery stops too... */
26241 + spin_lock_irq(&conf->segment_lock);
26242 + conf->start_active = conf->start_pending;
26243 + conf->start_ready = conf->start_pending;
26244 + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
26245 + conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
26246 + conf->start_future = mddev->sb->size+1;
26247 + conf->cnt_pending = conf->cnt_future;
26248 + conf->cnt_future = 0;
26249 + conf->phase = conf->phase ^1;
26250 + wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
26251 + conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
26253 + conf->cnt_future = conf->cnt_done;;
26254 + conf->cnt_done = 0;
26255 + spin_unlock_irq(&conf->segment_lock);
26256 + wake_up(&conf->wait_done);
26259 +static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
26262 + int i, failed_disk=-1, spare_disk=-1, removed_disk=-1;
26263 + raid1_conf_t *conf = mddev->private;
26264 + struct mirror_info *tmp, *sdisk, *fdisk, *rdisk;
26265 + mdp_super_t *sb = mddev->sb;
26266 + mdp_disk_t *failed_desc, *spare_desc;
26267 + mdk_rdev_t *spare_rdev, *failed_rdev;
26269 + print_raid1_conf(conf);
26270 + md_spin_lock_irq(&conf->device_lock);
26272 + * find the disk ...
26276 + case DISKOP_SPARE_ACTIVE:
26279 + * Find the failed disk within the RAID1 configuration ...
26280 + * (this can only be in the first conf->working_disks part)
26282 + for (i = 0; i < conf->raid_disks; i++) {
26283 + tmp = conf->mirrors + i;
26284 + if ((!tmp->operational && !tmp->spare) ||
26285 + !tmp->used_slot) {
26291 + * When we activate a spare disk we _must_ have a disk in
26292 + * the lower (active) part of the array to replace.
26294 +/* if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
26299 + */ /* fall through */
26301 + case DISKOP_HOT_SPARE_ACTIVE:
26302 + case DISKOP_SPARE_WRITE:
26303 + case DISKOP_SPARE_INACTIVE:
26306 + * Find the spare disk ... (can only be in the 'high'
26307 + * area of the array)
26308 + ##### Actually it can be sooner now that we have improved MD #####
26309 + This support required for expanding number of active mirrors.
26311 + for (i = 0; i < MD_SB_DISKS; i++) {
26312 + tmp = conf->mirrors + i;
26313 + if (tmp->spare && tmp->number == (*d)->number) {
26318 + if (spare_disk == -1) {
26325 + case DISKOP_HOT_REMOVE_SPARE:
26327 + for (i = 0; i < MD_SB_DISKS; i++) {
26328 + tmp = conf->mirrors + i;
26329 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
26330 + if (tmp->operational) {
26333 + } else if (!tmp->spare){
26338 + removed_disk = i;
26342 + if (removed_disk == -1) {
26349 + case DISKOP_HOT_REMOVE_DISK:
26350 + if (conf->working_disks <= 1) {
26354 + for (i = 0; i < MD_SB_DISKS; i++) {
26355 + tmp = conf->mirrors + i;
26356 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
26357 + removed_disk = i;
26361 + if (removed_disk == -1) {
26368 + case DISKOP_HOT_ADD_DISK:
26376 + * Switch the spare disk to write-only mode:
26378 + case DISKOP_SPARE_WRITE:
26379 + sdisk = conf->mirrors + spare_disk;
26380 + sdisk->operational = 1;
26381 + sdisk->write_only = 1;
26384 + * Deactivate a spare disk:
26386 + case DISKOP_SPARE_INACTIVE:
26387 + close_sync(conf);
26388 + sdisk = conf->mirrors + spare_disk;
26389 + sdisk->operational = 0;
26390 + sdisk->write_only = 0;
26393 + * Activate (mark read-write) the (now sync) spare disk,
26394 + * which means we switch it's 'raid position' (->raid_disk)
26395 + * with the failed disk. (only the first 'conf->nr_disks'
26396 + * slots are used for 'real' disks and we must preserve this
26399 + case DISKOP_SPARE_ACTIVE:
26400 + close_sync(conf);
26401 + sdisk = conf->mirrors + spare_disk;
26402 + if (failed_disk < 0) {
26403 + // preset failed disk to itself if no failed disk.
26404 + failed_disk = spare_disk;
26405 + // try to find spare earlier in array
26406 + for (i = conf->raid_disks; i < spare_disk; i++) {
26407 + tmp = conf->mirrors + i;
26408 + if ((tmp->spare) || !tmp->used_slot) {
26414 + fdisk = conf->mirrors + failed_disk;
26416 + spare_desc = &sb->disks[sdisk->number];
26417 + failed_desc = &sb->disks[fdisk->number];
26419 + if (spare_desc != *d) {
26425 + if (spare_desc->raid_disk != sdisk->raid_disk) {
26431 + if (sdisk->raid_disk != spare_disk) {
26437 + if (failed_desc->raid_disk != fdisk->raid_disk) {
26443 + if (fdisk->raid_disk != failed_disk) {
26450 + * do the switch finally
26452 + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
26453 + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
26455 + /* There must be a spare_rdev, but there may not be a
26456 + * failed_rdev. That slot might be empty...
26458 + spare_rdev->desc_nr = failed_desc->number;
26460 + failed_rdev->desc_nr = spare_desc->number;
26462 + xchg_values(*spare_desc, *failed_desc);
26463 + xchg_values(*fdisk, *sdisk);
26466 + * (careful, 'failed' and 'spare' are switched from now on)
26468 + * we want to preserve linear numbering and we want to
26469 + * give the proper raid_disk number to the now activated
26470 + * disk. (this means we switch back these values)
26473 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
26474 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
26475 + xchg_values(spare_desc->number, failed_desc->number);
26476 + xchg_values(sdisk->number, fdisk->number);
26478 + *d = failed_desc;
26480 + if (sdisk->dev == MKDEV(0,0))
26481 + sdisk->used_slot = 0;
26483 + * this really activates the spare.
26485 + fdisk->spare = 0;
26486 + fdisk->write_only = 0;
26489 + * if we activate a spare, we definitely replace a
26490 + * non-operational disk slot in the 'low' area of
26491 + * the disk array.
26494 + conf->working_disks++;
26498 + /* Activate a spare disk without a failed disk */
26499 + case DISKOP_HOT_SPARE_ACTIVE:
26500 + sdisk = conf->mirrors + spare_disk;
26501 + sdisk->spare = 0;
26502 + sdisk->write_only = 0;
26503 + conf->working_disks++;
26504 + conf->raid_disks++;
26505 + if (raid1_grow_bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS)
26506 + LOG_WARNING("%s: Cannot grow BH pool\n", __FUNCTION__);
26509 + case DISKOP_HOT_REMOVE_SPARE:
26510 + rdisk = conf->mirrors + removed_disk;
26512 + if (removed_disk < conf->raid_disks) {
26518 + LOG_WARNING("%s: removing spare %s, [md%d] nr_disks=%d\n",
26519 + __FUNCTION__, evms_md_partition_name(rdisk->node),
26520 + conf->mddev->__minor, conf->nr_disks-1);
26522 + rdisk->dev = MKDEV(0,0);
26523 + rdisk->node = NULL;
26524 + rdisk->used_slot = 0;
26525 + conf->nr_disks--;
26528 + case DISKOP_HOT_REMOVE_DISK:
26529 + rdisk = conf->mirrors + removed_disk;
26531 + LOG_WARNING("%s: removing active disk %s, [md%d] nr_disks=%d\n",
26532 + __FUNCTION__, evms_md_partition_name(rdisk->node),
26533 + conf->mddev->__minor, conf->nr_disks-1);
26535 + rdisk->dev = MKDEV(0,0);
26536 + rdisk->node = NULL;
26537 + rdisk->used_slot = 0;
26538 + rdisk->operational = 0;
26539 + conf->working_disks--;
26540 + conf->nr_disks--;
26541 + sb->raid_disks--; //decrement raid disks. md_core now increments
26542 + //when activating new spare, don't assume add spare here
26550 + md_spin_unlock_irq(&conf->device_lock);
26551 + if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
26552 + /* should move to "END_REBUILD" when such exists */
26553 + raid1_shrink_buffers(conf);
26555 + print_raid1_conf(conf);
26560 +#define IO_ERROR KERN_ALERT \
26561 +"EVMS raid1: %s: unrecoverable I/O read error for block %lu\n"
26563 +#define REDIRECT_SECTOR KERN_ERR \
26564 +"EVMS raid1: %s: redirecting sector %lu to another mirror\n"
26567 + * This is a kernel thread which:
26569 + * 1. Retries failed read operations on working mirrors.
26570 + * 2. Updates the raid superblock when problems encounter.
26571 + * 3. Performs writes following reads for array syncronising.
26573 +static void end_sync_write(struct buffer_head *bh, int uptodate);
26574 +static void end_sync_read(struct buffer_head *bh, int uptodate);
26576 +static void raid1d (void *data)
26578 + struct raid1_bh *r1_bh;
26579 + struct buffer_head *bh;
26580 + unsigned long flags;
26582 + mdk_rdev_t *rdev;
26584 + struct evms_logical_node *node;
26585 + raid1_conf_t *conf = (raid1_conf_t *) data;
26588 + mddev = conf->mddev;
26589 + if (mddev->sb_dirty) {
26590 + LOG_DEFAULT("EVMS raid1: dirty sb detected, updating.\n");
26591 + mddev->sb_dirty = 0;
26592 + evms_md_update_sb(mddev);
26594 + md_spin_lock_irqsave(&retry_list_lock, flags);
26595 + r1_bh = evms_raid1_retry_list;
26598 + evms_raid1_retry_list = r1_bh->next_r1;
26599 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
26601 + mddev = r1_bh->mddev;
26602 + bh = &r1_bh->bh_req;
26603 + switch(r1_bh->cmd) {
26605 + /* have to allocate lots of bh structures and
26606 + * schedule writes
26608 + if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
26609 + int i, sum_bhs = 0;
26610 + int disks = MD_SB_DISKS;
26611 + struct buffer_head *bhl, *mbh;
26613 + conf = mddev_to_conf(mddev);
26614 + bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
26615 + for (i = 0; i < disks ; i++) {
26616 + if (!conf->mirrors[i].operational)
26618 + if (i==conf->last_used)
26619 + /* we read from here, no need to write */
26621 + if (i < conf->raid_disks
26622 + && !conf->resync_mirrors
26623 + && !conf->mirrors[i].write_only)
26624 + /* don't need to write this,
26625 + * we are just rebuilding */
26632 + bhl = mbh->b_next;
26633 + mbh->b_this_page = (struct buffer_head *)1;
26637 + * prepare mirrored bh (fields ordered for max mem throughput):
26639 + mbh->b_blocknr = bh->b_blocknr;
26640 + mbh->b_dev = conf->mirrors[i].dev;
26641 + mbh->b_rsector = bh->b_blocknr;
26642 + mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
26643 + (1<<BH_Mapped) | (1<<BH_Lock);
26644 + atomic_set(&mbh->b_count, 1);
26645 + mbh->b_size = bh->b_size;
26646 + mbh->b_page = bh->b_page;
26647 + mbh->b_data = bh->b_data;
26648 + mbh->b_list = BUF_LOCKED;
26649 + mbh->b_end_io = end_sync_write;
26650 + mbh->b_private = conf->mirrors[i].node;
26652 + mbh->b_next = r1_bh->mirror_bh_list;
26653 + r1_bh->mirror_bh_list = mbh;
26657 + atomic_set(&r1_bh->remaining, sum_bhs);
26658 + if (bhl) raid1_free_bh(conf, bhl);
26659 + mbh = r1_bh->mirror_bh_list;
26662 + /* nowhere to write this too... I guess we
26665 + sync_request_done(bh->b_blocknr, conf);
26666 + evms_md_done_sync(mddev,
26667 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT,
26669 + raid1_free_buf(r1_bh);
26673 + node = (struct evms_logical_node *)mbh->b_private;
26674 + mbh->b_private = r1_bh;
26677 + evms_md_sync_acct(mbh->b_dev,
26678 + bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT);
26679 + mbh = mbh->b_next;
26683 + /* There is no point trying a read-for-reconstruct
26684 + * as reconstruct is about to be aborted
26686 + rdev = evms_md_find_rdev(mddev,bh->b_dev);
26688 + LOG_ERROR(IO_ERROR,
26689 + evms_md_partition_name(rdev->node),
26691 + evms_md_done_sync(mddev, bh->b_size>>EVMS_VSECTOR_SIZE_SHIFT, 0);
26699 + evms_raid1_map(mddev, &node, &bh->b_dev);
26700 + if (bh->b_dev == dev) {
26701 + rdev = evms_md_find_rdev(mddev,dev);
26703 + LOG_ERROR(" unrecoverable read error on %s at LBA(%lu)\n",
26704 + evms_md_partition_name(rdev->node),
26705 + r1_bh->master_bh->b_rsector);
26706 + raid1_end_bh_io(r1_bh, 0);
26708 + /* retry I/O on new device */
26709 + bh->b_rdev = r1_bh->master_bh->b_rdev;
26710 + bh->b_rsector = bh->b_blocknr;
26711 + evms_cs_volume_request_in_progress(r1_bh->master_bh->b_rdev, 1, NULL);
26717 + md_spin_unlock_irqrestore(&retry_list_lock, flags);
26720 +#undef REDIRECT_SECTOR
26723 + * Private kernel thread to reconstruct mirrors after an unclean
26726 +static void raid1syncd (void *data)
26728 + raid1_conf_t *conf = data;
26729 + mddev_t *mddev = conf->mddev;
26731 + if (!conf->resync_mirrors)
26733 + if (conf->resync_mirrors == 2)
26735 + down(&mddev->recovery_sem);
26736 + if (!evms_md_do_sync(mddev, NULL)) {
26738 + * Only if everything went Ok.
26740 + conf->resync_mirrors = 0;
26743 + close_sync(conf);
26745 + up(&mddev->recovery_sem);
26746 + raid1_shrink_buffers(conf);
26750 + * perform a "sync" on one "block"
26752 + * We need to make sure that no normal I/O request - particularly write
26753 + * requests - conflict with active sync requests.
26754 + * This is achieved by conceptually dividing the device space into a
26755 + * number of sections:
26756 + * DONE: 0 .. a-1 These blocks are in-sync
26757 + * ACTIVE: a.. b-1 These blocks may have active sync requests, but
26758 + * no normal IO requests
26759 + * READY: b .. c-1 These blocks have no normal IO requests - sync
26760 + * request may be happening
26761 + * PENDING: c .. d-1 These blocks may have IO requests, but no new
26762 + * ones will be added
26763 + * FUTURE: d .. end These blocks are not to be considered yet. IO may
26764 + * be happening, but not sync
26767 + * phase which flips (0 or 1) each time d moves and
26769 + * z = active io requests in FUTURE since d moved - marked with
26771 + * y = active io requests in FUTURE before d moved, or PENDING -
26772 + * marked with previous phase
26773 + * x = active sync requests in READY
26774 + * w = active sync requests in ACTIVE
26775 + * v = active io requests in DONE
26777 + * Normally, a=b=c=d=0 and z= active io requests
26778 + * or a=b=c=d=END and v= active io requests
26779 + * Allowed changes to a,b,c,d:
26780 + * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
26782 + * C: b=c, w+=x, x=0
26784 + * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
26786 + * At start of sync we apply A.
26787 + * When y reaches 0, we apply B then A then being sync requests
26788 + * When sync point reaches c-1, we wait for y==0, and W==0, and
26789 + * then apply apply B then A then D then C.
26790 + * Finally, we apply E
26792 + * The sync request simply issues a "read" against a working drive
26793 + * This is marked so that on completion the raid1d thread is woken to
26794 + * issue suitable write requests
26797 +static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
26799 + raid1_conf_t *conf = mddev_to_conf(mddev);
26800 + struct mirror_info *mirror;
26801 + struct raid1_bh *r1_bh;
26802 + struct buffer_head *bh;
26807 + spin_lock_irq(&conf->segment_lock);
26808 + if (!sector_nr) {
26809 + /* initialize ...*/
26811 + conf->start_active = 0;
26812 + conf->start_ready = 0;
26813 + conf->start_pending = 0;
26814 + conf->start_future = 0;
26816 + /* we want enough buffers to hold twice the window of 128*/
26817 + buffs = 128 *2 / (PAGE_SIZE>>9);
26818 + buffs = raid1_grow_buffers(conf, buffs);
26822 + conf->window = buffs*(PAGE_SIZE>>9)/2;
26823 + conf->cnt_future += conf->cnt_done+conf->cnt_pending;
26824 + conf->cnt_done = conf->cnt_pending = 0;
26825 + if (conf->cnt_ready || conf->cnt_active)
26828 + while (sector_nr >= conf->start_pending) {
26829 + PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
26830 + sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
26831 + conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
26832 + wait_event_lock_irq(conf->wait_done,
26833 + !conf->cnt_active,
26834 + conf->segment_lock);
26835 + wait_event_lock_irq(conf->wait_ready,
26836 + !conf->cnt_pending,
26837 + conf->segment_lock);
26838 + conf->start_active = conf->start_ready;
26839 + conf->start_ready = conf->start_pending;
26840 + conf->start_pending = conf->start_future;
26841 + conf->start_future = conf->start_future+conf->window;
26842 + // Note: falling off the end is not a problem
26843 + conf->phase = conf->phase ^1;
26844 + conf->cnt_active = conf->cnt_ready;
26845 + conf->cnt_ready = 0;
26846 + conf->cnt_pending = conf->cnt_future;
26847 + conf->cnt_future = 0;
26848 + wake_up(&conf->wait_done);
26850 + conf->cnt_ready++;
26851 + spin_unlock_irq(&conf->segment_lock);
26854 + /* If reconstructing, and >1 working disc,
26855 + * could dedicate one to rebuild and others to
26856 + * service read requests ..
26858 + disk = conf->last_used;
26859 + /* make sure disk is operational */
26860 + while (!conf->mirrors[disk].operational) {
26861 + if (disk <= 0) disk = conf->raid_disks;
26863 + if (disk == conf->last_used)
26866 + conf->last_used = disk;
26868 + mirror = conf->mirrors+conf->last_used;
26870 + r1_bh = raid1_alloc_buf (conf);
26871 + r1_bh->mddev = mddev;
26872 + r1_bh->cmd = SPECIAL;
26873 + bh = &r1_bh->bh_req;
26875 + block_nr = sector_nr;
26877 + while (!(block_nr & 1) && bsize < PAGE_SIZE
26878 + && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
26882 + bh->b_size = bsize;
26883 + bh->b_list = BUF_LOCKED;
26884 + bh->b_dev = mirror->dev;
26885 + bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
26890 + if (bh->b_data != page_address(bh->b_page))
26892 + bh->b_end_io = end_sync_read;
26893 + bh->b_private = r1_bh;
26894 + bh->b_blocknr = sector_nr;
26895 + bh->b_rsector = sector_nr;
26896 + init_waitqueue_head(&bh->b_wait);
26898 + R_IO(mirror->node, bh);
26899 + evms_md_sync_acct(bh->b_dev, bsize/512);
26901 + return (bsize >> 9);
26904 + raid1_shrink_buffers(conf);
26905 + spin_unlock_irq(&conf->segment_lock);
26909 +static void end_sync_read(struct buffer_head *bh, int uptodate)
26911 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
26913 + /* we have read a block, now it needs to be re-written,
26914 + * or re-read if the read failed.
26915 + * We don't do much here, just schedule handling by raid1d
26918 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
26920 + set_bit(R1BH_Uptodate, &r1_bh->state);
26921 + raid1_reschedule_retry(r1_bh);
26924 +static void end_sync_write(struct buffer_head *bh, int uptodate)
26926 + struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
26929 + evms_md_error_dev(r1_bh->mddev, bh->b_dev);
26930 + if (atomic_dec_and_test(&r1_bh->remaining)) {
26931 + mddev_t *mddev = r1_bh->mddev;
26932 + unsigned long sect = bh->b_blocknr;
26933 + int size = bh->b_size;
26935 + raid1_free_buf(r1_bh);
26936 + sync_request_done(sect, mddev_to_conf(mddev));
26937 + evms_md_done_sync(mddev, size>>EVMS_VSECTOR_SIZE_SHIFT, uptodate);
26941 +#define INVALID_LEVEL KERN_WARNING \
26942 +"EVMS raid1: md%d: raid level not set to mirroring (%d)\n"
26944 +#define NO_SB KERN_ERR \
26945 +"EVMS raid1: disabled mirror %s (couldn't access raid superblock)\n"
26947 +#define ERRORS KERN_ERR \
26948 +"EVMS raid1: disabled mirror %s (errors detected)\n"
26950 +#define NOT_IN_SYNC KERN_ERR \
26951 +"EVMS raid1: disabled mirror %s (not in sync)\n"
26953 +#define INCONSISTENT KERN_ERR \
26954 +"EVMS raid1: disabled mirror %s (inconsistent descriptor)\n"
26956 +#define ALREADY_RUNNING KERN_ERR \
26957 +"EVMS raid1: disabled mirror %s (mirror %d already operational)\n"
26959 +#define OPERATIONAL KERN_INFO \
26960 +"EVMS raid1: device %s operational as mirror %d\n"
26962 +#define MEM_ERROR KERN_ERR \
26963 +"EVMS raid1: couldn't allocate memory for md%d\n"
26965 +#define SPARE KERN_INFO \
26966 +"EVMS raid1: spare disk %s\n"
26968 +#define NONE_OPERATIONAL KERN_ERR \
26969 +"EVMS raid1: no operational mirrors for md%d\n"
26971 +#define ARRAY_IS_ACTIVE KERN_INFO \
26972 +"EVMS raid1: raid set md%d active with %d out of %d mirrors\n"
26974 +#define THREAD_ERROR KERN_ERR \
26975 +"EVMS raid1: couldn't allocate thread for md%d\n"
26977 +#define START_RESYNC KERN_WARNING \
26978 +"EVMS raid1: raid set md%d not clean; reconstructing mirrors\n"
26980 +static int raid1_run (mddev_t *mddev)
26982 + raid1_conf_t *conf;
26983 + int i, j, disk_idx;
26984 + struct mirror_info *disk;
26985 + mdp_super_t *sb = mddev->sb;
26986 + mdp_disk_t *descriptor;
26987 + mdk_rdev_t *rdev;
26988 + struct md_list_head *tmp;
26989 + int start_recovery = 0;
26991 + MOD_INC_USE_COUNT;
26993 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
26994 + if (sb->level != 1) {
26995 + LOG_ERROR(INVALID_LEVEL, mdidx(mddev), sb->level);
26999 + * copy the already verified devices into our private RAID1
27000 + * bookkeeping area. [whatever we allocate in raid1_run(),
27001 + * should be freed in raid1_stop()]
27004 + conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
27005 + mddev->private = conf;
27007 + LOG_ERROR(MEM_ERROR, mdidx(mddev));
27010 + memset(conf, 0, sizeof(*conf));
27012 + ITERATE_RDEV(mddev,rdev,tmp) {
27013 + if (rdev->faulty) {
27014 + LOG_ERROR(ERRORS, evms_md_partition_name(rdev->node));
27021 + if (rdev->desc_nr == -1) {
27025 + descriptor = &sb->disks[rdev->desc_nr];
27026 + disk_idx = descriptor->raid_disk;
27027 + disk = conf->mirrors + disk_idx;
27029 + if (disk_faulty(descriptor)) {
27030 + disk->number = descriptor->number;
27031 + disk->raid_disk = disk_idx;
27032 + disk->node = rdev->node;
27033 + disk->dev = rdev->dev;
27034 + disk->sect_limit = MAX_WORK_PER_DISK;
27035 + disk->operational = 0;
27036 + disk->write_only = 0;
27038 + disk->used_slot = 1;
27039 + disk->head_position = 0;
27042 + if (disk_active(descriptor)) {
27043 + if (!disk_sync(descriptor)) {
27044 + LOG_ERROR(NOT_IN_SYNC, evms_md_partition_name(rdev->node));
27047 + if ((descriptor->number > MD_SB_DISKS) ||
27048 + (disk_idx > sb->raid_disks)) {
27050 + LOG_ERROR(INCONSISTENT,evms_md_partition_name(rdev->node));
27053 + if (disk->operational) {
27054 + LOG_ERROR(ALREADY_RUNNING, evms_md_partition_name(rdev->node), disk_idx);
27057 + LOG_DEFAULT(OPERATIONAL, evms_md_partition_name(rdev->node), disk_idx);
27058 + disk->number = descriptor->number;
27059 + disk->raid_disk = disk_idx;
27060 + disk->node = rdev->node;
27061 + disk->dev = rdev->dev;
27062 + disk->sect_limit = MAX_WORK_PER_DISK;
27063 + disk->operational = 1;
27064 + disk->write_only = 0;
27066 + disk->used_slot = 1;
27067 + disk->head_position = 0;
27068 + conf->working_disks++;
27071 + * Must be a spare disk ..
27073 + LOG_DEFAULT(SPARE, evms_md_partition_name(rdev->node));
27074 + disk->number = descriptor->number;
27075 + disk->raid_disk = disk_idx;
27076 + disk->node = rdev->node;
27077 + disk->dev = rdev->dev;
27078 + disk->sect_limit = MAX_WORK_PER_DISK;
27079 + disk->operational = 0;
27080 + disk->write_only = 0;
27082 + disk->used_slot = 1;
27083 + disk->head_position = 0;
27086 + conf->raid_disks = sb->raid_disks;
27087 + conf->nr_disks = sb->nr_disks;
27088 + conf->mddev = mddev;
27089 + conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
27091 + conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
27092 + init_waitqueue_head(&conf->wait_buffer);
27093 + init_waitqueue_head(&conf->wait_done);
27094 + init_waitqueue_head(&conf->wait_ready);
27096 + if (!conf->working_disks) {
27097 + LOG_ERROR(NONE_OPERATIONAL, mdidx(mddev));
27098 + goto out_free_conf;
27102 + /* pre-allocate some buffer_head structures.
27103 + * As a minimum, 1 r1bh and raid_disks buffer_heads
27104 + * would probably get us by in tight memory situations,
27105 + * but a few more is probably a good idea.
27106 + * For now, try NR_RESERVED_BUFS r1bh and
27107 + * NR_RESERVED_BUFS*raid_disks bufferheads
27108 + * This will allow at least NR_RESERVED_BUFS concurrent
27109 + * reads or writes even if kmalloc starts failing
27111 + if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
27112 + raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
27113 + < NR_RESERVED_BUFS*conf->raid_disks) {
27114 + LOG_ERROR(MEM_ERROR, mdidx(mddev));
27115 + goto out_free_conf;
27118 + for (i = 0; i < MD_SB_DISKS; i++) {
27120 + descriptor = sb->disks+i;
27121 + disk_idx = descriptor->raid_disk;
27122 + disk = conf->mirrors + disk_idx;
27124 + if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
27125 + !disk->used_slot) {
27127 + disk->number = descriptor->number;
27128 + disk->raid_disk = disk_idx;
27129 + disk->dev = MKDEV(0,0);
27131 + disk->operational = 0;
27132 + disk->write_only = 0;
27134 + disk->used_slot = 1;
27135 + disk->head_position = 0;
27140 + * find the first working one and use it as a starting point
27141 + * to read balancing.
27143 + for (j = 0; !conf->mirrors[j].operational && j < MD_SB_DISKS; j++)
27145 + conf->last_used = j;
27148 + if (conf->working_disks != sb->raid_disks) {
27149 + LOG_SERIOUS(" md%d, not all disks are operational -- trying to recover array\n",
27151 + start_recovery = 1;
27155 + const char * name = "evms_raid1d";
27157 + conf->thread = evms_cs_register_thread(raid1d, conf, name);
27158 + if (!conf->thread) {
27159 + LOG_ERROR(THREAD_ERROR, mdidx(mddev));
27160 + goto out_free_conf;
27164 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN)) &&
27165 + (conf->working_disks > 1)) {
27166 + const char * name = "evms_raid1syncd";
27168 + conf->resync_thread = evms_cs_register_thread(raid1syncd, conf,name);
27169 + if (!conf->resync_thread) {
27170 + LOG_ERROR(THREAD_ERROR, mdidx(mddev));
27171 + goto out_free_conf;
27174 + LOG_WARNING(START_RESYNC, mdidx(mddev));
27175 + conf->resync_mirrors = 1;
27176 + evms_cs_wakeup_thread(conf->resync_thread);
27180 + * Regenerate the "device is in sync with the raid set" bit for
27183 + for (i = 0; i < MD_SB_DISKS; i++) {
27184 + mark_disk_nonsync(sb->disks+i);
27185 + for (j = 0; j < sb->raid_disks; j++) {
27186 + if (!conf->mirrors[j].operational)
27188 + if (sb->disks[i].number == conf->mirrors[j].number)
27189 + mark_disk_sync(sb->disks+i);
27192 + sb->active_disks = conf->working_disks;
27194 + if (start_recovery)
27195 + evms_md_recover_arrays();
27198 + LOG_DEFAULT(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
27200 + * Ok, everything is just fine now
27205 + raid1_shrink_r1bh(conf);
27206 + raid1_shrink_bh(conf);
27207 + raid1_shrink_buffers(conf);
27209 + mddev->private = NULL;
27211 + MOD_DEC_USE_COUNT;
27215 +#undef INVALID_LEVEL
27218 +#undef NOT_IN_SYNC
27219 +#undef INCONSISTENT
27220 +#undef ALREADY_RUNNING
27221 +#undef OPERATIONAL
27223 +#undef NONE_OPERATIONAL
27224 +#undef ARRAY_IS_ACTIVE
+/* raid1_stop_resync: request that an in-progress mirror resync stop.
+ * When a resync thread exists and a resync is active (resync_mirrors set),
+ * resync_mirrors is moved to state 2 ("interrupted, restart next time")
+ * and the resync thread is interrupted.
+ * NOTE(review): the return statement and closing braces of this function
+ * are elided from this patch hunk; return semantics not confirmable here. */
27226 +static int raid1_stop_resync (mddev_t *mddev)
27228 + raid1_conf_t *conf = mddev_to_conf(mddev);
27230 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27231 + if (conf->resync_thread) {
27232 + if (conf->resync_mirrors) {
+ /* 2 == resync was interrupted; raid1_restart_resync picks this up */
27233 + conf->resync_mirrors = 2;
27234 + evms_cs_interrupt_thread(conf->resync_thread);
27235 + LOG_WARNING(" mirror resync was not fully finished, restarting next time.\n");
+/* raid1_restart_resync: resume a previously interrupted mirror resync.
+ * Acts only when resync_mirrors is set: the flag is reset to 1 (active)
+ * and the resync thread is woken.
+ * NOTE(review): the body of the !resync_thread branch and the return
+ * statements are elided from this patch hunk. */
27243 +static int raid1_restart_resync (mddev_t *mddev)
27245 + raid1_conf_t *conf = mddev_to_conf(mddev);
27247 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27248 + if (conf->resync_mirrors) {
27249 + if (!conf->resync_thread) {
27253 + conf->resync_mirrors = 1;
27254 + evms_cs_wakeup_thread(conf->resync_thread);
+/* raid1_stop: tear down the raid1 personality instance for this mddev.
+ * Unregisters the worker thread and (if present) the resync thread,
+ * releases the pre-allocated r1bh / buffer_head / buffer pools, detaches
+ * the private configuration and drops the module reference.
+ * (Freeing of conf itself and the return are on elided lines.) */
27260 +static int raid1_stop (mddev_t *mddev)
27262 + raid1_conf_t *conf = mddev_to_conf(mddev);
27264 + LOG_EXTRA("%s ENTRY\n", __FUNCTION__);
27265 + evms_cs_unregister_thread(conf->thread);
27266 + if (conf->resync_thread)
27267 + evms_cs_unregister_thread(conf->resync_thread);
27268 + raid1_shrink_r1bh(conf);
27269 + raid1_shrink_bh(conf);
27270 + raid1_shrink_buffers(conf);
27272 + mddev->private = NULL;
27273 + MOD_DEC_USE_COUNT;
+/* raid1_evms_ioctl: EVMS ioctl entry point for the raid1 personality.
+ * For EVMS_GET_BMAP the request is forwarded to the underlying node of
+ * the first operational mirror via the IOCTL() helper.
+ * NOTE(review): the first parameter (presumably the mddev, used by
+ * mddev_to_conf below), the switch statement header, default case and
+ * return are all on lines elided from this hunk. */
27277 +static int raid1_evms_ioctl (
27279 + struct inode * inode,
27280 + struct file * file,
27281 + unsigned int cmd,
27282 + unsigned long arg)
27285 + struct evms_logical_node *node = NULL;
27286 + raid1_conf_t *conf = mddev_to_conf(mddev);
27289 + case EVMS_GET_BMAP:
+ /* pick the first operational mirror to answer the bmap query */
27291 + for (i = 0; i < MD_SB_DISKS; i++) {
27292 + if (conf->mirrors[i].operational) {
27293 + node = conf->mirrors[i].node;
27299 + rc = IOCTL(node, inode, file, cmd, arg);
+/* Personality operations vector registered with the EVMS MD core:
+ * maps the generic md entry points onto the raid1 implementations in
+ * this file.  (Closing brace of the initializer is elided here.) */
27312 +static mdk_personality_t raid1_personality = {
27313 + .name = "evms_raid1",
27314 + .read = raid1_read,
27315 + .write = raid1_write,
27316 + .run = raid1_run,
27317 + .stop = raid1_stop,
27318 + .status = raid1_status,
27319 + .error_handler = raid1_error,
27320 + .diskop = raid1_diskop,
27321 + .stop_resync = raid1_stop_resync,
27322 + .restart_resync = raid1_restart_resync,
27323 + .sync_request = raid1_sync_request,
27324 + .evms_ioctl = raid1_evms_ioctl
+/* Module init: register the raid1 personality with the EVMS MD core. */
27327 +static int md__init raid1_init (void)
27329 + return evms_register_md_personality (RAID1, &raid1_personality);
+/* Module exit: unregister the raid1 personality. */
27332 +static void raid1_exit (void)
27334 + evms_unregister_md_personality (RAID1);
+/* Module entry/exit hooks.  MODULE_LICENSE is guarded so the file also
+ * builds on kernels that predate the macro (#endif elided from hunk). */
27337 +module_init(raid1_init);
27338 +module_exit(raid1_exit);
27339 +#ifdef MODULE_LICENSE
27340 +MODULE_LICENSE("GPL");
27342 diff -Naur linux-2002-09-30/drivers/evms/md_raid5.c evms-2002-09-30/drivers/evms/md_raid5.c
27343 --- linux-2002-09-30/drivers/evms/md_raid5.c Wed Dec 31 18:00:00 1969
27344 +++ evms-2002-09-30/drivers/evms/md_raid5.c Thu Sep 26 14:40:58 2002
27347 + * md_raid5.c : Multiple Devices driver for Linux
27348 + * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
27349 + * Copyright (C) 1999, 2000 Ingo Molnar
27351 + * RAID-5 management functions.
27353 + * 'md_raid5.c' is an EVMS version of linux/drivers/md/raid5.c modified
27354 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
27356 + * This program is free software; you can redistribute it and/or modify
27357 + * it under the terms of the GNU General Public License as published by
27358 + * the Free Software Foundation; either version 2, or (at your option)
27359 + * any later version.
27361 + * You should have received a copy of the GNU General Public License
27362 + * (for example /usr/src/linux/COPYING); if not, write to the Free
27363 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
27366 +#include <linux/config.h>
27367 +#include <linux/module.h>
27368 +#include <linux/locks.h>
27369 +#include <linux/slab.h>
27370 +#include <linux/evms/evms_raid5.h>
27371 +#include <asm/bitops.h>
27372 +#include <asm/atomic.h>
27374 +#define LOG_PREFIX "md raid5: "
27376 +static mdk_personality_t raid5_personality;
+/* Stripe-cache sizing and hash-table parameters.
+ * NR_STRIPES: number of pre-allocated stripe_heads in the cache.
+ * stripe_hash() indexes the hash table by stripe number, i.e. the
+ * sector divided by the stripe buffer size in 512-byte sectors.
+ * CHECK_DEVLOCK (SMP + paranoia builds) asserts device_lock is held. */
27382 +#define NR_STRIPES 256
27383 +#define IO_THRESHOLD 1
27384 +#define HASH_PAGES 1
27385 +#define HASH_PAGES_ORDER 0
27386 +#define NR_HASH (HASH_PAGES * PAGE_SIZE / sizeof(struct stripe_head *))
27387 +#define HASH_MASK (NR_HASH - 1)
27388 +#define stripe_hash(conf, sect) ((conf)->stripe_hashtbl[((sect) / ((conf)->buffer_size >> 9)) & HASH_MASK])
27391 + * The following can be used to debug the driver
27393 +#define RAID5_DEBUG 0
27394 +#define RAID5_PARANOIA 1
27395 +#if RAID5_PARANOIA && CONFIG_SMP
27396 +#define CHECK_DEVLOCK() if (!spin_is_locked(&conf->device_lock)) BUG()
27398 +#define CHECK_DEVLOCK()
27401 +static void print_raid5_conf(raid5_conf_t * conf);
+/* __release_stripe: drop one reference on a stripe; caller must hold
+ * conf->device_lock.  On the final put the stripe is routed by its state
+ * bits: STRIPE_HANDLE stripes go to the delayed or handle list (waking
+ * the raid5 daemon); otherwise the stripe becomes inactive, the preread
+ * accounting is unwound, and wait_for_stripe sleepers are woken when the
+ * pool is sufficiently refilled.  NOTE(review): the BUG()/else lines of
+ * the original are elided from this hunk. */
27403 +static inline void
27404 +__release_stripe(raid5_conf_t * conf, struct stripe_head *sh)
27406 + if (atomic_dec_and_test(&sh->count)) {
27407 + if (!list_empty(&sh->lru))
27409 + if (atomic_read(&conf->active_stripes) == 0)
27411 + if (test_bit(STRIPE_HANDLE, &sh->state)) {
27412 + if (test_bit(STRIPE_DELAYED, &sh->state))
27413 + list_add_tail(&sh->lru, &conf->delayed_list);
27415 + list_add_tail(&sh->lru, &conf->handle_list);
27416 + evms_cs_wakeup_thread(conf->thread);
27418 + if (test_and_clear_bit
27419 + (STRIPE_PREREAD_ACTIVE, &sh->state)) {
27420 + atomic_dec(&conf->preread_active_stripes);
27421 + if (atomic_read(&conf->preread_active_stripes) <
27423 + evms_cs_wakeup_thread(conf->thread);
27425 + list_add_tail(&sh->lru, &conf->inactive_list);
27426 + atomic_dec(&conf->active_stripes);
27427 + if (!conf->inactive_blocked ||
27428 + atomic_read(&conf->active_stripes) <
+ /* 3/4 threshold matches the wait in get_active_stripe() */
27429 + (NR_STRIPES * 3 / 4))
27430 + wake_up(&conf->wait_for_stripe);
+/* release_stripe: locked wrapper around __release_stripe(); takes
+ * conf->device_lock with interrupts disabled for the duration.
+ * (Return type line and closing brace elided from this hunk.) */
27435 +release_stripe(struct stripe_head *sh)
27437 + raid5_conf_t *conf = sh->raid_conf;
27438 + unsigned long flags;
27440 + spin_lock_irqsave(&conf->device_lock, flags);
27441 + __release_stripe(conf, sh);
27442 + spin_unlock_irqrestore(&conf->device_lock, flags);
+/* remove_hash: unlink a stripe from its hash chain if it is hashed.
+ * Classic pprev-style doubly linked hash list: hash_pprev == NULL marks
+ * an unhashed stripe.  (Signature/brace lines elided from this hunk.) */
27446 +remove_hash(struct stripe_head *sh)
27449 + if (sh->hash_pprev) {
27450 + if (sh->hash_next)
27451 + sh->hash_next->hash_pprev = sh->hash_pprev;
27452 + *sh->hash_pprev = sh->hash_next;
+ /* mark as unhashed */
27453 + sh->hash_pprev = NULL;
+/* insert_hash: push a stripe at the head of the hash chain selected by
+ * stripe_hash(conf, sh->sector). */
27457 +static __inline__ void
27458 +insert_hash(raid5_conf_t * conf, struct stripe_head *sh)
27460 + struct stripe_head **shp = &stripe_hash(conf, sh->sector);
27463 + if ((sh->hash_next = *shp) != NULL)
27464 + (*shp)->hash_pprev = &sh->hash_next;
27466 + sh->hash_pprev = shp;
27469 +/* find an idle stripe, make sure it is unhashed, and return it. */
+/* Pops the head of conf->inactive_list and bumps active_stripes.
+ * NOTE(review): callers visible in this file take conf->device_lock
+ * around this call; the unhash step and NULL return path are on elided
+ * lines of this hunk. */
27470 +static struct stripe_head *
27471 +get_free_stripe(raid5_conf_t * conf)
27473 + struct stripe_head *sh = NULL;
27474 + struct list_head *first;
27477 + if (list_empty(&conf->inactive_list))
27479 + first = conf->inactive_list.next;
27480 + sh = list_entry(first, struct stripe_head, lru);
27481 + list_del_init(first);
27483 + atomic_inc(&conf->active_stripes);
+/* shrink_buffers: release the first `num` cached buffer_heads of a
+ * stripe along with their data pages.  (The kfree of the buffer_head
+ * itself and the loop's closing brace are on elided lines.) */
27489 +shrink_buffers(struct stripe_head *sh, int num)
27491 + struct buffer_head *bh;
27494 + for (i = 0; i < num; i++) {
27495 + bh = sh->bh_cache[i];
27498 + sh->bh_cache[i] = NULL;
27499 + free_page((unsigned long) bh->b_data);
+/* grow_buffers: allocate `num` buffer_heads, each with one data page,
+ * into the stripe's bh_cache.  `priority` is the allocation flags
+ * (GFP mask) passed through to kmalloc/alloc_page.
+ * NOTE(review): the allocation-failure paths and the return statements
+ * are on lines elided from this hunk. */
27505 +grow_buffers(struct stripe_head *sh, int num, int b_size, int priority)
27507 + struct buffer_head *bh;
27510 + for (i = 0; i < num; i++) {
27511 + struct page *page;
27512 + bh = kmalloc(sizeof (struct buffer_head), priority);
27515 + memset(bh, 0, sizeof (struct buffer_head));
27516 + init_waitqueue_head(&bh->b_wait);
27517 + if ((page = alloc_page(priority)))
27518 + bh->b_data = page_address(page);
27523 + bh->b_count = (atomic_t)ATOMIC_INIT(0);
27524 + bh->b_page = page;
27525 + sh->bh_cache[i] = bh;
27531 +static struct buffer_head *raid5_build_block(struct stripe_head *sh, int i);
+/* init_stripe: (re)initialise a free stripe to cover `sector`.
+ * The stripe is expected to be unused: count 0, not on the lru, and no
+ * pending reads/writes or locked cache buffers (the error branches are
+ * partly elided here; a diagnostic is logged when requests remain).
+ * Clears per-disk uptodate state, rebuilds each cached buffer_head via
+ * raid5_build_block() and inserts the stripe into the hash table. */
27533 +static inline void
27534 +init_stripe(struct stripe_head *sh, unsigned long sector)
27536 + raid5_conf_t *conf = sh->raid_conf;
27537 + int disks = conf->raid_disks, i;
27539 + if (atomic_read(&sh->count) != 0)
27541 + if (test_bit(STRIPE_HANDLE, &sh->state))
27548 + sh->sector = sector;
+ /* stripe unit adopts the current cache buffer size */
27549 + sh->size = conf->buffer_size;
27552 + for (i = disks; i--;) {
27553 + if (sh->bh_read[i] || sh->bh_write[i] || sh->bh_written[i] ||
27554 + buffer_locked(sh->bh_cache[i])) {
27555 + LOG_ERROR("sector=%lx i=%d %p %p %p %d\n",
27556 + sh->sector, i, sh->bh_read[i],
27557 + sh->bh_write[i], sh->bh_written[i],
27558 + buffer_locked(sh->bh_cache[i]));
27561 + clear_bit(BH_Uptodate, &sh->bh_cache[i]->b_state);
27562 + raid5_build_block(sh, i);
27564 + insert_hash(conf, sh);
27567 +/* the buffer size has changed, so unhash all stripes
27568 + * as active stripes complete, they will go onto inactive list
+/* NOTE(review): a check on conf->active_stripes precedes the loop; its
+ * consequence (presumably BUG) is on an elided line.  The per-bucket
+ * unhash call inside the while loop is also elided. */
27571 +shrink_stripe_cache(raid5_conf_t * conf)
27575 + if (atomic_read(&conf->active_stripes))
27577 + for (i = 0; i < NR_HASH; i++) {
27578 + struct stripe_head *sh;
27579 + while ((sh = conf->stripe_hashtbl[i]))
+/* __find_stripe: hash-table lookup of an active stripe by sector.
+ * Returns the matching stripe; the not-found return (NULL) is on an
+ * elided line.  Caller is expected to hold conf->device_lock. */
27584 +static struct stripe_head *
27585 +__find_stripe(raid5_conf_t * conf, unsigned long sector)
27587 + struct stripe_head *sh;
27590 + for (sh = stripe_hash(conf, sector); sh; sh = sh->hash_next)
27591 + if (sh->sector == sector)
+/* get_active_stripe: find or allocate the stripe covering `sector` and
+ * return it with an extra reference, under device_lock.
+ *
+ * Stripe-size negotiation: size==0 means "use the current buffer_size"
+ * and waits until it is set; size>0 with a differing buffer_size waits
+ * for the cache to drain (buffer_size forced to 0 meanwhile), then
+ * unhashes the cache and installs the new size.
+ *
+ * Allocation: look up the hash first; otherwise take a free stripe, or
+ * block (inactive_blocked) until the pool refills to 3/4 — matching the
+ * wake threshold in __release_stripe().  Several wait-condition and
+ * brace lines are elided from this hunk. */
27596 +static struct stripe_head *
27597 +get_active_stripe(raid5_conf_t * conf, unsigned long sector, int size)
27599 + struct stripe_head *sh;
27601 + md_spin_lock_irq(&conf->device_lock);
27604 + if (conf->buffer_size == 0 ||
27605 + (size && size != conf->buffer_size)) {
27606 + /* either the size is being changed (buffer_size==0) or
27607 + * we need to change it.
27608 + * If size==0, we can proceed as soon as buffer_size gets set.
27609 + * If size>0, we can proceed when active_stripes reaches 0, or
27610 + * when someone else sets the buffer_size to size.
27611 + * If someone sets the buffer size to something else, we will need to
27612 + * assert that we want to change it again
27615 + wait_event_lock_irq(conf->wait_for_stripe,
27616 + conf->buffer_size,
27617 + conf->device_lock);
27619 + while (conf->buffer_size != size
27620 + && atomic_read(&conf->active_stripes)) {
27621 + conf->buffer_size = 0;
27622 + wait_event_lock_irq(conf->
27624 + atomic_read(&conf->
27629 + conf->device_lock);
27632 + if (conf->buffer_size != size) {
27633 + shrink_stripe_cache(conf);
27636 + conf->buffer_size = size;
+ /* round sector down to the start of its stripe unit */
27641 + sector -= sector & ((conf->buffer_size >> 9) - 1);
27643 + sh = __find_stripe(conf, sector);
27645 + if (!conf->inactive_blocked)
27646 + sh = get_free_stripe(conf);
27648 + conf->inactive_blocked = 1;
27649 + wait_event_lock_irq(conf->wait_for_stripe,
27650 + !list_empty(&conf->
27654 + (&conf->active_stripes) <
27655 + (NR_STRIPES * 3 / 4)
27657 + inactive_blocked),
27658 + conf->device_lock);
27659 + conf->inactive_blocked = 0;
27661 + init_stripe(sh, sector);
27663 + if (atomic_read(&sh->count)) {
27664 + if (!list_empty(&sh->lru))
27667 + if (!test_bit(STRIPE_HANDLE, &sh->state))
27668 + atomic_inc(&conf->active_stripes);
27669 + if (list_empty(&sh->lru))
27671 + list_del_init(&sh->lru);
27674 + } while (sh == NULL);
27677 + atomic_inc(&sh->count);
27679 + md_spin_unlock_irq(&conf->device_lock);
+/* grow_stripes: pre-allocate `num` stripe_heads, each with a full set of
+ * per-disk buffers, and park them on the inactive list by creating them
+ * with count 1 and immediately calling release_stripe().
+ * NOTE(review): the surrounding loop, kmalloc-failure handling and
+ * return statements are on elided lines of this hunk. */
27684 +grow_stripes(raid5_conf_t * conf, int num, int priority)
27686 + struct stripe_head *sh;
27689 + sh = kmalloc(sizeof (struct stripe_head), priority);
27692 + memset(sh, 0, sizeof (*sh));
27693 + sh->raid_conf = conf;
27694 + sh->lock = SPIN_LOCK_UNLOCKED;
27696 + if (grow_buffers(sh, conf->raid_disks, PAGE_SIZE, priority)) {
27697 + shrink_buffers(sh, conf->raid_disks);
27701 + /* we just created an active stripe so... */
27702 + sh->count = (atomic_t)ATOMIC_INIT(1);
27703 + atomic_inc(&conf->active_stripes);
27704 + INIT_LIST_HEAD(&sh->lru);
+ /* final put moves the stripe onto the inactive list */
27705 + release_stripe(sh);
+/* shrink_stripes: pull stripes off the inactive list (under device_lock)
+ * and free their buffers, decrementing active_stripes for each.
+ * A nonzero reference count on a supposedly-free stripe is checked;
+ * the consequence (and the kfree of the stripe_head plus the enclosing
+ * loop) are on elided lines of this hunk. */
27711 +shrink_stripes(raid5_conf_t * conf, int num)
27713 + struct stripe_head *sh;
27716 + spin_lock_irq(&conf->device_lock);
27717 + sh = get_free_stripe(conf);
27718 + spin_unlock_irq(&conf->device_lock);
27721 + if (atomic_read(&sh->count))
27723 + shrink_buffers(sh, conf->raid_disks);
27725 + atomic_dec(&conf->active_stripes);
+/* raid5_end_read_request: completion handler for reads into the stripe
+ * cache.  First maps this bh back to its disk index in the stripe.
+ * On success: if possible, one waiting top-level read (sh->bh_read[i])
+ * is completed directly — copying from the cache page unless the read
+ * bypassed the cache (bh_page substitution) — and the EVMS per-volume
+ * in-progress count is dropped; otherwise the cache block is marked
+ * uptodate for later handling.
+ * On failure: the member disk is failed via evms_md_error() (guarded by
+ * sh->node[i] being set) and uptodate state is cleared.
+ * If the cache page was swapped out for a bypass read, b_page/b_data are
+ * restored before the buffer is unlocked.  Finally the stripe is flagged
+ * STRIPE_HANDLE and released.  NOTE(review): several else/brace lines
+ * and the error-path LOG call prefix are elided in this hunk. */
27730 +raid5_end_read_request(struct buffer_head *bh, int uptodate)
27732 + struct stripe_head *sh = bh->b_private;
27733 + raid5_conf_t *conf = sh->raid_conf;
27734 + int disks = conf->raid_disks, i;
27735 + unsigned long flags;
27737 + for (i = 0; i < disks; i++)
27738 + if (bh == sh->bh_cache[i])
+ /* i == disks means this bh is not one of ours */
27741 + if (i == disks) {
27747 + struct buffer_head *buffer;
27748 + spin_lock_irqsave(&conf->device_lock, flags);
27749 + /* we can return a buffer if we bypassed the cache or
27750 + * if the top buffer is not in highmem. If there are
27751 + * multiple buffers, leave the extra work to
27754 + buffer = sh->bh_read[i];
27755 + if (buffer && (!PageHighMem(buffer->b_page)
27756 + || buffer->b_page == bh->b_page)
27758 + sh->bh_read[i] = buffer->b_reqnext;
27759 + buffer->b_reqnext = NULL;
27762 + spin_unlock_irqrestore(&conf->device_lock, flags);
27763 + if (sh->bh_page[i] == NULL)
27764 + set_bit(BH_Uptodate, &bh->b_state);
27766 + if (buffer->b_page != bh->b_page)
27767 + memcpy(buffer->b_data, bh->b_data, bh->b_size);
27768 + evms_cs_volume_request_in_progress(buffer->b_rdev, -1, NULL);
27769 + buffer->b_end_io(buffer, 1);
27774 + evms_md_error(conf->mddev, sh->node[i]);
27777 + ("NODE was not set, skipping evms_md_error()\n");
27778 + clear_bit(BH_Uptodate, &bh->b_state);
27780 + /* must restore b_page before unlocking buffer... */
27781 + if (sh->bh_page[i]) {
27782 + bh->b_page = sh->bh_page[i];
27783 + bh->b_data = page_address(bh->b_page);
27784 + sh->bh_page[i] = NULL;
27785 + clear_bit(BH_Uptodate, &bh->b_state);
27787 + clear_bit(BH_Lock, &bh->b_state);
27788 + set_bit(STRIPE_HANDLE, &sh->state);
27789 + release_stripe(sh);
27790 + if (sh->node[i]) {
27791 + sh->node[i] = NULL;
27793 + LOG_WARNING(" evms node was not set.\n");
+/* raid5_end_write_request: completion handler for cache writes.
+ * Maps the bh to its disk index; on failure the member is failed via
+ * evms_md_error() (guarded by sh->node[i]).  The buffer is unlocked,
+ * the stripe flagged STRIPE_HANDLE and dropped with __release_stripe()
+ * while device_lock is still held.  NOTE(review): the return-type line,
+ * uptodate-success branch and several braces are elided in this hunk. */
27799 +raid5_end_write_request(struct buffer_head *bh, int uptodate)
27801 + struct stripe_head *sh = bh->b_private;
27802 + raid5_conf_t *conf = sh->raid_conf;
27803 + int disks = conf->raid_disks, i;
27804 + unsigned long flags;
27806 + for (i = 0; i < disks; i++)
27807 + if (bh == sh->bh_cache[i])
27810 + if (i == disks) {
27815 + md_spin_lock_irqsave(&conf->device_lock, flags);
27819 + evms_md_error(conf->mddev, sh->node[i]);
27822 + (" NODE was not set, skipping evms_md_error()\n");
27824 + clear_bit(BH_Lock, &bh->b_state);
27825 + set_bit(STRIPE_HANDLE, &sh->state);
27826 + __release_stripe(conf, sh);
27827 + md_spin_unlock_irqrestore(&conf->device_lock, flags);
27828 + if (sh->node[i]) {
27829 + sh->node[i] = NULL;
27831 + LOG_WARNING(" evms node was not set.\n");
+/* raid5_build_block: (re)initialise cache buffer_head `i` of the stripe
+ * to target the corresponding member disk, sized to the stripe unit and
+ * with the read-completion handler installed as the default b_end_io.
+ * (b_rsector/b_rdev setup and return are on elided lines.) */
27835 +static struct buffer_head *
27836 +raid5_build_block(struct stripe_head *sh, int i)
27838 + raid5_conf_t *conf = sh->raid_conf;
27839 + struct buffer_head *bh = sh->bh_cache[i];
+ /* logical block number in stripe-size units */
27840 + unsigned long block = sh->sector / (sh->size >> 9);
27842 + init_buffer(bh, raid5_end_read_request, sh);
27843 + bh->b_dev = conf->disks[i].dev;
27844 + bh->b_blocknr = block;
27846 + bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
27847 + bh->b_size = sh->size;
27848 + bh->b_list = BUF_LOCKED;
+/* raid5_error: mark the member (or spare) backing `node` as faulty.
+ * For an operational array member: clears operational, marks the disk
+ * faulty/nonsync/inactive in the superblock, adjusts the active/working/
+ * failed counters on both sb and conf, dirties the superblock and kicks
+ * the raid5 daemon.  A spare failing during reconstruction is handled in
+ * the second branch, detaching conf->spare and updating spare counters
+ * (unless the spare was already non-operational, e.g. a SET_DISK_FAULTY
+ * ioctl race).  Return statements and some braces are elided here. */
27853 +raid5_error(mddev_t * mddev, struct evms_logical_node * node)
27855 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
27856 + mdp_super_t *sb = mddev->sb;
27857 + struct disk_info *disk;
27860 + LOG_WARNING("%s: called\n", __FUNCTION__);
27862 + for (i = 0, disk = conf->disks; i < conf->raid_disks; i++, disk++) {
27863 + if (disk->node == node) {
27864 + if (disk->operational) {
27865 + disk->operational = 0;
27866 + mark_disk_faulty(sb->disks + disk->number);
27867 + mark_disk_nonsync(sb->disks + disk->number);
27868 + mark_disk_inactive(sb->disks + disk->number);
27869 + sb->active_disks--;
27870 + sb->working_disks--;
27871 + sb->failed_disks++;
27872 + mddev->sb_dirty = 1;
27873 + conf->working_disks--;
27874 + conf->failed_disks++;
27875 + evms_cs_wakeup_thread(conf->thread);
27877 + ("Disk failure on %s, disabling device."
27878 + " Operation continuing on %d devices\n",
27879 + evms_md_partition_name(disk->node),
27880 + conf->working_disks);
27886 + * handle errors in spares (during reconstruction)
27888 + if (conf->spare) {
27889 + disk = conf->spare;
27890 + if (disk->node == node) {
27891 + LOG_WARNING("EVMS RAID5: Disk failure on spare %s\n",
27892 + evms_md_partition_name(disk->node));
27893 + if (!conf->spare->operational) {
27894 + /* probably a SET_DISK_FAULTY ioctl */
27897 + disk->operational = 0;
27898 + disk->write_only = 0;
27899 + conf->spare = NULL;
27900 + mark_disk_faulty(sb->disks + disk->number);
27901 + mark_disk_nonsync(sb->disks + disk->number);
27902 + mark_disk_inactive(sb->disks + disk->number);
27903 + sb->spare_disks--;
27904 + sb->working_disks--;
27905 + sb->failed_disks++;
27907 + mddev->sb_dirty = 1;
27908 + evms_cs_wakeup_thread(conf->thread);
27918 + * Input: a 'big' sector number,
27919 + * Output: index of the data and parity disk, and the sector # in them.
+/* Maps a virtual array sector to (*dd_idx data disk, *pd_idx parity
+ * disk, returned member-device sector).  RAID-4 (conf->level == 4) uses
+ * a fixed parity disk; otherwise the parity rotates per stripe according
+ * to conf->algorithm (the four standard left/right (a)symmetric
+ * layouts).  The (*dd_idx)++ adjustments after the >= *pd_idx tests and
+ * the break statements are on elided lines of this hunk. */
27921 +static unsigned long
27922 +raid5_compute_sector(unsigned long r_sector, unsigned int raid_disks,
27923 + unsigned int data_disks, unsigned int *dd_idx,
27924 + unsigned int *pd_idx, raid5_conf_t * conf)
27926 + unsigned long stripe;
27927 + unsigned long chunk_number;
27928 + unsigned int chunk_offset;
27929 + unsigned long new_sector;
27930 + int sectors_per_chunk = conf->chunk_size >> 9;
27932 + /* First compute the information on this sector */
27935 + * Compute the chunk number and the sector offset inside the chunk
27937 + chunk_number = r_sector / sectors_per_chunk;
27938 + chunk_offset = r_sector % sectors_per_chunk;
27941 + * Compute the stripe number
27943 + stripe = chunk_number / data_disks;
27946 + * Compute the data disk and parity disk indexes inside the stripe
27948 + *dd_idx = chunk_number % data_disks;
27951 + * Select the parity disk based on the user selected algorithm.
27953 + if (conf->level == 4)
27954 + *pd_idx = data_disks;
27956 + switch (conf->algorithm) {
27957 + case ALGORITHM_LEFT_ASYMMETRIC:
27958 + *pd_idx = data_disks - stripe % raid_disks;
27959 + if (*dd_idx >= *pd_idx)
27962 + case ALGORITHM_RIGHT_ASYMMETRIC:
27963 + *pd_idx = stripe % raid_disks;
27964 + if (*dd_idx >= *pd_idx)
27967 + case ALGORITHM_LEFT_SYMMETRIC:
27968 + *pd_idx = data_disks - stripe % raid_disks;
27969 + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
27971 + case ALGORITHM_RIGHT_SYMMETRIC:
27972 + *pd_idx = stripe % raid_disks;
27973 + *dd_idx = (*pd_idx + 1 + *dd_idx) % raid_disks;
27976 + LOG_ERROR(" unsupported algorithm %d\n",
27977 + conf->algorithm);
27981 + * Finally, compute the new sector number
27983 + new_sector = stripe * sectors_per_chunk + chunk_offset;
27984 + return new_sector;
+/* check_xor: flush the accumulated xor-source buffers when bh_ptr is
+ * full (MAX_XOR_BLOCKS).  The count reset and closing of the do/while
+ * are on elided lines of this hunk. */
27987 +#define check_xor() do { \
27988 + if (count == MAX_XOR_BLOCKS) { \
27989 + evms_md_xor_block(count, bh_ptr); \
+/* compute_block: reconstruct cache block dd_idx by xoring together all
+ * other up-to-date blocks of the stripe (RAID-5 parity reconstruction).
+ * The target is zeroed first and used as the xor destination; a missing
+ * (not uptodate) source block is logged as an error.  The result is
+ * marked uptodate.  count initialisation, check_xor() calls and braces
+ * are on elided lines of this hunk. */
27995 +compute_block(struct stripe_head *sh, int dd_idx)
27997 + raid5_conf_t *conf = sh->raid_conf;
27998 + int i, count, disks = conf->raid_disks;
27999 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS], *bh;
28001 + memset(sh->bh_cache[dd_idx]->b_data, 0, sh->size);
28002 + bh_ptr[0] = sh->bh_cache[dd_idx];
28004 + for (i = disks; i--;) {
28007 + bh = sh->bh_cache[i];
28008 + if (buffer_uptodate(bh))
28009 + bh_ptr[count++] = bh;
28011 + LOG_ERROR("%s: %d, stripe %lu, %d not present\n",
28012 + __FUNCTION__, dd_idx, sh->sector, i);
28017 + evms_md_xor_block(count, bh_ptr);
28018 + set_bit(BH_Uptodate, &sh->bh_cache[dd_idx]->b_state);
+/* compute_parity: compute the parity block of a stripe.
+ * Methods:
+ *   READ_MODIFY_WRITE  - xor the old parity with the old + new data of
+ *                        the blocks being written;
+ *   RECONSTRUCT_WRITE  - zero the parity and rebuild it from all data
+ *                        blocks;
+ *   CHECK_PARITY       - xor the data blocks into the current parity so
+ *                        the result can be verified (should be zero).
+ * In the write cases, pending requests are moved from bh_write[] to
+ * bh_written[] via chosen[], their data copied into the cache blocks,
+ * and those blocks locked.  The parity block is marked uptodate except
+ * for CHECK_PARITY, where it is explicitly marked not uptodate.
+ * NOTE(review): count resets, check_xor() invocations, bh_kmap of the
+ * chosen data and numerous braces are on elided lines of this hunk. */
28022 +compute_parity(struct stripe_head *sh, int method)
28024 + raid5_conf_t *conf = sh->raid_conf;
28025 + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
28026 + struct buffer_head *bh_ptr[MAX_XOR_BLOCKS];
28027 + struct buffer_head *chosen[MD_SB_DISKS];
28029 + memset(chosen, 0, sizeof (chosen));
28032 + bh_ptr[0] = sh->bh_cache[pd_idx];
28033 + switch (method) {
28034 + case READ_MODIFY_WRITE:
28035 + if (!buffer_uptodate(sh->bh_cache[pd_idx]))
28037 + for (i = disks; i--;) {
28040 + if (sh->bh_write[i] && buffer_uptodate(sh->bh_cache[i])) {
28041 + bh_ptr[count++] = sh->bh_cache[i];
+ /* move request from pending-write to written list */
28042 + chosen[i] = sh->bh_write[i];
28043 + sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
28044 + chosen[i]->b_reqnext = sh->bh_written[i];
28045 + sh->bh_written[i] = chosen[i];
28050 + case RECONSTRUCT_WRITE:
28051 + memset(sh->bh_cache[pd_idx]->b_data, 0, sh->size);
28052 + for (i = disks; i--;)
28053 + if (i != pd_idx && sh->bh_write[i]) {
28054 + chosen[i] = sh->bh_write[i];
28055 + sh->bh_write[i] = sh->bh_write[i]->b_reqnext;
28056 + chosen[i]->b_reqnext = sh->bh_written[i];
28057 + sh->bh_written[i] = chosen[i];
28060 + case CHECK_PARITY:
28064 + evms_md_xor_block(count, bh_ptr);
+ /* copy chosen request data into the cache blocks and lock them */
28068 + for (i = disks; i--;)
28070 + struct buffer_head *bh = sh->bh_cache[i];
28072 + bdata = bh_kmap(chosen[i]);
28073 + memcpy(bh->b_data, bdata, sh->size);
28074 + bh_kunmap(chosen[i]);
28075 + set_bit(BH_Lock, &bh->b_state);
28076 + mark_buffer_uptodate(bh, 1);
28079 + switch (method) {
28080 + case RECONSTRUCT_WRITE:
28081 + case CHECK_PARITY:
28082 + for (i = disks; i--;)
28083 + if (i != pd_idx) {
28084 + bh_ptr[count++] = sh->bh_cache[i];
28088 + case READ_MODIFY_WRITE:
28089 + for (i = disks; i--;)
28091 + bh_ptr[count++] = sh->bh_cache[i];
28096 + evms_md_xor_block(count, bh_ptr);
28098 + if (method != CHECK_PARITY) {
28099 + mark_buffer_uptodate(sh->bh_cache[pd_idx], 1);
28100 + set_bit(BH_Lock, &sh->bh_cache[pd_idx]->b_state);
28102 + mark_buffer_uptodate(sh->bh_cache[pd_idx], 0);
+/* add_stripe_bh: queue a caller's buffer_head on the stripe's per-disk
+ * pending list, appending at the tail.  Selection between bh_read[] and
+ * bh_write[] depends on the final (elided) parameter — presumably the
+ * rw direction; the test itself is on an elided line.  A warning is
+ * logged when multiple requests stack up on one sector.  Both the
+ * stripe lock and device_lock are held while splicing. */
28106 +add_stripe_bh(struct stripe_head *sh, struct buffer_head *bh, int dd_idx,
28109 + struct buffer_head **bhp;
28110 + raid5_conf_t *conf = sh->raid_conf;
28112 + spin_lock(&sh->lock);
28113 + spin_lock_irq(&conf->device_lock);
28114 + bh->b_reqnext = NULL;
28116 + bhp = &sh->bh_read[dd_idx];
28118 + bhp = &sh->bh_write[dd_idx];
28120 + LOG_DEFAULT("EVMS RAID5: multiple %d requests for sector %ld\n",
+ /* walk to the tail of the list */
28122 + bhp = &(*bhp)->b_reqnext;
28125 + spin_unlock_irq(&conf->device_lock);
28126 + spin_unlock(&sh->lock);
28131 + * handle_stripe - do things to a stripe.
28133 + * We lock the stripe and then examine the state of various bits
28134 + * to see what needs to be done.
28135 + * Possible results:
28136 + * return some read request which now have data
28137 + * return some write requests which are safely on disc
28138 + * schedule a read on some buffers
28139 + * schedule a write of some buffers
28140 + * return confirmation of parity correctness
28142 + * Parity calculations are done inside the stripe lock
28143 + * buffers are taken off read_list or write_list, and bh_cache buffers
28144 + * get BH_Lock set before the stripe lock is released.
28149 +handle_stripe(struct stripe_head *sh)
28151 + raid5_conf_t *conf = sh->raid_conf;
28152 + int disks = conf->raid_disks;
28153 + struct buffer_head *return_ok = NULL, *return_fail = NULL;
28154 + int action[MD_SB_DISKS];
28157 + int locked = 0, uptodate = 0, to_read = 0, to_write = 0, failed =
28159 + int failed_num = 0;
28160 + struct buffer_head *bh;
28162 + memset(action, 0, sizeof (action));
28164 + spin_lock(&sh->lock);
28165 + clear_bit(STRIPE_HANDLE, &sh->state);
28166 + clear_bit(STRIPE_DELAYED, &sh->state);
28168 + syncing = test_bit(STRIPE_SYNCING, &sh->state);
28169 + /* Now to look around and see what can be done */
28171 + for (i = disks; i--;) {
28172 + bh = sh->bh_cache[i];
28173 + /* maybe we can reply to a read */
28174 + if (buffer_uptodate(bh) && sh->bh_read[i]) {
28175 + struct buffer_head *rbh, *rbh2;
28176 + spin_lock_irq(&conf->device_lock);
28177 + rbh = sh->bh_read[i];
28178 + sh->bh_read[i] = NULL;
28179 + spin_unlock_irq(&conf->device_lock);
28182 + bdata = bh_kmap(rbh);
28183 + memcpy(bdata, bh->b_data, bh->b_size);
28185 + rbh2 = rbh->b_reqnext;
28186 + rbh->b_reqnext = return_ok;
28192 + /* now count some things */
28193 + if (buffer_locked(bh))
28195 + if (buffer_uptodate(bh))
28198 + if (sh->bh_read[i])
28200 + if (sh->bh_write[i])
28202 + if (sh->bh_written[i])
28204 + if (!conf->disks[i].operational) {
28209 + /* check if the array has lost two devices and, if so, some requests might
28210 + * need to be failed
28212 + if (failed > 1 && to_read + to_write) {
28213 + for (i = disks; i--;) {
28214 + /* fail all writes first */
28215 + if (sh->bh_write[i])
28217 + while ((bh = sh->bh_write[i])) {
28218 + sh->bh_write[i] = bh->b_reqnext;
28219 + bh->b_reqnext = return_fail;
28220 + return_fail = bh;
28222 + /* fail any reads if this device is non-operational */
28223 + if (!conf->disks[i].operational) {
28224 + spin_lock_irq(&conf->device_lock);
28225 + if (sh->bh_read[i])
28227 + while ((bh = sh->bh_read[i])) {
28228 + sh->bh_read[i] = bh->b_reqnext;
28229 + bh->b_reqnext = return_fail;
28230 + return_fail = bh;
28232 + spin_unlock_irq(&conf->device_lock);
28236 + if (failed > 1 && syncing) {
28237 + evms_md_done_sync(conf->mddev,
28238 + (sh->size >> 9) - sh->sync_redone, 0);
28239 + clear_bit(STRIPE_SYNCING, &sh->state);
28243 + /* might be able to return some write requests if the parity block
28244 + * is safe, or on a failed drive
28246 + bh = sh->bh_cache[sh->pd_idx];
28248 + ((conf->disks[sh->pd_idx].operational && !buffer_locked(bh)
28249 + && buffer_uptodate(bh))
28250 + || (failed == 1 && failed_num == sh->pd_idx))
28252 + /* any written block on a uptodate or failed drive can be returned */
28253 + for (i = disks; i--;)
28254 + if (sh->bh_written[i]) {
28255 + bh = sh->bh_cache[i];
28256 + if (!conf->disks[sh->pd_idx].operational ||
28257 + (!buffer_locked(bh)
28258 + && buffer_uptodate(bh))) {
28259 + /* maybe we can return some write requests */
28260 + struct buffer_head *wbh, *wbh2;
28261 + wbh = sh->bh_written[i];
28262 + sh->bh_written[i] = NULL;
28264 + wbh2 = wbh->b_reqnext;
28265 + wbh->b_reqnext = return_ok;
28273 + /* Now we might consider reading some blocks, either to check/generate
28274 + * parity, or to satisfy requests
28276 + if (to_read || (syncing && (uptodate + failed < disks))) {
28277 + for (i = disks; i--;) {
28278 + bh = sh->bh_cache[i];
28279 + if (!buffer_locked(bh) && !buffer_uptodate(bh) &&
28280 + (sh->bh_read[i] || syncing
28281 + || (failed && sh->bh_read[failed_num]))) {
28282 + /* we would like to get this block, possibly
28283 + * by computing it, but we might not be able to
28285 + if (uptodate == disks - 1) {
28286 + compute_block(sh, i);
28288 + } else if (conf->disks[i].operational) {
28289 + set_bit(BH_Lock, &bh->b_state);
28290 + action[i] = READ + 1;
28291 + /* if I am just reading this block and we don't have
28292 + a failed drive, or any pending writes then sidestep the cache */
28293 + if (sh->bh_page[i])
28295 + if (sh->bh_read[i]
28296 + && !sh->bh_read[i]->b_reqnext
28297 + && !syncing && !failed
28300 + sh->bh_cache[i]->b_page;
28301 + sh->bh_cache[i]->b_page =
28302 + sh->bh_read[i]->b_page;
28303 + sh->bh_cache[i]->b_data =
28304 + sh->bh_read[i]->b_data;
28308 + evms_md_sync_acct(conf->
28315 + set_bit(STRIPE_HANDLE, &sh->state);
28318 + /* now to consider writing and what else, if anything should be read */
28320 + int rmw = 0, rcw = 0;
28321 + for (i = disks; i--;) {
28322 + /* would I have to read this buffer for read_modify_write */
28323 + bh = sh->bh_cache[i];
28324 + if ((sh->bh_write[i] || i == sh->pd_idx) &&
28325 + (!buffer_locked(bh) || sh->bh_page[i]) &&
28326 + !buffer_uptodate(bh)) {
28327 + if (conf->disks[i].operational
28328 +/* && !(conf->resync_parity && i == sh->pd_idx) */
28332 + rmw += 2 * disks; /* cannot read it */
28334 + /* Would I have to read this buffer for reconstruct_write */
28335 + if (!sh->bh_write[i] && i != sh->pd_idx &&
28336 + (!buffer_locked(bh) || sh->bh_page[i]) &&
28337 + !buffer_uptodate(bh)) {
28338 + if (conf->disks[i].operational)
28341 + rcw += 2 * disks;
28344 + set_bit(STRIPE_HANDLE, &sh->state);
28345 + if (rmw < rcw && rmw > 0)
28346 + /* prefer read-modify-write, but need to get some data */
28347 + for (i = disks; i--;) {
28348 + bh = sh->bh_cache[i];
28349 + if ((sh->bh_write[i] || i == sh->pd_idx) &&
28350 + !buffer_locked(bh) && !buffer_uptodate(bh)
28351 + && conf->disks[i].operational) {
28353 + (STRIPE_PREREAD_ACTIVE,
28355 + set_bit(BH_Lock, &bh->b_state);
28356 + action[i] = READ + 1;
28359 + set_bit(STRIPE_DELAYED,
28361 + set_bit(STRIPE_HANDLE,
28366 + if (rcw <= rmw && rcw > 0)
28367 + /* want reconstruct write, but need to get some data */
28368 + for (i = disks; i--;) {
28369 + bh = sh->bh_cache[i];
28370 + if (!sh->bh_write[i] && i != sh->pd_idx &&
28371 + !buffer_locked(bh) && !buffer_uptodate(bh)
28372 + && conf->disks[i].operational) {
28374 + (STRIPE_PREREAD_ACTIVE,
28376 + set_bit(BH_Lock, &bh->b_state);
28377 + action[i] = READ + 1;
28380 + set_bit(STRIPE_DELAYED,
28382 + set_bit(STRIPE_HANDLE,
28387 + /* now if nothing is locked, and if we have enough data, we can start a write request */
28388 + if (locked == 0 && (rcw == 0 || rmw == 0)) {
28389 + compute_parity(sh,
28391 + 0 ? RECONSTRUCT_WRITE :
28392 + READ_MODIFY_WRITE);
28393 + /* now every locked buffer is ready to be written */
28394 + for (i = disks; i--;)
28395 + if (buffer_locked(sh->bh_cache[i])) {
28397 + action[i] = WRITE + 1;
28398 + if (!conf->disks[i].operational
28399 + || (i == sh->pd_idx && failed == 0))
28400 + set_bit(STRIPE_INSYNC,
28403 + if (test_and_clear_bit
28404 + (STRIPE_PREREAD_ACTIVE, &sh->state)) {
28405 + atomic_dec(&conf->preread_active_stripes);
28406 + if (atomic_read(&conf->preread_active_stripes) <
28408 + evms_cs_wakeup_thread(conf->thread);
28413 + /* maybe we need to check and possibly fix the parity for this stripe
28414 + * Any reads will already have been scheduled, so we just see if enough data
28417 + if (syncing && locked == 0 &&
28418 + !test_bit(STRIPE_INSYNC, &sh->state) && failed <= 1) {
28419 + set_bit(STRIPE_HANDLE, &sh->state);
28420 + if (failed == 0) {
28421 + if (uptodate != disks)
28423 + compute_parity(sh, CHECK_PARITY);
28425 + bh = sh->bh_cache[sh->pd_idx];
28426 + if ((*(u32 *) bh->b_data) == 0 &&
28427 + !memcmp(bh->b_data, bh->b_data + 4,
28428 + bh->b_size - 4)) {
28429 + /* parity is correct (on disc, not in buffer any more) */
28430 + set_bit(STRIPE_INSYNC, &sh->state);
28433 + if (!test_bit(STRIPE_INSYNC, &sh->state)) {
28434 + struct disk_info *spare;
28436 + failed_num = sh->pd_idx;
28437 + /* should be able to compute the missing block and write it to spare */
28438 + if (!buffer_uptodate(sh->bh_cache[failed_num])) {
28439 + if (uptodate + 1 != disks)
28441 + compute_block(sh, failed_num);
28444 + if (uptodate != disks)
28446 + bh = sh->bh_cache[failed_num];
28447 + set_bit(BH_Lock, &bh->b_state);
28448 + action[failed_num] = WRITE + 1;
28450 + set_bit(STRIPE_INSYNC, &sh->state);
28451 + if (conf->disks[failed_num].operational)
28452 + evms_md_sync_acct(conf->disks[failed_num].dev,
28453 + bh->b_size >> 9);
28454 + else if ((spare = conf->spare))
28455 + evms_md_sync_acct(spare->dev, bh->b_size >> 9);
28459 + if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
28460 + evms_md_done_sync(conf->mddev,
28461 + (sh->size >> 9) - sh->sync_redone, 1);
28462 + clear_bit(STRIPE_SYNCING, &sh->state);
28465 + spin_unlock(&sh->lock);
28467 + while ((bh = return_ok)) {
28468 + return_ok = bh->b_reqnext;
28469 + bh->b_reqnext = NULL;
28470 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28471 + bh->b_end_io(bh, 1);
28473 + while ((bh = return_fail)) {
28474 + return_fail = bh->b_reqnext;
28475 + bh->b_reqnext = NULL;
28476 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28477 + bh->b_end_io(bh, 0);
28479 + for (i = disks; i--;)
28481 + struct buffer_head *bh = sh->bh_cache[i];
28482 + struct disk_info *spare = conf->spare;
28483 + struct evms_logical_node *node = NULL;
28485 + if (action[i] == READ + 1)
28486 + bh->b_end_io = raid5_end_read_request;
28488 + bh->b_end_io = raid5_end_write_request;
28489 + if (conf->disks[i].operational) {
28490 + bh->b_dev = conf->disks[i].dev;
28491 + node = conf->disks[i].node;
28492 + } else if (spare && action[i] == WRITE + 1) {
28493 + bh->b_dev = spare->dev;
28494 + node = spare->node;
28498 + atomic_inc(&sh->count);
28499 + //bh->b_rdev = bh->b_dev;
28501 + bh->b_blocknr * (bh->b_size >> 9);
28502 + sh->node[i] = node;
28503 + if (action[i] == READ + 1)
28508 + clear_bit(BH_Lock, &bh->b_state);
28509 + set_bit(STRIPE_HANDLE, &sh->state);
28514 +static inline void
28515 +raid5_activate_delayed(raid5_conf_t * conf)
28517 + if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
28518 + while (!list_empty(&conf->delayed_list)) {
28519 + struct list_head *l = conf->delayed_list.next;
28520 + struct stripe_head *sh;
28521 + sh = list_entry(l, struct stripe_head, lru);
28522 + list_del_init(l);
28523 + clear_bit(STRIPE_DELAYED, &sh->state);
28524 + if (!test_and_set_bit
28525 + (STRIPE_PREREAD_ACTIVE, &sh->state))
28526 + atomic_inc(&conf->preread_active_stripes);
28527 + list_add_tail(&sh->lru, &conf->handle_list);
28532 +raid5_unplug_device(void *data)
28534 + raid5_conf_t *conf = (raid5_conf_t *) data;
28535 + unsigned long flags;
28537 + spin_lock_irqsave(&conf->device_lock, flags);
28539 + raid5_activate_delayed(conf);
28541 + conf->plugged = 0;
28542 + evms_cs_wakeup_thread(conf->thread);
28544 + spin_unlock_irqrestore(&conf->device_lock, flags);
28547 +static inline void
28548 +raid5_plug_device(raid5_conf_t * conf)
28550 + spin_lock_irq(&conf->device_lock);
28551 + if (list_empty(&conf->delayed_list))
28552 + if (!conf->plugged) {
28553 + conf->plugged = 1;
28554 + queue_task(&conf->plug_tq, &tq_disk);
28556 + spin_unlock_irq(&conf->device_lock);
28559 +static inline void
28560 +raid5_rw(struct evms_logical_node * md_node, struct buffer_head *bh, int rw)
28562 + mddev_t *mddev = EVMS_MD_NODE_TO_MDDEV(md_node);
28563 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
28564 + const unsigned int raid_disks = conf->raid_disks;
28565 + const unsigned int data_disks = raid_disks - 1;
28566 + unsigned int dd_idx, pd_idx;
28567 + unsigned long new_sector;
28568 + struct stripe_head *sh;
28569 + unsigned long sectors_per_chunk = conf->chunk_size >> 9;
28570 + unsigned long sect_in_chunk = bh->b_rsector & (sectors_per_chunk - 1);
28572 + if (evms_md_check_boundary(md_node, bh))
28574 + if ((sect_in_chunk + (bh->b_size >> 9)) > sectors_per_chunk) {
28575 + bh->b_end_io(bh, 0);
28579 + new_sector = raid5_compute_sector(bh->b_rsector,
28580 + raid_disks, data_disks, &dd_idx,
28583 + sh = get_active_stripe(conf, new_sector, bh->b_size);
28585 + sh->pd_idx = pd_idx;
28587 + add_stripe_bh(sh, bh, dd_idx, rw);
28589 + raid5_plug_device(conf);
28591 + evms_cs_volume_request_in_progress(bh->b_rdev, 1, NULL);
28592 + handle_stripe(sh);
28593 + release_stripe(sh);
28595 + evms_cs_volume_request_in_progress(bh->b_rdev, -1, NULL);
28596 + bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
28601 +raid5_read(struct evms_logical_node * md_node, struct buffer_head *bh)
28603 + raid5_rw(md_node, bh, READ);
28607 +raid5_write(struct evms_logical_node * md_node, struct buffer_head *bh)
28609 + raid5_rw(md_node, bh, WRITE);
28613 +raid5_sync_request(mddev_t * mddev, unsigned long sector_nr)
28615 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
28616 + struct stripe_head *sh;
28617 + int sectors_per_chunk = conf->chunk_size >> 9;
28618 + unsigned long stripe = sector_nr / sectors_per_chunk;
28619 + int chunk_offset = sector_nr % sectors_per_chunk;
28620 + int dd_idx, pd_idx;
28621 + unsigned long first_sector;
28622 + int raid_disks = conf->raid_disks;
28623 + int data_disks = raid_disks - 1;
28627 + sh = get_active_stripe(conf, sector_nr, 0);
28628 + bufsize = sh->size;
28629 + redone = sector_nr - sh->sector;
28631 + raid5_compute_sector(stripe * data_disks * sectors_per_chunk +
28632 + chunk_offset, raid_disks, data_disks, &dd_idx,
28634 + sh->pd_idx = pd_idx;
28635 + spin_lock(&sh->lock);
28636 + set_bit(STRIPE_SYNCING, &sh->state);
28637 + clear_bit(STRIPE_INSYNC, &sh->state);
28638 + sh->sync_redone = redone;
28639 + spin_unlock(&sh->lock);
28641 + handle_stripe(sh);
28642 + release_stripe(sh);
28644 + return (bufsize >> 9) - redone;
28648 + * This is our raid5 kernel thread.
28650 + * We scan the hash table for stripes which can be handled now.
28651 + * During the scan, completed stripes are saved for us by the interrupt
28652 + * handler, so that they will not have to wait for our next wakeup.
28655 +raid5d(void *data)
28657 + struct stripe_head *sh;
28658 + raid5_conf_t *conf = data;
28659 + mddev_t *mddev = conf->mddev;
28664 + if (mddev->sb_dirty) {
28665 + mddev->sb_dirty = 0;
28666 + evms_md_update_sb(mddev);
28668 + md_spin_lock_irq(&conf->device_lock);
28670 + struct list_head *first;
28672 + if (list_empty(&conf->handle_list) &&
28673 + atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD &&
28674 + !conf->plugged && !list_empty(&conf->delayed_list))
28675 + raid5_activate_delayed(conf);
28677 + if (list_empty(&conf->handle_list))
28680 + first = conf->handle_list.next;
28681 + sh = list_entry(first, struct stripe_head, lru);
28683 + list_del_init(first);
28684 + atomic_inc(&sh->count);
28685 + if (atomic_read(&sh->count) != 1)
28687 + md_spin_unlock_irq(&conf->device_lock);
28690 + handle_stripe(sh);
28691 + release_stripe(sh);
28693 + md_spin_lock_irq(&conf->device_lock);
28696 + md_spin_unlock_irq(&conf->device_lock);
28701 + * Private kernel thread for parity reconstruction after an unclean
28702 + * shutdown. Reconstruction on spare drives in case of a failed drive
28703 + * is done by the generic mdsyncd.
28706 +raid5syncd(void *data)
28708 + raid5_conf_t *conf = data;
28709 + mddev_t *mddev = conf->mddev;
28711 + if (!conf->resync_parity)
28713 + if (conf->resync_parity == 2)
28715 + down(&mddev->recovery_sem);
28716 + if (evms_md_do_sync(mddev, NULL)) {
28717 + up(&mddev->recovery_sem);
28718 + LOG_WARNING("resync aborted!\n");
28721 + conf->resync_parity = 0;
28722 + up(&mddev->recovery_sem);
28723 + LOG_DEFAULT("resync finished.\n");
28727 +raid5_run(mddev_t * mddev)
28729 + raid5_conf_t *conf;
28730 + int i, j, raid_disk, memory;
28731 + mdp_super_t *sb = mddev->sb;
28732 + mdp_disk_t *desc;
28733 + mdk_rdev_t *rdev;
28734 + struct disk_info *disk;
28735 + struct md_list_head *tmp;
28736 + int start_recovery = 0;
28738 + MOD_INC_USE_COUNT;
28740 + if (sb->level != 5 && sb->level != 4) {
28741 + LOG_ERROR("%s: [md%d] raid level not set to 4/5 (%d)\n",
28742 + __FUNCTION__, mdidx(mddev), sb->level);
28743 + MOD_DEC_USE_COUNT;
28747 + mddev->private = kmalloc(sizeof (raid5_conf_t), GFP_KERNEL);
28748 + if ((conf = mddev->private) == NULL)
28750 + memset(conf, 0, sizeof (*conf));
28751 + conf->mddev = mddev;
28753 + if ((conf->stripe_hashtbl =
28754 + (struct stripe_head **) md__get_free_pages(GFP_ATOMIC,
28755 + HASH_PAGES_ORDER)) ==
28758 + memset(conf->stripe_hashtbl, 0, HASH_PAGES * PAGE_SIZE);
28760 + conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
28761 + md_init_waitqueue_head(&conf->wait_for_stripe);
28762 + INIT_LIST_HEAD(&conf->handle_list);
28763 + INIT_LIST_HEAD(&conf->delayed_list);
28764 + INIT_LIST_HEAD(&conf->inactive_list);
28765 + conf->active_stripes = (atomic_t)ATOMIC_INIT(0);
28766 + conf->preread_active_stripes = (atomic_t)ATOMIC_INIT(0);
28767 + conf->buffer_size = PAGE_SIZE; /* good default for rebuild */
28769 + conf->plugged = 0;
28770 + conf->plug_tq.sync = 0;
28771 + conf->plug_tq.routine = &raid5_unplug_device;
28772 + conf->plug_tq.data = conf;
28774 + ITERATE_RDEV(mddev, rdev, tmp) {
28776 + * This is important -- we are using the descriptor on
28777 + * the disk only to get a pointer to the descriptor on
28778 + * the main superblock, which might be more recent.
28780 + desc = sb->disks + rdev->desc_nr;
28781 + raid_disk = desc->raid_disk;
28782 + disk = conf->disks + raid_disk;
28784 + if (disk_faulty(desc)) {
28785 + LOG_ERROR("%s: disabled device %s (errors detected)\n",
28787 + evms_md_partition_name(rdev->node));
28788 + if (!rdev->faulty) {
28792 + disk->number = desc->number;
28793 + disk->raid_disk = raid_disk;
28794 + disk->dev = rdev->dev;
28795 + disk->node = rdev->node;
28797 + disk->operational = 0;
28798 + disk->write_only = 0;
28800 + disk->used_slot = 1;
28803 + if (disk_active(desc)) {
28804 + if (!disk_sync(desc)) {
28806 + ("%s: disabled device %s (not in sync)\n",
28808 + evms_md_partition_name(rdev->node));
28812 + if (raid_disk > sb->raid_disks) {
28814 + ("%s: disabled device %s (inconsistent descriptor)\n",
28816 + evms_md_partition_name(rdev->node));
28819 + if (disk->operational) {
28821 + ("%s: disabled device %s (device %d already operational)\n",
28823 + evms_md_partition_name(rdev->node),
28828 + ("%s: device %s operational as raid disk %d\n",
28829 + __FUNCTION__, evms_md_partition_name(rdev->node),
28832 + disk->number = desc->number;
28833 + disk->raid_disk = raid_disk;
28834 + disk->dev = rdev->dev;
28835 + disk->node = rdev->node;
28836 + disk->operational = 1;
28837 + disk->used_slot = 1;
28839 + conf->working_disks++;
28842 + * Must be a spare disk ..
28844 + LOG_DEFAULT(" spare disk %s\n",
28845 + evms_md_partition_name(rdev->node));
28846 + disk->number = desc->number;
28847 + disk->raid_disk = raid_disk;
28848 + disk->dev = rdev->dev;
28849 + disk->node = rdev->node;
28851 + disk->operational = 0;
28852 + disk->write_only = 0;
28854 + disk->used_slot = 1;
28858 + for (i = 0; i < MD_SB_DISKS; i++) {
28859 + desc = sb->disks + i;
28860 + raid_disk = desc->raid_disk;
28861 + disk = conf->disks + raid_disk;
28863 + if (disk_faulty(desc) && (raid_disk < sb->raid_disks) &&
28864 + !conf->disks[raid_disk].used_slot) {
28866 + disk->number = desc->number;
28867 + disk->raid_disk = raid_disk;
28868 + disk->dev = MKDEV(0, 0);
28869 + disk->node = NULL;
28871 + disk->operational = 0;
28872 + disk->write_only = 0;
28874 + disk->used_slot = 1;
28878 + conf->raid_disks = sb->raid_disks;
 28880 + * failed_disks: 0 for a fully functional array, 1 for a degraded array.
28882 + conf->failed_disks = conf->raid_disks - conf->working_disks;
28883 + conf->mddev = mddev;
28884 + conf->chunk_size = sb->chunk_size;
28885 + conf->level = sb->level;
28886 + conf->algorithm = sb->layout;
28887 + conf->max_nr_stripes = NR_STRIPES;
28890 + * If chunk_size is validated in md_core.c, why do it again?
28891 + * And the check in md_core is:
28892 + * chunk_size has to be a power of 2 and multiples of PAGE_SIZE
28895 + if (!conf->chunk_size ||
28896 + ((1 << ffz(~conf->chunk_size)) != conf->chunk_size) ||
28897 + (conf->chunk_size < PAGE_SIZE)) {
28898 + LOG_ERROR("%s: invalid chunk size %d for md%d\n", __FUNCTION__,
28899 + conf->chunk_size, mdidx(mddev));
28902 + if (conf->algorithm > ALGORITHM_RIGHT_SYMMETRIC) {
28903 + LOG_ERROR(" unsupported parity algorithm %d for md%d\n",
28904 + conf->algorithm, mdidx(mddev));
28907 + if (conf->failed_disks > 1) {
28909 + (" not enough operational devices for md%d (%d/%d failed)\n",
28910 + mdidx(mddev), conf->failed_disks, conf->raid_disks);
28914 + if (conf->working_disks != sb->raid_disks) {
28916 + (" md%d, not all disks are operational -- trying to recover array\n",
28918 + start_recovery = 1;
28922 + const char *name = "evms_raid5d";
28924 + conf->thread = evms_cs_register_thread(raid5d, conf, name);
28925 + if (!conf->thread) {
28926 + LOG_ERROR("%s: couldn't allocate thread for md%d\n",
28927 + __FUNCTION__, mdidx(mddev));
28932 + memory = conf->max_nr_stripes * (sizeof (struct stripe_head) +
28933 + conf->raid_disks *
28934 + ((sizeof (struct buffer_head) +
28935 + PAGE_SIZE))) / 1024;
28936 + if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
28937 + LOG_ERROR("%s: couldn't allocate %dkB for buffers\n",
28938 + __FUNCTION__, memory);
28939 + shrink_stripes(conf, conf->max_nr_stripes);
28942 + LOG_DETAILS("%s: allocated %dkB for md%d\n", __FUNCTION__,
28943 + memory, mdidx(mddev));
28946 + * Regenerate the "device is in sync with the raid set" bit for
28949 + for (i = 0; i < MD_SB_DISKS; i++) {
28950 + mark_disk_nonsync(sb->disks + i);
28951 + for (j = 0; j < sb->raid_disks; j++) {
28952 + if (!conf->disks[j].operational)
28954 + if (sb->disks[i].number == conf->disks[j].number)
28955 + mark_disk_sync(sb->disks + i);
28958 + sb->active_disks = conf->working_disks;
28960 + if (sb->active_disks == sb->raid_disks) {
28962 + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
28963 + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks,
28964 + sb->raid_disks, conf->algorithm);
28967 + ("%s: raid level %d set md%d active with %d out of %d devices, algorithm %d\n",
28968 + __FUNCTION__, conf->level, mdidx(mddev), sb->active_disks,
28969 + sb->raid_disks, conf->algorithm);
28972 + if (!start_recovery && !(sb->state & (1 << MD_SB_CLEAN))) {
28973 + const char *name = "evms_raid5syncd";
28975 + conf->resync_thread =
28976 + evms_cs_register_thread(raid5syncd, conf, name);
28977 + if (!conf->resync_thread) {
28978 + LOG_ERROR("%s: couldn't allocate thread for md%d\n",
28979 + __FUNCTION__, mdidx(mddev));
28984 + ("%s: raid set md%d not clean; reconstructing parity\n",
28985 + __FUNCTION__, mdidx(mddev));
28986 + conf->resync_parity = 1;
28987 + evms_cs_wakeup_thread(conf->resync_thread);
28990 + print_raid5_conf(conf);
28991 + if (start_recovery)
28992 + evms_md_recover_arrays();
28993 + print_raid5_conf(conf);
28995 + /* Ok, everything is just fine now */
28999 + print_raid5_conf(conf);
29000 + if (conf->stripe_hashtbl)
29001 + free_pages((unsigned long) conf->stripe_hashtbl,
29002 + HASH_PAGES_ORDER);
29005 + mddev->private = NULL;
29006 + LOG_WARNING("%s: failed to run raid set md%d\n", __FUNCTION__,
29008 + MOD_DEC_USE_COUNT;
29013 +raid5_stop_resync(mddev_t * mddev)
29015 + raid5_conf_t *conf = mddev_to_conf(mddev);
29016 + struct evms_thread *thread;
29018 + if (conf == NULL) {
29022 + thread = conf->resync_thread;
29025 + if (conf->resync_parity) {
29026 + conf->resync_parity = 2;
29027 + evms_cs_interrupt_thread(thread);
29029 + ("%s: parity resync was not fully finished, restarting next time.\n",
29039 +raid5_restart_resync(mddev_t * mddev)
29041 + raid5_conf_t *conf = mddev_to_conf(mddev);
29043 + if (conf->resync_parity) {
29044 + if (!conf->resync_thread) {
29048 + LOG_DEFAULT("%s: waking up raid5resync.\n", __FUNCTION__);
29049 + conf->resync_parity = 1;
29050 + evms_cs_wakeup_thread(conf->resync_thread);
29053 + LOG_DEFAULT("%s: no restart-resync needed.\n", __FUNCTION__);
29058 +raid5_stop(mddev_t * mddev)
29060 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29062 + if (conf != NULL) {
29063 + if (conf->resync_thread)
29064 + evms_cs_unregister_thread(conf->resync_thread);
29065 + evms_cs_unregister_thread(conf->thread);
29066 + shrink_stripes(conf, conf->max_nr_stripes);
29067 + free_pages((unsigned long) conf->stripe_hashtbl,
29068 + HASH_PAGES_ORDER);
29070 + mddev->private = NULL;
29072 + MOD_DEC_USE_COUNT;
29078 +print_sh(struct stripe_head *sh)
29082 + LOG_DEFAULT("sh %lu, size %d, pd_idx %d, state %ld.\n", sh->sector,
29083 + sh->size, sh->pd_idx, sh->state);
29084 + LOG_DEFAULT("sh %lu, count %d.\n", sh->sector,
29085 + atomic_read(&sh->count));
29086 + LOG_DEFAULT("sh %lu, ", sh->sector);
29087 + for (i = 0; i < MD_SB_DISKS; i++) {
29088 + if (sh->bh_cache[i])
29089 + LOG_DEFAULT("(cache%d: %p %ld) ", i, sh->bh_cache[i],
29090 + sh->bh_cache[i]->b_state);
29092 + LOG_DEFAULT("\n");
29096 +printall(raid5_conf_t * conf)
29098 + struct stripe_head *sh;
29101 + md_spin_lock_irq(&conf->device_lock);
29102 + for (i = 0; i < NR_HASH; i++) {
29103 + sh = conf->stripe_hashtbl[i];
29104 + for (; sh; sh = sh->hash_next) {
29105 + if (sh->raid_conf != conf)
29110 + md_spin_unlock_irq(&conf->device_lock);
29115 +raid5_status(char *page, mddev_t * mddev)
29117 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29118 + mdp_super_t *sb = mddev->sb;
29122 + sprintf(page + sz, " level %d, %dk chunk, algorithm %d", sb->level,
29123 + sb->chunk_size >> 10, sb->layout);
29125 + sprintf(page + sz, " [%d/%d] [", conf->raid_disks,
29126 + conf->working_disks);
29127 + for (i = 0; i < conf->raid_disks; i++)
29129 + sprintf(page + sz, "%s",
29130 + conf->disks[i].operational ? "U" : "_");
29131 + sz += sprintf(page + sz, "]");
29134 + sz += sprintf (page+sz, "<"#x":%d>", atomic_read(&conf->x))
29141 +print_raid5_conf(raid5_conf_t * conf)
29144 + struct disk_info *tmp;
29146 + LOG_DEFAULT("RAID5 conf printout:\n");
29148 + LOG_DEFAULT("(conf==NULL)\n");
29151 + LOG_DEFAULT(" --- rd:%d wd:%d fd:%d\n", conf->raid_disks,
29152 + conf->working_disks, conf->failed_disks);
29155 + for (i = 0; i < MD_SB_DISKS; i++) {
29157 + for (i = 0; i < conf->working_disks + conf->failed_disks; i++) {
29159 + tmp = conf->disks + i;
29160 + LOG_DEFAULT(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
29161 + i, tmp->spare, tmp->operational,
29162 + tmp->number, tmp->raid_disk, tmp->used_slot,
29163 + evms_md_partition_name(tmp->node));
29168 +raid5_diskop(mddev_t * mddev, mdp_disk_t ** d, int state)
29171 + int i, failed_disk = -1, spare_disk = -1, removed_disk = -1;
29172 + raid5_conf_t *conf = mddev->private;
29173 + struct disk_info *tmp, *sdisk, *fdisk, *rdisk;
29174 + mdp_super_t *sb = mddev->sb;
29175 + mdp_disk_t *failed_desc, *spare_desc;
29176 + mdk_rdev_t *spare_rdev, *failed_rdev;
29178 + print_raid5_conf(conf);
29179 + md_spin_lock_irq(&conf->device_lock);
29181 + * find the disk ...
29185 + case DISKOP_SPARE_ACTIVE:
29188 + * Find the failed disk within the RAID5 configuration ...
29189 + * (this can only be in the first conf->raid_disks part)
29191 + for (i = 0; i < conf->raid_disks; i++) {
29192 + tmp = conf->disks + i;
29193 + if ((!tmp->operational && !tmp->spare) ||
29194 + !tmp->used_slot) {
29200 + * When we activate a spare disk we _must_ have a disk in
29201 + * the lower (active) part of the array to replace.
29203 + if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
29208 + /* fall through */
29210 + case DISKOP_SPARE_WRITE:
29211 + case DISKOP_SPARE_INACTIVE:
29214 + * Find the spare disk ... (can only be in the 'high'
29215 + * area of the array)
29217 + for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
29218 + tmp = conf->disks + i;
29219 + if (tmp->spare && tmp->number == (*d)->number) {
29224 + if (spare_disk == -1) {
29231 + case DISKOP_HOT_REMOVE_SPARE:
29233 + for (i = 0; i < MD_SB_DISKS; i++) {
29234 + tmp = conf->disks + i;
29235 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
29236 + if (tmp->operational) {
29239 + } else if (!tmp->spare) {
29244 + removed_disk = i;
29248 + if (removed_disk == -1) {
29255 + case DISKOP_HOT_REMOVE_DISK:
29256 + for (i = 0; i < MD_SB_DISKS; i++) {
29257 + tmp = conf->disks + i;
29258 + if (tmp->used_slot && (tmp->number == (*d)->number)) {
29259 + if (i < conf->raid_disks) {
29260 + if (conf->working_disks !=
29261 + conf->raid_disks) {
29263 + * Can't remove a disk from an
29264 + * array that is running in
29270 + if (sb->spare_disks == 0) {
29272 + * Must have a spare ready
29273 + * before removing an active
29280 + removed_disk = i;
29284 + if (removed_disk == -1) {
29291 + case DISKOP_HOT_ADD_DISK:
29299 + * Switch the spare disk to write-only mode:
29301 + case DISKOP_SPARE_WRITE:
29302 + if (conf->spare) {
29307 + sdisk = conf->disks + spare_disk;
29308 + sdisk->operational = 1;
29309 + sdisk->write_only = 1;
29310 + conf->spare = sdisk;
29313 + * Deactivate a spare disk:
29315 + case DISKOP_SPARE_INACTIVE:
29316 + sdisk = conf->disks + spare_disk;
29317 + sdisk->operational = 0;
29318 + sdisk->write_only = 0;
29320 + * Was the spare being resynced?
29322 + if (conf->spare == sdisk)
29323 + conf->spare = NULL;
29326 + * Activate (mark read-write) the (now sync) spare disk,
 29327 + * which means we switch its 'raid position' (->raid_disk)
29328 + * with the failed disk. (only the first 'conf->raid_disks'
29329 + * slots are used for 'real' disks and we must preserve this
29332 + case DISKOP_SPARE_ACTIVE:
29333 + if (!conf->spare) {
29338 + sdisk = conf->disks + spare_disk;
29339 + fdisk = conf->disks + failed_disk;
29341 + spare_desc = &sb->disks[sdisk->number];
29342 + failed_desc = &sb->disks[fdisk->number];
29344 + if (spare_desc != *d) {
29350 + if (spare_desc->raid_disk != sdisk->raid_disk) {
29356 + if (sdisk->raid_disk != spare_disk) {
29362 + if (failed_desc->raid_disk != fdisk->raid_disk) {
29368 + if (fdisk->raid_disk != failed_disk) {
29375 + * do the switch finally
29377 + spare_rdev = evms_md_find_rdev_nr(mddev, spare_desc->number);
29378 + failed_rdev = evms_md_find_rdev_nr(mddev, failed_desc->number);
29380 + /* There must be a spare_rdev, but there may not be a
29381 + * failed_rdev. That slot might be empty...
29383 + spare_rdev->desc_nr = failed_desc->number;
29385 + failed_rdev->desc_nr = spare_desc->number;
29387 + xchg_values(*spare_desc, *failed_desc);
29388 + xchg_values(*fdisk, *sdisk);
29391 + * (careful, 'failed' and 'spare' are switched from now on)
29393 + * we want to preserve linear numbering and we want to
29394 + * give the proper raid_disk number to the now activated
29395 + * disk. (this means we switch back these values)
29398 + xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
29399 + xchg_values(sdisk->raid_disk, fdisk->raid_disk);
29400 + xchg_values(spare_desc->number, failed_desc->number);
29401 + xchg_values(sdisk->number, fdisk->number);
29403 + *d = failed_desc;
29405 + //if (sdisk->dev == MKDEV(0,0))
29406 + if (sdisk->node == NULL)
29407 + sdisk->used_slot = 0;
29410 + * this really activates the spare.
29412 + fdisk->spare = 0;
29413 + fdisk->write_only = 0;
29416 + * if we activate a spare, we definitely replace a
29417 + * non-operational disk slot in the 'low' area of
29418 + * the disk array.
29420 + conf->failed_disks--;
29421 + conf->working_disks++;
29422 + conf->spare = NULL;
29426 + case DISKOP_HOT_REMOVE_SPARE:
29427 + rdisk = conf->disks + removed_disk;
29429 + if (rdisk->spare && (removed_disk < conf->raid_disks)) {
29434 + if (conf->spare != NULL) {
29435 + if (conf->spare->number == removed_disk) {
29436 + conf->spare = NULL;
29440 + rdisk->dev = MKDEV(0, 0);
29441 + rdisk->node = NULL;
29442 + rdisk->used_slot = 0;
29446 + case DISKOP_HOT_REMOVE_DISK:
29447 + rdisk = conf->disks + removed_disk;
29448 + if (rdisk->operational) {
29449 + /* We're removing a running disk in the array. */
29450 + conf->working_disks--;
29451 + conf->failed_disks++;
29453 + rdisk->dev = MKDEV(0, 0);
29454 + rdisk->node = NULL;
29455 + rdisk->used_slot = 0;
29456 + rdisk->operational = 0;
29465 + md_spin_unlock_irq(&conf->device_lock);
29466 + print_raid5_conf(conf);
29471 +raid5_bmap(mddev_t * mddev,
29473 + struct evms_logical_node ** node)
29475 + raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
29476 + const unsigned int raid_disks = conf->raid_disks;
29477 + const unsigned int data_disks = raid_disks - 1;
29478 + unsigned int dd_idx, pd_idx;
29480 + *rsector = (u64) raid5_compute_sector( (unsigned long) *rsector,
29481 + raid_disks, data_disks,
29482 + &dd_idx, &pd_idx, conf);
29483 + *node = conf->disks[dd_idx].node;
29484 + return 0; /* always successful */
29488 +raid5_evms_ioctl(mddev_t * mddev,
29489 + struct inode *inode,
29490 + struct file *file, unsigned int cmd, unsigned long arg)
29493 + struct evms_logical_node *node;
29496 + case EVMS_GET_BMAP:
29498 + struct evms_get_bmap_pkt *bmap = (struct evms_get_bmap_pkt *)arg;
29499 + rc = raid5_bmap(mddev, &bmap->rsector, &node);
29502 + rc = IOCTL(node, inode, file, cmd, arg);
29515 +#define MAX_IO_SIZE 128
29517 +raid5_pers_ioctl(mddev_t * mddev, int cmd, void *args)
29521 + struct r5_sync_io init_io_args;
29523 + int io_size = MAX_IO_SIZE;
29525 + LOG_DETAILS("%s: cmd == %d.\n", __FUNCTION__, cmd);
29527 + case EVMS_MD_RAID5_INIT_IO:
29529 + if (copy_from_user
29530 + (&init_io_args, (struct r5_sync_io *) args,
29531 + sizeof (init_io_args))) {
 29534 + /* allocate an I/O buffer of up to 64 Kbytes in size */
29535 + if (init_io_args.nr_sects < MAX_IO_SIZE)
29536 + io_size = init_io_args.nr_sects;
29538 + /* allocate buffer large enough to hold a single sector */
29539 + data = kmalloc(io_size << EVMS_VSECTOR_SIZE_SHIFT, GFP_KERNEL);
29543 + u64 io_sector_offset, io_remaining;
29545 + u_char *user_buffer_ptr;
29547 + io_remaining = init_io_args.nr_sects;
29548 + io_sector_offset = 0;
29549 + user_buffer_ptr = init_io_args.data;
29550 + while (io_remaining) {
29551 + /* compute the io_size for this pass */
29552 + io_size = (io_remaining >= MAX_IO_SIZE) ?
29553 + MAX_IO_SIZE : io_remaining;
29555 + io_bytes = io_size << EVMS_VSECTOR_SIZE_SHIFT;
29556 + if (init_io_args.rw == WRITE) {
29557 + if (copy_from_user(data,
29565 + rc = evms_md_sync_io(mddev->node,
29567 + init_io_args.lsn +
29568 + io_sector_offset, io_size,
29574 + if (init_io_args.rw != WRITE) {
29575 + if (copy_to_user(user_buffer_ptr,
29582 + user_buffer_ptr += io_bytes;
29583 + io_sector_offset += io_size;
29584 + io_remaining -= io_size;
29596 +static mdk_personality_t raid5_personality = {
29597 + .name = "evms_raid5",
29598 + .read = raid5_read,
29599 + .write = raid5_write,
29600 + .run = raid5_run,
29601 + .stop = raid5_stop,
29602 + .status = raid5_status,
29603 + .error_handler = raid5_error,
29604 + .diskop = raid5_diskop,
29605 + .stop_resync = raid5_stop_resync,
29606 + .restart_resync = raid5_restart_resync,
29607 + .sync_request = raid5_sync_request,
29608 + .evms_ioctl = raid5_evms_ioctl,
29609 + .md_pers_ioctl = raid5_pers_ioctl
29612 +static int md__init
29615 + return evms_register_md_personality(RAID5, &raid5_personality);
29621 + evms_unregister_md_personality(RAID5);
29624 +module_init(raid5_init);
29625 +module_exit(raid5_exit);
29626 +#ifdef MODULE_LICENSE
29627 +MODULE_LICENSE("GPL");
29629 diff -Naur linux-2002-09-30/drivers/evms/md_xor.c evms-2002-09-30/drivers/evms/md_xor.c
29630 --- linux-2002-09-30/drivers/evms/md_xor.c Wed Dec 31 18:00:00 1969
29631 +++ evms-2002-09-30/drivers/evms/md_xor.c Fri Mar 1 11:50:58 2002
29634 + * md_xor.c : Multiple Devices driver for Linux
29636 + * Copyright (C) 1996, 1997, 1998, 1999, 2000,
29637 + * Ingo Molnar, Matti Aarnio, Jakub Jelinek, Richard Henderson.
29639 + * Dispatch optimized RAID-5 checksumming functions.
29641 + * 'md_xor.c' is an EVMS version of linux/drivers/md/xor.c modified
29642 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
29644 + * This program is free software; you can redistribute it and/or modify
29645 + * it under the terms of the GNU General Public License as published by
29646 + * the Free Software Foundation; either version 2, or (at your option)
29647 + * any later version.
29649 + * You should have received a copy of the GNU General Public License
29650 + * (for example /usr/src/linux/COPYING); if not, write to the Free
29651 + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29654 +#define BH_TRACE 0
29655 +#include <linux/module.h>
29656 +#include <linux/evms/evms_md.h>
29657 +#include <linux/evms/evms_xor.h>
29658 +#include <asm/xor.h>
29660 +#define LOG_PREFIX "md raid5: "
29661 +/* The xor routines to use. */
29662 +static struct xor_block_template *active_template;
29665 +evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr)
29667 + unsigned long *p0, *p1, *p2, *p3, *p4;
29668 + unsigned long bytes = bh_ptr[0]->b_size;
29670 + p0 = (unsigned long *) bh_ptr[0]->b_data;
29671 + p1 = (unsigned long *) bh_ptr[1]->b_data;
29672 + if (count == 2) {
29673 + active_template->do_2(bytes, p0, p1);
29677 + p2 = (unsigned long *) bh_ptr[2]->b_data;
29678 + if (count == 3) {
29679 + active_template->do_3(bytes, p0, p1, p2);
29683 + p3 = (unsigned long *) bh_ptr[3]->b_data;
29684 + if (count == 4) {
29685 + active_template->do_4(bytes, p0, p1, p2, p3);
29689 + p4 = (unsigned long *) bh_ptr[4]->b_data;
29690 + active_template->do_5(bytes, p0, p1, p2, p3, p4);
29693 +/* Set of all registered templates. */
29694 +static struct xor_block_template *template_list;
29696 +#define BENCH_SIZE (PAGE_SIZE)
29699 +do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
29702 + unsigned long now;
29703 + int i, count, max;
29705 + tmpl->next = template_list;
29706 + template_list = tmpl;
29709 + * Count the number of XORs done during a whole jiffy, and use
29710 + * this to calculate the speed of checksumming. We use a 2-page
29711 + * allocation to have guaranteed color L1-cache layout.
29714 + for (i = 0; i < 5; i++) {
29717 + while (jiffies == now) {
29719 + tmpl->do_2(BENCH_SIZE, b1, b2);
29728 + speed = max * (HZ * BENCH_SIZE / 1024);
29729 + tmpl->speed = speed;
29731 + LOG_DEFAULT(" %-10s: %5d.%03d MB/sec\n", tmpl->name,
29732 + speed / 1000, speed % 1000);
29736 +calibrate_xor_block(void)
29739 + struct xor_block_template *f, *fastest;
29741 + b1 = (void *) md__get_free_pages(GFP_KERNEL, 2);
29743 + LOG_ERROR("Yikes! No memory available.\n");
29746 + b2 = b1 + 2*PAGE_SIZE + BENCH_SIZE;
29748 + LOG_DEFAULT("measuring checksumming speed\n");
29751 +#define xor_speed(templ) do_xor_speed((templ), b1, b2)
29753 + XOR_TRY_TEMPLATES;
29757 + free_pages((unsigned long)b1, 2);
29759 + fastest = template_list;
29760 + for (f = fastest; f; f = f->next)
29761 + if (f->speed > fastest->speed)
29764 +#ifdef XOR_SELECT_TEMPLATE
29765 + fastest = XOR_SELECT_TEMPLATE(fastest);
29768 + active_template = fastest;
29769 + LOG_DEFAULT("using function: %s (%d.%03d MB/sec)\n",
29770 + fastest->name, fastest->speed / 1000, fastest->speed % 1000);
29775 +MD_EXPORT_SYMBOL(evms_md_xor_block);
29777 +#ifdef MODULE_LICENSE
29778 +MODULE_LICENSE("GPL");
29781 +module_init(calibrate_xor_block);
29782 diff -Naur linux-2002-09-30/drivers/evms/os2lvm_vge.c evms-2002-09-30/drivers/evms/os2lvm_vge.c
29783 --- linux-2002-09-30/drivers/evms/os2lvm_vge.c Wed Dec 31 18:00:00 1969
29784 +++ evms-2002-09-30/drivers/evms/os2lvm_vge.c Fri Sep 13 16:09:55 2002
29788 + * Copyright (c) International Business Machines Corp., 2001
29790 + * This program is free software; you can redistribute it and/or modify
29791 + * it under the terms of the GNU General Public License as published by
29792 + * the Free Software Foundation; either version 2 of the License, or
29793 + * (at your option) any later version.
29795 + * This program is distributed in the hope that it will be useful,
29796 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
29797 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
29798 + * the GNU General Public License for more details.
29800 + * You should have received a copy of the GNU General Public License
29801 + * along with this program; if not, write to the Free Software
29802 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29807 + * linux/drivers/evms/os2lvm_vge.c
29809 + * EVMS OS/2 LVM Emulator
29811 + * This Volume Group Emulator will take the type 0x35 partitions created by
29812 + * OS/2 versions 4.5 and later and build them into volumes. It emulates
29813 + * the Drive Linking and Bad Block Relocation features and therefore
29814 + * provides binary compatibility with the OS/2 version. Of course, if
29815 + * you select to mkfs a file system OS/2 doesn't support, you're on your
29818 + * Since OS/2 LVM volumes can only exist on DOS-style partitioned disks,
29819 + * this VGE has a dependency on dospart.c to report a list of the
29820 + * candidate partitions. This module will then take the appropriate partitions
29821 + * from the list and use them to build the OS/2-style volumes.
29823 + * Change Activity:
29825 + * 7/01/2001 John Stiles getting started.
29826 + * 9/14/2001 John Stiles original version.
29827 + * 11/01/2001 John Stiles new naming scheme.
29828 + * 11/21/2001 John Stiles i/o path changes.
29831 +#define EVMS_DEBUG 1
29832 +#define EVMS_OS2_DEBUG 1
29834 +#include <linux/module.h>
29835 +#include <linux/kernel.h>
29836 +#include <linux/config.h>
29837 +#include <linux/genhd.h>
29838 +#include <linux/string.h>
29839 +#include <linux/blk.h>
29840 +#include <linux/init.h>
29841 +#include <linux/evms/evms.h>
29842 +#include <linux/evms/evms_os2.h>
29843 +#include <asm/uaccess.h>
29844 +#include <asm/atomic.h>
29846 +#define LOG_PREFIX "os2lvm: "
29848 +// Global Structure and Type definitions
29849 +struct transfer_record {
29850 + int Write_Flag; /* 0 = read, 1 = write */
29851 + struct os2_dl_entry *Partition_Data;
29852 + struct buffer_head *bh;
29853 + struct transfer_record *next;
29856 +struct tracking_record { /* structure used to track IO requests that must be broken into two pieces due to drive linking */
29857 + unsigned int io_in_progress;
29859 + struct buffer_head *org_bh; /* Original IO */
29860 + struct buffer_head *link1_bh; /* First child. */
29861 + struct os2_dl_entry *link1_data;
29862 + struct transfer_record *link1_transfer_rec;
29863 + int link1_bbr_attempted;
29864 + struct buffer_head *link2_bh; /* Second child */
29865 + struct os2_dl_entry *link2_data;
29866 + struct transfer_record *link2_transfer_rec;
29867 + int link2_bbr_attempted;
29870 +// Prototypes for local VGE functions
29871 +static int discover_os2lvm_partitions(struct evms_logical_node **);
29872 +static struct evms_logical_node *find_os2_volume(u32);
29873 +static int add_os2link(struct os2_dl_entry *,
29874 + struct evms_logical_node *);
29875 +static struct os2_dl_entry
29876 + *find_link_data(struct os2_dl_entry **, u32);
29877 +static int find_drive_link(struct evms_logical_node *,
29878 + struct os2_dl_entry **, u64 *, u64 *);
29879 +static int validate_signaturesector(struct evms_logical_node *,
29880 + LVM_Signature_Sector *, u32);
29881 +static int validate_drivelinksector(void *, int, u32);
29882 +static int validate_bbrtablesector(void *, int, u32);
29883 +static u32 check_for_os2_bbr_relocations(char *);
29884 +static int check_os2_volumes(struct evms_logical_node **);
29885 +static int OS2_ioctl_cmd_broadcast(struct evms_logical_node *node,
29886 + struct inode *inode, struct file *file,
29887 + unsigned long cmd, unsigned long arg);
29888 +static int os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
29889 + struct inode *inode, struct file *file,
29890 + unsigned long cmd, unsigned long arg);
29891 +static void BBR_Worker(void *);
29892 +static void OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record,
29893 + struct buffer_head *bh,
29894 + int uptodate, int *redrive);
29895 +static void BBR_Transfer_IO(struct transfer_record * Transfer_Record);
29896 +static void OS2_DL_Callback(struct buffer_head *bh, int uptodate);
29897 +static int Sector_Is_Remapped(struct os2_dl_entry * io_dlentry,
29898 + u64 Source_Sector, u64 * Replacement_Sector);
29899 +static void Invalidate_Mapping(struct os2_dl_entry * io_dlentry,
29900 + u64 Source_Sector,
29901 + int Replacement_Sector_Is_Bad);
29902 +static int Create_New_BBR_Table_Entry(struct os2_dl_entry *
29903 + io_dlentry, u64 starting_lsn,
29904 + unsigned int count, void *buffer);
29905 +static void Clone_Bufferhead(struct buffer_head *Source,
29906 + struct buffer_head *Child);
29908 +// Prototypes for local memory allocation/deallocation functions
29909 +static struct os2_dl_entry *new_os2_drive_link(LVM_Signature_Sector *,
29911 + evms_logical_node *);
29912 +static char *new_os2_link_data(u32, u32, u32, struct evms_logical_node *);
29913 +static char *new_os2_bbr_data(u32, u32, u32, struct evms_logical_node *);
29914 +static struct evms_logical_node *new_os2volume(u32, char *);
29915 +static int delete_os2lvm_volume(struct evms_logical_node *);
29916 +static int delete_os2_drive_link(struct os2_dl_entry *, int);
29918 +// Prototypes for Function Table interface
29919 +static int discover_os2lvm(struct evms_logical_node **);
29920 +static int delete_os2lvm(struct evms_logical_node *);
29921 +static void read_os2lvm(struct evms_logical_node *, struct buffer_head *);
29922 +static void write_os2lvm(struct evms_logical_node *, struct buffer_head *);
29923 +static int init_io_os2lvm(struct evms_logical_node *, int, u64, u64, void *);
29924 +static int ioctl_os2lvm(struct evms_logical_node *, struct inode *,
29925 + struct file *, unsigned int, unsigned long);
29926 +static int do_os2_bbr_io(struct os2_dl_entry *, int, u64, u64,
29929 +// Global data structures
29930 +static struct evms_logical_node *os2lvm_nodes = NULL;
29931 +static struct evms_thread *BBR_Worker_Thread = NULL;
29932 +static spinlock_t BBR_Queue_Lock = SPIN_LOCK_UNLOCKED;
29933 +static const char *BBR_Worker_Name = "evms_os2_bbr_io";
29934 +static struct transfer_record *BBR_IO_List_Head = NULL;
29935 +static struct transfer_record *BBR_IO_List_Tail = NULL;
29936 +static struct evms_pool_mgmt *BBR_Transfer_Pool = NULL;
29937 +static char *BBR_Transfer_Pool_Name = "OS-2 Transfer Pool";
29938 +static char *DL_Tracking_Pool_Name = "OS-2 Tracking Pool";
29939 +static struct evms_pool_mgmt *DL_Tracking_Pool = NULL;
29941 +// Required plug-in Function Table definition
29942 +static struct evms_plugin_fops function_table = {
29943 + .discover = discover_os2lvm,
29944 + .delete = delete_os2lvm,
29945 + .read = read_os2lvm,
29946 + .write = write_os2lvm,
29947 + .init_io = init_io_os2lvm,
29948 + .ioctl = ioctl_os2lvm
29951 +// Required plug-in Header definition
29952 +static struct evms_plugin_header plugin_header = {
29953 + .id = SetPluginID(IBM_OEM_ID,
29954 + EVMS_REGION_MANAGER,
29961 + .required_services_version = {
29962 + .major = EVMS_COMMON_SERVICES_MAJOR,
29963 + .minor = EVMS_COMMON_SERVICES_MINOR,
29964 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
29966 + .fops = &function_table
29969 +// Required Plugin Functions
29972 + * Function: discover_os2lvm
29974 + * This is the entry point into the discovery process.
29977 +discover_os2lvm(struct evms_logical_node **evms_partition_list)
29981 + MOD_INC_USE_COUNT;
29983 + if (!BBR_Transfer_Pool) {
29984 + BBR_Transfer_Pool =
29985 + evms_cs_create_pool(sizeof (struct transfer_record),
29986 + BBR_Transfer_Pool_Name, NULL, NULL);
29987 + if (!BBR_Transfer_Pool) {
29988 + MOD_DEC_USE_COUNT;
29993 + if (!DL_Tracking_Pool) {
29994 + DL_Tracking_Pool =
29995 + evms_cs_create_pool(sizeof (struct tracking_record),
29996 + DL_Tracking_Pool_Name, NULL, NULL);
29997 + if (!DL_Tracking_Pool) {
29998 + MOD_DEC_USE_COUNT;
30003 + rc = discover_os2lvm_partitions(evms_partition_list);
30006 + rc = check_os2_volumes(evms_partition_list);
30009 + MOD_DEC_USE_COUNT;
30014 + * Function: delete_os2lvm
30016 + * This is the entry point for deleting a node.
30019 +delete_os2lvm(struct evms_logical_node *logical_node)
30021 + LOG_EXTRA("Deleting volume: %s\n", logical_node->name);
30023 + return delete_os2lvm_volume(logical_node);
30027 + * Function: read_os2lvm
30030 +read_os2lvm(struct evms_logical_node *node, struct buffer_head *bh)
30033 + u64 sector_count;
30035 + struct buffer_head *Link1 = NULL;
30036 + struct buffer_head *Link2 = NULL;
30037 + struct tracking_record *Tracking_Record = NULL;
30038 + struct os2_dl_entry *cur_dlentry = NULL;
30039 + struct transfer_record *Transfer_Record;
30041 + rsector = bh->b_rsector;
30042 + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
30043 + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count);
30044 + bh->b_rsector = rsector;
30047 + if (cur_dlentry->bbr_is_active) {
30048 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30049 + /* Transfer the IO to the BBR Worker Thread. */
30050 + Transfer_Record->Write_Flag = 0;
30051 + Transfer_Record->Partition_Data = cur_dlentry;
30052 + Transfer_Record->bh = bh;
30053 + Transfer_Record->next = NULL;
30054 + BBR_Transfer_IO(Transfer_Record);
30056 + R_IO(cur_dlentry->link_partition, bh);
30059 + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
30060 + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */
30061 + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30062 + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30064 + /* Initialize the tracking record so we can associate the two new I/Os with the original. */
30065 + Tracking_Record->io_in_progress = 2;
30066 + Tracking_Record->up_to_date = 0;
30067 + Tracking_Record->org_bh = bh;
30069 + /* Create the I/O to the first link. */
30070 + Clone_Bufferhead(bh, Link1);
30071 + Link1->b_private = Tracking_Record;
30072 + Link1->b_end_io = OS2_DL_Callback;
30073 + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30074 + Tracking_Record->link1_bh = Link1;
30075 + Tracking_Record->link1_data = cur_dlentry;
30076 + Tracking_Record->link1_bbr_attempted = 0;
30077 + Tracking_Record->link1_transfer_rec = NULL;
30079 + /* Create the I/O to the second link */
30080 + Clone_Bufferhead(bh, Link2);
30081 + Link2->b_private = Tracking_Record;
30082 + Link2->b_end_io = OS2_DL_Callback;
30083 + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30084 + Link2->b_rsector = 0;
30086 + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT);
30087 + Tracking_Record->link2_bh = Link2;
30088 + Tracking_Record->link2_data = cur_dlentry->next;
30089 + Tracking_Record->link2_bbr_attempted = 0;
30090 + Tracking_Record->link2_transfer_rec = NULL;
30092 + /* Process the I/O to the first link. */
30093 + if (cur_dlentry->bbr_is_active) {
30094 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30095 + /* Transfer the IO to the BBR Worker Thread. */
30096 + Transfer_Record->Write_Flag = 0;
30097 + Transfer_Record->Partition_Data = cur_dlentry;
30098 + Transfer_Record->bh = Tracking_Record->link1_bh;
30099 + Transfer_Record->next = NULL;
30100 + BBR_Transfer_IO(Transfer_Record);
30102 + R_IO(cur_dlentry->link_partition,
30103 + Tracking_Record->link1_bh);
30105 + /* Process the I/O to the second link. */
30106 + cur_dlentry = cur_dlentry->next;
30107 + if (cur_dlentry->bbr_is_active) {
30108 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30109 + /* Transfer the IO to the BBR Worker Thread. */
30110 + Transfer_Record->Write_Flag = 0;
30111 + Transfer_Record->Partition_Data = cur_dlentry;
30112 + Transfer_Record->bh = Tracking_Record->link2_bh;
30113 + Transfer_Record->next = NULL;
30114 + BBR_Transfer_IO(Transfer_Record);
30116 + R_IO(cur_dlentry->link_partition,
30117 + Tracking_Record->link2_bh);
30121 + LOG_SERIOUS("READ error, request exceeds volume size.\n");
30122 + bh->b_end_io(bh, 0);
30128 + * Function: write_os2lvm
30131 +write_os2lvm(struct evms_logical_node *node, struct buffer_head *bh)
30135 + u64 sector_count;
30136 + struct buffer_head *Link1 = NULL;
30137 + struct buffer_head *Link2 = NULL;
30138 + struct tracking_record *Tracking_Record = NULL;
30139 + struct os2_dl_entry *cur_dlentry = NULL;
30140 + struct transfer_record *Transfer_Record;
30142 + rsector = bh->b_rsector;
30143 + sector_count = bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT;
30144 + rc = find_drive_link(node, &cur_dlentry, &rsector, §or_count);
30145 + bh->b_rsector = rsector;
30148 + /* Set up a Transfer Record. If there are Bad Blocks on the partition that this I/O is
30149 + directed to, then we will need the Transfer Record to put the I/O in the queue for the
30150 + BBR Worker Thread. If there are no bad blocks, then we will need the Transfer Record
30151 + for the OS2_BBR_Write_Callback function. This function expects the Transfer Record to
30152 + be pre-allocated and available because it is running on an interrupt thread and should
30153 + not do memory allocation. If there is an error during the write, then the
30154 + OS2_BBR_Write_Callback function will use the Transfer Record to transfer the I/O
30155 + to the BBR worker thread for further processing. If there are no errors during the I/O,
30156 + then the OS2_BBR_Write_Callback will deallocate the Transfer Record. */
30157 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30158 + Transfer_Record->Write_Flag = 1;
30159 + Transfer_Record->Partition_Data = cur_dlentry;
30160 + Transfer_Record->bh = bh;
30161 + Transfer_Record->next = NULL;
30162 + if (cur_dlentry->bbr_is_active) {
30163 + /* Transfer the IO to the BBR Worker Thread. */
30164 + BBR_Transfer_IO(Transfer_Record);
30166 + evms_cs_register_for_end_io_notification
30167 + (Transfer_Record, bh, OS2_BBR_Write_Callback);
30168 + W_IO(cur_dlentry->link_partition, bh);
30172 + /* We must split the IO. Duplicate the buffer head twice and allocate the tracking record. */
30173 + Tracking_Record = evms_cs_allocate_from_pool(DL_Tracking_Pool, 1); /* Block until we get a tracking record. */
30174 + Link1 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30175 + Link2 = evms_cs_allocate_from_pool(evms_bh_pool, 1);
30177 + /* Initialize the tracking record so we can associate the two new I/Os with the original. */
30178 + Tracking_Record->io_in_progress = 2;
30179 + Tracking_Record->up_to_date = 0;
30180 + Tracking_Record->org_bh = bh;
30182 + /* Create the I/O to the first link. */
30183 + Clone_Bufferhead(bh, Link1);
30184 + Link1->b_private = Tracking_Record;
30185 + Link1->b_end_io = OS2_DL_Callback;
30186 + Link1->b_size = sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30187 + Tracking_Record->link1_bh = Link1;
30188 + Tracking_Record->link1_data = cur_dlentry;
30190 + /* Create the I/O to the second link */
30191 + Clone_Bufferhead(bh, Link2);
30192 + Link2->b_private = Tracking_Record;
30193 + Link2->b_end_io = OS2_DL_Callback;
30194 + Link2->b_data += sector_count << EVMS_VSECTOR_SIZE_SHIFT;
30195 + Link2->b_rsector = 0;
30197 + bh->b_size - (sector_count << EVMS_VSECTOR_SIZE_SHIFT);
30198 + Tracking_Record->link2_bh = Link2;
30199 + Tracking_Record->link2_data = cur_dlentry->next;
30201 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30202 + Transfer_Record->Write_Flag = 1;
30203 + Transfer_Record->Partition_Data = cur_dlentry;
30204 + Transfer_Record->bh = Tracking_Record->link1_bh;
30205 + Transfer_Record->next = NULL;
30206 + Tracking_Record->link1_transfer_rec = Transfer_Record;
30207 + /* Process the I/O to the first link. */
30208 + if (cur_dlentry->bbr_is_active) {
30209 + /* Transfer the IO to the BBR Worker Thread. */
30210 + Tracking_Record->link1_bbr_attempted = 1;
30211 + BBR_Transfer_IO(Transfer_Record);
30213 + Tracking_Record->link1_bbr_attempted = 0;
30214 + W_IO(cur_dlentry->link_partition,
30215 + Tracking_Record->link1_bh);
30218 + /* Process the I/O to the second link. */
30219 + cur_dlentry = cur_dlentry->next;
30220 + Transfer_Record = evms_cs_allocate_from_pool(BBR_Transfer_Pool, 1); /* Block until we get a transfer record. */
30221 + Transfer_Record->Write_Flag = 1;
30222 + Transfer_Record->Partition_Data = cur_dlentry;
30223 + Transfer_Record->bh = Tracking_Record->link2_bh;
30224 + Transfer_Record->next = NULL;
30225 + Tracking_Record->link2_transfer_rec = Transfer_Record;
30226 + if (cur_dlentry->bbr_is_active) {
30227 + /* Transfer the IO to the BBR Worker Thread. */
30228 + Tracking_Record->link2_bbr_attempted = 1;
30229 + BBR_Transfer_IO(Transfer_Record);
30231 + Tracking_Record->link2_bbr_attempted = 0;
30232 + W_IO(cur_dlentry->link_partition,
30233 + Tracking_Record->link2_bh);
30238 + LOG_SERIOUS("WRITE error, request exceeds volume size.\n");
30239 + bh->b_end_io(bh, 0);
30245 +os2_ioctl_cmd_plugin_ioctl(struct evms_logical_node *node,
30246 + struct inode *inode,
30247 + struct file *file,
30248 + unsigned long cmd, unsigned long arg)
30251 + os2_volume_runtime_entry_t *Node_Data;
30252 + struct os2_dl_entry *curlink, *nextlink;
30253 + struct evms_plugin_ioctl_pkt tmp, *user_parms;
30255 + user_parms = (struct evms_plugin_ioctl_pkt *) arg;
30256 + /* copy user's parameters to kernel space */
30257 + if (copy_from_user(&tmp, user_parms, sizeof (tmp)))
30261 + Node_Data = (os2_volume_runtime_entry_t *) node->private;
30262 + /* is this cmd targetted at this feature ? */
30263 + if (tmp.feature_id == node->plugin->id) {
30264 + switch (tmp.feature_command) {
30268 + } else { /* broadcast this cmd to all children */
30269 + curlink = Node_Data->drive_link;
30271 + /* broadcast this cmd to all children */
30272 + while (curlink) {
30273 + nextlink = curlink->next;
30275 + rc = IOCTL(curlink->link_partition, inode, file,
30281 + curlink = nextlink;
30285 + /* copy info to userspace */
30286 + if (copy_to_user(user_parms, &tmp, sizeof (tmp)))
30293 +OS2_ioctl_cmd_broadcast(struct evms_logical_node *node,
30294 + struct inode *inode,
30295 + struct file *file, unsigned long cmd, unsigned long arg)
30298 + os2_volume_runtime_entry_t *Node_Data;
30299 + struct os2_dl_entry *curlink, *nextlink;
30301 + Node_Data = (os2_volume_runtime_entry_t *) node->private;
30302 + curlink = Node_Data->drive_link;
30304 + /* broadcast this cmd to all children */
30305 + while (curlink) {
30306 + nextlink = curlink->next;
30308 + rc |= IOCTL(curlink->link_partition, inode, file, cmd, arg);
30310 + curlink = nextlink;
30317 + * Function: ioctl_os2lvm
30320 +ioctl_os2lvm(struct evms_logical_node *logical_node,
30321 + struct inode *inode,
30322 + struct file *file, unsigned int cmd, unsigned long arg)
30325 + u64 Sectors_Per_Cylinder;
30326 + u64 Total_Sectors;
30327 + struct evms_logical_node *partition_node;
30330 + ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link->
30336 + LOG_EVERYTHING("Ioctl %d\n", cmd);
30339 + case HDIO_GETGEO:
30341 + // Return fake geometry
30342 + struct hd_geometry *hd = (struct hd_geometry *) arg;
30344 + unsigned char heads = 255;
30345 + unsigned char sectors =
30346 + OS2LVM_SYNTHETIC_SECTORS_PER_TRACK;
30349 + /* OS/2 always created a fake geometry using the maximum cylinder size. */
30350 + Sectors_Per_Cylinder = heads * sectors;
30351 + for (cylinders = 0, Total_Sectors = 0;
30353 + ((os2_volume_runtime_entry_t *) logical_node->
30354 + private)->size_in_sectors; cylinders++)
30355 + Total_Sectors += Sectors_Per_Cylinder;
30360 + ((short *) (&hd->cylinders), &cylinders,
30361 + sizeof (cylinders))
30362 + || copy_to_user((char *) (&hd->heads), &heads,
30364 + || copy_to_user((char *) (&hd->sectors), §ors,
30365 + sizeof (sectors))
30366 + || copy_to_user((long *) (&hd->start), &start,
30367 + sizeof (start))) {
30373 + case EVMS_GET_BMAP:
30374 + // No kernel images allowed on OS/2 volumes right now.
30378 + case EVMS_QUIESCE_VOLUME:
30379 + case EVMS_GET_DISK_LIST:
30380 + case EVMS_CHECK_MEDIA_CHANGE:
30381 + case EVMS_REVALIDATE_DISK:
30382 + case EVMS_OPEN_VOLUME:
30383 + case EVMS_CLOSE_VOLUME:
30384 + case EVMS_CHECK_DEVICE_STATUS:
30385 + rc = OS2_ioctl_cmd_broadcast(logical_node, inode, file, cmd,
30388 + case EVMS_PLUGIN_IOCTL:
30389 + rc = os2_ioctl_cmd_plugin_ioctl(logical_node, inode, file, cmd,
30401 + * Function: init_io_os2lvm
30404 +init_io_os2lvm(struct evms_logical_node *node, int io_flag, /* 0=read, 1=write */
30405 + u64 sect_nr, /* disk LBA */
30406 + u64 num_sects, /* # of sectors */
30408 +{ /* buffer address */
30410 + u64 sector_count;
30411 + struct evms_logical_node *partition_node;
30412 + struct os2_dl_entry *cur_dlentry = NULL;
30414 + sector_count = num_sects;
30415 + rc = find_drive_link(node, &cur_dlentry, §_nr, §or_count);
30418 + partition_node = cur_dlentry->link_partition;
30419 + if (cur_dlentry->bbr_is_active)
30420 + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr,
30421 + num_sects, buf_addr);
30423 + rc = INIT_IO(partition_node, io_flag, sect_nr,
30424 + num_sects, buf_addr);
30425 + if (rc && io_flag) {
30426 + cur_dlentry->bbr_is_active = 1;
30427 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30428 + sect_nr, num_sects,
30434 + partition_node = cur_dlentry->link_partition;
30435 + if (cur_dlentry->bbr_is_active)
30436 + rc = do_os2_bbr_io(cur_dlentry, io_flag, sect_nr,
30437 + sector_count, buf_addr);
30439 + rc = INIT_IO(partition_node, io_flag, sect_nr,
30440 + sector_count, buf_addr);
30441 + if (rc && io_flag) {
30442 + cur_dlentry->bbr_is_active = 1;
30443 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30444 + sect_nr, sector_count,
30450 + cur_dlentry = cur_dlentry->next;
30451 + partition_node = cur_dlentry->link_partition;
30452 + num_sects -= sector_count;
30453 + buf_addr += sector_count << OS2_SECTOR_SHIFT;
30455 + if (cur_dlentry->bbr_is_active)
30456 + rc = do_os2_bbr_io(cur_dlentry, io_flag, 0,
30457 + num_sects, buf_addr);
30459 + rc = INIT_IO(partition_node, io_flag, 0,
30460 + num_sects, buf_addr);
30461 + if (rc && io_flag) {
30462 + cur_dlentry->bbr_is_active = 1;
30463 + rc = do_os2_bbr_io(cur_dlentry, io_flag,
30472 + LOG_SERIOUS("INITIO error, request exceeds volume size.\n");
30480 + * Function: do_os2_bbr_io
30482 + * Check the Bad Block Relocation list for relocated sectors. If any are found,
30483 + * this function will do the i/o directly.
30484 + * Return values: 0 == i/o done, 1 == unable to complete i/o
30487 +do_os2_bbr_io(struct os2_dl_entry * io_dlentry, int rw, /* 0=read, 1=write */
30488 + u64 starting_lsn, /* disk LBA */
30489 + u64 count, /* # of sectors */
30491 +{ /* buffer address */
30492 + u64 lsn, remapped_lsn;
30495 + // For each sector in this request, check if this sector has already
30496 + // been remapped. If so, process all previous sectors in this request,
30497 + // followed by the remapped sector. Then reset the starting lsn and
30498 + // count and keep going with the rest of the request as if it were
30499 + // a whole new request.
30500 + for (lsn = 0; lsn < count; lsn++) {
30501 + remapped_lsn = starting_lsn + lsn;
30502 + rc = Sector_Is_Remapped(io_dlentry, remapped_lsn,
30505 + // Process all sectors in the request up to this one.
30507 + rc = INIT_IO(io_dlentry->link_partition, rw,
30508 + starting_lsn, lsn, buffer);
30510 + /* If this is a read, then we are done. */
30515 + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
30516 + if (!Create_New_BBR_Table_Entry
30517 + (io_dlentry, starting_lsn, lsn,
30519 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30523 + buffer += (lsn * OS2_BYTES_PER_SECTOR);
30525 + // Process the remapped sector.
30526 + rc = INIT_IO(io_dlentry->link_partition, rw,
30527 + remapped_lsn, 1, buffer);
30529 + /* If this is a read, then we are done. */
30534 + /* Get the original sector that was remapped. */
30535 + remapped_lsn = starting_lsn + lsn;
30537 + /* Invalidate the current remapping. */
30538 + Invalidate_Mapping(io_dlentry, remapped_lsn, 1);
30540 + /* Try to remap the bad sector to another replacement sector. */
30541 + if (!Create_New_BBR_Table_Entry
30542 + (io_dlentry, remapped_lsn, 1, buffer)) {
30543 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30549 + buffer += OS2_BYTES_PER_SECTOR;
30551 + starting_lsn += (lsn + 1);
30552 + count -= (lsn + 1);
30558 + /* Are there any sectors left to process? */
30560 + rc = INIT_IO(io_dlentry->link_partition, rw, starting_lsn,
30563 + /* If this is a read, then we are done. */
30568 + /* Since this was a write, we must see if we can remap the bad sector to a replacement sector. */
30569 + if (!Create_New_BBR_Table_Entry
30570 + (io_dlentry, starting_lsn, count, buffer)) {
30571 + /* We were unable to remap the bad sector(s) in the I/O. We can not complete the I/O. */
30583 + * Function: os2lvm_vge_init
30586 +os2lvm_vge_init(void)
30588 + /* Should I be allocating the pools and BBR Worker Thread here? */
30589 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
30593 +os2lvm_vge_exit(void)
30595 + /* BUGBUG - Is this where I need to kill the BBR Worker Thread and free any memory I am still holding? */
30597 + evms_cs_unregister_plugin(&plugin_header);
30600 +module_init(os2lvm_vge_init);
30601 +module_exit(os2lvm_vge_exit);
30602 +#ifdef MODULE_LICENSE
30603 +MODULE_LICENSE("GPL");
30606 +// Local VGE Functions
30609 + * Function: discover_os2lvm_partitions
30611 + * Examine the list of logical partitions. Any type 0x35 partition that contains
30612 + * a valid OS/2 signature sector is consumed and added to the appropriate logical
30616 +discover_os2lvm_partitions(struct evms_logical_node **evms_partition_list)
30618 + struct evms_logical_node *evms_partition;
30619 + struct evms_logical_node *next_partition;
30620 + struct evms_logical_node *new_volume;
30621 + u64 sectornum = 0;
30622 + u32 volumeserial;
30624 + char *volumename;
30625 + char driveletter[8];
30626 + LVM_Signature_Sector *sigsector;
30627 + struct os2_dl_entry *new_dlentry;
30629 + LOG_ENTRY_EXIT("Discovering OS/2 Logical Volumes\n");
30630 + sigsect = kmalloc(OS2_BYTES_PER_SECTOR, GFP_KERNEL);
30632 + LOG_SERIOUS("Could not allocate Signature sector data\n");
30636 + for (evms_partition = *evms_partition_list; evms_partition;
30637 + evms_partition = next_partition) {
30638 + // Save the next node. We may remove this one from the list.
30639 + next_partition = evms_partition->next;
30641 + // The node must not have the OS/2 vge id.
30642 + if (evms_partition->plugin->id == plugin_header.id) {
30646 + LOG_EXTRA("Examining partition serial %s\n",
30647 + evms_partition->name);
30649 + // Have to go to the last accessible sector of the partition and
30650 + // read it in. It should be the LVM Signature Sector.
30651 + sectornum = evms_partition->total_vsectors - 1;
30652 + if (INIT_IO(evms_partition, 0, sectornum, 1, sigsect)) {
30653 + // On an I/O error, continue on to the next partition.
30654 + // This means that the volume it belongs to will be incomplete
30655 + // and later deleted in the completeness check.
30656 + LOG_SERIOUS("I/O error on Signature sector read\n");
30659 + sigsector = (LVM_Signature_Sector *) sigsect;
30661 + // Validate the Signature Sector
30662 + if (validate_signaturesector
30663 + (evms_partition, sigsector, OS2_BYTES_PER_SECTOR)) {
30664 + LOG_EXTRA("Signature sector is not valid\n");
30667 +// Bugbug - At this point, we have validated an OS/2 LVM Signature Sector. However, if the partition
30668 +// is not marked as a type 0x35, then this Signature Sector may be erroneous. The problem here is that
30669 +// there is currently no way to find out if this partition was marked as a type 0x35. Also, if we
30670 +// should reject this partition due to some problem with the drive linking or BBR metadata, should we
30671 +// leave the partition in the evms partition list or not? If the partition was marked as a type 0x35
30672 +// and the Signature Sector was valid, then I would say that we should remove it from the evms partition
30673 +// list. If the partition is not marked as a type 0x35 but the Signature Sector is valid, then
30674 +// we could have a stray Signature Sector, in which case the partition should remain in the evms partition
30675 +// list. The OS/2 LVM Signature Sector does have additional information that could be used to resolve
30676 +// this issue, such as the starting LBA of the partition that the Signature Sector belongs to, but
30677 +// we can not get the starting LBA of the partition to compare against. If we leave the partition in
30678 +// the evms partition list when we should not, then an extraneous compatibility volume could result.
30679 + // Build the Metadata for this partition
30682 + new_os2_drive_link(sigsector, evms_partition))) {
30685 + // Search for the parent Volume for this partition
30686 + volumeserial = sigsector->Volume_Serial_Number;
30687 + if (!(new_volume = find_os2_volume(volumeserial))) {
30689 + // If not found, allocate a new Volume
30690 + LOG_EVERYTHING("Parent not found, allocate new.\n");
30691 + if (sigsector->Drive_Letter != '\0') {
30692 + driveletter[0] = sigsector->Drive_Letter;
30693 + driveletter[1] = '\0';
30694 + volumename = driveletter;
30696 + volumename = sigsector->Volume_Name;
30700 + new_os2volume(volumeserial, volumename))) {
30701 + delete_os2_drive_link(new_dlentry, 0);
30702 + new_dlentry = NULL;
30706 + // Now remove the partition from the List
30707 + evms_cs_remove_logical_node_from_list(evms_partition_list,
30710 + if (((os2_volume_runtime_entry_t *) new_volume->private)->
30712 + // Volume is complete, delete this duplicate
30713 + delete_os2_drive_link(new_dlentry, 0);
30714 + LOG_EVERYTHING("Deleting duplicate node.\n");
30715 + ((os2_volume_runtime_entry_t *) new_volume->private)->Export_Needed = 1; //We must export this volume again!
30716 + } else /* Add this partition to its parent Volume */
30717 + add_os2link(new_dlentry, new_volume);
30722 + LOG_ENTRY_EXIT("Finished Discovering OS/2 Logical Volumes\n");
30728 + * Function: find_os2_volume
30730 + * Search for the OS/2 volume that matches the volume serial.
30732 +static struct evms_logical_node *
30733 +find_os2_volume(u32 volumeserial)
30735 + os2_volume_runtime_entry_t *cur_volume;
30736 + struct evms_logical_node *cur_node;
30738 + cur_node = os2lvm_nodes;
30740 + while (cur_node) {
30741 + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private;
30742 + if (cur_volume->Volume_Serial_Number == volumeserial) {
30743 + LOG_EVERYTHING("%s: found volser match.\n",
30747 + LOG_EVERYTHING("%s: volser does not match.\n", __FUNCTION__);
30748 + cur_node = cur_volume->next_os2lvm_node;
30755 + * Function: add_os2link
30757 + * Add the Drive Link metadata to the parent OS/2 volume.
30760 +add_os2link(struct os2_dl_entry * newlink,
30761 + struct evms_logical_node *parent_volume)
30763 + os2_volume_runtime_entry_t *parent_metadata =
30764 + (os2_volume_runtime_entry_t *) parent_volume->private;
30765 + struct os2_dl_entry *curlink =
30766 + parent_metadata->drive_link, *nextlink;
30769 + nextlink = curlink->next;
30770 + while (nextlink) {
30771 + curlink = nextlink;
30772 + nextlink = curlink->next;
30774 + curlink->next = newlink;
30776 + parent_metadata->drive_link = newlink;
30778 + parent_metadata->drive_link_count++;
30779 + parent_metadata->size_in_sectors += newlink->sector_count;
30780 + parent_volume->total_vsectors += newlink->sector_count;
30785 + * Function: find_link_data
30787 + * Find the Drive Link metadata that matches the partition serial number.
30788 + * Remove it from the link_list passed in.
30790 +static struct os2_dl_entry *
30791 +find_link_data(struct os2_dl_entry ** link_list, u32 partitionser)
30793 + struct os2_dl_entry *curlink = *link_list, *prevlink = NULL;
30795 + while (curlink) {
30796 + if (curlink->partition_serial == partitionser) {
30798 + prevlink->next = curlink->next;
30800 + *link_list = curlink->next;
30802 + curlink->next = NULL;
30805 + prevlink = curlink;
30806 + curlink = prevlink->next;
30813 + * Function: find_drive_link
30815 + * Walk the linked list of drive links to find the proper
30816 + * target partition. Returns the metadata associated with
30817 + * the drive link.
30818 + * Return values: 1 == data contained in 1 partition, 2 == data crosses 2 partitions,
30819 + * 0 == target partition not found
30822 +find_drive_link(struct evms_logical_node *node,
30823 + struct os2_dl_entry ** dlentry,
30824 + u64 * sector, u64 * num_sectors)
30826 + u64 last_link_sector, cur_last_sector;
30827 + struct os2_dl_entry *curlink =
30828 + ((os2_volume_runtime_entry_t *) node->private)->drive_link,
30831 + while (curlink) {
30832 + nextlink = curlink->next;
30833 + last_link_sector =
30834 + curlink->start_sector + curlink->sector_count;
30835 + if (*sector < last_link_sector) {
30836 + *dlentry = curlink;
30837 + cur_last_sector = *sector + *num_sectors;
30838 + *sector -= curlink->start_sector;
30840 + ("I/O start_RBA == "PFU64" , sector_count == "PFU64"\n",
30841 + *sector, *num_sectors);
30842 + if (cur_last_sector <= last_link_sector)
30845 + if ((*dlentry)->next)
30847 + cur_last_sector - last_link_sector;
30854 + curlink = nextlink;
30860 +// Allocation/Deallocation Functions
30863 + * Function: new_os2_drive_link
30865 + * Allocate space for a new OS/2 drive link structure.
30866 + * Initialize the appropriate fields.
30867 + * Note: since the BBR info applies to each link, the BBR structures
30868 + * are also initialized here.
30870 +static struct os2_dl_entry *
30871 +new_os2_drive_link(LVM_Signature_Sector * signature_sector,
30872 + struct evms_logical_node *evms_partition)
30875 + u32 feature, feature_size, sectoroffset;
30876 + struct os2_dl_entry *new_dlentry;
30879 + kmalloc(sizeof (struct os2_dl_entry), GFP_KERNEL);
30880 + if (!new_dlentry) {
30881 + LOG_SERIOUS("Could not allocate drivelink metadata\n");
30884 + memset(new_dlentry, 0, sizeof (struct os2_dl_entry));
30885 + new_dlentry->sector_count =
30886 + signature_sector->Partition_Size_To_Report_To_User;
30887 + new_dlentry->partition_serial =
30888 + signature_sector->partition_serial;
30889 + new_dlentry->bbr_is_active = 0; // initialize to not active
30890 + new_dlentry->link_partition = evms_partition;
30891 + init_MUTEX(&(new_dlentry->bbr_table_lock));
30893 + sectoroffset = signature_sector->Partition_Start;
30894 + LOG_EVERYTHING("Partition Start is at LBA %i\n", sectoroffset);
30895 + for (i = 0; i < OS2LVM_MAX_FEATURES_PER_VOLUME; i++) {
30896 + feature = signature_sector->LVM_Feature_Array[i].Feature_ID;
30899 + signature_sector->LVM_Feature_Array[i].
30900 + Feature_Data_Size;
30901 + LOG_EVERYTHING("Entry %d in Feature Table is valid,\n",
30903 + LOG_EVERYTHING("Feature Data size is %i sectors.\n",
30905 + if (feature == DRIVE_LINKING_FEATURE_ID) {
30906 + if (!new_dlentry->link_data) {
30907 + new_dlentry->dl_lsn1 =
30908 + signature_sector->
30909 + LVM_Feature_Array[i].
30910 + Location_Of_Primary_Feature_Data -
30912 + new_dlentry->dl_lsn2 =
30913 + signature_sector->
30914 + LVM_Feature_Array[i].
30915 + Location_Of_Secondary_Feature_Data -
30917 + new_dlentry->link_data =
30918 + new_os2_link_data(new_dlentry->
30924 + if (new_dlentry->link_data == NULL) {
30925 + delete_os2_drive_link
30926 + (new_dlentry, 0);
30927 + new_dlentry = NULL;
30931 + ("os2lvm_vge: Drive Linking Feature encountered twice in the same Feature Array!\n");
30932 + delete_os2_drive_link(new_dlentry, 0);
30933 + new_dlentry = NULL;
30935 + } else if (feature == BBR_FEATURE_ID) {
30936 + if (!new_dlentry->bbr_data) {
30937 + new_dlentry->bbr_lsn1 =
30938 + signature_sector->
30939 + LVM_Feature_Array[i].
30940 + Location_Of_Primary_Feature_Data;
30941 + new_dlentry->bbr_lsn2 =
30942 + signature_sector->
30943 + LVM_Feature_Array[i].
30944 + Location_Of_Secondary_Feature_Data;
30945 + new_dlentry->bbr_feature_size =
30947 + new_dlentry->bbr_data =
30948 + new_os2_bbr_data(new_dlentry->
30954 + if (new_dlentry->bbr_data == NULL) {
30955 + delete_os2_drive_link
30956 + (new_dlentry, 0);
30957 + new_dlentry = NULL;
30958 + } else if (signature_sector->
30959 + LVM_Feature_Array[i].
30960 + Feature_Active) {
30961 + new_dlentry->bbr_is_active =
30962 + check_for_os2_bbr_relocations
30963 + (new_dlentry->bbr_data);
30967 + ("os2lvm_vge: BBR Feature encountered twice in the same Feature Array!\n");
30968 + delete_os2_drive_link(new_dlentry, 0);
30969 + new_dlentry = NULL;
30973 + ("os2lvm_vge: Unknown Feature entry %d found.\n",
30975 + delete_os2_drive_link(new_dlentry, 0);
30976 + new_dlentry = NULL;
30979 + if (signature_sector->LVM_Feature_Array[i].
30980 + Feature_Active) {
30981 + LOG_EVERYTHING("Feature is active.\n");
30986 + if (new_dlentry &&
30987 + ((!new_dlentry->bbr_data) || (!new_dlentry->link_data))
30989 + LOG_WARNING("os2lvm_vge: Incomplete Feature Data found.\n");
30990 + delete_os2_drive_link(new_dlentry, 0);
30991 + new_dlentry = NULL;
30993 + return new_dlentry;
30997 + * Function: new_os2_link_data
30999 + * Allocate space for OS/2 drive link information.
31000 + * Read in and validate the information from disk.
31001 + * Note: assumes 512 byte sectors.
31004 +new_os2_link_data(u32 linksector1,
31006 + u32 linknumsectors, struct evms_logical_node *link_partition)
31008 + char *new_data1; /* Buffer used to hold the primary copy of the drive linking data. */
31009 + char *new_data2; /* Buffer used to hold the secondary copy of the drive linking data. */
31010 + char *p1; /* Used to access individual sectors of data within new_data1. */
31011 + char *p2; /* Used to access individual sectors of data within new_data2. */
31012 + int memsize = linknumsectors * OS2_BYTES_PER_SECTOR;
31013 + u32 i, seq1, seq2;
31015 + /* Allocate Memory for the buffers to hold the drive linking data. */
31016 + LOG_EVERYTHING("Drive Linking Feature entry found.\n");
31017 + new_data1 = kmalloc(memsize, GFP_KERNEL);
31018 + if (!new_data1) {
31019 + LOG_SERIOUS("Could not allocate Primary Link data\n");
31022 + new_data2 = kmalloc(memsize, GFP_KERNEL);
31023 + if (!new_data2) {
31024 + LOG_SERIOUS("Could not allocate Secondary Link data\n");
31025 + kfree(new_data1);
31029 + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", linksector1);
31030 + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n",
31033 + /* Read the drive linking data into memory. */
31034 + if (INIT_IO(link_partition, 0, linksector1, linknumsectors, new_data1)) {
31035 + LOG_SERIOUS("I/O error reading Primary Feature Data.\n");
31039 + /* Set up access to the buffer. Extract the Master Sequence Number from the buffer. */
31041 + seq1 = ((struct link_table_first_sector *) p1)->Sequence_Number;
31044 + if (INIT_IO(link_partition, 0, linksector2, linknumsectors, new_data2)) {
31045 + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n");
31049 + /* Set up access to the second buffer. Extract its copy of the Master Sequence Number. */
31051 + seq2 = ((struct link_table_sector *) p2)->Sequence_Number;
31054 + /* Validate both copies of the drive linking data one sector at a time. */
31055 + for (i = 0; i < linknumsectors;
31056 + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) {
31058 + && validate_drivelinksector((struct link_table_sector *) p1, i,
31061 + ("The primary copy of the drive link data is invalid! Sector %i is not valid\n",
31067 + && validate_drivelinksector((struct link_table_sector *) p2, i,
31070 + ("The secondary copy of the drive link data is invalid! Sector %i is not valid\n",
31077 + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1);
31078 + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2);
31080 + /* Choose which copy of the drive linking data to use. If both sequence numbers are 0, then both copies
31081 + of the drive linking data are bad. If both are equal and non-zero, then both copies are good and it
31082 + really doesn't matter which one you choose. Otherwise, choose the copy with the highest sequence number. */
31083 + if (seq2 > seq1) {
31084 + kfree(new_data1);
31085 + return new_data2;
31087 + kfree(new_data2);
31089 + kfree(new_data1);
31090 + new_data1 = NULL;
31093 + return new_data1;
31097 + * Function: new_os2_bbr_data
31099 + * Allocate space for OS/2 bad block relocation information.
31100 + * Read in and validate the information from disk.
31101 + * Note: assumes 512 byte sectors.
31104 +new_os2_bbr_data(u32 bbrsector1,
31106 + u32 bbrnumsectors, struct evms_logical_node *bbr_partition)
31108 + char *new_data1; /* Buffer to hold the primary copy of the BBR data. */
31109 + char *new_data2; /* Buffer to hold the secondary copy of the BBR data. */
31110 + char *p1; /* Used to examine the individual sectors of BBR data within new_data1. */
31111 + char *p2; /* Used to examine the individual sectors of BBR data within new_data2. */
31112 + int memsize = bbrnumsectors * OS2_BYTES_PER_SECTOR;
31113 + u32 i, seq1, seq2;
31115 + LOG_EVERYTHING("BBR Feature entry found.\n");
31117 + /* Allocate memory for the buffers. */
31118 + new_data1 = kmalloc(memsize, GFP_KERNEL);
31119 + if (!new_data1) {
31120 + LOG_SERIOUS("Could not allocate Primary BBR data\n");
31123 + new_data2 = kmalloc(memsize, GFP_KERNEL);
31124 + if (!new_data2) {
31125 + LOG_SERIOUS("Could not allocate Secondary BBR data\n");
31126 + kfree(new_data1);
31130 + LOG_EVERYTHING("Primary Feature Data starts at RBA %i\n", bbrsector1);
31131 + LOG_EVERYTHING("Secondary Feature Data starts at RBA %i\n", bbrsector2);
31133 + /* Read in both copies of the BBR data. */
31134 + if (INIT_IO(bbr_partition, 0, bbrsector1, bbrnumsectors, new_data1)) {
31135 + LOG_SERIOUS("I/O error reading Primary Feature Data.\n");
31139 + /* Establish access to the first sector of the BBR data. Extract the Master Sequence Number
31140 + for this copy of the BBR data. */
31142 + seq1 = ((LVM_BBR_Table_First_Sector *) p1)->Sequence_Number;
31145 + if (INIT_IO(bbr_partition, 0, bbrsector2, bbrnumsectors, new_data2)) {
31146 + LOG_SERIOUS("I/O error reading Secondary Feature Data.\n");
31150 + /* Establish access to the first sector of the second copy of the BBR data. Extract the
31151 + Master Sequence Number for this copy of the BBR data. */
31153 + seq2 = ((LVM_BBR_Table_Sector *) p2)->Sequence_Number;
31156 + /* Validate both copies of the BBR Data, one sector at a time. */
31157 + for (i = 0; i < bbrnumsectors;
31158 + i++, p1 += OS2_BYTES_PER_SECTOR, p2 += OS2_BYTES_PER_SECTOR) {
31159 + if ((seq1 > 0) && validate_bbrtablesector(p1, i, seq1)) {
31161 + ("The primary BBR data is invalid! Sector %i is not valid\n",
31166 + if ((seq2 > 0) && validate_bbrtablesector(p2, i, seq2)) {
31168 + ("The secondary BBR data is invalid! Sector %i is not valid\n",
31175 + LOG_EVERYTHING("Primary Feature Data sequence # %i\n", seq1);
31176 + LOG_EVERYTHING("Secondary Feature Data sequence # %i\n", seq2);
31178 + /* Choose which copy of the BBR Data to use based upon the sequence number. If both sequence numbers
31179 + are 0, then there is no valid BBR data. If both are non-zero and equal, then it really doesn't
31180 + matter which copy is used. Otherwise, choose the copy with the highest sequence number. */
31181 + if (seq2 > seq1) {
31182 + kfree(new_data1);
31183 + return new_data2;
31185 + kfree(new_data2);
31187 + kfree(new_data1);
31188 + new_data1 = NULL;
31191 + return new_data1;
31195 + * Function: new_os2volume
31197 + * Allocate space for a new OS/2 logical volume.
31198 + * Initialize the appropriate fields.
31200 +static struct evms_logical_node *
31201 +new_os2volume(u32 volumeserial, char *volume_name)
31203 + struct evms_logical_node *new_node;
31204 + os2_volume_runtime_entry_t *cur_volume;
31206 + if (evms_cs_allocate_logical_node(&new_node)) {
31207 + LOG_SERIOUS("Could not allocate new volume\n");
31210 + new_node->private =
31211 + kmalloc(sizeof (os2_volume_runtime_entry_t), GFP_KERNEL);
31212 + if (!new_node->private) {
31213 + LOG_SERIOUS("Could not allocate volume metadata\n");
31214 + evms_cs_deallocate_logical_node(new_node);
31217 + memset(new_node->private, 0, sizeof (os2_volume_runtime_entry_t));
31218 + new_node->plugin = &plugin_header;
31219 + new_node->system_id = LVM_PARTITION_INDICATOR;
31220 + sprintf(new_node->name, "os2/%s", volume_name);
31221 + cur_volume = (os2_volume_runtime_entry_t *) new_node->private;
31222 + cur_volume->Volume_Serial_Number = volumeserial;
31223 + cur_volume->Export_Needed = 1;
31225 + if (os2lvm_nodes == NULL)
31226 + os2lvm_nodes = new_node;
31228 + // This is the first node discovered. Start the BBR thread.
31229 + if (!BBR_Worker_Thread) {
31230 + BBR_Worker_Thread =
31231 + evms_cs_register_thread(BBR_Worker, NULL, BBR_Worker_Name);
31232 + if (!BBR_Worker_Thread) {
31233 + kfree(new_node->private);
31234 + evms_cs_deallocate_logical_node(new_node);
31235 + os2lvm_nodes = NULL;
31240 + (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
31241 + while (cur_volume->next_os2lvm_node)
31243 + (os2_volume_runtime_entry_t *) cur_volume->
31244 + next_os2lvm_node->private;
31245 + cur_volume->next_os2lvm_node = new_node;
31248 + MOD_INC_USE_COUNT;
31254 + * Function: delete_os2lvm_volume
31256 + * This function deletes the in-memory representation of an OS/2
31257 + * logical volume.
31260 +delete_os2lvm_volume(struct evms_logical_node *logical_node)
31262 + struct os2_dl_entry *curdrvlink =
31263 + ((os2_volume_runtime_entry_t *) logical_node->private)->drive_link,
31265 + os2_volume_runtime_entry_t *cur_volume, *next_volume;
31267 + while (curdrvlink) {
31268 + nextdrvlink = curdrvlink->next;
31269 + delete_os2_drive_link(curdrvlink, 1);
31270 + curdrvlink = nextdrvlink;
31273 + cur_volume = (os2_volume_runtime_entry_t *) os2lvm_nodes->private;
31274 + if (os2lvm_nodes == logical_node)
31275 + os2lvm_nodes = cur_volume->next_os2lvm_node;
31277 + while (cur_volume->next_os2lvm_node) {
31279 + (os2_volume_runtime_entry_t *) cur_volume->
31280 + next_os2lvm_node->private;
31281 + if (cur_volume->next_os2lvm_node == logical_node) {
31282 + cur_volume->next_os2lvm_node =
31283 + next_volume->next_os2lvm_node;
31289 + if (os2lvm_nodes == NULL) {
31290 + // Just deleted the last os2 node. Stop the BBR thread.
31291 + if (BBR_Worker_Thread) {
31292 + evms_cs_unregister_thread(BBR_Worker_Thread);
31293 + BBR_Worker_Thread = NULL;
31297 + kfree(logical_node->private);
31298 + evms_cs_deallocate_logical_node(logical_node);
31300 + MOD_DEC_USE_COUNT;
31306 + * Function: delete_os2_drive_link
31308 + * This function deletes the drive link runtime structure and any
31309 + * other structures it points to.
31312 +delete_os2_drive_link(struct os2_dl_entry * drive_link,
31313 + int delete_link_partition)
31315 + if (drive_link->link_data)
31316 + kfree(drive_link->link_data);
31317 + if (drive_link->bbr_data)
31318 + kfree(drive_link->bbr_data);
31319 + if (delete_link_partition)
31320 + DELETE(drive_link->link_partition);
31321 + kfree(drive_link);
31326 +// Consistency Checking Functions
31329 + * Function: validate_signaturesector
31331 + * This function checks the OS/2 LVM Signature Sector
31334 +validate_signaturesector(struct evms_logical_node *evms_partition,
31335 + LVM_Signature_Sector * signature_sector,
31338 + u32 crc_hold, crc_new;
31340 + /* In order for a signature sector to be considered valid, its signature and CRC must
31341 + be correct. Also, OS/2 stores the starting LBA of the partition and the size of
31342 + the partition that this signature sector corresponds to. These should be checked
31343 + as well. However, since the starting LBA of the partition that this belongs to is
31344 + not available to us as part of an struct evms_logical_node, we can only check the size
31345 + of the partition against what is stored in the signature sector. */
31347 + /* The signature used is in two parts. Test the first part. */
31348 + if (signature_sector->LVM_Signature1 != OS2LVM_PRIMARY_SIGNATURE) {
31349 + LOG_EVERYTHING("Primary LVM Signature failed.\n");
31353 + /* Test the second part of the signature. */
31354 + if (signature_sector->LVM_Signature2 != OS2LVM_SECONDARY_SIGNATURE) {
31355 + LOG_EVERYTHING("Secondary LVM Signature failed.\n");
31359 + /* Calculate the CRC and compare it against the stored CRC. */
31360 + crc_hold = signature_sector->Signature_Sector_CRC;
31361 + signature_sector->Signature_Sector_CRC = 0;
31363 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, (void *) signature_sector,
31365 + if (crc_hold != crc_new) {
31366 + LOG_EVERYTHING("Signature sector crc failed.\n");
31367 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31371 + // The partition size must == that found in the Signature Sector
31372 + if (evms_partition->total_vsectors !=
31373 + signature_sector->Partition_Sector_Count) {
31374 + LOG_EXTRA("Partition size is not valid\n");
31382 + * Function: validate_drivelinksector
31384 + * This function checks the OS/2 LVM Drivelink Feature Sector
31387 +validate_drivelinksector(void *Sector_To_Validate,
31388 + int Sector_Index, u32 Master_Sequence_Number)
31390 + u32 crc_hold, crc_new;
31391 + struct link_table_first_sector *First_Sector =
31392 + (struct link_table_first_sector *) Sector_To_Validate;
31393 + struct link_table_sector *Link_Sector =
31394 + (struct link_table_sector *) Sector_To_Validate;
31396 + /* The OS/2 drive linking data covers several sectors. The format of the first sector is slightly
31397 + different from the following sectors because it contains additional information about how many
31398 + drive links are actually in use. The following sectors just contain portions of the drive link
31399 + table. Each sector of OS/2 drive linking data contains a signature, crc, and sequence number
31400 + which must be validated. */
31402 + if (Sector_Index == 0) {
31404 + /* Link Table Master Signature Check */
31405 + if (LINK_TABLE_MASTER_SIGNATURE !=
31406 + First_Sector->Link_Table_Signature) {
31408 + ("Link Table Master Signature Test failed.\n");
31412 + /* We will NOT check the sequence number here as the first sector of drive link data is the
31413 + source of the Master_Sequence_Number which was passed in to us. */
31415 + /* Set up for the CRC Check */
31416 + crc_hold = First_Sector->Link_Table_CRC;
31417 + First_Sector->Link_Table_CRC = 0;
31419 + /* Link Table Internal Signature Check */
31420 + if (LINK_TABLE_SIGNATURE != Link_Sector->Link_Table_Signature) {
31422 + ("Link Table Internal Signature Test failed.\n");
31426 + /* Check the sequence number. */
31427 + if (Master_Sequence_Number != Link_Sector->Sequence_Number) {
31429 + ("Link Table Internal Sequence Number Test failed.\n");
31433 + /* Set up for the CRC Check */
31434 + crc_hold = Link_Sector->Link_Table_CRC;
31435 + Link_Sector->Link_Table_CRC = 0;
31439 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate,
31440 + OS2_BYTES_PER_SECTOR);
31441 + if (crc_hold != crc_new) {
31442 + LOG_EVERYTHING("Link Table crc failed.\n");
31443 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31452 + * Function: validate_bbrtablesector
31454 + * This function checks the OS/2 LVM Bad Block Relocation Feature Sector
31457 +validate_bbrtablesector(void *Sector_To_Validate,
31458 + int Sector_Index, u32 Master_Sequence_Number)
31460 + u32 crc_hold, crc_new;
31461 + LVM_BBR_Table_First_Sector *First_Sector =
31462 + (LVM_BBR_Table_First_Sector *) Sector_To_Validate;
31463 + LVM_BBR_Table_Sector *BBR_Sector =
31464 + (LVM_BBR_Table_Sector *) Sector_To_Validate;
31466 + /* The OS/2 bad block relocation (BBR) data covers several sectors. The format of the first sector
31467 + is different from the following sectors because it contains additional information about how many
31468 + relocations are actually in use and the size and location of the block of replacement sectors.
31469 + The following sectors just contain portions of the BBR remap table. Each sector of OS/2 BBR data
31470 + contains a signature, crc, and sequence number which must be validated. */
31472 + if (Sector_Index == 0) {
31474 + /* BBR Table Master Signature Check */
31475 + if (BBR_TABLE_MASTER_SIGNATURE != First_Sector->Signature) {
31477 + ("BBR Table Master Signature Test failed.\n");
31481 + /* We will NOT check the sequence number here as the first sector of BBR data is the
31482 + source of the Master_Sequence_Number which was passed in to us. */
31484 + /* Set up for the CRC Check */
31485 + crc_hold = First_Sector->CRC;
31486 + First_Sector->CRC = 0;
31489 + /* BBR Table Internal Signature Check */
31490 + if (BBR_TABLE_SIGNATURE != BBR_Sector->Signature) {
31492 + ("BBR Table Internal Signature Test failed.\n");
31496 + /* Check the sequence number. */
31497 + if (Master_Sequence_Number != BBR_Sector->Sequence_Number) {
31499 + ("BBR Table Internal Sequence Number Test failed.\n");
31503 + /* Set up for the CRC Check */
31504 + crc_hold = BBR_Sector->CRC;
31505 + BBR_Sector->CRC = 0;
31509 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, Sector_To_Validate,
31510 + OS2_BYTES_PER_SECTOR);
31511 + if (crc_hold != crc_new) {
31512 + LOG_EVERYTHING("BBRTable crc failed.\n");
31513 + LOG_EVERYTHING("sector_crc == %x , calc_crc == %x \n", crc_hold,
31522 + * Function: check_for_os2_bbr_relocations
31524 + * This function checks the OS/2 LVM Bad Block Relocation Tables
31525 + * for any active relocation sectors. The bbr table is reformatted in memory
31526 + * to make searches faster.
31527 + * Return values: 0 == no active relocations, 1 == contains active relocations
31530 +check_for_os2_bbr_relocations(char *bbr_data_ptr)
31532 + LVM_BBR_Feature *feature_data = (LVM_BBR_Feature *) bbr_data_ptr;
31534 + if (feature_data->control.Table_Entries_In_Use) {
31535 + LOG_EVERYTHING("There are %d active relocations.\n",
31536 + feature_data->control.Table_Entries_In_Use);
31544 + * Function: check_os2_volumes
31546 + * This function performs a consistency check on all existing OS/2
31547 + * Logical Volumes. The list of constituent partitions ( links )
31548 + * is checked and ordered according to the Link Table. If any link
31549 + * is missing or inconsistent, the entire volume will be deleted.
31552 +check_os2_volumes(struct evms_logical_node **node_list)
31554 + os2_volume_runtime_entry_t *cur_volume;
31555 + os2_volume_runtime_entry_t *previous_volume;
31556 + struct evms_logical_node *cur_node;
31557 + struct evms_logical_node *previous_node = NULL;
31558 + struct os2_dl_entry *link_list, *link_hold;
31559 + struct link_table_first_sector *psector1;
31561 + u32 numlinks, countlinks, linkser;
31562 + u32 Master_Sequence_Number; /* Used to check whether or not all of the copies of Drive Linking data match. */
31563 + u64 partition_offset;
31566 + LOG_ENTRY_EXIT("Checking OS/2 Logical Volumes\n");
31568 + cur_node = os2lvm_nodes;
31570 + while (cur_node) {
31571 + cur_volume = (os2_volume_runtime_entry_t *) cur_node->private;
31572 + link_list = NULL;
31573 + if (!cur_volume->complete) { /* need to verify this one */
31574 + cur_volume->complete = 1;
31575 + LOG_EVERYTHING("Checking volume %s\n", cur_node->name);
31577 + // Reset fields for sort operation
31578 + cur_volume->size_in_sectors = 0;
31579 + numlinks = cur_volume->drive_link_count;
31580 + cur_volume->drive_link_count = 0;
31581 + cur_node->total_vsectors = 0;
31582 + link_list = cur_volume->drive_link;
31583 + cur_volume->drive_link = NULL;
31585 + // Access the link data to order the drive links
31587 + (struct link_table_first_sector *) link_list->
31589 + Master_Sequence_Number = psector1->Sequence_Number;
31591 + if (numlinks != psector1->Links_In_Use) {
31593 + ("Link Count mismatch vol=%i, table=%i\n",
31594 + numlinks, psector1->Links_In_Use);
31595 + cur_volume->complete = 0;
31598 + if (numlinks > LINKS_IN_FIRST_SECTOR) {
31599 + countlinks = LINKS_IN_FIRST_SECTOR;
31600 + numlinks -= LINKS_IN_FIRST_SECTOR;
31602 + countlinks = numlinks;
31608 + partition_offset = 0;
31610 + (i < countlinks) && (cur_volume->complete == 1);
31613 + psector1->Link_Table[i].
31614 + partition_serial;
31616 + find_link_data(&link_list, linkser))) {
31617 + // Add this partition to its parent Volume
31618 + add_os2link(link_hold, cur_node);
31620 + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n",
31621 + partition_offset,
31622 + link_hold->sector_count);
31623 + link_hold->start_sector =
31624 + partition_offset;
31625 + partition_offset +=
31626 + link_hold->sector_count;
31629 + ("Link Table entry %i metadata missing\n",
31631 + cur_volume->complete = 0;
31636 + sect_ptr = (char *) psector1;
31638 + while (numlinks && (cur_volume->complete == 1)) {
31639 + if (numlinks > LINKS_IN_NEXT_SECTOR) {
31640 + countlinks = LINKS_IN_NEXT_SECTOR;
31641 + numlinks -= LINKS_IN_NEXT_SECTOR;
31643 + countlinks = numlinks;
31646 + sect_ptr += OS2_BYTES_PER_SECTOR;
31647 + if (Master_Sequence_Number !=
31648 + ((struct link_table_sector *) sect_ptr)->
31649 + Sequence_Number) {
31650 + cur_volume->complete = 0;
31652 + ("Bad Sequence Number for Drive Linking Metadata!\n");
31654 + for (i = 0; i < countlinks; i++) {
31656 + ((struct link_table_sector *)
31657 + sect_ptr)->Link_Table[i].
31658 + partition_serial;
31660 + find_link_data(&link_list,
31662 + // Add this partition to its parent Volume
31663 + add_os2link(link_hold,
31666 + ("Link start_RBA == "PFU64" , sector_count == "PFU64"\n",
31667 + partition_offset,
31672 + partition_offset;
31673 + partition_offset +=
31678 + ("Link Table entry %i metadata missing\n",
31680 + cur_volume->complete =
31689 + /* If the volume is complete we can export it for use. */
31690 + if (cur_volume->complete && (link_list == NULL)) {
31692 + // Link new volume into the node list
31693 + if (cur_volume->Export_Needed &&
31694 + (!evms_cs_add_logical_node_to_list
31695 + (node_list, cur_node))
31698 + cur_volume->Export_Needed = 0;
31701 + previous_node = cur_node;
31702 + cur_node = cur_volume->next_os2lvm_node;
31704 + /* Remove the volume from os2lvm_nodes list and delete it. */
31705 + if (previous_node != NULL) {
31707 + previous_volume =
31708 + (os2_volume_runtime_entry_t *)
31709 + previous_node->private;
31710 + previous_volume->next_os2lvm_node =
31711 + cur_volume->next_os2lvm_node;
31712 + cur_volume->next_os2lvm_node = NULL;
31714 + delete_os2lvm_volume(cur_node);
31716 + cur_node = previous_volume->next_os2lvm_node;
31718 + previous_node = cur_volume->next_os2lvm_node;
31719 + delete_os2lvm_volume(cur_node);
31720 + cur_node = previous_node;
31721 + previous_node = NULL;
31722 + os2lvm_nodes = cur_node;
31725 + /* If any items remain in link_list, delete those as well. */
31726 + while (link_list) {
31727 + link_hold = link_list->next;
31728 + delete_os2_drive_link(link_list, 1);
31729 + link_list = link_hold;
31736 + LOG_ENTRY_EXIT("Finished Checking OS/2 Logical Volumes\n");
31741 +/* BBR_Transfer_IO
31743 + * Transfer the responsibility for completing the specified IO from
31744 + * the thread that requested it to the BBR Worker Thread
31747 +BBR_Transfer_IO(struct transfer_record * Transfer_Record)
31749 + unsigned long flags;
31750 + int Wake_Worker_Thread = 0; /* Assume that the worker is already awake. */
31752 + spin_lock_irqsave(&BBR_Queue_Lock, flags);
31754 + /* The BBR IO List is a singly linked list. BBR_IO_List_Head points
31755 + to the first item in the list, and BBR_IO_List_Tail points to the
31756 + last item in the list. */
31757 + Transfer_Record->next = NULL;
31758 + if (!BBR_IO_List_Tail) { /* Empty list */
31759 + BBR_IO_List_Head = Transfer_Record;
31760 + Wake_Worker_Thread = 1; /* Wake up the worker thread. */
31761 + } else /* Items already in the list. */
31762 + BBR_IO_List_Tail->next = Transfer_Record;
31764 + BBR_IO_List_Tail = Transfer_Record;
31766 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31767 + if (Wake_Worker_Thread)
31768 + evms_cs_wakeup_thread(BBR_Worker_Thread);
31773 +/* OS2_DL_Callback
31775 + * This is the callback function used when an I/O request has to be broken
31776 + * into two parts because it crosses a drive link boundary.
31780 +OS2_DL_Callback(struct buffer_head *bh, int uptodate)
31783 + struct tracking_record *Tracking_Record;
31784 + struct buffer_head *Original;
31786 + Tracking_Record = bh->b_private;
31788 + /* Is this a read or a write? */
31789 + if (Tracking_Record->link1_transfer_rec ||
31790 + Tracking_Record->link2_transfer_rec) {
31791 + /* We have a write here. Was it successful? */
31793 + /* Have we tried BBR yet? */
31794 + if ((bh == Tracking_Record->link1_bh) &&
31795 + (!Tracking_Record->link1_bbr_attempted)) {
31796 + /* Attempt BBR. */
31797 + BBR_Transfer_IO(Tracking_Record->
31798 + link1_transfer_rec);
31799 + Tracking_Record->link1_bbr_attempted = 1;
31801 + } else if ((bh == Tracking_Record->link2_bh) &&
31802 + (!Tracking_Record->link2_bbr_attempted)) {
31803 + /* Attempt BBR. */
31804 + BBR_Transfer_IO(Tracking_Record->
31805 + link2_transfer_rec);
31806 + Tracking_Record->link2_bbr_attempted = 1;
31814 + Tracking_Record->io_in_progress -= 1;
31815 + if (Tracking_Record->io_in_progress) {
31816 + Tracking_Record->up_to_date = uptodate;
31818 + Original = Tracking_Record->org_bh;
31820 + if (!Tracking_Record->io_in_progress) {
31821 + uptodate &= Tracking_Record->up_to_date;
31822 + /* If this is a write, then Transfer Records will have been set up for both Link1 and Link2.
31823 + If the transfer records were used because of BBR, then the BBR worker thread will have
31824 + disposed of the transfer records. If the transfer records were not used, then we must
31825 + dispose of them here to prevent memory leaks. */
31826 + if (Tracking_Record->link1_transfer_rec &&
31827 + (!Tracking_Record->link1_bbr_attempted)) {
31828 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool,
31829 + Tracking_Record->
31830 + link1_transfer_rec);
31832 + if (Tracking_Record->link2_transfer_rec &&
31833 + (!Tracking_Record->link2_bbr_attempted)) {
31834 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool,
31835 + Tracking_Record->
31836 + link2_transfer_rec);
31838 + evms_cs_deallocate_to_pool(evms_bh_pool,
31839 + Tracking_Record->link1_bh);
31840 + evms_cs_deallocate_to_pool(evms_bh_pool,
31841 + Tracking_Record->link2_bh);
31842 + evms_cs_deallocate_to_pool(DL_Tracking_Pool, Tracking_Record);
31843 + Original->b_end_io(Original, uptodate);
31849 +/* OS2_BBR_Write_Callback
31851 + * This is the callback for normal write requests. Check for an error
31852 + * during the I/O, and send to the worker thread for processing if necessary.
31855 +OS2_BBR_Write_Callback(struct transfer_record * Transfer_Record,
31856 + struct buffer_head *bh, int uptodate, int *redrive)
31859 + BBR_Transfer_IO(Transfer_Record);
31862 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Transfer_Record);
31868 +/* Worker thread to handle:
31870 + I/O to drive/partitions/objects where bad blocks are known to exist
31871 + I/O to drive/partition/object where a new bad block has been discovered and the I/O must be redriven.
31875 +BBR_Worker(void *Not_Used)
31877 + unsigned long flags;
31878 + struct transfer_record *Current_IO;
31882 + // Process bbr_io_list, one entry at a time.
31883 + spin_lock_irqsave(&BBR_Queue_Lock, flags);
31885 + /* Is there any work for us? */
31886 + if (!BBR_IO_List_Head) {
31887 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31888 + break; /* List empty - nothing to do. */
31891 + /* Get the IO to perform. */
31892 + Current_IO = BBR_IO_List_Head;
31893 + BBR_IO_List_Head = Current_IO->next;
31894 + if (!BBR_IO_List_Head)
31895 + BBR_IO_List_Tail = BBR_IO_List_Head;
31897 + spin_unlock_irqrestore(&BBR_Queue_Lock, flags);
31899 + /* Now lets process the I/O request. */
31900 + complete = do_os2_bbr_io(Current_IO->Partition_Data,
31901 + Current_IO->Write_Flag,
31902 + Current_IO->bh->b_rsector,
31904 + b_size >> EVMS_VSECTOR_SIZE_SHIFT,
31905 + Current_IO->bh->b_data);
31907 + /* We need to do the callback. */
31908 + Current_IO->bh->b_end_io(Current_IO->bh, (complete == 0));
31910 + /* Now cleanup */
31911 + evms_cs_deallocate_to_pool(BBR_Transfer_Pool, Current_IO);
31914 + return; /* Go to sleep. */
31919 + * Sector_Is_Remapped
31921 + * This function returns 1 if the specified sector has been remapped, 0 if it has not
31923 + * If the sector has been remapped, then the new sector is returned in Replacement_Sector
31927 +Sector_Is_Remapped(struct os2_dl_entry * io_dlentry,
31928 + u64 Source_Sector, u64 * Replacement_Sector)
31930 + LVM_BBR_Feature *Feature_Data =
31931 + (LVM_BBR_Feature *) io_dlentry->bbr_data;
31932 + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
31933 + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
31934 + unsigned int BBR_Table_Entries_In_Use =
31935 + Feature_Data->control.Table_Entries_In_Use;
31936 + struct bbr_table_entry * table_entry;
31937 + unsigned int guard1;
31939 + /* Default value is no remap. */
31940 + *Replacement_Sector = Source_Sector;
31943 + guard1 = io_dlentry->guard1; /* Lamport's Theorem */
31945 + for (BBR_Table_Index = 0;
31946 + BBR_Table_Index < BBR_Table_Entries_In_Use;
31947 + BBR_Table_Index++) {
31949 + BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
31951 + &(Feature_Data->remap[Sector_Index].
31952 + BBR_Table[BBR_Table_Index -
31954 + BBR_TABLE_ENTRIES_PER_SECTOR)]);
31955 + if (table_entry->BadSector == (u32)Source_Sector) {
31956 + *Replacement_Sector =
31957 + (u64)table_entry->ReplacementSector;
31962 + } while (guard1 != io_dlentry->guard2); /* Lamport's Theorem */
31964 + if (*Replacement_Sector != Source_Sector)
31971 + * Invalidate_Mapping
31973 + * This function either frees a replacement sector to be reused, or it
31974 + * marks the replacement sector as bad.
31978 +Invalidate_Mapping(struct os2_dl_entry * dlentry,
31979 + u64 Source_Sector, int Replacement_Sector_Is_Bad)
31981 + LVM_BBR_Feature *Feature_Data = (LVM_BBR_Feature *) dlentry->bbr_data;
31982 + unsigned int Sector_Index; /* The BBR Table is spread across several sectors. This tracks which sector we are looking at. */
31983 + unsigned int BBR_Table_Index; /* This tracks the actual entry in the BBR Table that we are examining. */
31984 + unsigned int BBR_Table_Entries_In_Use =
31985 + Feature_Data->control.Table_Entries_In_Use;
31986 + struct bbr_table_entry * table_entry = NULL;
31988 + /* Lock for the BBR Table. */
31989 + down(&(dlentry->bbr_table_lock));
31991 + /* Find the entry to invalidate. */
31992 + for (BBR_Table_Index = 0; BBR_Table_Index < BBR_Table_Entries_In_Use;
31993 + BBR_Table_Index++) {
31994 + Sector_Index = BBR_Table_Index / BBR_TABLE_ENTRIES_PER_SECTOR;
31996 + &(Feature_Data->remap[Sector_Index].
31997 + BBR_Table[BBR_Table_Index -
31998 + (Sector_Index * BBR_TABLE_ENTRIES_PER_SECTOR)]);
31999 + if (table_entry->BadSector == Source_Sector) {
32004 + /* Now that we have found the entry, we must invalidate it. */
32005 + if (Replacement_Sector_Is_Bad) {
32006 + table_entry->BadSector = (u32) - 1;
32008 + /* OS/2 supported a method for clearing out bad block remappings if the filesystem on the volume supported
32009 + the tracking of bad blocks. We don't support that under Linux, so there is no else case here. */
32011 + /* Unlock the BBR Table */
32012 + up(&(dlentry->bbr_table_lock));
32018 + * Create_New_BBR_Table_Entry
32020 + * Finds bad blocks within the range specified, allocates replacement sectors,
32021 + * writes the data to the replacement sectors, and updates the BBR metadata on
32022 + * disk to reflect the new mapping. Returns 1 if successful, 0 otherwise.
32026 +Create_New_BBR_Table_Entry(struct os2_dl_entry * dlentry,
32027 + u64 starting_lsn, unsigned int count, void *buffer)
32030 + struct bbr_table_entry *Table_Entry;
32031 + unsigned int Sector_Index;
32032 + unsigned int Table_Index;
32035 + u32 New_Sequence_Number;
32036 + LVM_BBR_Feature *BBR_Data = (LVM_BBR_Feature *) dlentry->bbr_data;
32038 + for (lsn = starting_lsn; lsn < (starting_lsn + count); lsn++) {
32039 + rc = INIT_IO(dlentry->link_partition, 1, lsn, 1, buffer);
32042 + /* Lock for the BBR Table. */
32043 + down(&(dlentry->bbr_table_lock));
32045 + /* Increment the second guard value. This will cause those reading the BBR Table to spin. */
32046 + dlentry->guard2++;
32048 + /* Ensure that the bbr active flag is set. */
32049 + dlentry->bbr_is_active = 1;
32051 + /* Allocate a replacement sector */
32052 + if (BBR_Data->control.Table_Entries_In_Use <
32053 + BBR_Data->control.Table_Size) {
32055 + BBR_Data->control.Table_Entries_In_Use /
32056 + BBR_TABLE_ENTRIES_PER_SECTOR;
32058 + BBR_Data->control.Table_Entries_In_Use %
32059 + BBR_TABLE_ENTRIES_PER_SECTOR;
32060 + BBR_Data->control.Table_Entries_In_Use =
32061 + BBR_Data->control.Table_Entries_In_Use + 1;
32063 + (struct bbr_table_entry *) & (BBR_Data->
32064 + remap[Sector_Index].
32067 + Table_Entry->BadSector = lsn;
32069 + /* There are no more replacement sectors available! Time to bail ... */
32070 + up(&(dlentry->bbr_table_lock));
32074 + /* Now that we have a replacement sector, increment the first guard value. This will free any
32075 + threads reading the BBR Table. */
32076 + dlentry->guard1++;
32078 + /* Release the lock now that we have a replacement sector. */
32079 + up(&(dlentry->bbr_table_lock));
32081 + /* Test the replacement sector. */
32082 + rc = INIT_IO(dlentry->link_partition, 1,
32083 + Table_Entry->ReplacementSector, 1, buffer);
32085 + /* The replacement sector was bad. Lets mark it bad in the table and try again. */
32086 + Table_Entry->BadSector = (u32) - 1;
32089 + } /* End of processing for the current sector. */
32091 + } /* end of loop to test each sector in the I/O and remap any bad ones found. */
32093 + /* Need to write the modified BBR Table back to disk. This includes updating the sequence numbers and CRCs. */
32095 + /* Lock for the BBR Table. */
32096 + down(&(dlentry->bbr_table_lock));
32098 + /* Increment the sequence numbers. */
32099 + New_Sequence_Number = BBR_Data->control.Sequence_Number + 1;
32100 + BBR_Data->control.Sequence_Number = New_Sequence_Number;
32101 + for (Sector_Index = 0;
32102 + Sector_Index < BBR_Data->control.Sectors_Per_Table;
32103 + Sector_Index++) {
32104 + BBR_Data->remap[Sector_Index].Sequence_Number =
32105 + New_Sequence_Number;
32108 + /* Calculate the new CRC values. */
32109 + BBR_Data->control.CRC = 0;
32110 + BBR_Data->control.CRC =
32111 + evms_cs_calculate_crc(EVMS_INITIAL_CRC, &(BBR_Data->control),
32112 + OS2_BYTES_PER_SECTOR);
32113 + for (Sector_Index = 0;
32114 + Sector_Index < BBR_Data->control.Sectors_Per_Table;
32115 + Sector_Index++) {
32116 + BBR_Data->remap[Sector_Index].CRC = 0;
32117 + BBR_Data->remap[Sector_Index].CRC =
32118 + evms_cs_calculate_crc(EVMS_INITIAL_CRC,
32119 + &(BBR_Data->remap[Sector_Index]),
32120 + OS2_BYTES_PER_SECTOR);
32123 + /* Now we must write the table back to the partition from whence it came. */
32125 + /* Write the first copy. */
32126 + rc = INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn1,
32127 + dlentry->bbr_feature_size, BBR_Data);
32129 + /* Write the second copy. */
32131 + INIT_IO(dlentry->link_partition, 1, dlentry->bbr_lsn2,
32132 + dlentry->bbr_feature_size, BBR_Data);
32134 + /* If both copies failed to reach the disk, then fail the I/O. */
32140 + /* Unlock the BBR Table */
32141 + up(&(dlentry->bbr_table_lock));
32143 + /* Indicate success. */
32148 + * Clone_Bufferhead
32150 + * Prepares a usable copy of an existing bufferhead.
32154 +Clone_Bufferhead(struct buffer_head *Source, struct buffer_head *Child)
32156 + Child->b_next = NULL;
32157 + Child->b_blocknr = Source->b_blocknr;
32158 + Child->b_size = Source->b_size;
32159 + Child->b_list = BUF_LOCKED;
32160 + Child->b_dev = Source->b_dev;
32161 + Child->b_count = (atomic_t) ATOMIC_INIT(0);
32162 + atomic_set(&Child->b_count, atomic_read(&Source->b_count));
32163 + Child->b_rdev = Source->b_rdev;
32164 + Child->b_state = Source->b_state;
32165 + Child->b_flushtime = 0;
32166 + Child->b_next_free = NULL;
32167 + Child->b_prev_free = NULL;
32168 + Child->b_this_page = (struct buffer_head *) 1;
32169 + Child->b_reqnext = NULL;
32170 + Child->b_pprev = NULL;
32171 + Child->b_data = Source->b_data;
32172 + Child->b_page = Source->b_page;
32173 + Child->b_end_io = Source->b_end_io;
32174 + Child->b_private = Source->b_private;
32175 + Child->b_rsector = Source->b_rsector;
32176 + Child->b_inode_buffers.next = NULL;
32177 + Child->b_inode_buffers.prev = NULL;
32180 diff -Naur linux-2002-09-30/drivers/evms/s390_part.c evms-2002-09-30/drivers/evms/s390_part.c
32181 --- linux-2002-09-30/drivers/evms/s390_part.c Wed Dec 31 18:00:00 1969
32182 +++ evms-2002-09-30/drivers/evms/s390_part.c Fri Sep 13 16:09:55 2002
32184 +/* -*- linux-c -*- */
32188 + * Copyright (c) International Business Machines Corp., 2000
32190 + * This program is free software; you can redistribute it and/or modify
32191 + * it under the terms of the GNU General Public License as published by
32192 + * the Free Software Foundation; either version 2 of the License, or
32193 + * (at your option) any later version.
32195 + * This program is distributed in the hope that it will be useful,
32196 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
32197 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
32198 + * the GNU General Public License for more details.
32200 + * You should have received a copy of the GNU General Public License
32201 + * along with this program; if not, write to the Free Software
32202 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32207 + * linux/drivers/evms/s390_part.c
32209 + * EVMS S/390 partition manager
32211 + * Partial code extracted from
32213 + * linux/fs/partitions/ibm.c
32217 +#include <linux/config.h>
32218 +#include <linux/module.h>
32219 +#include <linux/kernel.h>
32220 +#include <linux/config.h>
32221 +#include <linux/string.h>
32222 +#include <linux/blk.h>
32223 +#include <asm/ebcdic.h>
32224 +#include <asm/uaccess.h>
32225 +#include <asm/dasd.h>
32226 +#include <asm/vtoc.h>
32227 +#include <linux/evms/evms.h>
32229 +/* prefix used in logging messages */
32230 +#define LOG_PREFIX "s390_part: "
32232 +/* Private instance data structure for node we produced */
32233 +struct local_instance_data {
32234 + struct evms_logical_node *source_disk;
32235 + u64 start_sect; /* starting LBA */
32236 + u64 nr_sects; /* number of sectors */
32237 + unsigned char type; /* partition type or filesystem format indicator, can be set to 0 */
32240 +static int exported_nodes; /* total # of exported segments
32241 + * produced during this discovery.
32245 +static int s390_partition_discover(struct evms_logical_node **);
32246 +static int s390_partition_delete(struct evms_logical_node *);
32247 +static void s390_partition_read(struct evms_logical_node *,
32248 + struct buffer_head *);
32249 +static void s390_partition_write(struct evms_logical_node *,
32250 + struct buffer_head *);
32251 +static int s390_partition_ioctl(struct evms_logical_node *,
32253 + struct file *, unsigned int, unsigned long);
32254 +static int s390_partition_init_io(struct evms_logical_node *,
32255 + int, u64, u64, void *);
32257 +static struct evms_plugin_fops fops = {
32258 + .discover = s390_partition_discover,
32259 + .delete = s390_partition_delete,
32260 + .read = s390_partition_read,
32261 + .write = s390_partition_write,
32262 + .init_io = s390_partition_init_io,
32263 + .ioctl = s390_partition_ioctl
32266 +#define EVMS_S390_PARTITION_MANAGER_ID 2
32268 +static struct evms_plugin_header plugin_header = {
32269 + .id = SetPluginID(IBM_OEM_ID,
32270 + EVMS_SEGMENT_MANAGER,
32271 + EVMS_S390_PARTITION_MANAGER_ID),
32275 + .patchlevel = 0},
32276 + .required_services_version = {
32279 + .patchlevel = 0},
32283 +/***************************************************/
32284 +/* List Support - Typedefs, Variables, & Functions */
32285 +/***************************************************/
32289 +/* structure to keep status on
32292 +#define S390_DISK_OK 0
32293 +#define S390_DISK_FAILED 1
32294 +#define S390_FAILED_SKIP_COUNT 1024
32295 +struct disk_object {
32297 + atomic_t skipped_ios;
32298 + atomic_t pending_ios;
32299 + atomic_t total_ios;
32300 + atomic_t failed_ios;
32301 + struct evms_logical_node *disk;
32304 +/* structure to keep status
32305 + * on each device.
32307 +struct device_object {
32308 + unsigned char label[8];
32310 + struct evms_list_node *disk_object_list;
32311 + struct evms_list_node *segment_list;
32314 +/* structure used to track in-flight IOs,
32315 + * and to handle failover scenarios.
32318 + struct device_object *devo;
32319 + struct disk_object *dsko;
32320 + struct evms_logical_node *segment;
32323 + struct buffer_head *bh;
32324 + struct s390_io *next;
32326 +static spinlock_t s390_redrive_list_lock = SPIN_LOCK_UNLOCKED;
32327 +static struct s390_io *s390_redrive_list = NULL;
32328 +static struct evms_thread *s390_io_redrive_thread;
32329 +static struct evms_pool_mgmt *s390_io_track_pool = NULL;
32333 +static struct evms_list_node *my_device_object_list;
32335 +static struct evms_list_node **
32336 +lookup_device_object(struct evms_logical_node *disk)
32338 + struct evms_list_node **devoln;
32340 + devoln = &my_device_object_list;
32341 + while (*devoln) {
32342 + struct evms_list_node **dskoln;
32343 + struct device_object *devo;
32344 + devo = (struct device_object *) (*devoln)->item;
32345 + dskoln = &devo->disk_object_list;
32346 + while (*dskoln) {
32347 + struct disk_object *dsko;
32348 + dsko = (struct disk_object *) (*dskoln)->item;
32349 + if (dsko->disk == disk) {
32352 + dskoln = &(*dskoln)->next;
32354 + devoln = &(*devoln)->next;
32359 +static struct evms_list_node **
32360 +lookup_label(unsigned char *label, struct evms_list_node **devoln)
32363 + devoln = &my_device_object_list;
32365 + devoln = &(*devoln)->next;
32367 + while (*devoln) {
32368 + struct device_object *devo;
32369 + struct disk_object *dsko;
32370 + devo = (struct device_object *) (*devoln)->item;
32371 + dsko = (struct disk_object *) devo->disk_object_list->item;
32372 + LOG_DEBUG("comparing labels: new(%s), %s(%s)\n",
32373 + label, dsko->disk->name, devo->label);
32374 + if (!strncmp(devo->label, label, 6)) {
32375 + LOG_DEBUG("matching label found!\n");
32378 + devoln = &(*devoln)->next;
32383 +static struct evms_logical_node *
32384 +find_segment_on_disk(struct evms_logical_node *disk,
32385 + u64 start_sect, u64 nr_sects)
32387 + struct evms_logical_node *rc = NULL;
32388 + struct evms_list_node **devoln;
32390 + /* find disk object */
32391 + devoln = lookup_device_object(disk);
32393 + /* disk object found in list */
32394 + /* attempt to find segment */
32395 + struct evms_list_node **sln;
32396 + struct device_object *devo;
32398 + devo = (struct device_object *) (*devoln)->item;
32399 + sln = &devo->segment_list;
32401 + struct evms_logical_node *segment;
32402 + struct local_instance_data *lid;
32404 + segment = (struct evms_logical_node *) (*sln)->item;
32405 + lid = segment->private;
32406 + if (lid->start_sect == start_sect) {
32407 + if (lid->nr_sects == nr_sects) {
32412 + sln = &(*sln)->next;
32419 +add_segment_to_disk(struct evms_logical_node *disk,
32420 + unsigned char *label, struct evms_logical_node *segment)
32423 + struct evms_list_node **devoln;
32424 + struct device_object *devo;
32426 + devoln = lookup_device_object(disk);
32427 + if (*devoln == NULL) {
32428 + struct disk_object *dsko = NULL;
32429 + /* device object not in list, add device object */
32430 + devo = kmalloc(sizeof (*devo), GFP_KERNEL);
32432 + memset(devo, 0, sizeof (*devo));
32433 + strncpy(devo->label, label, 6);
32434 + rc = evms_cs_add_item_to_list(devoln, devo);
32439 + /* create a disk object */
32441 + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL);
32447 + memset(dsko, 0, sizeof (*dsko));
32448 + /* add disk to disk object */
32449 + dsko->disk = disk;
32450 + /* add disk object to disk object list
32451 + * in device object */
32452 + rc = evms_cs_add_item_to_list(&devo->disk_object_list,
32456 + devo->total_paths++;
32458 + /* on error clean up allocations */
32463 + evms_cs_remove_item_from_list(devoln, devo);
32469 + devo = (struct device_object *) (*devoln)->item;
32472 + /* attempt to add segment */
32473 + rc = evms_cs_add_item_to_list(&devo->segment_list, segment);
32479 +remove_segment_from_disk(struct evms_logical_node *disk,
32480 + struct evms_logical_node *segment,
32481 + struct evms_list_node **empty_disk_object_list)
32484 + struct evms_list_node **devoln;
32486 + *empty_disk_object_list = NULL;
32487 + devoln = lookup_device_object(disk);
32489 + /* device object found in list */
32490 + /* attempt to remove segment */
32491 + struct device_object *devo;
32492 + devo = (struct device_object *) (*devoln)->item;
32493 + rc = evms_cs_remove_item_from_list(&devo->segment_list,
32496 + if (devo->segment_list == NULL) {
32497 + /* return disk object list to caller */
32498 + *empty_disk_object_list =
32499 + devo->disk_object_list;
32500 + /* remove device object from list */
32501 + rc = evms_cs_remove_item_from_list(devoln,
32503 + /* free device object */
32511 +/* function: s390_load_balance
32513 + * this function is used to route an IO to the appropriate
32514 + * paths of a multipath device.
32516 + * appropriate paths are determined using load-balancing
32517 + * techniques. load balancing is accomplished by monitoring
32518 + * pending or in-flight IOs to each path. when a new IO
32519 + * request is received, all paths are examined, and the path
32520 + * with the fewest IOs pending is selected to receive the
32523 + * this routine also utilizes some failed path recovery
32526 + * if a failed path has been skipped for a given number
32527 + * (timeout value) of IO requests. it is then tried again,
32528 + * and if the path has become functional again, it is returned
32529 + * to the active state and it becomes available for load
32532 + * if a new IO arrives and we find no currently active paths,
32533 + * each failed path will be attempted one time in the hopes
32534 + * that it may have become active from the time between when
32535 + * it was marked failed and now. only when all paths have
32536 + * been tried and found non-active, is the IO marked with
32537 + * an error and returned.
32539 + * this function works in concert with s390_end_io_callback
32540 + * function and the s390iod(aemon), to redrive failed IO
32545 +s390_load_balance(struct s390_io **piot, struct evms_logical_node *disk)
32547 + struct evms_list_node **dskoln;
32548 + struct disk_object *dsko, *selected_dsko = NULL;
32549 + int dskidx, path = 0;
32550 + struct s390_io *iot;
32552 + /* allocate and initialize an IO tracking structure
32553 + * if one was not passed in.
32556 + struct evms_list_node **devoln;
32557 + /* allocate IO Track struct */
32558 + *piot = evms_cs_allocate_from_pool(s390_io_track_pool,
32560 + memset(*piot, 0, sizeof (*iot));
32561 + /* find the device object */
32562 + devoln = lookup_device_object(disk);
32563 + (*piot)->devo = (*devoln)->item;
32567 + /* find next disk object based on current load */
32569 + /* check for failed paths that have timed-out */
32571 + dskoln = &iot->devo->disk_object_list;
32572 + while (*dskoln) {
32573 + dsko = (struct disk_object *) (*dskoln)->item;
32575 + /* skip paths tried earlier */
32576 + if (iot->paths_tried & dskidx) {
32579 + /* skip active disks */
32580 + if (dsko->flags == S390_DISK_OK) {
32583 + /* skip disks that haven't timed-out yet */
32584 + if (atomic_read(&dsko->skipped_ios)
32585 + < S390_FAILED_SKIP_COUNT) {
32588 + selected_dsko = dsko;
32592 + dskoln = &(*dskoln)->next;
32596 + /* if we have no timed-out paths, then check for the
32597 + * path with lowest pending io count. if that path
32598 + * happens to be a failed path and there is active
32599 + * paths, increment the skipped io count, mark this
32600 + * path as having been selected, then go back and run
32601 + * the loop again, looking for the next best choice.
32602 + * continue this process until the best active has
32603 + * been selected, or we end up with the best failed
32606 + if (!selected_dsko) {
32607 + int paths_selected, have_actives;
32608 + paths_selected = 0;
32609 + s390_repeat_active_search:
32611 + have_actives = FALSE;
32613 + dskoln = &iot->devo->disk_object_list;
32614 + while (*dskoln) {
32615 + dsko = (struct disk_object *) (*dskoln)->item;
32617 + /* skip paths tried earlier */
32618 + if (iot->paths_tried & dskidx) {
32621 + /* skip previously selected disks */
32622 + if (paths_selected & dskidx) {
32625 + /* remember if we have active disks */
32626 + if (dsko->flags == S390_DISK_OK) {
32627 + have_actives = TRUE;
32629 + /* look for disk with smallest
32630 + * pending IO count.
32632 + if (selected_dsko) {
32633 + if (atomic_read(&dsko->pending_ios)
32636 + (&selected_dsko->pending_ios))) {
32640 + selected_dsko = dsko;
32643 + dskoln = &(*dskoln)->next;
32646 + /* if we have unselected active paths
32647 + * and the currently selected path is
32648 + * failed, increment its skipped io count,
32649 + * and then go back to find an active path.
32651 + * this loop is structured this way so that
32652 + * we can accurately determine and track when
32653 + * a path has been skipped.
32655 + if (have_actives && selected_dsko) {
32656 + if (selected_dsko->flags & S390_DISK_FAILED) {
32657 + atomic_inc(&selected_dsko->skipped_ios);
32658 + paths_selected |= path;
32659 + selected_dsko = NULL;
32660 + goto s390_repeat_active_search;
32665 + /* if we have a selected path, perform the necessary
32666 + * bookkeeping on it.
32668 + if (selected_dsko) {
32669 + atomic_set(&selected_dsko->skipped_ios, 0);
32670 + atomic_inc(&selected_dsko->pending_ios);
32671 + atomic_inc(&selected_dsko->total_ios);
32672 + iot->paths_tried |= path;
32674 + /* store the selected path (disk object) in the
32675 + * IO tracking structure, for examination by the
32678 + iot->dsko = selected_dsko;
32682 +s390_end_io_callback(void *private,
32683 + struct buffer_head *bh, int uptodate, int *done)
32685 + struct s390_io *iot;
32690 + /* update the disk object's status */
32691 +// spin_lock_irqsave(iot->devo->device_object_lock, flags);
32692 + atomic_dec(&iot->dsko->pending_ios);
32693 + iot->dsko->flags = !uptodate;
32694 +// spin_unlock_irqrestore(iot->devo->device_object_lock, flags);
32697 + atomic_inc(&iot->dsko->failed_ios);
32698 + /* encountered error */
32700 + /* is this a multipath device? */
32701 + if (iot->devo->total_paths > 1) {
32702 + /* yes, its a multipath device */
32704 + /* determine alternate path */
32705 + s390_load_balance(&iot, NULL);
32707 + /* queue up redrive request */
32708 + spin_lock_irqsave(&s390_redrive_list_lock,
32710 + iot->next = s390_redrive_list;
32711 + s390_redrive_list = iot;
32712 + spin_unlock_irqrestore(&s390_redrive_list_lock,
32714 + /* wake up redrive daemon */
32715 + evms_cs_wakeup_thread(s390_io_redrive_thread);
32717 + /* prevent the end_io to caller of EVMS */
32722 + if (*done == FALSE) {
32723 + evms_cs_deallocate_to_pool(s390_io_track_pool, iot);
32727 +/****************************************************
32728 +* Function: s390iod
32730 +* This is a kernel thread that handles read/write of mirrors
32731 +* This shouldn't ever run on a non-mirrored LV read/write
32734 +*****************************************************/
32736 +s390iod(void *data)
32738 + struct s390_io *iot;
32739 + unsigned long flags;
32743 + spin_lock_irqsave(&s390_redrive_list_lock, flags);
32744 + if (s390_redrive_list == NULL) {
32745 + spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
32748 + iot = s390_redrive_list;
32749 + s390_redrive_list = iot->next;
32750 + iot->next = NULL;
32751 + spin_unlock_irqrestore(&s390_redrive_list_lock, flags);
32753 + /* register for callback */
32754 + rc = evms_cs_register_for_end_io_notification(iot, iot->bh,
32755 + s390_end_io_callback);
32758 + ("error(%d): unable to register for end io callback!\n",
32762 + if (!iot->rw_flag) {
32763 + R_IO(iot->dsko->disk, iot->bh);
32765 + W_IO(iot->dsko->disk, iot->bh);
32772 + * Function: s390_process_segment
32775 +s390_process_segment(struct evms_logical_node **discover_list,
32776 + struct evms_logical_node *node,
32777 + unsigned char *label,
32779 + u64 nr_sects, unsigned char type, int part_num)
32781 + struct local_instance_data *InstData = NULL;
32782 + struct evms_logical_node *segment;
32785 + segment = find_segment_on_disk(node, start_sect, nr_sects);
32787 + LOG_DETAILS("exporting segment '%s'.\n", segment->name);
32789 + InstData = kmalloc(sizeof (*InstData), GFP_KERNEL);
32791 + memset(InstData, 0, sizeof (*InstData));
32792 + InstData->source_disk = node;
32793 + InstData->start_sect = start_sect;
32794 + InstData->nr_sects = nr_sects;
32795 + InstData->type = type;
32796 + rc = evms_cs_allocate_logical_node(&segment);
32802 + segment->plugin = &plugin_header;
32803 + segment->system_id = (unsigned int) type;
32804 + segment->total_vsectors = nr_sects;
32805 + segment->block_size = node->block_size;
32806 + segment->hardsector_size = node->hardsector_size;
32807 + segment->private = InstData;
32808 + segment->flags = node->flags;
32809 + strcpy(segment->name, node->name);
32810 + sprintf(segment->name + strlen(segment->name), "%d",
32812 + LOG_DETAILS("creating segment '%s'.\n", segment->name);
32813 + rc = add_segment_to_disk(node, label, segment);
32816 + ("%s: error(%d) adding segment '%s'!\n",
32817 + __FUNCTION__, rc, segment->name);
32820 + MOD_INC_USE_COUNT;
32827 + evms_cs_deallocate_logical_node(segment);
32831 + evms_cs_add_logical_node_to_list(discover_list, segment);
32832 + exported_nodes++;
32838 + ibm_partition_lnx1 = 0,
32839 + ibm_partition_vol1 = 1,
32840 + ibm_partition_cms1 = 2,
32841 + ibm_partition_none = 3
32842 +} ibm_partition_t;
32844 +static char *part_names[] = {
32845 + [ibm_partition_lnx1] = "LNX1",
32846 + [ibm_partition_vol1] = "VOL1",
32847 + [ibm_partition_cms1] = "CMS1",
32848 + [ibm_partition_none] = "(nonl)"
32851 +static ibm_partition_t
32852 +get_partition_type(char *type)
32855 + for (i = 0; i < 3; i++) {
32856 + if (!strncmp(type, part_names[i], 4))
32863 + * compute the block number from a
32864 + * cyl-cyl-head-head structure
32867 +cchh2blk(cchh_t * ptr, struct hd_geometry *geo)
32869 + return ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors;
32873 + * compute the block number from a
32874 + * cyl-cyl-head-head-block structure
32877 +cchhb2blk(cchhb_t * ptr, struct hd_geometry *geo)
32881 + block = ptr->cc * geo->heads * geo->sectors + ptr->hh * geo->sectors;
32883 + block += ptr->b - 1;
32890 +print_mem(void *buffer, int length)
32893 + unsigned char *bufptr;
32895 + bufptr = (unsigned char *) buffer;
32898 + if ((i % 16) == 0)
32899 + printk(KERN_INFO "\n0x%p->", buffer + i);
32900 + printk(KERN_INFO "%02x ", bufptr[i]);
32901 + if (++i >= length)
32904 + printk(KERN_INFO "\n");
32908 +s390_probe_multipath(struct evms_logical_node *disk,
32909 + unsigned char *label,
32910 + u64 label_lba, int label_offset, unsigned char *org_buf)
32913 + struct evms_list_node **devoln;
32914 + unsigned char *sector_buf = NULL;
32916 + LOG_ENTRY_EXIT("%s: Entry\n", __FUNCTION__);
32917 + /* check if this disk is already known.
32918 + * if it is already in our device list
32919 + * then we don't need to check for
32920 + * multipath associations.
32922 + devoln = lookup_device_object(disk);
32923 + /* is this disk in our list? */
32925 + struct device_object *devo;
32926 + struct disk_object *dsko;
32927 + /* yes, disk already known */
32929 + /* we need to determine if this
32930 + * is our first path to this
32933 + devo = (struct device_object *) (*devoln)->item;
32934 + /* if this is the first path to this
32935 + * device, return FALSE so the main
32936 + * routine will process its segments.
32937 + * if this is not the first path,
32938 + * return TRUE so the main routine
32939 + * will not process its segments.
32941 + dsko = (struct disk_object *) devo->disk_object_list->item;
32942 + if (dsko->disk != disk) {
32945 + /* only print multipath log msgs if its
32946 + * active on this device.
32948 + if (devo->total_paths > 1) {
32950 + ("skipping probe of known multipath device '%s'.\n",
32953 + LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc);
32957 + /* search device object list for a matching label */
32959 + while (*(devoln = lookup_label(label, devoln))) {
32960 + struct device_object *devo;
32961 + struct disk_object *dsko;
32962 + unsigned char org_label[6];
32963 +#define S390_TEST_LABEL "~!@#$"
32965 + /* yes, found matching label */
32966 + if (!sector_buf) {
32967 + /* allocate buffer for incoming label sector */
32968 + sector_buf = kmalloc(disk->hardsector_size, GFP_KERNEL);
32969 + if (!sector_buf) {
32975 + /* save original label */
32976 + memcpy(org_label, org_buf + label_offset, 6);
32977 + /* alter label to test pattern */
32978 + strcpy(org_buf + label_offset, S390_TEST_LABEL);
32979 + /* write test pattern to this disk */
32980 + LOG_DEBUG("writing test label to '%s'.\n", disk->name);
32981 + rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
32983 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
32984 + rc, label_lba, disk->name);
32988 + /* read label from device object with matching label */
32989 + devo = (struct device_object *) (*devoln)->item;
32990 + dsko = (struct disk_object *) devo->disk_object_list->item;
32991 + LOG_DEBUG("reading label from '%s'.\n", dsko->disk->name);
32992 + rc = INIT_IO(dsko->disk, READ, label_lba, 1, sector_buf);
32994 + LOG_ERROR("error(%d) writing sector("PFU64") to '%s'.\n",
32995 + rc, label_lba, dsko->disk->name);
32998 + /* restore original label */
32999 + memcpy(org_buf + label_offset, org_label, 6);
33000 + LOG_DEBUG("restoring original label to '%s'.\n", disk->name);
33001 + rc = INIT_IO(disk, WRITE, label_lba, 1, org_buf);
33003 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
33004 + rc, label_lba, disk->name);
33008 + LOG_DEBUG("checking label: %s(%6s), reference(%6s).\n",
33009 + dsko->disk->name,
33010 + sector_buf + label_offset, S390_TEST_LABEL);
33011 + if (!strcmp(sector_buf + label_offset, S390_TEST_LABEL)) {
33012 + LOG_DETAILS("assigning '%s' as first path to device.\n",
33013 + dsko->disk->name);
33014 + LOG_DETAILS("assigning '%s' as next path to device.\n",
33016 + /* store this disk in the disk object's
33019 + /* create a disk object */
33022 + dsko = kmalloc(sizeof (*dsko), GFP_KERNEL);
33028 + memset(dsko, 0, sizeof (*dsko));
33029 + /* add disk to disk object */
33030 + dsko->disk = disk;
33031 + /* add disk object to disk object list
33032 + * in device object */
33033 + rc = evms_cs_add_item_to_list(&devo->
33034 + disk_object_list,
33038 + devo->total_paths++;
33046 + /* indicate we found a multipath device */
33051 + if (sector_buf) {
33052 + kfree(sector_buf);
33055 + LOG_ENTRY_EXIT("%s: Exit RC(%d)\n", __FUNCTION__, rc);
33060 +s390_probe_for_segments(struct evms_logical_node **discover_list,
33061 + struct evms_logical_node *disk)
33063 + char type[5] = { 0, }, name[7] = {
33065 + int rc, vsects_per_hardsect = 0;
33066 + unsigned int blk;
33067 + u64 io_start, label_lba = 3;
33068 + dasd_information_t *info = NULL;
33069 + struct hd_geometry *geo = NULL;
33070 + unchar *data = NULL;
33072 + /* allocate space for DASD ioctl packet
33074 + info = kmalloc(sizeof (dasd_information_t), GFP_KERNEL);
33076 + memset(info, 0, sizeof (dasd_information_t));
33077 + LOG_DEBUG("probing '%s' for 390 DASD info...\n", disk->name);
33078 + /* issue DASD info ioctl
33080 + rc = evms_cs_kernel_ioctl(disk, BIODASDINFO,
33081 + (unsigned long) info);
33083 + LOG_DEBUG("error(%d) from BIODASDINFO ioctl.\n", rc);
33084 + LOG_DEBUG("assuming '%s' is not a valid 390 device!\n",
33092 + /* if we successfully completed the previous
33093 + * get DASD info ioctl, we will assume that
33094 + * the device is a valid 390 disk.
33096 + * remove it from the discover list.
33098 + rc = evms_cs_remove_logical_node_from_list(discover_list, disk);
33101 + ("error(%d) removing disk(%s) from discover list.\n",
33106 + /* allocate space for the geometry packet
33108 + geo = kmalloc(sizeof (struct hd_geometry), GFP_KERNEL);
33114 + memset(geo, 0, sizeof (struct hd_geometry));
33115 + /* issue the Get GEO ioctl
33117 + rc = evms_cs_kernel_ioctl(disk, HDIO_GETGEO,
33118 + (unsigned long) geo);
33120 + LOG_ERROR("error(%d) from HDIO_GETGEO ioctl.\n", rc);
33124 + /* retrieve the vsects_per_hardsect (hardsector size)
33126 + vsects_per_hardsect = disk->hardsector_size;
33127 + vsects_per_hardsect >>= EVMS_VSECTOR_SIZE_SHIFT;
33128 + data = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
33134 + /* go read the 1st block on the disk
33136 + label_lba = info->label_block * vsects_per_hardsect;
33137 + io_start = label_lba;
33138 + rc = INIT_IO(disk, READ, io_start, 1, data);
33140 + LOG_ERROR("error(%d) reading sector("PFU64") from '%s'.\n",
33141 + rc, io_start, disk->name);
33143 +// print_mem(data, EVMS_VSECTOR_SIZE);
33147 + int offset, size, psize, counter = 0, label_offset;
33148 + int vstart = 0, vend = 0;
33149 + int vtoc_record_count, vtoc_index;
33150 + format1_label_t f1;
33151 + format4_label_t *f4;
33152 + volume_label_t vlabel;
33153 + ibm_partition_t partition_type;
33155 + /* determine the format type
33158 + strncpy(type, data, 4);
33159 + if ((!info->FBA_layout) && (!strcmp(info->type, "ECKD"))) {
33160 + label_offset = 8;
33162 + label_offset = 4;
33164 + strncpy(name, data + label_offset, 6);
33165 + memcpy(&vlabel, data, sizeof (volume_label_t));
33169 + partition_type = get_partition_type(type);
33170 + LOG_DETAILS("disk: raw type(%s), type(%s), name(%s)\n",
33171 + type, part_names[partition_type], name);
33173 + rc = s390_probe_multipath(disk, name, label_lba, label_offset,
33176 + switch (partition_type) {
33177 + case ibm_partition_cms1:
33178 + if (*((long *) data + 13) != 0) {
33179 + /* disk is reserved minidisk */
33180 + long *label = (long *) data;
33181 + vsects_per_hardsect =
33182 + label[3] >> EVMS_VSECTOR_SIZE_SHIFT;
33183 + offset = label[13];
33186 + 1) * vsects_per_hardsect;
33187 + LOG_DEBUG("(MDSK)");
33189 + offset = info->label_block + 1;
33190 + size = disk->total_vsectors;
33192 + offset *= vsects_per_hardsect;
33193 + /* adjust for 0 thru label block offset
33196 + rc = s390_process_segment(discover_list,
33199 + offset, size, 0, 1);
33201 + case ibm_partition_lnx1:
33202 + case ibm_partition_none:
33203 + offset = info->label_block + 1;
33204 + offset *= vsects_per_hardsect;
33205 + size = disk->total_vsectors;
33206 + /* adjust for 0 thru label block offset
33209 + rc = s390_process_segment(discover_list,
33212 + offset, size, 0, 1);
33214 + case ibm_partition_vol1:
33215 + /* set max dscb record count == single track till we see the vtoc descriptor */
33216 + vtoc_record_count = geo->sectors;
33217 + /* set current index into vtoc */
33219 + /* get block number and read then first dscb */
33220 + blk = cchhb2blk(&vlabel.vtoc, geo);
33221 + io_start = blk * vsects_per_hardsect;
33222 + rc = INIT_IO(disk, READ, io_start, 1, data);
33225 + ("error(%d) reading sector("PFU64") from '%s'.\n",
33226 + rc, io_start, disk->name);
33229 + // print_mem(data, EVMS_VSECTOR_SIZE);
33231 + memcpy(&f1, data, sizeof (format1_label_t));
33233 + // read vtoc records ... terminate when :
33234 + // (1) we hit first NULL record
33235 + // (2) we get an error processing a vtoc record
33236 + // (3) we run out of vtoc records to process
33237 + while (f1.DS1FMTID != 0x00 && rc == 0
33238 + && vtoc_index < vtoc_record_count) {
33239 + if (f1.DS1FMTID == 0xf4) { // vtoc descriptor
33240 + f4 = (format4_label_t *) data;
33242 + cchh2blk(&f4->DS4VTOCE.
33245 + cchh2blk(&f4->DS4VTOCE.
33247 + vtoc_record_count =
33248 + (vend - vstart) +
33250 + } else if (f1.DS1FMTID == 0xf1) { // dataset descriptor
33253 + cchh2blk(&f1.DS1EXT1.llimit,
33256 + cchh2blk(&f1.DS1EXT1.ulimit,
33261 + rc = s390_process_segment
33262 + (discover_list, disk, name,
33264 + vsects_per_hardsect,
33266 + vsects_per_hardsect, 0,
33269 + if (!rc) { // get next dscb
33273 + blk * vsects_per_hardsect;
33274 + rc = INIT_IO(disk, READ,
33275 + io_start, 1, data);
33278 + ("error(%d) reading sector("PFU64") from '%s'.\n",
33283 + // print_mem(data, EVMS_VSECTOR_SIZE);
33285 + memcpy(&f1, data,
33287 + (format1_label_t));
33292 + rc = s390_process_segment(discover_list,
33312 + * Function: s390_partition_discover
33316 +s390_partition_discover(struct evms_logical_node **discover_list)
33319 + struct evms_logical_node *node, *next_node;
33321 + MOD_INC_USE_COUNT;
33322 + LOG_ENTRY_EXIT("%s: ENTRY\n", __FUNCTION__);
33324 + /* initialize global variable */
33325 + exported_nodes = 0;
33327 + /* examine each node on the discover list */
33328 + next_node = *discover_list;
33329 + while (next_node) {
33330 + node = next_node;
33331 + next_node = node->next;
33332 + if (GetPluginType(node->plugin->id) != EVMS_DEVICE_MANAGER)
33333 + /* only process disk nodes
33336 + if (node->iflags & EVMS_TOP_SEGMENT)
33338 + s390_probe_for_segments(discover_list, node);
33341 + LOG_ENTRY_EXIT("%s: EXIT(exported nodes:%d, error code:%d)\n",
33342 + __FUNCTION__, exported_nodes, rc);
33343 + if (exported_nodes)
33344 + rc = exported_nodes;
33345 + MOD_DEC_USE_COUNT;
33350 + * Function: s390_partition_delete
33354 +s390_partition_delete(struct evms_logical_node *segment)
33357 + struct local_instance_data *LID;
33359 + LOG_DETAILS("deleting segment '%s'.\n", segment->name);
33364 + struct evms_list_node *empty_disk_object_list = NULL;
33365 + LID = segment->private;
33367 + /* remove the segment from the
33368 + * disk's segment list
33370 + rc = remove_segment_from_disk(LID->source_disk,
33372 + &empty_disk_object_list);
33373 + /* free the local instance data */
33376 + /* free the segment node */
33377 + evms_cs_deallocate_logical_node(segment);
33378 + MOD_DEC_USE_COUNT;
33379 + /* if the last segment on the disk was
33380 + * deleted, delete the disk node(s) too
33382 + while (empty_disk_object_list) {
33383 + struct disk_object *dsko;
33385 + (struct disk_object *) empty_disk_object_list->item;
33386 + rc = evms_cs_remove_item_from_list
33387 + (&empty_disk_object_list, dsko);
33389 + rc = DELETE(dsko->disk);
33392 + ("error(%d): attempting to delete '%s'.\n",
33393 + rc, dsko->disk->name);
33404 + * function: s390_partition_io_error
33406 + * this function was primarily created because the function
33407 + * buffer_IO_error is inline and kgdb doesn't allow breakpoints
33408 + * to be set on inline functions. Since this was an error path
33409 + * and not mainline, I decided to add a trace statement to help
33410 + * report on the failing condition.
33414 +s390_partition_io_error(int rc,
33415 + struct evms_logical_node *node,
33416 + int io_flag, struct buffer_head *bh)
33421 + ("attempt to %s beyond partition boundary("PFU64") on (%s), rsector(%ld).\n",
33422 + (io_flag) ? "WRITE" : "READ", node->total_vsectors - 1,
33423 + node->name, bh->b_rsector);
33427 + ("%s error(no active paths) on '%s' to drive the I/O.\n",
33428 + (io_flag) ? "WRITE" : "READ", node->name);
33432 + bh->b_end_io(bh, 0);
33436 + * Function: s390_partition_read
33440 +s390_partition_read(struct evms_logical_node *partition, struct buffer_head *bh)
33442 + struct local_instance_data *LID = partition->private;
33443 + struct s390_io *iot = NULL;
33446 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
33447 + partition->total_vsectors) {
33448 + bh->b_rsector += LID->start_sect;
33450 + s390_load_balance(&iot, LID->source_disk);
33452 + iot->segment = partition;
33454 + iot->rw_flag = READ;
33455 + /* register the callback */
33456 + evms_cs_register_for_end_io_notification(iot, bh,
33457 + s390_end_io_callback);
33458 + /* drive the IO */
33459 + R_IO(iot->dsko->disk, bh);
33465 + s390_partition_io_error(rc, partition, READ, bh);
33469 + * Function: s390_partition_write
33473 +s390_partition_write(struct evms_logical_node *partition,
33474 + struct buffer_head *bh)
33476 + struct local_instance_data *LID = partition->private;
33477 + struct s390_io *iot = NULL;
33480 + if (bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) <=
33481 + partition->total_vsectors) {
33482 + bh->b_rsector += LID->start_sect;
33484 + s390_load_balance(&iot, LID->source_disk);
33486 + iot->segment = partition;
33488 + iot->rw_flag = WRITE;
33489 + /* register the callback */
33490 + evms_cs_register_for_end_io_notification(iot, bh,
33491 + s390_end_io_callback);
33492 + /* drive the IO */
33493 + W_IO(iot->dsko->disk, bh);
33499 + s390_partition_io_error(rc, partition, WRITE, bh);
33503 + * Function: s390_partition_init_io
33507 +s390_partition_init_io(struct evms_logical_node *partition, int io_flag, /* 0=read, 1=write */
33508 + u64 sect_nr, /* disk LBA */
33509 + u64 num_sects, /* # of sectors */
33511 +{ /* buffer address */
33513 + struct local_instance_data *LID = partition->private;
33514 + struct s390_io *iot = NULL;
33516 + if ((sect_nr + num_sects) <= partition->total_vsectors) {
33518 + s390_load_balance(&iot, LID->source_disk);
33519 + if (!iot->dsko) {
33523 + rc = INIT_IO(iot->dsko->disk, io_flag,
33524 + sect_nr + LID->start_sect, num_sects,
33526 + /* do disk object IO bookkeeping */
33527 + atomic_dec(&iot->dsko->pending_ios);
33528 + if (rc == -EIO) {
33529 + atomic_inc(&iot->dsko->failed_ios);
33530 + iot->dsko->flags = S390_DISK_FAILED;
33532 + iot->dsko->flags = S390_DISK_OK;
33534 + } while (rc == -EIO);
33535 + evms_cs_deallocate_to_pool(s390_io_track_pool, iot);
33538 + ("init_io: attempt to %s beyond partition(%s) boundary("PFU64") at sector("PFU64") for count("PFU64").\n",
33539 + (io_flag) ? "WRITE" : "READ", partition->name,
33540 + (LID->nr_sects - 1), sect_nr, num_sects);
33548 + * Function: s390_partition_ioctl
33552 +s390_partition_ioctl(struct evms_logical_node *partition,
33553 + struct inode *inode,
33554 + struct file *file, unsigned int cmd, unsigned long arg)
33556 + struct local_instance_data *LID;
33557 + struct hd_geometry hd_geo;
33561 + LID = partition->private;
33565 + case HDIO_GETGEO:
33567 + rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
33570 + if (copy_from_user
33571 + (&hd_geo, (void *) arg,
33572 + sizeof (struct hd_geometry)))
33576 + hd_geo.start = LID->start_sect;
33578 + ((void *) arg, &hd_geo,
33579 + sizeof (struct hd_geometry)))
33583 + case EVMS_GET_BMAP:
33585 + struct evms_get_bmap_pkt *bmap =
33586 + (struct evms_get_bmap_pkt *) arg;
33587 + bmap->rsector += LID->start_sect;
33588 + /* intentionally fall thru to
33589 + * default ioctl down to device
33594 + rc = IOCTL(LID->source_disk, inode, file, cmd, arg);
33600 + * Function: s390_part_init
33604 +s390_part_init(void)
33606 + const char *name = "evms_s390iod";
33608 + /* create s390 IODaemon thread */
33609 + s390_io_redrive_thread = evms_cs_register_thread(s390iod, NULL, name);
33610 + /* create pool of IO tracking structures */
33611 + s390_io_track_pool =
33612 + evms_cs_create_pool(sizeof (struct s390_io), "EVMS_s390_IO", NULL,
33615 + return evms_cs_register_plugin(&plugin_header); /* register with EVMS */
33618 +static void __exit
33619 +s390_part_exit(void)
33621 + evms_cs_unregister_plugin(&plugin_header);
33624 +module_init(s390_part_init);
33625 +module_exit(s390_part_exit);
33626 +#ifdef MODULE_LICENSE
33627 +MODULE_LICENSE("GPL");
33629 diff -Naur linux-2002-09-30/drivers/evms/snapshot.c evms-2002-09-30/drivers/evms/snapshot.c
33630 --- linux-2002-09-30/drivers/evms/snapshot.c Wed Dec 31 18:00:00 1969
33631 +++ evms-2002-09-30/drivers/evms/snapshot.c Wed Sep 25 16:53:00 2002
33633 +/* -*- linux-c -*- */
33635 + * Copyright (c) International Business Machines Corp., 2000
33637 + * This program is free software; you can redistribute it and/or modify
33638 + * it under the terms of the GNU General Public License as published by
33639 + * the Free Software Foundation; either version 2 of the License, or
33640 + * (at your option) any later version.
33642 + * This program is distributed in the hope that it will be useful,
33643 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
33644 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
33645 + * the GNU General Public License for more details.
33647 + * You should have received a copy of the GNU General Public License
33648 + * along with this program; if not, write to the Free Software
33649 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
33652 + * linux/drivers/evms/snapshot.c
33654 + * EVMS SnapShot Feature.
33656 + * This feature provides the ability to Snapshot ANY existing EVMS volume
33657 + * (including compatibility) to a new EVMS volume that is created when the
33658 + * SnapShot is enabled. This feature will appear in the call stack for both
33659 + * the original and the snapshot volume.
33662 +#define LOG_PREFIX "snapshot: "
33664 +#include <linux/kernel.h>
33665 +#include <linux/module.h>
33666 +#include <linux/compiler.h>
33667 +#include <linux/mempool.h>
33668 +#include <linux/version.h>
33669 +#include <linux/vmalloc.h>
33670 +#include <asm/uaccess.h>
33671 +#include <linux/evms/evms.h>
33672 +#include <linux/evms/evms_snapshot.h>
33674 +static struct proc_dir_entry * snap_proc = NULL;
33675 +static unsigned int snapshot_count = 0; /* Number of active snapshots and originals. */
33677 +/* Memory pools. */
33678 +static kmem_cache_t * snap_page_slab = NULL;
33679 +static mempool_t * snap_page_pool = NULL;
33680 +static kmem_cache_t * snap_buffer_slab = NULL;
33681 +static mempool_t * snap_buffer_pool = NULL;
33682 +static kmem_cache_t * snap_async_org_io_slab = NULL;
33683 +static mempool_t * snap_async_org_io_pool = NULL;
33684 +static kmem_cache_t * snap_async_snap_io_slab = NULL;
33685 +static mempool_t * snap_async_snap_io_pool = NULL;
33686 +static kmem_cache_t * snap_hash_entry_slab = NULL;
33687 +static mempool_t * snap_hash_entry_pool = NULL;
33689 +#ifdef SNAPSHOT_DEBUG
33690 +static struct async_org_io * debug_async_org_io_list = NULL;
33691 +static spinlock_t debug_async_org_io_list_lock = SPIN_LOCK_UNLOCKED;
33694 +/* API prototypes */
33695 +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list);
33696 +static int snap_delete_volume(struct evms_logical_node * node);
33697 +static void snap_read(struct evms_logical_node * node,
33698 + struct buffer_head * bh);
33699 +static void snap_write(struct evms_logical_node * node,
33700 + struct buffer_head * bh);
33701 +static int snap_init_io(struct evms_logical_node * node, int rw,
33702 + u64 sect_nr, u64 num_sects, void * buf_addr);
33703 +static int snap_ioctl(struct evms_logical_node * node,
33704 + struct inode * inode, struct file * file,
33705 + unsigned int cmd, unsigned long arg);
33707 +/* Other functions that require prototypes. */
33708 +static int add_snapshot(struct evms_logical_node * node,
33709 + struct snapshot_metadata * metadata,
33710 + struct evms_logical_node ** evms_node_list);
33711 +static int snap_proc_read(char * page, char ** start, off_t off,
33712 + int count, int * eof, void * data);
33713 +static void snapshot_do_rollback(void * volume);
33714 +static void snap_async_io_thread(void * volume);
33715 +void snap_read_chunk_cb(struct buffer_head * bh, int uptodate);
33716 +void snap_write_chunk_cb(struct buffer_head * bh, int uptodate);
33717 +void snap_cow_table_cb(struct buffer_head * bh, int uptodate);
33719 +/* Snapshot plugin's function table and header. */
33720 +static struct evms_plugin_fops function_table = {
33721 + .discover = snap_discover_volumes,
33722 + .delete = snap_delete_volume,
33723 + .read = snap_read,
33724 + .write = snap_write,
33725 + .init_io = snap_init_io,
33726 + .ioctl = snap_ioctl
33729 +static struct evms_plugin_header plugin_header = {
33730 + .id = SetPluginID(IBM_OEM_ID,
33731 + EVMS_ASSOCIATIVE_FEATURE,
33732 + EVMS_SNAPSHOT_FEATURE_ID),
33734 + .major = EVMS_SNAPSHOT_VERSION_MAJOR,
33735 + .minor = EVMS_SNAPSHOT_VERSION_MINOR,
33736 + .patchlevel = EVMS_SNAPSHOT_VERSION_PATCHLEVEL
33738 + .required_services_version = {
33739 + .major = EVMS_COMMON_SERVICES_MAJOR,
33740 + .minor = EVMS_COMMON_SERVICES_MINOR,
33741 + .patchlevel = EVMS_COMMON_SERVICES_PATCHLEVEL
33743 + .fops = &function_table
33747 + * convert_metadata - Perform endian conversion on a metadata sector.
33748 + * @metadata: snapshot metadata sector
33750 +static void convert_metadata(struct snapshot_metadata * metadata)
33752 + metadata->signature = le32_to_cpup(&metadata->signature);
33753 + metadata->CRC = le32_to_cpup(&metadata->CRC);
33754 + metadata->version.major = le32_to_cpup(&metadata->version.major);
33755 + metadata->version.minor = le32_to_cpup(&metadata->version.minor);
33756 + metadata->version.patchlevel = le32_to_cpup(&metadata->version.patchlevel);
33757 + metadata->flags = le32_to_cpup(&metadata->flags);
33758 + metadata->original_size = le64_to_cpup(&metadata->original_size);
33759 + metadata->lba_of_COW_table = le64_to_cpup(&metadata->lba_of_COW_table);
33760 + metadata->lba_of_first_chunk = le64_to_cpup(&metadata->lba_of_first_chunk);
33761 + metadata->chunk_size = le32_to_cpup(&metadata->chunk_size);
33762 + metadata->total_chunks = le32_to_cpup(&metadata->total_chunks);
33765 +static void *slab_pool_alloc(int gfp_mask, void * data)
33767 + return kmem_cache_alloc(data, gfp_mask);
33770 +static void slab_pool_free(void * ptr, void * data)
33772 + kmem_cache_free(data, ptr);
33776 + * allocate_snapshot_hash_entry
33777 + * @volume: Snapshot volume to get a new entry for.
33778 + * @org_chunk: Number of original chunk.
33779 + * @snap_chunk: Number of remap chunk.
33780 + * @chunk_state: see SNAP_CHUNK_*
33782 + * Get a snapshot_hash_entry from the pool and initialize. Accessing the
33783 + * free_hash_list is safe, since we only call this function while holding
33784 + * the snap_semaphore.
33786 +static struct snapshot_hash_entry *
33787 +allocate_snapshot_hash_entry(struct snapshot_volume * volume,
33792 + struct snapshot_hash_entry * hash_entry;
33794 + hash_entry = volume->free_hash_list;
33795 + if (hash_entry) {
33796 + volume->free_hash_list = hash_entry->next;
33797 + hash_entry->org_chunk = org_chunk;
33798 + hash_entry->snap_chunk = snap_chunk;
33799 + hash_entry->chunk_state = chunk_state;
33800 + hash_entry->snap_io = NULL;
33801 + hash_entry->next = NULL;
33802 + hash_entry->prev = NULL;
33803 + spin_lock_init(&hash_entry->chunk_state_lock);
33805 + /* Should never happen, since hash entries are max
33806 + * allocated at discovery time.
33811 + return hash_entry;
33815 + * insert_snapshot_hash_entry
33817 + * Insert a new entry into a snapshot hash chain, immediately following the
33818 + * specified entry. This function should not be used to add an entry into an
33819 + * empty list, or as the first entry in an existing list. For that case, use
33820 + * insert_snapshot_map_entry_at_head().
33822 +static int insert_snapshot_hash_entry(struct snapshot_hash_entry * entry,
33823 + struct snapshot_hash_entry * base)
33825 + entry->next = base->next;
33826 + entry->prev = base;
33827 + base->next = entry;
33828 + if ( entry->next ) {
33829 + entry->next->prev = entry;
33835 + * insert_snapshot_hash_entry_at_head
33837 + * Insert a new entry into a snapshot chain as the first entry in the chain.
33839 +static int insert_snapshot_hash_entry_at_head(struct snapshot_hash_entry * entry,
33840 + struct snapshot_hash_entry ** head)
33842 + entry->next = *head;
33843 + entry->prev = NULL;
33845 + if ( entry->next ) {
33846 + entry->next->prev = entry;
33852 + * set_snapshot_flags
33854 + * @set_flag: Flags to turn "on" in the metadata sector.
33855 + * @unset_flag: Flags to turn "off" in the metadata sector.
33857 + * Set the flags field in the metadata and write the metadata sector to
33858 + * the snapshot volume. The node passed in to this function should be the
33859 + * "lower" of the snapshot nodes, meaning the one consumed by the snapshot
33860 + * plugin, not the one exported from the plugin.
33862 + * Appropriate values for the two flag parameters are:
33863 + * EVMS_SNAPSHOT_DISABLED
33864 + * EVMS_SNAPSHOT_FULL
33865 + * EVMS_SNAPSHOT_ROLLBACK
33866 + * EVMS_SNAPSHOT_ROLLBACK_COMP
33868 +static int set_snapshot_flags(struct evms_logical_node * snap_node,
33872 + unsigned char data[EVMS_VSECTOR_SIZE] = {0};
33873 + struct snapshot_metadata * metadata = (struct snapshot_metadata*)data;
33875 + /* Read the metadata sector */
33876 + if ( INIT_IO(snap_node, 0, snap_node->total_vsectors-3, 1, data) ) {
33880 + /* Set the appropriate flags. Do endian conversion on the fly. */
33881 + metadata->flags |= cpu_to_le32p(&set_flag);
33882 + metadata->flags &= ~(cpu_to_le32p(&unset_flag));
33883 + metadata->CRC = 0;
33884 + metadata->CRC = cpu_to_le32(evms_cs_calculate_crc(EVMS_INITIAL_CRC,
33886 + sizeof(struct snapshot_metadata)));
33888 + /* Write the metadata sector back to the volume. */
33889 + if ( INIT_IO(snap_node, 1, snap_node->total_vsectors-3, 1, data) ) {
33897 + * disable_snapshot
33899 +static void disable_snapshot(struct snapshot_volume * snap_volume,
33900 + int update_metadata)
33902 + LOG_ERROR("Disabling snapshot volume '%s'.\n",
33903 + snap_volume->exported_node->name);
33904 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED;
33905 + if ( update_metadata ) {
33906 + set_snapshot_flags(snap_volume->logical_node,
33907 + EVMS_SNAPSHOT_DISABLED, 0);
33909 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED_PENDING;
33910 + evms_cs_wakeup_thread(snap_volume->snapshot_org->async_io_thread);
33915 + * snap_discover_volumes
33917 + * Inspect the global node list, looking for volumes with a valid
33918 + * snapshot metadata sector.
33920 +static int snap_discover_volumes(struct evms_logical_node ** evms_node_list)
33922 + struct evms_logical_node * node, * next_node;
33923 + struct snapshot_metadata * metadata = NULL;
33924 + int org_crc, final_crc, rc = 0;
33926 + MOD_INC_USE_COUNT;
33928 + /* A buffer for reading the metadata. */
33929 + metadata = kmalloc(EVMS_VSECTOR_SIZE, GFP_KERNEL);
33931 + MOD_DEC_USE_COUNT;
33935 + /* Check every node on the discovery list. */
33936 + for ( node = *evms_node_list; node && !rc; node = next_node ) {
33937 + next_node = node->next;
33939 + /* This node must not be one we put back on the list already,
33940 + * and must have a feature header with snapshot's ID.
33942 + if ( node->plugin->id == plugin_header.id ||
33943 + ! node->feature_header ||
33944 + node->feature_header->feature_id != plugin_header.id ) {
33948 + /* Read third-to-last sector for the snapshot metadata. */
33949 + rc = INIT_IO(node, 0, node->total_vsectors-3, 1, metadata);
33951 + LOG_ERROR("IO error reading sector "PFU64" on '%s'.\n",
33952 + node->total_vsectors-3, node->name);
33953 + rc = -EVMS_FEATURE_FATAL_ERROR;
33954 + evms_cs_remove_logical_node_from_list(evms_node_list,
33960 + /* Check for a valid snapshot signature. */
33961 + if ( le32_to_cpup(&metadata->signature) !=
33962 + EVMS_SNAPSHOT_SIGNATURE ) {
33965 + evms_cs_remove_logical_node_from_list(evms_node_list, node);
33967 + /* Check for a valid CRC. */
33968 + org_crc = le32_to_cpup(&metadata->CRC);
33969 + metadata->CRC = 0;
33970 + final_crc = evms_cs_calculate_crc(EVMS_INITIAL_CRC, metadata,
33971 + sizeof(struct snapshot_metadata));
33972 + if ( final_crc != org_crc ) {
33973 + LOG_ERROR("CRC error in feature data on '%s'.\n",
33975 + rc = -EVMS_FEATURE_FATAL_ERROR;
33980 + /* Check for correct metadata version. */
33981 + convert_metadata(metadata);
33982 + if ( metadata->version.major > plugin_header.version.major ) {
33983 + LOG_ERROR("ERROR: unsupported metadata version on '%s'.\n",
33985 + rc = -EVMS_FEATURE_FATAL_ERROR;
33990 + rc = add_snapshot(node, metadata, evms_node_list);
33994 + MOD_DEC_USE_COUNT;
34001 + * Make sure an original volume and all of its snapshots are quiesced.
34003 +static int check_quiesce(struct snapshot_volume * org_volume)
34005 + struct snapshot_volume * next_vol;
34007 + for ( next_vol = org_volume;
34009 + next_vol = next_vol->snapshot_next ) {
34010 + if ( ! (next_vol->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34011 + LOG_ERROR("Can't delete snapshot, volume '%s' not quiesced.\n",
34012 + next_vol->logical_node->name);
34020 + * remove_snapshot_from_chain
34022 + * Remove the specified snapshot volume from its original's chain of snapshots.
34024 +static int remove_snapshot_from_chain(struct snapshot_volume * snap_volume)
34026 + struct snapshot_volume ** p_volume;
34028 + if ( snap_volume->snapshot_org ) {
34029 + down_write(&snap_volume->snapshot_org->snap_semaphore);
34030 + for ( p_volume = &snap_volume->snapshot_org->snapshot_next;
34032 + p_volume = &(*p_volume)->snapshot_next ) {
34033 + if ( *p_volume == snap_volume ) {
34034 + *p_volume = (*p_volume)->snapshot_next;
34038 + up_write(&snap_volume->snapshot_org->snap_semaphore);
34040 + snap_volume->snapshot_org = NULL;
34041 + snap_volume->snapshot_next = NULL;
34046 + * delete_snapshot_hash_chain
34048 + * Delete all items in a single chain in the hash table.
34050 +static int delete_snapshot_hash_chain(struct snapshot_hash_entry * head)
34052 + struct snapshot_hash_entry * next;
34055 + next = head->next;
34056 + mempool_free(head, snap_hash_entry_pool);
34063 + * snapshot_delete_pools
34065 + * Delete all memory pools after all snapshots have been deleted.
34066 + * Also shutdown the daemon thread.
34068 +static void snapshot_delete_pools(void)
34070 + /* The pool of data pages. */
34071 + if (snap_page_slab) {
34072 + if (snap_page_pool) {
34073 + mempool_destroy(snap_page_pool);
34074 + snap_page_pool = NULL;
34076 + kmem_cache_destroy(snap_page_slab);
34077 + snap_page_slab = NULL;
34080 + /* The pool of snap_io_buffer's. */
34081 + if (snap_buffer_slab) {
34082 + if (snap_buffer_pool) {
34083 + mempool_destroy(snap_buffer_pool);
34084 + snap_buffer_pool = NULL;
34086 + kmem_cache_destroy(snap_buffer_slab);
34087 + snap_buffer_slab = NULL;
34090 + /* The pool of async_org_io's. */
34091 + if (snap_async_org_io_slab) {
34092 + if (snap_async_org_io_pool) {
34093 + mempool_destroy(snap_async_org_io_pool);
34094 + snap_async_org_io_pool = NULL;
34096 + kmem_cache_destroy(snap_async_org_io_slab);
34097 + snap_async_org_io_slab = NULL;
34100 + /* The pool of async_snap_io's. */
34101 + if (snap_async_snap_io_slab) {
34102 + if (snap_async_snap_io_pool) {
34103 + mempool_destroy(snap_async_snap_io_pool);
34104 + snap_async_snap_io_pool = NULL;
34106 + kmem_cache_destroy(snap_async_snap_io_slab);
34107 + snap_async_snap_io_slab = NULL;
34110 + /* The pool of hash table entries. */
34111 + if (snap_hash_entry_slab) {
34112 + if (snap_hash_entry_pool) {
34113 + mempool_destroy(snap_hash_entry_pool);
34114 + snap_hash_entry_pool = NULL;
34116 + kmem_cache_destroy(snap_hash_entry_slab);
34117 + snap_hash_entry_slab = NULL;
34122 + * snapshot_create_pools
34124 + * Allocate all of the memory pools when the first snapshot is created.
34125 + * Also start up the daemon thread for processing async I/O's.
34127 +static int snapshot_create_pools(void)
34129 + /* Pool of data pages. */
34130 + if (!snap_page_slab) {
34131 + snap_page_slab = kmem_cache_create("snap_page_slab",
34133 + SLAB_HWCACHE_ALIGN,
34135 + if (snap_page_slab) {
34136 + snap_page_pool = mempool_create(1, slab_pool_alloc,
34142 + /* Pool of snap_io_buffer's. */
34143 + if (!snap_buffer_slab) {
34144 + snap_buffer_slab = kmem_cache_create("snap_bh_slab",
34145 + sizeof(struct snap_io_buffer),
34146 + 0, SLAB_HWCACHE_ALIGN,
34148 + if (snap_buffer_slab) {
34149 + snap_buffer_pool = mempool_create(1, slab_pool_alloc,
34151 + snap_buffer_slab);
34155 + /* Pool of async_org_io's. */
34156 + if (!snap_async_org_io_slab) {
34157 + snap_async_org_io_slab = kmem_cache_create("async_org_io_slab",
34158 + sizeof(struct async_org_io),
34159 + 0, SLAB_HWCACHE_ALIGN,
34161 + if (snap_async_org_io_slab) {
34162 + snap_async_org_io_pool = mempool_create(1, slab_pool_alloc,
34164 + snap_async_org_io_slab);
34168 + /* Pool of async_snap_io's. */
34169 + if (!snap_async_snap_io_slab) {
34170 + snap_async_snap_io_slab = kmem_cache_create("async_snap_io_slab",
34171 + sizeof(struct async_snap_io),
34172 + 0, SLAB_HWCACHE_ALIGN,
34174 + if (snap_async_snap_io_slab) {
34175 + snap_async_snap_io_pool = mempool_create(1, slab_pool_alloc,
34177 + snap_async_snap_io_slab);
34181 + /* Pool of hash table entries. */
34182 + if (!snap_hash_entry_slab) {
34183 + snap_hash_entry_slab = kmem_cache_create("snap_hash_slab",
34184 + sizeof(struct snapshot_hash_entry),
34185 + 0, SLAB_HWCACHE_ALIGN,
34187 + if (snap_hash_entry_slab) {
34188 + snap_hash_entry_pool = mempool_create(1, slab_pool_alloc,
34190 + snap_hash_entry_slab);
34194 + if ( ! snap_page_slab || ! snap_page_pool ||
34195 + ! snap_buffer_slab || ! snap_buffer_pool ||
34196 + ! snap_async_org_io_slab || ! snap_async_org_io_pool ||
34197 + ! snap_async_snap_io_slab || ! snap_async_snap_io_pool ||
34198 + ! snap_hash_entry_slab || ! snap_hash_entry_pool ) {
34199 + LOG_CRITICAL("No memory available to create snapshot pools.\n");
34200 + snapshot_delete_pools();
34208 + * snap_delete_volume
34210 + * Delete the in-memory representation of a volume. The specified node
34211 + * can actually be either a snapshot or an original. Deleting a snapshot
34212 + * causes it to be removed from its original's chain of snapshots.
34214 + * For async snapshots, we will need to flush the COW table and mark the
34215 + * snapshot clean in the metadata.
34217 +static int snap_delete_volume(struct evms_logical_node * node)
34219 + struct snapshot_volume * volume = node->private;
34220 + struct snapshot_volume * org_volume = volume->snapshot_org;
34221 + struct snapshot_volume * next_vol;
34224 + /* Don't delete a snapshot that's rolling back. */
34225 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK &&
34226 + ! (volume->flags & EVMS_SNAPSHOT_DISABLED) ) {
34227 + LOG_ERROR("Can't delete '%s' during snapshot rollback.\n",
34232 + /* Delete the instance data. */
34234 + if ( volume->flags & EVMS_SNAPSHOT ) {
34235 + /* This node is a snapshot. Check that this snapshot and
34236 + * its original have been quiesced. For async snapshots,
34237 + * make sure there are no outstanding remaps in
34238 + * progress. Then remove it from the original's chain of
34241 + if ( ! (volume->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34242 + LOG_ERROR("Can't delete snapshot, snapshot volume '%s' not quiesced.\n",
34243 + volume->logical_node->name);
34246 + if ( org_volume &&
34247 + ! (org_volume->flags & EVMS_SNAPSHOT_QUIESCED) ) {
34248 + LOG_ERROR("Can't delete snapshot, original volume '%s' not quiesced.\n",
34249 + org_volume->logical_node->name);
34253 + remove_snapshot_from_chain(volume);
34255 + /* If we just deleted the only/last snapshot for this
34256 + * original, the original will not be modified. It is
34257 + * the engine's responsibility to delete the original
34258 + * and rediscover in order to clear it of its snapshot
34259 + * information. Even if that doesn't happen, the state
34260 + * of the kernel will still be safe. I/O's coming into
34261 + * this plugin for the original will just be passed
34262 + * down without any other action or modification.
34265 + /* Unregister the proc-fs entry for this node. */
34267 + remove_proc_entry(node->volume_info->volume_name,
34271 + /* This is an original. It's the engine's responsibility
34272 + * to delete all snapshots before deleting an original.
34273 + * Otherwise, a snapshot could be left pointing to an
34274 + * original that no longer exists. Thus, we just need to
34275 + * make sure there are no snapshots in the chain.
34277 + rc = check_quiesce(volume);
34282 + /* Shut down the async I/O thread. */
34283 + if (volume->async_io_thread) {
34284 + evms_cs_unregister_thread(volume->async_io_thread);
34287 + /* Loop through all snapshots left on this original,
34288 + * and NULL out their org pointer, in case they don't
34291 + for ( next_vol = volume->snapshot_next; next_vol;
34292 + next_vol = next_vol->snapshot_next ) {
34293 + next_vol->snapshot_org = NULL;
34297 + /* Free up all memory used by the instance data, including
34298 + * the underlying node, the hash table, and the data buffer.
34300 + if (volume->logical_node) {
34301 + rc = DELETE(volume->logical_node);
34306 + if (volume->snapshot_map) {
34307 + /* Delete all of the hash chains,
34308 + * then the actual table.
34310 + for ( i = 0; i < volume->hash_table_size; i++ ) {
34311 + delete_snapshot_hash_chain(volume->snapshot_map[i]);
34313 + delete_snapshot_hash_chain(volume->free_hash_list);
34314 + vfree(volume->snapshot_map);
34316 + if (volume->chunk_data_buffer) {
34317 + kfree(volume->chunk_data_buffer);
34319 + if (volume->rollback_thread) {
34320 + evms_cs_unregister_thread(volume->rollback_thread);
34326 + evms_cs_deallocate_logical_node(node);
34327 + snapshot_count--;
34329 + /* If there are no more snapshot objects, free up the memory pools. */
34330 + if ( snapshot_count == 0 ) {
34331 + snapshot_delete_pools();
34334 + MOD_DEC_USE_COUNT;
34340 + * search_snapshot_hash_chain
34342 + * Search the hash chain that is anchored at the specified head pointer. If the
34343 + * chunk number is found, a pointer to that entry in the chain is set, and a 1
34344 + * is returned. If the chunk is not found, a pointer to the previous entry is
34345 + * set and 0 is returned. If the return pointer is NULL, this means either the
34346 + * list is empty, or the specified chunk should become the first list item.
34348 +static int search_snapshot_hash_chain(u64 chunk,
34349 + struct snapshot_hash_entry * head,
34350 + struct snapshot_hash_entry ** result)
34352 + struct snapshot_hash_entry * curr = head;
34353 + struct snapshot_hash_entry * prev = head;
34354 + while ( curr && curr->org_chunk < chunk ) {
34356 + curr = curr->next;
34359 + /* Either an empty chain or went off the end of the chain. */
34362 + } else if ( curr->org_chunk != chunk ) {
34363 + *result = curr->prev;
34372 + * snapshot_remap_chunk
34374 + * Perform a sector remap on a snapshot volume. This should be called from the
34375 + * I/O read path. It first determines the base sector of the chunk containing
34376 + * the specified sector, and saves the remainder. Then it performs a search
34377 + * through the snapshot map for the specified volume. If a match is found, the
34378 + * sector number is changed to the new value. If no match is found, the value
34379 + * is left the same, meaning the read should proceed down the original volume.
34381 +static int snapshot_remap_chunk(struct snapshot_volume * snap_volume,
34382 + struct buffer_head * bh)
34384 + struct snapshot_hash_entry * result;
34385 + u64 chunk, sector = bh->b_rsector;
34386 + unsigned long remainder, hash_value;
34387 + unsigned long flags, queued = FALSE;
34389 + remainder = sector & (u64)(snap_volume->chunk_size - 1);
34390 + chunk = sector >> snap_volume->chunk_shift;
34391 + hash_value = ((unsigned long)chunk) % snap_volume->hash_table_size;
34393 + if ( search_snapshot_hash_chain(chunk,
34394 + snap_volume->snapshot_map[hash_value],
34396 + bh->b_rsector = (result->snap_chunk << snap_volume->chunk_shift)
34398 + if ( result->chunk_state != SNAP_CHUNK_COPIED ) {
34399 + /* If this chunk is in the middle of being copied,
34400 + * place this request on the pending list.
34402 + spin_lock_irqsave(&result->chunk_state_lock, flags);
34403 + if ( result->chunk_state != SNAP_CHUNK_COPIED ) {
34404 + bh->b_reqnext = result->snap_io->pending_reads;
34405 + result->snap_io->pending_reads = bh;
34406 + if (!result->snap_io->dev) {
34407 + result->snap_io->dev = bh->b_rdev;
34409 + evms_cs_volume_request_in_progress(result->snap_io->dev,
34413 + spin_unlock_irqrestore(&result->chunk_state_lock, flags);
34428 +static void snap_read(struct evms_logical_node * node,
34429 + struct buffer_head * bh)
34431 + struct snapshot_volume * volume = node->private;
34435 + /* Size check. */
34436 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
34437 + node->total_vsectors ) {
34438 + bh->b_end_io(bh, 0);
34442 + /* Can't read if rollback is in progress. */
34443 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) {
34444 + LOG_ERROR("Cannot read from snapshot '%s' during rollback.\n",
34445 + volume->logical_node->name);
34446 + bh->b_end_io(bh, 0);
34450 + /* On a read to the original, we can just pass it through completely
34451 + * untouched. Only reads to the snapshot can be remapped.
34453 + if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
34454 + R_IO(volume->logical_node, bh);
34458 + /* Lock the snapshot before processing the request. */
34459 + down_read(&volume->snap_semaphore);
34461 + /* Make sure the snapshot is not full/disabled, and that
34462 + * the original is present.
34464 + if ( (volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL)) ||
34465 + (! volume->snapshot_org) ) {
34466 + bh->b_end_io(bh, 0);
34467 + up_read(&volume->snap_semaphore);
34471 + /* Check for unaligned I/O. This is mostly to prevent XFS from
34472 + * sending a request that spans a chunk.
34474 + alignment = bh->b_rsector;
34475 + alignment <<= EVMS_VSECTOR_SIZE_SHIFT;
34476 + if ( unlikely(alignment & (bh->b_size - 1)) ) {
34477 + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n",
34478 + bh->b_rsector, bh->b_size, node->name);
34479 + bh->b_end_io(bh, 0);
34480 + up_read(&volume->snap_semaphore);
34484 + /* Check if this sector has been remapped. */
34485 + rc = snapshot_remap_chunk(volume, bh);
34487 + /* Sector was remapped. Send IO to the snapshot. */
34488 + up_read(&volume->snap_semaphore);
34489 + R_IO(volume->logical_node, bh);
34490 + } else if ( rc < 0 ) {
34491 + /* Sector was remapped but queued to be driven later. */
34492 + up_read(&volume->snap_semaphore);
34494 + /* Has not been remapped. Send IO to the original. */
34495 + R_IO(volume->snapshot_org->logical_node, bh);
34496 + up_read(&volume->snap_semaphore);
34501 +/********** Asynchronous Snapshot I/O Code **********/
34505 + * snap_deallocate_buffer
34507 +static void snap_deallocate_buffer(struct snap_io_buffer * buf,
34508 + int deallocate_page)
34511 + if (deallocate_page) {
34512 + mempool_free(buf->bh->b_data, snap_page_pool);
34514 + mempool_free(buf, snap_buffer_pool);
34519 + * snap_allocate_buffer
34521 + * Allocate a snap_io_buffer and a data page from the respective memory
34522 + * pools. Initialize as appropriate.
34524 +static struct snap_io_buffer * snap_allocate_buffer(int allocate_page)
34526 + struct snap_io_buffer * buf;
34527 + struct buffer_head * bh;
34529 + /* Grab a snap_io_buffer from the pool. */
34530 + buf = mempool_alloc(snap_buffer_pool, GFP_NOIO);
34534 + memset(buf, 0, sizeof(struct snap_io_buffer));
34535 + bh = buf->bh = &buf->_bh;
34537 + /* Grab a data page from the pool. */
34538 + if (allocate_page) {
34539 + bh->b_data = mempool_alloc(snap_page_pool, GFP_NOIO);
34540 + if (!bh->b_data) {
34541 + snap_deallocate_buffer(buf, FALSE);
34544 + bh->b_page = virt_to_page(bh->b_data);
34547 + /* Initialize the rest of the buffer. */
34548 + bh->b_size = PAGE_SIZE;
34549 + bh->b_list = BUF_LOCKED;
34550 + bh->b_count = (atomic_t)ATOMIC_INIT(1);
34551 + bh->b_this_page = (struct buffer_head *)1;
34552 + bh->b_private = buf;
34553 + set_bit(BH_Dirty, &bh->b_state);
34554 + set_bit(BH_Lock, &bh->b_state);
34555 + set_bit(BH_Req, &bh->b_state);
34556 + set_bit(BH_Mapped, &bh->b_state);
34557 + set_bit(BH_Uptodate, &bh->b_state);
34558 + init_waitqueue_head(&bh->b_wait);
34559 + INIT_LIST_HEAD(&buf->chunk_write_list);
34565 + * snap_deallocate_buffer_list
34567 + * Free each buffer in the specified list.
34569 +static void snap_deallocate_buffer_list(struct snap_io_buffer * buf_list,
34570 + int deallocate_pages)
34572 + struct snap_io_buffer * buf, * buf_next;
34574 + for ( buf = buf_list; buf; buf = buf_next ) {
34575 + buf_next = buf->buffer_next;
34576 + snap_deallocate_buffer(buf, deallocate_pages);
34581 + * snap_allocate_buffer_list
34583 + * Allocate a list of snap_io_buffer's which will be used to copy a chunk
34584 + * from the original to the snapshot.
34586 +static struct snap_io_buffer *
34587 +snap_allocate_buffer_list(unsigned int count,
34588 + u64 starting_lba,
34589 + void (*callback)(struct buffer_head *, int),
34590 + int allocate_pages)
34592 + struct snap_io_buffer * buf, * head = NULL;
34593 + struct snap_io_buffer ** tail = &head;
34596 + for ( i = 0; i < count; i++ ) {
34597 + /* Get a buffer from the pool. */
34598 + buf = snap_allocate_buffer(allocate_pages);
34600 + snap_deallocate_buffer_list(head, allocate_pages);
34604 + /* Set the callback function and the sector value. */
34605 + buf->bh->b_end_io = callback;
34606 + buf->bh->b_rsector = starting_lba + i *
34607 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT);
34609 + /* Add this buffer to the list to return. */
34611 + tail = &buf->buffer_next;
34618 + * deallocate_async_snap_io
34620 +static void deallocate_async_snap_io(struct async_snap_io * async_snap_io)
34622 + DEBUG_CHECK_SNAP_IO(async_snap_io);
34624 + snap_deallocate_buffer(async_snap_io->cow_table_buffer, TRUE);
34625 + snap_deallocate_buffer_list(async_snap_io->copy_buffers, FALSE);
34626 + mempool_free(async_snap_io, snap_async_snap_io_pool);
34630 + * allocate_async_snap_io
34631 + * @snap_volume: The snapshot volume this chunk belongs to.
34632 + * @hash_entry: The entry in the hash table representing this chunk.
34633 + * @async_org_io: The parent async I/O structure.
34634 + * @snap_chunk_lba: The starting LBA on the snapshot for this chunk.
34635 + * @buffer_count: The number of buffers needed to copy this chunk.
34637 + * Allocate an async_snap_io structure from the pool and initialize.
34638 + * Create a list of buffer heads to use for the copy.
34640 +static struct async_snap_io *
34641 +allocate_async_snap_io(struct snapshot_volume * snap_volume,
34642 + struct snapshot_hash_entry * hash_entry,
34643 + struct async_org_io * async_org_io,
34644 + u64 snap_chunk_lba,
34645 + unsigned int buffer_count)
34647 + struct async_snap_io * async_snap_io;
34649 + async_snap_io = mempool_alloc(snap_async_snap_io_pool, GFP_NOIO);
34650 + if (async_snap_io) {
34651 + memset(async_snap_io, 0, sizeof(struct async_snap_io));
34652 + async_snap_io->snap_volume = snap_volume;
34653 + async_snap_io->hash_table_entry = hash_entry;
34654 + async_snap_io->org_io = async_org_io;
34655 + INIT_LIST_HEAD(&async_snap_io->snap_pending_io_list);
34656 + INIT_LIST_HEAD(&async_snap_io->cow_write_list);
34657 + async_snap_io->write_count = (atomic_t)ATOMIC_INIT(buffer_count);
34659 + async_snap_io->cow_table_buffer = snap_allocate_buffer(TRUE);
34660 + if (async_snap_io->cow_table_buffer) {
34661 + /* The buffer for the COW table needs to be adjusted. */
34662 + struct snap_io_buffer * buf = async_snap_io->cow_table_buffer;
34663 + buf->bh->b_size = EVMS_VSECTOR_SIZE;
34664 + buf->bh->b_end_io = snap_cow_table_cb;
34665 + buf->buffer_private = async_snap_io;
34667 + async_snap_io->copy_buffers =
34668 + snap_allocate_buffer_list(buffer_count,
34670 + snap_write_chunk_cb,
34672 + if (!async_snap_io->copy_buffers) {
34673 + deallocate_async_snap_io(async_snap_io);
34674 + async_snap_io = NULL;
34677 + deallocate_async_snap_io(async_snap_io);
34678 + async_snap_io = NULL;
34681 + return async_snap_io;
34685 + * deallocate_async_org_io
34687 +static void deallocate_async_org_io(struct async_org_io * async_org_io)
34689 + DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io);
34691 + snap_deallocate_buffer_list(async_org_io->copy_buffers, TRUE);
34692 + mempool_free(async_org_io, snap_async_org_io_pool);
34696 + * allocate_async_org_io
34698 + * Allocate an async_org_io structure from the pool and initialize.
34699 + * Create a list of buffer heads to use for the copy.
34701 +static struct async_org_io *
34702 +allocate_async_org_io(struct snapshot_volume * org_volume,
34703 + u64 org_chunk_lba,
34704 + unsigned int buffer_count)
34706 + struct async_org_io * async_org_io;
34708 + async_org_io = mempool_alloc(snap_async_org_io_pool, GFP_NOIO);
34709 + if (async_org_io) {
34710 + DEBUG_ADD_ORG_IO_TO_LIST(async_org_io);
34712 + memset(async_org_io, 0, sizeof(struct async_org_io));
34713 + async_org_io->org_volume = org_volume;
34714 + spin_lock_init(&async_org_io->pending_writes_lock);
34715 + INIT_LIST_HEAD(&async_org_io->org_pending_io_list);
34716 + async_org_io->copy_count = (atomic_t)ATOMIC_INIT(0);
34717 + async_org_io->ref_count = (atomic_t)ATOMIC_INIT(1);
34719 + async_org_io->copy_buffers =
34720 + snap_allocate_buffer_list(buffer_count,
34722 + snap_read_chunk_cb,
34724 + if (!async_org_io->copy_buffers) {
34725 + deallocate_async_org_io(async_org_io);
34726 + async_org_io = NULL;
34729 + return async_org_io;
34733 + * deallocate_async_io
34735 + * This function deletes the entire async I/O structure, including the
34736 + * async_org_io, all async_snap_io's, and all buffer heads and pages.
34738 +static void deallocate_async_io(struct async_org_io * async_org_io)
34740 + struct async_snap_io * async_snap_io, * next_snap_io;
34742 + for ( async_snap_io = async_org_io->snap_io_list;
34744 + async_snap_io = next_snap_io ) {
34745 + next_snap_io = async_snap_io->snap_io_list_next;
34746 + deallocate_async_snap_io(async_snap_io);
34748 + deallocate_async_org_io(async_org_io);
34752 + * process_org_pending_io_list
34754 + * Grab the first item from the org_pending_io_list and send all
34755 + * waiting write requests to the original.
34757 +static void process_org_pending_io_list(struct snapshot_volume * org_volume,
34760 + struct async_org_io * async_org_io;
34761 + struct buffer_head * bh;
34762 + unsigned long flags;
34764 + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags);
34765 + if ( list_empty(&org_volume->org_pending_io_list) ) {
34766 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock,
34770 + async_org_io = ORG_PENDING_IO_ENTRY(org_volume->org_pending_io_list.next);
34771 + list_del(&async_org_io->org_pending_io_list);
34772 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock,
34775 + for ( bh = async_org_io->pending_writes; bh;
34776 + bh = async_org_io->pending_writes ) {
34777 + async_org_io->pending_writes = bh->b_reqnext;
34778 + bh->b_reqnext = NULL;
34779 + W_IO(async_org_io->org_volume->logical_node, bh);
34780 + evms_cs_volume_request_in_progress(async_org_io->dev,
34784 + if ( atomic_dec_and_test(&async_org_io->ref_count) ) {
34785 + deallocate_async_io(async_org_io);
34793 + * process_snap_pending_io_list
34795 + * Grab the first item from the snap_pending_io_list and send
34796 + * all waiting read and write requests to the snapshot.
34798 +static void process_snap_pending_io_list(struct snapshot_volume * org_volume,
34801 + struct async_snap_io * async_snap_io;
34802 + struct buffer_head * bh;
34803 + unsigned long flags;
34805 + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags);
34806 + if ( list_empty(&org_volume->snap_pending_io_list) ) {
34807 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock,
34810 + async_snap_io = SNAP_PENDING_IO_ENTRY(org_volume->snap_pending_io_list.next);
34811 + list_del(&async_snap_io->snap_pending_io_list);
34812 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock,
34816 + for ( bh = async_snap_io->pending_reads; bh;
34817 + bh = async_snap_io->pending_reads ) {
34818 + async_snap_io->pending_reads = bh->b_reqnext;
34819 + bh->b_reqnext = NULL;
34820 + R_IO(async_snap_io->snap_volume->logical_node, bh);
34821 + evms_cs_volume_request_in_progress(async_snap_io->dev,
34825 + for ( bh = async_snap_io->pending_writes; bh;
34826 + bh = async_snap_io->pending_writes ) {
34827 + async_snap_io->pending_writes = bh->b_reqnext;
34828 + bh->b_reqnext = NULL;
34829 + W_IO(async_snap_io->snap_volume->logical_node, bh);
34830 + evms_cs_volume_request_in_progress(async_snap_io->dev,
34834 + if ( atomic_dec_and_test(&async_snap_io->org_io->ref_count) ) {
34835 + deallocate_async_io(async_snap_io->org_io);
34842 + * process_chunk_write_list
34844 + * Grab the first item from the chunk_write_list and send down
34845 + * writes to each snapshot of this original.
34847 +static void process_chunk_write_list(struct snapshot_volume * org_volume,
34850 + struct snap_io_buffer * buf, * snap_buf;
34851 + struct async_snap_io * async_snap_io;
34852 + unsigned long flags;
34854 + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags);
34855 + if ( list_empty(&org_volume->chunk_write_list) ) {
34856 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock,
34859 + buf = CHUNK_WRITE_ENTRY(org_volume->chunk_write_list.next);
34860 + list_del(&buf->chunk_write_list);
34861 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock,
34864 + for ( snap_buf = buf->copy_next; snap_buf;
34865 + snap_buf = snap_buf->copy_next ) {
34866 + async_snap_io = snap_buf->buffer_private;
34867 + W_IO(async_snap_io->snap_volume->logical_node,
34875 + * write_cow_table
34877 + * On S/390 machines, the hardsector size is usually 4k, and the driver won't
34878 + * accept I/O requests that are less than 4k in size. Thus, the COW table
34879 + * cannot be written as a single sector. We must first read in the entire
34880 + * 4k hardsector, overlay the 512 byte COW table, and then write out the entire
34883 + * On machines with hardsector size of 512, the COW table write will be
34884 + * processed just as it was before.
34886 + * If an error occurs in this function, we will send down the buffer as a
34887 + * read instead of a write. This will ensure that the callback function still
34888 + * runs and cleans up the async_io structures and releases all pending I/Os.
34890 +static inline void write_cow_table(struct snapshot_volume * snap,
34891 + struct buffer_head * bh)
34893 + if ( snap->logical_node->hardsector_size > bh->b_size ) {
34896 + unsigned short b_size = bh->b_size;
34899 + offset = bh->b_rsector & ((snap->logical_node->hardsector_size >>
34900 + EVMS_VSECTOR_SIZE_SHIFT) - 1);
34901 + bh->b_rsector -= offset;
34902 + bh->b_size = snap->logical_node->hardsector_size;
34904 + /* Need a buffer to temporarily hold the COW table sector. */
34905 + buffer = kmalloc(b_size, GFP_NOIO);
34907 + disable_snapshot(snap, TRUE);
34908 + R_IO(snap->logical_node, bh);
34911 + memcpy(buffer, bh->b_data, b_size);
34913 + /* Read in the entire hardsector from disk. */
34914 + rc = INIT_IO(snap->logical_node, READ, bh->b_rsector,
34915 + snap->logical_node->hardsector_size >>
34916 + EVMS_VSECTOR_SIZE_SHIFT,
34919 + disable_snapshot(snap, TRUE);
34920 + R_IO(snap->logical_node, bh);
34924 + /* Copy the COW table back into the buffer. */
34925 + memcpy(bh->b_data + (offset << EVMS_VSECTOR_SIZE_SHIFT),
34929 + W_IO(snap->logical_node, bh);
34933 + * process_cow_table_write_lists
34935 +static void process_cow_table_write_lists(struct snapshot_volume * org_volume,
34938 + struct snapshot_volume * snap_volume;
34939 + struct async_snap_io * async_snap_io, * async_snap_io2;
34940 + struct list_head * lh;
34941 + unsigned long flags;
34943 + /* Check the chunk_write_list for each snapshot on this original. */
34944 + down_read(&org_volume->snap_semaphore);
34945 + for ( snap_volume = org_volume->snapshot_next;
34947 + snap_volume = snap_volume->snapshot_next ) {
34949 + /* While we are here, see if the DISABLED bit needs to
34950 + * be written to disk.
34952 + if ( snap_volume->flags & EVMS_SNAPSHOT_DISABLED &&
34953 + snap_volume->flags & EVMS_SNAPSHOT_DISABLED_PENDING ) {
34954 + disable_snapshot(snap_volume, TRUE);
34955 + snap_volume->flags &= ~EVMS_SNAPSHOT_DISABLED_PENDING;
34958 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock,
34961 + if ( list_empty(&snap_volume->cow_table_write_list) ) {
34962 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock,
34967 + /* Check for an in-flight COW-table-write. */
34968 + async_snap_io = COW_WRITE_ENTRY(snap_volume->cow_table_write_list.next);
34969 + if ( atomic_read(&async_snap_io->write_count) != 0 ) {
34970 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock,
34975 + /* See if there are any COW-table-writes that can be skipped. */
34976 + list_for_each(lh, &snap_volume->cow_table_write_list) {
34977 + /* No need to check the first list element, since
34978 + * we've already examined it.
34980 + if ( lh->prev != &snap_volume->cow_table_write_list ) {
34981 + async_snap_io = COW_WRITE_ENTRY(lh);
34982 + async_snap_io2 = COW_WRITE_ENTRY(lh->prev);
34983 + if ( atomic_read(&async_snap_io->write_count) != 0 ) {
34984 + async_snap_io = async_snap_io2;
34987 + if ( async_snap_io->cow_table_buffer->bh->b_rsector !=
34988 + async_snap_io2->cow_table_buffer->bh->b_rsector ) {
34989 + async_snap_io = async_snap_io2;
34995 + /* We have the buffer to send down. Now mark all
34996 + * previous COW-table buffers as in-flight.
34998 + list_for_each(lh, &snap_volume->cow_table_write_list) {
34999 + async_snap_io2 = COW_WRITE_ENTRY(lh);
35000 + atomic_dec(&async_snap_io2->write_count);
35001 + if ( async_snap_io2 == async_snap_io ) {
35005 + DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume);
35009 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35011 + /* Write the COW table. */
35012 + DEBUG_INC_COW_TABLE_WRITES(snap_volume);
35013 + write_cow_table(snap_volume, async_snap_io->cow_table_buffer->bh);
35017 + up_read(&org_volume->snap_semaphore);
35021 + * snap_async_io_thread
35023 + * This is the async I/O thread function. It processes requests from four
35024 + * lists, which are embedded in the original volume structure passed to the
35027 + * The first list, org_pending_io_list, contains async_org_io's, each of which
35028 + * contain a list of write requests to the original volume that are waiting on
35029 + * the completion of a chunk copy.
35031 + * The second list, snap_pending_io_list, contains async_snap_io's, each of
35032 + * which contain a list of read requests and a list of write requests to the
35033 + * snapshot volume that are waiting on the completion of a chunk copy.
35035 + * The third list, chunk_write_list, contains buffers that were used to read
35036 + * part of a chunk from the original volume. Those buffers are linked to other
35037 + * buffers which are used to write the same part of that chunk to the snapshot.
35039 + * The fourth list is actually the list of snapshots for this original. Each
35040 + * snapshot then has a list of COW-table buffers that have to be written. The
35041 + * processing of this list is optimized to eliminate unnecessary, overlapping
35042 + * writes of the COW table.
35044 + * The loop will continue as long as there is an item on at least one of
35045 + * the four lists. When they are all empty, the loop exits and the thread
35046 + * goes back to sleep.
35048 +static void snap_async_io_thread(void * volume)
35050 + struct snapshot_volume * org_volume = volume;
35051 + int done = FALSE;
35054 + process_org_pending_io_list(org_volume, &done);
35056 + process_snap_pending_io_list(org_volume, &done);
35058 + process_chunk_write_list(org_volume, &done);
35060 + process_cow_table_write_lists(org_volume, &done);
35063 + run_task_queue(&tq_disk);
35067 + * schedule_org_pending_io
35069 + * Place the async_org_io on the thread's processing list.
35071 +static void schedule_org_pending_io(struct async_org_io * async_org_io)
35073 + struct snapshot_volume * org_volume = async_org_io->org_volume;
35074 + unsigned long flags;
35076 + spin_lock_irqsave(&org_volume->org_pending_io_list_lock, flags);
35077 + list_add_tail(&async_org_io->org_pending_io_list,
35078 + &org_volume->org_pending_io_list);
35079 + spin_unlock_irqrestore(&org_volume->org_pending_io_list_lock, flags);
35080 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35084 + * schedule_snap_pending_io
35086 + * Place the async_snap_io on the thread's processing list.
35088 +static void schedule_snap_pending_io(struct async_snap_io * async_snap_io)
35090 + struct snapshot_volume * org_volume = async_snap_io->org_io->org_volume;
35091 + unsigned long flags;
35093 + spin_lock_irqsave(&org_volume->snap_pending_io_list_lock, flags);
35094 + list_add_tail(&async_snap_io->snap_pending_io_list,
35095 + &org_volume->snap_pending_io_list);
35096 + spin_unlock_irqrestore(&org_volume->snap_pending_io_list_lock, flags);
35097 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35101 + * schedule_chunk_write
35103 + * Place the buffer on the chunk_write_list for the thread to process. This
35104 + * list uses the chunk_write_list field in the snap_io_buffer.
35106 +static void schedule_chunk_write(struct snap_io_buffer * buf)
35108 + struct async_org_io * org_io = buf->buffer_private;
35109 + struct snapshot_volume * org_volume = org_io->org_volume;
35110 + unsigned long flags;
35112 + spin_lock_irqsave(&org_volume->chunk_write_list_lock, flags);
35113 + list_add_tail(&buf->chunk_write_list, &org_volume->chunk_write_list);
35114 + spin_unlock_irqrestore(&org_volume->chunk_write_list_lock, flags);
35115 + evms_cs_wakeup_thread(org_volume->async_io_thread);
35119 + * schedule_cow_table_write
35121 + * Place the async_snap_io on the thread's processing list.
35123 +static void schedule_cow_table_write(struct async_snap_io * async_snap_io)
35125 + struct snapshot_volume * snap_volume = async_snap_io->snap_volume;
35126 + unsigned long flags;
35128 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags);
35129 + list_add_tail(&async_snap_io->cow_write_list,
35130 + &snap_volume->cow_table_write_list);
35131 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35135 + * snap_read_chunk_cb
35137 + * This is the callback function for reading chunks from the original.
35138 + * When each read completes, we have to decrement the read_count in the
35139 + * async_org_io. If this count reaches zero, we can decrement the
35140 + * chunk_lock's in all of the hash entries, and send the original write
35141 + * request down. Finally, send this buffer head over to the thread to
35142 + * send the writes down to the snapshots.
35144 +void snap_read_chunk_cb(struct buffer_head * bh,
35147 + struct snap_io_buffer * buf = bh->b_private;
35150 + /* Error reading the chunk. Disable all snapshots on this org. */
35151 + struct async_org_io * async_org_io = buf->buffer_private;
35152 + struct snapshot_volume * snap_volume;
35153 + LOG_ERROR("Error reading chunk from original '%s'.\n",
35154 + async_org_io->org_volume->exported_node->name);
35155 + for ( snap_volume = async_org_io->org_volume->snapshot_next;
35157 + snap_volume = snap_volume->snapshot_next ) {
35158 + disable_snapshot(snap_volume, FALSE);
35162 + schedule_chunk_write(buf);
35166 + * snap_write_chunk_cb
35168 + * This is the callback function for writing chunks to the snapshot. When
35169 + * each write completes, decrement the write_count in the async_snap_io.
35170 + * If this count reaches zero, decrement the chunk_lock in the hash entry,
35171 + * and decrement the remap_count in the async_org_io. If the remap_count
35172 + * reaches zero, then everybody is done, and we can free up the entire
35173 + * async_io structure.
35175 +void snap_write_chunk_cb(struct buffer_head * bh,
35178 + struct snap_io_buffer * buf = bh->b_private;
35179 + struct async_snap_io * async_snap_io = buf->buffer_private;
35182 + /* Error writing chunk. Disable this snapshot. */
35183 + LOG_ERROR("Error writing chunk to snapshot '%s'.\n",
35184 + async_snap_io->snap_volume->exported_node->name);
35185 + disable_snapshot(async_snap_io->snap_volume, FALSE);
35188 + atomic_dec(&async_snap_io->write_count);
35189 + evms_cs_wakeup_thread(async_snap_io->org_io->org_volume->async_io_thread);
35193 + * snap_cow_table_cb
35195 + * This is the callback function for writing out the COW table.
35197 +void snap_cow_table_cb(struct buffer_head * bh,
35200 + struct snap_io_buffer * buf = bh->b_private;
35201 + struct async_snap_io * async_snap_io = buf->buffer_private;
35202 + struct async_snap_io * async_snap_io2;
35203 + struct async_org_io * async_org_io;
35204 + struct snapshot_volume * snap_volume = async_snap_io->snap_volume;
35205 + struct list_head * lh, * tmp;
35206 + unsigned long flags, flags2;
35209 + /* Error writing the COW table sector. Disable the snapshot. */
35210 + struct snapshot_volume * snap_volume = buf->buffer_private;
35211 + LOG_ERROR("Error writing COW table to snapshot '%s'.\n",
35212 + snap_volume->exported_node->name);
35213 + disable_snapshot(snap_volume, FALSE);
35216 + spin_lock_irqsave(&snap_volume->cow_table_write_list_lock, flags);
35218 + list_for_each_safe(lh, tmp, &snap_volume->cow_table_write_list) {
35220 + async_snap_io2 = COW_WRITE_ENTRY(lh);
35221 + async_org_io = async_snap_io2->org_io;
35223 + /* Mark the chunk as copied in the hash table. */
35224 + spin_lock_irqsave(&async_snap_io2->hash_table_entry->chunk_state_lock,
35226 + async_snap_io2->hash_table_entry->chunk_state = SNAP_CHUNK_COPIED;
35227 + async_snap_io2->hash_table_entry->snap_io = NULL;
35228 + spin_unlock_irqrestore(&async_snap_io2->hash_table_entry->chunk_state_lock,
35231 + /* Release any pending I/Os waiting on this chunk. */
35232 + schedule_snap_pending_io(async_snap_io2);
35233 + if ( atomic_dec_and_test(&async_org_io->copy_count) ) {
35234 + schedule_org_pending_io(async_org_io);
35237 + if ( async_snap_io2 == async_snap_io ) {
35242 + spin_unlock_irqrestore(&snap_volume->cow_table_write_list_lock, flags);
35246 + * snap_queue_original_request
35248 + * An existing remap was found for the chunk for this write request.
35249 + * If the chunk has been fully copied, then the request can go through
35250 + * normally. If the chunk is still being processed, this request must
35251 + * be queued up to be driven after the chunk has been copied.
35253 +static void snap_queue_original_request(struct snapshot_volume * snap_volume,
35254 + struct buffer_head * org_bh,
35255 + struct snapshot_hash_entry * target_entry,
35257 + int * queued_org_bh,
35258 + int write_to_snapshot)
35260 + struct async_org_io * org_io;
35261 + unsigned long flags, flags2;
35263 + if (write_to_snapshot) {
35264 + org_bh->b_rsector = (target_entry->snap_chunk <<
35265 + snap_volume->chunk_shift) +
35269 + if ( ! *queued_org_bh &&
35270 + target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35271 + spin_lock_irqsave(&target_entry->chunk_state_lock, flags);
35272 + if (write_to_snapshot) {
35273 + /* A write to the snapshot. */
35274 + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35275 + org_bh->b_reqnext =
35276 + target_entry->snap_io->pending_writes;
35277 + target_entry->snap_io->pending_writes = org_bh;
35278 + if (!target_entry->snap_io->dev) {
35279 + target_entry->snap_io->dev =
35282 + evms_cs_volume_request_in_progress(target_entry->snap_io->dev,
35284 + *queued_org_bh = TRUE;
35287 + /* A write to the original. */
35288 + if ( target_entry->chunk_state != SNAP_CHUNK_COPIED ) {
35289 + org_io = target_entry->snap_io->org_io;
35290 + spin_lock_irqsave(&org_io->pending_writes_lock,
35292 + org_bh->b_reqnext = org_io->pending_writes;
35293 + org_io->pending_writes = org_bh;
35294 + if (!org_io->dev) {
35295 + org_io->dev = org_bh->b_rdev;
35297 + spin_unlock_irqrestore(&org_io->pending_writes_lock,
35299 + evms_cs_volume_request_in_progress(org_io->dev,
35301 + *queued_org_bh = TRUE;
35304 + spin_unlock_irqrestore(&target_entry->chunk_state_lock, flags);
35309 + * snapshot_copy_1
35311 + * Check this snapshot node to see if the given sector/chunk has been
35312 + * remapped yet. If it hasn't, create a new hash table entry, update the
35313 + * in-memory COW table, write the COW table to disk if it is full, and
35314 + * then start the process of copying the chunk from the original to the
35317 +static int snapshot_copy_1(struct snapshot_volume * snap_volume,
35318 + struct buffer_head * org_bh,
35319 + struct async_org_io ** async_org_io,
35320 + int * queued_org_bh,
35321 + int write_to_snapshot)
35323 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
35324 + struct snapshot_hash_entry * target_entry, * new_map_entry;
35325 + struct snap_io_buffer * cow_buf, *buf1, *buf2;
35326 + struct async_snap_io * async_snap_io;
35327 + u64 org_sector = org_bh->b_rsector;
35328 + u64 org_chunk_lba, snap_chunk_lba;
35330 + u64 chunk, remainder;
35331 + unsigned long hash_value, buffer_count, sectors_in_chunk;
35333 + /* Grab the read-lock when checking for an existing remap. */
35334 + down_read(&snap_volume->snap_semaphore);
35336 + /* Make sure the snapshot has not been disabled. */
35337 + if ( snap_volume->flags & (EVMS_SNAPSHOT_DISABLED|EVMS_SNAPSHOT_FULL) ||
35339 + up_read(&snap_volume->snap_semaphore);
35343 + /* Check for unaligned I/O. This is mostly to prevent XFS from
35344 + * sending a request that spans a chunk.
35346 + alignment = org_sector << EVMS_VSECTOR_SIZE_SHIFT;
35347 + if ( unlikely(alignment & (org_bh->b_size - 1)) ) {
35348 + LOG_ERROR("Unaligned request [rsector(%lx), size(%x)] rejected on snapshot %s.\n",
35349 + org_bh->b_rsector, org_bh->b_size,
35350 + snap_volume->logical_node->name);
35351 + if (!write_to_snapshot) {
35352 + disable_snapshot(snap_volume, TRUE);
35354 + up_read(&snap_volume->snap_semaphore);
35358 + /* Search the hash table to see if this sector has already been
35359 + * remapped on this snapshot.
35361 + chunk = org_sector >> snap_volume->chunk_shift;
35362 + remainder = org_sector & (u64)(snap_volume->chunk_size - 1);
35363 + hash_value = (unsigned long)chunk % snap_volume->hash_table_size;
35365 + if ( search_snapshot_hash_chain(chunk,
35366 + snap_volume->snapshot_map[hash_value],
35367 + &target_entry) ) {
35368 + /* Chunk is already remapped. If the remap is still in progress,
35369 + * queue up this request to be handled later. If the remap is
35370 + * complete, we can just keep going.
35372 + up_read(&snap_volume->snap_semaphore);
35373 + snap_queue_original_request(snap_volume, org_bh,
35374 + target_entry, remainder,
35375 + queued_org_bh, write_to_snapshot);
35379 + /* Convert to a write-lock and check again for a remap.
35380 + * (Same search and check as just before).
35382 + up_read(&snap_volume->snap_semaphore);
35383 + down_write(&snap_volume->snap_semaphore);
35384 + if ( search_snapshot_hash_chain(chunk,
35385 + snap_volume->snapshot_map[hash_value],
35386 + &target_entry) ) {
35387 + /* Chunk is already remapped. If the remap is still in progress,
35388 + * queue up this request to be handled later. If the remap is
35389 + * complete, we can just keep going.
35391 + up_write(&snap_volume->snap_semaphore);
35392 + snap_queue_original_request(snap_volume, org_bh,
35393 + target_entry, remainder,
35394 + queued_org_bh, write_to_snapshot);
35398 + /* Is there enough room left on this snapshot to remap this chunk? */
35399 + if ( snap_volume->next_free_chunk >= snap_volume->num_chunks ) {
35400 + /* Once the snapshot becomes full, further writes to the
35401 + * original can't be remapped, and thus this snapshot
35402 + * will become "corrupted".
35404 + snap_volume->flags |= EVMS_SNAPSHOT_FULL;
35405 + set_snapshot_flags(snap_volume->logical_node,
35406 + EVMS_SNAPSHOT_FULL, EVMS_SNAPSHOT_DISABLED);
35407 + up_write(&snap_volume->snap_semaphore);
35411 + /* Create and initialize a new hash table entry for the new remap.
35412 + * The value SNAP_CHUNK_COPYING indicates that this chunk still has to
35413 + * be read from the original and written to the snapshot.
35415 + new_map_entry = allocate_snapshot_hash_entry(snap_volume,
35417 + snap_volume->next_free_chunk,
35418 + SNAP_CHUNK_COPYING);
35419 + if (!new_map_entry) {
35420 + /* Can't get memory for map entry. Disable this snapshot. */
35421 + LOG_ERROR("Memory error allocating hash table entry for snapshot '%s'.\n",
35422 + snap_volume->exported_node->name);
35423 + disable_snapshot(snap_volume, TRUE);
35424 + up_write(&snap_volume->snap_semaphore);
35428 + /* Add the entry to the hash table. */
35429 + if (target_entry) {
35430 + insert_snapshot_hash_entry(new_map_entry, target_entry);
35432 + insert_snapshot_hash_entry_at_head(new_map_entry,
35433 + &(snap_volume->snapshot_map[hash_value]));
35436 + /* Calculate the number of buffers that will be needed to copy this
35437 + * chunk, and the starting LBAs for both the org and the snap.
35439 + org_chunk_lba = chunk * org_volume->chunk_size;
35440 + snap_chunk_lba = snap_volume->next_free_chunk * org_volume->chunk_size;
35441 + snap_volume->next_free_chunk++;
35442 + sectors_in_chunk = min(((u64)org_volume->chunk_size),
35443 + org_volume->logical_node->total_vsectors -
35445 + buffer_count = (sectors_in_chunk +
35446 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1) /
35447 + (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT);
35449 + /* Create the parent async_org_io structure if it hasn't been done yet. */
35450 + if (!*async_org_io) {
35451 + *async_org_io = allocate_async_org_io(org_volume,
35454 + if (!*async_org_io) {
35455 + // BUGBUG: Disable the snapshot?
35459 + /* If we are only reading a partial chunk from the original,
35460 + * may need to readjust the size in the last buffer.
35462 + if ( (sectors_in_chunk < org_volume->chunk_size) &&
35463 + (sectors_in_chunk &
35464 + ((PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT) - 1)) ) {
35465 + for ( buf1 = (*async_org_io)->copy_buffers;
35466 + buf1->buffer_next;
35467 + buf1 = buf1->buffer_next ) {
35470 + buf1->bh->b_size = (sectors_in_chunk <<
35471 + EVMS_VSECTOR_SIZE_SHIFT) &
35476 + /* Create an async_snap_io structure for this snapshot and attach to
35477 + * the org io structure.
35479 + async_snap_io = allocate_async_snap_io(snap_volume, new_map_entry,
35480 + *async_org_io, snap_chunk_lba,
35482 + if (!async_snap_io) {
35483 + // BUGBUG: Disable the snapshot?
35487 + /* Fill in the next entry in the COW table. Copy the COW table to the
35488 + * buffer to be written out later.
35490 + snap_volume->cow_table[snap_volume->next_cow_entry] = cpu_to_le64p(&chunk);
35491 + snap_volume->next_cow_entry++;
35492 + cow_buf = async_snap_io->cow_table_buffer;
35493 + cow_buf->bh->b_rdev = org_bh->b_rdev;
35494 + cow_buf->bh->b_rsector = snap_volume->current_cow_sector;
35495 + memcpy(cow_buf->bh->b_data, snap_volume->cow_table, EVMS_VSECTOR_SIZE);
35497 + /* If the COW table is full, reinitialize for the next sector. */
35498 + if ( snap_volume->next_cow_entry >= (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
35499 + snap_volume->next_cow_entry = 0;
35500 + snap_volume->current_cow_sector++;
35501 + memset(snap_volume->cow_table, 0xff, EVMS_VSECTOR_SIZE);
35504 + /* Attach the original buffer head, if it hasn't been queued
35505 + * already on a different copy.
35507 + if (!*queued_org_bh) {
35508 + org_bh->b_reqnext = NULL;
35509 + if (write_to_snapshot) {
35510 + /* Write to the snapshot. Attach to the async_snap_io. */
35511 + org_bh->b_rsector = (new_map_entry->snap_chunk <<
35512 + snap_volume->chunk_shift) +
35514 + async_snap_io->pending_writes = org_bh;
35515 + async_snap_io->dev = org_bh->b_rdev;
35517 + /* Write to the original. Attach to the async_org_io. */
35518 + (*async_org_io)->pending_writes = org_bh;
35519 + (*async_org_io)->dev = org_bh->b_rdev;
35521 + evms_cs_volume_request_in_progress(org_bh->b_rdev, +1, NULL);
35522 + *queued_org_bh = TRUE;
35525 + /* Point the hash table entry at this async_snap_io. Then add this
35526 + * async_snap_io to the list in the async_org_io, as well as to the
35527 + * list in the snapshot volume.
35529 + new_map_entry->snap_io = async_snap_io;
35531 + async_snap_io->snap_io_list_next = (*async_org_io)->snap_io_list;
35532 + (*async_org_io)->snap_io_list = async_snap_io;
35533 + atomic_inc(&(*async_org_io)->copy_count);
35534 + atomic_inc(&(*async_org_io)->ref_count);
35536 + schedule_cow_table_write(async_snap_io);
35538 + /* Parallel walk through the copy_buffer's in the org and the snap,
35539 + * updating all necessary pointers and lists.
35541 + for ( buf1 = (*async_org_io)->copy_buffers,
35542 + buf2 = async_snap_io->copy_buffers;
35544 + buf1 = buf1->buffer_next, buf2 = buf2->buffer_next ) {
35545 + buf2->copy_next = buf1->copy_next;
35546 + buf2->buffer_private = async_snap_io;
35547 + buf2->bh->b_rdev = org_bh->b_rdev;
35548 + buf2->bh->b_data = buf1->bh->b_data;
35549 + buf2->bh->b_page = buf1->bh->b_page;
35551 + buf1->bh->b_rdev = org_bh->b_rdev;
35552 + buf1->copy_next = buf2;
35553 + buf1->buffer_private = *async_org_io;
35556 + /* We're done modifying snapshot volume info, so we can release the
35557 + * lock. We can't start any reads until all snapshots for this original
35558 + * have been checked. Return and start the reads later.
35560 + up_write(&snap_volume->snap_semaphore);
35566 + * snapshot_copy_data
35568 +static void snapshot_copy_data(struct snapshot_volume * org_volume,
35569 + struct buffer_head * org_bh)
35571 + struct snapshot_volume * snap_volume, * next_volume;
35572 + struct async_org_io * async_org_io = NULL;
35573 + struct snap_io_buffer * buf;
35574 + int queued_org_bh = FALSE;
35576 + /* Check each snapshot on this original
35577 + * to see which ones need a remap.
35579 + for ( snap_volume = org_volume->snapshot_next;
35580 + snap_volume; snap_volume = next_volume ) {
35581 + next_volume = snap_volume->snapshot_next;
35582 + snapshot_copy_1(snap_volume, org_bh, &async_org_io,
35583 + &queued_org_bh, FALSE);
35586 + if (async_org_io) {
35587 + /* One or more snapshots need a remap. The async_io structures
35588 + * have been built. Now we just need to run through them and
35589 + * start all of the reads.
35591 + for ( buf = async_org_io->copy_buffers;
35592 + buf; buf = buf->buffer_next ) {
35593 + R_IO(org_volume->logical_node, buf->bh);
35595 + } else if (!queued_org_bh) {
35596 + /* None of the snapshots needed a remap, and we didn't have to
35597 + * queue this request to be processed later due to a copy in
35598 + * progress. The write can be sent down normally.
35600 + W_IO(org_volume->logical_node, org_bh);
35605 + * writeable_snapshot_copy_data
35607 +static void writeable_snapshot_copy_data(struct snapshot_volume * snap_volume,
35608 + struct buffer_head * org_bh)
35610 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
35611 + struct async_org_io * async_org_io = NULL;
35612 + struct snap_io_buffer * buf;
35613 + int rc, queued_org_bh = FALSE;
35615 + rc = snapshot_copy_1(snap_volume, org_bh, &async_org_io,
35616 + &queued_org_bh, TRUE);
35618 + org_bh->b_end_io(org_bh, 0);
35622 + if (async_org_io) {
35623 + /* Need to remap this chunk to the snapshot. The async_io
35624 + * structures have been built. Just need to run through them
35625 + * and start all of the reads.
35627 + for ( buf = async_org_io->copy_buffers; buf;
35628 + buf = buf->buffer_next ) {
35629 + R_IO(org_volume->logical_node, buf->bh);
35631 + } else if (!queued_org_bh) {
35632 + /* No remap. The write can be sent down immediately. */
35633 + W_IO(snap_volume->logical_node, org_bh);
35640 +static void snap_write(struct evms_logical_node * node,
35641 + struct buffer_head * bh)
35643 + struct snapshot_volume * volume = node->private;
35645 + /* Size check. */
35646 + if ( bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT) >
35647 + node->total_vsectors) {
35648 + bh->b_end_io(bh, 0);
35652 + /* Can't write if rollback is in progress. */
35653 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK ) {
35654 + LOG_ERROR("Cannot write to snapshot '%s' during rollback.\n",
35655 + volume->logical_node->name);
35656 + bh->b_end_io(bh, 0);
35660 + if ( volume->flags & EVMS_SNAPSHOT ) {
35662 + if ( volume->flags & EVMS_SNAPSHOT_WRITEABLE ) {
35663 + writeable_snapshot_copy_data(volume, bh);
35665 + bh->b_end_io(bh, 0);
35669 + snapshot_copy_data(volume, bh);
35676 +static int snap_ioctl(struct evms_logical_node * logical_node,
35677 + struct inode * inode,
35678 + struct file * file,
35679 + unsigned int cmd,
35680 + unsigned long arg)
35682 + struct snapshot_volume * volume = logical_node->private;
35683 + struct evms_quiesce_vol_pkt * quiesce;
35684 + struct evms_plugin_ioctl_pkt pkt, * user_pkt;
35685 + int percent_full, rc = 0;
35688 + case EVMS_QUIESCE_VOLUME:
35689 + quiesce = (struct evms_quiesce_vol_pkt*)arg;
35690 + if (quiesce->command) {
35692 + volume->flags |= EVMS_SNAPSHOT_QUIESCED;
35695 + volume->flags &= ~EVMS_SNAPSHOT_QUIESCED;
35699 + case EVMS_GET_BMAP:
35700 + if ( volume->flags & EVMS_SNAPSHOT_ORG ) {
35701 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35707 + case EVMS_PLUGIN_IOCTL:
35708 + user_pkt = (struct evms_plugin_ioctl_pkt *)arg;
35710 + /* Copy user's parameters to kernel space. */
35711 + if ( copy_from_user(&pkt, user_pkt, sizeof(pkt)) ) {
35716 + if ( pkt.feature_id != logical_node->plugin->id ) {
35717 + /* This ioctl is not targeted at snapshotting, so
35718 + * broadcast the command to all children.
35720 + rc = IOCTL(logical_node, inode, file, cmd, arg);
35724 + switch (pkt.feature_command) {
35725 + case SNAPSHOT_QUERY_PERCENT_FULL:
35726 + if ( volume->flags & EVMS_SNAPSHOT_FULL ) {
35727 + percent_full = -1;
35728 + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) {
35729 + percent_full = -2;
35731 + percent_full = (volume->next_free_chunk * 100) /
35732 + volume->num_chunks;
35734 + rc = copy_to_user(pkt.feature_ioctl_data,
35736 + sizeof(percent_full));
35739 + case SNAPSHOT_START_ROLLBACK:
35740 + if ( volume->flags & EVMS_SNAPSHOT_FULL ) {
35742 + } else if ( volume->flags & EVMS_SNAPSHOT_DISABLED ) {
35744 + } else if ( ! (volume->flags & EVMS_SNAPSHOT) ) {
35747 + set_snapshot_flags(volume->logical_node,
35748 + EVMS_SNAPSHOT_ROLLBACK, 0);
35752 + case SNAPSHOT_CHECK_STATE:
35753 + rc = copy_to_user(pkt.feature_ioctl_data,
35755 + sizeof(volume->flags));
35763 + case EVMS_CHECK_MEDIA_CHANGE:
35764 + case EVMS_REVALIDATE_DISK:
35765 + case EVMS_GET_DISK_LIST:
35766 + case EVMS_CHECK_DEVICE_STATUS:
35767 + /* Broadcast these to all children. */
35768 + if ( ! (volume->flags & EVMS_SNAPSHOT_ORG) ) {
35769 + volume = volume->snapshot_org;
35772 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35773 + volume = volume->snapshot_next;
35777 + case EVMS_OPEN_VOLUME:
35778 + /* Disallow opens on rollback in progress.
35779 + * Otherwise fall through.
35781 + if ( volume->flags & EVMS_SNAPSHOT_ROLLBACK) {
35782 + LOG_ERROR("Cannot open snapshot volume '%s' during rollback\n",
35783 + volume->logical_node->name);
35789 + rc = IOCTL(volume->logical_node, inode, file, cmd, arg);
35798 +static int snap_init_io(struct evms_logical_node * node,
35804 + struct snapshot_volume * volume = node->private;
35806 + /* No init io access to snapshot, and no writes allowed to original
35807 + * since they would not be snapshotted.
35809 + if ( rw || (volume->flags & EVMS_SNAPSHOT) ) {
35812 + return INIT_IO(volume->logical_node, rw,
35813 + sect_nr, num_sects, buf_addr);
35817 + * add_cow_entry_to_snapshot_map
35819 + * This function takes a cow table entry (from the on-disk data), and
35820 + * converts it into an appropriate entry for the snapshot map, and
35821 + * inserts it into the appropriate map for the specified volume.
35823 +static int add_cow_entry_to_snapshot_map(u64 org_chunk,
35825 + struct snapshot_volume * volume)
35827 + struct snapshot_hash_entry * new_entry, * target_entry;
35828 + unsigned long hash_value;
35830 + new_entry = allocate_snapshot_hash_entry(volume, org_chunk,
35831 + snap_chunk, SNAP_CHUNK_COPIED);
35832 + if (!new_entry) {
35836 + hash_value = (long)org_chunk % volume->hash_table_size;
35837 + if ( search_snapshot_hash_chain(org_chunk,
35838 + volume->snapshot_map[hash_value],
35839 + &target_entry) ) {
35840 + /* A duplicate mapping was found. This should never happen. */
35842 + if (target_entry) {
35843 + insert_snapshot_hash_entry(new_entry, target_entry);
35845 + insert_snapshot_hash_entry_at_head(new_entry,
35846 + &(volume->snapshot_map[hash_value]));
35853 + * build_snapshot_maps
35855 + * Construct the initial hash table state based on
35856 + * existing COW tables on the disk.
35858 +static int build_snapshot_maps(struct snapshot_volume * volume)
35861 + int done = FALSE;
35863 + /* Read in one sector's worth of COW tables. */
35864 + if ( INIT_IO(volume->logical_node, 0,
35865 + volume->current_cow_sector, 1,
35866 + volume->cow_table) ) {
35870 + /* Translate every valid COW table entry into
35871 + * a snapshot map entry.
35873 + for ( volume->next_cow_entry = 0;
35874 + volume->next_cow_entry < (EVMS_VSECTOR_SIZE/sizeof(u64)) &&
35875 + volume->cow_table[volume->next_cow_entry] != 0xffffffffffffffff;
35876 + volume->next_cow_entry++, volume->next_free_chunk++ ) {
35877 + rc = add_cow_entry_to_snapshot_map(le64_to_cpup(&volume->cow_table[volume->next_cow_entry]),
35878 + volume->next_free_chunk,
35885 + /* Move on to the next sector if necessary. */
35886 + if ( volume->next_cow_entry ==
35887 + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
35888 + volume->current_cow_sector++;
35897 + * initialize_snapshot_node
35899 +static int initialize_snapshot_node(struct evms_logical_node * snap_node,
35900 + struct evms_logical_node * new_snap_node,
35901 + struct evms_logical_node * org_node,
35902 + struct snapshot_metadata * metadata)
35904 + struct snapshot_volume * snap_volume;
35905 + struct snapshot_hash_entry * new_entry;
35908 + /* Instance data for the snapshot. */
35909 + snap_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL);
35910 + if (!snap_volume) {
35911 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
35912 + snap_delete_volume(new_snap_node);
35913 + DELETE(snap_node);
35916 + memset(snap_volume, 0, sizeof(struct snapshot_volume));
35918 + /* Initialize the snapshot node. */
35919 + new_snap_node->total_vsectors = org_node->total_vsectors;
35920 + new_snap_node->plugin = &plugin_header;
35921 + new_snap_node->private = snap_volume;
35922 + new_snap_node->flags = snap_node->flags |
35923 + (org_node->flags & (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL)) |
35924 + ((metadata->flags & EVMS_SNAPSHOT_WRITEABLE) ? 0 : EVMS_VOLUME_READ_ONLY);
35925 + new_snap_node->hardsector_size = snap_node->hardsector_size;
35926 + new_snap_node->block_size = snap_node->block_size;
35927 + new_snap_node->system_id = EVMS_SNAPSHOT_SIGNATURE;
35928 + new_snap_node->volume_info = snap_node->volume_info;
35929 + /* Get the new node's name from the consumed node's feature header. */
35930 + strcpy(new_snap_node->name, snap_node->feature_header->object_name);
35932 + /* Initialize the private data. */
35933 + snap_volume->logical_node = snap_node;
35934 + snap_volume->exported_node = new_snap_node;
35935 + init_rwsem(&snap_volume->snap_semaphore);
35936 + snap_volume->chunk_size = metadata->chunk_size;
35937 + snap_volume->chunk_shift = evms_cs_log2((u64)metadata->chunk_size);
35938 + snap_volume->num_chunks = metadata->total_chunks;
35939 + snap_volume->current_cow_sector = metadata->lba_of_COW_table;
35940 + snap_volume->hash_table_size = metadata->total_chunks / MAX_HASH_CHAIN_ENTRIES + 1;
35941 + snap_volume->flags = EVMS_SNAPSHOT |
35942 + (metadata->flags & EVMS_SNAPSHOT_WRITEABLE) |
35943 + (metadata->flags & EVMS_SNAPSHOT_ASYNC);
35944 + INIT_LIST_HEAD(&snap_volume->cow_table_write_list);
35945 + spin_lock_init(&snap_volume->cow_table_write_list_lock);
35947 +#ifdef SNAPSHOT_DEBUG
35948 + snap_volume->cow_table_writes = (atomic_t)ATOMIC_INIT(0);
35949 + snap_volume->cow_table_overlaps = (atomic_t)ATOMIC_INIT(0);
35952 + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) {
35954 + /* Buffer for reading rollback data. */
35955 + snap_volume->chunk_data_buffer = kmalloc(SNAPSHOT_CHUNK_BUFFER_SIZE <<
35956 + EVMS_VSECTOR_SIZE_SHIFT,
35958 + if (!snap_volume->chunk_data_buffer) {
35959 + disable_snapshot(snap_volume, TRUE);
35960 + snap_delete_volume(new_snap_node);
35964 + /* Create the rollback thread. */
35965 + snap_volume->rollback_thread =
35966 + evms_cs_register_thread(snapshot_do_rollback,
35968 + "evms_snapshot_rollback");
35969 + if (!snap_volume->rollback_thread){
35970 + LOG_SERIOUS("Could not start rollback thread for snapshot '%s'.\n",
35971 + snap_node->name);
35972 + disable_snapshot(snap_volume, TRUE);
35973 + snap_delete_volume(new_snap_node);
35977 + /* Snapshot hash table. */
35978 + snap_volume->snapshot_map = vmalloc(snap_volume->hash_table_size *
35979 + sizeof(struct snapshot_hash_entry*));
35980 + if (!snap_volume->snapshot_map) {
35981 + disable_snapshot(snap_volume, TRUE);
35982 + snap_delete_volume(new_snap_node);
35985 + memset(snap_volume->snapshot_map, 0,
35986 + snap_volume->hash_table_size *
35987 + sizeof(struct snapshot_hash_entry*));
35989 + /* Pre-allocate all of the hash entries we will need and
35990 + * store them in the free list in the volume.
35992 + for ( i = 0; i < snap_volume->num_chunks; i++ ) {
35993 + new_entry = mempool_alloc(snap_hash_entry_pool,
35995 + if (!new_entry) {
35996 + disable_snapshot(snap_volume, TRUE);
35997 + snap_delete_volume(new_snap_node);
36000 + new_entry->next = snap_volume->free_hash_list;
36001 + snap_volume->free_hash_list = new_entry;
36004 + rc = build_snapshot_maps(snap_volume);
36006 + disable_snapshot(snap_volume, TRUE);
36007 + snap_delete_volume(new_snap_node);
36016 + * initialize_original_node
36018 +static int initialize_original_node(struct evms_logical_node * snap_node,
36019 + struct evms_logical_node * new_snap_node,
36020 + struct evms_logical_node * org_node,
36021 + struct evms_logical_node * new_org_node)
36023 + struct snapshot_volume * snap_volume = new_snap_node->private;
36024 + struct snapshot_volume * org_volume;
36026 + /* Instance data for the original. */
36027 + org_volume = kmalloc(sizeof(struct snapshot_volume), GFP_KERNEL);
36028 + if (!org_volume) {
36029 + disable_snapshot(snap_volume, TRUE);
36030 + snap_delete_volume(new_snap_node);
36031 + snap_delete_volume(new_org_node);
36034 + memset(org_volume, 0, sizeof(struct snapshot_volume));
36036 + /* Initialize the new node. */
36037 + new_org_node->total_vsectors = org_node->total_vsectors;
36038 + new_org_node->plugin = &plugin_header;
36039 + new_org_node->private = org_volume;
36040 + new_org_node->flags = org_node->flags |
36041 + (snap_node->flags &
36042 + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
36043 + new_org_node->hardsector_size = org_node->hardsector_size;
36044 + new_org_node->block_size = org_node->block_size;
36045 + new_org_node->system_id = EVMS_ORIGINAL_SIGNATURE;
36046 + new_org_node->volume_info = org_node->volume_info;
36047 + /* Must reuse the original node's name. */
36048 + strcpy(new_org_node->name, org_node->name);
36050 + /* Initialize the private data. */
36051 + org_volume->logical_node = org_node;
36052 + org_volume->exported_node = new_org_node;
36053 + init_rwsem(&org_volume->snap_semaphore);
36054 + org_volume->chunk_size = snap_volume->chunk_size;
36055 + org_volume->chunk_shift = snap_volume->chunk_shift;
36056 + org_volume->flags = EVMS_SNAPSHOT_ORG |
36057 + (snap_volume->flags & EVMS_SNAPSHOT_ASYNC);
36058 + INIT_LIST_HEAD(&org_volume->chunk_write_list);
36059 + spin_lock_init(&org_volume->chunk_write_list_lock);
36060 + INIT_LIST_HEAD(&org_volume->org_pending_io_list);
36061 + spin_lock_init(&org_volume->org_pending_io_list_lock);
36062 + INIT_LIST_HEAD(&org_volume->snap_pending_io_list);
36063 + spin_lock_init(&org_volume->snap_pending_io_list_lock);
36065 + /* Start the async I/O thread for this original. */
36066 + org_volume->async_io_thread =
36067 + evms_cs_register_thread(snap_async_io_thread, org_volume,
36068 + "evms_async_snapshot");
36069 + if (!org_volume->async_io_thread) {
36070 + disable_snapshot(snap_volume, TRUE);
36071 + snap_delete_volume(new_snap_node);
36072 + snap_delete_volume(new_org_node);
36082 + * Initializes a snapshot instance and exports an evms_logical_node to
36083 + * the global list.
36085 +static int add_snapshot(struct evms_logical_node * snap_node,
36086 + struct snapshot_metadata * metadata,
36087 + struct evms_logical_node ** evms_node_list)
36089 + struct evms_logical_node * new_snap_node;
36090 + struct evms_logical_node * new_org_node;
36091 + struct evms_logical_node * org_node;
36092 + struct snapshot_volume * snap_volume;
36093 + struct snapshot_volume * org_volume;
36094 + struct snapshot_volume * tmp_volume;
36097 + /* Make sure the snapshot is not full or disabled. */
36098 + if ( metadata->flags & (EVMS_SNAPSHOT_DISABLED | EVMS_SNAPSHOT_FULL) ) {
36099 + LOG_WARNING("Error: Snapshot %s discovered as disabled/full.\n",
36100 + snap_node->name);
36101 + LOG_WARNING(" Deleting from further use.\n");
36102 + DELETE(snap_node);
36106 + /* Inspect the global list until a node is found with the name of
36107 + * this snapshot's original. There can only be one original for
36110 + for ( org_node = *evms_node_list;
36111 + org_node && strncmp(EVMS_GET_NODE_NAME(org_node),
36112 + metadata->original_volume,
36113 + EVMS_VOLUME_NAME_SIZE);
36114 + org_node = org_node->next ) {
36118 + /* No original was found. Disable and delete the snapshot. */
36119 + LOG_ERROR("Error: No original found for snapshot %s, looking for %s\n",
36120 + snap_node->name, metadata->original_volume);
36121 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36122 + DELETE(snap_node);
36126 + LOG_DEBUG("Adding snapshot for '%s'\n", org_node->name);
36128 + /* We found the original on the list. Verify the size to be sure the
36129 + * name didn't change for compatibility. For non-512-byte hardsector
36130 + * sizes, round down org node to a hardsector multiple to be the same
36131 + * as what was stored in the metadata.
36133 + if ( (org_node->total_vsectors &
36134 + (~((org_node->hardsector_size/EVMS_VSECTOR_SIZE)-1))) !=
36135 + metadata->original_size ) {
36136 + /* The snapshot no longer points at a valid original.
36137 + * Disable and delete the snapshot.
36139 + LOG_ERROR("Error: Original volume size does not match for snapshot '%s'!\n",
36140 + snap_node->name);
36141 + LOG_ERROR(" volume=%s: org_size="PFU64", current size="PFU64"\n",
36142 + org_node->name, metadata->original_size,
36143 + org_node->total_vsectors);
36144 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36145 + DELETE(snap_node);
36149 + /* New EVMS node for the snapshot. */
36150 + if ( evms_cs_allocate_logical_node(&new_snap_node) ) {
36151 + set_snapshot_flags(snap_node, EVMS_SNAPSHOT_DISABLED, 0);
36152 + DELETE(snap_node);
36156 + MOD_INC_USE_COUNT;
36157 + snapshot_count++;
36159 + snapshot_create_pools();
36161 + rc = initialize_snapshot_node(snap_node, new_snap_node,
36162 + org_node, metadata);
36166 + snap_volume = new_snap_node->private;
36168 + /* Check to see if the node we found is one we put back on the list due
36169 + * to another snapshot of the original, if so then don't allocate a new
36170 + * node and volume info, just get the old one.
36172 + if ( org_node->plugin->id != plugin_header.id ) {
36174 + /* New EVMS node for the original. */
36175 + if ( evms_cs_allocate_logical_node(&new_org_node) ) {
36176 + disable_snapshot(snap_volume, TRUE);
36177 + snap_delete_volume(new_snap_node);
36181 + MOD_INC_USE_COUNT;
36182 + snapshot_count++;
36184 + rc = initialize_original_node(snap_node, new_snap_node,
36185 + org_node, new_org_node);
36189 + org_volume = new_org_node->private;
36191 + /* Remove the original volume from the global list, then
36192 + * add the new version of the original to the global list.
36194 + evms_cs_remove_logical_node_from_list(evms_node_list, org_node);
36195 + evms_cs_add_logical_node_to_list(evms_node_list, new_org_node);
36197 + /* There is already at least one snapshot for this original. */
36198 + new_org_node = org_node;
36199 + org_volume = new_org_node->private;
36200 + org_node = org_volume->logical_node;
36202 + /* Make sure this snapshot matches the current
36203 + * chunk size if we have async snapshots.
36205 + if ( snap_volume->chunk_size != org_volume->chunk_size ) {
36206 + LOG_ERROR("Cannot add snapshot '%s' with chunk size %u to original '%s' with chunk size %u.\n",
36207 + new_snap_node->name, snap_volume->chunk_size,
36208 + new_org_node->name, org_volume->chunk_size);
36209 + disable_snapshot(snap_volume, TRUE);
36210 + snap_delete_volume(new_snap_node);
36214 + /* If the new snapshot is Removable or Partial, propagate
36215 + * the flags to the original and all other snapshots.
36217 + for ( tmp_volume = org_volume;
36219 + tmp_volume = tmp_volume->snapshot_next) {
36220 + tmp_volume->exported_node->flags |=
36221 + (snap_node->flags &
36222 + (EVMS_DEVICE_REMOVABLE | EVMS_VOLUME_PARTIAL));
36226 + /* Create a proc-fs entry for this snapshot. */
36228 + create_proc_read_entry(snap_node->feature_header->volume_name,
36229 + S_IFREG, snap_proc,
36230 + snap_proc_read, new_snap_node);
36233 + /* Insert the new snapshot at the start of the original's chain. */
36234 + down_write(&org_volume->snap_semaphore);
36235 + snap_volume->snapshot_next = org_volume->snapshot_next;
36236 + org_volume->snapshot_next = snap_volume;
36237 + snap_volume->snapshot_org = org_volume;
36238 + up_write(&org_volume->snap_semaphore);
36240 + /* Place the new snapshot on the global list. */
36241 + evms_cs_add_logical_node_to_list(evms_node_list, new_snap_node);
36243 + if ( metadata->flags & EVMS_SNAPSHOT_ROLLBACK ) {
36244 + org_volume->flags |= EVMS_SNAPSHOT_ROLLBACK;
36245 + snap_volume->flags |= EVMS_SNAPSHOT_ROLLBACK;
36246 + evms_cs_wakeup_thread(snap_volume->rollback_thread);
36255 +void snapshot_do_rollback(void * volume)
36257 + struct snapshot_volume * snap_volume = volume;
36258 + struct snapshot_volume * org_volume = snap_volume->snapshot_org;
36259 + u32 io_size = snap_volume->chunk_size;
36260 + u32 sectors = io_size;
36261 + int done = FALSE;
36262 + int i, iterations = 1;
36264 + evms_cs_invalidate_volume(org_volume->exported_node);
36265 + evms_cs_invalidate_volume(snap_volume->exported_node);
36267 + /* Safety to start at chunk 0. */
36268 + snap_volume->next_free_chunk = 0;
36271 + if ( SNAPSHOT_CHUNK_BUFFER_SIZE < snap_volume->chunk_size ) {
36272 + iterations = snap_volume->chunk_size /
36273 + org_volume->chunk_size;
36274 + sectors = io_size = org_volume->chunk_size;
36277 + /* Read in one sector's worth of COW tables. */
36278 + if ( INIT_IO(snap_volume->logical_node, 0,
36279 + snap_volume->current_cow_sector, 1,
36280 + snap_volume->cow_table) ) {
36281 + LOG_ERROR("Error reading COW table from snapshot during rollback, aborting rollback\n");
36285 + /* Translate every valid COW table entry into
36286 + * a snapshot map entry.
36288 + for ( snap_volume->next_cow_entry = 0;
36289 + snap_volume->next_cow_entry <
36290 + (EVMS_VSECTOR_SIZE/sizeof(u64)) &&
36291 + snap_volume->cow_table[snap_volume->next_cow_entry] !=
36292 + 0xffffffffffffffff;
36293 + snap_volume->next_cow_entry++,
36294 + snap_volume->next_free_chunk++ ) {
36295 + for ( i = 0; i < iterations; i++ ) {
36297 + /* Don't go off the end of the original. */
36299 + org_volume->logical_node->total_vsectors -
36300 + (snap_volume->cow_table[snap_volume->next_cow_entry] *
36301 + snap_volume->chunk_size + i * io_size) ) {
36302 + sectors = org_volume->logical_node->total_vsectors -
36303 + (snap_volume->cow_table[snap_volume->next_cow_entry] *
36304 + snap_volume->chunk_size + i * io_size);
36307 + /* Read the chunk from the snapshot volume. */
36308 + if ( INIT_IO(snap_volume->logical_node, READ,
36309 + (snap_volume->next_free_chunk *
36310 + snap_volume->chunk_size +
36313 + snap_volume->chunk_data_buffer) ) {
36314 + LOG_ERROR("Error reading chunk %u from snapshot '%s'. Continuing.\n",
36315 + snap_volume->next_free_chunk,
36316 + snap_volume->logical_node->name);
36319 + /* Write the chunk to the original volume. */
36320 + if ( INIT_IO(org_volume->logical_node, WRITE,
36321 + snap_volume->cow_table[snap_volume->next_cow_entry] *
36322 + snap_volume->chunk_size + i*io_size,
36324 + snap_volume->chunk_data_buffer) ) {
36325 + LOG_ERROR("Error writing chunk %u to original '%s' during rollback. Continuing.\n",
36326 + snap_volume->next_free_chunk,
36327 + org_volume->logical_node->name);
36330 + if ( sectors < io_size ) {
36336 + /* Move on to the next COW table sector if necessary. */
36337 + if ( snap_volume->next_cow_entry ==
36338 + (EVMS_VSECTOR_SIZE/sizeof(u64)) ) {
36339 + snap_volume->current_cow_sector++;
36342 + snap_volume->flags |= EVMS_SNAPSHOT_DISABLED |
36343 + EVMS_SNAPSHOT_ROLLBACK_COMP;
36344 + snap_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK;
36345 + org_volume->flags &= ~EVMS_SNAPSHOT_ROLLBACK;
36346 + set_snapshot_flags(snap_volume->logical_node,
36347 + EVMS_SNAPSHOT_DISABLED |
36348 + EVMS_SNAPSHOT_ROLLBACK_COMP,
36349 + EVMS_SNAPSHOT_ROLLBACK);
36350 + LOG_DEFAULT("Rollback complete from snapshot %s\n",
36351 + snap_volume->exported_node->name);
36359 + * Callback function for the proc-fs entry for each snapshot node.
36360 + * Print out pertinent information about this snapshot. The "data"
36361 + * parameter is a pointer to an EVMS logical node.
36363 +static int snap_proc_read(char * page, char ** start, off_t off,
36364 + int count, int * eof, void * data)
36366 + struct evms_logical_node * snap_node = data;
36367 + struct snapshot_volume * snap_volume = snap_node->private;
36370 + PROCPRINT("Snapshot of : %s\n", (snap_volume->snapshot_org) ? EVMS_GET_NODE_NAME(snap_volume->snapshot_org->logical_node) : (u8 *)"Unknown");
36371 + PROCPRINT("Size (KB) : %u\n", (snap_volume->num_chunks * snap_volume->chunk_size)/2);
36372 + PROCPRINT("Chunk Size (KB): %u\n", (snap_volume->chunk_size)/2);
36373 + PROCPRINT("Writeable : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_WRITEABLE) ? "Yes" : "No");
36374 + PROCPRINT("Usage : %u%%\n", (snap_volume->next_free_chunk * 100) / snap_volume->num_chunks);
36375 + PROCPRINT("Status : %s\n", (snap_volume->flags & EVMS_SNAPSHOT_FULL) ? "Full / Disabled" : (snap_volume->flags & EVMS_SNAPSHOT_DISABLED) ? "Disabled" : "Active");
36376 +#ifdef SNAPSHOT_DEBUG
36377 + PROCPRINT("Next free chunk: %u\n", snap_volume->next_free_chunk);
36378 + PROCPRINT("COW Writes : %u\n", atomic_read(&snap_volume->cow_table_writes));
36379 + PROCPRINT("COW Overlaps : %u\n", atomic_read(&snap_volume->cow_table_overlaps));
36383 + *start = page + off;
36387 + return sz > count ? count : sz;
36393 +int __init snapshot_init(void)
36395 + struct proc_dir_entry * pde;
36397 + /* Register a directory in proc-fs. */
36398 + pde = evms_cs_get_evms_proc_dir();
36400 + snap_proc = create_proc_entry("snapshot", S_IFDIR, pde);
36403 + /* Register with EVMS. */
36404 + return evms_cs_register_plugin(&plugin_header);
36410 +void __exit snapshot_exit(void)
36412 + struct proc_dir_entry * pde;
36414 + /* Unregister the directory in proc-fs. */
36415 + pde = evms_cs_get_evms_proc_dir();
36417 + remove_proc_entry("snapshot", pde);
36420 + evms_cs_unregister_plugin(&plugin_header);
36423 +module_init(snapshot_init);
36424 +module_exit(snapshot_exit);
36425 +#ifdef MODULE_LICENSE
36426 +MODULE_LICENSE("GPL");
36429 diff -Naur linux-2002-09-30/include/linux/evms/evms.h evms-2002-09-30/include/linux/evms/evms.h
36430 --- linux-2002-09-30/include/linux/evms/evms.h Wed Dec 31 18:00:00 1969
36431 +++ evms-2002-09-30/include/linux/evms/evms.h Thu Sep 26 11:55:45 2002
36433 +/* -*- linux-c -*- */
36435 + * Copyright (c) International Business Machines Corp., 2000
36437 + * This program is free software; you can redistribute it and/or modify
36438 + * it under the terms of the GNU General Public License as published by
36439 + * the Free Software Foundation; either version 2 of the License, or
36440 + * (at your option) any later version.
36442 + * This program is distributed in the hope that it will be useful,
36443 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
36444 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
36445 + * the GNU General Public License for more details.
36447 + * You should have received a copy of the GNU General Public License
36448 + * along with this program; if not, write to the Free Software
36449 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
36452 + * linux/include/linux/evms/evms.h
36454 + * EVMS kernel header file
36458 +#ifndef __EVMS_INCLUDED__
36459 +#define __EVMS_INCLUDED__
36461 +#include <linux/blk.h>
36462 +#include <linux/genhd.h>
36463 +#include <linux/fs.h>
36464 +#include <linux/iobuf.h>
36465 +#include <linux/kdev_t.h>
36466 +#include <linux/hdreg.h>
36467 +#include <linux/slab.h>
36468 +#include <linux/proc_fs.h>
36469 +#include <linux/major.h>
36474 +#define EVMS_MAJOR_VERSION 1
36475 +#define EVMS_MINOR_VERSION 2
36476 +#define EVMS_PATCHLEVEL_VERSION 0
36479 + * general defines section
36484 +#define MAX_EVMS_VOLUMES 256
36485 +#define EVMS_VOLUME_NAME_SIZE 127
36486 +#define IBM_OEM_ID 8112
36487 +#define EVMS_INITIAL_CRC 0xFFFFFFFF
36488 +#define EVMS_MAGIC_CRC 0x31415926
36489 +#define EVMS_VSECTOR_SIZE 512
36490 +#define EVMS_VSECTOR_SIZE_SHIFT 9
36492 +#define DEV_PATH "/dev"
36493 +#define EVMS_DIR_NAME "evms"
36494 +#define EVMS_DEV_NAME "block_device"
36495 +#define EVMS_DEV_NODE_PATH DEV_PATH "/" EVMS_DIR_NAME "/"
36496 +#define EVMS_DEVICE_NAME DEV_PATH "/" EVMS_DIR_NAME "/" EVMS_DEV_NAME
36499 + * kernel logging levels defines
36501 +#define EVMS_INFO_CRITICAL 0
36502 +#define EVMS_INFO_SERIOUS 1
36503 +#define EVMS_INFO_ERROR 2
36504 +#define EVMS_INFO_WARNING 3
36505 +#define EVMS_INFO_DEFAULT 5
36506 +#define EVMS_INFO_DETAILS 6
36507 +#define EVMS_INFO_DEBUG 7
36508 +#define EVMS_INFO_EXTRA 8
36509 +#define EVMS_INFO_ENTRY_EXIT 9
36510 +#define EVMS_INFO_EVERYTHING 10
36513 + * kernel logging level variable
36515 +extern int evms_info_level;
36518 + * kernel logging macros
36520 +#define evmsLOG(info_level,prspec) { if (evms_info_level >= info_level) printk prspec; }
36521 +#define evmsLOG2(info_level,statement) { if (evms_info_level >= info_level) statement; }
36524 + * LOG MACROS to make evms log messages
36525 + * look much cleaner in the source.
36527 +#define EVMS_LOG_PREFIX "evms: "
36528 +#define LOG_CRITICAL(msg, args...) evmsLOG(EVMS_INFO_CRITICAL, (KERN_CRIT EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36529 +#define LOG_SERIOUS(msg, args...) evmsLOG(EVMS_INFO_SERIOUS, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36530 +#define LOG_ERROR(msg, args...) evmsLOG(EVMS_INFO_ERROR, (KERN_ERR EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36531 +#define LOG_WARNING(msg, args...) evmsLOG(EVMS_INFO_WARNING, (KERN_WARNING EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36532 +#define LOG_DEFAULT(msg, args...) evmsLOG(EVMS_INFO_DEFAULT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36533 +#define LOG_DETAILS(msg, args...) evmsLOG(EVMS_INFO_DETAILS, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36534 +#define LOG_DEBUG(msg, args...) evmsLOG(EVMS_INFO_DEBUG, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36535 +#define LOG_EXTRA(msg, args...) evmsLOG(EVMS_INFO_EXTRA, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36536 +#define LOG_ENTRY_EXIT(msg, args...) evmsLOG(EVMS_INFO_ENTRY_EXIT, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36537 +#define LOG_EVERYTHING(msg, args...) evmsLOG(EVMS_INFO_EVERYTHING, (KERN_INFO EVMS_LOG_PREFIX LOG_PREFIX msg, ## args))
36540 + * Macros to cleanly print 64-bit numbers on both 32-bit and 64-bit machines.
36541 + * Use these in place of %Ld, %Lu, and %Lx.
36543 +#if BITS_PER_LONG > 32
36544 +#define PFD64 "%ld"
36545 +#define PFU64 "%lu"
36546 +#define PFX64 "%lx"
36548 +#define PFD64 "%Ld"
36549 +#define PFU64 "%Lu"
36550 +#define PFX64 "%Lx"
36554 + * helpful PROCFS macro
36556 +#ifdef CONFIG_PROC_FS
36557 +#define PROCPRINT(msg, args...) (sz += sprintf(page + sz, msg, ## args));\
36559 + off -= sz, sz = 0;\
36560 + else if (sz >= off + count)\
36565 + * PluginID convenience macros
36567 + * An EVMS PluginID is a 32-bit number with the following bit positions:
36568 + * Top 16 bits: OEM identifier. See IBM_OEM_ID.
36569 + * Next 4 bits: Plugin type identifier. See evms_plugin_code.
36570 + * Lowest 12 bits: Individual plugin identifier within a given plugin type.
36572 +#define SetPluginID(oem, type, id) ((oem << 16) | (type << 12) | id)
36573 +#define GetPluginOEM(pluginid) (pluginid >> 16)
36574 +#define GetPluginType(pluginid) ((pluginid >> 12) & 0xf)
36575 +#define GetPluginID(pluginid) (pluginid & 0xfff)
36578 + * enum evms_plugin_type - evms plugin types
36580 +enum evms_plugin_code {
36581 + EVMS_NO_PLUGIN = 0,
36582 + EVMS_DEVICE_MANAGER,
36583 + EVMS_SEGMENT_MANAGER,
36584 + EVMS_REGION_MANAGER,
36586 + EVMS_ASSOCIATIVE_FEATURE,
36587 + EVMS_FILESYSTEM_INTERFACE_MODULE,
36588 + EVMS_CLUSTER_MANAGER_INTERFACE_MODULE,
36589 + EVMS_DISTRIBUTED_LOCK_MANAGER_INTERFACE_MODULE
36593 + * struct evms_version -
36594 + * @major: changes when incompatible differences are introduced
36595 + * @minor: changes when additions are made
36596 + * @patchlevel: reflects bug level fixes within a particular major/minor pair
36598 + * generic versioning info used by EVMS
36600 +struct evms_version {
36607 + * struct evms_plugin_header - kernel plugin header record
36609 + * @version: plugin version
36610 + * @required_services_version: required common services version
36611 + * @fops: table of function operations
36613 + * kernel plugin header record
36615 +struct evms_plugin_header {
36617 + struct evms_version version;
36618 + struct evms_version required_services_version;
36619 + struct evms_plugin_fops *fops;
36623 + * struct evms_feature_header - EVMS generic on-disk header for features
36624 + * @signature: unique magic number
36625 + * @crc: structure's crc
36626 + * @version: feature header version
36627 + * @engine_version: created by this evms engine version
36628 + * @flags: feature characteristics, bit definitions below.
36629 + * @feature_id: indicates which feature this header is describing
36630 + * @sequence_number: describes most recent copy of redundant metadata
36631 + * @alignment_padding: used when objects are moved between different sized devices
36632 + * @feature_data1_start_lsn: object relative start of 1st copy feature data
36633 + * @feature_data1_size: size of 1st copy of feature data
36634 + * @feature_data2_start_lsn: object relative start of 2nd copy feature data
36635 + * @feature_data2_size: size of 2nd copy of feature data
36636 + * @volume_serial_number: unique/persistent volume identifier
36637 + * @volume_system_id: unique/persistent minor number
36638 + * @object_depth: depth of object in volume tree
36639 + * @object_name: object's name
36640 + * @volume_name: volume name object is a part of
36641 + * @pad: padding to make structure be 512 byte aligned
36643 + * generic on-disk header used to describe any EVMS feature
36644 + * NOTE: 2nd copy of feature data is optional, if used set start_lsn to 0.
36646 +struct evms_feature_header {
36649 + struct evms_version version;
36650 + struct evms_version engine_version;
36653 + u64 sequence_number;
36654 + u64 alignment_padding;
36655 + u64 feature_data1_start_lsn;
36656 + u64 feature_data1_size;
36657 + u64 feature_data2_start_lsn;
36658 + u64 feature_data2_size;
36659 + u64 volume_serial_number;
36660 + u32 volume_system_id;
36661 + u32 object_depth;
36662 + u8 object_name[EVMS_VOLUME_NAME_SIZE + 1];
36663 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
36668 + * field evms_feature_header.signature magic number
36670 +#define EVMS_FEATURE_HEADER_SIGNATURE 0x54414546 /* FEAT */
36672 + * field evms_feature_header.flags defines
36674 +#define EVMS_FEATURE_ACTIVE (1<<0)
36675 +#define EVMS_FEATURE_VOLUME_COMPLETE (1<<1)
36676 +#define EVMS_VOLUME_DATA_OBJECT (1<<16)
36677 +#define EVMS_VOLUME_DATA_STOP (1<<17)
36679 + * struct evms_feature_header version info
36681 +#define EVMS_FEATURE_HEADER_MAJOR 3
36682 +#define EVMS_FEATURE_HEADER_MINOR 0
36683 +#define EVMS_FEATURE_HEADER_PATCHLEVEL 0
36686 + * EVMS specific error codes
36688 +#define EVMS_FEATURE_FATAL_ERROR 257
36689 +#define EVMS_VOLUME_FATAL_ERROR 258
36690 +#define EVMS_FEATURE_INCOMPLETE_ERROR 259
36693 + * struct evms_volume_info - exported volume info
36694 + * @volume_sn: unique volume identifier
36695 + * @volume_minor: persistent device minor assigned to this volume
36696 + * @volume_name: persistent name assigned to this volume
36698 + * a collection of volume specific info
36700 +struct evms_volume_info {
36702 + u32 volume_minor;
36703 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
36707 + * struct evms_logical_node - generic kernel storage object
36708 + * @total_vsectors: 0 size of this object in 512 byte units
36709 + * @plugin: 8 plugin that created/owns/manages this storage object
36710 + * @private: 12 location for owner to store private info
36711 + * @flags: 16 storage object characteristics (set/used by plugins)
36712 + * bit definitions located in evms_common.h
36713 + * @iflags: 20 internal flags (used exclusively by the framework, not for plugins to use/set)
36714 + * bit definitions below.
36715 + * @hardsector_size: 24 assumed physical sector size of underlying device
36716 + * @block_size: 28 default block size for this object
36717 + * @system_id: 32 system indicator (set by the segment manager)
36718 + * @volume_info: 36 persistent volume info, used only by EVMS volumes
36719 + * @feature_header: 40 generic on-disk metadata describing any EVMS feature
36720 + * @next: 44 linked list field
36721 + * @name: 48 storage object name
36724 + * generic kernel storage object
36726 +struct evms_logical_node {
36727 + u64 total_vsectors;
36728 + struct evms_plugin_header *plugin;
36732 + int hardsector_size;
36735 + struct evms_volume_info *volume_info;
36736 + struct evms_feature_header *feature_header;
36737 + struct evms_logical_node *next;
36738 + u8 name[EVMS_VOLUME_NAME_SIZE + 1];
36742 + * fields evms_logical_node.flags & evms_logical_volume.flags defines
36744 +#define EVMS_FLAGS_WIDTH 32
36745 +#define EVMS_VOLUME_FLAG (1<<0)
36746 +#define EVMS_VOLUME_PARTIAL_FLAG (1<<1)
36747 +#define EVMS_VOLUME_PARTIAL (1<<1)
36748 +#define EVMS_VOLUME_SET_READ_ONLY (1<<2)
36749 +#define EVMS_VOLUME_READ_ONLY (1<<2)
36751 + * these bits define volume status
36753 +#define EVMS_MEDIA_CHANGED (1<<20)
36754 +#define EVMS_DEVICE_UNPLUGGED (1<<21)
36756 + * these bits used for removable status
36758 +#define EVMS_DEVICE_MEDIA_PRESENT (1<<24)
36759 +#define EVMS_DEVICE_PRESENT (1<<25)
36760 +#define EVMS_DEVICE_LOCKABLE (1<<26)
36761 +#define EVMS_DEVICE_REMOVABLE (1<<27)
36764 + * fields evms_logical_node.iflags defines
36766 +#define EVMS_FEATURE_BOTTOM (1<<0)
36767 +#define EVMS_TOP_SEGMENT (1<<1)
36770 + * macro to obtain a node's name from either EVMS or compatibility volumes
36772 +#define EVMS_GET_NODE_NAME(node) \
36773 + ((node->flags & EVMS_VOLUME_FLAG) ? \
36774 + node->volume_info->volume_name : \
36778 + * macro used to transform to/from userland device handles and device storage object nodes
36780 +#define EVMS_HANDLE_KEY 0x0123456789ABCDEF
36781 +#define DEV_HANDLE_TO_NODE(handle) ((struct evms_logical_node *)(unsigned long)((handle) ^ EVMS_HANDLE_KEY))
36782 +#define NODE_TO_DEV_HANDLE(node) (((u64)(unsigned long)(node)) ^ EVMS_HANDLE_KEY)
36785 + * struct evms_logical_volume - logical volume info
36786 + * @name: logical volume name
36787 + * @node: logical volume storage object
36788 + * @flags: characteristics of logical volume
36789 + * @quiesced: quiesce state info
36790 + * @vfs_quiesced: vfs quiesce state info
36791 + * @requests_in_progress: count of in-flight I/Os
36792 + * @wait_queue: used when volume is quiesced
36793 + * @devfs_handle: handle for devfs
36794 + * @request_queue: unique request queue
36795 + * @request_lock: unique request queue lock
36797 + * contains all the fields needed to manage to a logical volume
36799 +struct evms_logical_volume {
36801 + struct evms_logical_node *node;
36804 + int vfs_quiesced;
36806 + atomic_t requests_in_progress;
36807 + wait_queue_head_t wait_queue;
36808 + devfs_handle_t devfs_handle;
36810 + request_queue_t request_queue;
36811 + spinlock_t request_lock;
36816 + * field evms_logical_volume.flags defines
36819 + * queued flags bits
36821 +#define EVMS_REQUESTED_DELETE (1<<5)
36822 +#define EVMS_REQUESTED_QUIESCE (1<<6)
36823 +#define EVMS_REQUESTED_VFS_QUIESCE (1<<7)
36825 + * this bit indicates corruption
36827 +#define EVMS_VOLUME_CORRUPT (1<<8)
36829 + * these bits define the source of the corruption
36831 +#define EVMS_VOLUME_SOFT_DELETED (1<<9)
36832 +#define EVMS_DEVICE_UNAVAILABLE (1<<10)
36835 + * The following function table is used for all plugins.
36838 + * struct evms_plugin_fops - evms plugin's table of function operations
36839 + * @discover: volume discovery entry point
36840 + * @end_discover: final discovery entry point
36841 + * @delete: delete volume entry point
36842 + * @read: asynchronous read entry point
36843 + * @write: asynchronous write entry point
36844 + * @init_io: synchronous io entry point
36845 + * @ioctl: generic ioctl entry point
36846 + * @direct_ioctl: non-generic ioctl entry point
36848 + * evms plugin's table of function operations
36850 +struct evms_plugin_fops {
36851 + int (*discover) (struct evms_logical_node **);
36852 + int (*end_discover) (struct evms_logical_node **);
36853 + int (*delete) (struct evms_logical_node *);
36854 + void (*read) (struct evms_logical_node *, struct buffer_head *);
36855 + void (*write) (struct evms_logical_node *, struct buffer_head *);
36856 + int (*init_io) (struct evms_logical_node *, int, u64,
36858 + int (*ioctl) (struct evms_logical_node *, struct inode *,
36859 + struct file *, u32, unsigned long);
36860 + int (*direct_ioctl) (struct inode *, struct file *,
36861 + u32, unsigned long);
36865 + * convenience macros to use plugin's fops entry points
36867 +#define DISCOVER(node, list) ((node)->plugin->fops->discover(list))
36868 +#define END_DISCOVER(node, list) ((node)->plugin->fops->end_discover(list))
36869 +#define DELETE(node) ((node)->plugin->fops->delete(node))
36870 +#define R_IO(node, bh) ((node)->plugin->fops->read(node, bh))
36871 +#define W_IO(node, bh) ((node)->plugin->fops->write(node, bh))
36872 +#define INIT_IO(node, rw_flag, start_sec, num_secs, buf_addr) ((node)->plugin->fops->init_io(node, rw_flag, start_sec, num_secs, buf_addr))
36873 +#define IOCTL(node, inode, file, cmd, arg) ((node)->plugin->fops->ioctl(node, inode, file, cmd, arg))
36874 +#define DIRECT_IOCTL(reg_record, inode, file, cmd, arg) ((reg_record)->plugin->fops->direct_ioctl(inode, file, cmd, arg))
36877 + * struct evms_list_node - generic non-embedded list node object
36878 + * @item: ptr to object in list
36879 + * @next: ptr to next item in list
36881 + * light weight generic non-embedded list object definition
36883 +struct evms_list_node {
36885 + struct evms_list_node *next;
36889 + * struct evms_pool_mgmt - anchor block for private pool management
36890 + * @cachep: kmem_cache_t variable
36891 + * @member_size: size of each element in the pool
36893 + * @waiters: count of waiters
36894 + * @wait_queue: list of waiters
36895 + * @name: name of the pool (must be less than 20 chars)
36897 + * anchor block for private pool management
36899 +struct evms_pool_mgmt {
36900 + kmem_cache_t *cachep;
36903 + atomic_t waiters;
36904 + wait_queue_head_t wait_queue;
36910 + * All of the following kernel thread functions belong to EVMS base.
36911 + * These functions were copied from md_core.c
36913 +#define EVMS_THREAD_WAKEUP 0
36915 + * struct evms_thread
36918 + * @wqueue: thread wait queue
36919 + * @flags: thread attributes
36920 + * @event: event completion
36921 + * @tsk: task info
36922 + * @name: thread name
36924 + * data structure for creating/managing a kernel thread
36926 +struct evms_thread {
36927 + void (*run) (void *data);
36929 + wait_queue_head_t wqueue;
36930 + unsigned long flags;
36931 + struct completion *event;
36932 + struct task_struct *tsk;
36937 + * EVMS (common services) exported functions prototypes
36939 + * since these function names are global, evms_cs_ has been prepended
36940 + * to each function name, to ensure they do not collide with any
36941 + * other global functions in the kernel.
36943 +#define EVMS_COMMON_SERVICES_MAJOR 0
36944 +#define EVMS_COMMON_SERVICES_MINOR 6
36945 +#define EVMS_COMMON_SERVICES_PATCHLEVEL 0
36947 +void evms_cs_get_version(int *, int *);
36948 +int evms_cs_check_version(struct evms_version *, struct evms_version *);
36949 +int evms_cs_register_plugin(struct evms_plugin_header *);
36950 +int evms_cs_unregister_plugin(struct evms_plugin_header *);
36951 +#ifdef EVMS_MEM_DEBUG
36952 +int evms_cs_verify_memory_integrity(int);
36954 +int evms_cs_allocate_logical_node(struct evms_logical_node **);
36955 +void evms_cs_deallocate_volume_info(struct evms_logical_node *);
36956 +void evms_cs_deallocate_logical_node(struct evms_logical_node *);
36957 +int evms_cs_add_logical_node_to_list(struct evms_logical_node **,
36958 + struct evms_logical_node *);
36959 +int evms_cs_remove_logical_node_from_list(struct evms_logical_node **,
36960 + struct evms_logical_node *);
36961 +int evms_cs_kernel_ioctl(struct evms_logical_node *, u32,
36963 +inline unsigned long evms_cs_size_in_vsectors(long long);
36964 +inline int evms_cs_log2(long long);
36965 +u32 evms_cs_calculate_crc(u32, void *, u32);
36966 +int evms_cs_register_for_end_io_notification(void *,
36967 + struct buffer_head *,
36968 + void *callback_function);
36969 +struct evms_pool_mgmt *evms_cs_create_pool(int,
36971 + void (*ctor) (void *, kmem_cache_t *,
36973 + void (*dtor) (void *, kmem_cache_t *,
36975 +#define EVMS_BLOCKABLE TRUE
36976 +void *evms_cs_allocate_from_pool(struct evms_pool_mgmt *, int);
36977 +void evms_cs_deallocate_to_pool(struct evms_pool_mgmt *, void *);
36978 +void evms_cs_destroy_pool(struct evms_pool_mgmt *);
36979 +struct evms_list_node **evms_cs_lookup_item_in_list(struct evms_list_node **,
36981 +int evms_cs_add_item_to_list(struct evms_list_node **, void *);
36982 +int evms_cs_remove_item_from_list(struct evms_list_node **, void *);
36983 +int evms_cs_register_device(struct evms_logical_node *);
36984 +int evms_cs_unregister_device(struct evms_logical_node *);
36985 +int evms_cs_find_next_device(struct evms_logical_node *,
36986 + struct evms_logical_node **);
36987 +void evms_cs_signal_event(int);
36988 +struct evms_thread *evms_cs_register_thread(void (*run) (void *),
36989 + void *data, const u8 *name);
36990 +void evms_cs_unregister_thread(struct evms_thread *thread);
36991 +void evms_cs_wakeup_thread(struct evms_thread *thread);
36992 +void evms_cs_interrupt_thread(struct evms_thread *thread);
36993 +struct proc_dir_entry *evms_cs_get_evms_proc_dir(void);
36994 +int evms_cs_volume_request_in_progress(kdev_t, int, int *);
36995 +void evms_cs_invalidate_volume(struct evms_logical_node *topmost_node);
36997 +/* EVMS exported global variables */
36998 +extern struct evms_pool_mgmt *evms_bh_pool;
36999 +extern u8 *evms_primary_string;
37000 +extern u8 *evms_secondary_string;
37002 +/* Have to include this at the end, since it depends
37003 + * on structures and definitions in this file.
37005 +#include <linux/evms/evms_ioctl.h>
37008 diff -Naur linux-2002-09-30/include/linux/evms/evms_aix.h evms-2002-09-30/include/linux/evms/evms_aix.h
37009 --- linux-2002-09-30/include/linux/evms/evms_aix.h Wed Dec 31 18:00:00 1969
37010 +++ evms-2002-09-30/include/linux/evms/evms_aix.h Mon Sep 23 15:11:41 2002
37013 +* The following structures are nested within the structures used by the
37014 +* system management routines. These structures and sizes were pulled from the AIX
37017 +#define LVM_MAXLPS 65535 /* max number of logical partitions allowed */
37018 +#define LVM_NAMESIZ 64 /* maximum size for the logical volume name */
37019 +#define LVM_NUMCOPIES 3 /* max number of copies allowed of a logical partition */
37020 +#define LVM_MAXVGS 255
37021 +#define LVM_MAXPVS 32
37022 +#define LVM_MAXLVS 256
37023 +#define AIX_MIN_BLOCK_SIZE 4096
37024 +#define VGSA_BT_PV 127
37027 +#define OFFSET_CONSTANT 144
37028 +#define SLEEP_TIME 0
37029 +#define MAXLVS_OFFSET 16
37030 +#define PHYS_VOL_OFFSET 34
37031 +#define AIX_PVHPP_LENGTH PHYS_VOL_OFFSET
37032 +#define MAX_SECTORS_NAMELIST 32
37033 +#define AIX_DEFAULT_MIRRORING 1
37034 +#define AIX_FIRST_MIRROR 2
37035 +#define AIX_MAX_MIRRORS 3 // AIX defines ALL copies as mirrors - 3 mirrors MAX - 1 orig and 2 copies
37037 +#define EVMS_AIX_FEATURE_ID 3
37039 +#define EVMS_AIX_RESYNC_MIRRORS 1
37041 +#define PSN_LVM_REC 7
37042 +#define PSN_VGSA_REC 128
37043 +#define PSN_NAMELIST_REC 2065
37044 +#define PSN_VGT_TRAILER 135
37045 +#define PSN_LVE_REC 1
37046 +#define PSN_PPH_OFFSET 17
37047 +#define PSN_PVH_INCREMENT 17
37048 +#define AIX_MIN_PVH_SIZE 271 // used to find the PV header info for Pv's other than 0
37049 +#define AIX_SECTOR_SIZE 512
37050 +#define MAX_PPENT_SECTOR 16
37051 +#define NAME_LEN 128 /* don't change!!! */
37052 +#define UUID_LEN 32 /* don't change!!! */
37053 +#define MAX_SECTORS_LV_ENTRIES 16
37054 +#define AIX_MIN_MIRROR_POOL 10
37055 +#define AIX_MIRROR_POOL_CHANGE 10
37057 +#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1)
37058 +#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1)
37059 +#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1)
37060 +#define LV_BMAP _IOWR ( 0xfe, 0x30, 1)
37062 +#define LV_ACTIVE 0x01 /* lv_status */
37063 +#define LV_SPINDOWN 0x02 /* " */
37064 +#define LV_ERROR 0x99 /* " */
37066 +#define VG_ACTIVE 0x01 /* vg_status */
37068 +#define AIX_LV_READ 0x00 /* lv_access */
37069 +#define AIX_LV_WRITE 0x01 /* " */
37070 +#define EVMS_LV_NEW 0x10 // volume was created during the current discovery pass
37071 +#define EVMS_LV_INCOMPLETE 0x20 // volume has an incomplete LE map
37072 +#define EVMS_LV_INVALID 0x40 // volume has a memory-corruption problem
37075 +#define AIX_VG_DIRTY 0x01 // group has had a new PV added during this discovery
37076 +#define AIX_VG_INCOMPLETE 0x20 // volume group is incomplete
37078 +#define AIX_LVM_LVUNDEF 0 /* the logical volume is not defined to a */
37079 +/* volume group */
37080 +#define AIX_LVM_LVDEFINED 1 /* the logical volume is defined to a */
37081 +/* volume group */
37082 +#define AIX_LVM_LVSTALE 2 /* the logical volume has stale logical */
37084 +#define AIX_LVM_LVMIRBKP 4 /* the logical volume is an online mirror backup */
37085 +/* We are skipping '3' since it is used by CMDLVM_LVSTALE */
37086 +/* as an addition of LVM_LVDEFINE + LVM_LVSTALE, and is */
37087 +/* defined in src/bos/usr/sbin/lvm/include/ls.h */
37091 +#define LOG_PREFIX "--AIXlvm: "
37093 +// Entries in the list of physical volumes (PV)
37094 +// in a volume group (VG)
37096 +struct unique_id {
37103 +struct partition_list_entry {
37104 + struct evms_logical_node * logical_node;
37106 + u32 block_size; // bytes
37107 + u32 hard_sect_size; // bytes
37108 + struct partition_list_entry * next;
37112 +// Table for mapping logical extents (LE) to physical extents (PE)
37113 +struct pe_table_entry {
37114 + struct partition_list_entry * owning_pv;
37115 + u64 pe_sector_offset;
37119 +// Logical volumes (LV) in a volume group (VG)
37120 +struct aix_logical_volume {
37122 + u64 lv_size; // Sectors
37123 + u32 lv_access; // Flags: LV_READ, LV_WRITE, LN_NEW
37124 + u32 lv_status; // Flags: LV_ACTIVE, LV_SPINDOWN
37125 +// u32 lv_minor; // Device minor number
37126 + u32 mirror_copies; // Do we have mirroring and how many ?
37127 +// u32 mirror_number; // mirror number - which copy is this ?
37128 +// u32 mirror_iterations; // Which mirror should we be writing to ?
37130 + u32 stripe_size; // Sectors
37131 + u32 stripe_size_shift; // Number of bits to shift right instead of dividing by stripe_size
37132 + u32 pe_size; // Sectors
37133 + u32 pe_size_shift; // Number of bits to shift right instead of dividing by pe_size
37134 + u32 num_le; // Number of entries in the le_to_pe_map
37135 +// u32 new_volume; // Flag to indicate if this volume needs to be exported
37136 + struct aix_volume_group * group; // Pointer back to parent volume group
37137 + unsigned char name[EVMS_VOLUME_NAME_SIZE+1]; // Dev-tree volume name (eg: /dev/group0/vol0)
37138 + struct pe_table_entry * le_to_pe_map; // Mapping of logical to physical extents
37139 + struct pe_table_entry * le_to_pe_map_mir1; // Mapping of logical to physical extents for mirror 1
37140 + struct pe_table_entry * le_to_pe_map_mir2; // Mapping of logical to physical extents for mirror 2
37141 + struct evms_logical_node * volume_node; // Pointer to the parent EVMS node representing this volume
37145 +// Volume groups (VG)
37146 +struct aix_volume_group {
37147 + struct unique_id vg_id; // volume group number */
37148 + struct partition_list_entry * partition_list; // List of partitions/segments/PVs that make up this VG
37149 + struct aix_logical_volume ** volume_list; // Array of volumes found in this VG.
37150 + struct aix_volume_group * next; // Pointer to the next VG
37151 + struct vg_header * AIXvgh; // Pointer to valid data area on disk for the VG
37152 + s32 vgda_psn; // Which VGDA we should use
37153 +// u32 numpvs; // Number of PVs found on this VG.
37154 + u32 numlvs; // Number of LVs found on this VG.
37155 + u32 hard_sect_size; // The largest hard_sect_size and block_size
37156 + u32 block_size; // values of all partitions in this group.
37158 +// u32 lv_max; // maximum logical volumes */
37159 + u32 pe_size; // physical extent size in sectors */
37160 + u32 partition_count; // actual partitions found for this group
37161 + u32 CleanVGInfo; // Do we have a clean VG Info to work with ?
37162 + u32 vgda_len; // length of the volume group descriptor area */
37165 +struct aix_resync_struct {
37166 + u64 master_offset;
37167 + u64 slave1_offset;
37168 + u64 slave2_offset;
37169 + struct partition_list_entry * master_part; //
37170 + struct partition_list_entry * slave1_part; //
37171 + struct partition_list_entry * slave2_part; //
37172 + struct aix_logical_volume * resync_vol;
37173 + struct aix_logical_volume * next_resync_vol;
37176 +struct aix_mirror_bh {
37177 + atomic_t remaining;
37178 + s32 iteration; // 'have we finished' count, used from IRQ handlers
37179 + u32 le; // In case we have to flag this pp as stale later.
37183 + struct buffer_head *master_bh;
37184 + struct buffer_head bh_req;
37185 + struct aix_mirror_bh *mirror_bh_list;
37186 + struct evms_logical_node *node; // map to evms node (READ only)
37187 + struct evms_logical_node *mir_node1; //
37188 + struct evms_logical_node *mir_node2; //
37189 + struct aix_mirror_bh *next_r1; // next for retry or in free list
37190 + char sync_flag; // Flag for resyncing of mirrored PPs
37193 +struct aix_volume_resync_ioctl {
37194 + char object_name[EVMS_VOLUME_NAME_SIZE+1]; // Input - Name of bbr object from feature header
37198 +struct timestruc {
37204 +struct aix_ipl_rec_area {
37205 + u32 IPL_record_id; /* This physical volume contains a */
37206 + /* valid IPL record if and only if */
37207 + /* this field contains IPLRECID */
37209 +#define IPLRECID 0xc9c2d4c1 /* Value is EBCDIC 'IBMA' */
37211 + char reserved1[20];
37212 + u32 formatted_cap; /* Formatted capacity. The number of */
37213 + /* sectors available after formatting*/
37214 + /* The presence or absence of bad */
37215 + /* blocks does not alter this value. */
37217 + char last_head; /* THIS IS DISKETTE INFORMATION */
37218 + /* The number of heads minus 1. Heads*/
37219 + /* are numbered from 0 to last_head. */
37221 + char last_sector; /* THIS IS DISKETTE INFORMATION */
37222 + /* The number of sectors per track. */
37223 + /* Sectors are numbered from 1 to */
37224 + /* last_sector. */
37226 + char reserved2[6];
37228 + u32 boot_code_length; /* Boot code length in sectors. A 0 */
37229 + /* value implies no boot code present*/
37231 + u32 boot_code_offset; /* Boot code offset. Must be 0 if no */
37232 + /* boot code present, else contains */
37233 + /* byte offset from start of boot */
37234 + /* code to first instruction. */
37236 + u32 boot_lv_start; /* Contains the PSN of the start of */
37239 + u32 boot_prg_start; /* Boot code start. Must be 0 if no */
37240 + /* boot code present, else contains */
37241 + /* the PSN of the start of boot code.*/
37243 + u32 boot_lv_length; /* BLV length in sectors. */
37245 + u32 boot_load_add; /* 512 byte boundary load address for*/
37248 + char boot_frag; /* Boot code fragmentation flag. Must*/
37249 + /* be 0 if no fragmentation allowed, */
37250 + /* else must be 0x01. */
37252 + char boot_emulation; /* ROS network emulation flag */
37253 + /* 0x0 => not an emul support image */
37254 + /* 0x1 => ROS network emulation code */
37255 + /* 0x2 => AIX code supporting ROS emul*/
37257 + char reserved3[2];
37259 + u16 basecn_length; /* Number of sectors for base */
37260 + /* customization. Normal mode. */
37262 + u16 basecs_length; /* Number of sectors for base */
37263 + /* customization. Service mode. */
37265 + u32 basecn_start; /* Starting PSN value for base */
37266 + /* customization. Normal mode. */
37268 + u32 basecs_start; /* Starting PSN value for base */
37269 + /* customization. Service mode. */
37271 + char reserved4[24];
37273 + u32 ser_code_length; /* Service code length in sectors. */
37274 + /* A 0 value implies no service code */
37277 + u32 ser_code_offset; /* Service code offset. Must be 0 if */
37278 + /* no service code is present, else */
37279 + /* contains byte offset from start of*/
37280 + /* service code to first instruction.*/
37282 + u32 ser_lv_start; /* Contains the PSN of the start of */
37285 + u32 ser_prg_start; /* Service code start. Must be 0 if */
37286 + /* service code is not present, else */
37287 + /* contains the PSN of the start of */
37288 + /* service code. */
37290 + u32 ser_lv_length; /* SLV length in sectors. */
37292 + u32 ser_load_add; /* 512 byte boundary load address for*/
37293 + /* service code. */
37295 + char ser_frag; /* Service code fragmentation flag. */
37296 + /* Must be 0 if no fragmentation */
37297 + /* allowed, else must be 0x01. */
37299 + char ser_emulation; /* ROS network emulation flag */
37300 + /* 0x0 => not an emul support image */
37301 + /* 0x1 => ROS network emulation code */
37302 + /* 0x2 => AIX code supporting ROS emul*/
37304 + char reserved5[2];
37306 + struct unique_id pv_id; /* The unique identifier for this */
37307 + /* physical volume. */
37308 + char dummy[512 - 128 - sizeof(struct unique_id)];
37313 +/* structure which describes the physical volume LVM record */ {
37314 + u32 lvm_id; /* LVM id field which identifies whether the PV is a member of a volume group */
37316 +#define AIX_LVM_LVMID 0x5F4C564D /* LVM id field of ASCII "_LVM" */
37318 + struct unique_id vg_id; /* the id of the volume group to which this physical volume belongs */
37319 + u32 lvmarea_len; /* the length of the LVM reserved area */
37320 + u32 vgda_len; /* length of the volume group descriptor area */
37321 + s32 vgda_psn [2]; /* the physical sector numbers of the beginning of the volume group descriptor area copies on this disk */
37322 + s32 reloc_psn; /* the physical sector number of the beginning of a pool of blocks */
37323 + /* (located at the end of the PV) which are reserved for the relocation of bad blocks */
37324 + u32 reloc_len; /* the length in number of sectors of the pool of bad block relocation blocks */
37325 + s16 pv_num; /* the physical volume number within the volume group of this physical volume */
37326 + s16 pp_size; /* the size in bytes for the partition, expressed as a power of 2 (i.e., the partition size is 2 to the power pp_size) */
37327 + u32 vgsa_len; /* length of the volume group status area */
37328 + s32 vgsa_psn [2]; /* the physical sector numbers of the beginning of the volume group status area copies on this disk */
37329 + s16 version; /* the version number of this volume group descriptor and status area */
37331 +#define LVM_VERSION_1 1 /* first version - AIX 3.0 */
37332 +#define LVM_STRIPE_ENHANCE 2 /* version with striped lv's - AIX 4.1 */
37333 +#define LVM_1024_PPSIZE 3 /* ppsizes of 512 and 1024 */
37334 +#define LVM_GT_1016 4 /* version with support for > 1016 pps/pv */
37335 +#define LVM_MAX_VERSION LVM_GT_1016 /* max version # */
37337 + char res1 [450]; /* reserved area */
37343 +/* II.Volume Group Descriptor Area */
37345 +struct vgsa_area {
37346 + struct timestruc b_tmstamp; /* Beginning timestamp */
37347 + u32 pv_missing [(LVM_MAXPVS + (NBPI -1)) / NBPI]; /* Bit per PV */
37348 + unsigned char stalepp [LVM_MAXPVS] [VGSA_BT_PV];
37350 + char resv[10]; /* Padding */
37351 + struct timestruc e_tmstamp; /* Ending timestamp */
37355 +struct vg_header {
37356 + struct timestruc vg_timestamp; /* time of last update */
37357 + struct unique_id vg_id; /* unique id for volume group */
37358 + s16 numlvs; /* number of lvs in vg */
37359 + s16 maxlvs; /* max number of lvs allowed in vg */
37360 + s16 pp_size; /* size of pps in the vg */
37361 + s16 numpvs; /* number of pvs in the vg */
37362 + s16 total_vgdas; /* number of copies of vg */
37363 + /* descriptor area on disk */
37364 + s16 vgda_size; /* size of volume group descriptor */
37372 +struct lv_entries {
37373 + s16 lvname; /* name of LV */
37374 + s16 res1; /* reserved area */
37375 + s32 maxsize; /* maximum number of partitions allowed */
37376 + char lv_state; /* state of logical volume */
37377 + char mirror; /* none,single, or double */
37378 + s16 mirror_policy; /* type of writing used to write */
37379 + s32 num_lps; /* number of logical partitions on the lv */
37381 + char permissions; /* read write or read only */
37382 + char bb_relocation; /* specifies if bad block */
37383 + /* relocation is desired */
37384 + char write_verify; /* verify all writes to the LV */
37385 + char mirwrt_consist; /* mirror write consistency flag */
37386 + u16 stripe_exp; /* stripe size in exponent value */
37387 + u16 striping_width; /* stripe width */
37389 + u16 child_minor_num;
37390 + char res4[4]; /* reserved area on disk */
37394 +struct pv_header {
37395 + struct unique_id pv_id; /* unique identifier of PV */
37396 + u16 pp_count; /* number of physical partitions */
37398 + char pv_state; /* state of physical volume */
37399 + char res1; /* reserved area on disk */
37400 + s32 psn_part1; /* physical sector number of 1st pp */
37401 + s16 pvnum_vgdas;/* number of vg descriptor areas */
37402 + /* on the physical volume */
37403 + s16 pv_num; /* PV number */
37404 + u32 res2; /* reserved area on disk */
37408 +struct pp_entries {
37409 + s16 lv_index; /* index to lv pp is on */
37410 + s16 res_1; /* reserved area on disk */
37411 + u32 lp_num; /* log. part. number */
37412 + char copy; /* the copy of the logical partition */
37413 + /* that this pp is allocated for */
37414 + char pp_state; /* current state of pp */
37415 + char fst_alt_vol; /* pv where partition allocation for*/
37416 + /* first mirror begins */
37417 + char snd_alt_vol; /* pv where partition allocation for*/
37418 + /* second mirror begins */
37419 + s16 fst_alt_part; /* partition to begin first mirror */
37420 + s16 snd_alt_part; /*partition to begin second mirror */
37421 + u64 res_3; /* reserved area on disk */
37422 + u64 res_4; /* reserved area on disk */
37426 + char name[LVM_MAXLVS][LVM_NAMESIZ];
37429 +struct vg_trailer {
37430 + struct timestruc timestamp; /* time of last update */
37432 + /* MS Nibble = concurrent capable */
37433 + /* LS Nibble = concurrent auto-varyon */
37435 + s32 res_3; /* reserved area on disk */
37436 + u64 res_4; /* reserved area on disk */
37437 + u64 res_5; /* reserved area on disk */
37440 diff -Naur linux-2002-09-30/include/linux/evms/evms_bbr_k.h evms-2002-09-30/include/linux/evms/evms_bbr_k.h
37441 --- linux-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Dec 31 18:00:00 1969
37442 +++ evms-2002-09-30/include/linux/evms/evms_bbr_k.h Wed Sep 25 15:04:22 2002
37445 + * Copyright (c) International Business Machines Corp., 2000
37447 + * This program is free software; you can redistribute it and/or modify
37448 + * it under the terms of the GNU General Public License as published by
37449 + * the Free Software Foundation; either version 2 of the License, or
37450 + * (at your option) any later version.
37452 + * This program is distributed in the hope that it will be useful,
37453 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37454 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37455 + * the GNU General Public License for more details.
37457 + * You should have received a copy of the GNU General Public License
37458 + * along with this program; if not, write to the Free Software
37459 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37461 +/* linux/include/linux/evms/evms_bbr_k.h
37463 + * Kernel header file for Bad Block Relocation (BBR) Feature
37465 + * BBR feature is designed to remap I/O write failures to another safe location
37466 + * on disk. Note that most disk drives have BBR built into them; this means
37467 + * that our software BBR will be only activated when all hardware BBR
37468 + * replacement sectors have been used.
37471 +#ifndef __EVMS_BBR_K__
37472 +#define __EVMS_BBR_K__
37474 +#define EVMS_BBR_VERSION_MAJOR 1
37475 +#define EVMS_BBR_VERSION_MINOR 1
37476 +#define EVMS_BBR_VERSION_PATCHLEVEL 1
37478 +#define EVMS_BBR_COMMON_SERVICES_MAJOR 0
37479 +#define EVMS_BBR_COMMON_SERVICES_MINOR 6
37480 +#define EVMS_BBR_COMMON_SERVICES_PATCHLEVEL 0
37482 +#define EVMS_BBR_FEATURE_ID 6
37483 +#define EVMS_BBR_SIGNATURE 0x42627246 /* BbrF */
37484 +#define EVMS_BBR_TABLE_SIGNATURE 0x42627254 /* BbrT */
37486 +#define EVMS_BBR_ENTRIES_PER_SECT 31
37487 +#define BBR_POOL_NAME_LENGTH 20
37488 +#define BBR_STOP_REMAP (1<<0)
37489 +#define BBR_BH_USE_EVMS_CALLBACK (1<<0)
37491 +/* BBR direct ioctl commands.
37493 + * BBR_GET_INFO_CMD: Return total number of sectors that are currently
37494 + * remapped for the specified BBR object.
37495 + * BBR_STOP_REMAP_CMD: Stop remapping. Do not remap any new sectors or even
37496 + * honor any existing remaps for the specified BBR object
37497 + * until the next rediscover command is received.
37498 + * BBR_SECTOR_IO_CMD: Process an I/O from the engine directly through the
37499 + * specified BBR object in the kernel.
37501 +#define BBR_GET_INFO_CMD 1
37502 +#define BBR_STOP_REMAP_CMD 2
37503 +#define BBR_SECTOR_IO_CMD 3
37506 + * struct evms_bbr_table_entry
37507 + * @bad_sect: LBA of bad location.
37508 + * @replacement_sect: LBA of new location.
37510 + * Structure to describe one BBR remap.
37512 +struct evms_bbr_table_entry {
37514 + u64 replacement_sect;
37518 + * struct evms_bbr_table
37519 + * @signature: Signature on each BBR table sector.
37520 + * @crc: CRC for this table sector.
37521 + * @sequence_number: Used to resolve conflicts when primary and secondary
37522 + * tables do not match.
37523 + * @in_use_cnt: Number of in-use table entries.
37524 + * @entries: Actual table of remaps.
37526 + * Structure to describe each sector of the metadata table. Each sector in this
37527 + * table can describe 31 remapped sectors.
37529 +struct evms_bbr_table {
37532 + u32 sequence_number;
37534 + struct evms_bbr_table_entry entries[EVMS_BBR_ENTRIES_PER_SECT];
37538 + * struct evms_bbr_metadata
37539 + * @signature: 0 EVMS_BBR_SIGNATURE
37541 + * @block_size: 8 Block size in bytes.
37542 + * @flags: 12 Global flags used by BBR.
37543 + * @sequence_number: 16
37544 + * @start_sect_bbr_table: 24 LBA of start of BBR table.
37545 + * @nr_sects_bbr_table: 32 Number of sectors in the BBR table.
37546 + * @start_replacement_sect: 40 LBA of start of replacement sectors.
37547 + * @nr_replacement_blks: 48 Number of replacement sectors.
37550 + * On-disk metadata identifying an object as a BBR object.
37552 +struct evms_bbr_metadata {
37557 + u64 sequence_number;
37558 + u64 start_sect_bbr_table;
37559 + u64 nr_sects_bbr_table;
37560 + u64 start_replacement_sect;
37561 + u64 nr_replacement_blks;
37566 + * struct evms_notify_bbr
37567 + * @object_name: Input - Name of BBR object from feature header.
37568 + * @count: Output - Number of remapped sectors.
37569 + * @start_sect: Input - Start sector for sector_io.
37570 + * @nr_sect: Input - Number of sectors for sector_io.
37571 + * @buffer: Input/Output - Pointer to data buffer for sector_io.
37572 + * @rw: Input - READ or WRITE for sector_io.
37574 +struct evms_notify_bbr {
37575 + u8 object_name[EVMS_VOLUME_NAME_SIZE+1];
37584 + * struct bbr_runtime_remap
37586 + * Node in the binary tree used to keep track of remaps.
37588 +struct bbr_runtime_remap {
37589 + struct evms_bbr_table_entry remap;
37590 + struct bbr_runtime_remap * left;
37591 + struct bbr_runtime_remap * right;
37595 + * struct bbr_private
37596 + * @next: List of all bbr_private structures.
37597 + * @node: Output node.
37598 + * @source: Consumed node.
37599 + * @bbr_table: Copy of metadata table.
37600 + * @lba_table1: LBA of primary BBR table.
37601 + * @lba_table2: LBA of secondary BBR table.
37602 + * @nr_sects_bbr_table: Size of each BBR table.
37603 + * @nr_replacement_blks: Number of replacement sectors.
37604 + * @start_replacement_sect: LBA of start of replacement sectors.
37605 + * @blksize_in_sects: Size of each sector.
37606 + * @in_use_replacement_blks: Current number of remaps.
37607 + * @remap_root: Binary tree containing all remaps.
37608 + * @bbr_id_lock: Lock for the binary tree.
37609 + * @flags: BBR_STOP_REMAP
37611 +struct bbr_private {
37612 + struct bbr_private * next;
37613 + struct evms_logical_node * node;
37614 + struct evms_logical_node * source;
37615 + struct evms_bbr_table * bbr_table;
37618 + u64 nr_sects_bbr_table;
37619 + u64 nr_replacement_blks;
37620 + u64 start_replacement_sect;
37621 + u32 blksize_in_sects;
37622 + atomic_t in_use_replacement_blks;
37623 + struct bbr_runtime_remap * remap_root;
37624 + spinlock_t bbr_id_lock;
37629 + * struct bbr_io_buffer
37630 + * @bbr_io_list: Thread's list of bbr_io_buf's.
37631 + * @bbr_id: Object for this request.
37632 + * @bh: Original buffer_head.
37633 + * @org_end_io: Saved callback address from original buffer_head.
37634 + * @org_private: Saved private data address from original buffer_head.
37635 + * @org_rsector: Saved sector value from original buffer_head.
37636 + * @org_dev: Saved b_rdev field from original buffer_head.
37637 + * @complete: Completion structure used by init_io.
37638 + * @rw: READ or WRITE.
37639 + * @rc: Return code from bbr_io_handler.
37641 + * Structure used to track each write request.
37643 +struct bbr_io_buffer {
37644 + struct list_head bbr_io_list;
37645 + struct bbr_private * bbr_id;
37646 + struct buffer_head * bh;
37647 + void (* org_end_io)(struct buffer_head *bh, int uptodate);
37648 + void * org_private;
37650 + struct completion * complete;
37656 +#ifdef EVMS_BBR_DEBUG
37657 +static void print_meta_data(struct evms_bbr_metadata * md);
37658 +static void print_bbr_table_sector(struct evms_bbr_table * bbr_table);
37659 +static void print_remap_list(struct bbr_private * bbr_id);
37660 +#define BBR_DEBUG_PRINT_META_DATA(md) print_meta_data(md)
37661 +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table) print_bbr_table_sector(table)
37662 +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id) print_remap_list(bbr_id)
37664 +#define BBR_DEBUG_PRINT_META_DATA(md)
37665 +#define BBR_DEBUG_PRINT_TABLE_SECTOR(table)
37666 +#define BBR_DEBUG_PRINT_REMAP_LIST(bbr_id)
37670 diff -Naur linux-2002-09-30/include/linux/evms/evms_drivelink.h evms-2002-09-30/include/linux/evms/evms_drivelink.h
37671 --- linux-2002-09-30/include/linux/evms/evms_drivelink.h Wed Dec 31 18:00:00 1969
37672 +++ evms-2002-09-30/include/linux/evms/evms_drivelink.h Fri Aug 16 16:43:11 2002
37674 +/* -*- linux-c -*- */
37677 + * Copyright (c) International Business Machines Corp., 2000
37679 + * This program is free software; you can redistribute it and/or modify
37680 + * it under the terms of the GNU General Public License as published by
37681 + * the Free Software Foundation; either version 2 of the License, or
37682 + * (at your option) any later version.
37684 + * This program is distributed in the hope that it will be useful,
37685 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37686 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37687 + * the GNU General Public License for more details.
37689 + * You should have received a copy of the GNU General Public License
37690 + * along with this program; if not, write to the Free Software
37691 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37694 + * linux/include/linux/evms_drvlink.h
37696 + * EVMS DriveLink Feature kernel header file
37700 +#ifndef __EVMS_DRIVELINK_INCLUDED__
37701 +#define __EVMS_DRIVELINK_INCLUDED__
37703 +#define EVMS_DRIVELINK_FEATURE_ID 1
37704 +#define EVMS_DRIVELINK_SIGNATURE 0x4C767244 //DrvL
37705 +#define EVMS_DRIVELINK_MAX_ENTRIES 60
37708 + * feature data version defines
37710 +#define DRIVELINK_METADATA_MAJOR 2
37711 +#define DRIVELINK_METADATA_MINOR 0
37712 +#define DRIVELINK_METADATA_PATCHLEVEL 0
37714 +static struct evms_version metadata_ver = {
37715 + .major = DRIVELINK_METADATA_MAJOR,
37716 + .minor = DRIVELINK_METADATA_MINOR,
37717 + .patchlevel = DRIVELINK_METADATA_PATCHLEVEL
37721 + * struct evms_dl_ordering_table_entry - ordering table entry structure definition
37722 + * @child_sn: child serial number
37723 + * @child_size: in sectors
37725 + * ordering table entry structure definition
37727 +struct evms_dl_ordering_table_entry {
37728 + u64 child_serial_number;
37733 + * struct evms_drivelink_metadata - on-disk metadata definition
37734 + * @signature: drivelink metadata magic number
37735 + * @crc: crc of entire structure
37736 + * @version: drivelink metadata version
37738 + * @sequence_number: used to determine most recent redundant data
37739 + * @child_sn: child object serial number
37740 + * @parent_sn: parent object serial number
37741 + * @child_count: count of child objects of parent
37742 + * @pad: used for alignment of following table
37743 + * @ordering_table: table of child ordering entries
37745 + * drivelink on-disk metadata definition
37747 +struct evms_drivelink_metadata {
37750 + struct evms_version version;
37752 + u64 sequence_number;
37753 + u64 child_serial_number;
37754 + u64 parent_serial_number;
37757 + struct evms_dl_ordering_table_entry
37758 + ordering_table[EVMS_DRIVELINK_MAX_ENTRIES];
37763 + * struct runtime_entry - in-memory metadata entry description
37764 + * @block_size: largest block size of all children
37765 + * @voffset: relative offset of child object within parent object (in 512 byte units)
37766 + * @vsize: child object size (in 512 byte units)
37767 + * @child_node: child storage object
37768 + * @child_metadata: child's on-disk metadata
37770 + * drivelink's in-memory metadata entry description
37772 +struct runtime_entry {
37776 + struct evms_logical_node *child_node;
37777 + struct evms_drivelink_metadata *child_metadata;
37781 + * struct runtime_data - in-memory metadata description
37782 + * @block_size: largest block size of all children
37783 + * @voffset: relative offset of child object within parent object (in 512 byte units)
37784 + * @vsize: child object size (in 512 byte units)
37785 + * @child_node: child storage object
37786 + * @child_metadata: child's on-disk metadata
37788 + * drivelink's in-memory metadata description
37790 +struct runtime_data {
37794 + struct runtime_entry *child_table;
37799 diff -Naur linux-2002-09-30/include/linux/evms/evms_ecr.h evms-2002-09-30/include/linux/evms/evms_ecr.h
37800 --- linux-2002-09-30/include/linux/evms/evms_ecr.h Wed Dec 31 18:00:00 1969
37801 +++ evms-2002-09-30/include/linux/evms/evms_ecr.h Fri Aug 16 16:19:56 2002
37805 + * Copyright (c) International Business Machines Corp., 2000
37807 + * This program is free software; you can redistribute it and/or modify
37808 + * it under the terms of the GNU General Public License as published by
37809 + * the Free Software Foundation; either version 2 of the License, or
37810 + * (at your option) any later version.
37812 + * This program is distributed in the hope that it will be useful,
37813 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37814 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37815 + * the GNU General Public License for more details.
37817 + * You should have received a copy of the GNU General Public License
37818 + * along with this program; if not, write to the Free Software
37819 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37823 + * linux/include/linux/evms_ecr.h
37825 + * EVMS Cluster enablement kernel header file
37829 +#ifndef __EVMS_ECR__
37831 +#define __EVMS_ECR__
37833 +#define ECR_SUCCESS 0
37834 +#define ECR_FAIL -1
37837 + * Beginning of group messaging API
37839 +typedef int ecr_group_t;
37840 +typedef int ecr_nodeid_t;
37841 +typedef void ecr_cred_t;
37842 +typedef void ecr_instance_t;
37843 +typedef void ecr_message_t;
37845 +typedef enum ecr_type_s {
37846 + ECR_GROUP_START, /* 0th entry is reserved */
37847 + ECR_P2P, /* Point to Point message type */
37848 + ECR_BROADCAST, /* Broadcast message type */
37849 + ECR_ATOMIC_EXECUTE, /* Atomic execute type */
37850 + ECR_GROUP_LAST /* Just a last enum type, not a message type */
37853 +typedef struct ecr_table_s {
37854 + void (*join) (ecr_nodeid_t, uint, ecr_nodeid_t *, ecr_instance_t *);
37855 + int (*can_join)(ecr_nodeid_t, ecr_cred_t *, size_t, ecr_instance_t *);
37856 + void (*leave) (ecr_nodeid_t, ecr_instance_t *);
37857 + void (*recover)(ecr_nodeid_t, ecr_instance_t *);
37858 + void (*message)(ecr_message_t *, ecr_type_t, ecr_nodeid_t,
37859 + void *, size_t, ecr_instance_t *);
37860 + void (*vol_leave)(ecr_nodeid_t, ecr_instance_t *);
37864 +#define ECR_GROUPNAME_MAX_SIZE NAME_SIZE /* maximum size of a group name */
37866 +ecr_group_t ecr_group_join(char *, ecr_table_t *, ecr_cred_t *, size_t,
37867 + ecr_instance_t *);
37868 +void ecr_group_leave(ecr_group_t);
37869 +int ecr_group_send(ecr_group_t, ecr_nodeid_t, void *, size_t,
37870 + ecr_instance_t *,
37871 + void callback(int, ecr_instance_t *));
37872 +int ecr_group_send_wait(ecr_group_t, ecr_nodeid_t, void *, size_t,
37874 +int ecr_group_broadcast(ecr_group_t, void *, size_t, ecr_instance_t *,
37875 + void callback(u_char, ecr_instance_t *));
37876 +int ecr_group_broadcast_wait(ecr_group_t, void *, size_t, u_char *);
37877 +int ecr_group_atomic_execute(ecr_group_t, void *, size_t,
37878 + ecr_instance_t *,
37879 + void callback(ecr_instance_t *));
37880 +int ecr_group_atomic_execute_wait(ecr_group_t, void *, size_t);
37881 +void ecr_group_success_response(ecr_message_t *);
37882 +void ecr_group_failure_response(ecr_message_t *, int);
37887 + * Beginning of distributed lock API
37890 +typedef int ecr_lock_t;
37891 +typedef enum ecr_lock_mode_s {
37892 + ECR_LOCK_START, /* 0th entry is reserved */
37893 + ECR_LOCK_CONCURRENT, /* concurrent access */
37894 + ECR_LOCK_EXCLUSIVE, /* exclusive access */
37895 + ECR_LOCK_LAST /* Just a last enum type, not a lock type */
37896 +} ecr_lock_mode_t;
37898 +typedef u_char ecr_mode_t;
37901 +#define ECR_LOCKNAME_MAX_SIZE NAME_SIZE /* maximum size of a lock name */
37902 +#define ECR_BLOCK 1 /* waitflag set */
37904 +ecr_lock_t ecr_lock_create(char * /* lock name */);
37905 +int ecr_lock(ecr_lock_t, u64, u64, ecr_lock_mode_t,
37906 + u_char /*waitflag*/);
37907 +int ecr_unlock(ecr_lock_t, u64, u64);
37909 +#endif /* __EVMS_ECR__ */
37910 diff -Naur linux-2002-09-30/include/linux/evms/evms_ioctl.h evms-2002-09-30/include/linux/evms/evms_ioctl.h
37911 --- linux-2002-09-30/include/linux/evms/evms_ioctl.h Wed Dec 31 18:00:00 1969
37912 +++ evms-2002-09-30/include/linux/evms/evms_ioctl.h Thu Sep 26 11:55:45 2002
37914 +/* -*- linux-c -*- */
37917 + * Copyright (c) International Business Machines Corp., 2000
37919 + * This program is free software; you can redistribute it and/or modify
37920 + * it under the terms of the GNU General Public License as published by
37921 + * the Free Software Foundation; either version 2 of the License, or
37922 + * (at your option) any later version.
37924 + * This program is distributed in the hope that it will be useful,
37925 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
37926 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
37927 + * the GNU General Public License for more details.
37929 + * You should have received a copy of the GNU General Public License
37930 + * along with this program; if not, write to the Free Software
37931 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
37934 + * linux/include/linux/evms.h
37936 + * EVMS public kernel header file
37940 +#ifndef __EVMS_IOCTL_INCLUDED__
37941 +#define __EVMS_IOCTL_INCLUDED__
37943 +#include <linux/hdreg.h>
37945 +/* IOCTL interface version definitions */
37946 +#define EVMS_IOCTL_INTERFACE_MAJOR 11
37947 +#define EVMS_IOCTL_INTERFACE_MINOR 3
37948 +#define EVMS_IOCTL_INTERFACE_PATCHLEVEL 0
37950 +/* IOCTL definitions */
37951 +enum evms_ioctl_cmds {
37952 + /* version commands */
37953 + EVMS_GET_IOCTL_VERSION_NUMBER = 0,
37954 + EVMS_GET_VERSION_NUMBER,
37956 + /* EVMS internal commands */
37957 + EVMS_GET_DISK_LIST_NUMBER = 0x40,
37958 + EVMS_CHECK_MEDIA_CHANGE_NUMBER,
37959 + EVMS_REVALIDATE_DISK_NUMBER,
37960 + EVMS_OPEN_VOLUME_NUMBER,
37961 + EVMS_CLOSE_VOLUME_NUMBER,
37962 + EVMS_QUIESCE_VOLUME_NUMBER,
37963 + EVMS_CHECK_DEVICE_STATUS_NUMBER,
37964 + EVMS_UPDATE_DEVICE_INFO_NUMBER,
37966 + /* configuration commands */
37967 + EVMS_GET_INFO_LEVEL_NUMBER = 0x80,
37968 + EVMS_SET_INFO_LEVEL_NUMBER,
37969 + EVMS_REDISCOVER_VOLUMES_NUMBER,
37970 + EVMS_DELETE_VOLUME_NUMBER,
37971 + EVMS_PLUGIN_IOCTL_NUMBER,
37972 + EVMS_PROCESS_NOTIFY_EVENT_NUMBER,
37973 + /* query info commands */
37974 + EVMS_GET_LOGICAL_DISK_NUMBER = 0xC0,
37975 + EVMS_GET_LOGICAL_DISK_INFO_NUMBER,
37976 + EVMS_SECTOR_IO_NUMBER,
37977 + EVMS_GET_MINOR_NUMBER,
37978 + EVMS_GET_VOLUME_DATA_NUMBER,
37979 + EVMS_GET_PLUGIN_NUMBER,
37980 + EVMS_COMPUTE_CSUM_NUMBER,
37981 + EVMS_GET_BMAP_NUMBER,
37982 + EVMS_CHECK_MOUNT_STATUS_NUMBER,
37983 + EVMS_CHECK_OPEN_STATUS_NUMBER,
37984 + /* commands for non-EVMS apps */
37985 + EVMS_GET_VOL_STRIPE_INFO_NUMBER = 0xF0,
37988 +/* version commands */
37989 +#define EVMS_GET_IOCTL_VERSION_STRING "EVMS_GET_IOCTL_VERSION"
37990 +#define EVMS_GET_IOCTL_VERSION _IOR(EVMS_MAJOR, EVMS_GET_IOCTL_VERSION_NUMBER, struct evms_version)
37992 +#define EVMS_GET_VERSION_STRING "EVMS_GET_VERSION"
37993 +#define EVMS_GET_VERSION _IOR(EVMS_MAJOR, EVMS_GET_VERSION_NUMBER, struct evms_version)
37997 +/* EVMS internal commands */
37998 +#define EVMS_GET_DISK_LIST_STRING "EVMS_GET_DISK_LIST"
37999 +#define EVMS_GET_DISK_LIST _IOWR(EVMS_MAJOR, EVMS_GET_DISK_LIST_NUMBER, struct evms_list_node **)
38001 +#define EVMS_CHECK_MEDIA_CHANGE_STRING "EVMS_CHECK_MEDIA_CHANGE"
38002 +#define EVMS_CHECK_MEDIA_CHANGE _IO(EVMS_MAJOR, EVMS_CHECK_MEDIA_CHANGE_NUMBER)
38004 +#define EVMS_REVALIDATE_DISK_STRING "EVMS_REVALIDATE_DISK"
38005 +#define EVMS_REVALIDATE_DISK _IO(EVMS_MAJOR, EVMS_REVALIDATE_DISK_NUMBER)
38007 +#define EVMS_OPEN_VOLUME_STRING "EVMS_OPEN_VOLUME"
38008 +#define EVMS_OPEN_VOLUME _IO(EVMS_MAJOR, EVMS_OPEN_VOLUME_NUMBER)
38010 +#define EVMS_CLOSE_VOLUME_STRING "EVMS_CLOSE_VOLUME"
38011 +#define EVMS_CLOSE_VOLUME _IO(EVMS_MAJOR, EVMS_CLOSE_VOLUME_NUMBER)
38014 + * struct evms_quiesce_vol_pkt - ioctl packet definition
38015 + * @command: 0 = unquiesce, 1 = quiesce
38016 + * @minor: minor device number of target volume
38017 + * @do_vfs: 0 = do nothing, 1 = also perform equivalent VFS operation
38018 + * @status: returned operation status
38020 + * ioctl packet definition for EVMS_QUIESCE_VOLUME
38022 +struct evms_quiesce_vol_pkt {
38029 + * defines for evms_quiesce_vol_pkt.command field
38031 +#define EVMS_UNQUIESCE 0
38032 +#define EVMS_QUIESCE 1
38034 + * defines for evms_quiesce_vol_pkt.do_vfs field
38035 + * located below struct evms_delete_vol_pkt definition
38038 +#define EVMS_QUIESCE_VOLUME_STRING "EVMS_QUIESCE_VOLUME"
38039 +#define EVMS_QUIESCE_VOLUME _IOR(EVMS_MAJOR, EVMS_QUIESCE_VOLUME_NUMBER, struct evms_quiesce_vol_pkt)
38041 +#define EVMS_CHECK_DEVICE_STATUS_STRING "EVMS_CHECK_DEVICE_STATUS"
38042 +#define EVMS_CHECK_DEVICE_STATUS _IOR(EVMS_MAJOR, EVMS_CHECK_DEVICE_STATUS_NUMBER, int)
38044 +#define EVMS_UPDATE_DEVICE_INFO_STRING "EVMS_UPDATE_DEVICE_INFO"
38045 +#define EVMS_UPDATE_DEVICE_INFO _IO(EVMS_MAJOR, EVMS_UPDATE_DEVICE_INFO_NUMBER)
38049 +/* configuration commands */
38050 +#define EVMS_GET_INFO_LEVEL_STRING "EVMS_GET_INFO_LEVEL"
38051 +#define EVMS_GET_INFO_LEVEL _IOR(EVMS_MAJOR, EVMS_GET_INFO_LEVEL_NUMBER, int)
38053 +#define EVMS_SET_INFO_LEVEL_STRING "EVMS_SET_INFO_LEVEL"
38054 +#define EVMS_SET_INFO_LEVEL _IOW(EVMS_MAJOR, EVMS_SET_INFO_LEVEL_NUMBER, int)
38057 + * struct evms_rediscover_pkt - rediscover volume ioctl packet definition
38058 + * @status: return operation status
38059 + * @drive_count: count of drives being probed, 0xffffffff for all disks
38060 + * @drive_array: array of drive handles to be probed
38062 + * ioctl packet definition for EVMS_REDISCOVER_VOLUMES ioctl
38064 +struct evms_rediscover_pkt {
38067 + u64 *drive_array;
38070 + * defines for evms_delete_vol_pkt.command field
38072 +#define EVMS_SOFT_DELETE 0
38073 +#define EVMS_HARD_DELETE 1
38075 + * defines evms_rediscover_pkt.drive_count field
38077 +#define REDISCOVER_ALL_DEVICES 0xFFFFFFFF
38079 +#define EVMS_REDISCOVER_VOLUMES_STRING "EVMS_REDISCOVER_VOLUMES"
38080 +#define EVMS_REDISCOVER_VOLUMES _IOWR(EVMS_MAJOR, EVMS_REDISCOVER_VOLUMES_NUMBER, struct evms_rediscover_pkt)
38082 +/* field: command: defines */
38085 + * struct evms_delete_vol_pkt - delete volume ioctl packet definition
38086 + * @command: 0 = soft delete, 1 = hard delete
38087 + * @minor: minor device num of target volume
38088 + * @do_vfs: 0 = do nothing, 1 = perform VFS operation(s)
38089 + * @associative_minor: optional minor device num of associative volume, 0 when unused
38090 + * @status: returned operation status
38092 + * ioctl packet definition for EVMS_DELETE_VOLUME ioctl
38094 +struct evms_delete_vol_pkt {
38098 + s32 associative_minor;
38102 + * field evms_delete_vol_pkt defines
38103 + * @EVMS_VFS_DO_NOTHING:
38106 + * NOTE: these defines are also used with evms_quiesce_vol_pkt.
38108 +#define EVMS_VFS_DO_NOTHING 0
38109 +#define EVMS_VFS_DO 1
38111 +#define EVMS_DELETE_VOLUME_STRING "EVMS_DELETE_VOLUME"
38112 +#define EVMS_DELETE_VOLUME _IOR(EVMS_MAJOR, EVMS_DELETE_VOLUME_NUMBER, struct evms_delete_vol_pkt)
38115 + * struct evms_plugin_ioctl_pkt - generic plugin ioctl packet definition
38116 + * @feature_id: plugin ID of feature to receive this ioctl
38117 + * @feature_command: feature specific ioctl command
38118 + * @status: 0 = completed, 0 != error
38119 + * @feature_ioctl_data: ptr to feature specific ioctl struct
38121 + * ioctl packet definition for EVMS_PLUGIN_IOCTL ioctl
38123 +struct evms_plugin_ioctl_pkt {
38124 + ulong feature_id;
38125 + s32 feature_command;
38127 + void *feature_ioctl_data;
38130 +#define EVMS_PLUGIN_IOCTL_STRING "EVMS_PLUGIN_IOCTL"
38131 +#define EVMS_PLUGIN_IOCTL _IOR(EVMS_MAJOR, EVMS_PLUGIN_IOCTL_NUMBER, struct evms_plugin_ioctl_pkt)
38134 + * struct evms_event - evms event structure
38135 + * @pid: PID to act on
38136 + * @eventid: event id to respond to
38137 + * @signo: signal # to send when event occurs
38139 + * contains process event notification info
38141 +struct evms_event {
38147 + * field evms_event_pkt.eventid defines
38149 +#define EVMS_EVENT_END_OF_DISCOVERY 0
38152 + * struct evms_notify_pkt - evms event notification ioctl packet definition
38153 + * @command: 0 = unregister, 1 = register
38154 + * @eventry: event structure
38155 + * @status: returned operation status
38157 + * ioctl packet definition for EVMS_PROCESS_NOTIFY_EVENT ioctl
38159 +struct evms_notify_pkt {
38161 + struct evms_event eventry;
38165 + * field evms_notify_pkt.command defines
38167 +#define EVMS_EVENT_UNREGISTER 0
38168 +#define EVMS_EVENT_REGISTER 1
38170 +#define EVMS_PROCESS_NOTIFY_EVENT_STRING "EVMS_PROCESS_NOTIFY_EVENT"
38171 +#define EVMS_PROCESS_NOTIFY_EVENT _IOWR(EVMS_MAJOR, EVMS_PROCESS_NOTIFY_EVENT_NUMBER, struct evms_notify_pkt)
38173 +/* query info commands */
38176 + * struct evms_user_disk_pkt - get disk handle ioctl packet definition
38177 + * @command: 0 = first disk, 1 = next disk
38178 + * @status: 0 = no more disks, 1 = valid disk info
38179 + * @disk_handle: only valid when status == 1
38181 + * ioctl packet definition for EVMS_GET_LOGICAL_DISK ioctl
38183 +struct evms_user_disk_pkt {
38189 + * field evms_user_disk_pkt.command defines
38191 +#define EVMS_FIRST_DISK 0
38192 +#define EVMS_NEXT_DISK 1
38194 + * field evms_user_disk_pkt.status defines
38196 +#define EVMS_DISK_INVALID 0
38197 +#define EVMS_DISK_VALID 1
38199 +#define EVMS_GET_LOGICAL_DISK_STRING "EVMS_GET_LOGICAL_DISK"
38200 +#define EVMS_GET_LOGICAL_DISK _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_NUMBER, struct evms_user_disk_pkt)
38203 + * evms_user_disk_info_pkt - disk info packet definition
38204 + * @status: return operation status
38205 + * @flags: device characteristics
38206 + * @disk_handle: kernel handle to specified device
38207 + * @disk_dev: kernel device info, used by MD plugin
38208 + * @geometry: reported device geometry
38209 + * @block_size: reported block size
38210 + * @hardsect_size: reported physical sector size
38211 + * @total_sectors: size of device in 512 byte units
38212 + * @disk_name: legacy name for the device
38214 + * ioctl packet definition for EVMS_GET_LOGICAL_DISK_INFO ioctl
38216 +struct evms_user_disk_info_pkt {
38223 + u64 geo_cylinders;
38225 + u32 hardsect_size;
38226 + u64 total_sectors;
38227 + u8 disk_name[EVMS_VOLUME_NAME_SIZE + 1];
38230 + * field evms_user_disk_info_pkt.flags define in evms.h
38233 +#define EVMS_GET_LOGICAL_DISK_INFO_STRING "EVMS_GET_LOGICAL_DISK_INFO"
38234 +#define EVMS_GET_LOGICAL_DISK_INFO _IOWR(EVMS_MAJOR, EVMS_GET_LOGICAL_DISK_INFO_NUMBER, struct evms_user_disk_info_pkt)
38237 + * struct evms_sector_io_pkt - sector io ioctl packet definition
38238 + * @disk_handle: disk handle of target device
38239 + * @io_flag: 0 = read, 1 = write
38240 + * @starting_sector: disk relative starting sector
38241 + * @sector_count: count of sectors
38242 + * @buffer_address: user buffer address
38243 + * @status: return operation status
38245 + * ioctl packet definition for EVMS_SECTOR_IO ioctl
38247 +struct evms_sector_io_pkt {
38250 + u64 starting_sector;
38251 + u64 sector_count;
38252 + u8 *buffer_address;
38256 + * field evms_sector_io_pkt.io_flag defines
38258 +#define EVMS_SECTOR_IO_READ 0
38259 +#define EVMS_SECTOR_IO_WRITE 1
38261 +#define EVMS_SECTOR_IO_STRING "EVMS_SECTOR_IO"
38262 +#define EVMS_SECTOR_IO _IOWR(EVMS_MAJOR, EVMS_SECTOR_IO_NUMBER, struct evms_sector_io_pkt)
38265 + * struct evms_user_minor_pkt - get a list of device minors, one at a time
38266 + * @command: 0 = first volume, 1 = next volume
38267 + * @status: returned operation status
38268 + * @minor: returned minor number, only valid when status == 1
38270 + * ioctl packet definition for EVMS_GET_MINOR ioctl
38272 +struct evms_user_minor_pkt {
38278 + * field evms_user_minor_pkt.command defines
38280 +#define EVMS_FIRST_VOLUME 0
38281 +#define EVMS_NEXT_VOLUME 1
38283 + * field evms_user_minor_pkt.status defines
38285 +#define EVMS_VOLUME_INVALID 0
38286 +#define EVMS_VOLUME_VALID 1
38288 +#define EVMS_GET_MINOR_STRING "EVMS_GET_MINOR"
38289 +#define EVMS_GET_MINOR _IOWR(EVMS_MAJOR, EVMS_GET_MINOR_NUMBER, struct evms_user_minor_pkt)
38292 + * struct evms_volume_data_pkt - volume data packet definition
38293 + * @minor: minor device number of target volume
38294 + * @flags: returned volume characteristics
38295 + * @volume_name: returned volume name
38296 + * @status: returned operation status
38298 + * ioctl packet definition for EVMS_GET_VOLUME_DATA ioctl
38300 +struct evms_volume_data_pkt {
38303 + u8 volume_name[EVMS_VOLUME_NAME_SIZE + 1];
38307 + * field evms_volume_data_pkt.flags defines found in evms_common.h
38310 +#define EVMS_GET_VOLUME_DATA_STRING "EVMS_GET_VOLUME_DATA"
38311 +#define EVMS_GET_VOLUME_DATA _IOWR(EVMS_MAJOR, EVMS_GET_VOLUME_DATA_NUMBER, struct evms_volume_data_pkt)
38314 + * struct evms_kernel_plugin_pkt - get kernel plugin ioctl packet definition
38315 + * @command: 0 = first plugin, 1 = next plugin
38316 + * @id: returned plugin id
38317 + * @version: returned plugin version info
38318 + * @status: returned operation status
38320 + * ioctl packet definition for EVMS_GET_PLUGIN ioctl
38322 +struct evms_kernel_plugin_pkt {
38325 + struct evms_version version;
38329 + * field evms_kernel_plugin_pkt.command defines
38331 +#define EVMS_FIRST_PLUGIN 0
38332 +#define EVMS_NEXT_PLUGIN 1
38334 + * field evms_kernel_plugin_pkt.status defines
38336 +#define EVMS_PLUGIN_INVALID 0
38337 +#define EVMS_PLUGIN_VALID 1
38339 +#define EVMS_GET_PLUGIN_STRING "EVMS_GET_PLUGIN"
38340 +#define EVMS_GET_PLUGIN _IOWR(EVMS_MAJOR, EVMS_GET_PLUGIN_NUMBER, struct evms_kernel_plugin_pkt)
38343 + * struct evms_compute_csum_pkt - compute checksum ioctl packet definition
38344 + * @buffer_address:
38350 + * ioctl packet definition for EVMS_COMPUTE_CSUM ioctl
38352 +struct evms_compute_csum_pkt {
38353 + u8 *buffer_address;
38360 +#define EVMS_COMPUTE_CSUM_STRING "EVMS_COMPUTE_CSUM"
38361 +#define EVMS_COMPUTE_CSUM _IOWR(EVMS_MAJOR, EVMS_COMPUTE_CSUM_NUMBER, struct evms_compute_csum_pkt)
38364 + * struct evms_get_bmap_pkt - get bmap data ioctl packet definition
38365 + * @rsector: input, volume relative rsector value
38366 + * output, disk relative rsector value
38367 + * @dev: output, physical device
38368 + * @status: output, operation status
38370 + * ioctl packet definition for EVMS_GET_BMAP ioctl
38372 +struct evms_get_bmap_pkt {
38378 +#define EVMS_GET_BMAP_STRING "EVMS_GET_BMAP"
38379 +#define EVMS_GET_BMAP _IOWR(EVMS_MAJOR, EVMS_GET_BMAP_NUMBER, struct evms_get_bmap_pkt)
38382 + * struct evms_mount_status_pkt - ioctl packet definition
38383 + * @minor: input, minor of volume to check
38384 + * @mounted: output, TRUE if mounted, FALSE if not
38385 + * @status: output, operation completion status
38387 + * ioctl packet definition for EVMS_CHECK_MOUNT_STATUS ioctl.
38389 +struct evms_mount_status_pkt {
38395 +#define EVMS_CHECK_MOUNT_STATUS_STRING "EVMS_CHECK_MOUNT_STATUS"
38396 +#define EVMS_CHECK_MOUNT_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_MOUNT_STATUS_NUMBER, struct evms_mount_status_pkt)
38399 + * struct evms_open_status_pkt - ioctl packet definition
38400 + * @minor: input, minor of volume to check
38401 + * @opens: output, 0 (FALSE) if not, count (TRUE) of opens
38402 + * @status: output, operation completion status
38404 + * ioctl packet definition for EVMS_CHECK_OPEN_STATUS ioctl.
38406 +struct evms_open_status_pkt {
38412 +#define EVMS_CHECK_OPEN_STATUS_STRING "EVMS_CHECK_OPEN_STATUS"
38413 +#define EVMS_CHECK_OPEN_STATUS _IOWR(EVMS_MAJOR, EVMS_CHECK_OPEN_STATUS_NUMBER, struct evms_open_status_pkt)
38416 + * struct evms_vol_stripe_info_pkt - ioctl packet definition
38417 + * @size: the stripe unit specified in 512 byte block units
38418 + * @width: the number of stripe members or RAID data disks
38420 + * ioctl packet definition for EVMS_GET_VOL_STRIPE_INFO ioctl.
38422 +struct evms_vol_stripe_info_pkt {
38427 +#define EVMS_GET_VOL_STRIPE_INFO_STRING "EVMS_GET_VOL_STRIPE_INFO"
38428 +#define EVMS_GET_VOL_STRIPE_INFO _IOR(EVMS_MAJOR, EVMS_GET_VOL_STRIPE_INFO_NUMBER, struct evms_vol_stripe_info_pkt)
38430 diff -Naur linux-2002-09-30/include/linux/evms/evms_linear.h evms-2002-09-30/include/linux/evms/evms_linear.h
38431 --- linux-2002-09-30/include/linux/evms/evms_linear.h Wed Dec 31 18:00:00 1969
38432 +++ evms-2002-09-30/include/linux/evms/evms_linear.h Tue Aug 6 01:03:24 2002
38434 +#ifndef __EVMS_LINEAR_H
38435 +#define __EVMS_LINEAR_H
38437 +#include <linux/evms/evms_md.h>
38440 + struct evms_logical_node *node;
38442 + unsigned long size;
38443 + unsigned long offset;
38446 +typedef struct dev_info dev_info_t;
38448 +struct linear_hash
38450 + dev_info_t *dev0, *dev1;
38453 +struct linear_private_data
38455 + struct linear_hash *hash_table;
38456 + dev_info_t disks[MD_SB_DISKS];
38457 + dev_info_t *smallest;
38462 +typedef struct linear_private_data linear_conf_t;
38464 +#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
38467 diff -Naur linux-2002-09-30/include/linux/evms/evms_lvm.h evms-2002-09-30/include/linux/evms/evms_lvm.h
38468 --- linux-2002-09-30/include/linux/evms/evms_lvm.h Wed Dec 31 18:00:00 1969
38469 +++ evms-2002-09-30/include/linux/evms/evms_lvm.h Mon Aug 26 10:01:08 2002
38471 +/* -*- linux-c -*- */
38473 + * Copyright (c) International Business Machines Corp., 2000
38475 + * This program is free software; you can redistribute it and/or modify
38476 + * it under the terms of the GNU General Public License as published by
38477 + * the Free Software Foundation; either version 2 of the License, or
38478 + * (at your option) any later version.
38480 + * This program is distributed in the hope that it will be useful,
38481 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
38482 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
38483 + * the GNU General Public License for more details.
38485 + * You should have received a copy of the GNU General Public License
38486 + * along with this program; if not, write to the Free Software
38487 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38490 + * linux/include/linux/evms_lvm.h
38492 + * EVMS LVM VGE kernel header file
38495 +#ifndef __EVMS_LVM_H__
38496 +#define __EVMS_LVM_H__
38498 +#define EVMS_LVM_VERSION_MAJOR 1
38499 +#define EVMS_LVM_VERSION_MINOR 1
38500 +#define EVMS_LVM_VERSION_PATCH 1
38502 +/* The following definitions and data structures are copied from lvm.h and
38503 + * liblvm.h from the LVM 0.9.1beta8 distribution. Since the metadata format
38504 + * changed in beta8, lvm.h changed significantly enough that this module would
38505 + * no longer compile. Instead of requiring evms users to install the latest lvm
38506 + * release, the required definitions and data structures will now be included
38507 + * in this header file.
38511 +#define MAX_LV 256
38512 +#define MAX_PV 256
38513 +#define NAME_LEN 128
38514 +#define UUID_LEN 32
38515 +#define LVM_VGDA_ALIGN 4096UL
38516 +#define LVM_PV_DISK_BASE 0L
38517 +#define LVM_PV_DISK_SIZE 1024L
38518 +#define LVM_VG_DISK_BASE round_up(LVM_PV_DISK_BASE + LVM_PV_DISK_SIZE, \
38520 +#define LVM_VG_DISK_SIZE (8*512L)
38525 +/* lv->lv_status */
38526 +#define LV_ACTIVE 0x01
38527 +/* lv->lv_access */
38528 +#define LV_READ 0x01
38529 +#define LV_WRITE 0x02
38530 +#define LV_SNAPSHOT 0x04
38531 +#define LV_SNAPSHOT_ORG 0x08
38534 + * struct lv_COW_table_disk_v1
38535 + * @pv_org_number:
38536 + * @pv_org_rsector:
38537 + * @pv_snap_number:
38538 + * @pv_snap_rsector:
38540 + * Copy-On-Write tables in disk format (version 1).
38542 +struct lv_COW_table_disk {
38543 + u64 pv_org_number;
38544 + u64 pv_org_rsector;
38545 + u64 pv_snap_number;
38546 + u64 pv_snap_rsector;
38554 + * Disk stored PE map entry definition.
38562 + * struct lvm_disk_data
38566 + * Disk stored PV, VG, LV and PE size and offset information.
38568 +struct lvm_disk_data {
38579 + * @pv_uuidlist_on_disk:
38584 + * @system_id: used by vgexport/vgimport
38588 + * @pv_allocatable:
38594 + * @pe_start: in sectors (new in version 2)
38596 + * Physical volume on disk metadata definition (version 2).
38601 + struct lvm_disk_data pv_on_disk;
38602 + struct lvm_disk_data vg_on_disk;
38603 + struct lvm_disk_data pv_uuidlist_on_disk;
38604 + struct lvm_disk_data lv_on_disk;
38605 + struct lvm_disk_data pe_on_disk;
38606 + u8 pv_uuid[NAME_LEN];
38607 + u8 vg_name[NAME_LEN];
38608 + u8 system_id[NAME_LEN];
38612 + u32 pv_allocatable;
38617 + u32 pe_allocated;
38630 + * @lv_mirror_copies:
38634 + * @lv_snapshot_minor: minor number of original
38635 + * @lv_chunk_size: chunk size for snapshots
38637 + * @lv_allocated_le:
38639 + * @lv_stripesize:
38642 + * @lv_io_timeout:
38643 + * @lv_read_ahead:
38645 + * Logical volume metadata definition (version 3).
38648 + u8 lv_name[NAME_LEN];
38649 + u8 vg_name[NAME_LEN];
38655 + u32 lv_mirror_copies;
38659 + u32 lv_snapshot_minor;
38660 + u16 lv_chunk_size;
38662 + u32 lv_allocated_le;
38664 + u32 lv_stripesize;
38666 + u32 lv_allocation;
38667 + u32 lv_io_timeout;
38668 + u32 lv_read_ahead;
38673 + * @vg_uuid: Volume group UUID
38674 + * @vg_name_dummy: Remainder of version 1 VG name
38675 + * @vg_number: Volume group number
38676 + * @vg_access: Read/Write
38677 + * @vg_status: Active or not
38678 + * @lv_max: Maximum logical volumes
38679 + * @lv_cur: Current logical volumes
38680 + * @lv_open: Open logical volumes
38681 + * @pv_max: Maximum physical volumes
38682 + * @pv_cur: Current physical volumes
38683 + * @pv_act: Active physical volumes
38685 + * @vgda: Volume group descriptor arrays
38686 + * @pe_size: Physical extent size in sectors
38687 + * @pe_total: Total of physical extents
38688 + * @pe_allocated: Allocated physical extents
38689 + * @pvg_total: Physical volume groups
38691 + * Volume group metadata definition (version 2).
38694 + u8 vg_uuid[UUID_LEN];
38695 + u8 vg_name_dummy[NAME_LEN - UUID_LEN];
38709 + u32 pe_allocated;
38713 +/* Useful inlines */
38714 +static inline ulong round_up(ulong n, ulong size)
38717 + return (n + size) & ~size;
38720 +static inline ulong div_up(ulong n, ulong size)
38722 + return round_up(n, size) / size;
38725 +/* End of lvm.h imported data structures. */
38727 +#define DEV_DIRECTORY "/dev/"
38728 +#define LVM_DEV_DIRECTORY "lvm/"
38729 +#define LVM_PROC_NAME "lvm"
38730 +#define LVM_PROC_VG_NAME "VGs"
38731 +#define LVM_PROC_LV_NAME "LVs"
38732 +#define LVM_PROC_PV_NAME "PVs"
38733 +#define LVM_PROC_GLOBAL_NAME "global"
38734 +#define IO_BUFFER_SECTORS 8
38736 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9)
38737 +#define max(a,b) (((a)>(b))?(a):(b))
38740 +/* Structure for doing PV remove ioctls. */
38742 +#define EVMS_LVM_PV_REMOVE_IOCTL 0x01
38743 +#define EVMS_LVM_SNAPSHOT_STAT_IOCTL 0x02
38746 + * struct lvm_pv_remove_ioctl
38747 + * @vg_uuid: Volume group UUID
38748 + * @pv_number: Physical volume number
38749 + * @next: Link to next packet (engine-use only)
38751 + * PV remove ioctl packet definition.
38753 +struct lvm_pv_remove_ioctl {
38754 + u8 vg_uuid[UUID_LEN];
38756 + struct lvm_pv_remove_ioctl * next;
38760 + * struct lvm_snapshot_stat_ioctl
38761 + * @vg_uuid: Volume group UUID
38762 + * @lv_number: Logical volume number
38763 + * @next_free_chunk:
38766 + * Snapshot statistics ioctl packet definition.
38768 +struct lvm_snapshot_stat_ioctl {
38769 + u8 vg_uuid[UUID_LEN];
38771 + u64 next_free_chunk;
38776 + * struct lvm_physical_volume
38777 + * @logical_node: Storage object
38778 + * @pv: Copy of on-disk PV struct
38781 + * @next: Pointer to next entry
38783 + * Entries in the list of physical volumes (PV) in a volume group (VG).
38785 +struct lvm_physical_volume {
38786 + struct evms_logical_node * logical_node;
38787 + struct pv_disk * pv;
38788 + struct pe_disk * pe_map;
38790 + struct lvm_physical_volume * next;
38794 + * struct le_table_entry
38796 + * @pe_sector_offset:
38798 + * Table entry definition for mapping logical
38799 + * extents (LE) to physical extents (PE).
38801 +struct le_table_entry {
38802 + struct lvm_physical_volume * owning_pv;
38803 + u64 pe_sector_offset;
38807 + * struct snapshot_map_entry
38814 + * Snapshot remapping entry structure definition.
38816 +struct snapshot_map_entry {
38819 + struct lvm_physical_volume * snap_pv;
38820 + struct snapshot_map_entry * next;
38821 + struct snapshot_map_entry * prev;
38824 +#define MAX_HASH_CHAIN_ENTRIES 10
38825 +#define CHUNK_DATA_BUFFER_SIZE 128
38828 + * struct lvm_logical_volume
38830 + * @lv_size: In sectors
38831 + * @lv_access: Flags: LV_READ, LV_WRITE, LV_SNAPSHOT,
38832 + * LV_SNAPSHOT_ORG, EVMS_LV*
38833 + * @lv_status: Flags: LV_ACTIVE, LV_SPINDOWN
38834 + * @lv_minor: Device minor number
38836 + * @stripe_size: In sectors
38837 + * @stripe_size_shift: # of bits to shift right instead of dividing by stripe_size
38838 + * @pe_size: In sectors
38839 + * @pe_size_shift: Number of bits to shift right instead of dividing by pe_size
38840 + * @num_le: Number of entries in the le-to-pe map
38841 + * @group: Pointer back to parent volume group
38842 + * @name: Dev-tree volume name (eg. /dev/group0/vol0)
38843 + * @le_map: Mapping of logical to physical extents
38844 + * @volume_node: Pointer to parent EVMS object representing this volume
38845 + * @chunk_size: In sectors
38846 + * @num_chunks: lv_size / chunk_size
38847 + * @snap_org_minor: Minor number of snapshot original
38848 + * @next_cow_entry: Index into current COW table
38849 + * @current_cow_sector: Logical sector of current COW table
38850 + * @next_free_chunk: Starting logical sector of next free chunk
38851 + * @hash_table_size: Number of pointers in each hash table
38852 + * @cow_table: Pointer to one sector's worth of COW tables.
38853 + * @chunk_data_buffer: Buffer reading data when doing copy-on-write
38854 + * @snap_semaphore: For locking during snapshot IO operations
38855 + * @snapshot_map: Pointer to remapping hash tables
38856 + * @snapshot_next: Linked list of volumes being snapshotted
38857 + * @snapshot_org: Pointer to volume being snapshotted
38859 + * In-memory representation of an LVM LV.
38861 +struct lvm_logical_volume {
38869 + u32 stripe_size_shift;
38871 + u32 pe_size_shift;
38873 + struct lvm_volume_group * group;
38874 + u8 name[NAME_LEN];
38875 + struct le_table_entry * le_map;
38876 + struct evms_logical_node * volume_node;
38879 + u32 snap_org_minor;
38880 + u32 next_cow_entry;
38881 + u64 current_cow_sector;
38882 + u64 next_free_chunk;
38883 + u32 hash_table_size;
38884 + struct lv_COW_table_disk * cow_table;
38885 + u8 * chunk_data_buffer;
38886 + struct semaphore snap_semaphore;
38887 + struct snapshot_map_entry *** snapshot_map;
38888 + struct lvm_logical_volume * snapshot_next;
38889 + struct lvm_logical_volume * snapshot_org;
38893 + * EVMS_LV_NEW: Volume was created during the current discovery pass.
38894 + * EVMS_LV_INCOMPLETE: Volume has an incomplete LE map.
38895 + * EVMS_LV_INVALID: Volume has a memory-corruption problem.
38896 + * EVMS_LV_QUIESCED: Volume is in quiesced state.
38897 + * EVMS_LV_EXPORTED: Volume has been exported during this EVMS discovery pass.
38899 +#define EVMS_LV_NEW 0x10
38900 +#define EVMS_LV_INCOMPLETE 0x20
38901 +#define EVMS_LV_INVALID 0x40
38902 +#define EVMS_LV_QUIESCED 0x80
38903 +#define EVMS_LV_EXPORTED 0x100
38906 + * struct lvm_volume_group
38907 + * @vg: Copy of on-disk VG metadata
38908 + * @pv_list: List of PVs that make up this group
38909 + * @volume_list: Array of volumes
38910 + * @lv_array: Array of LV metadata
38911 + * @uuid_list: List of PV UUIDs
38912 + * @vg_uuid: UUID from the VG metadata
38913 + * @vg_name: Name from the PV metadata
38914 + * @pv_count: # of PVs found in this group
38915 + * @volume_count: # of LVs found in this group
38916 + * @hard_sect_size: Largest hardsector size of all PVs in this group
38917 + * @block_size: Largest block size of all PVs in this group
38918 + * @flags: EVMS_VG*
38919 + * @next_group: Linked list
38921 + * In-memory representation of an LVM VG.
38923 +struct lvm_volume_group {
38924 + struct vg_disk * vg;
38925 + struct lvm_physical_volume * pv_list;
38926 + struct lvm_logical_volume * volume_list[MAX_LV + 1];
38927 + struct lv_disk * lv_array;
38929 + u8 vg_uuid[UUID_LEN];
38930 + u8 vg_name[NAME_LEN];
38932 + u32 volume_count;
38933 + s32 hard_sect_size;
38936 + struct lvm_volume_group * next_group;
38940 + * EVMS_VG_DIRTY: Group is new or has had a PV added
38941 + * during this discovery.
38942 + * EVMS_VG_PARTIAL_PVS: Group contains at least one partial PV.
38943 + * EVMS_VG_REMOVABLE_PVS: Group contains at least one removable PV.
38945 +#define EVMS_VG_DIRTY (1 << 0)
38946 +#define EVMS_VG_PARTIAL_PVS (1 << 1)
38947 +#define EVMS_VG_REMOVABLE_PVS (1 << 2)
38950 diff -Naur linux-2002-09-30/include/linux/evms/evms_md.h evms-2002-09-30/include/linux/evms/evms_md.h
38951 --- linux-2002-09-30/include/linux/evms/evms_md.h Wed Dec 31 18:00:00 1969
38952 +++ evms-2002-09-30/include/linux/evms/evms_md.h Fri Aug 16 11:10:59 2002
38955 + * Copyright (c) International Business Machines Corp., 2000
38957 + * This program is free software; you can redistribute it and/or modify
38958 + * it under the terms of the GNU General Public License as published by
38959 + * the Free Software Foundation; either version 2 of the License, or
38960 + * (at your option) any later version.
38962 + * This program is distributed in the hope that it will be useful,
38963 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
38964 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
38965 + * the GNU General Public License for more details.
38967 + * You should have received a copy of the GNU General Public License
38968 + * along with this program; if not, write to the Free Software
38969 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
38971 + * linux/include/linux/evms/evms_md.h
38973 + * EVMS Linux MD Region Manager Public Header File
38975 + * 'evms_md.h' is an EVMS version of linux/include/linux/raid/md.h modified
38976 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
38980 +#ifndef __EVMS_MD_INCLUDED
38981 +#define __EVMS_MD_INCLUDED
38983 +#include <linux/mm.h>
38984 +#include <linux/fs.h>
38985 +#include <linux/blkdev.h>
38986 +#include <asm/semaphore.h>
38987 +#include <linux/ioctl.h>
38988 +#include <linux/types.h>
38989 +#include <asm/bitops.h>
38990 +#include <linux/module.h>
38991 +#include <linux/hdreg.h>
38992 +#include <linux/proc_fs.h>
38993 +#include <linux/smp_lock.h>
38994 +#include <linux/delay.h>
38995 +#include <net/checksum.h>
38996 +#include <linux/random.h>
38997 +#include <linux/locks.h>
38998 +#include <linux/kernel_stat.h>
38999 +#include <asm/io.h>
39000 +#include <linux/completion.h>
39002 +#include <linux/evms/evms.h>
39004 +#include <linux/raid/md_compatible.h>
39006 + * 'md_p.h' holds the 'physical' layout of RAID devices
39007 + * 'md_u.h' holds the user <=> kernel API
39009 + * 'md_k.h' holds kernel internal definitions
39012 +#include <linux/evms/evms_md_p.h>
39013 +#include <linux/evms/evms_md_u.h>
39014 +#include <linux/evms/evms_md_k.h>
39017 + * Different major versions are not compatible.
39018 + * Different minor versions are only downward compatible.
39019 + * Different patchlevel versions are downward and upward compatible.
39021 +#define EVMS_MD_MAJOR_VERSION 1
39022 +#define EVMS_MD_MINOR_VERSION 1
39023 +#define EVMS_MD_PATCHLEVEL_VERSION 1
39025 +#define MD_MAJOR_VERSION 0
39026 +#define MD_MINOR_VERSION 90
39027 +#define MD_PATCHLEVEL_VERSION 0
39029 +#define EVMS_MD_COMMON_SERVICES_MAJOR 0
39030 +#define EVMS_MD_COMMON_SERVICES_MINOR 5
39031 +#define EVMS_MD_COMMON_SERVICES_PATCHLEVEL 0
39034 +extern int evms_md_size[MAX_MD_DEVS];
39036 +extern void evms_md_add_mddev_mapping (mddev_t *mddev, kdev_t dev, void *data);
39037 +extern void evms_md_del_mddev_mapping (mddev_t *mddev, kdev_t dev);
39038 +extern char * evms_md_partition_name (struct evms_logical_node *node);
39039 +extern int evms_register_md_personality (int p_num, mdk_personality_t *p);
39040 +extern int evms_unregister_md_personality (int p_num);
39042 +extern int evms_md_update_sb (mddev_t *mddev);
39043 +extern int evms_md_check_ordering (mddev_t *mddev);
39044 +extern void evms_md_print_devices (void);
39046 +extern int evms_md_sync_io(
39047 + struct evms_logical_node *node, /* evms node for the MD array */
39048 + int rw, /* READ / WRITE */
39049 + u64 sector, /* starting sector */
39050 + u64 total_nr_sects, /* total number of sectors */
39051 + void *data ); /* pointer to buffer */
39053 +extern int evms_md_partial_sync_io(
39054 + struct evms_logical_node *node, /* evms node for the MD array */
39055 + int rw, /* READ / WRITE */
39056 + u64 sector, /* starting sector */
39057 + u32 *nsects, /* on input: the total number of sectors for the request */
39058 + /* on output, number of sectors completed */
39059 + void *data); /* pointer to buffer */
39062 +extern int evms_md_do_sync(mddev_t *mddev, mdp_disk_t *spare);
39063 +extern void evms_md_done_sync(mddev_t *mddev, int blocks, int ok);
39064 +extern void evms_md_sync_acct(kdev_t dev, unsigned long nr_sectors);
39065 +extern void evms_md_recover_arrays (void);
39066 +extern int evms_md_error (mddev_t *mddev, struct evms_logical_node *node);
39067 +extern int evms_md_error_dev(mddev_t *mddev, kdev_t dev);
39069 +#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); evms_md_print_devices(); }
39074 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_k.h evms-2002-09-30/include/linux/evms/evms_md_k.h
39075 --- linux-2002-09-30/include/linux/evms/evms_md_k.h Wed Dec 31 18:00:00 1969
39076 +++ evms-2002-09-30/include/linux/evms/evms_md_k.h Tue Aug 6 01:03:24 2002
39079 + * Copyright (c) International Business Machines Corp., 2000
39081 + * This program is free software; you can redistribute it and/or modify
39082 + * it under the terms of the GNU General Public License as published by
39083 + * the Free Software Foundation; either version 2 of the License, or
39084 + * (at your option) any later version.
39086 + * This program is distributed in the hope that it will be useful,
39087 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39088 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39089 + * the GNU General Public License for more details.
39091 + * You should have received a copy of the GNU General Public License
39092 + * along with this program; if not, write to the Free Software
39093 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39096 + * linux/include/linux/evms/evms_md_k.h
39098 + * EVMS Linux MD Region Manager Public Header File
39100 + * 'evms_md_k.h' is an EVMS version of linux/include/linux/raid/md_k.h modified
39101 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, January 2002.
39105 +#ifndef __EVMS_MD_K_INC__
39106 +#define __EVMS_MD_K_INC__
39108 +#define EVMS_MD_SECTS_PER_PAGE (PAGE_SIZE >> EVMS_VSECTOR_SIZE_SHIFT)
39109 +#define EVMS_MD_SECTS_PER_PAGE_MASK (~(EVMS_MD_SECTS_PER_PAGE-1))
39111 +#define MD_RESERVED 0UL
39112 +#define LINEAR 1UL
39116 +#define TRANSLUCENT 5UL
39118 +#define MULTIPATH 7UL
39119 +#define MAX_PERSONALITY 8UL
39121 +static inline int pers_to_level (int pers)
39124 + case MULTIPATH: return -4;
39125 + case HSM: return -3;
39126 + case TRANSLUCENT: return -2;
39127 + case LINEAR: return -1;
39128 + case RAID0: return 0;
39129 + case RAID1: return 1;
39130 + case RAID5: return 5;
39133 + return MD_RESERVED;
39136 +static inline int level_to_pers (int level)
39139 + case -3: return HSM;
39140 + case -2: return TRANSLUCENT;
39141 + case -1: return LINEAR;
39142 + case 0: return RAID0;
39143 + case 1: return RAID1;
39145 + case 5: return RAID5;
39147 + return MD_RESERVED;
39150 +typedef struct mddev_s mddev_t;
39151 +typedef struct mdk_rdev_s mdk_rdev_t;
39153 +#if (MINORBITS != 8)
39154 +#error MD doesnt handle bigger kdev yet
39157 +#define MAX_MD_DEVS (1<<MINORBITS) /* Max number of md dev */
39160 + * Maps a kdev to an mddev/subdev. How 'data' is handled is up to
39161 + * the personality. (eg. HSM uses this to identify individual LVs)
39163 +struct dev_mapping {
39168 +extern struct dev_mapping evms_mddev_map [MAX_MD_DEVS];
39169 +static inline mddev_t * kdev_to_mddev (kdev_t dev)
39171 + if (MAJOR(dev) != MD_MAJOR)
39173 + return evms_mddev_map[MINOR(dev)].mddev;
39177 + * options passed in raidrun:
39180 +#define MAX_CHUNK_SIZE (4096*1024)
39183 + * default readahead
39185 +#define MD_READAHEAD vm_max_readahead
39187 +static inline int disk_faulty(mdp_disk_t * d)
39189 + return d->state & (1 << MD_DISK_FAULTY);
39192 +static inline int disk_active(mdp_disk_t * d)
39194 + return d->state & (1 << MD_DISK_ACTIVE);
39197 +static inline int disk_sync(mdp_disk_t * d)
39199 + return d->state & (1 << MD_DISK_SYNC);
39202 +static inline int disk_spare(mdp_disk_t * d)
39204 + return !disk_sync(d) && !disk_active(d) && !disk_faulty(d);
39207 +static inline int disk_removed(mdp_disk_t * d)
39209 + return d->state & (1 << MD_DISK_REMOVED);
39212 +static inline void mark_disk_faulty(mdp_disk_t * d)
39214 + d->state |= (1 << MD_DISK_FAULTY);
39217 +static inline void mark_disk_active(mdp_disk_t * d)
39219 + d->state |= (1 << MD_DISK_ACTIVE);
39220 + d->state &= ~(1 << MD_DISK_PENDING_ACTIVE);
39223 +static inline void mark_disk_sync(mdp_disk_t * d)
39225 + d->state |= (1 << MD_DISK_SYNC);
39228 +static inline void mark_disk_spare(mdp_disk_t * d)
39233 +static inline void mark_disk_removed(mdp_disk_t * d)
39235 + d->state = (1 << MD_DISK_FAULTY) | (1 << MD_DISK_REMOVED);
39238 +static inline void mark_disk_inactive(mdp_disk_t * d)
39240 + d->state &= ~(1 << MD_DISK_ACTIVE);
39243 +static inline void mark_disk_nonsync(mdp_disk_t * d)
39245 + d->state &= ~(1 << MD_DISK_SYNC);
39249 + * MD's 'extended' device
39253 + struct md_list_head same_set; /* RAID devices within the same set */
39254 + struct md_list_head all; /* all RAID devices */
39255 + struct md_list_head pending; /* undetected RAID devices */
39256 + struct evms_logical_node *node; /* EVMS device node */
39257 + kdev_t dev; /* Device number */
39258 + kdev_t old_dev; /* "" when it was last imported */
39259 + unsigned long size; /* Device size (in blocks) */
39260 + mddev_t *mddev; /* RAID array if running */
39261 + unsigned long last_events; /* IO event timestamp */
39263 + struct block_device *bdev; /* block device handle */
39266 + unsigned long sb_offset; /* in blocks */
39268 + int virtual_spare; /* "virtual" spare added via IOCTL */
39269 + int alias_device; /* device alias to the same disk */
39270 + int faulty; /* if faulty do not issue IO requests */
39271 + int desc_nr; /* descriptor index in the superblock */
39276 + * disk operations in a working array:
39278 +#define DISKOP_SPARE_INACTIVE 0
39279 +#define DISKOP_SPARE_WRITE 1
39280 +#define DISKOP_SPARE_ACTIVE 2
39281 +#define DISKOP_HOT_SPARE_ACTIVE 3
39282 +#define DISKOP_HOT_REMOVE_SPARE 4
39283 +#define DISKOP_HOT_REMOVE_DISK 5
39284 +#define DISKOP_HOT_ADD_DISK 6
39285 +#define DISKOP_HOT_DEACTIVATE_DISK 7
39287 +typedef struct mdk_personality_s mdk_personality_t;
39292 + mdk_personality_t *pers;
39293 + struct evms_logical_node *node;
39294 + unsigned long flag;
39295 + int nr_raid_disks;
39300 + struct md_list_head disks;
39303 + unsigned long curr_resync; /* blocks scheduled */
39304 + unsigned long resync_mark; /* a recent timestamp */
39305 + unsigned long resync_mark_cnt;/* blocks written at resync_mark */
39307 + int recovery_running;
39308 + struct semaphore reconfig_sem;
39309 + struct semaphore recovery_sem;
39310 + struct semaphore resync_sem;
39313 + atomic_t recovery_active; /* blocks scheduled, but not written */
39314 + md_wait_queue_head_t recovery_wait;
39316 + struct md_list_head all_mddevs;
39317 + struct md_list_head incomplete_mddevs;
39318 + struct md_list_head running_mddevs;
39321 +struct mdk_personality_s
39324 + int (*sync_io) (mddev_t *mddev, int rw, u64 LSN, u64 nr_sects, void *data);
39325 + void (*read)(struct evms_logical_node *node, struct buffer_head *bh);
39326 + void (*write)(struct evms_logical_node *node, struct buffer_head *bh);
39327 + int (*run)(mddev_t *mddev);
39328 + int (*stop)(mddev_t *mddev);
39329 + int (*status)(char *page, mddev_t *mddev);
39330 + int (*error_handler)(mddev_t *mddev, struct evms_logical_node *node);
39333 + * Some personalities (RAID-1, RAID-5) can have disks hot-added and
39334 + * hot-removed. Hot removal is different from failure. (failure marks
39335 + * a disk inactive, but the disk is still part of the array) The interface
39336 + * to such operations is the 'pers->diskop()' function, can be NULL.
39338 + * the diskop function can change the pointer pointing to the incoming
39339 + * descriptor, but must do so very carefully. (currently only
39340 + * SPARE_ACTIVE expects such a change)
39342 + int (*diskop) (mddev_t *mddev, mdp_disk_t **descriptor, int state);
39344 + int (*stop_resync)(mddev_t *mddev);
39345 + int (*restart_resync)(mddev_t *mddev);
39346 + int (*sync_request)(mddev_t *mddev, unsigned long block_nr);
39347 + int (*evms_ioctl)(mddev_t *mddev, struct inode *inode, struct file *file,
39348 + unsigned int cmd, unsigned long arg);
39349 + int (*md_pers_ioctl)(mddev_t *mddev, int cmd, void* pers_arg);
39353 + * EVMS MD instance data structure definition
39357 + struct evms_plugin_header instance_plugin_hdr;
39360 +#define EVMS_MD_NODE_TO_MDDEV(node) ((struct evms_md *)(node->private))->mddev
39362 +static inline int evms_md_check_boundary(struct evms_logical_node *node, struct buffer_head *bh)
39364 + if ((bh->b_rsector + (bh->b_size >> EVMS_VSECTOR_SIZE_SHIFT)) > node->total_vsectors) {
39365 + bh->b_end_io(bh, 0);
39372 + * This structure is used for synchronous I/O
39373 + * @rc : error code
39374 + * @io_count: number of I/Os
39375 + * @wait: wait queue
39377 +struct evms_md_sync_cb {
39379 + atomic_t io_count;
39380 + wait_queue_head_t wait;
39385 + * This structure is required for activating a spare device
39386 + * @next: next spare
39387 + * @mddev: target md device
39388 + * @spare: spare to activate
39390 +struct evms_md_activate_spare {
39391 + struct evms_md_activate_spare *next;
39393 + mdp_disk_t *spare;
39396 +static inline int incomplete_mddev(mddev_t * mddev)
39398 + return (mddev->incomplete_mddevs.next != &mddev->incomplete_mddevs);
39402 + * Currently we index md_array directly, based on the minor
39403 + * number. This will have to change to dynamic allocation
39404 + * once we start supporting partitioning of md devices.
39406 +static inline int mdidx (mddev_t * mddev)
39408 + return mddev->__minor;
39411 +static inline kdev_t mddev_to_kdev(mddev_t * mddev)
39413 + return MKDEV(MD_MAJOR, mdidx(mddev));
39416 +extern mdk_rdev_t * evms_md_find_rdev(mddev_t * mddev, kdev_t dev);
39417 +extern mdk_rdev_t * evms_md_find_rdev_nr(mddev_t *mddev, int nr);
39418 +extern mdp_disk_t *get_spare(mddev_t *mddev);
39421 + * iterates through some rdev ringlist. It's safe to remove the
39422 + * current 'rdev'. Don't touch 'tmp' though.
39424 +#define ITERATE_RDEV_GENERIC(head,field,rdev,tmp) \
39426 + for (tmp = head.next; \
39427 + rdev = md_list_entry(tmp, mdk_rdev_t, field), \
39428 + tmp = tmp->next, tmp->prev != &head \
39431 + * iterates through the 'same array disks' ringlist
39433 +#define ITERATE_RDEV(mddev,rdev,tmp) \
39434 + ITERATE_RDEV_GENERIC((mddev)->disks,same_set,rdev,tmp)
39437 + * Same as above, but assumes that the device has rdev->desc_nr numbered
39438 + * from 0 to mddev->nb_dev, and iterates through rdevs in ascending order.
39440 +#define ITERATE_RDEV_ORDERED(mddev,rdev,i) \
39441 + for (i = 0; rdev = evms_md_find_rdev_nr(mddev, i), i < mddev->nb_dev; i++)
39445 + * Iterates through all 'RAID managed disks'
39447 +#define ITERATE_RDEV_ALL(rdev,tmp) \
39448 + ITERATE_RDEV_GENERIC(all_raid_disks,all,rdev,tmp)
39451 + * Iterates through 'pending RAID disks'
39453 +#define ITERATE_RDEV_PENDING(rdev,tmp) \
39454 + ITERATE_RDEV_GENERIC(pending_raid_disks,pending,rdev,tmp)
39457 + * iterates through all used mddevs in the system.
39459 +#define ITERATE_MDDEV(mddev,tmp) \
39461 + for (tmp = all_mddevs.next; \
39462 + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \
39463 + tmp = tmp->next, tmp->prev != &all_mddevs \
39467 + * iterates through all incomplete mddevs in the system.
39469 +#define ITERATE_INCOMPLETE_MDDEV(mddev,tmp) \
39471 + for (tmp = incomplete_mddevs.next; \
39472 + mddev = list_entry(tmp, mddev_t, incomplete_mddevs), \
39473 + tmp = tmp->next, tmp->prev != &incomplete_mddevs\
39476 + * iterates through all running mddevs in the system.
39478 +#define ITERATE_RUNNING_MDDEV(mddev,tmp) \
39480 + for (tmp = running_mddevs.next; \
39481 + mddev = list_entry(tmp, mddev_t, running_mddevs), \
39482 + tmp = tmp->next, tmp->prev != &running_mddevs \
39485 +static inline int lock_mddev (mddev_t * mddev)
39487 + return down_interruptible(&mddev->reconfig_sem);
39490 +static inline void unlock_mddev (mddev_t * mddev)
39492 + up(&mddev->reconfig_sem);
39495 +#define xchg_values(x,y) do { __typeof__(x) __tmp = x; \
39496 + x = y; y = __tmp; } while (0)
39498 +#define MAX_DISKNAME_LEN 64
39500 +typedef struct dev_name_s {
39501 + struct md_list_head list;
39503 + char namebuf [MAX_DISKNAME_LEN];
39508 +#define __wait_event_lock_irq(wq, condition, lock) \
39510 + wait_queue_t __wait; \
39511 + init_waitqueue_entry(&__wait, current); \
39513 + add_wait_queue(&wq, &__wait); \
39515 + set_current_state(TASK_UNINTERRUPTIBLE); \
39518 + spin_unlock_irq(&lock); \
39519 + run_task_queue(&tq_disk); \
39521 + spin_lock_irq(&lock); \
39523 + current->state = TASK_RUNNING; \
39524 + remove_wait_queue(&wq, &__wait); \
39527 +#define wait_event_lock_irq(wq, condition, lock) \
39531 + __wait_event_lock_irq(wq, condition, lock); \
39535 +#define __wait_disk_event(wq, condition) \
39537 + wait_queue_t __wait; \
39538 + init_waitqueue_entry(&__wait, current); \
39540 + add_wait_queue(&wq, &__wait); \
39542 + set_current_state(TASK_UNINTERRUPTIBLE); \
39545 + run_task_queue(&tq_disk); \
39548 + current->state = TASK_RUNNING; \
39549 + remove_wait_queue(&wq, &__wait); \
39552 +#define wait_disk_event(wq, condition) \
39556 + __wait_disk_event(wq, condition); \
39561 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_p.h evms-2002-09-30/include/linux/evms/evms_md_p.h
39562 --- linux-2002-09-30/include/linux/evms/evms_md_p.h Wed Dec 31 18:00:00 1969
39563 +++ evms-2002-09-30/include/linux/evms/evms_md_p.h Tue Mar 26 18:58:57 2002
39566 + * Copyright (c) International Business Machines Corp., 2000
39568 + * This program is free software; you can redistribute it and/or modify
39569 + * it under the terms of the GNU General Public License as published by
39570 + * the Free Software Foundation; either version 2 of the License, or
39571 + * (at your option) any later version.
39573 + * This program is distributed in the hope that it will be useful,
39574 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39575 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39576 + * the GNU General Public License for more details.
39578 + * You should have received a copy of the GNU General Public License
39579 + * along with this program; if not, write to the Free Software
39580 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39583 + * linux/include/linux/evms/evms_md_p.h
39585 + * EVMS Linux MD Region Manager Public Header File
39587 + * 'evms_md_p.h' is an EVMS version of linux/include/linux/raid/md_p.h modified
39588 + * by Cuong (Mike) Tran <miketran@us.ibm.com>, March 2002.
39592 +#ifndef __EVMS_MD_P_INC__
39593 +#define __EVMS_MD_P_INC__
39596 + * RAID superblock.
39598 + * The RAID superblock maintains some statistics on each RAID configuration.
39599 + * Each real device in the RAID set contains it near the end of the device.
39600 + * Some of the ideas are copied from the ext2fs implementation.
39602 + * We currently use 4096 bytes as follows:
39604 + * word offset function
39606 + * 0 - 31 Constant generic RAID device information.
39607 + * 32 - 63 Generic state information.
39608 + * 64 - 127 Personality specific information.
39609 + * 128 - 991 27 32-word descriptors of the disks in the raid set.
39610 + * 992 - 1023 Disk specific descriptor.
39611 + * (With MD_SB_DISKS = 27 no reserved words remain between the two.)
39615 + * If x is the real device size in bytes, we return an apparent size of:
39617 + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
39619 + * and place the 4kB superblock at offset y.
39621 +#define MD_RESERVED_BYTES (64 * 1024)
39622 +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
39623 +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
39625 +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
39626 +#define MD_NEW_SIZE_BLOCKS(x) ((x & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
39628 +#define MD_SB_BYTES 4096
39629 +#define MD_SB_WORDS (MD_SB_BYTES / 4)
39630 +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
39631 +#define MD_SB_SECTORS (MD_SB_BYTES / 512)
39634 + * The following are counted in 32-bit words
39636 +#define MD_SB_GENERIC_OFFSET 0
39637 +#define MD_SB_PERSONALITY_OFFSET 64
39638 +#define MD_SB_DISKS_OFFSET 128
39639 +#define MD_SB_DESCRIPTOR_OFFSET 992
39641 +#define MD_SB_GENERIC_CONSTANT_WORDS 32
39642 +#define MD_SB_GENERIC_STATE_WORDS 32
39643 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
39644 +#define MD_SB_PERSONALITY_WORDS 64
39645 +#define MD_SB_DESCRIPTOR_WORDS 32
39646 +#define MD_SB_DISKS 27
39647 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
39648 +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
39649 +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
39652 + * Device "operational" state bits
39654 +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
39655 +#define MD_DISK_ACTIVE 1 /* disk is running or spare disk */
39656 +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
39657 +#define MD_DISK_REMOVED 3 /* disk has kind of been removed, but not really or it would not be here */
39658 +#define MD_DISK_NEW 4 /* disk has just been added to the raid set */
39659 +#define MD_DISK_PENDING_ACTIVE 5 /* disk was spare, but should be activated */
39661 +typedef struct mdp_device_descriptor_s {
39662 + __u32 number; /* 0 Device number in the entire set */
39663 + __u32 major; /* 1 Device major number */
39664 + __u32 minor; /* 2 Device minor number */
39665 + __u32 raid_disk; /* 3 The role of the device in the raid set */
39666 + __u32 state; /* 4 Operational state */
39667 + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
39670 +#define MD_SB_MAGIC 0xa92b4efc
39673 + * Superblock state bits
39675 +#define MD_SB_CLEAN 0
39676 +#define MD_SB_ERRORS 1
39678 +typedef struct mdp_superblock_s {
39680 + * Constant generic information
39682 + __u32 md_magic; /* 0 MD identifier */
39683 + __u32 major_version; /* 1 major version to which the set conforms */
39684 + __u32 minor_version; /* 2 minor version ... */
39685 + __u32 patch_version; /* 3 patchlevel version ... */
39686 + __u32 gvalid_words; /* 4 Number of used words in this section */
39687 + __u32 set_uuid0; /* 5 Raid set identifier */
39688 + __u32 ctime; /* 6 Creation time */
39689 + __u32 level; /* 7 Raid personality */
39690 + __u32 size; /* 8 Apparent size of each individual disk */
39691 + __u32 nr_disks; /* 9 total disks in the raid set */
39692 + __u32 raid_disks; /* 10 disks in a fully functional raid set */
39693 + __u32 md_minor; /* 11 preferred MD minor device number */
39694 + __u32 not_persistent; /* 12 does it have a persistent superblock */
39695 + __u32 set_uuid1; /* 13 Raid set identifier #2 */
39696 + __u32 set_uuid2; /* 14 Raid set identifier #3 */
39697 + __u32 set_uuid3; /* 15 Raid set identifier #4 */
39698 + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
39701 + * Generic state information
39703 + __u32 utime; /* 0 Superblock update time */
39704 + __u32 state; /* 1 State bits (clean, ...) */
39705 + __u32 active_disks; /* 2 Number of currently active disks */
39706 + __u32 working_disks; /* 3 Number of working disks */
39707 + __u32 failed_disks; /* 4 Number of failed disks */
39708 + __u32 spare_disks; /* 5 Number of spare disks */
39709 + __u32 sb_csum; /* 6 checksum of the whole superblock */
39711 +#ifdef __BIG_ENDIAN
39712 + __u32 events_hi; /* 7 high-order of superblock update count */
39713 + __u32 events_lo; /* 8 low-order of superblock update count */
39715 + __u32 events_lo; /* 7 low-order of superblock update count */
39716 + __u32 events_hi; /* 8 high-order of superblock update count */
39719 +#if __BYTE_ORDER == __BIG_ENDIAN
39720 + __u32 events_hi; /* 7 high-order of superblock update count */
39721 + __u32 events_lo; /* 8 low-order of superblock update count */
39723 + __u32 events_lo; /* 7 low-order of superblock update count */
39724 + __u32 events_hi; /* 8 high-order of superblock update count */
39727 + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 9];
39730 + * Personality information
39732 + __u32 layout; /* 0 the array's physical layout */
39733 + __u32 chunk_size; /* 1 chunk size in bytes */
39734 + __u32 root_pv; /* 2 LV root PV */
39735 + __u32 root_block; /* 3 LV root block */
39736 + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
39739 + * Disks information
39741 + mdp_disk_t disks[MD_SB_DISKS];
39746 + __u32 reserved[MD_SB_RESERVED_WORDS];
39749 + * Active descriptor
39751 + mdp_disk_t this_disk;
39755 +static inline __u64 md_event(mdp_super_t *sb) {
39756 + __u64 ev = sb->events_hi;
39757 + return (ev<<32)| sb->events_lo;
39762 diff -Naur linux-2002-09-30/include/linux/evms/evms_md_u.h evms-2002-09-30/include/linux/evms/evms_md_u.h
39763 --- linux-2002-09-30/include/linux/evms/evms_md_u.h Wed Dec 31 18:00:00 1969
39764 +++ evms-2002-09-30/include/linux/evms/evms_md_u.h Fri Aug 16 16:19:56 2002
39767 + * Copyright (c) International Business Machines Corp., 2000
39769 + * This program is free software; you can redistribute it and/or modify
39770 + * it under the terms of the GNU General Public License as published by
39771 + * the Free Software Foundation; either version 2 of the License, or
39772 + * (at your option) any later version.
39774 + * This program is distributed in the hope that it will be useful,
39775 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39776 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39777 + * the GNU General Public License for more details.
39779 + * You should have received a copy of the GNU General Public License
39780 + * along with this program; if not, write to the Free Software
39781 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39784 + * linux/include/linux/evms/evms_md_u.h
39786 + * EVMS MD Region Manager, User <-> Kernel common file
39790 +#ifndef _EVMS_MD_U_INC_
39791 +#define _EVMS_MD_U_INC_
39793 +#define EVMS_MD_ID 4
39794 +#define MD_SET_PLUGIN_ID SetPluginID(IBM_OEM_ID,EVMS_REGION_MANAGER,EVMS_MD_ID)
39796 +#define EVMS_MD_PERS_IOCTL_CMD 1 /* personality specific ioctl command */
39797 +#define EVMS_MD_ADD 2
39798 +#define EVMS_MD_REMOVE 3
39799 +#define EVMS_MD_ACTIVATE 4
39800 +#define EVMS_MD_DEACTIVATE 5
39801 +#define EVMS_MD_GET_ARRAY_INFO 6
39804 + * structure definition to use with MD_ADD, MD_REMOVE, MD_ACTIVATE
39806 +struct evms_md_kdev {
39812 + * structure definition to use with MD_GET_ARRAY_INFO
39814 +#define EVMS_MD_ARRAY_DEGRADED (1<<0)
39815 +#define EVMS_MD_ARRAY_SYNCING (1<<1)
39816 +struct evms_md_array_info {
39822 + * EVMS MD user/kernel communication
39823 + * @mddev_idx: md minor
39824 + * @cmd: command for personality
39825 + * @arg: specific command structure
39827 +struct evms_md_ioctl {
39835 diff -Naur linux-2002-09-30/include/linux/evms/evms_os2.h evms-2002-09-30/include/linux/evms/evms_os2.h
39836 --- linux-2002-09-30/include/linux/evms/evms_os2.h Wed Dec 31 18:00:00 1969
39837 +++ evms-2002-09-30/include/linux/evms/evms_os2.h Thu Aug 8 17:40:37 2002
39841 + * Copyright (c) International Business Machines Corp., 2000
39843 + * This program is free software; you can redistribute it and/or modify
39844 + * it under the terms of the GNU General Public License as published by
39845 + * the Free Software Foundation; either version 2 of the License, or
39846 + * (at your option) any later version.
39848 + * This program is distributed in the hope that it will be useful,
39849 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
39850 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
39851 + * the GNU General Public License for more details.
39853 + * You should have received a copy of the GNU General Public License
39854 + * along with this program; if not, write to the Free Software
39855 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
39857 + * Module: linux/include/linux/evms_os2.h
39861 + * Change History:
39866 + * Description: This module defines the disk structures used by the OS/2
39867 + * Logical Volume Manager, including that of the Master
39868 + * Boot Record (MBR) and Extended Boot Records (EBR).
39870 + * Notes: LVM Drive Letter Assignment Tables (DLA_Tables) appear on the
39871 + * last sector of each track containing a valid MBR or EBR. Since
39872 + * partitions must be track aligned, any track containing an MBR or
39873 + * EBR will be almost all empty sectors. We will grab the last
39874 + * of these empty sectors for our DLT_Tables.
39878 +#ifndef OS2LVM_INCLUDED__
39879 +#define OS2LVM_INCLUDED__
39881 +/* The following define the values used to indicate that a partition table entry is for an EBR, not a partition. */
39882 +#define EBR_BOOT_INDICATOR 0
39883 +#define EBR_FORMAT_INDICATOR 5
39885 +/* The following define is used as the default Format_Indicator for new non-primary partitions. */
39886 +#define NEW_LOGICAL_DRIVE_FORMAT_INDICATOR 0x6
39888 +/* The following define is used as the default Format_Indicator for a new non-active primary partitions. */
39889 +#define NEW_PRIMARY_PARTITION_FORMAT_INDICATOR 0x16
39891 +/* The following define is used as the default Format_Indicator for a new active primary partition. */
39892 +#define NEW_ACTIVE_PRIMARY_PARTITION_FORMAT_INDICATOR 0x06
39894 +/* The following define is used to hold the value of the Boot_Indicator for active partitions. */
39895 +#define ACTIVE_PARTITION 0x80
39897 +/* Define the size of a Partition Name. Partition Names are user defined names given to a partition. */
39898 +#define PARTITION_NAME_SIZE 20
39900 +/* Define the size of a volume name. Volume Names are user defined names given to a volume. */
39901 +#define VOLUME_NAME_SIZE 20
39903 +/* Define the size of a disk name. Disk Names are user defined names given to physical disk drives in the system. */
39904 +#define DISK_NAME_SIZE 20
39906 +/* The name of the filesystem in use on a partition. This name may be up to 12 ( + NULL terminator) characters long. */
39907 +#define FILESYSTEM_NAME_SIZE 20
39909 +/* The comment field is reserved but is not currently used. This is for future expansion and use. */
39910 +#define COMMENT_SIZE 81
39912 +/* Define the minimum number of sectors to reserve on the disk for Boot Manager. */
39913 +#define BOOT_MANAGER_SIZE 2048
39915 +#define OS2_BYTES_PER_SECTOR 512
39916 +#define OS2_SECTOR_SHIFT 9
39918 +/*--------------------------------------------------
39919 + * Type definitions
39920 + --------------------------------------------------*/
39922 +/* The following definitions define the drive letter assignment table used by LVM.
39923 + For each partition table on the disk, there will be a drive letter assignment table in the last sector
39924 + of the track containing the partition table. */
39926 +/* NOTE: DLA stands for Drive Letter Assignment. */
39928 +#define DLA_TABLE_SIGNATURE1 0x424D5202L
39929 +#define DLA_TABLE_SIGNATURE2 0x44464D50L
39931 +struct dla_entry { /* DE */
39932 + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
39933 + u32 partition_serial; /* The serial number of this partition. */
39934 + u32 Partition_Size; /* The size of the partition, in sectors. */
39935 + u32 Partition_Start; /* The starting sector of the partition. */
39936 + unsigned char On_Boot_Manager_Menu; /* Set to TRUE if this volume/partition is on the Boot Manager Menu. */
39937 + unsigned char Installable; /* Set to TRUE if this volume is the one to install the operating system on. */
39938 + char Drive_Letter; /* The drive letter assigned to the partition. */
39939 + unsigned char Reserved;
39940 + char Volume_Name[VOLUME_NAME_SIZE]; /* The name assigned to the volume by the user. */
39941 + char Partition_Name[PARTITION_NAME_SIZE]; /* The name assigned to the partition. */
39944 +struct dla_table_sector { /* DTS */
39945 + u32 DLA_Signature1; /* The magic signature (part 1) of a Drive Letter Assignment Table. */
39946 + u32 DLA_Signature2; /* The magic signature (part 2) of a Drive Letter Assignment Table. */
39947 + u32 DLA_CRC; /* The 32 bit CRC for this sector. Calculated assuming that this field and all unused space in the sector is 0. */
39948 + u32 Disk_Serial_Number; /* The serial number assigned to this disk. */
39949 + u32 Boot_Disk_Serial_Number; /* The serial number of the disk used to boot the system. This is for conflict resolution when multiple volumes
39950 + want the same drive letter. Since LVM.EXE will not let this situation happen, the only way to get this situation
39951 + is for the disk to have been altered by something other than LVM.EXE, or if a disk drive has been moved from one
39952 + machine to another. If the drive has been moved, then it should have a different Boot_Disk_Serial_Number. Thus,
39953 + we can tell which disk drive is the "foreign" drive and therefore reject its claim for the drive letter in question.
39954 + If we find that all of the claimants have the same Boot_Disk_Serial_Number, then we must assign drive letters on
39955 + a first come, first serve basis. */
39956 + u32 Install_Flags; /* Used by the Install program. */
39958 + u32 Heads_Per_Cylinder;
39959 + u32 Sectors_Per_Track;
39960 + char Disk_Name[DISK_NAME_SIZE]; /* The name assigned to the disk containing this sector. */
39961 + unsigned char Reboot; /* For use by Install. Used to keep track of reboots initiated by install. */
39962 + unsigned char Reserved[3]; /* Alignment. */
39963 + struct dla_entry DLA_Array[4]; /* These are the four entries which correspond to the entries in the partition table. */
39966 +/* The following definitions define the LVM signature sector which will appear as the last sector in an LVM partition. */
39968 +#define OS2LVM_PRIMARY_SIGNATURE 0x4A435332L
39969 +#define OS2LVM_SECONDARY_SIGNATURE 0x4252444BL
39971 +#define CURRENT_OS2LVM_MAJOR_VERSION_NUMBER 2 /* Define as appropriate. */
39972 +#define CURRENT_OS2LVM_MINOR_VERSION_NUMBER 0 /* Define as appropriate. */
39974 +/* The following definitions limit the number of LVM features that can be applied to a volume, as well as defining a "NULL" feature for use in feature table entries that are not being used. */
39975 +#define OS2LVM_MAX_FEATURES_PER_VOLUME 10 /* The maximum number of LVM features that can be applied to a volume. */
39976 +#define OS2LVM_NULL_FEATURE 0 /* No feature. Used in all unused entries of the feature array in the LVM Signature sector. */
39978 +/* The following structure is used to hold the location of the feature specific data for LVM features. */
39979 +typedef struct _LVM_Feature_Data { /* LFD */
39980 + u32 Feature_ID; /* The ID of the feature. */
39981 + u32 Location_Of_Primary_Feature_Data; /* The u32 of the starting sector of the private data for this feature. */
39982 + u32 Location_Of_Secondary_Feature_Data; /* The u32 of the starting sector of the backup copy of the private data for this feature. */
39983 + u32 Feature_Data_Size; /* The number of sectors used by this feature for its private data. */
39984 + u16 Feature_Major_Version_Number; /* The integer portion of the version number of this feature. */
39985 + u16 Feature_Minor_Version_Number; /* The decimal portion of the version number of this feature. */
39986 + unsigned char Feature_Active; /* TRUE if this feature is active on this partition/volume, FALSE otherwise. */
39987 + unsigned char Reserved[3]; /* Alignment. */
39988 +} LVM_Feature_Data;
39990 +/* The following structure defines the LVM Signature Sector. This is the last sector of every partition which is part of an LVM volume. It gives vital
39991 + information about the version of LVM used to create the LVM volume that it is a part of, as well as which LVM features (BBR, drive linking, etc.) are
39992 + active on the volume that this partition is a part of. */
39993 +typedef struct _LVM_Signature_Sector { /* LSS */
39994 + u32 LVM_Signature1; /* The first part of the magic LVM signature. */
39995 + u32 LVM_Signature2; /* The second part of the magic LVM signature. */
39996 + u32 Signature_Sector_CRC; /* 32 bit CRC for this sector. Calculated using 0 for this field. */
39997 + u32 partition_serial; /* The LVM assigned serial number for this partition. */
39998 + u32 Partition_Start; /* u32 of the first sector of this partition. */
39999 + u32 Partition_End; /* u32 of the last sector of this partition. */
40000 + u32 Partition_Sector_Count; /* The number of sectors in this partition. */
40001 + u32 LVM_Reserved_Sector_Count; /* The number of sectors reserved for use by LVM. */
40002 + u32 Partition_Size_To_Report_To_User; /* The size of the partition as the user sees it - i.e. (the actual size of the partition - LVM reserved sectors) rounded to a track boundary. */
40003 + u32 Boot_Disk_Serial_Number; /* The serial number of the boot disk for the system. If the system contains Boot Manager, then this is the serial number of the disk containing the active copy of Boot Manager. */
40004 + u32 Volume_Serial_Number; /* The serial number of the volume that this partition belongs to. */
40005 + u32 Fake_EBR_Location; /* The location, on disk, of a Fake EBR, if one has been allocated. */
40006 + u16 LVM_Major_Version_Number; /* Major version number of the LVM that created this partition. */
40007 + u16 LVM_Minor_Version_Number; /* Minor version number of the LVM that created this partition. */
40008 + char Partition_Name[PARTITION_NAME_SIZE]; /* User defined partition name. */
40009 + char Volume_Name[VOLUME_NAME_SIZE]; /* The name of the volume that this partition belongs to. */
40010 + LVM_Feature_Data LVM_Feature_Array[OS2LVM_MAX_FEATURES_PER_VOLUME]; /* The feature array. This indicates which LVM features, if any, are active on this volume
40011 + and what order they should be applied in. */
40012 + char Drive_Letter; /* The drive letter assigned to the volume that this partition is part of. */
40013 + unsigned char Fake_EBR_Allocated; /* If TRUE, then a fake EBR has been allocated. */
40014 + char Comment[COMMENT_SIZE]; /* User comment. */
40015 + char Disk_Name[DISK_NAME_SIZE]; /* Added to allow BBR to report the name of a disk when bad sectors are encountered on that disk. */
40016 + u32 Sequence_Number; /* This indicates the order that partitions within a volume are used. This number is 1 based. A 0 here indicates that the volume was made by LVM Ver. 1. */
40017 + u32 Next_Aggregate_Number; /* Used during volume creation and expansion when creating unique names for aggregates. */
40018 + /* The remainder of the sector is reserved for future use and should be all zero or else the CRC will not come out correctly. */
40019 +} LVM_Signature_Sector;
40021 +/* The following definitions define the format of a partition table and the Master Boot Record (MBR). */
40022 +typedef struct _Partition_Record { /* PR */
40023 + unsigned char Boot_Indicator; /* 80h = active partition. */
40024 + unsigned char Starting_Head;
40025 + unsigned char Starting_Sector; /* Bits 0-5 are the sector. Bits 6 and 7 are the high order bits of the starting cylinder. */
40026 + unsigned char Starting_Cylinder; /* The cylinder number is a 10 bit value. The high order bits of the 10 bit value come from bits 6 & 7 of the Starting_Sector field. */
40027 + unsigned char Format_Indicator; /* An indicator of the format/operation system on this partition. */
40028 + unsigned char Ending_Head;
40029 + unsigned char Ending_Sector;
40030 + unsigned char Ending_Cylinder;
40031 + u32 Sector_Offset; /* The number of sectors on the disk which are prior to the start of this partition. */
40032 + u32 Sector_Count; /* The number of sectors in this partition. */
40033 +} Partition_Record;
40035 +typedef struct _Master_Boot_Record { /* MBR */
40036 + unsigned char Reserved[446];
40037 + Partition_Record Partition_Table[4];
40038 + u16 Signature; /* AA55h in this field indicates that this is a valid partition table/MBR. */
40039 +} Master_Boot_Record;
40041 +typedef Master_Boot_Record Extended_Boot_Record;
40043 +/* The following definition covers the Boot Manager Alias Table in the EBR.
40045 + The Alias Table in the EBR has 2 entries in it, although only the first one is actually used. */
40046 +#define ALIAS_NAME_SIZE 8
40047 +typedef struct _AliasTableEntry { /* ATE */
40048 + unsigned char On_Boot_Manager_Menu;
40049 + char Name[ALIAS_NAME_SIZE];
40050 +} AliasTableEntry;
40052 +#define ALIAS_TABLE_OFFSET 0x18A
40055 +/* The following text is used for the Boot Manager Alias for items that were placed on the Boot Manager Menu by FDISK and
40056 + which have since been migrated to the new LVM format. This text is put into the Name field of an AliasTableEntry so
40057 + that, if FDISK ( or another program which understands the old Boot Manager Menu format) is run, it will display
40058 + something for those partitions/volumes which are on the Boot Manager Menu.
40060 + NOTE: This text must be exactly ALIAS_NAME_SIZE characters in length! */
40061 +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT "--> LVM "
40062 +#define ALIAS_TABLE_ENTRY_MIGRATION_TEXT2 "--> LVM*"
40066 +/* The following is the signature used for an Master Boot Record, an Extended Boot Record, and a Boot Sector. */
40067 +#define MBR_EBR_SIGNATURE 0xAA55
40069 +/* The following list of definitions defines the values of interest for the Format_Indicator in a Partition_Record. */
40070 +#define EBR_INDICATOR 0x5
40071 +#define WINDOZE_EBR_INDICATOR 0xF
40072 +#define UNUSED_INDICATOR 0x0
40073 +#define IFS_INDICATOR 0x7
40074 +#define FAT12_INDICATOR 0x1
40075 +#define FAT16_SMALL_PARTITION_INDICATOR 0x4
40076 +#define FAT16_LARGE_PARTITION_INDICATOR 0x6
40077 +#define BOOT_MANAGER_HIDDEN_PARTITION_FLAG 0x10
40078 +#define LVM_PARTITION_INDICATOR 0x35
40079 +#define BOOT_MANAGER_INDICATOR 0x0A
40081 +/* The following is the signature used in the Boot Sector for Boot Manager. */
40082 +#define OS2LVM_BOOT_MANAGER_SIGNATURE "APJ&WN"
40084 +/* The following is used for determining the synthetic geometry reported for Volumes employing drive linking. */
40085 +#define OS2LVM_SYNTHETIC_SECTORS_PER_TRACK 63
40087 +/*--------------------------------------------------
40088 + * Declares for Drive Linking feature:
40089 + *--------------------------------------------------*/
40091 +/* The following defines uniquely identify Drive Linking. */
40092 +#define DRIVE_LINKING_FEATURE_ID 100
40093 +#define DRIVE_LINKING_MAJOR_VERSION 1
40094 +#define DRIVE_LINKING_MINOR_VERSION 0
40096 +/* The following definitions are used for the disk structures supporting drive linking. */
40098 +#define LINK_TABLE_MASTER_SIGNATURE 0x434E4157L
40099 +#define LINK_TABLE_SIGNATURE 0X4D4D5652L
40101 +#define MAXIMUM_LINKS 246
40103 +#define DRIVE_LINKING_RESERVED_SECTOR_COUNT 4
40105 +#define LINKS_IN_FIRST_SECTOR 60
40107 +#define LINKS_IN_NEXT_SECTOR 62
40109 +struct drive_link {
40110 + u32 drive_serial;
40111 + u32 partition_serial;
40114 +struct link_table_first_sector {
40115 + u32 Link_Table_Signature; /* Use the LINK_TABLE_MASTER_SIGNATURE here. */
40116 + u32 Link_Table_CRC;
40117 + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
40118 + u32 Links_In_Use;
40119 + struct drive_link Link_Table[LINKS_IN_FIRST_SECTOR];
40122 +struct link_table_sector {
40123 + u32 Link_Table_Signature; /* Use LINK_TABLE_SIGNATURE here. */
40124 + u32 Link_Table_CRC;
40125 + u32 Sequence_Number; /* Used to resolve conflicts when the primary and secondary tables do not match. */
40126 + struct drive_link Link_Table[LINKS_IN_NEXT_SECTOR];
40129 +/*--------------------------------------------------
40130 + * Declares for Bad Block Relocation feature:
40131 + *--------------------------------------------------*/
40133 +/* The following definition is the numeric ID for Bad Block Relocation. */
40134 +#define BBR_FEATURE_ID 101
40136 +#define BBR_FEATURE_MAJOR_VERSION 0x0001
40137 +#define BBR_FEATURE_MINOR_VERSION 0x0000
40139 +/* The following definitions are used for the disk structures supporting bad block relocation. */
40141 +/* NOTE: BBR stands for Bad Block Relocation. */
40143 +#define BBR_TABLE_MASTER_SIGNATURE 0x00726D62
40144 +#define BBR_TABLE_SIGNATURE 0x01726276
40146 +struct bbr_table_entry {
40148 + u32 ReplacementSector;
40151 +typedef struct _LVM_BBR_Table_First_Sector {
40152 +	u32 Signature;	/* Identifies the first sector of the BBR Table; must be BBR_TABLE_MASTER_SIGNATURE. */
40153 +	u32 CRC;	/* CRC for this sector. */
40154 +	u32 Sequence_Number;	/* Used to resolve conflicts when the primary and secondary tables do not match. */
40155 +	u32 Table_Size;	/* The total number of bbr_table_entry slots in the BBR Table. */
40156 +	u32 Table_Entries_In_Use;	/* The number of BBR Table entries which currently hold remappings. */
40157 +	u32 Sectors_Per_Table;	/* The number of LVM_BBR_Table_Sectors used to hold the BBR Table. */
40158 +	u32 First_Replacement_Sector;	/* The location of the first replacement sector. */
40159 +	u32 Last_Replacement_Sector;	/* The location of the last replacement sector. */
40160 +	u32 Replacement_Sector_Count;	/* The number of replacement sectors available. */
40161 +	u32 Flags;	/* Flags global to the Bad Block Relocation Feature (see the BBR_Flag_* definitions). */
40162 +} LVM_BBR_Table_First_Sector;
40164 +/* Flags for LVM_BBR_Table_First_Sector */
40165 +#define BBR_Flag_Write_Verify 0x00000001 /* Indicate convert Write I/O to Write/Verify */
40167 +#define BBR_TABLE_ENTRIES_PER_SECTOR 62
40169 +typedef struct _LVM_BBR_Table_Sector {
40170 +	u32 Signature;	/* Identifies a sector of the BBR_Table which is not the first sector of the BBR Table; must be BBR_TABLE_SIGNATURE. */
40171 +	u32 CRC;	/* CRC for this sector of the BBR Table. */
40172 +	u32 Sequence_Number;	/* Used to resolve conflicts when the primary and secondary tables do not match. */
40173 +	struct bbr_table_entry BBR_Table[BBR_TABLE_ENTRIES_PER_SECTOR];	/* The remapping entries held in this sector. */
40174 +	u32 reserved1;	/* Unused; present only for block alignment. */
40175 +} LVM_BBR_Table_Sector;
40178 +// Combined structure to hold entire BBR feature data as it exists on disk.
40179 +typedef struct _LVM_BBR_Feature {
40180 + LVM_BBR_Table_First_Sector control;
40181 + char reserved1[OS2_BYTES_PER_SECTOR -
40182 + sizeof (LVM_BBR_Table_First_Sector)];
40183 + LVM_BBR_Table_Sector remap[1];
40187 +/* The following defines establish the minimum and maximum number of replacement sectors which can be allocated for
40188 + Bad Block Relocation. Otherwise, 1 replacement sector per MB of disk space is allocated. */
40189 +#define BBR_FLOOR 62
40190 +#define BBR_LIMIT 4096
40192 +// In-memory Meta Data for Bad Block Relocation
40193 +// In-memory Meta Data for Drive Linking
40194 +struct os2_dl_entry {
40195 + u64 start_sector;
40196 + u64 sector_count;
40197 + u64 dl_lsn1; /* LSN of first on-disk copy of drive linking data. */
40198 + u64 dl_lsn2; /* LSN of the second on-disk copy of drive linking data. */
40200 + u32 partition_serial;
40201 + u64 bbr_lsn1; /* LSN of the first on-disk copy of the BBR data. */
40202 + u64 bbr_lsn2; /* LSN of the second on-disk copy of the BBR data. */
40203 + u32 bbr_feature_size; /* # of sectors of BBR data. */
40204 + u32 bbr_is_active;
40205 + struct semaphore bbr_table_lock; /* Used to serialize writers */
40206 + unsigned int guard1; /* Lamport's Theorem for mutual exclusion */
40208 + unsigned int guard2; /* Lamport's Theorem for mutual exclusion */
40209 + struct evms_logical_node *link_partition;
40210 + struct os2_dl_entry *next;
40213 +// In-memory Meta Data for each OS/2 LVM Volume:
40214 +typedef struct os2_volume_runtime_entry_s {
40216 + u32 Export_Needed;
40217 + u64 size_in_sectors;
40218 + u32 Volume_Serial_Number;
40219 + u32 drive_link_count;
40220 + struct os2_dl_entry *drive_link;
40221 + struct evms_logical_node *next_os2lvm_node;
40222 +} os2_volume_runtime_entry_t;
40225 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid0.h evms-2002-09-30/include/linux/evms/evms_raid0.h
40226 --- linux-2002-09-30/include/linux/evms/evms_raid0.h Wed Dec 31 18:00:00 1969
40227 +++ evms-2002-09-30/include/linux/evms/evms_raid0.h Tue Aug 6 01:03:24 2002
40229 +#ifndef _EVMS_RAID0_INCL_
40230 +#define _EVMS_RAID0_INCL_
40232 +#include <linux/evms/evms_md.h>
40236 + unsigned long zone_offset; /* Zone offset (in sectors) in md_dev */
40237 + unsigned long dev_offset; /* Zone offset (in sectors) in real dev */
40238 + unsigned long size_in_sects; /* Zone size in sectors */
40239 + int nb_dev; /* # of devices attached to the zone */
40240 + struct evms_logical_node *node[MD_SB_DISKS]; /* EVMS nodes attached to the zone */
40245 + struct strip_zone *zone0, *zone1;
40248 +struct raid0_private_data
40250 + struct raid0_hash *hash_table; /* Dynamically allocated */
40251 + struct strip_zone *strip_zone; /* This one too */
40252 + int nr_strip_zones;
40253 + struct strip_zone *smallest;
40257 +typedef struct raid0_private_data raid0_conf_t;
40259 +#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
40262 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid1.h evms-2002-09-30/include/linux/evms/evms_raid1.h
40263 --- linux-2002-09-30/include/linux/evms/evms_raid1.h Wed Dec 31 18:00:00 1969
40264 +++ evms-2002-09-30/include/linux/evms/evms_raid1.h Tue Aug 6 01:03:24 2002
40266 +#ifndef _EVMS_RAID1_H
40267 +#define _EVMS_RAID1_H
40269 +#include <linux/evms/evms_md.h>
40271 +struct mirror_info {
40274 + struct evms_logical_node *node;
40277 + int head_position;
40289 +struct raid1_private_data {
40291 + struct mirror_info mirrors[MD_SB_DISKS];
40294 + int working_disks;
40296 + unsigned long next_sect;
40298 + struct evms_thread *thread, *resync_thread;
40299 + int resync_mirrors;
40300 + struct mirror_info *spare;
40301 + md_spinlock_t device_lock;
40303 + /* buffer pool */
40304 + /* buffer_heads that we have pre-allocated have b_pprev -> &freebh
40305 + * and are linked into a stack using b_next
40306 + * raid1_bh that are pre-allocated have R1BH_PreAlloc set.
40307 + * All these variable are protected by device_lock
40309 + struct buffer_head *freebh;
40310 + int freebh_cnt; /* how many are on the list */
40311 + int freebh_blocked;
40312 + struct raid1_bh *freer1;
40313 + int freer1_blocked;
40315 + struct raid1_bh *freebuf; /* each bh_req has a page allocated */
40316 + md_wait_queue_head_t wait_buffer;
40318 + /* for use when syncing mirrors: */
40319 + unsigned long start_active, start_ready,
40320 + start_pending, start_future;
40321 + int cnt_done, cnt_active, cnt_ready,
40322 + cnt_pending, cnt_future;
40325 + md_wait_queue_head_t wait_done;
40326 + md_wait_queue_head_t wait_ready;
40327 + md_spinlock_t segment_lock;
40330 +typedef struct raid1_private_data raid1_conf_t;
40333 + * this is the only point in the RAID code where we violate
40334 + * C type safety. mddev->private is an 'opaque' pointer.
40336 +#define mddev_to_conf(mddev) ((raid1_conf_t *) mddev->private)
40339 + * this is our 'private' 'collective' RAID1 buffer head.
40340 + * it contains information about what kind of IO operations were started
40341 + * for this RAID1 operation, and about their status:
40345 + atomic_t remaining; /* 'have we finished' count,
40346 + * used from IRQ handlers
40349 + unsigned long state;
40351 + struct buffer_head *master_bh;
40352 + struct buffer_head *mirror_bh_list;
40353 + struct buffer_head bh_req;
40354 + struct evms_logical_node *node; /* map to evms node (READ only) */
40355 + struct raid1_bh *next_r1; /* next for retry or in free list */
40358 +typedef struct raid1_sync_cb_s {
40360 + atomic_t io_count;
40361 + md_wait_queue_head_t wait;
40362 +} raid1_sync_cb_t;
40364 +/* bits for raid1_bh.state */
40365 +#define R1BH_Uptodate 1
40366 +#define R1BH_SyncPhase 2
40367 +#define R1BH_PreAlloc 3 /* this was pre-allocated, add to free list */
40369 diff -Naur linux-2002-09-30/include/linux/evms/evms_raid5.h evms-2002-09-30/include/linux/evms/evms_raid5.h
40370 --- linux-2002-09-30/include/linux/evms/evms_raid5.h Wed Dec 31 18:00:00 1969
40371 +++ evms-2002-09-30/include/linux/evms/evms_raid5.h Tue Aug 6 01:03:23 2002
40376 +#include <linux/evms/evms_md.h>
40377 +#include <linux/evms/evms_xor.h>
40381 + * Each stripe contains one buffer per disc. Each buffer can be in
40382 + * one of a number of states determined by bh_state. Changes between
40383 + * these states happen *almost* exclusively under a per-stripe
40384 + * spinlock. Some very specific changes can happen in b_end_io, and
40385 + * these are not protected by the spin lock.
40387 + * The bh_state bits that are used to represent these states are:
40388 + * BH_Uptodate, BH_Lock
40390 + * State Empty == !Uptodate, !Lock
40391 + * We have no data, and there is no active request
40392 + * State Want == !Uptodate, Lock
40393 + * A read request is being submitted for this block
40394 + * State Dirty == Uptodate, Lock
40395 + * Some new data is in this buffer, and it is being written out
40396 + * State Clean == Uptodate, !Lock
40397 + * We have valid data which is the same as on disc
40399 + * The possible state transitions are:
40401 + * Empty -> Want - on read or write to get old data for parity calc
40402 + * Empty -> Dirty - on compute_parity to satisfy write/sync request.(RECONSTRUCT_WRITE)
40403 + * Empty -> Clean - on compute_block when computing a block for failed drive
40404 + * Want -> Empty - on failed read
40405 + * Want -> Clean - on successful completion of read request
40406 + * Dirty -> Clean - on successful completion of write request
40407 + * Dirty -> Clean - on failed write
40408 + * Clean -> Dirty - on compute_parity to satisfy write/sync (RECONSTRUCT or RMW)
40410 + * The Want->Empty, Want->Clean, Dirty->Clean, transitions
40411 + * all happen in b_end_io at interrupt time.
40412 + * Each sets the Uptodate bit before releasing the Lock bit.
40413 + * This leaves one multi-stage transition:
40414 + * Want->Dirty->Clean
40415 + * This is safe because thinking that a Clean buffer is actually dirty
40416 + * will at worst delay some action, and the stripe will be scheduled
40417 + * for attention after the transition is complete.
40419 + * There is one possibility that is not covered by these states. That
40420 + * is if one drive has failed and there is a spare being rebuilt. We
40421 + * can't distinguish between a clean block that has been generated
40422 + * from parity calculations, and a clean block that has been
40423 + * successfully written to the spare ( or to parity when resyncing).
40424 + * To distinguish these states we have a stripe bit STRIPE_INSYNC that
40425 + * is set whenever a write is scheduled to the spare, or to the parity
40426 + * disc if there is no spare. A sync request clears this bit, and
40427 + * when we find it set with no buffers locked, we know the sync is
40430 + * Buffers for the md device that arrive via make_request are attached
40431 + * to the appropriate stripe in one of two lists linked on b_reqnext.
40432 + * One list (bh_read) for read requests, one (bh_write) for write.
40433 + * There should never be more than one buffer on the two lists
40434 + * together, but we are not guaranteed of that so we allow for more.
40436 + * If a buffer is on the read list when the associated cache buffer is
40437 + * Uptodate, the data is copied into the read buffer and its b_end_io
40438 + * routine is called. This may happen in the end_request routine only
40439 + * if the buffer has just successfully been read. end_request should
40440 + * remove the buffers from the list and then set the Uptodate bit on
40441 + * the buffer. Other threads may do this only if they first check
40442 + * that the Uptodate bit is set. Once they have checked that they may
40443 + * take buffers off the read queue.
40445 + * When a buffer on the write list is committed for write it is copied
40446 + * into the cache buffer, which is then marked dirty, and moved onto a
40447 + * third list, the written list (bh_written). Once both the parity
40448 + * block and the cached buffer are successfully written, any buffer on
40449 + * a written list can be returned with b_end_io.
40451 + * The write list and read list both act as fifos. The read list is
40452 + * protected by the device_lock. The write and written lists are
40453 + * protected by the stripe lock. The device_lock, which can be
40454 + * claimed while the stripe lock is held, is only for list
40455 + * manipulations and will only be held for a very short time. It can
40456 + * be claimed from interrupts.
40459 + * Stripes in the stripe cache can be on one of two lists (or on
40460 + * neither). The "inactive_list" contains stripes which are not
40461 + * currently being used for any request. They can freely be reused
40462 + * for another stripe. The "handle_list" contains stripes that need
40463 + * to be handled in some way. Both of these are fifo queues. Each
40464 + * stripe is also (potentially) linked to a hash bucket in the hash
40465 + * table so that it can be found by sector number. Stripes that are
40466 + * not hashed must be on the inactive_list, and will normally be at
40467 + * the front. All stripes start life this way.
40469 + * The inactive_list, handle_list and hash bucket lists are all protected by the
40471 + * - stripes on the inactive_list never have their stripe_lock held.
40472 + * - stripes have a reference counter. If count==0, they are on a list.
40473 + * - If a stripe might need handling, STRIPE_HANDLE is set.
40474 + * - When refcount reaches zero, then if STRIPE_HANDLE it is put on
40475 + * handle_list else inactive_list
40477 + * This, combined with the fact that STRIPE_HANDLE is only ever
40478 + * cleared while a stripe has a non-zero count means that if the
40479 + * refcount is 0 and STRIPE_HANDLE is set, then it is on the
40480 + * handle_list and if recount is 0 and STRIPE_HANDLE is not set, then
40481 + * the stripe is on inactive_list.
40483 + * The possible transitions are:
40484 + * activate an unhashed/inactive stripe (get_active_stripe())
40485 + * lockdev check-hash unlink-stripe cnt++ clean-stripe hash-stripe unlockdev
40486 + * activate a hashed, possibly active stripe (get_active_stripe())
40487 + * lockdev check-hash if(!cnt++)unlink-stripe unlockdev
40488 + * attach a request to an active stripe (add_stripe_bh())
40489 + * lockdev attach-buffer unlockdev
40490 + * handle a stripe (handle_stripe())
40491 + * lockstripe clrSTRIPE_HANDLE ... (lockdev check-buffers unlockdev) .. change-state .. record io needed unlockstripe schedule io
40492 + * release an active stripe (release_stripe())
40493 + * lockdev if (!--cnt) { if STRIPE_HANDLE, add to handle_list else add to inactive-list } unlockdev
40495 + * The refcount counts each thread that have activated the stripe,
40496 + * plus raid5d if it is handling it, plus one for each active request
40497 + * on a cached buffer.
40499 +struct stripe_head {
40500 + struct stripe_head *hash_next, **hash_pprev; /* hash pointers */
40501 + struct list_head lru; /* inactive_list or handle_list */
40502 + struct raid5_private_data *raid_conf;
40503 + struct buffer_head *bh_cache[MD_SB_DISKS]; /* buffered copy */
40504 + struct buffer_head *bh_read[MD_SB_DISKS]; /* read request buffers of the MD device */
40505 + struct buffer_head *bh_write[MD_SB_DISKS]; /* write request buffers of the MD device */
40506 + struct buffer_head *bh_written[MD_SB_DISKS]; /* write request buffers of the MD device that have been scheduled for write */
40507 + struct page *bh_page[MD_SB_DISKS]; /* saved bh_cache[n]->b_page when reading around the cache */
40508 + struct evms_logical_node *node[MD_SB_DISKS]; /* the target device node */
40509 + unsigned long sector; /* sector of this row */
40510 + int size; /* buffers size */
40511 + int pd_idx; /* parity disk index */
40512 + unsigned long state; /* state flags */
40513 + atomic_t count; /* nr of active thread/requests */
40522 +#define RECONSTRUCT_WRITE 1
40523 +#define READ_MODIFY_WRITE 2
40524 +/* not a write method, but a compute_parity mode */
40525 +#define CHECK_PARITY 3
40530 +#define STRIPE_ERROR 1
40531 +#define STRIPE_HANDLE 2
40532 +#define STRIPE_SYNCING 3
40533 +#define STRIPE_INSYNC 4
40534 +#define STRIPE_PREREAD_ACTIVE 5
40535 +#define STRIPE_DELAYED 6
40540 + * To improve write throughput, we need to delay the handling of some
40541 + * stripes until there has been a chance that several write requests
40542 + * for the one stripe have all been collected.
40543 + * In particular, any write request that would require pre-reading
40544 + * is put on a "delayed" queue until there are no stripes currently
40545 + * in a pre-read phase. Further, if the "delayed" queue is empty when
40546 + * a stripe is put on it then we "plug" the queue and do not process it
40547 + * until an unplug call is made. (the tq_disk list is run).
40549 + * When preread is initiated on a stripe, we set PREREAD_ACTIVE and add
40550 + * it to the count of prereading stripes.
40551 + * When write is initiated, or the stripe refcnt == 0 (just in case) we
40552 + * clear the PREREAD_ACTIVE flag and decrement the count
40553 + * Whenever the delayed queue is empty and the device is not plugged, we
40554 + * move any stripes from delayed to handle and clear the DELAYED flag and set PREREAD_ACTIVE.
40555 + * In stripe_handle, if we find pre-reading is necessary, we do it if
40556 + * PREREAD_ACTIVE is set, else we set DELAYED which will send it to the delayed queue.
40557 + * HANDLE gets cleared if stripe_handle leaves nothing locked.
40561 +struct disk_info {
40563 + struct evms_logical_node *node;
40572 +struct raid5_private_data {
40573 + struct stripe_head **stripe_hashtbl;
40575 + struct evms_thread *thread, *resync_thread;
40576 + struct disk_info disks[MD_SB_DISKS];
40577 + struct disk_info *spare;
40579 + int chunk_size, level, algorithm;
40580 + int raid_disks, working_disks, failed_disks;
40581 + int resync_parity;
40582 + int max_nr_stripes;
40584 + struct list_head handle_list; /* stripes needing handling */
40585 + struct list_head delayed_list; /* stripes that have plugged requests */
40586 + atomic_t preread_active_stripes; /* stripes with scheduled io */
40588 + * Free stripes pool
40590 + atomic_t active_stripes;
40591 + struct list_head inactive_list;
40592 + md_wait_queue_head_t wait_for_stripe;
40593 + int inactive_blocked; /* release of inactive stripes blocked,
40594 + * waiting for 25% to be free
40596 + md_spinlock_t device_lock;
40599 + struct tq_struct plug_tq;
40602 +typedef struct raid5_private_data raid5_conf_t;
40604 +#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
40607 + * Our supported algorithms
40609 +#define ALGORITHM_LEFT_ASYMMETRIC 0
40610 +#define ALGORITHM_RIGHT_ASYMMETRIC 1
40611 +#define ALGORITHM_LEFT_SYMMETRIC 2
40612 +#define ALGORITHM_RIGHT_SYMMETRIC 3
40615 +#define EVMS_MD_RAID5_INIT_IO 1
40617 +struct r5_sync_io {
40624 diff -Naur linux-2002-09-30/include/linux/evms/evms_snapshot.h evms-2002-09-30/include/linux/evms/evms_snapshot.h
40625 --- linux-2002-09-30/include/linux/evms/evms_snapshot.h Wed Dec 31 18:00:00 1969
40626 +++ evms-2002-09-30/include/linux/evms/evms_snapshot.h Wed Sep 25 15:05:19 2002
40628 +/* -*- linux-c -*- */
40630 + * Copyright (c) International Business Machines Corp., 2000
40632 + * This program is free software; you can redistribute it and/or modify
40633 + * it under the terms of the GNU General Public License as published by
40634 + * the Free Software Foundation; either version 2 of the License, or
40635 + * (at your option) any later version.
40637 + * This program is distributed in the hope that it will be useful,
40638 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
40639 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
40640 + * the GNU General Public License for more details.
40642 + * You should have received a copy of the GNU General Public License
40643 + * along with this program; if not, write to the Free Software
40644 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
40647 + * linux/include/linux/evms_snapshot.h
40649 + * EVMS Snapshot Feature kernel header file
40652 +#ifndef __EVMS_SNAPSHOT_INCLUDED__
40653 +#define __EVMS_SNAPSHOT_INCLUDED__
40655 +#define EVMS_SNAPSHOT_VERSION_MAJOR 2
40656 +#define EVMS_SNAPSHOT_VERSION_MINOR 1
40657 +#define EVMS_SNAPSHOT_VERSION_PATCHLEVEL 1
40659 +#define EVMS_SNAPSHOT_FEATURE_ID 104
40661 +#define EVMS_SNAPSHOT_SIGNATURE 0x536e4170 /* SnAp */
40662 +#define EVMS_ORIGINAL_SIGNATURE 0x4f724967 /* OrIg */
40663 +#define MAX_HASH_CHAIN_ENTRIES 10
40665 +/* Status flags */
40666 +#define EVMS_SNAPSHOT 0x001
40667 +#define EVMS_SNAPSHOT_ORG 0x002
40668 +#define EVMS_SNAPSHOT_DISABLED 0x004
40669 +#define EVMS_SNAPSHOT_FULL 0x008
40670 +#define EVMS_SNAPSHOT_QUIESCED 0x010
40671 +#define EVMS_SNAPSHOT_WRITEABLE 0x020
40672 +#define EVMS_SNAPSHOT_ASYNC 0x040
40673 +#define EVMS_SNAPSHOT_ROLLBACK 0x080
40674 +#define EVMS_SNAPSHOT_ROLLBACK_COMP 0x100
40675 +#define EVMS_SNAPSHOT_DISABLED_PENDING 0x200
40677 +/* Private ioctl commands */
40678 +#define SNAPSHOT_QUERY_PERCENT_FULL 1
40679 +#define SNAPSHOT_START_ROLLBACK 2
40680 +#define SNAPSHOT_CHECK_STATE 3
40682 +/* Chunk states - for async mode */
40683 +#define SNAP_CHUNK_COPYING 1 /* Chunk is being copied from org to snap. */
40684 +#define SNAP_CHUNK_COPIED 0 /* Chunk has been copied from org to snap. */
40686 +#define SNAPSHOT_DEFAULT_CHUNK_SIZE 128 /* sectors == 64k */
40687 +#define SNAPSHOT_MIN_CHUNK_SIZE 16 /* 8kB */
40688 +#define SNAPSHOT_MAX_CHUNK_SIZE 2048 /* 1MB */
40689 +#define SNAPSHOT_CHUNK_BUFFER_SIZE 128 /* copy buffer */
40691 +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9)
40692 +#define min(a,b) (((a)<(b))?(a):(b))
40696 + * struct snapshot_metadata
40698 + * @signature: 0 : EVMS_SNAPSHOT_SIGNATURE
40700 + * @version: 8 : Major, minor, patchlevel
40701 + * @flags: 20 : EVMS_SNAPSHOT_*
40702 + * @original_volume: 24 : Name of volume being snapshotted.
40703 + * @original_size: 152: In sectors.
40704 + * @lba_of_COW_table: 160:
40705 + * @lba_of_first_chunk: 168:
40706 + * @chunk_size: 176: In sectors
40707 + * @total_chunks: 180:
40709 + * On-disk metadata sector for EVMS Snapshot feature.
40711 +struct snapshot_metadata {
40714 + struct evms_version version;
40716 + u8 original_volume[128];
40717 + u64 original_size;
40718 + u64 lba_of_COW_table;
40719 + u64 lba_of_first_chunk;
40721 + u32 total_chunks;
40725 + * struct snapshot_hash_entry
40727 + * @org_chunk: Chunk number, not LBA.
40728 + * @snap_chunk: Chunk_number, not LBA.
40729 + * @chunk_state: SNAP_CHUNK_*
40730 + * @chunk_state_lock: Protects access to chunk_state
40731 + * @snap_io: In async mode, the control-block for copying this chunk.
40735 + * Entries in the snapshot remapping hash-table.
40737 +struct snapshot_hash_entry {
40741 + spinlock_t chunk_state_lock;
40742 + struct async_snap_io * snap_io;
40743 + struct snapshot_hash_entry * next;
40744 + struct snapshot_hash_entry * prev;
40748 + * struct snapshot_volume
40750 + * @logical_node: Node below us.
40751 + * @exported_node: Node above us.
40752 + * @snapshot_org: The volume being snapshotted.
40753 + * @snapshot_next: List of volumes snapshotting this original.
40754 + * @snap_semaphore: On snapshots: protects access to the snapshot
40755 + * volume structure.
40756 + * On originals: protects the list of snapshots.
40757 + * @snapshot_map: Hash table of remapped chunks.
40758 + * @free_hash_list: List of pre-allocated hash entries.
40759 + * @chunk_size: In sectors.
40760 + * @chunk_shift: Shift value for chunk_size.
40761 + * @num_chunks: In this volume.
40762 + * @next_cow_entry: Index into current COW table sector.
40763 + * @current_cow_sector: Logical sector of current COW table.
40764 + * @next_free_chunk: Index of next free chunk (not LBA!).
40765 + * @hash_table_size: Size of the hash table for the remap.
40766 + * @flags: Status flags. EVMS_SNAPSHOT_*
40767 + * @cow_table: One sector's worth of COW tables.
40768 + * @async_io_thread: Thread for async copy-on-writes. Only on originals.
40769 + * @chunk_write_list: Lists and locks attached to the original.
40770 + * @chunk_write_list_lock:
40771 + * @org_pending_io_list:
40772 + * @org_pending_io_list_lock:
40773 + * @snap_pending_io_list:
40774 + * @snap_pending_io_list_lock:
40775 + * @cow_table_write_list: List and lock attached to the snapshot.
40776 + * @cow_table_write_list_lock:
40777 + * @rollback_thread: Thread for rollbacks. Only on snapshots.
40778 + * @chunk_data_buffer: Buffer for copying data during rollbacks.
40780 + * Private data for one snapshot volume or one original volume.
40782 +struct snapshot_volume {
40783 + struct evms_logical_node * logical_node;
40784 + struct evms_logical_node * exported_node;
40785 + struct snapshot_volume * snapshot_org;
40786 + struct snapshot_volume * snapshot_next;
40787 + struct rw_semaphore snap_semaphore;
40788 + struct snapshot_hash_entry ** snapshot_map;
40789 + struct snapshot_hash_entry * free_hash_list;
40793 + u32 next_cow_entry;
40794 + u64 current_cow_sector;
40795 + u32 next_free_chunk;
40796 + u32 hash_table_size;
40798 + u64 cow_table[64];
40799 + struct evms_thread * async_io_thread;
40800 + struct list_head chunk_write_list;
40801 + spinlock_t chunk_write_list_lock;
40802 + struct list_head org_pending_io_list;
40803 + spinlock_t org_pending_io_list_lock;
40804 + struct list_head snap_pending_io_list;
40805 + spinlock_t snap_pending_io_list_lock;
40806 + struct list_head cow_table_write_list;
40807 + spinlock_t cow_table_write_list_lock;
40808 +#ifdef SNAPSHOT_DEBUG
40809 + atomic_t cow_table_writes;
40810 + atomic_t cow_table_overlaps;
40812 + struct evms_thread * rollback_thread;
40813 + u8 * chunk_data_buffer;
40817 + * struct snap_io_buffer
40819 + * @bh: A pointer to the embedded buffer_head at the end.
40820 + * @buffer_private: Private data associated with this buffer.
40821 + * @buffer_next: List of snap_io_buffer's for one async_[org|snap]_io.
40822 + * @copy_next: List of buffers that will write the data that this
40823 + * buffer just read.
40824 + * @chunk_write_list: List for the thread to use to drive writes to the
40825 + * snapshot as part of a copy.
40826 + * @_bh: An embedded buffer_head. The b_private field will
40827 + * always point back at the snap_io_buffer.
40829 + * A wrapper around a buffer_head, to allow for the buffer to exist on the
40830 + * variety of lists used by snapshotting.
40832 +struct snap_io_buffer {
40833 + struct buffer_head * bh;
40834 + void * buffer_private;
40835 + struct snap_io_buffer * buffer_next;
40836 + struct snap_io_buffer * copy_next;
40837 + struct list_head chunk_write_list;
40838 + struct buffer_head _bh;
40841 +#define CHUNK_WRITE_ENTRY(lh) list_entry((lh), \
40842 + struct snap_io_buffer, \
40843 + chunk_write_list)
40846 + * struct async_snap_io
40848 + * @snap_volume: Snapshot volume that this chunk belongs to.
40849 + * @hash_table_entry: Hash table entry that this chunk belongs to.
40850 + * @org_io: Parent async I/O structure that contains list
40851 + * of read buffers.
40852 + * @pending_reads: List of pending read requests to the snapshot.
40853 + * @pending_writes: List of pending write requests to the snapshot.
40854 + * @copy_buffers: List of buffers to use to write this chunk to the
40856 + * @cow_table_buffer: Buffer for writing the cow table to disk.
40857 + * @snap_io_list_next: List of async_snap_io's for the parent async_org_io.
40858 + * @snap_pending_io_list: List of async_snap_io's to be processed by the thread.
40859 + * For each of these, the thread will process the contents
40860 + * of the pending_[reads|writes] lists.
40861 + * @cow_write_list: List of cow table writes to be processed by the thread.
40862 + * For each of these, the thread will process the
40863 + * cow_table_buffer.
40864 + * @write_count: Number of buffers remaining to write for this chunk
40865 + * (equal to the length of the copy_buffers list).
40866 + * @dev: Copy of the b_rdev field for this volume. Needed in
40867 + * order to tell EVMS about pending I/Os.
40869 + * Control structure that handles writing a single chunk to the snapshot during
40870 + * a copy-on-write.
40872 +struct async_snap_io {
40873 + struct snapshot_volume * snap_volume;
40874 + struct snapshot_hash_entry * hash_table_entry;
40875 + struct async_org_io * org_io;
40876 + struct buffer_head * pending_reads;
40877 + struct buffer_head * pending_writes;
40878 + struct snap_io_buffer * copy_buffers;
40879 + struct snap_io_buffer * cow_table_buffer;
40880 + struct async_snap_io * snap_io_list_next;
40881 + struct list_head snap_pending_io_list;
40882 + struct list_head cow_write_list;
40883 + atomic_t write_count;
40887 +#define SNAP_PENDING_IO_ENTRY(lh) list_entry((lh), \
40888 + struct async_snap_io, \
40889 + snap_pending_io_list)
40890 +#define COW_WRITE_ENTRY(lh) list_entry((lh), \
40891 + struct async_snap_io, \
40895 + * struct async_org_io
40897 + * @org_volume: Original volume that this chunk belongs to.
40898 + * @pending_writes: List of pending write requests to the original.
40899 + * @pending_writes_lock:Protect the pending_writes list.
40900 + * @copy_buffers:	List of buffers to use to read this chunk from the
40902 + * @snap_io_list: List of async_snap_io's that will write this chunk to
40904 + * @org_pending_io_list:List of async_org_io's to be processed by the thread.
40905 + * For each of these, the thread will process the contents
40906 + * of the pending_writes list.
40907 + * @copy_count: Number of snapshots remaining to write this chunk.
40908 + * @ref_count: = copy_count + 1. Needed to determine when the entire
40909 + * async I/O structure can be deallocated.
40910 + * @dev: Copy of the b_rdev field for this volume. Needed in
40911 + * order to tell EVMS about pending I/Os.
40913 +struct async_org_io {
40914 + struct snapshot_volume * org_volume;
40915 + struct buffer_head * pending_writes;
40916 + spinlock_t pending_writes_lock;
40917 + struct snap_io_buffer * copy_buffers;
40918 + struct async_snap_io * snap_io_list;
40919 + struct list_head org_pending_io_list;
40920 + atomic_t copy_count;
40921 + atomic_t ref_count;
40922 +#ifdef SNAPSHOT_DEBUG
40923 + struct async_org_io * debug_next_org_io;
40928 +#define ORG_PENDING_IO_ENTRY(lh) list_entry((lh), \
40929 + struct async_org_io, \
40930 + org_pending_io_list)
40932 +/* Debugging code */
40933 +#ifdef SNAPSHOT_DEBUG
40935 +#define DEBUG_CHECK_SNAP_IO(async_snap_io) \
40937 + if ( (async_snap_io)->pending_reads || \
40938 + (async_snap_io)->pending_writes ) { \
40943 +#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io) \
40945 + struct async_org_io ** p_org_io; \
40946 + unsigned long flags; \
40947 + if ((async_org_io)->pending_writes) { \
40950 + spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
40951 + for ( p_org_io = &debug_async_org_io_list; *p_org_io; \
40952 + p_org_io = &(*p_org_io)->debug_next_org_io ) { \
40953 + if ( *p_org_io == (async_org_io) ) { \
40954 + *p_org_io = (async_org_io)->debug_next_org_io; \
40958 + (async_org_io)->debug_next_org_io = NULL; \
40959 + spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
40962 +#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io) \
40964 + unsigned long flags; \
40965 + spin_lock_irqsave(&debug_async_org_io_list_lock, flags); \
40966 + (async_org_io)->debug_next_org_io = debug_async_org_io_list; \
40967 + debug_async_org_io_list = (async_org_io); \
40968 + spin_unlock_irqrestore(&debug_async_org_io_list_lock, flags); \
40971 +#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume) \
40972 + atomic_inc(&(snap_volume)->cow_table_overlaps)
40974 +#define DEBUG_INC_COW_TABLE_WRITES(snap_volume) \
40975 + atomic_inc(&(snap_volume)->cow_table_writes)
40977 +#else /* SNAPSHOT_DEBUG */
40979 +#define DEBUG_CHECK_SNAP_IO(async_snap_io)
40980 +#define DEBUG_REMOVE_ORG_IO_FROM_LIST(async_org_io)
40981 +#define DEBUG_ADD_ORG_IO_TO_LIST(async_org_io)
40982 +#define DEBUG_INC_COW_TABLE_OVERLAPS(snap_volume)
40983 +#define DEBUG_INC_COW_TABLE_WRITES(snap_volume)
40985 +#endif /* SNAPSHOT_DEBUG */
40987 +#endif /* __EVMS_SNAPSHOT_INCLUDED__ */
40989 diff -Naur linux-2002-09-30/include/linux/evms/evms_xor.h evms-2002-09-30/include/linux/evms/evms_xor.h
40990 --- linux-2002-09-30/include/linux/evms/evms_xor.h Wed Dec 31 18:00:00 1969
40991 +++ evms-2002-09-30/include/linux/evms/evms_xor.h Mon Feb 4 09:58:43 2002
40996 +#include <linux/evms/evms_md.h>
40998 +#define MAX_XOR_BLOCKS 5
41000 +extern void evms_md_xor_block(unsigned int count, struct buffer_head **bh_ptr);
41002 +struct xor_block_template {
41003 + struct xor_block_template *next;
41004 + const char *name;
41006 + void (*do_2)(unsigned long, unsigned long *, unsigned long *);
41007 + void (*do_3)(unsigned long, unsigned long *, unsigned long *,
41008 + unsigned long *);
41009 + void (*do_4)(unsigned long, unsigned long *, unsigned long *,
41010 + unsigned long *, unsigned long *);
41011 + void (*do_5)(unsigned long, unsigned long *, unsigned long *,
41012 + unsigned long *, unsigned long *, unsigned long *);
41016 diff -Naur linux-2002-09-30/include/linux/evms/ldev_mgr.h evms-2002-09-30/include/linux/evms/ldev_mgr.h
41017 --- linux-2002-09-30/include/linux/evms/ldev_mgr.h Wed Dec 31 18:00:00 1969
41018 +++ evms-2002-09-30/include/linux/evms/ldev_mgr.h Wed Aug 28 14:30:51 2002
41021 +/* -*- linux-c -*- */
41024 + * Copyright (c) International Business Machines Corp., 2000
41026 + * This program is free software; you can redistribute it and/or modify
41027 + * it under the terms of the GNU General Public License as published by
41028 + * the Free Software Foundation; either version 2 of the License, or
41029 + * (at your option) any later version.
41031 + * This program is distributed in the hope that it will be useful,
41032 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
41033 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
41034 + * the GNU General Public License for more details.
41036 + * You should have received a copy of the GNU General Public License
41037 + * along with this program; if not, write to the Free Software
41038 + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
41041 +/* linux/driver/evms/ldev_mgr.h
41043 + * EVMS - Local Device (Hard Drive) Manager
41047 +/* plugin feature ID */
41048 +#define EVMS_LOCAL_DEVICE_MANAGER_ID 1
41050 +/* plugin ioctl feature command defines */
41051 +#define LDEV_MGR_BROADCAST_IOCTL_CMD 1
41054 + * struct ldev_plugin_ioctl - ldev mgr direct ioctl packet definition
41055 + * @disk_handle: handle identifying target disk
41056 + * @cmd: ioctl cmd
41057 + * @arg: ioctl argument
41059 + * local device manager direct ioctl packet definition
41061 +struct ldev_plugin_ioctl {